[med-svn] [Git][med-team/last-align][upstream] New upstream version 1145

Nilesh Patra gitlab at salsa.debian.org
Tue Nov 10 21:40:48 GMT 2020



Nilesh Patra pushed to branch upstream at Debian Med / last-align


Commits:
649febce by Nilesh Patra at 2020-11-11T02:56:23+05:30
New upstream version 1145
- - - - -


21 changed files:

- ChangeLog.txt
- doc/last-split.html
- doc/last-split.txt
- src/Alignment.cc
- src/Alignment.hh
- src/AlignmentWrite.cc
- src/Centroid.cc
- src/Centroid.hh
- src/GappedXdropAligner.cc
- src/GappedXdropAligner2qual.cc
- src/GappedXdropAlignerDna.cc
- src/GappedXdropAlignerFrame.cc
- src/GappedXdropAlignerPssm.cc
- src/LastalArguments.cc
- src/lastal.cc
- src/makefile
- + src/mcf_frameshift_xdrop_aligner.cc
- + src/mcf_frameshift_xdrop_aligner.hh
- src/mcf_gap_costs.cc
- src/mcf_gap_costs.hh
- src/version.hh


Changes:

=====================================
ChangeLog.txt
=====================================
@@ -1,8 +1,71 @@
+2020-11-06  Martin C. Frith  <Martin C. Frith>
+
+	* src/mcf_frameshift_xdrop_aligner.hh:
+	Try to appease C++ compilers
+	[62a147543f2b] [tip]
+
+	* doc/last-split.txt:
+	Add awk mismap trick to doc
+	[aabc3e86c600]
+
+	* src/GappedXdropAlignerFrame.cc:
+	Refactor
+	[10c6c0ff4ac2]
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAligner2qual.cc,
+	src/GappedXdropAlignerDna.cc, src/GappedXdropAlignerFrame.cc,
+	src/GappedXdropAlignerPssm.cc:
+	Refactor
+	[0bff03a84a63]
+
+	* src/Alignment.cc, src/Alignment.hh, src/AlignmentWrite.cc,
+	src/LastalArguments.cc, src/lastal.cc, src/makefile,
+	src/mcf_frameshift_xdrop_aligner.cc,
+	src/mcf_frameshift_xdrop_aligner.hh, test/last-test.out, test/last-
+	test.sh:
+	Add probabilities for new-style frameshifts
+	[8b133765d82e]
+
+2020-11-05  Martin C. Frith  <Martin C. Frith>
+
+	* src/Alignment.hh, src/Centroid.cc, src/Centroid.hh:
+	Refactor
+	[8291399ebe14]
+
+	* src/Alignment.hh, src/AlignmentWrite.cc, test/last-test.out, test
+	/last-test.sh:
+	Fix protein-codon alignment probabilities
+	[86ac07a750ac]
+
+2020-11-04  Martin C. Frith  <Martin C. Frith>
+
+	* src/Centroid.cc, src/lastal.cc, src/mcf_gap_costs.cc,
+	src/mcf_gap_costs.hh:
+	Refactor
+	[8383c3867429]
+
+	* src/Alignment.cc, src/Alignment.hh, src/lastal.cc, src/makefile:
+	Refactor
+	[115ac5aa5bee]
+
+	* src/Alignment.cc, src/AlignmentWrite.cc:
+	Refactor
+	[defdc3f2ccdd]
+
+	* src/Alignment.cc:
+	Refactor
+	[7b420067c0fa]
+
+	* src/Alignment.cc, src/Alignment.hh, src/AlignmentWrite.cc,
+	src/Centroid.cc, src/Centroid.hh, src/lastal.cc:
+	Refactor
+	[60f728268532]
+
 2020-10-14  Martin C. Frith  <Martin C. Frith>
 
 	* scripts/last-dotplot:
 	last-dotplot: fix crash on long annotation names
-	[04663552433d] [tip]
+	[04663552433d]
 
 	* scripts/last-dotplot:
 	last-dotplot: fix crash on annotation names


=====================================
doc/last-split.html
=====================================
@@ -500,6 +500,14 @@ Error probability <= 10 ^ -((ASCII value - 33) / 10)
 you run last-split twice, as in the genome alignment recipes, the
 mismap is the lowest combined error probability from both "p" lines.)</p>
 </div>
+<div class="section" id="tips">
+<h2>Tips</h2>
+<p>To omit alignments with mismap probability > <tt class="docutils literal"><span class="pre">10^-6</span></tt> (say), you can
+use the <tt class="docutils literal"><span class="pre">-m</span></tt> option (see below), or do this:</p>
+<pre class="literal-block">
+awk -F= '/^a/ {i = $3 <= 1e-6} i' out.maf > out2.maf
+</pre>
+</div>
 <div class="section" id="split-versus-spliced-alignment">
 <h2>Split versus spliced alignment</h2>
 <p>Here is a split alignment:</p>


=====================================
doc/last-split.txt
=====================================
@@ -111,6 +111,14 @@ The "mismap" is simply the lowest probability from the "p" line.  (If
 you run last-split twice, as in the genome alignment recipes, the
 mismap is the lowest combined error probability from both "p" lines.)
 
+Tips
+----
+
+To omit alignments with mismap probability > ``10^-6`` (say), you can
+use the ``-m`` option (see below), or do this::
+
+  awk -F= '/^a/ {i = $3 <= 1e-6} i' out.maf > out2.maf
+
 Split versus spliced alignment
 ------------------------------
 


=====================================
src/Alignment.cc
=====================================
@@ -2,54 +2,36 @@
 
 #include "Alignment.hh"
 #include "Alphabet.hh"
-#include "Centroid.hh"
 #include "GeneticCode.hh"
-#include "GreedyXdropAligner.hh"
 #include "TwoQualityScoreMatrix.hh"
-#include <cassert>
+
+#include <assert.h>
 
 // make C++ tolerable:
 #define IT(type) std::vector<type>::iterator
 
 using namespace cbrc;
 
-void Alignment::fromSegmentPair( const SegmentPair& sp ){
-  blocks.assign( 1, sp );
-  score = sp.score;
-}
-
-static void addExpectedCounts( double* expectedCounts,
-			       const ExpectedCount& ec,
-			       const Alphabet& alph ){
+static void addExpectedCounts(double *totalCounts, const ExpectedCount &ec) {
   for( unsigned i = 0; i < scoreMatrixRowSize; ++i ){
-    unsigned x = alph.numbersToUppercase[i];
-    if( x >= alph.size ) continue;
     for( unsigned j = 0; j < scoreMatrixRowSize; ++j ){
-      unsigned y = alph.numbersToUppercase[j];
-      if( y >= alph.size ) continue;
-      expectedCounts[ x * alph.size + y ] += ec.emit[i][j];
+      *totalCounts += ec.emit[i][j];
+      ++totalCounts;
     }
   }
-
-  const int numEmissionCounts = alph.size * alph.size;
-  double* transitionCounts = &expectedCounts[ numEmissionCounts ];
-
-  transitionCounts[0] += ec.toMatch;
-  transitionCounts[1] += ec.delNext + ec.delInit;  // deleted letter count
-  transitionCounts[2] += ec.insNext + ec.insInit;  // inserted letter count
-  transitionCounts[3] += ec.delInit;  // deletion open/close count
-  transitionCounts[4] += ec.insInit;  // insertion open/close count
+  totalCounts[0] += ec.toMatch;
+  totalCounts[1] += ec.delNext + ec.delInit;  // deleted letter count
+  totalCounts[2] += ec.insNext + ec.insInit;  // inserted letter count
+  totalCounts[3] += ec.delInit;  // deletion open/close count
+  totalCounts[4] += ec.insInit;  // insertion open/close count
 }
 
-static void countSeedMatches( double* expectedCounts,
-			      const uchar* seq1beg, const uchar* seq1end,
-			      const uchar* seq2beg, const Alphabet& alph ){
-  while( seq1beg < seq1end ){
-    unsigned x1 = alph.numbersToUppercase[ *seq1beg++ ];
-    unsigned x2 = alph.numbersToUppercase[ *seq2beg++ ];
-    if( x1 < alph.size && x2 < alph.size )
-      ++expectedCounts[ x1 * alph.size + x2 ];
+static void addSeedCounts(const uchar *seq1, const uchar *seq2, size_t size,
+			  double *counts) {
+  for (size_t i = 0; i < size; ++i) {
+    ++counts[seq1[i] * scoreMatrixRowSize + seq2[i]];
   }
+  counts[scoreMatrixRowSize * scoreMatrixRowSize] += size;
 }
 
 // Does x precede and touch y in both sequences?
@@ -57,12 +39,12 @@ static bool isNext( const SegmentPair& x, const SegmentPair& y ){
   return x.end1() == y.beg1() && x.end2() == y.beg2();
 }
 
-void Alignment::makeXdrop( Centroid& centroid,
-			   GreedyXdropAligner& greedyAligner, bool isGreedy,
+void Alignment::makeXdrop( Aligners &aligners, bool isGreedy,
 			   const uchar* seq1, const uchar* seq2, int globality,
 			   const ScoreMatrixRow* scoreMatrix,
 			   int smMax, int smMin,
-			   const mcf::GapCosts& gap, int maxDrop,
+			   const const_dbl_ptr* probMatrix, double scale,
+			   const GapCosts& gap, int maxDrop,
 			   size_t frameSize, const ScoreMatrixRow* pssm2,
                            const TwoQualityScoreMatrix& sm2qual,
                            const uchar* qual1, const uchar* qual2,
@@ -73,21 +55,17 @@ void Alignment::makeXdrop( Centroid& centroid,
 
   if( outputType == 7 ){
     assert( seed.size > 0 );  // makes things easier to understand
-    const int numEmissionCounts = alph.size * alph.size;
-    const int numTransitionCounts = 5;
-    std::vector<double>& expectedCounts = extras.expectedCounts;
-    expectedCounts.resize( numEmissionCounts + numTransitionCounts );
-    countSeedMatches( &expectedCounts[0],
-		      seq1 + seed.beg1(), seq1 + seed.end1(),
-		      seq2 + seed.beg2(), alph );
-    expectedCounts[ numEmissionCounts ] += seed.size;  // match count
+    const int numOfTransitions = frameSize ? 9 : 5;
+    std::vector<double> &ec = extras.expectedCounts;
+    ec.resize(scoreMatrixRowSize * scoreMatrixRowSize + numOfTransitions);
+    addSeedCounts(seq1 + seed.beg1(), seq2 + seed.beg2(), seed.size, &ec[0]);
   }
 
   // extend a gapped alignment in the left/reverse direction from the seed:
   std::vector<char>& columnAmbiguityCodes = extras.columnAmbiguityCodes;
-  extend( blocks, columnAmbiguityCodes, centroid, greedyAligner, isGreedy,
+  extend( blocks, columnAmbiguityCodes, aligners, isGreedy,
 	  seq1, seq2, seed.beg1(), seed.beg2(), false, globality,
-	  scoreMatrix, smMax, smMin, maxDrop, gap,
+	  scoreMatrix, smMax, smMin, probMatrix, scale, maxDrop, gap,
 	  frameSize, pssm2, sm2qual, qual1, qual2, alph,
 	  extras, gamma, outputType );
 
@@ -105,9 +83,9 @@ void Alignment::makeXdrop( Centroid& centroid,
   // extend a gapped alignment in the right/forward direction from the seed:
   std::vector<SegmentPair> forwardBlocks;
   std::vector<char> forwardAmbiguities;
-  extend( forwardBlocks, forwardAmbiguities, centroid, greedyAligner, isGreedy,
+  extend( forwardBlocks, forwardAmbiguities, aligners, isGreedy,
 	  seq1, seq2, seed.end1(), seed.end2(), true, globality,
-	  scoreMatrix, smMax, smMin, maxDrop, gap,
+	  scoreMatrix, smMax, smMin, probMatrix, scale, maxDrop, gap,
 	  frameSize, pssm2, sm2qual, qual1, qual2, alph,
 	  extras, gamma, outputType );
 
@@ -162,7 +140,7 @@ void Alignment::makeXdrop( Centroid& centroid,
 
 // cost of the gap between x and y
 static int gapCost(const SegmentPair &x, const SegmentPair &y,
-		   const mcf::GapCosts &gapCosts, size_t frameSize) {
+		   const GapCosts &gapCosts, size_t frameSize) {
   if (gapCosts.isNewFrameshifts()) return x.score;
   size_t gapSize1 = y.beg1() - x.end1();
   size_t gapSize2, frameshift2;
@@ -174,7 +152,7 @@ static int gapCost(const SegmentPair &x, const SegmentPair &y,
 
 bool Alignment::isOptimal( const uchar* seq1, const uchar* seq2, int globality,
 			   const ScoreMatrixRow* scoreMatrix, int maxDrop,
-			   const mcf::GapCosts& gapCosts, size_t frameSize,
+			   const GapCosts& gapCosts, size_t frameSize,
 			   const ScoreMatrixRow* pssm2,
                            const TwoQualityScoreMatrix& sm2qual,
                            const uchar* qual1, const uchar* qual2 ) const{
@@ -216,7 +194,7 @@ bool Alignment::isOptimal( const uchar* seq1, const uchar* seq2, int globality,
 
 bool Alignment::hasGoodSegment(const uchar *seq1, const uchar *seq2,
 			       int minScore, const ScoreMatrixRow *scoreMatrix,
-			       const mcf::GapCosts &gapCosts, size_t frameSize,
+			       const GapCosts &gapCosts, size_t frameSize,
 			       const ScoreMatrixRow *pssm2,
 			       const TwoQualityScoreMatrix &sm2qual,
 			       const uchar *qual1, const uchar *qual2) const {
@@ -273,27 +251,46 @@ static void getColumnCodes(const Centroid& centroid, std::vector<char>& codes,
   }
 }
 
+static void getColumnCodes(const FrameshiftXdropAligner &fxa,
+			   std::vector<char> &codes,
+			   const std::vector<SegmentPair> &chunks) {
+  for (size_t i = 0; i < chunks.size(); ++i) {
+    const SegmentPair &x = chunks[i];
+    for (size_t k = x.size; k-- > 0;) {
+      double p = fxa.matchProb(x.beg1() + k, x.beg2() + k * 3);
+      codes.push_back(asciiProbability(p));
+    }
+    size_t j = i + 1;
+    bool isNext = (j < chunks.size());
+    size_t end1 = isNext ? chunks[j].end1() : 0;
+    size_t end2 = isNext ? chunks[j].beg2() + chunks[j].size * 3 : 0;
+    size_t n1 = x.beg1() - end1;
+    size_t n2 = (x.beg2() - end2 + 1) / 3;
+    codes.insert(codes.end(), n1 + n2, '-');
+  }
+}
+
 void Alignment::extend( std::vector< SegmentPair >& chunks,
 			std::vector< char >& columnCodes,
-			Centroid& centroid,
-			GreedyXdropAligner& greedyAligner, bool isGreedy,
+			Aligners &aligners, bool isGreedy,
 			const uchar* seq1, const uchar* seq2,
 			size_t start1, size_t start2,
 			bool isForward, int globality,
 			const ScoreMatrixRow* sm, int smMax, int smMin,
-			int maxDrop, const mcf::GapCosts& gap,
-			size_t frameSize,
+			const const_dbl_ptr* probMat, double scale,
+			int maxDrop, const GapCosts& gap, size_t frameSize,
 			const ScoreMatrixRow* pssm2,
-                        const TwoQualityScoreMatrix& sm2qual,
+			const TwoQualityScoreMatrix& sm2qual,
                         const uchar* qual1, const uchar* qual2,
 			const Alphabet& alph, AlignmentExtras& extras,
 			double gamma, int outputType ){
-  const mcf::GapCosts::Piece &del = gap.delPieces[0];
-  const mcf::GapCosts::Piece &ins = gap.insPieces[0];
+  const GapCosts::Piece &del = gap.delPieces[0];
+  const GapCosts::Piece &ins = gap.insPieces[0];
+  Centroid &centroid = aligners.centroid;
   GappedXdropAligner& aligner = centroid.aligner();
+  GreedyXdropAligner &greedyAligner = aligners.greedyAligner;
 
   if( frameSize ){
-    assert( outputType < 4 );
     assert( !isGreedy );
     assert( !globality );
     assert( !pssm2 );
@@ -313,7 +310,27 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
       while (aligner.getNextChunkFrame(end1, end2, size, gapCost, gap))
 	chunks.push_back(SegmentPair(end1 - size, end2 - size * 3, size,
 				     gapCost));
+      if (outputType > 3) {
+	FrameshiftXdropAligner &fxa = aligners.frameshiftAligner;
+	double probDropLimit = exp(scale * -maxDrop);
+	double s = fxa.forward(seq1 + start1, seq2 + start2,
+			       seq2 + dnaToAa(frame1, frameSize),
+			       seq2 + dnaToAa(frame2, frameSize),
+			       isForward, probMat, gap, probDropLimit);
+	extras.fullScore += s / scale;
+	fxa.backward(isForward, probMat, gap);
+	getColumnCodes(fxa, columnCodes, chunks);
+	if (outputType == 7) {
+	  double *ec = &extras.expectedCounts[0];
+	  double *subsCounts[scoreMatrixRowSize];
+	  for (int i = 0; i < scoreMatrixRowSize; ++i)
+	    subsCounts[i] = ec + i * scoreMatrixRowSize;
+	  double *tranCounts = ec + scoreMatrixRowSize * scoreMatrixRowSize;
+	  fxa.count(isForward, gap, subsCounts, tranCounts);
+	}
+      }
     } else {
+      assert(outputType < 4);
       size_t frame2 = isForward ? dnaStart - 1 : dnaStart + 1;
       score += aligner.align3(seq1 + start1, seq2 + start2,
 			      seq2 + dnaToAa(frame1, frameSize),
@@ -417,7 +434,7 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
       ExpectedCount ec;
       centroid.computeExpectedCounts( seq1, seq2, start1, start2,
 				      isForward, gap, alph.size, ec );
-      addExpectedCounts( &extras.expectedCounts[0], ec, alph );
+      addExpectedCounts(&extras.expectedCounts[0], ec);
     }
   }
 }


=====================================
src/Alignment.hh
=====================================
@@ -4,25 +4,28 @@
 
 #ifndef ALIGNMENT_HH
 #define ALIGNMENT_HH
-#include "ScoreMatrixRow.hh"
+
+#include "Centroid.hh"
+#include "GreedyXdropAligner.hh"
 #include "SegmentPair.hh"
-#include "mcf_gap_costs.hh"
-#include <stddef.h>  // size_t
-#include <string>
+#include "mcf_frameshift_xdrop_aligner.hh"
+
 #include <vector>
 #include <cstring>
 
 namespace cbrc{
 
-typedef unsigned char uchar;
-
-class GreedyXdropAligner;
 class LastEvaluer;
 class MultiSequence;
 class Alphabet;
-class Centroid;
 class TwoQualityScoreMatrix;
 
+struct Aligners {
+  Centroid centroid;
+  FrameshiftXdropAligner frameshiftAligner;
+  GreedyXdropAligner greedyAligner;
+};
+
 struct AlignmentText {
   // This holds the final text representation of an alignment, along
   // with data for sorting it.
@@ -70,18 +73,21 @@ struct AlignmentExtras {
 
 struct Alignment{
   // make a single-block alignment:
-  void fromSegmentPair( const SegmentPair& sp );
+  void fromSegmentPair(const SegmentPair &sp) {
+    blocks.assign(1, sp);
+    score = sp.score;
+  }
 
   // Make an Alignment by doing gapped X-drop extension in both
   // directions starting from a seed SegmentPair.  The resulting
   // Alignment might not be "optimal" (see below).
   // If outputType > 3: calculates match probabilities.
   // If outputType > 4: does gamma-centroid alignment.
-  void makeXdrop( Centroid& centroid,
-		  GreedyXdropAligner& greedyAligner, bool isGreedy,
+  void makeXdrop( Aligners &aligners, bool isGreedy,
 		  const uchar* seq1, const uchar* seq2, int globality,
 		  const ScoreMatrixRow* scoreMatrix, int smMax, int smMin,
-		  const mcf::GapCosts& gap, int maxDrop, size_t frameSize,
+		  const const_dbl_ptr* probMatrix, double scale,
+		  const GapCosts& gap, int maxDrop, size_t frameSize,
 		  const ScoreMatrixRow* pssm2,
                   const TwoQualityScoreMatrix& sm2qual,
                   const uchar* qual1, const uchar* qual2,
@@ -94,7 +100,7 @@ struct Alignment{
   // If "globality" is non-zero, skip the prefix and suffix checks.
   bool isOptimal( const uchar* seq1, const uchar* seq2, int globality,
                   const ScoreMatrixRow* scoreMatrix, int maxDrop,
-                  const mcf::GapCosts& gapCosts, size_t frameSize,
+                  const GapCosts& gapCosts, size_t frameSize,
 		  const ScoreMatrixRow* pssm2,
                   const TwoQualityScoreMatrix& sm2qual,
                   const uchar* qual1, const uchar* qual2 ) const;
@@ -102,7 +108,7 @@ struct Alignment{
   // Does the Alignment have any segment with score >= minScore?
   bool hasGoodSegment(const uchar *seq1, const uchar *seq2,
 		      int minScore, const ScoreMatrixRow *scoreMatrix,
-		      const mcf::GapCosts &gapCosts, size_t frameSize,
+		      const GapCosts &gapCosts, size_t frameSize,
 		      const ScoreMatrixRow *pssm2,
 		      const TwoQualityScoreMatrix &sm2qual,
 		      const uchar *qual1, const uchar *qual2) const;
@@ -130,13 +136,13 @@ struct Alignment{
 
   void extend( std::vector< SegmentPair >& chunks,
 	       std::vector< char >& columnCodes,
-	       Centroid& centroid,
-	       GreedyXdropAligner& greedyAligner, bool isGreedy,
+	       Aligners &aligners, bool isGreedy,
 	       const uchar* seq1, const uchar* seq2,
 	       size_t start1, size_t start2,
 	       bool isForward, int globality,
-	       const ScoreMatrixRow* sm, int smMax, int smMin, int maxDrop,
-	       const mcf::GapCosts& gap, size_t frameSize,
+	       const ScoreMatrixRow* sm, int smMax, int smMin,
+	       const const_dbl_ptr* probMat, double scale,
+	       int maxDrop, const GapCosts& gap, size_t frameSize,
 	       const ScoreMatrixRow* pssm2,
                const TwoQualityScoreMatrix& sm2qual,
                const uchar* qual1, const uchar* qual2,
@@ -169,6 +175,9 @@ struct Alignment{
 
   char *writeBotSeq(char *dest, const uchar *seq, const Alphabet &alph,
 		    size_t qualsPerBase, size_t frameSize, bool isCodon) const;
+
+  char *writeColumnProbs(char *dest, const char *probSymbols,
+			 size_t frameSize, bool isCodon) const;
 };
 
 }  // end namespace cbrc


=====================================
src/AlignmentWrite.cc
=====================================
@@ -302,14 +302,33 @@ static char *writeMafHeadQ(char *out,
 
 // Write a "c" line
 static void writeMafLineC(std::vector<char> &cLine,
-			  const std::vector<double> &counts) {
-  size_t s = counts.size();
-  if (s == 0) return;
-  cLine.resize(2 + 32 * s);
+			  const std::vector<double> &counts,
+			  const Alphabet &alph, bool isCodon) {
+  if (counts.empty()) return;
+  unsigned alphSize2 = isCodon ? 64 : alph.size;
+  unsigned numOfSubstitutions = scoreMatrixRowSize * scoreMatrixRowSize;
+  size_t numOfTransitions = counts.size() - numOfSubstitutions;
+  size_t numOfCounts = alph.size * alphSize2 + numOfTransitions;
+  cLine.resize(2 + 32 * numOfCounts);
   char *e = &cLine[0];
   *e++ = 'c';
-  for (size_t i = 0; i < s; ++i)
-    e += std::sprintf(e, " %.3g", counts[i]);
+
+  for (unsigned i = 0; i < alph.size; ++i) {
+    unsigned x = alph.numbersToLowercase[i];
+    for (unsigned j = 0; j < alphSize2; ++j) {
+      unsigned y = isCodon ? j : alph.numbersToLowercase[j];
+      double c = counts[i * scoreMatrixRowSize + j];
+      if (x != i) c += counts[x * scoreMatrixRowSize + j];
+      if (y != j) c += counts[i * scoreMatrixRowSize + y];
+      if (x != i && y != j) c += counts[x * scoreMatrixRowSize + y];
+      e += std::sprintf(e, " %.3g", c);
+    }
+  }
+
+  for (size_t i = 0; i < numOfTransitions; ++i) {
+    e += std::sprintf(e, " %.3g", counts[numOfSubstitutions + i]);
+  }
+
   *e++ = '\n';
   cLine.resize(e - &cLine[0]);
 }
@@ -324,7 +343,7 @@ AlignmentText Alignment::writeMaf(const MultiSequence& seq1,
 				  const AlignmentExtras& extras) const {
   bool isCodon = (translationType == 2);
   double fullScore = extras.fullScore;
-  const std::vector<char>& columnAmbiguityCodes = extras.columnAmbiguityCodes;
+  const std::vector<char>& columnProbSymbols = extras.columnAmbiguityCodes;
 
   size_t alnBeg1 = beg1();
   size_t alnEnd1 = end1();
@@ -365,14 +384,14 @@ AlignmentText Alignment::writeMaf(const MultiSequence& seq1,
   size_t sLineLen = 2 + pLineBlankLen + numColumns(frameSize2, isCodon) + 1;
 
   std::vector<char> cLine;
-  writeMafLineC(cLine, extras.expectedCounts);
+  writeMafLineC(cLine, extras.expectedCounts, alph, isCodon);
 
   size_t qualsPerBase1 = seq1.qualsPerLetter();
   size_t qualsPerBase2 = seq2.qualsPerLetter();
   bool isQuals1 = qualsPerBase1 && (translationType != 2);
   bool isQuals2 = qualsPerBase2 && (translationType != 1);
 
-  size_t sLineNum = 2 + isQuals1 + isQuals2 + !columnAmbiguityCodes.empty();
+  size_t sLineNum = 2 + isQuals1 + isQuals2 + !columnProbSymbols.empty();
   size_t textLen = aLineLen + sLineLen * sLineNum + cLine.size() + 1;
   char *text = new char[textLen + 1];
 
@@ -402,17 +421,17 @@ AlignmentText Alignment::writeMaf(const MultiSequence& seq1,
     *dest++ = '\n';
   }
 
-  Writer w(dest);
-
-  if (!columnAmbiguityCodes.empty()) {
+  if (!columnProbSymbols.empty()) {
+    Writer w(dest);
     w << 'p' << ' ';
     w.fill(pLineBlankLen, ' ');
-    w.copy(&columnAmbiguityCodes[0], columnAmbiguityCodes.size());
-    w << '\n';
+    dest = w.pointer();
+    dest = writeColumnProbs(dest, &columnProbSymbols[0], frameSize2, isCodon);
+    *dest++ = '\n';
   }
 
+  Writer w(dest);
   if (!cLine.empty()) w.copy(&cLine[0], cLine.size());
-
   w << '\n' << '\0';  // blank line afterwards
 
   return AlignmentText(seqNum2, alnBeg2, alnEnd2, strand2, score, 0, 0, text);
@@ -649,3 +668,42 @@ char *Alignment::writeBotSeq(char *dest, const uchar *seq,
 
   return dest;
 }
+
+static char *writeProbSymbols(char *dest, const char *probSymbols,
+			      size_t size, int aaLen) {
+  for (size_t i = 0; i < size; ++i)
+    for (int j = 0; j < aaLen; ++j)
+      *dest++ = probSymbols[i];
+  return dest;
+}
+
+char *Alignment::writeColumnProbs(char *dest, const char *probSymbols,
+				  size_t frameSize, bool isCodon) const {
+  const int aaLen = isCodon ? 3 : 1;
+
+  for (size_t i = 0; i < blocks.size(); ++i) {
+    const SegmentPair &y = blocks[i];
+    if (i > 0) {  // between each pair of aligned blocks:
+      const SegmentPair &x = blocks[i - 1];
+
+      size_t topLen = y.beg1() - x.end1();
+      dest = writeProbSymbols(dest, probSymbols, topLen, aaLen);
+      probSymbols += topLen;
+
+      size_t beg, end, shift;
+      setFrameCoords(beg, end, shift, x.end2(), y.beg2(), frameSize, isCodon);
+      size_t rawLen = end - beg;
+      size_t botLen = (rawLen + aaLen / 3) / aaLen;
+      if (shift) *dest++ = '-';
+      dest = writeProbSymbols(dest, probSymbols, botLen, aaLen);
+      probSymbols += botLen;
+      if (beg < end && rawLen % aaLen == 1) *dest++ = '-';
+      if (beg < end && rawLen % aaLen == 2) --dest;
+    }
+
+    dest = writeProbSymbols(dest, probSymbols, y.size, aaLen);
+    probSymbols += y.size;
+  }
+
+  return dest;
+}


=====================================
src/Centroid.cc
=====================================
@@ -3,12 +3,11 @@
 
 #include "Centroid.hh"
 #include "GappedXdropAlignerInl.hh"
+
 #include <algorithm>
 #include <cassert>
 #include <cmath> // for exp
 #include <cfloat>   // for DBL_MAX
-#include <cstdlib>  // for abs
-#include <iomanip>
 
 static const double DINF = DBL_MAX / 2;
 
@@ -114,14 +113,14 @@ namespace cbrc{
 
   void Centroid::forward(const uchar* seq1, const uchar* seq2,
 			 const ExpMatrixRow* pssm, bool isExtendFwd,
-			 const GapCosts& gap, int globality) {
+			 const GapCosts& gapCosts, int globality) {
     const int seqIncrement = isExtendFwd ? 1 : -1;
     initForwardMatrix();
 
-    const double delOpen = EXP(-gap.delPieces[0].openCost / T);
-    const double delGrow = EXP(-gap.delPieces[0].growCost / T);
-    const double insOpen = EXP(-gap.insPieces[0].openCost / T);
-    const double insGrow = EXP(-gap.insPieces[0].growCost / T);
+    const double delOpen = gapCosts.delProbPieces[0].openProb;
+    const double delGrow = gapCosts.delProbPieces[0].growProb;
+    const double insOpen = gapCosts.insProbPieces[0].openProb;
+    const double insGrow = gapCosts.insProbPieces[0].growProb;
 
     const double delInit = delOpen * delGrow;  // for 1st letter in a deletion
     const double insInit = insOpen * insGrow;  // for 1st letter in an insert
@@ -231,14 +230,14 @@ namespace cbrc{
   // compute posterior probabilities while executing backward algorithm
   void Centroid::backward(const uchar* seq1, const uchar* seq2,
 			  const ExpMatrixRow* pssm, bool isExtendFwd,
-			  const GapCosts& gap, int globality) {
+			  const GapCosts& gapCosts, int globality) {
     const int seqIncrement = isExtendFwd ? 1 : -1;
     initBackwardMatrix();
 
-    const double delOpen = EXP(-gap.delPieces[0].openCost / T);
-    const double delGrow = EXP(-gap.delPieces[0].growCost / T);
-    const double insOpen = EXP(-gap.insPieces[0].openCost / T);
-    const double insGrow = EXP(-gap.insPieces[0].growCost / T);
+    const double delOpen = gapCosts.delProbPieces[0].openProb;
+    const double delGrow = gapCosts.delProbPieces[0].growProb;
+    const double insOpen = gapCosts.insProbPieces[0].openProb;
+    const double insGrow = gapCosts.insProbPieces[0].growProb;
 
     const double delInit = delOpen * delGrow;  // for 1st letter in a deletion
     const double insInit = insOpen * insGrow;  // for 1st letter in an insert
@@ -516,21 +515,6 @@ namespace cbrc{
     }
   }
 
-  // Return an ASCII code representing an error probability.  The
-  // printable codes are 33--126, but here we use a maximum of 125, so
-  // that 126 is reserved for special cases.
-  static char asciiProbability( double probCorrect ){
-    assert( probCorrect >= 0 );
-    //assert( probCorrect <= 1 );  // can fail: floating point is imperfect
-    double e = 1 - probCorrect;
-    double f = std::max( e, 1e-10 );  // avoid overflow errors
-    double g = -10 * std::log10(f);
-    int i = static_cast<int>(g);  // round fractions down
-    int j = i + 33;
-    int k = std::min( j, 125 );
-    return static_cast<char>(k);
-  }
-
   void Centroid::getMatchAmbiguities(std::vector<char>& ambiguityCodes,
 				     size_t seq1end, size_t seq2end,
 				     size_t length) const {
@@ -584,7 +568,7 @@ namespace cbrc{
   void Centroid::computeExpectedCounts ( const uchar* seq1, const uchar* seq2,
 					 size_t start1, size_t start2,
 					 bool isExtendFwd,
-					 const GapCosts& gap,
+					 const GapCosts& gapCosts,
 					 unsigned alphabetSize,
 					 ExpectedCount& c ) const{
     seq1 += start1;
@@ -600,10 +584,10 @@ namespace cbrc{
     int alphabetSizeIncrement = alphabetSize;
     if (!isExtendFwd) alphabetSizeIncrement *= -1;
 
-    const double delOpen = EXP(-gap.delPieces[0].openCost / T);
-    const double delGrow = EXP(-gap.delPieces[0].growCost / T);
-    const double insOpen = EXP(-gap.insPieces[0].openCost / T);
-    const double insGrow = EXP(-gap.insPieces[0].growCost / T);
+    const double delOpen = gapCosts.delProbPieces[0].openProb;
+    const double delGrow = gapCosts.delProbPieces[0].growProb;
+    const double insOpen = gapCosts.insProbPieces[0].openProb;
+    const double insGrow = gapCosts.insProbPieces[0].growProb;
 
     const double delInit = delOpen * delGrow;  // for 1st letter in a deletion
     const double insInit = insOpen * insGrow;  // for 1st letter in an insert


=====================================
src/Centroid.hh
=====================================
@@ -3,10 +3,16 @@
 
 #ifndef CENTROID_HH
 #define CENTROID_HH
+
 #include "GappedXdropAligner.hh"
 #include "mcf_gap_costs.hh"
 #include "SegmentPair.hh"
 #include "OneQualityScoreMatrix.hh"
+
+#include <assert.h>
+#include <math.h>
+
+#include <algorithm>
 #include <vector>
 #include <iostream> // for debug
 
@@ -56,17 +62,17 @@ namespace cbrc{
 
     void doForwardBackwardAlgorithm(const uchar* seq1, const uchar* seq2,
 				    size_t start1, size_t start2,
-				    bool isExtendFwd, const GapCosts& gap,
+				    bool isExtendFwd, const GapCosts& gapCosts,
 				    int globality) {
       seq1 += start1;
       seq2 += start2;
       const ExpMatrixRow *pssm = isPssm ? pssmExp2 + start2 : 0;
       numAntidiagonals = xa.numAntidiagonals();
       scale.assign(numAntidiagonals + 2, 1.0);
-      forward(seq1, seq2, pssm, isExtendFwd, gap, globality);
+      forward(seq1, seq2, pssm, isExtendFwd, gapCosts, globality);
       mD.assign(numAntidiagonals + 2, 0.0);
       mI.assign(numAntidiagonals + 2, 0.0);
-      backward(seq1, seq2, pssm, isExtendFwd, gap, globality);
+      backward(seq1, seq2, pssm, isExtendFwd, gapCosts, globality);
     }
 
     double dp( double gamma );
@@ -97,7 +103,7 @@ namespace cbrc{
     // Added by MH (2008/10/10) : compute expected counts for transitions and emissions
     void computeExpectedCounts(const uchar* seq1, const uchar* seq2,
 			       size_t start1, size_t start2, bool isExtendFwd,
-			       const GapCosts& gap, unsigned alphabetSize,
+			       const GapCosts& gapCosts, unsigned alphabetSize,
 			       ExpectedCount& count) const;
 
   private:
@@ -138,11 +144,11 @@ namespace cbrc{
 
     void forward(const uchar* seq1, const uchar* seq2,
 		 const ExpMatrixRow* pssm, bool isExtendFwd,
-		 const GapCosts& gap, int globality);
+		 const GapCosts& gapCosts, int globality);
 
     void backward(const uchar* seq1, const uchar* seq2,
 		  const ExpMatrixRow* pssm, bool isExtendFwd,
-		  const GapCosts& gap, int globality);
+		  const GapCosts& gapCosts, int globality);
 
     void initForwardMatrix();
     void initBackwardMatrix();
@@ -156,6 +162,20 @@ namespace cbrc{
     }
   };
 
+  // Return an ASCII code representing an error probability.  The
+  // printable codes are 33--126, but here we use a maximum of 125, so
+  // that 126 is reserved for special cases.
+  inline char asciiProbability(double probCorrect) {
+    assert(probCorrect >= 0);
+    //assert(probCorrect <= 1);  // can fail: floating point is imperfect
+    double e = 1 - probCorrect;
+    double f = std::max(e, 1e-10);  // avoid overflow errors
+    double g = -10 * log10(f);
+    int i = static_cast<int>(g);  // round fractions down
+    int j = i + 33;
+    int k = std::min(j, 125);
+    return static_cast<char>(k);
+  }
 }
 
 #endif


=====================================
src/GappedXdropAligner.cc
=====================================
@@ -74,7 +74,7 @@ int GappedXdropAligner::align(const uchar *seq1,
   const SimdInt mInsGrowCost = simdFill(insExtensionCost);
   const int seqIncrement = isForward ? 1 : -1;
 
-  size_t seq1beg = 0;
+  int numCells = 1;
   size_t seq1end = 1;
   size_t diagPos = xdropPadLen - 1;
   size_t horiPos = xdropPadLen * 2 - 1;
@@ -103,9 +103,7 @@ int GappedXdropAligner::align(const uchar *seq1,
 
   size_t antidiagonal;
   for (antidiagonal = 0; /* noop */; ++antidiagonal) {
-    int numCells = seq1end - seq1beg;
     int n = numCells - 1;
-
     const const_int_ptr *s1 = &pssmQueue.fromEnd(n + simdLen);
     const uchar *s2 = seq2queue.begin();
 
@@ -129,7 +127,7 @@ int GappedXdropAligner::align(const uchar *seq1,
       const Score *z2 = &zScores[diagPos];
       int b = maxValue(x2[0], z1[0]-insExtensionCost, z2[0]-gapUnalignedCost);
       updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
-		  minScore, b, antidiagonal, seq1beg);
+		  minScore, b, antidiagonal, seq1end - numCells);
     }
 
     if (globality && isDelimiter1) {
@@ -200,6 +198,7 @@ int GappedXdropAligner::align(const uchar *seq1,
     thisPos += numCells;
 
     if (x0[n] > -INF / 2) {
+      ++numCells;
       ++seq1end;
       const int *x = scorer[*seq1];
       pssmQueue.push(x, n + simdLen);
@@ -213,10 +212,10 @@ int GappedXdropAligner::align(const uchar *seq1,
       seq2 += seqIncrement;
       isDelimiter2 = isDelimiter(y, scorer[0]);
     } else {
-      ++seq1beg;
+      --numCells;
+      if (numCells == 0) break;
       ++diagPos;
       ++horiPos;
-      if (seq1beg == seq1end) break;
     }
   }
 


=====================================
src/GappedXdropAligner2qual.cc
=====================================
@@ -27,7 +27,7 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
                                    int maxMatchScore) {
   const int seqIncrement = isForward ? 1 : -1;
 
-  size_t seq1beg = 0;
+  int numCells = 1;
   size_t seq1end = 1;
   size_t diagPos = xdropPadLen - 1;
   size_t horiPos = xdropPadLen * 2 - 1;
@@ -41,15 +41,13 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
 
   size_t antidiagonal;
   for (antidiagonal = 0; /* noop */; ++antidiagonal) {
-    int numCells = seq1end - seq1beg;
     int n = numCells - 1;
-
-    size_t seq2pos = antidiagonal - seq1beg;
-
-    const uchar *s1 = isForward ? seq1 + seq1beg : seq1 - seq1beg;
-    const uchar *q1 = isForward ? qual1 + seq1beg : qual1 - seq1beg;
-    const uchar *s2 = isForward ? seq2 + seq2pos : seq2 - seq2pos;
-    const uchar *q2 = isForward ? qual2 + seq2pos : qual2 - seq2pos;
+    const uchar *s1 = seq1;
+    const uchar *q1 = qual1;
+    const uchar *s2 = seq2;
+    const uchar *q2 = qual2;
+    seq2 += seqIncrement;
+    qual2 += seqIncrement;
 
     initAntidiagonal(antidiagonal + 2, seq1end, thisPos, numCells);
     thisPos += xdropPadLen;
@@ -73,7 +71,7 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
       const Score *z2 = &zScores[diagPos];
       int b = maxValue(x2[0], z1[0]-insExtensionCost, z2[0]-gapUnalignedCost);
       updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
-		  minScore, b, antidiagonal, seq1beg);
+		  minScore, b, antidiagonal, seq1end - numCells);
     }
 
     if (globality && isDelimiter1) {
@@ -154,14 +152,19 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
     thisPos += numCells;
 
     if (x0[0] > -INF / 2) {
+      ++numCells;
       ++seq1end;
     }
 
     if (x0[-n] <= -INF / 2) {
-      ++seq1beg;
+      --numCells;
+      if (numCells == 0) break;
       ++diagPos;
       ++horiPos;
-      if (seq1beg >= seq1end) break;
+      seq1 += seqIncrement;
+      qual1 += seqIncrement;
+      seq2 -= seqIncrement;
+      qual2 -= seqIncrement;
     }
   }
 


=====================================
src/GappedXdropAlignerDna.cc
=====================================
@@ -54,7 +54,7 @@ int GappedXdropAligner::alignDna(const uchar *seq1,
 		 scorer[1][3], scorer[1][2], scorer[1][1], scorer[1][0],
 		 scorer[0][3], scorer[0][2], scorer[0][1], scorer[0][0]);
 
-  size_t seq1beg = 0;
+  int numCells = 1;
   size_t seq1end = 1;
   size_t diagPos = xdropPadLen - 1;
   size_t horiPos = xdropPadLen * 2 - 1;
@@ -83,9 +83,7 @@ int GappedXdropAligner::alignDna(const uchar *seq1,
 
   size_t antidiagonal;
   for (antidiagonal = 2; /* noop */; ++antidiagonal) {
-    int numCells = seq1end - seq1beg;
     int n = numCells - 1;
-
     const uchar *s1 = &seq1queue.fromEnd(n + seqLoadLen);
     const uchar *s2 = seq2queue.begin();
 
@@ -199,6 +197,7 @@ int GappedXdropAligner::alignDna(const uchar *seq1,
     thisPos += numCells;
 
     if (x0[n] != droppedTinyScore) {
+      ++numCells;
       ++seq1end;
       uchar x = toUnmasked[*seq1];
       seq1queue.push(x, n + seqLoadLen);
@@ -217,10 +216,10 @@ int GappedXdropAligner::alignDna(const uchar *seq1,
 	isDna = false;
       }
     } else {
-      ++seq1beg;
+      --numCells;
+      if (numCells == 0) break;
       ++diagPos;
       ++horiPos;
-      if (seq1beg == seq1end) break;
     }
   }
 


=====================================
src/GappedXdropAlignerFrame.cc
=====================================
@@ -30,7 +30,10 @@ int GappedXdropAligner::alignFrame(const uchar *seq1,
 				   const ScoreMatrixRow *scorer,
 				   const GapCosts &gap,
 				   int maxScoreDrop) {
-  const const_uchar_ptr frames[] = {seq2frame0, seq2frame1, seq2frame2};
+  if (!isForward) {
+    --seq1; --seq2frame0; --seq2frame1; --seq2frame2;
+  }
+  const_uchar_ptr frames[] = {seq2frame0, seq2frame1, seq2frame2};
   const int seqIncrement = isForward ? 1 : -1;
 
   const int delOpenScore = -gap.delPieces[0].openCost;
@@ -44,7 +47,7 @@ int GappedXdropAligner::alignFrame(const uchar *seq1,
 
   int runOfDrops = 2;
   int runOfEdges = 0;
-  size_t seq1beg = 0;
+  int numCells = 1;
   size_t seq1end = 1;
   size_t diagPos6 = xdropPadLen - 1;
   size_t horiPos5 = xdropPadLen * 2 - 1;
@@ -60,13 +63,9 @@ int GappedXdropAligner::alignFrame(const uchar *seq1,
   initFrame();
 
   for (size_t antidiagonal = 0; /* noop */; ++antidiagonal) {
-    int numCells = seq1end - seq1beg;
-    int n = numCells - 1;
-
-    const uchar *seq2 = frames[antidiagonal % 3];
-    size_t seq2pos = antidiagonal / 3 - seq1beg;
-    const uchar *s1 = isForward ? seq1 + seq1beg : seq1 - seq1beg - 1;
-    const uchar *s2 = isForward ? seq2 + seq2pos : seq2 - seq2pos - 1;
+    const uchar *s1 = seq1;
+    const uchar *s2 = frames[antidiagonal % 3];
+    frames[antidiagonal % 3] += seqIncrement;
 
     initAntidiagonal(antidiagonal + 6, seq1end, thisPos, numCells);
     thisPos += xdropPadLen;
@@ -114,9 +113,11 @@ int GappedXdropAligner::alignFrame(const uchar *seq1,
     vertPos1 = thisPos;
     thisPos += numCells;
 
+    int n = numCells - 1;
     if (X0[n] > -INF / 2 || runOfEdges) {
       ++runOfEdges;
       if (runOfEdges == 3) {
+	++numCells;
 	++seq1end;
 	runOfEdges = 0;
       }
@@ -127,8 +128,12 @@ int GappedXdropAligner::alignFrame(const uchar *seq1,
     } else {
       ++runOfDrops;
       if (runOfDrops == 3) {
-	++seq1beg;
-	if (seq1beg == seq1end) break;
+	--numCells;
+	if (numCells == 0) break;
+	seq1 += seqIncrement;
+	frames[0] -= seqIncrement;
+	frames[1] -= seqIncrement;
+	frames[2] -= seqIncrement;
 	++diagPos6;
 	++horiPos5;
 	++horiPos4;


=====================================
src/GappedXdropAlignerPssm.cc
=====================================
@@ -25,7 +25,7 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
   const SimdInt mInsGrowCost = simdFill(insExtensionCost);
   const int seqIncrement = isForward ? 1 : -1;
 
-  size_t seq1beg = 0;
+  int numCells = 1;
   size_t seq1end = 1;
   size_t diagPos = xdropPadLen - 1;
   size_t horiPos = xdropPadLen * 2 - 1;
@@ -54,9 +54,7 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
 
   size_t antidiagonal;
   for (antidiagonal = 0; /* noop */; ++antidiagonal) {
-    int numCells = seq1end - seq1beg;
     int n = numCells - 1;
-
     const uchar *s1 = &seq1queue.fromEnd(n + simdLen);
     const const_int_ptr *s2 = &pssmQueue.fromEnd(1);
 
@@ -80,7 +78,7 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
       const Score *z2 = &zScores[diagPos];
       int b = maxValue(x2[0], z1[0]-insExtensionCost, z2[0]-gapUnalignedCost);
       updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
-		  minScore, b, antidiagonal, seq1beg);
+		  minScore, b, antidiagonal, seq1end - numCells);
     }
 
     if (globality && isDelimiter1) {
@@ -151,6 +149,7 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
     thisPos += numCells;
 
     if (x0[n] > -INF / 2) {
+      ++numCells;
       ++seq1end;
       uchar x = *seq;
       seq1queue.push(x, n + simdLen);
@@ -165,10 +164,10 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
       pssm += seqIncrement;
       isDelimiter2 = isDelimiter(0, y);
     } else {
-      ++seq1beg;
+      --numCells;
+      if (numCells == 0) break;
       ++diagPos;
       ++horiPos;
-      if (seq1beg == seq1end) break;
     }
   }
 


=====================================
src/LastalArguments.cc
=====================================
@@ -382,7 +382,10 @@ LAST home page: http://last.cbrc.jp/\n\
   if( isTranslated() && inputFormat == 5 )
     ERR( "can't combine option -F with option -Q 5" );
 
-  if( isFrameshift() && outputType > 3 )
+  if( frameshiftCosts.size() == 1 && frameshiftCosts[0] > 0 && outputType > 3 )
+    ERR( "can't combine option -F > 0 with option -j > 3" );
+
+  if( isFrameshift() && outputType > 4 && outputType < 7 )
     ERR( "can't combine option -F > 0 with option -j > 3" );
 
   if( isFrameshift() && globality == 1 )


=====================================
src/lastal.cc
=====================================
@@ -12,15 +12,12 @@
 #include "GeneticCode.hh"
 #include "SubsetMinimizerFinder.hh"
 #include "SubsetSuffixArray.hh"
-#include "Centroid.hh"
 #include "AlignmentPot.hh"
 #include "Alignment.hh"
 #include "SegmentPairPot.hh"
-#include "SegmentPair.hh"
 #include "ScoreMatrix.hh"
 #include "TantanMasker.hh"
 #include "DiagonalTable.hh"
-#include "GreedyXdropAligner.hh"
 #include "gaplessXdrop.hh"
 #include "gaplessPssmXdrop.hh"
 #include "gaplessTwoQualityXdrop.hh"
@@ -28,6 +25,9 @@
 #include "zio.hh"
 #include "stringify.hh"
 #include "threadUtil.hh"
+
+#include <math.h>
+
 #include <iomanip>  // setw
 #include <iostream>
 #include <fstream>
@@ -45,21 +45,26 @@ static void warn( const char* programName, const char* s ){
 using namespace cbrc;
 
 struct LastAligner {  // data that changes between queries
-  Centroid centroid;
-  GreedyXdropAligner greedyAligner;
+  Aligners engines;
   std::vector<int> qualityPssm;
   std::vector<AlignmentText> textAlns;
 };
 
 struct SubstitutionMatrices {
-  int scores[scoreMatrixRowSize][scoreMatrixRowSize];
-  int scoresMasked[scoreMatrixRowSize][scoreMatrixRowSize];
   mcf::SubstitutionMatrixStats stats;
-  OneQualityScoreMatrix oneQual;
-  OneQualityScoreMatrix oneQualMasked;
   OneQualityExpMatrix oneQualExp;
   QualityPssmMaker maker;
+
+  int scores[scoreMatrixRowSize][scoreMatrixRowSize];
+  double ratiosMatrix[scoreMatrixRowSize][scoreMatrixRowSize];
+  double *ratios[scoreMatrixRowSize];
+  OneQualityScoreMatrix oneQual;
   TwoQualityScoreMatrix twoQual;
+
+  int scoresMasked[scoreMatrixRowSize][scoreMatrixRowSize];
+  double ratiosMaskedMatrix[scoreMatrixRowSize][scoreMatrixRowSize];
+  double *ratiosMasked[scoreMatrixRowSize];
+  OneQualityScoreMatrix oneQualMasked;
   TwoQualityScoreMatrix twoQualMasked;
 };
 
@@ -118,7 +123,9 @@ calculateSubstitutionScoreMatrixStatistics(const std::string &matrixName) {
     }
   } else {
     if (scoreMatrix.isCodonCols()) {
-      LOG("can't calculate probabilities");
+      static const char msg[] = "can't calculate probabilities";
+      if (args.outputType > 3) ERR(msg);  // xxx what if temperature is given?
+      LOG(msg);
       return;
     } else if (args.temperature < 0) {
       const char *canonicalMatrixName = ScoreMatrix::canonicalName(matrixName);
@@ -175,9 +182,9 @@ void makeScoreMatrix( const std::string& matrixName,
   }
 
   scoreMatrix.init(alph.encode);
+  const mcf::SubstitutionMatrixStats &stats = fwdMatrices.stats;
   if (args.outputType > 0) {
     calculateSubstitutionScoreMatrixStatistics(matrixName);
-    const mcf::SubstitutionMatrixStats &stats = fwdMatrices.stats;
     if (!stats.isBad()) {
       scoreMatrix.addAmbiguousScores(alph.letters == alph.dna,
 				     args.ambiguousLetterOpt % 2,
@@ -208,6 +215,20 @@ void makeScoreMatrix( const std::string& matrixName,
     complementMatrix(fwdMatrices.scores, revMatrices.scores);
     complementMatrix(fwdMatrices.scoresMasked, revMatrices.scoresMasked);
   }
+
+  double s = stats.lambda();
+  for (unsigned i = 0; i < scoreMatrixRowSize; ++i) {
+    fwdMatrices.ratios[i] = fwdMatrices.ratiosMatrix[i];
+    revMatrices.ratios[i] = revMatrices.ratiosMatrix[i];
+    fwdMatrices.ratiosMasked[i] = fwdMatrices.ratiosMaskedMatrix[i];
+    revMatrices.ratiosMasked[i] = revMatrices.ratiosMaskedMatrix[i];
+    for (unsigned j = 0; j < scoreMatrixRowSize; ++j) {
+      fwdMatrices.ratios[i][j] = exp(s * fwdMatrices.scores[i][j]);
+      revMatrices.ratios[i][j] = exp(s * revMatrices.scores[i][j]);
+      fwdMatrices.ratiosMasked[i][j] = exp(s * fwdMatrices.scoresMasked[i][j]);
+      revMatrices.ratiosMasked[i][j] = exp(s * revMatrices.scoresMasked[i][j]);
+    }
+  }
 }
 
 void makeQualityScorers(SubstitutionMatrices &m, bool isCheck) {
@@ -443,6 +464,7 @@ struct Dispatcher{
   const uchar* j;  // the query quality data
   const ScoreMatrixRow* p;  // the query PSSM
   const ScoreMatrixRow* m;  // the score matrix
+  const const_dbl_ptr* r;   // the substitution probability ratios
   const TwoQualityScoreMatrix& t;
   int d;  // the maximum score drop
   int z;
@@ -455,6 +477,7 @@ struct Dispatcher{
       j( getQueryQual(queryNum) ),
       p( getQueryPssm(aligner, queryNum) ),
       m( isMaskLowercase(e) ? matrices.scoresMasked : matrices.scores ),
+      r( isMaskLowercase(e) ? matrices.ratiosMasked : matrices.ratios ),
       t( isMaskLowercase(e) ? matrices.twoQualMasked : matrices.twoQual ),
       d( (e == Phase::gapless) ? args.maxDropGapless :
          (e == Phase::gapped ) ? args.maxDropGapped : args.maxDropFinal ),
@@ -719,9 +742,10 @@ void alignGapped( LastAligner& aligner,
     shrinkToLongestIdenticalRun( aln.seed, dis );
 
     // do gapped extension from each end of the seed:
-    aln.makeXdrop(aligner.centroid, aligner.greedyAligner, args.isGreedy,
+    aln.makeXdrop(aligner.engines, args.isGreedy,
 		  dis.a, dis.b, args.globality, dis.m,
-		  scoreMatrix.maxScore, scoreMatrix.minScore, gapCosts,
+		  scoreMatrix.maxScore, scoreMatrix.minScore,
+		  dis.r, matrices.stats.lambda(), gapCosts,
 		  dis.d, frameSize, dis.p, dis.t, dis.i, dis.j, alph, extras);
     ++gappedExtensionCount;
 
@@ -753,7 +777,7 @@ void alignGapped( LastAligner& aligner,
 void alignFinish( LastAligner& aligner, const AlignmentPot& gappedAlns,
 		  size_t queryNum, const SubstitutionMatrices &matrices,
 		  const uchar* querySeq ){
-  Centroid& centroid = aligner.centroid;
+  Centroid& centroid = aligner.engines.centroid;
   Dispatcher dis(Phase::final, aligner, queryNum, matrices, querySeq);
   size_t queryLen = query.padLen(queryNum);
   size_t frameSize = args.isFrameshift() ? (queryLen / 3) : 0;
@@ -786,9 +810,10 @@ void alignFinish( LastAligner& aligner, const AlignmentPot& gappedAlns,
       Alignment probAln;
       AlignmentExtras extras;
       probAln.seed = aln.seed;
-      probAln.makeXdrop(centroid, aligner.greedyAligner, args.isGreedy,
+      probAln.makeXdrop(aligner.engines, args.isGreedy,
 			dis.a, dis.b, args.globality, dis.m,
-			scoreMatrix.maxScore, scoreMatrix.minScore, gapCosts,
+			scoreMatrix.maxScore, scoreMatrix.minScore,
+			dis.r, matrices.stats.lambda(), gapCosts,
 			dis.d, frameSize, dis.p, dis.t, dis.i, dis.j, alph,
 			extras, args.gamma, args.outputType);
       assert( aln.score != -INF );
@@ -1232,7 +1257,8 @@ void lastal( int argc, char** argv ){
   makeScoreMatrix( matrixName, matrixFile );
   gapCosts.assign(args.delOpenCosts, args.delGrowCosts,
 		  args.insOpenCosts, args.insGrowCosts,
-		  args.frameshiftCosts, args.gapPairCost);
+		  args.frameshiftCosts, args.gapPairCost,
+		  fwdMatrices.stats.lambda());
 
   if( args.isTranslated() ){
     if( isDna )  // allow user-defined alphabet


=====================================
src/makefile
=====================================
@@ -33,11 +33,11 @@ ScoreMatrix.o SubsetMinimizerFinder.o TantanMasker.o			\
 dna_words_finder.o fileMap.o tantan.o LastalArguments.o			\
 GappedXdropAligner.o GappedXdropAlignerDna.o GappedXdropAlignerPssm.o	\
 GappedXdropAligner2qual.o GappedXdropAligner3frame.o			\
-GappedXdropAlignerFrame.o mcf_gap_costs.o GeneticCode.o			\
-GreedyXdropAligner.o LastEvaluer.o OneQualityScoreMatrix.o		\
-QualityPssmMaker.o TwoQualityScoreMatrix.o gaplessXdrop.o		\
-gaplessPssmXdrop.o gaplessTwoQualityXdrop.o cbrc_linalg.o		\
-mcf_substitution_matrix_stats.o $(alpObj)
+GappedXdropAlignerFrame.o mcf_frameshift_xdrop_aligner.o		\
+mcf_gap_costs.o GeneticCode.o GreedyXdropAligner.o LastEvaluer.o	\
+OneQualityScoreMatrix.o QualityPssmMaker.o TwoQualityScoreMatrix.o	\
+gaplessXdrop.o gaplessPssmXdrop.o gaplessTwoQualityXdrop.o		\
+cbrc_linalg.o mcf_substitution_matrix_stats.o $(alpObj)
 
 alignObj4 = Alignment.o AlignmentPot.o AlignmentWrite.o Centroid.o	\
 DiagonalTable.o MultiSequence.o MultiSequenceQual.o SegmentPair.o	\
@@ -144,19 +144,26 @@ depend:
 	$(CXX) -MM alp/*.cpp | sed 's|.*:|alp/&|' >> m
 	$(CXX) -MM -I. split/*.cc | sed 's|.*:|split/&|' | $(fix8) >> m
 	mv m makefile
-Alignment.o Alignment.o8: Alignment.cc Alignment.hh ScoreMatrixRow.hh SegmentPair.hh \
- mcf_gap_costs.hh Alphabet.hh Centroid.hh GappedXdropAligner.hh \
- mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_simd.hh \
- OneQualityScoreMatrix.hh mcf_substitution_matrix_stats.hh GeneticCode.hh \
- GreedyXdropAligner.hh TwoQualityScoreMatrix.hh
-AlignmentPot.o AlignmentPot.o8: AlignmentPot.cc AlignmentPot.hh Alignment.hh \
- ScoreMatrixRow.hh SegmentPair.hh mcf_gap_costs.hh
-AlignmentWrite.o AlignmentWrite.o8: AlignmentWrite.cc Alignment.hh ScoreMatrixRow.hh \
- SegmentPair.hh mcf_gap_costs.hh GeneticCode.hh LastEvaluer.hh \
- alp/sls_alignment_evaluer.hpp alp/sls_pvalues.hpp alp/sls_basic.hpp \
- alp/sls_falp_alignment_evaluer.hpp alp/sls_fsa1_pvalues.hpp \
- MultiSequence.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \
- Alphabet.hh
+Alignment.o Alignment.o8: Alignment.cc Alignment.hh Centroid.hh GappedXdropAligner.hh \
+ mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_gap_costs.hh \
+ mcf_simd.hh ScoreMatrixRow.hh SegmentPair.hh OneQualityScoreMatrix.hh \
+ mcf_substitution_matrix_stats.hh GreedyXdropAligner.hh \
+ mcf_frameshift_xdrop_aligner.hh Alphabet.hh GeneticCode.hh \
+ TwoQualityScoreMatrix.hh
+AlignmentPot.o AlignmentPot.o8: AlignmentPot.cc AlignmentPot.hh Alignment.hh Centroid.hh \
+ GappedXdropAligner.hh mcf_contiguous_queue.hh mcf_reverse_queue.hh \
+ mcf_gap_costs.hh mcf_simd.hh ScoreMatrixRow.hh SegmentPair.hh \
+ OneQualityScoreMatrix.hh mcf_substitution_matrix_stats.hh \
+ GreedyXdropAligner.hh mcf_frameshift_xdrop_aligner.hh
+AlignmentWrite.o AlignmentWrite.o8: AlignmentWrite.cc Alignment.hh Centroid.hh \
+ GappedXdropAligner.hh mcf_contiguous_queue.hh mcf_reverse_queue.hh \
+ mcf_gap_costs.hh mcf_simd.hh ScoreMatrixRow.hh SegmentPair.hh \
+ OneQualityScoreMatrix.hh mcf_substitution_matrix_stats.hh \
+ GreedyXdropAligner.hh mcf_frameshift_xdrop_aligner.hh GeneticCode.hh \
+ LastEvaluer.hh alp/sls_alignment_evaluer.hpp alp/sls_pvalues.hpp \
+ alp/sls_basic.hpp alp/sls_falp_alignment_evaluer.hpp \
+ alp/sls_fsa1_pvalues.hpp MultiSequence.hh VectorOrMmap.hh Mmap.hh \
+ fileMap.hh stringify.hh Alphabet.hh
 Alphabet.o Alphabet.o8: Alphabet.cc Alphabet.hh
 Centroid.o Centroid.o8: Centroid.cc Centroid.hh GappedXdropAligner.hh \
  mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_gap_costs.hh \
@@ -251,11 +258,12 @@ lastal.o lastal.o8: lastal.cc last.hh Alphabet.hh CyclicSubsetSeed.hh \
  mcf_substitution_matrix_stats.hh TwoQualityScoreMatrix.hh LastEvaluer.hh \
  alp/sls_alignment_evaluer.hpp alp/sls_pvalues.hpp alp/sls_basic.hpp \
  alp/sls_falp_alignment_evaluer.hpp alp/sls_fsa1_pvalues.hpp \
- GeneticCode.hh SubsetMinimizerFinder.hh SubsetSuffixArray.hh Centroid.hh \
- GappedXdropAligner.hh mcf_contiguous_queue.hh mcf_reverse_queue.hh \
- mcf_gap_costs.hh mcf_simd.hh SegmentPair.hh AlignmentPot.hh Alignment.hh \
- SegmentPairPot.hh ScoreMatrix.hh TantanMasker.hh tantan.hh \
- DiagonalTable.hh GreedyXdropAligner.hh gaplessXdrop.hh \
+ GeneticCode.hh SubsetMinimizerFinder.hh SubsetSuffixArray.hh \
+ AlignmentPot.hh Alignment.hh Centroid.hh GappedXdropAligner.hh \
+ mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_gap_costs.hh \
+ mcf_simd.hh SegmentPair.hh GreedyXdropAligner.hh \
+ mcf_frameshift_xdrop_aligner.hh SegmentPairPot.hh ScoreMatrix.hh \
+ TantanMasker.hh tantan.hh DiagonalTable.hh gaplessXdrop.hh \
  gaplessPssmXdrop.hh gaplessTwoQualityXdrop.hh zio.hh mcf_zstream.hh \
  threadUtil.hh version.hh
 lastdb.o lastdb.o8: lastdb.cc last.hh Alphabet.hh CyclicSubsetSeed.hh \
@@ -263,6 +271,8 @@ lastdb.o lastdb.o8: lastdb.cc last.hh Alphabet.hh CyclicSubsetSeed.hh \
  stringify.hh SequenceFormat.hh dna_words_finder.hh qualityScoreUtil.hh \
  LastdbArguments.hh SubsetSuffixArray.hh TantanMasker.hh tantan.hh zio.hh \
  mcf_zstream.hh threadUtil.hh version.hh
+mcf_frameshift_xdrop_aligner.o mcf_frameshift_xdrop_aligner.o8: mcf_frameshift_xdrop_aligner.cc \
+ mcf_frameshift_xdrop_aligner.hh mcf_gap_costs.hh
 mcf_gap_costs.o mcf_gap_costs.o8: mcf_gap_costs.cc mcf_gap_costs.hh
 mcf_substitution_matrix_stats.o mcf_substitution_matrix_stats.o8: mcf_substitution_matrix_stats.cc \
  mcf_substitution_matrix_stats.hh LambdaCalculator.hh cbrc_linalg.hh


=====================================
src/mcf_frameshift_xdrop_aligner.cc
=====================================
@@ -0,0 +1,379 @@
+// Author: Martin C. Frith 2020
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "mcf_frameshift_xdrop_aligner.hh"
+
+#include <assert.h>
+#include <math.h>
+
+//#include <iostream>  // for debugging
+
+namespace mcf {
+
+double FrameshiftXdropAligner::forward(const uchar *protein,
+				       const uchar *frame0,
+				       const uchar *frame1,
+				       const uchar *frame2,
+				       bool isRightwardExtension,
+				       const const_dbl_ptr *substitutionProbs,
+				       const GapCosts &gapCosts,
+				       double probDropLimit) {
+  if (isRightwardExtension) {
+    --protein; --frame0; --frame1; --frame2;
+  }
+  proteinPtr = protein;
+  frames[0] = frame0;
+  frames[1] = frame1;
+  frames[2] = frame2;
+  const int seqIncrement = isRightwardExtension ? 1 : -1;
+
+  const double delOpenProb = gapCosts.delProbPieces[0].openProb;
+  const double insOpenProb = gapCosts.insProbPieces[0].openProb;
+  const double delProb1 = gapCosts.delProb1;
+  const double delProb2 = gapCosts.delProb2;
+  const double delProb3 = gapCosts.delProb3;
+  const double insProb1 = gapCosts.insProb1;
+  const double insProb2 = gapCosts.insProb2;
+  const double insProb3 = gapCosts.insProb3;
+
+  int runOfDrops = 2;
+  int runOfEdges = 0;
+  int numCells = 1;
+  size_t proteinEnd = 1;
+  size_t diagPos6 = padSize - 1;  // xxx or 0 ?
+  size_t horiPos5 = diagPos6 + padSize;
+  size_t horiPos4 = horiPos5 + padSize;
+  size_t horiPos3 = horiPos4 + padSize;
+  size_t vertPos2 = horiPos3 + padSize + 1;
+  size_t vertPos1 = vertPos2 + padSize;
+  size_t thisPos  = vertPos1;  // xxx or + padSize ?
+  size_t antidiagonal = 0;
+
+  double sumOfProbRatios = 0;
+  double logSumOfProbRatios = 0;
+
+  if (xdropShape.empty()) xdropShape.assign(1, thisPos);
+
+  resizeFwdProbsIfSmaller(thisPos);
+  double behindProb = substitutionProbs[*protein][*frame0];
+  assert(behindProb > 0);  // XXX might fail?
+  xFwdProbs[padSize - 1] = 1 / behindProb;  // xxx or padSize ?
+
+  while (1) {
+    const uchar *s1 = proteinPtr;
+    frames[antidiagonal % 3] += seqIncrement;
+    const uchar *s2 = frames[antidiagonal % 3];
+
+    resizeFwdProbsIfSmaller(thisPos + padSize + numCells);
+    for (int i = 0; i < padSize; ++i) {
+      xFwdProbs[thisPos + i] = 0;
+      yFwdProbs[thisPos + i] = 0;
+      zFwdProbs[thisPos + i] = 0;
+    }
+    thisPos += padSize;
+    Prob *X0 = &xFwdProbs[thisPos];
+    Prob *Y0 = &yFwdProbs[thisPos];
+    Prob *Z0 = &zFwdProbs[thisPos];
+    const Prob *X1 = &xFwdProbs[vertPos1];
+    const Prob *X2 = &xFwdProbs[vertPos2];
+    const Prob *Y3 = &yFwdProbs[horiPos3];
+    const Prob *Z3 = &zFwdProbs[horiPos3 + 1];
+    const Prob *X4 = &xFwdProbs[horiPos4];
+    const Prob *X5 = &xFwdProbs[horiPos5];
+    const Prob *X6 = &xFwdProbs[diagPos6];
+
+    double minProb = sumOfProbRatios * probDropLimit;
+
+    bool isDelimiter2 = (substitutionProbs[0][*s2] <= 0);
+
+    for (int i = 0; i < numCells; ++i) {
+      s2 -= seqIncrement;
+      double x = X6[i] * substitutionProbs[*s1][*s2];
+      double y = X5[i] * delProb1 + X4[i] * delProb2 + Y3[i] * delProb3;
+      double z = X1[i] * insProb1 + X2[i] * insProb2 + Z3[i] * insProb3;
+      double b = x + y * delOpenProb + z * insOpenProb;
+      X0[i] = b;
+      Y0[i] = b + y;
+      Z0[i] = b + z;
+      sumOfProbRatios += b;
+      s1 += seqIncrement;
+    }
+
+    bool isDelimiter1 = (substitutionProbs[*s1][0] <= 0);
+
+    diagPos6 = horiPos5;
+    horiPos5 = horiPos4;
+    horiPos4 = horiPos3;
+    horiPos3 = vertPos2 - 1;
+    vertPos2 = vertPos1;
+    vertPos1 = thisPos;
+    thisPos += numCells;
+
+    ++antidiagonal;
+
+    size_t a2 = antidiagonal * 2;
+    if (xdropShape.size() < a2 + 1) xdropShape.resize(a2 + 1);
+    xdropShape[a2 - 1] = proteinEnd;
+    xdropShape[a2] = thisPos;
+
+    int n = numCells - 1;
+    bool isOkBeg = (X0[0] >= minProb && !isDelimiter2);
+    bool isOkEnd = (X0[n] >= minProb && !isDelimiter1);
+
+    if (isOkEnd || runOfEdges) {
+      ++runOfEdges;
+      if (runOfEdges == 3) {
+	++numCells;
+	++proteinEnd;
+	runOfEdges = 0;
+      }
+    }
+
+    if (isOkBeg) {
+      runOfDrops = 0;
+    } else {
+      ++runOfDrops;
+      if (runOfDrops == 3) {
+	--numCells;
+	if (numCells == 0) break;
+	proteinPtr += seqIncrement;
+	frames[0] -= seqIncrement;
+	frames[1] -= seqIncrement;
+	frames[2] -= seqIncrement;
+	++diagPos6; ++horiPos5; ++horiPos4; ++horiPos3; ++vertPos2; ++vertPos1;
+	runOfDrops = 0;
+      }
+    }
+
+    if (antidiagonal % rescaleStep == 0) {
+      assert(sumOfProbRatios > 0);
+      double scale = 1 / sumOfProbRatios;
+      size_t numOfRescales = antidiagonal / rescaleStep;
+      if (rescales.size() < numOfRescales) rescales.resize(numOfRescales);
+      rescales[numOfRescales - 1] = scale;
+      rescaleFwdProbs(begInProbs(antidiagonal - 6), thisPos, scale);
+      logSumOfProbRatios += log(sumOfProbRatios);
+      sumOfProbRatios = 1;
+    }
+  }
+
+  numOfAntidiagonals = antidiagonal;
+  rescaledSumOfProbRatios = sumOfProbRatios;
+  return logSumOfProbRatios + log(sumOfProbRatios);
+}
+
+void FrameshiftXdropAligner::backward(bool isRightwardExtension,
+				      const const_dbl_ptr *substitutionProbs,
+				      const GapCosts &gapCosts) {
+  const int seqIncrement = isRightwardExtension ? 1 : -1;
+
+  const double delOpenProb = gapCosts.delProbPieces[0].openProb;
+  const double insOpenProb = gapCosts.insProbPieces[0].openProb;
+  const double delProb1 = gapCosts.delProb1;
+  const double delProb2 = gapCosts.delProb2;
+  const double delProb3 = gapCosts.delProb3;
+  const double insProb1 = gapCosts.insProb1;
+  const double insProb2 = gapCosts.insProb2;
+  const double insProb3 = gapCosts.insProb3;
+
+  size_t diagPos6 = padSize - 1;
+  size_t horiPos5 = diagPos6 + padSize;
+  size_t horiPos4 = horiPos5 + padSize;
+  size_t horiPos3 = horiPos4 + padSize;
+  size_t vertPos2 = horiPos3 + padSize + 1;
+  size_t vertPos1 = vertPos2 + padSize;
+  size_t thisPos  = vertPos1;
+  size_t antidiagonal = numOfAntidiagonals;
+
+  double scaledUnit = 1 / rescaledSumOfProbRatios;
+
+  resizeBckProbsIfSmaller(begInProbs(numOfAntidiagonals));
+
+  while (1) {
+    --antidiagonal;
+    const uchar *s1 = proteinPtr;
+    frames[antidiagonal % 3] -= seqIncrement;
+    const uchar *s2 = frames[antidiagonal % 3];
+
+    int numCells = numOfCellsInAntidiagonal(antidiagonal);
+    for (int i = 0; i < padSize; ++i) {
+      xBckProbs[thisPos + i] = 0;
+      yBckProbs[thisPos + i] = 0;
+      zBckProbs[thisPos + i] = 0;
+    }
+    thisPos += padSize;
+    Prob *X0 = &xBckProbs[thisPos];
+    Prob *Y0 = &yBckProbs[thisPos];
+    Prob *Z0 = &zBckProbs[thisPos];
+    const Prob *Z1 = &zBckProbs[vertPos1];
+    const Prob *Z2 = &zBckProbs[vertPos2];
+    const Prob *Y3 = &yBckProbs[horiPos3];
+    const Prob *Z3 = &zBckProbs[horiPos3 + 1];
+    const Prob *Y4 = &yBckProbs[horiPos4];
+    const Prob *Y5 = &yBckProbs[horiPos5];
+    const Prob *X6 = &xBckProbs[diagPos6];
+
+    for (int i = 0; i < numCells; ++i) {
+      double x  = X6[i];
+      double y1 = Y5[i] * delProb1;
+      double y2 = Y4[i] * delProb2;
+      double y3 = Y3[i] * delProb3;
+      double z1 = Z1[i] * insProb1;
+      double z2 = Z2[i] * insProb2;
+      double z3 = Z3[i] * insProb3;
+      double b  = x + y1 + y2 + y3 + z1 + z2 + z3 + scaledUnit;
+      X0[i] = b * substitutionProbs[*s1][*s2];
+      Y0[i] = b * delOpenProb + y3;
+      Z0[i] = b * insOpenProb + z3;
+      s1 -= seqIncrement;
+      s2 += seqIncrement;
+    }
+
+    diagPos6 = horiPos5;
+    horiPos5 = horiPos4;
+    horiPos4 = horiPos3;
+    horiPos3 = vertPos2 - 1;
+    vertPos2 = vertPos1;
+    vertPos1 = thisPos;
+    thisPos += numCells;
+
+    if (antidiagonal == 0) break;
+
+    if (endInProtein(antidiagonal - 1) < endInProtein(antidiagonal)) {
+      proteinPtr -= seqIncrement;
+      frames[0] += seqIncrement;
+      frames[1] += seqIncrement;
+      frames[2] += seqIncrement;
+      ++diagPos6; ++horiPos5; ++horiPos4; ++horiPos3; ++vertPos2; ++vertPos1;
+    }
+
+    size_t offset = 6;
+    if ((antidiagonal + offset) % rescaleStep == 0 &&
+	antidiagonal + offset < numOfAntidiagonals) {
+      double scale = rescales[antidiagonal / rescaleStep];
+      size_t length = begInProbs(antidiagonal + 6) - begInProbs(antidiagonal);
+      rescaleBckProbs(thisPos - length, thisPos, scale);
+      scaledUnit *= scale;
+    }
+  }
+
+  assert(thisPos == begInProbs(numOfAntidiagonals));
+}
+
+void FrameshiftXdropAligner::count(bool isRightwardExtension,
+				   const GapCosts &gapCosts,
+				   const dbl_ptr *substitutionCounts,
+				   double *transitionCounts) {
+  const int seqIncrement = isRightwardExtension ? 1 : -1;
+
+  size_t diagPos6 = padSize - 1;
+  size_t horiPos5 = diagPos6 + padSize;
+  size_t horiPos4 = horiPos5 + padSize;
+  size_t horiPos3 = horiPos4 + padSize;
+  size_t vertPos2 = horiPos3 + padSize + 1;
+  size_t vertPos1 = vertPos2 + padSize;
+  size_t thisPos  = vertPos1;
+  size_t antidiagonal = numOfAntidiagonals;
+  size_t totalNumOfCells = begInProbs(numOfAntidiagonals);
+
+  size_t proteinEnd = endInProtein(antidiagonal - 1);
+  if (isRightwardExtension) {
+    proteinPtr += proteinEnd;
+    frames[0] += (antidiagonal + 2) / 3 - (proteinEnd - 1);
+    frames[1] += (antidiagonal + 1) / 3 - (proteinEnd - 1);
+    frames[2] += (antidiagonal + 0) / 3 - (proteinEnd - 1);
+  } else {
+    proteinPtr -= proteinEnd;
+    frames[0] -= (antidiagonal + 2) / 3 - (proteinEnd - 1);
+    frames[1] -= (antidiagonal + 1) / 3 - (proteinEnd - 1);
+    frames[2] -= (antidiagonal + 0) / 3 - (proteinEnd - 1);
+  }
+
+  double del1 = 0; double del2 = 0; double del3 = 0; double delNext = 0;
+  double ins1 = 0; double ins2 = 0; double ins3 = 0; double insNext = 0;
+
+  while (1) {
+    --antidiagonal;
+    const uchar *s1 = proteinPtr;
+    const uchar *s2 = frames[antidiagonal % 3];
+    frames[antidiagonal % 3] -= seqIncrement;
+
+    int numCells = numOfCellsInAntidiagonal(antidiagonal);
+    thisPos += padSize;
+    const Prob *Z1 = &zBckProbs[vertPos1];
+    const Prob *Z2 = &zBckProbs[vertPos2];
+    const Prob *Y3 = &yBckProbs[horiPos3];
+    const Prob *Z3 = &zBckProbs[horiPos3 + 1];
+    const Prob *Y4 = &yBckProbs[horiPos4];
+    const Prob *Y5 = &yBckProbs[horiPos5];
+    const Prob *X6 = &xBckProbs[diagPos6];
+
+    const Prob *fX0 = &xFwdProbs[totalNumOfCells - thisPos + padSize*7 - 1];
+    const Prob *fY0 = &yFwdProbs[totalNumOfCells - thisPos + padSize*7 - 1];
+    const Prob *fZ0 = &zFwdProbs[totalNumOfCells - thisPos + padSize*7 - 1];
+
+    double d1 = 0; double d2 = 0; double d3 = 0; double dNext = 0;
+    double i1 = 0; double i2 = 0; double i3 = 0; double iNext = 0;
+
+    for (int i = 0; i < numCells; ++i) {
+      double matchProb = X6[i] * fX0[-i];
+      substitutionCounts[*s1][*s2] += matchProb;
+      transitionCounts[0] += matchProb;
+      d1 += Y5[i] * fX0[-i];
+      d2 += Y4[i] * fX0[-i];
+      d3 += Y3[i] * fX0[-i];
+      dNext += Y3[i] * fY0[-i];
+      i1 += Z1[i] * fX0[-i];
+      i2 += Z2[i] * fX0[-i];
+      i3 += Z3[i] * fX0[-i];
+      iNext += Z3[i] * fZ0[-i];
+      s1 -= seqIncrement;
+      s2 += seqIncrement;
+    }
+
+    size_t mod = antidiagonal % rescaleStep;
+    if (mod >= rescaleStep - 6 &&
+	antidiagonal - mod + rescaleStep < numOfAntidiagonals) {
+      double mul = 1 / rescales[antidiagonal / rescaleStep];
+      if (mod < rescaleStep - 1) i1 *= mul;
+      if (mod < rescaleStep - 2) i2 *= mul;
+      if (mod < rescaleStep - 3) {
+	i3 *= mul; iNext *= mul; d3 *= mul; dNext *= mul;
+      }
+      if (mod < rescaleStep - 4) d2 *= mul;
+      if (mod < rescaleStep - 5) d1 *= mul;
+    }
+
+    ins1 += i1; ins2 += i2; ins3 += i3; insNext += iNext;
+    del1 += d1; del2 += d2; del3 += d3; delNext += dNext;
+
+    diagPos6 = horiPos5;
+    horiPos5 = horiPos4;
+    horiPos4 = horiPos3;
+    horiPos3 = vertPos2 - 1;
+    vertPos2 = vertPos1;
+    vertPos1 = thisPos;
+    thisPos += numCells;
+
+    if (antidiagonal == 0) break;
+
+    if (endInProtein(antidiagonal - 1) < endInProtein(antidiagonal)) {
+      proteinPtr -= seqIncrement;
+      frames[0] += seqIncrement;
+      frames[1] += seqIncrement;
+      frames[2] += seqIncrement;
+      ++diagPos6; ++horiPos5; ++horiPos4; ++horiPos3; ++vertPos2; ++vertPos1;
+    }
+  }
+
+  transitionCounts[1] += gapCosts.insProb1 * ins1;
+  transitionCounts[2] += gapCosts.insProb2 * ins2;
+  transitionCounts[3] += gapCosts.insProb3 * ins3;
+  transitionCounts[4] += gapCosts.insProb3 * (insNext - ins3);
+  transitionCounts[5] += gapCosts.delProb1 * del1;
+  transitionCounts[6] += gapCosts.delProb2 * del2;
+  transitionCounts[7] += gapCosts.delProb3 * del3;
+  transitionCounts[8] += gapCosts.delProb3 * (delNext - del3);
+}
+
+}


=====================================
src/mcf_frameshift_xdrop_aligner.hh
=====================================
@@ -0,0 +1,121 @@
+// Author: Martin C. Frith 2020
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef FRAMESHIFT_XDROP_ALIGNER_HH
+#define FRAMESHIFT_XDROP_ALIGNER_HH
+
+#include "mcf_gap_costs.hh"
+
+#include <stddef.h>
+
+#include <vector>
+
+namespace mcf {
+
+typedef const double *const_dbl_ptr;
+typedef double *dbl_ptr;
+
+class FrameshiftXdropAligner {
+  typedef unsigned char uchar;
+  typedef const uchar *const_uchar_ptr;
+  typedef double Prob;
+
+  enum { padSize = 1 };
+  enum { rescaleStep = 32 };
+
+public:
+  double forward(const uchar *protein,
+		 const uchar *frame0,  // translated DNA,  0 frame
+		 const uchar *frame1,  // translated DNA, +1 frame
+		 const uchar *frame2,  // translated DNA, +2 frame
+		 bool isRightwardExtension,
+		 const const_dbl_ptr *substitutionProbs,
+		 const GapCosts &gapCosts,
+		 double probDropLimit);
+
+  void backward(bool isRightwardExtension,
+		const const_dbl_ptr *substitutionProbs,
+		const GapCosts &gapCosts);
+
+  void count(bool isRightwardExtension,
+	     const GapCosts &gapCosts,
+	     const dbl_ptr *substitutionCounts,
+	     double *transitionCounts);
+
+  double matchProb(size_t proteinCoordinate, size_t dnaCoordinate) const {
+    size_t antidiagonal = proteinCoordinate * 3 + dnaCoordinate;
+    if (antidiagonal + 6 >= numOfAntidiagonals) return 0;
+    size_t a = antidiagonal * 2;
+    size_t i = xdropShape[a + 2] - xdropShape[a + 1] + proteinCoordinate;
+    if (i < xdropShape[a] + padSize || i >= xdropShape[a + 2]) return 0;
+    size_t b = (antidiagonal + 6) * 2;
+    size_t j = xdropShape[b + 2] - xdropShape[b + 1] + proteinCoordinate + 1;
+    size_t totalNumOfCells = xdropShape[numOfAntidiagonals * 2];
+    return xFwdProbs[i] * xBckProbs[totalNumOfCells - j + padSize*7 - 1];
+  }
+
+private:
+  std::vector<Prob> xFwdProbs;
+  std::vector<Prob> yFwdProbs;
+  std::vector<Prob> zFwdProbs;
+
+  std::vector<Prob> xBckProbs;
+  std::vector<Prob> yBckProbs;
+  std::vector<Prob> zBckProbs;
+
+  std::vector<size_t> xdropShape;
+
+  std::vector<double> rescales;
+
+  double rescaledSumOfProbRatios;
+
+  size_t numOfAntidiagonals;
+
+  const uchar *proteinPtr;
+  const_uchar_ptr frames[3];
+
+  size_t begInProbs(size_t antidiagonal) const
+  { return xdropShape[antidiagonal * 2]; }
+
+  size_t endInProtein(size_t antidiagonal) const
+  { return xdropShape[antidiagonal * 2 + 1]; }
+
+  int numOfCellsInAntidiagonal(size_t antidiagonal) const
+  { return begInProbs(antidiagonal + 1) - begInProbs(antidiagonal) - padSize; }
+
+  void resizeFwdProbsIfSmaller(size_t size) {
+    if (xFwdProbs.size() < size) {
+      xFwdProbs.resize(size);
+      yFwdProbs.resize(size);
+      zFwdProbs.resize(size);
+    }
+  }
+
+  void resizeBckProbsIfSmaller(size_t size) {
+    if (xBckProbs.size() < size) {
+      xBckProbs.resize(size);
+      yBckProbs.resize(size);
+      zBckProbs.resize(size);
+    }
+  }
+
+  void rescaleFwdProbs(size_t beg, size_t end, double scale) {
+    for (size_t i = beg; i < end; ++i) {
+      xFwdProbs[i] *= scale;
+      yFwdProbs[i] *= scale;
+      zFwdProbs[i] *= scale;
+    }
+  }
+
+  void rescaleBckProbs(size_t beg, size_t end, double scale) {
+    for (size_t i = beg; i < end; ++i) {
+      xBckProbs[i] *= scale;
+      yBckProbs[i] *= scale;
+      zBckProbs[i] *= scale;
+    }
+  }
+};
+
+}
+
+#endif


=====================================
src/mcf_gap_costs.cc
=====================================
@@ -5,6 +5,7 @@
 #include <stdexcept>
 #include <assert.h>
 #include <limits.h>
+#include <math.h>
 #include <stddef.h>  // size_t
 
 static void err(const char *s) {
@@ -26,12 +27,23 @@ static void assignGapCostPieces(const std::vector<int> &openCosts,
   assert(!pieces.empty());
 }
 
+static void assignProbs(std::vector<GapCosts::ProbPiece> &probPieces,
+			const std::vector<GapCosts::Piece> &costPieces,
+			double scale) {
+  probPieces.resize(costPieces.size());
+  for (size_t i = 0; i < costPieces.size(); ++i) {
+    probPieces[i].openProb = exp(scale * -costPieces[i].openCost);
+    probPieces[i].growProb = exp(scale * -costPieces[i].growCost);
+  }
+}
+
 void GapCosts::assign(const std::vector<int> &delOpenCosts,
 		      const std::vector<int> &delGrowCosts,
 		      const std::vector<int> &insOpenCosts,
 		      const std::vector<int> &insGrowCosts,
 		      const std::vector<int> &frameshiftCosts,
-		      int unalignedPairCost) {
+		      int unalignedPairCost,
+		      double scale) {
   assignGapCostPieces(delOpenCosts, delGrowCosts, delPieces);
   assignGapCostPieces(insOpenCosts, insGrowCosts, insPieces);
   if (unalignedPairCost > 0) {
@@ -44,6 +56,9 @@ void GapCosts::assign(const std::vector<int> &delOpenCosts,
 	      pairCost >= delPieces[0].growCost + insPieces[0].growCost +
 	      std::max(delPieces[0].openCost, insPieces[0].openCost));
 
+  assignProbs(delProbPieces, delPieces, scale);
+  assignProbs(insProbPieces, insPieces, scale);
+
   if (frameshiftCosts.empty()) {
     frameshiftCost = -1;
   } else if (frameshiftCosts.size() == 1) {
@@ -57,6 +72,13 @@ void GapCosts::assign(const std::vector<int> &delOpenCosts,
     insScore2 = -(2 * insPieces[0].growCost + frameshiftCosts[3]);
     insScore3 = -(3 * insPieces[0].growCost);
     frameshiftCost = -2;
+
+    delProb1 = exp(scale * delScore1);
+    delProb2 = exp(scale * delScore2);
+    delProb3 = exp(scale * delScore3);
+    insProb1 = exp(scale * insScore1);
+    insProb2 = exp(scale * insScore2);
+    insProb3 = exp(scale * insScore3);
   }
 }
 


=====================================
src/mcf_gap_costs.hh
=====================================
@@ -27,6 +27,11 @@ struct GapCosts {
     int growCost;
   };
 
+  struct ProbPiece {
+    double openProb;
+    double growProb;
+  };
+
   std::vector<Piece> delPieces;
   std::vector<Piece> insPieces;
   int pairCost;
@@ -43,6 +48,16 @@ struct GapCosts {
 
   bool isAffine;
 
+  // these are for calculating alignment probabilities:
+  std::vector<ProbPiece> delProbPieces;
+  std::vector<ProbPiece> insProbPieces;
+  double delProb1;
+  double delProb2;
+  double delProb3;
+  double insProb1;
+  double insProb2;
+  double insProb3;
+
   // Assign piecewise linear open and grow costs, and one pairCost.
   // If unalignedPairCost <= 0, assign non-generalized costs.
   // Throw a runtime_error if any growCost is <= 0.
@@ -56,7 +71,8 @@ struct GapCosts {
 	      const std::vector<int> &insOpenCosts,
 	      const std::vector<int> &insGrowCosts,
 	      const std::vector<int> &frameshiftCosts,
-	      int unalignedPairCost);
+	      int unalignedPairCost,
+	      double scale);  // probability ratio  =  exp(scale * score)
 
   // The cost of a "gap" consisting of unaligned letters in the query
   // and/or reference sequence


=====================================
src/version.hh
=====================================
@@ -1 +1 @@
-"1133"
+"1145"



View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/649febce00d36a9bea89dda4805e112d5f9d052e

-- 
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/649febce00d36a9bea89dda4805e112d5f9d052e
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201110/10d7ab61/attachment-0001.html>


More information about the debian-med-commit mailing list