[med-svn] [last-align] 01/03: Imported Upstream version 752

Andreas Tille tille at debian.org
Tue Aug 16 12:39:34 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository last-align.

commit f587cedc3fac783b10361b16cd4a4ced83dfb55b
Author: Andreas Tille <tille at debian.org>
Date:   Tue Aug 16 14:34:54 2016 +0200

    Imported Upstream version 752
---
 ChangeLog.txt                |  32 +++++++++++-
 scripts/last-train           |   8 ++-
 src/OneQualityScoreMatrix.cc |   6 +--
 src/QualityPssmMaker.cc      |   2 +-
 src/TwoQualityScoreMatrix.cc | 116 ++++++++++++++++++++++++++++++++-----------
 src/TwoQualityScoreMatrix.hh |   3 +-
 src/lastal.cc                |  14 ++++--
 src/qualityScoreUtil.hh      |  20 +++++---
 src/version.hh               |   2 +-
 9 files changed, 155 insertions(+), 48 deletions(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index 214187c..6caa061 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,8 +1,38 @@
+2016-08-01  Martin C. Frith  <Martin C. Frith>
+
+	* scripts/last-train:
+	Fixed last-train crash with gap existence cost < 0.
+	[9a03622aa2c0] [tip]
+
+	* src/TwoQualityScoreMatrix.cc, src/TwoQualityScoreMatrix.hh,
+	src/lastal.cc:
+	Made lastal startup yet faster for fastq-versus-fastq.
+	[3d9a91ae8385]
+
+	* src/TwoQualityScoreMatrix.cc, src/TwoQualityScoreMatrix.hh:
+	Refactoring.
+	[75fb5e4bd955]
+
+2016-07-20  Martin C. Frith  <Martin C. Frith>
+
+	* src/TwoQualityScoreMatrix.cc, src/qualityScoreUtil.hh:
+	Made lastal startup even faster for fastq-versus-fastq.
+	[b29ac42364b3]
+
+	* src/TwoQualityScoreMatrix.cc:
+	Made lastal startup faster for fastq-versus-fastq.
+	[be0485aed801]
+
+	* src/OneQualityScoreMatrix.cc, src/QualityPssmMaker.cc,
+	src/TwoQualityScoreMatrix.cc, src/qualityScoreUtil.hh:
+	Refactoring.
+	[ba10c9dd420f]
+
 2016-07-05  Martin C. Frith  <Martin C. Frith>
 
 	* src/SubsetSuffixArraySearch.cc:
 	Bugfix: lastal with -l0 was not reliable, I think.
-	[60738c7a24e3] [tip]
+	[60738c7a24e3]
 
 	* src/LastalArguments.cc:
 	Allowed gap open costs < 0.
diff --git a/scripts/last-train b/scripts/last-train
index f8e77ae..a96532a 100755
--- a/scripts/last-train
+++ b/scripts/last-train
@@ -134,9 +134,15 @@ def gapProbsFromCounts(counts, opts):
     insCloseProb = 1 - insExtendProb
     firstDelProb = delOpenProb * delCloseProb
     firstInsProb = insOpenProb * insCloseProb
-    # if we define "alignment" to mean "set of indistinguishable paths":
+
+    # If we define "an alignment" to mean "a set of indistinguishable
+    # paths", then:
     #delExtendProb += firstDelProb
     #insExtendProb += firstInsProb
+    # Else, this ensures gap existence cost >= 0:
+    delExtendProb = max(delExtendProb, firstDelProb)
+    insExtendProb = max(insExtendProb, firstInsProb)
+
     delExistProb = firstDelProb / delExtendProb
     insExistProb = firstInsProb / insExtendProb
 
diff --git a/src/OneQualityScoreMatrix.cc b/src/OneQualityScoreMatrix.cc
index d15aa6d..26adea3 100644
--- a/src/OneQualityScoreMatrix.cc
+++ b/src/OneQualityScoreMatrix.cc
@@ -42,9 +42,9 @@ void OneQualityScoreMatrix::init(const ScoreMatrixRow *scoreMatrix,
 
       for (int q2 = 0; q2 < qualityCapacity; ++q2) {
         if (isUseQuality) {
-          double p2 = letterProbs2[unmasked2];
-          double u2 = qualityUncertainty(q2, qualityOffset, isPhred, p2);
-          score = qualityPairScore(expScore, 0, u2, lambda);
+	  double e2 = errorProbFromQual(q2, qualityOffset, isPhred);
+          double c2 = qualityCertainty(e2, letterProbs2[unmasked2]);
+          score = qualityPairScore(expScore, 1, c2, lambda);
         }
 
         if (isMask) score = std::min(score, 0);
diff --git a/src/QualityPssmMaker.cc b/src/QualityPssmMaker.cc
index 0fe7deb..6104c7d 100644
--- a/src/QualityPssmMaker.cc
+++ b/src/QualityPssmMaker.cc
@@ -33,7 +33,7 @@ void QualityPssmMaker::init(const ScoreMatrixRow *scoreMatrix,
   double mismatchExp = std::exp(lambda * mismatchScore);
 
   for (int q = 0; q < qualityCapacity; ++q) {
-    double e = qualityUncertainty(q, qualityOffset, false, 0);
+    double e = errorProbFromQual(q, qualityOffset, false);
     double p = 1 - e;
     qualityToProbCorrect[q] = p;
 
diff --git a/src/TwoQualityScoreMatrix.cc b/src/TwoQualityScoreMatrix.cc
index fde44d6..dc42db7 100644
--- a/src/TwoQualityScoreMatrix.cc
+++ b/src/TwoQualityScoreMatrix.cc
@@ -7,6 +7,7 @@
 #include <algorithm>  // min
 #include <cassert>
 #include <cmath>
+//#include <iostream>  // for debugging
 
 namespace cbrc {
 
@@ -50,43 +51,102 @@ void TwoQualityScoreMatrix::init(const ScoreMatrixRow *scoreMatrix,
                                  bool isPhred2,
                                  int qualityOffset2,
                                  const uchar *toUnmasked,
-                                 bool isApplyMasking) {
+                                 bool isMask,
+				 bool isMatchMismatchMatrix) {
+  typedef TwoQualityMatrixIndexer Indexer;
+
   indexer.init(toUnmasked);
-  data.resize(indexer.numSymbols * indexer.numSymbols);
+  data.resize(Indexer::numSymbols * Indexer::numSymbols);
 
-  for (int letter1 = 0; letter1 < scoreMatrixRowSize; ++letter1) {
-    for (int letter2 = 0; letter2 < scoreMatrixRowSize; ++letter2) {
-      int unmasked1 = toUnmasked[letter1];
-      int unmasked2 = toUnmasked[letter2];
+  double expMat[Indexer::numNormalLetters][Indexer::numNormalLetters];
 
-      bool isMasked = (unmasked1 != letter1 || unmasked2 != letter2);
-      bool isMask = (isApplyMasking && isMasked);
+  for (int x1 = 0; x1 < Indexer::numNormalLetters; ++x1)
+    for (int x2 = 0; x2 < Indexer::numNormalLetters; ++x2)
+      expMat[x1][x2] = std::exp(lambda * scoreMatrix[x1][x2]);
 
-      bool isNormal1 = (unmasked1 < indexer.numNormalLetters);
-      bool isNormal2 = (unmasked2 < indexer.numNormalLetters);
-      bool isUseQuality = (isNormal1 && isNormal2);
+  double certainties1[Indexer::qualityCapacity][Indexer::numNormalLetters];
+  double certainties2[Indexer::qualityCapacity][Indexer::numNormalLetters];
 
-      int score = scoreMatrix[unmasked1][unmasked2];
-      double expScore = std::exp(lambda * score);
+  for (int q = Indexer::minQuality; q < Indexer::qualityCapacity; ++q) {
+    double e1 = errorProbFromQual(q, qualityOffset1, isPhred1);
+    double e2 = errorProbFromQual(q, qualityOffset2, isPhred2);
+    for (int x = 0; x < Indexer::numNormalLetters; ++x) {
+      certainties1[q][x] = qualityCertainty(e1, letterProbs1[x]);
+      certainties2[q][x] = qualityCertainty(e2, letterProbs2[x]);
+    }
+  }
 
-      int end1 = qualityEnd(indexer, letter1);
-      int end2 = qualityEnd(indexer, letter2);
+  // I tried pre-calculating 1/lambda, but there was little speed boost.
+
+  for (int q1 = Indexer::minQuality; q1 < Indexer::qualityCapacity; ++q1) {
+    int *dq1 = &data[q1 * Indexer::numQualityLetters * Indexer::numSymbols];
+    const double *c1s = certainties1[q1];
+    for (int q2 = Indexer::minQuality; q2 < Indexer::qualityCapacity; ++q2) {
+      int *dq2 = dq1 + q2 * Indexer::numQualityLetters;
+      const double *c2s = certainties2[q2];
+      if (isMatchMismatchMatrix) {  // do this common special case faster
+	int scoreSame = qualityPairScore(expMat[0][0], c1s[0], c2s[0], lambda);
+	int scoreDiff = qualityPairScore(expMat[0][1], c1s[0], c2s[0], lambda);
+	int scoreSameMask = isMask ? std::min(scoreSame, 0) : scoreSame;
+	int scoreDiffMask = isMask ? std::min(scoreDiff, 0) : scoreDiff;
+	for (int x1 = 0; x1 < Indexer::numNormalLetters; ++x1) {
+	  int *dx1 = dq2 + x1 * Indexer::numSymbols;
+	  int *dm1 = dx1 + Indexer::numNormalLetters * Indexer::numSymbols;
+	  for (int x2 = 0; x2 < Indexer::numNormalLetters; ++x2) {
+	    int m2 = x2 + Indexer::numNormalLetters;
+	    if (x2 == x1) {
+	      dx1[x2] = scoreSame;
+	      dx1[m2] = scoreSameMask;
+	      dm1[x2] = scoreSameMask;
+	      dm1[m2] = scoreSameMask;
+	    } else {
+	      dx1[x2] = scoreDiff;
+	      dx1[m2] = scoreDiffMask;
+	      dm1[x2] = scoreDiffMask;
+	      dm1[m2] = scoreDiffMask;
+	    }
+	  }
+	}
+      } else {
+	for (int x1 = 0; x1 < Indexer::numNormalLetters; ++x1) {
+	  int *dx1 = dq2 + x1 * Indexer::numSymbols;
+	  int *dm1 = dx1 + Indexer::numNormalLetters * Indexer::numSymbols;
+	  double c1 = c1s[x1];
+	  for (int x2 = 0; x2 < Indexer::numNormalLetters; ++x2) {
+	    int m2 = x2 + Indexer::numNormalLetters;
+	    double c2 = c2s[x2];
+	    int score = qualityPairScore(expMat[x1][x2], c1, c2, lambda);
+	    dx1[x2] = score;
+	    if (isMask) score = std::min(score, 0);
+	    dx1[m2] = score;
+	    dm1[x2] = score;
+	    dm1[m2] = score;
+	  }
+	}
+      }
+    }
+  }
 
-      for (int q1 = indexer.minQuality; q1 < end1; ++q1) {
-        for (int q2 = indexer.minQuality; q2 < end2; ++q2) {
-          if (isUseQuality) {
-            double p1 = letterProbs1[unmasked1];
-            double u1 = qualityUncertainty(q1, qualityOffset1, isPhred1, p1);
-            double p2 = letterProbs2[unmasked2];
-            double u2 = qualityUncertainty(q2, qualityOffset2, isPhred2, p2);
-            score = qualityPairScore(expScore, u1, u2, lambda);
-          }
+  for (int x1 = 0; x1 < scoreMatrixRowSize; ++x1) {
+    for (int x2 = 0; x2 < scoreMatrixRowSize; ++x2) {
+      int unmasked1 = toUnmasked[x1];
+      int unmasked2 = toUnmasked[x2];
 
-          if (isMask) score = std::min(score, 0);
+      bool isQuality1 = (unmasked1 < Indexer::numNormalLetters);
+      bool isQuality2 = (unmasked2 < Indexer::numNormalLetters);
+      if (isQuality1 && isQuality2) continue;
 
-          data[indexer(letter1, letter2, q1, q2)] = score;
-        }
-      }
+      bool isMasked = (unmasked1 != x1 || unmasked2 != x2);
+
+      int score = scoreMatrix[unmasked1][unmasked2];
+      if (isMasked && isMask) score = std::min(score, 0);
+
+      int end1 = qualityEnd(indexer, x1);
+      int end2 = qualityEnd(indexer, x2);
+
+      for (int q1 = Indexer::minQuality; q1 < end1; ++q1)
+        for (int q2 = Indexer::minQuality; q2 < end2; ++q2)
+          data[indexer(x1, x2, q1, q2)] = score;
     }
   }
 }
diff --git a/src/TwoQualityScoreMatrix.hh b/src/TwoQualityScoreMatrix.hh
index bb6c9b4..85760ab 100644
--- a/src/TwoQualityScoreMatrix.hh
+++ b/src/TwoQualityScoreMatrix.hh
@@ -76,7 +76,8 @@ class TwoQualityScoreMatrix {
             bool isPhred2,
             int qualityOffset2,
             const uchar *toUnmasked,  // maps letters to unmasked letters
-            bool isApplyMasking);
+            bool isMask,
+	    bool isMatchMismatchMatrix);  // if "true", init is faster
 
   // Tests whether init has been called:
   operator const void *() const { return data.empty() ? 0 : this; }
diff --git a/src/lastal.cc b/src/lastal.cc
index 41732e5..80f3886 100644
--- a/src/lastal.cc
+++ b/src/lastal.cc
@@ -140,6 +140,7 @@ void makeQualityScorers(){
 		   "quality data not used for DNA-versus-protein alignment" );
 
   const ScoreMatrixRow* m = scoreMatrix.caseSensitive;  // case isn't relevant
+  bool isMatchMismatch = (args.matrixFile.empty() && args.matchScore > 0);
   double lambda = lambdaCalculator.lambda();
   const double* lp1 = lambdaCalculator.letterProbs1();
   bool isPhred1 = isPhred( referenceFormat );
@@ -188,7 +189,6 @@ void makeQualityScorers(){
       }
     }
     else if( args.inputFormat == sequenceFormat::prb ){
-      bool isMatchMismatch = (args.matrixFile.empty() && args.matchScore > 0);
       qualityPssmMaker.init( m, alph.size, lambda, isMatchMismatch,
                              args.matchScore, -args.mismatchCost,
                              offset2, alph.numbersToUppercase );
@@ -203,22 +203,26 @@ void makeQualityScorers(){
       if( args.maskLowercase > 0 )
 	twoQualityMatrixMasked.init( m, lambda, lp1, lp2,
 				     isPhred1, offset1, isPhred2, offset2,
-				     alph.numbersToUppercase, true);
+				     alph.numbersToUppercase, true,
+				     isMatchMismatch );
       if( args.maskLowercase < 3 )
 	twoQualityMatrix.init( m, lambda, lp1, lp2,
 			       isPhred1, offset1, isPhred2, offset2,
-			       alph.numbersToUppercase, false );
+			       alph.numbersToUppercase, false,
+			       isMatchMismatch );
       if( args.outputType > 3 )
         ERR( "fastq-versus-fastq column probabilities not implemented" );
       if( args.isQueryStrandMatrix && args.strand != 1 ){
 	if( args.maskLowercase > 0 )
 	  twoQualityMatrixRevMasked.init( mRev, lambda, lp1rev, lp2rev,
 					  isPhred1, offset1, isPhred2, offset2,
-					  alph.numbersToUppercase, true );
+					  alph.numbersToUppercase, true,
+					  isMatchMismatch );
 	if( args.maskLowercase < 3 )
 	  twoQualityMatrixRev.init( mRev, lambda, lp1rev, lp2rev,
 				    isPhred1, offset1, isPhred2, offset2,
-				    alph.numbersToUppercase, false );
+				    alph.numbersToUppercase, false,
+				    isMatchMismatch );
       }
     }
     else{
diff --git a/src/qualityScoreUtil.hh b/src/qualityScoreUtil.hh
index 05b9f71..c8fb11f 100644
--- a/src/qualityScoreUtil.hh
+++ b/src/qualityScoreUtil.hh
@@ -27,24 +27,30 @@ inline double solexaErrorProb(int qualityScore) {
   return 1 / (1 + std::pow(10.0, 0.1 * qualityScore));
 }
 
-inline double qualityUncertainty(int qualityCode, int qualityOffset,
-                                 bool isPhred, double letterProb) {
+inline double errorProbFromQual(int qualityCode, int qualityOffset,
+				bool isPhred) {
   int q = qualityCode - qualityOffset;
   double errorProb = isPhred ? phredErrorProb(q) : solexaErrorProb(q);
   // The next line of code is a kludge to avoid numerical instability.
   // An error probability of 1 is bizarre and probably shouldn't occur anyway.
   if (errorProb >= 1) errorProb = 0.999999;
+  return errorProb;
+}
+
+inline double qualityUncertainty(double errorProb, double letterProb) {
   double otherProb = 1 - letterProb;
   assert(letterProb >= 0);
   assert(otherProb > 0);
   return errorProb / otherProb;
 }
 
-inline int qualityPairScore(double expScore, double uncertainty1,
-                            double uncertainty2, double lambda) {
-  double x = (1 - uncertainty1) * (1 - uncertainty2) * (expScore - 1) + 1;
-  assert(lambda > 0);
-  assert(x > 0);
+inline double qualityCertainty(double errorProb, double letterProb) {
+  return 1 - qualityUncertainty(errorProb, letterProb);
+}
+
+inline int qualityPairScore(double expScore, double certainty1,
+			    double certainty2, double lambda) {
+  double x = certainty1 * certainty2 * (expScore - 1) + 1;
   return nearestInt(std::log(x) / lambda);
 }
 
diff --git a/src/version.hh b/src/version.hh
index 2db7e02..2a9b445 100644
--- a/src/version.hh
+++ b/src/version.hh
@@ -1 +1 @@
-"746"
+"752"

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/last-align.git



More information about the debian-med-commit mailing list