[med-svn] [last-align] 01/03: Imported Upstream version 752
Andreas Tille
tille at debian.org
Tue Aug 16 12:39:34 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository last-align.
commit f587cedc3fac783b10361b16cd4a4ced83dfb55b
Author: Andreas Tille <tille at debian.org>
Date: Tue Aug 16 14:34:54 2016 +0200
Imported Upstream version 752
---
ChangeLog.txt | 32 +++++++++++-
scripts/last-train | 8 ++-
src/OneQualityScoreMatrix.cc | 6 +--
src/QualityPssmMaker.cc | 2 +-
src/TwoQualityScoreMatrix.cc | 116 ++++++++++++++++++++++++++++++++-----------
src/TwoQualityScoreMatrix.hh | 3 +-
src/lastal.cc | 14 ++++--
src/qualityScoreUtil.hh | 20 +++++---
src/version.hh | 2 +-
9 files changed, 155 insertions(+), 48 deletions(-)
diff --git a/ChangeLog.txt b/ChangeLog.txt
index 214187c..6caa061 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,8 +1,38 @@
+2016-08-01 Martin C. Frith <Martin C. Frith>
+
+ * scripts/last-train:
+ Fixed last-train crash with gap existence cost < 0.
+ [9a03622aa2c0] [tip]
+
+ * src/TwoQualityScoreMatrix.cc, src/TwoQualityScoreMatrix.hh,
+ src/lastal.cc:
+ Made lastal startup yet faster for fastq-versus-fastq.
+ [3d9a91ae8385]
+
+ * src/TwoQualityScoreMatrix.cc, src/TwoQualityScoreMatrix.hh:
+ Refactoring.
+ [75fb5e4bd955]
+
+2016-07-20 Martin C. Frith <Martin C. Frith>
+
+ * src/TwoQualityScoreMatrix.cc, src/qualityScoreUtil.hh:
+ Made lastal startup even faster for fastq-versus-fastq.
+ [b29ac42364b3]
+
+ * src/TwoQualityScoreMatrix.cc:
+ Made lastal startup faster for fastq-versus-fastq.
+ [be0485aed801]
+
+ * src/OneQualityScoreMatrix.cc, src/QualityPssmMaker.cc,
+ src/TwoQualityScoreMatrix.cc, src/qualityScoreUtil.hh:
+ Refactoring.
+ [ba10c9dd420f]
+
2016-07-05 Martin C. Frith <Martin C. Frith>
* src/SubsetSuffixArraySearch.cc:
Bugfix: lastal with -l0 was not reliable, I think.
- [60738c7a24e3] [tip]
+ [60738c7a24e3]
* src/LastalArguments.cc:
Allowed gap open costs < 0.
diff --git a/scripts/last-train b/scripts/last-train
index f8e77ae..a96532a 100755
--- a/scripts/last-train
+++ b/scripts/last-train
@@ -134,9 +134,15 @@ def gapProbsFromCounts(counts, opts):
insCloseProb = 1 - insExtendProb
firstDelProb = delOpenProb * delCloseProb
firstInsProb = insOpenProb * insCloseProb
- # if we define "alignment" to mean "set of indistinguishable paths":
+
+ # If we define "an alignment" to mean "a set of indistinguishable
+ # paths", then:
#delExtendProb += firstDelProb
#insExtendProb += firstInsProb
+ # Else, this ensures gap existence cost >= 0:
+ delExtendProb = max(delExtendProb, firstDelProb)
+ insExtendProb = max(insExtendProb, firstInsProb)
+
delExistProb = firstDelProb / delExtendProb
insExistProb = firstInsProb / insExtendProb
diff --git a/src/OneQualityScoreMatrix.cc b/src/OneQualityScoreMatrix.cc
index d15aa6d..26adea3 100644
--- a/src/OneQualityScoreMatrix.cc
+++ b/src/OneQualityScoreMatrix.cc
@@ -42,9 +42,9 @@ void OneQualityScoreMatrix::init(const ScoreMatrixRow *scoreMatrix,
for (int q2 = 0; q2 < qualityCapacity; ++q2) {
if (isUseQuality) {
- double p2 = letterProbs2[unmasked2];
- double u2 = qualityUncertainty(q2, qualityOffset, isPhred, p2);
- score = qualityPairScore(expScore, 0, u2, lambda);
+ double e2 = errorProbFromQual(q2, qualityOffset, isPhred);
+ double c2 = qualityCertainty(e2, letterProbs2[unmasked2]);
+ score = qualityPairScore(expScore, 1, c2, lambda);
}
if (isMask) score = std::min(score, 0);
diff --git a/src/QualityPssmMaker.cc b/src/QualityPssmMaker.cc
index 0fe7deb..6104c7d 100644
--- a/src/QualityPssmMaker.cc
+++ b/src/QualityPssmMaker.cc
@@ -33,7 +33,7 @@ void QualityPssmMaker::init(const ScoreMatrixRow *scoreMatrix,
double mismatchExp = std::exp(lambda * mismatchScore);
for (int q = 0; q < qualityCapacity; ++q) {
- double e = qualityUncertainty(q, qualityOffset, false, 0);
+ double e = errorProbFromQual(q, qualityOffset, false);
double p = 1 - e;
qualityToProbCorrect[q] = p;
diff --git a/src/TwoQualityScoreMatrix.cc b/src/TwoQualityScoreMatrix.cc
index fde44d6..dc42db7 100644
--- a/src/TwoQualityScoreMatrix.cc
+++ b/src/TwoQualityScoreMatrix.cc
@@ -7,6 +7,7 @@
#include <algorithm> // min
#include <cassert>
#include <cmath>
+//#include <iostream> // for debugging
namespace cbrc {
@@ -50,43 +51,102 @@ void TwoQualityScoreMatrix::init(const ScoreMatrixRow *scoreMatrix,
bool isPhred2,
int qualityOffset2,
const uchar *toUnmasked,
- bool isApplyMasking) {
+ bool isMask,
+ bool isMatchMismatchMatrix) {
+ typedef TwoQualityMatrixIndexer Indexer;
+
indexer.init(toUnmasked);
- data.resize(indexer.numSymbols * indexer.numSymbols);
+ data.resize(Indexer::numSymbols * Indexer::numSymbols);
- for (int letter1 = 0; letter1 < scoreMatrixRowSize; ++letter1) {
- for (int letter2 = 0; letter2 < scoreMatrixRowSize; ++letter2) {
- int unmasked1 = toUnmasked[letter1];
- int unmasked2 = toUnmasked[letter2];
+ double expMat[Indexer::numNormalLetters][Indexer::numNormalLetters];
- bool isMasked = (unmasked1 != letter1 || unmasked2 != letter2);
- bool isMask = (isApplyMasking && isMasked);
+ for (int x1 = 0; x1 < Indexer::numNormalLetters; ++x1)
+ for (int x2 = 0; x2 < Indexer::numNormalLetters; ++x2)
+ expMat[x1][x2] = std::exp(lambda * scoreMatrix[x1][x2]);
- bool isNormal1 = (unmasked1 < indexer.numNormalLetters);
- bool isNormal2 = (unmasked2 < indexer.numNormalLetters);
- bool isUseQuality = (isNormal1 && isNormal2);
+ double certainties1[Indexer::qualityCapacity][Indexer::numNormalLetters];
+ double certainties2[Indexer::qualityCapacity][Indexer::numNormalLetters];
- int score = scoreMatrix[unmasked1][unmasked2];
- double expScore = std::exp(lambda * score);
+ for (int q = Indexer::minQuality; q < Indexer::qualityCapacity; ++q) {
+ double e1 = errorProbFromQual(q, qualityOffset1, isPhred1);
+ double e2 = errorProbFromQual(q, qualityOffset2, isPhred2);
+ for (int x = 0; x < Indexer::numNormalLetters; ++x) {
+ certainties1[q][x] = qualityCertainty(e1, letterProbs1[x]);
+ certainties2[q][x] = qualityCertainty(e2, letterProbs2[x]);
+ }
+ }
- int end1 = qualityEnd(indexer, letter1);
- int end2 = qualityEnd(indexer, letter2);
+ // I tried pre-calculating 1/lambda, but there was little speed boost.
+
+ for (int q1 = Indexer::minQuality; q1 < Indexer::qualityCapacity; ++q1) {
+ int *dq1 = &data[q1 * Indexer::numQualityLetters * Indexer::numSymbols];
+ const double *c1s = certainties1[q1];
+ for (int q2 = Indexer::minQuality; q2 < Indexer::qualityCapacity; ++q2) {
+ int *dq2 = dq1 + q2 * Indexer::numQualityLetters;
+ const double *c2s = certainties2[q2];
+ if (isMatchMismatchMatrix) { // do this common special case faster
+ int scoreSame = qualityPairScore(expMat[0][0], c1s[0], c2s[0], lambda);
+ int scoreDiff = qualityPairScore(expMat[0][1], c1s[0], c2s[0], lambda);
+ int scoreSameMask = isMask ? std::min(scoreSame, 0) : scoreSame;
+ int scoreDiffMask = isMask ? std::min(scoreDiff, 0) : scoreDiff;
+ for (int x1 = 0; x1 < Indexer::numNormalLetters; ++x1) {
+ int *dx1 = dq2 + x1 * Indexer::numSymbols;
+ int *dm1 = dx1 + Indexer::numNormalLetters * Indexer::numSymbols;
+ for (int x2 = 0; x2 < Indexer::numNormalLetters; ++x2) {
+ int m2 = x2 + Indexer::numNormalLetters;
+ if (x2 == x1) {
+ dx1[x2] = scoreSame;
+ dx1[m2] = scoreSameMask;
+ dm1[x2] = scoreSameMask;
+ dm1[m2] = scoreSameMask;
+ } else {
+ dx1[x2] = scoreDiff;
+ dx1[m2] = scoreDiffMask;
+ dm1[x2] = scoreDiffMask;
+ dm1[m2] = scoreDiffMask;
+ }
+ }
+ }
+ } else {
+ for (int x1 = 0; x1 < Indexer::numNormalLetters; ++x1) {
+ int *dx1 = dq2 + x1 * Indexer::numSymbols;
+ int *dm1 = dx1 + Indexer::numNormalLetters * Indexer::numSymbols;
+ double c1 = c1s[x1];
+ for (int x2 = 0; x2 < Indexer::numNormalLetters; ++x2) {
+ int m2 = x2 + Indexer::numNormalLetters;
+ double c2 = c2s[x2];
+ int score = qualityPairScore(expMat[x1][x2], c1, c2, lambda);
+ dx1[x2] = score;
+ if (isMask) score = std::min(score, 0);
+ dx1[m2] = score;
+ dm1[x2] = score;
+ dm1[m2] = score;
+ }
+ }
+ }
+ }
+ }
- for (int q1 = indexer.minQuality; q1 < end1; ++q1) {
- for (int q2 = indexer.minQuality; q2 < end2; ++q2) {
- if (isUseQuality) {
- double p1 = letterProbs1[unmasked1];
- double u1 = qualityUncertainty(q1, qualityOffset1, isPhred1, p1);
- double p2 = letterProbs2[unmasked2];
- double u2 = qualityUncertainty(q2, qualityOffset2, isPhred2, p2);
- score = qualityPairScore(expScore, u1, u2, lambda);
- }
+ for (int x1 = 0; x1 < scoreMatrixRowSize; ++x1) {
+ for (int x2 = 0; x2 < scoreMatrixRowSize; ++x2) {
+ int unmasked1 = toUnmasked[x1];
+ int unmasked2 = toUnmasked[x2];
- if (isMask) score = std::min(score, 0);
+ bool isQuality1 = (unmasked1 < Indexer::numNormalLetters);
+ bool isQuality2 = (unmasked2 < Indexer::numNormalLetters);
+ if (isQuality1 && isQuality2) continue;
- data[indexer(letter1, letter2, q1, q2)] = score;
- }
- }
+ bool isMasked = (unmasked1 != x1 || unmasked2 != x2);
+
+ int score = scoreMatrix[unmasked1][unmasked2];
+ if (isMasked && isMask) score = std::min(score, 0);
+
+ int end1 = qualityEnd(indexer, x1);
+ int end2 = qualityEnd(indexer, x2);
+
+ for (int q1 = Indexer::minQuality; q1 < end1; ++q1)
+ for (int q2 = Indexer::minQuality; q2 < end2; ++q2)
+ data[indexer(x1, x2, q1, q2)] = score;
}
}
}
diff --git a/src/TwoQualityScoreMatrix.hh b/src/TwoQualityScoreMatrix.hh
index bb6c9b4..85760ab 100644
--- a/src/TwoQualityScoreMatrix.hh
+++ b/src/TwoQualityScoreMatrix.hh
@@ -76,7 +76,8 @@ class TwoQualityScoreMatrix {
bool isPhred2,
int qualityOffset2,
const uchar *toUnmasked, // maps letters to unmasked letters
- bool isApplyMasking);
+ bool isMask,
+ bool isMatchMismatchMatrix); // if "true", init is faster
// Tests whether init has been called:
operator const void *() const { return data.empty() ? 0 : this; }
diff --git a/src/lastal.cc b/src/lastal.cc
index 41732e5..80f3886 100644
--- a/src/lastal.cc
+++ b/src/lastal.cc
@@ -140,6 +140,7 @@ void makeQualityScorers(){
"quality data not used for DNA-versus-protein alignment" );
const ScoreMatrixRow* m = scoreMatrix.caseSensitive; // case isn't relevant
+ bool isMatchMismatch = (args.matrixFile.empty() && args.matchScore > 0);
double lambda = lambdaCalculator.lambda();
const double* lp1 = lambdaCalculator.letterProbs1();
bool isPhred1 = isPhred( referenceFormat );
@@ -188,7 +189,6 @@ void makeQualityScorers(){
}
}
else if( args.inputFormat == sequenceFormat::prb ){
- bool isMatchMismatch = (args.matrixFile.empty() && args.matchScore > 0);
qualityPssmMaker.init( m, alph.size, lambda, isMatchMismatch,
args.matchScore, -args.mismatchCost,
offset2, alph.numbersToUppercase );
@@ -203,22 +203,26 @@ void makeQualityScorers(){
if( args.maskLowercase > 0 )
twoQualityMatrixMasked.init( m, lambda, lp1, lp2,
isPhred1, offset1, isPhred2, offset2,
- alph.numbersToUppercase, true);
+ alph.numbersToUppercase, true,
+ isMatchMismatch );
if( args.maskLowercase < 3 )
twoQualityMatrix.init( m, lambda, lp1, lp2,
isPhred1, offset1, isPhred2, offset2,
- alph.numbersToUppercase, false );
+ alph.numbersToUppercase, false,
+ isMatchMismatch );
if( args.outputType > 3 )
ERR( "fastq-versus-fastq column probabilities not implemented" );
if( args.isQueryStrandMatrix && args.strand != 1 ){
if( args.maskLowercase > 0 )
twoQualityMatrixRevMasked.init( mRev, lambda, lp1rev, lp2rev,
isPhred1, offset1, isPhred2, offset2,
- alph.numbersToUppercase, true );
+ alph.numbersToUppercase, true,
+ isMatchMismatch );
if( args.maskLowercase < 3 )
twoQualityMatrixRev.init( mRev, lambda, lp1rev, lp2rev,
isPhred1, offset1, isPhred2, offset2,
- alph.numbersToUppercase, false );
+ alph.numbersToUppercase, false,
+ isMatchMismatch );
}
}
else{
diff --git a/src/qualityScoreUtil.hh b/src/qualityScoreUtil.hh
index 05b9f71..c8fb11f 100644
--- a/src/qualityScoreUtil.hh
+++ b/src/qualityScoreUtil.hh
@@ -27,24 +27,30 @@ inline double solexaErrorProb(int qualityScore) {
return 1 / (1 + std::pow(10.0, 0.1 * qualityScore));
}
-inline double qualityUncertainty(int qualityCode, int qualityOffset,
- bool isPhred, double letterProb) {
+inline double errorProbFromQual(int qualityCode, int qualityOffset,
+ bool isPhred) {
int q = qualityCode - qualityOffset;
double errorProb = isPhred ? phredErrorProb(q) : solexaErrorProb(q);
// The next line of code is a kludge to avoid numerical instability.
// An error probability of 1 is bizarre and probably shouldn't occur anyway.
if (errorProb >= 1) errorProb = 0.999999;
+ return errorProb;
+}
+
+inline double qualityUncertainty(double errorProb, double letterProb) {
double otherProb = 1 - letterProb;
assert(letterProb >= 0);
assert(otherProb > 0);
return errorProb / otherProb;
}
-inline int qualityPairScore(double expScore, double uncertainty1,
- double uncertainty2, double lambda) {
- double x = (1 - uncertainty1) * (1 - uncertainty2) * (expScore - 1) + 1;
- assert(lambda > 0);
- assert(x > 0);
+inline double qualityCertainty(double errorProb, double letterProb) {
+ return 1 - qualityUncertainty(errorProb, letterProb);
+}
+
+inline int qualityPairScore(double expScore, double certainty1,
+ double certainty2, double lambda) {
+ double x = certainty1 * certainty2 * (expScore - 1) + 1;
return nearestInt(std::log(x) / lambda);
}
diff --git a/src/version.hh b/src/version.hh
index 2db7e02..2a9b445 100644
--- a/src/version.hh
+++ b/src/version.hh
@@ -1 +1 @@
-"746"
+"752"
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/last-align.git
More information about the debian-med-commit
mailing list