[med-svn] [Git][med-team/last-align][upstream] New upstream version 1060
Steffen Möller
gitlab at salsa.debian.org
Wed Mar 25 00:14:38 GMT 2020
Steffen Möller pushed to branch upstream at Debian Med / last-align
Commits:
a03c3caa by Steffen Moeller at 2020-03-24T21:22:19+01:00
New upstream version 1060
- - - - -
6 changed files:
- ChangeLog.txt
- src/split/cbrc_int_exponentiator.hh
- src/split/cbrc_split_aligner.cc
- src/split/cbrc_split_aligner.hh
- src/split/last-split.cc
- src/version.hh
Changes:
=====================================
ChangeLog.txt
=====================================
@@ -1,9 +1,68 @@
+2020-03-16 Martin C. Frith <Martin C. Frith>
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Reduce last-split memory & time usage a bit
+ [ca3f417edc01] [tip]
+
+ * src/split/cbrc_int_exponentiator.hh:
+ Make last-split a bit faster
+ [a380494ff77b]
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Make last-split a bit faster
+ [0b3571f8e14b]
+
+2020-03-13 Martin C. Frith <Martin C. Frith>
+
+ * src/split/cbrc_split_aligner.cc:
+ Refactor
+ [ff73ca674104]
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Make last-split much faster
+ [6f4ec587fe25]
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Refactor
+ [3887935792ab]
+
+2020-03-11 Martin C. Frith <Martin C. Frith>
+
+ * src/split/cbrc_split_aligner.cc:
+ Make last-split a bit faster
+ [8621c8e512ff]
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+ Try to make last-split faster
+ [13c4e8b892dc]
+
+ * src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh,
+ src/split/last-split.cc:
+ Make last-split a bit faster
+ [ba6739cb6e7b]
+
+ * src/split/cbrc_split_aligner.cc, test/last-split-test.out:
+ Tiny change to last-split probabilities
+ [eb537293aa74]
+
+ * src/split/cbrc_split_aligner.cc:
+ Refactor
+ [cf3b0bf3dac8]
+
+ * src/split/cbrc_split_aligner.cc:
+ Refactor
+ [c0681ee6f1fd]
+
+ * src/split/cbrc_split_aligner.cc:
+ Refactor (rename a variable)
+ [c62c2b618d94]
+
2020-01-20 Martin C. Frith <Martin C. Frith>
* scripts/last-postmask, test/last-postmask-test.out, test/last-
postmask-test.sh:
last-postmask: fix crash on length-1 DB name
- [1015a8df8312] [tip]
+ [1015a8df8312]
* scripts/last-train:
last-train: fix occasional crashes since v1020
=====================================
src/split/cbrc_int_exponentiator.hh
=====================================
@@ -17,9 +17,23 @@ public:
void setBase(double b) {
base = b;
invBase = 1.0 / b;
+
+ double y = 1;
+ double z = 1;
+ for (int i = 0; i < 128; ++i) {
+ lookup[128 + i] = y;
+ y *= base;
+ z *= invBase;
+ lookup[127 - i] = z;
+ }
}
double operator()(int exponent) const {
+ unsigned u = exponent + 128;
+ if (u < 256) {
+ return lookup[u];
+ }
+
unsigned n = exponent;
double x;
if (exponent >= 0) {
@@ -40,6 +54,7 @@ public:
private:
double base;
double invBase;
+ double lookup[256];
};
}
=====================================
src/split/cbrc_split_aligner.cc
=====================================
@@ -23,6 +23,30 @@ template<typename T, int N> T arrayMax(T (&array)[N]) {
return *std::max_element(array, array + N);
}
+// Orders candidate alignments by increasing DP start coordinate.
+// Breaks ties by decreasing DP end coordinate.
+struct DpBegLess {
+ DpBegLess(const unsigned *b, const unsigned *e) : dpBegs(b), dpEnds(e) {}
+ bool operator()(unsigned a, unsigned b) const {
+ return
+ dpBegs[a] != dpBegs[b] ? dpBegs[a] < dpBegs[b] : dpEnds[a] > dpEnds[b];
+ }
+ const unsigned *dpBegs;
+ const unsigned *dpEnds;
+};
+
+// Orders candidate alignments by decreasing DP end coordinate.
+// Breaks ties by increasing DP start coordinate.
+struct DpEndLess {
+ DpEndLess(const unsigned *b, const unsigned *e) : dpBegs(b), dpEnds(e) {}
+ bool operator()(unsigned a, unsigned b) const {
+ return
+ dpEnds[a] != dpEnds[b] ? dpEnds[a] > dpEnds[b] : dpBegs[a] < dpBegs[b];
+ }
+ const unsigned *dpBegs;
+ const unsigned *dpEnds;
+};
+
// Orders candidate alignments by increasing DP start coordinate.
// Breaks ties by chromosome & strand, then by increasing genomic
// start coordinate.
@@ -104,9 +128,7 @@ void mergeInto(unsigned* beg1,
const unsigned* end2,
T lessFunc) {
unsigned* end3 = end1 + (end2 - beg2);
- for (;;) {
- if (beg2 == end2)
- break;
+ while (end2 != beg2) {
if (beg1 == end1) {
std::copy(beg2, end2, beg1);
break;
@@ -296,24 +318,60 @@ void SplitAligner::updateInplayAlnIndicesB(unsigned& sortedAlnPos,
newNumInplay = (newEnd - newBeg) + (sortedAlnPos - sortedAlnOldPos);
}
-long SplitAligner::viterbi() {
- resizeMatrix(Vmat);
- resizeVector(Vvec);
+long SplitAligner::viterbiSplit() {
+ unsigned *inplayAlnBeg = &newInplayAlnIndices[0];
+ unsigned *inplayAlnEnd = inplayAlnBeg;
+ unsigned *sortedAlnPtr = &sortedAlnIndices[0];
+ unsigned *sortedAlnEnd = sortedAlnPtr + numAlns;
- for (unsigned i = 0; i < numAlns; ++i) cell(Vmat, i, dpBeg(i)) = INT_MIN/2;
- long maxScore = 0;
- cell(Vvec, minBeg) = maxScore;
- long scoreFromJump = restartScore;
+ std::stable_sort(sortedAlnPtr, sortedAlnEnd,
+ DpBegLess(&dpBegs[0], &dpEnds[0]));
- stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
- QbegLess(&dpBegs[0], &rnameAndStrandIds[0], &rBegs[0]));
+ for (unsigned i = 0; i < numAlns; ++i) cell(Vmat, i, dpBeg(i)) = INT_MIN/2;
+ long maxScore = 0;
+
+ for (unsigned j = minBeg; j < maxEnd; j++) {
+ while (inplayAlnEnd > inplayAlnBeg && dpEnd(inplayAlnEnd[-1]) == j) {
+ --inplayAlnEnd; // it is no longer "in play"
+ }
+ const unsigned *sortedAlnBeg = sortedAlnPtr;
+ while (sortedAlnPtr < sortedAlnEnd && dpBeg(*sortedAlnPtr) == j) {
+ ++sortedAlnPtr;
+ }
+ mergeInto(inplayAlnBeg, inplayAlnEnd, sortedAlnBeg, sortedAlnPtr,
+ DpEndLess(&dpBegs[0], &dpEnds[0]));
+ inplayAlnEnd += sortedAlnPtr - sortedAlnBeg;
+
+ cell(Vvec, j) = maxScore;
+ long scoreFromJump = maxScore + restartScore;
+ for (const unsigned *x = inplayAlnBeg; x < inplayAlnEnd; ++x) {
+ size_t ij = matrixRowOrigins[*x] + j;
+ long s = std::max(scoreFromJump, Vmat[ij] + Smat[ij*2]) + Smat[ij*2+1];
+ Vmat[ij + 1] = s;
+ maxScore = std::max(maxScore, s);
+ }
+ }
+
+ cell(Vvec, maxEnd) = maxScore;
+ return maxScore;
+}
+
+long SplitAligner::viterbiSplice() {
unsigned sortedAlnPos = 0;
unsigned oldNumInplay = 0;
unsigned newNumInplay = 0;
+ stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
+ QbegLess(&dpBegs[0], &rnameAndStrandIds[0], &rBegs[0]));
+
+ for (unsigned i = 0; i < numAlns; ++i) cell(Vmat, i, dpBeg(i)) = INT_MIN/2;
+ long maxScore = 0;
+ long scoreFromJump = restartScore;
+
for (unsigned j = minBeg; j < maxEnd; j++) {
updateInplayAlnIndicesF(sortedAlnPos, oldNumInplay, newNumInplay, j);
unsigned oldInplayPos = 0;
+ cell(Vvec, j) = maxScore;
long sMax = INT_MIN/2;
for (unsigned x = 0; x < newNumInplay; ++x) {
unsigned i = newInplayAlnIndices[x];
@@ -324,19 +382,19 @@ long SplitAligner::viterbi() {
s = std::max(s,
scoreFromSplice(i, j, oldNumInplay, oldInplayPos));
s += spliceEndScore(ij);
- s = std::max(s, Vmat[ij] + Dmat[ij]);
- if (restartProb <= 0 && alns[i].qstart == j && s < 0) s = 0;
- s += Amat[ij];
+ s = std::max(s, Vmat[ij] + Smat[ij*2]);
+ if (alns[i].qstart == j && s < 0) s = 0;
+ s += Smat[ij*2+1];
Vmat[ij + 1] = s;
sMax = std::max(sMax, s + spliceBegScore(ij + 1));
}
maxScore = std::max(sMax, maxScore);
- cell(Vvec, j+1) = maxScore;
scoreFromJump = std::max(sMax + jumpScore, maxScore + restartScore);
}
- return (restartProb > 0) ? maxScore : endScore();
+ cell(Vvec, maxEnd) = maxScore;
+ return endScore();
}
long SplitAligner::endScore() const {
@@ -377,7 +435,7 @@ void SplitAligner::traceBack(long viterbiScore,
for (;;) {
--j;
size_t ij = matrixRowOrigins[i] + j;
- long score = Vmat[ij + 1] - Amat[ij];
+ long score = Vmat[ij + 1] - Smat[ij*2+1];
if (restartProb <= 0 && alns[i].qstart == j && score == 0) {
queryBegs.push_back(j);
return;
@@ -389,7 +447,7 @@ void SplitAligner::traceBack(long viterbiScore,
// makes some other kinds of boundary less clean. What's the best
// procedure for tied scores?
- bool isStay = (score == Vmat[ij] + Dmat[ij]);
+ bool isStay = (score == Vmat[ij] + Smat[ij*2]);
if (isStay && alns[i].isForwardStrand()) continue;
long s = score - spliceEndScore(ij);
@@ -417,8 +475,8 @@ int SplitAligner::segmentScore(unsigned alnNum,
unsigned i = alnNum;
for (unsigned j = queryBeg; j < queryEnd; ++j) {
size_t ij = matrixRowOrigins[i] + j;
- score += Amat[ij];
- if (j > queryBeg) score += Dmat[ij];
+ score += Smat[ij*2+1];
+ if (j > queryBeg) score += Smat[ij*2];
}
return score;
}
@@ -483,30 +541,69 @@ double SplitAligner::probFromSpliceB(unsigned i, unsigned j,
return sum;
}
-void SplitAligner::forward() {
- resizeVector(rescales);
- cell(rescales, minBeg) = 1.0;
+void SplitAligner::forwardSplit() {
+ unsigned *inplayAlnBeg = &newInplayAlnIndices[0];
+ unsigned *inplayAlnEnd = inplayAlnBeg;
+ unsigned *sortedAlnPtr = &sortedAlnIndices[0];
+ unsigned *sortedAlnEnd = sortedAlnPtr + numAlns;
- resizeMatrix(Fmat);
- for (unsigned i = 0; i < numAlns; ++i) cell(Fmat, i, dpBeg(i)) = 0.0;
- double sumProb = 1;
- double probFromJump = restartProb;
- double begprob = 1.0;
- double zF = 0.0; // sum of probabilities from the forward algorithm
+ std::stable_sort(sortedAlnPtr, sortedAlnEnd,
+ DpBegLess(&dpBegs[0], &dpEnds[0]));
- stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
- QbegLess(&dpBegs[0], &rnameAndStrandIds[0], &rBegs[0]));
+ for (unsigned i = 0; i < numAlns; ++i) cell(Fmat, i, dpBeg(i)) = 0.0;
+ double sumOfProbs = 1;
+ double rescale = 1;
+
+ for (unsigned j = minBeg; j < maxEnd; j++) {
+ while (inplayAlnEnd > inplayAlnBeg && dpEnd(inplayAlnEnd[-1]) == j) {
+ --inplayAlnEnd; // it is no longer "in play"
+ }
+ const unsigned *sortedAlnBeg = sortedAlnPtr;
+ while (sortedAlnPtr < sortedAlnEnd && dpBeg(*sortedAlnPtr) == j) {
+ ++sortedAlnPtr;
+ }
+ mergeInto(inplayAlnBeg, inplayAlnEnd, sortedAlnBeg, sortedAlnPtr,
+ DpEndLess(&dpBegs[0], &dpEnds[0]));
+ inplayAlnEnd += sortedAlnPtr - sortedAlnBeg;
+
+ cell(rescales, j) = rescale;
+ double probFromJump = sumOfProbs * restartProb;
+ double pSum = 0.0;
+ for (const unsigned *x = inplayAlnBeg; x < inplayAlnEnd; ++x) {
+ size_t ij = matrixRowOrigins[*x] + j;
+ double p =
+ (probFromJump + Fmat[ij] * Sexp[ij*2]) * Sexp[ij*2+1] * rescale;
+ Fmat[ij + 1] = p;
+ pSum += p;
+ }
+ sumOfProbs = pSum + sumOfProbs * rescale;
+ rescale = 1 / (pSum + 1);
+ }
+
+ cell(rescales, maxEnd) = 1 / sumOfProbs; // makes scaled sumOfProbs equal 1
+}
+
+void SplitAligner::forwardSplice() {
unsigned sortedAlnPos = 0;
unsigned oldNumInplay = 0;
unsigned newNumInplay = 0;
+ stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
+ QbegLess(&dpBegs[0], &rnameAndStrandIds[0], &rBegs[0]));
+
+ for (unsigned i = 0; i < numAlns; ++i) cell(Fmat, i, dpBeg(i)) = 0.0;
+ double probFromJump = 0;
+ double begprob = 1.0;
+ double zF = 0.0; // sum of probabilities from the forward algorithm
+ double rescale = 1;
+
for (unsigned j = minBeg; j < maxEnd; j++) {
updateInplayAlnIndicesF(sortedAlnPos, oldNumInplay, newNumInplay, j);
unsigned oldInplayPos = 0;
- double r = cell(rescales, j);
- zF /= r;
+ cell(rescales, j) = rescale;
+ zF *= rescale;
double pSum = 0.0;
- double rNew = 1.0;
+ double rNew = 0.0;
for (unsigned x = 0; x < newNumInplay; ++x) {
unsigned i = newInplayAlnIndices[x];
size_t ij = matrixRowOrigins[i] + j;
@@ -515,45 +612,77 @@ void SplitAligner::forward() {
if (splicePrior > 0.0)
p += probFromSpliceF(i, j, oldNumInplay, oldInplayPos);
p *= spliceEndProb(ij);
- p += Fmat[ij] * Dexp[ij];
- if (restartProb <= 0 && alns[i].qstart == j) p += begprob;
- p = p * Aexp[ij] / r;
+ p += Fmat[ij] * Sexp[ij*2];
+ if (alns[i].qstart == j) p += begprob;
+ p = p * Sexp[ij*2+1] * rescale;
Fmat[ij + 1] = p;
if (alns[i].qend == j+1) zF += p;
pSum += p * spliceBegProb(ij + 1);
rNew += p;
}
- begprob /= r;
- cell(rescales, j+1) = rNew;
- sumProb = pSum + sumProb / r;
- probFromJump = pSum * jumpProb + sumProb * restartProb;
+ begprob *= rescale;
+ probFromJump = pSum * jumpProb;
+ rescale = 1 / (rNew + 1);
}
- if (restartProb > 0) zF = sumProb;
- //zF /= cell(rescales, maxEnd);
- cell(rescales, maxEnd) = zF; // this causes scaled zF to equal 1
+ cell(rescales, maxEnd) = 1 / zF; // this causes scaled zF to equal 1
}
-void SplitAligner::backward() {
- resizeMatrix(Bmat);
- for (unsigned i = 0; i < numAlns; ++i) cell(Bmat, i, dpEnd(i)) = 0.0;
- double sumProb = (restartProb > 0) ? 1 : 0;
- double probFromJump = sumProb;
- double endprob = 1.0;
- //double zB = 0.0; // sum of probabilities from the backward algorithm
+void SplitAligner::backwardSplit() {
+ unsigned *inplayAlnBeg = &newInplayAlnIndices[0];
+ unsigned *inplayAlnEnd = inplayAlnBeg;
+ unsigned *sortedAlnPtr = &sortedAlnIndices[0];
+ unsigned *sortedAlnEnd = sortedAlnPtr + numAlns;
- stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
- QendLess(&dpEnds[0], &rnameAndStrandIds[0], &rEnds[0]));
+ std::stable_sort(sortedAlnPtr, sortedAlnEnd,
+ DpEndLess(&dpBegs[0], &dpEnds[0]));
+
+ for (unsigned i = 0; i < numAlns; ++i) cell(Bmat, i, dpEnd(i)) = 0.0;
+ double sumOfProbs = 1;
+
+ for (unsigned j = maxEnd; j > minBeg; j--) {
+ while (inplayAlnEnd > inplayAlnBeg && dpBeg(inplayAlnEnd[-1]) == j) {
+ --inplayAlnEnd; // it is no longer "in play"
+ }
+ const unsigned *sortedAlnBeg = sortedAlnPtr;
+ while (sortedAlnPtr < sortedAlnEnd && dpEnd(*sortedAlnPtr) == j) {
+ ++sortedAlnPtr;
+ }
+ mergeInto(inplayAlnBeg, inplayAlnEnd, sortedAlnBeg, sortedAlnPtr,
+ DpBegLess(&dpBegs[0], &dpEnds[0]));
+ inplayAlnEnd += sortedAlnPtr - sortedAlnBeg;
+
+ double rescale = cell(rescales, j);
+ double pSum = 0.0;
+ for (const unsigned *x = inplayAlnBeg; x < inplayAlnEnd; ++x) {
+ size_t ij = matrixRowOrigins[*x] + j;
+ double p = (sumOfProbs + Bmat[ij] * Sexp[ij*2]) * Sexp[ij*2-1] * rescale;
+ Bmat[ij - 1] = p;
+ pSum += p;
+ }
+ sumOfProbs = pSum * restartProb + sumOfProbs * rescale;
+ }
+}
+
+void SplitAligner::backwardSplice() {
unsigned sortedAlnPos = 0;
unsigned oldNumInplay = 0;
unsigned newNumInplay = 0;
+ stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
+ QendLess(&dpEnds[0], &rnameAndStrandIds[0], &rEnds[0]));
+
+ for (unsigned i = 0; i < numAlns; ++i) cell(Bmat, i, dpEnd(i)) = 0.0;
+ double probFromJump = 0;
+ double endprob = 1.0;
+ //double zB = 0.0; // sum of probabilities from the backward algorithm
+
for (unsigned j = maxEnd; j > minBeg; j--) {
updateInplayAlnIndicesB(sortedAlnPos, oldNumInplay, newNumInplay, j);
unsigned oldInplayPos = 0;
- double r = cell(rescales, j);
- //zB /= r;
+ double rescale = cell(rescales, j);
+ //zB *= rescale;
double pSum = 0.0;
for (unsigned x = 0; x < newNumInplay; ++x) {
unsigned i = newInplayAlnIndices[x];
@@ -563,48 +692,44 @@ void SplitAligner::backward() {
if (splicePrior > 0.0)
p += probFromSpliceB(i, j, oldNumInplay, oldInplayPos);
p *= spliceBegProb(ij);
- p += Bmat[ij] * Dexp[ij];
- if (restartProb <= 0 && alns[i].qend == j) p += endprob;
- p = p * Aexp[ij - 1] / r;
+ p += Bmat[ij] * Sexp[ij*2];
+ if (alns[i].qend == j) p += endprob;
+ p = p * Sexp[ij*2-1] * rescale;
Bmat[ij - 1] = p;
//if (alns[i].qstart == j-1) zB += p;
pSum += p * spliceEndProb(ij - 1);
}
- endprob /= r;
- sumProb = pSum * restartProb + sumProb / r;
- probFromJump = pSum * jumpProb + sumProb;
+ endprob *= rescale;
+ probFromJump = pSum * jumpProb;
}
-
- //if (restartProb > 0) zB = sumProb;
- //zB /= cell(rescales, minBeg);
}
std::vector<double>
SplitAligner::marginalProbs(unsigned queryBeg, unsigned alnNum,
unsigned alnBeg, unsigned alnEnd) const {
- std::vector<double> output;
- unsigned i = alnNum;
- unsigned j = queryBeg;
- for (unsigned pos = alnBeg; pos < alnEnd; ++pos) {
- size_t ij = matrixRowOrigins[i] + j;
- if (alns[i].qalign[pos] == '-') {
- double value = Fmat[ij] * Bmat[ij] * Dexp[ij] / cell(rescales, j);
- output.push_back(value);
- } else {
- double value = Fmat[ij + 1] * Bmat[ij] / Aexp[ij];
- if (value != value) value = 0.0;
- output.push_back(value);
- j++;
- }
+ std::vector<double> output;
+ unsigned i = alnNum;
+ unsigned j = queryBeg;
+ for (unsigned pos = alnBeg; pos < alnEnd; ++pos) {
+ size_t ij = matrixRowOrigins[i] + j;
+ if (alns[i].qalign[pos] == '-') {
+ double value = Fmat[ij] * Bmat[ij] * Sexp[ij*2] * cell(rescales, j);
+ output.push_back(value);
+ } else {
+ double value = Fmat[ij + 1] * Bmat[ij] / Sexp[ij*2+1];
+ if (value != value) value = 0.0;
+ output.push_back(value);
+ j++;
}
- return output;
+ }
+ return output;
}
// The next routine represents affine gap scores in a cunning way.
-// Amat holds scores at query bases, and at every base that is aligned
+// Aij holds scores at query bases, and at every base that is aligned
// to a gap it gets a score of insExistenceScore + insExtensionScore.
-// Dmat holds scores between query bases, and between every pair of
+// Dij holds scores between query bases, and between every pair of
// bases that are both aligned to gaps it gets a score of
// -insExistenceScore. This produces suitable affine gap scores, even
// if we jump from one alignment to another in the middle of a gap.
@@ -616,18 +741,17 @@ void SplitAligner::calcBaseScores(unsigned i) {
const UnsplitAlignment& a = alns[i];
const size_t origin = matrixRowOrigins[i];
- int *AmatB = &Amat[origin + dpBeg(i)];
- int *DmatB = &Dmat[origin + dpBeg(i)];
- int *AmatS = &Amat[origin + a.qstart];
- int *AmatE = &Amat[origin + dpEnd(i)];
+ int *matBeg = &Smat[(origin + dpBeg(i)) * 2];
+ int *alnBeg = &Smat[(origin + a.qstart) * 2];
+ int *matEnd = &Smat[(origin + dpEnd(i)) * 2];
int delScore = 0;
int insCompensationScore = 0;
// treat any query letters before the alignment as insertions:
- while (AmatB < AmatS) {
- *AmatB++ = firstInsScore;
- *DmatB++ = delScore + insCompensationScore;
+ while (matBeg < alnBeg) {
+ *matBeg++ = delScore + insCompensationScore;
+ *matBeg++ = firstInsScore;
delScore = 0;
insCompensationScore = tweenInsScore;
}
@@ -641,8 +765,8 @@ void SplitAligner::calcBaseScores(unsigned i) {
unsigned char y = *qAlign++;
int q = qQual ? (*qQual++ - qualityOffset) : (numQualCodes - 1);
if (x == '-') { // gap in reference sequence: insertion
- *AmatB++ = firstInsScore;
- *DmatB++ = delScore + insCompensationScore;
+ *matBeg++ = delScore + insCompensationScore;
+ *matBeg++ = firstInsScore;
delScore = 0;
insCompensationScore = tweenInsScore;
} else if (y == '-') { // gap in query sequence: deletion
@@ -652,8 +776,8 @@ void SplitAligner::calcBaseScores(unsigned i) {
} else {
assert(q >= 0);
if (q >= numQualCodes) q = numQualCodes - 1;
- *AmatB++ = score_mat[x % 64][y % 64][q];
- *DmatB++ = delScore;
+ *matBeg++ = delScore;
+ *matBeg++ = score_mat[x % 64][y % 64][q];
delScore = 0;
insCompensationScore = 0;
}
@@ -662,14 +786,14 @@ void SplitAligner::calcBaseScores(unsigned i) {
}
// treat any query letters after the alignment as insertions:
- while (AmatB < AmatE) {
- *AmatB++ = firstInsScore;
- *DmatB++ = delScore + insCompensationScore;
+ while (matBeg < matEnd) {
+ *matBeg++ = delScore + insCompensationScore;
+ *matBeg++ = firstInsScore;
delScore = 0;
insCompensationScore = tweenInsScore;
}
- *DmatB++ = delScore;
+ *matBeg++ = delScore;
}
void SplitAligner::initRbegsAndEnds() {
@@ -922,24 +1046,24 @@ void SplitAligner::layout(std::vector<UnsplitAlignment>::const_iterator beg,
sortedAlnIndices.resize(numAlns);
for (unsigned i = 0; i < numAlns; ++i) sortedAlnIndices[i] = i;
- oldInplayAlnIndices.resize(numAlns);
newInplayAlnIndices.resize(numAlns);
- rBegs.resize(numAlns);
- rEnds.resize(numAlns);
-
- if (splicePrior > 0.0 || !chromosomeIndex.empty()) {
+ if (restartProb <= 0) {
+ oldInplayAlnIndices.resize(numAlns);
+ rBegs.resize(numAlns);
+ rEnds.resize(numAlns);
+ if (splicePrior > 0.0 || !chromosomeIndex.empty()) {
initRbegsAndEnds();
- //initRnameAndStrandIds();
+ }
+ initRnameAndStrandIds();
}
- initRnameAndStrandIds();
initDpBounds();
}
size_t SplitAligner::memory(bool isViterbi, bool isBothSpliceStrands) const {
size_t numOfStrands = isBothSpliceStrands ? 2 : 1;
- size_t x = 2 * sizeof(int) + 2 * sizeof(double);
+ size_t x = 2 * sizeof(int) + 2 * sizeof(float);
if (splicePrior > 0 || !chromosomeIndex.empty()) x += 2 * sizeof(unsigned);
if (!chromosomeIndex.empty()) x += 2;
if (isViterbi) x += sizeof(long) * numOfStrands;
@@ -948,10 +1072,13 @@ size_t SplitAligner::memory(bool isViterbi, bool isBothSpliceStrands) const {
}
void SplitAligner::initMatricesForOneQuery() {
- resizeMatrix(Amat);
- resizeMatrix(Dmat);
- resizeMatrix(Aexp);
- resizeMatrix(Dexp);
+ size_t doubleMatrixSize = cellsPerDpMatrix() * 2;
+ // The final cell per row is never used, because there's one less
+ // Aij than Dij per candidate alignment.
+ if (Smat.size() < doubleMatrixSize) {
+ Smat.resize(doubleMatrixSize);
+ Sexp.resize(doubleMatrixSize);
+ }
for (unsigned i = 0; i < numAlns; i++) calcBaseScores(i);
@@ -967,8 +1094,7 @@ void SplitAligner::initMatricesForOneQuery() {
for (unsigned i = 0; i < numAlns; ++i) initSpliceSignals(i);
}
- transform(Amat.begin(), Amat.end(), Aexp.begin(), scaledExp);
- transform(Dmat.begin(), Dmat.end(), Dexp.begin(), scaledExp);
+ std::transform(&Smat[0], &Smat[0] + doubleMatrixSize, &Sexp[0], scaledExp);
// if x/scale < about -745, then exp(x/scale) will be exactly 0.0
}
@@ -990,7 +1116,7 @@ double SplitAligner::spliceSignalStrandLogOdds() const {
assert(rescales.size() == rescalesRev.size());
double logOdds = 0;
for (unsigned j = 0; j < rescales.size(); ++j) {
- logOdds += std::log(rescales[j] / rescalesRev[j]);
+ logOdds += std::log(rescalesRev[j] / rescales[j]);
}
return logOdds;
}
=====================================
src/split/cbrc_split_aligner.hh
=====================================
@@ -74,7 +74,11 @@ public:
// Call this before viterbi/forward/backward, and after layout
void initMatricesForOneQuery();
- long viterbi(); // returns the optimal split-alignment score
+ long viterbi() { // returns the optimal split-alignment score
+ resizeMatrix(Vmat);
+ resizeVector(Vvec);
+ return (restartProb > 0) ? viterbiSplit() : viterbiSplice();
+ }
// Gets the chunks of an optimal split alignment.
// For each chunk, it gets:
@@ -91,9 +95,18 @@ public:
int segmentScore(unsigned alnNum,
unsigned queryBeg, unsigned queryEnd) const;
- void forward();
-
- void backward();
+ void forwardBackward() {
+ resizeVector(rescales);
+ resizeMatrix(Fmat);
+ resizeMatrix(Bmat);
+ if (restartProb > 0) {
+ forwardSplit();
+ backwardSplit();
+ } else {
+ forwardSplice();
+ backwardSplice();
+ }
+ }
// Returns one probability per column, for a segment of an alignment
std::vector<double> marginalProbs(unsigned queryBeg, unsigned alnNum,
@@ -139,12 +152,22 @@ private:
std::vector<unsigned> dpBegs; // dynamic programming begin coords
std::vector<unsigned> dpEnds; // dynamic programming end coords
std::vector<size_t> matrixRowOrigins; // layout of ragged matrices
- std::vector<int> Amat; // scores at query bases, for each candidate
- std::vector<int> Dmat; // scores between query bases, for each candidate
+
+ std::vector<int> Smat;
+ // Smat holds position-specific substitution, insertion, and
+ // deletion scores for the candidate alignments of one query
+ // sequence to a genome. These scores, for each candidate
+ // alignment i, are called Aij and Dij in [Frith&Kawaguchi 2015].
+ // Aij holds scores at query bases, in Smat[i][1,3,5,...,2n-1].
+ // Dij holds scores between query bases, in Smat[i][0,2,4,...,2n].
+
std::vector<long> Vmat; // DP matrix for Viterbi algorithm
std::vector<long> Vvec; // DP vector for Viterbi algorithm
- std::vector<double> Aexp;
- std::vector<double> Dexp;
+
+ std::vector<float> Sexp;
+ // Sexp holds exp(Smat / t): these values are called A'ij and D'ij
+ // in [Frith&Kawaguchi 2015].
+
std::vector<double> Fmat; // DP matrix for Forward algorithm
std::vector<double> Bmat; // DP matrix for Backward algorithm
std::vector<double> rescales; // the usual scaling for numerical stability
@@ -227,6 +250,14 @@ private:
unsigned& oldNumInplay,
unsigned& newNumInplay, unsigned j);
+ long viterbiSplit();
+ long viterbiSplice();
+
+ void forwardSplit();
+ void backwardSplit();
+ void forwardSplice();
+ void backwardSplice();
+
unsigned findScore(unsigned j, long score) const;
unsigned findSpliceScore(unsigned i, unsigned j, long score) const;
long scoreFromSplice(unsigned i, unsigned j, unsigned oldNumInplay,
@@ -264,9 +295,9 @@ private:
void resizeMatrix(T& m) const {
// This reserves size for a ragged matrix, which is actually
// stored in a flat vector. There are numAlns rows, and row i
- // has dpEnd(i) - dpBeg(i) + 1 cells. (The final cell per row
- // is used in some matrices but not others.)
- m.resize(cellsPerDpMatrix());
+ // has dpEnd(i) - dpBeg(i) + 1 cells.
+ size_t s = cellsPerDpMatrix();
+ if (m.size() < s) m.resize(s);
}
double probFromSpliceF(unsigned i, unsigned j, unsigned oldNumInplay,
=====================================
src/split/last-split.cc
=====================================
@@ -184,13 +184,11 @@ static void doOneQuery(std::vector<cbrc::UnsplitAlignment>::const_iterator beg,
sa.initMatricesForOneQuery();
if (opts.direction != 0) {
- sa.forward();
- sa.backward();
+ sa.forwardBackward();
}
if (opts.direction != 1) {
sa.flipSpliceSignals();
- sa.forward();
- sa.backward();
+ sa.forwardBackward();
sa.flipSpliceSignals();
}
=====================================
src/version.hh
=====================================
@@ -1 +1 @@
-"1047"
+"1060"
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/a03c3caa8895e6816352644f2d8854847c148dc1
--
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/a03c3caa8895e6816352644f2d8854847c148dc1
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200325/ca05e9fa/attachment-0001.html>
More information about the debian-med-commit
mailing list