[med-svn] [Git][med-team/last-align][upstream] New upstream version 1060

Steffen Möller gitlab at salsa.debian.org
Wed Mar 25 00:14:38 GMT 2020



Steffen Möller pushed to branch upstream at Debian Med / last-align


Commits:
a03c3caa by Steffen Moeller at 2020-03-24T21:22:19+01:00
New upstream version 1060
- - - - -


6 changed files:

- ChangeLog.txt
- src/split/cbrc_int_exponentiator.hh
- src/split/cbrc_split_aligner.cc
- src/split/cbrc_split_aligner.hh
- src/split/last-split.cc
- src/version.hh


Changes:

=====================================
ChangeLog.txt
=====================================
@@ -1,9 +1,68 @@
+2020-03-16  Martin C. Frith  <Martin C. Frith>
+
+	* src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+	Reduce last-split memory & time usage a bit
+	[ca3f417edc01] [tip]
+
+	* src/split/cbrc_int_exponentiator.hh:
+	Make last-split a bit faster
+	[a380494ff77b]
+
+	* src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+	Make last-split a bit faster
+	[0b3571f8e14b]
+
+2020-03-13  Martin C. Frith  <Martin C. Frith>
+
+	* src/split/cbrc_split_aligner.cc:
+	Refactor
+	[ff73ca674104]
+
+	* src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+	Make last-split much faster
+	[6f4ec587fe25]
+
+	* src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+	Refactor
+	[3887935792ab]
+
+2020-03-11  Martin C. Frith  <Martin C. Frith>
+
+	* src/split/cbrc_split_aligner.cc:
+	Make last-split a bit faster
+	[8621c8e512ff]
+
+	* src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh:
+	Try to make last-split faster
+	[13c4e8b892dc]
+
+	* src/split/cbrc_split_aligner.cc, src/split/cbrc_split_aligner.hh,
+	src/split/last-split.cc:
+	Make last-split a bit faster
+	[ba6739cb6e7b]
+
+	* src/split/cbrc_split_aligner.cc, test/last-split-test.out:
+	Tiny change to last-split probabilities
+	[eb537293aa74]
+
+	* src/split/cbrc_split_aligner.cc:
+	Refactor
+	[cf3b0bf3dac8]
+
+	* src/split/cbrc_split_aligner.cc:
+	Refactor
+	[c0681ee6f1fd]
+
+	* src/split/cbrc_split_aligner.cc:
+	Refactor (rename a variable)
+	[c62c2b618d94]
+
 2020-01-20  Martin C. Frith  <Martin C. Frith>
 
 	* scripts/last-postmask, test/last-postmask-test.out, test/last-
 	postmask-test.sh:
 	last-postmask: fix crash on length-1 DB name
-	[1015a8df8312] [tip]
+	[1015a8df8312]
 
 	* scripts/last-train:
 	last-train: fix occasional crashes since v1020


=====================================
src/split/cbrc_int_exponentiator.hh
=====================================
@@ -17,9 +17,23 @@ public:
   void setBase(double b) {
     base = b;
     invBase = 1.0 / b;
+
+    double y = 1;
+    double z = 1;
+    for (int i = 0; i < 128; ++i) {
+      lookup[128 + i] = y;
+      y *= base;
+      z *= invBase;
+      lookup[127 - i] = z;
+    }
   }
 
   double operator()(int exponent) const {
+    unsigned u = exponent + 128;
+    if (u < 256) {
+      return lookup[u];
+    }
+
     unsigned n = exponent;
     double x;
     if (exponent >= 0) {
@@ -40,6 +54,7 @@ public:
 private:
   double base;
   double invBase;
+  double lookup[256];
 };
 
 }


=====================================
src/split/cbrc_split_aligner.cc
=====================================
@@ -23,6 +23,30 @@ template<typename T, int N> T arrayMax(T (&array)[N]) {
   return *std::max_element(array, array + N);
 }
 
+// Orders candidate alignments by increasing DP start coordinate.
+// Breaks ties by decreasing DP end coordinate.
+struct DpBegLess {
+  DpBegLess(const unsigned *b, const unsigned *e) : dpBegs(b), dpEnds(e) {}
+  bool operator()(unsigned a, unsigned b) const {
+    return
+      dpBegs[a] != dpBegs[b] ? dpBegs[a] < dpBegs[b] : dpEnds[a] > dpEnds[b];
+  }
+  const unsigned *dpBegs;
+  const unsigned *dpEnds;
+};
+
+// Orders candidate alignments by decreasing DP end coordinate.
+// Breaks ties by increasing DP start coordinate.
+struct DpEndLess {
+  DpEndLess(const unsigned *b, const unsigned *e) : dpBegs(b), dpEnds(e) {}
+  bool operator()(unsigned a, unsigned b) const {
+    return
+      dpEnds[a] != dpEnds[b] ? dpEnds[a] > dpEnds[b] : dpBegs[a] < dpBegs[b];
+  }
+  const unsigned *dpBegs;
+  const unsigned *dpEnds;
+};
+
 // Orders candidate alignments by increasing DP start coordinate.
 // Breaks ties by chromosome & strand, then by increasing genomic
 // start coordinate.
@@ -104,9 +128,7 @@ void mergeInto(unsigned* beg1,
 	       const unsigned* end2,
 	       T lessFunc) {
   unsigned* end3 = end1 + (end2 - beg2);
-  for (;;) {
-    if (beg2 == end2)
-      break;
+  while (end2 != beg2) {
     if (beg1 == end1) {
       std::copy(beg2, end2, beg1);
       break;
@@ -296,24 +318,60 @@ void SplitAligner::updateInplayAlnIndicesB(unsigned& sortedAlnPos,
   newNumInplay = (newEnd - newBeg) + (sortedAlnPos - sortedAlnOldPos);
 }
 
-long SplitAligner::viterbi() {
-    resizeMatrix(Vmat);
-    resizeVector(Vvec);
+long SplitAligner::viterbiSplit() {
+  unsigned *inplayAlnBeg = &newInplayAlnIndices[0];
+  unsigned *inplayAlnEnd = inplayAlnBeg;
+  unsigned *sortedAlnPtr = &sortedAlnIndices[0];
+  unsigned *sortedAlnEnd = sortedAlnPtr + numAlns;
 
-    for (unsigned i = 0; i < numAlns; ++i) cell(Vmat, i, dpBeg(i)) = INT_MIN/2;
-    long maxScore = 0;
-    cell(Vvec, minBeg) = maxScore;
-    long scoreFromJump = restartScore;
+  std::stable_sort(sortedAlnPtr, sortedAlnEnd,
+		   DpBegLess(&dpBegs[0], &dpEnds[0]));
 
-    stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
-		QbegLess(&dpBegs[0], &rnameAndStrandIds[0], &rBegs[0]));
+  for (unsigned i = 0; i < numAlns; ++i) cell(Vmat, i, dpBeg(i)) = INT_MIN/2;
+  long maxScore = 0;
+
+  for (unsigned j = minBeg; j < maxEnd; j++) {
+    while (inplayAlnEnd > inplayAlnBeg && dpEnd(inplayAlnEnd[-1]) == j) {
+      --inplayAlnEnd;  // it is no longer "in play"
+    }
+    const unsigned *sortedAlnBeg = sortedAlnPtr;
+    while (sortedAlnPtr < sortedAlnEnd && dpBeg(*sortedAlnPtr) == j) {
+      ++sortedAlnPtr;
+    }
+    mergeInto(inplayAlnBeg, inplayAlnEnd, sortedAlnBeg, sortedAlnPtr,
+	      DpEndLess(&dpBegs[0], &dpEnds[0]));
+    inplayAlnEnd += sortedAlnPtr - sortedAlnBeg;
+
+    cell(Vvec, j) = maxScore;
+    long scoreFromJump = maxScore + restartScore;
+    for (const unsigned *x = inplayAlnBeg; x < inplayAlnEnd; ++x) {
+      size_t ij = matrixRowOrigins[*x] + j;
+      long s = std::max(scoreFromJump, Vmat[ij] + Smat[ij*2]) + Smat[ij*2+1];
+      Vmat[ij + 1] = s;
+      maxScore = std::max(maxScore, s);
+    }
+  }
+
+  cell(Vvec, maxEnd) = maxScore;
+  return maxScore;
+}
+
+long SplitAligner::viterbiSplice() {
     unsigned sortedAlnPos = 0;
     unsigned oldNumInplay = 0;
     unsigned newNumInplay = 0;
 
+    stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
+		QbegLess(&dpBegs[0], &rnameAndStrandIds[0], &rBegs[0]));
+
+    for (unsigned i = 0; i < numAlns; ++i) cell(Vmat, i, dpBeg(i)) = INT_MIN/2;
+    long maxScore = 0;
+    long scoreFromJump = restartScore;
+
     for (unsigned j = minBeg; j < maxEnd; j++) {
 	updateInplayAlnIndicesF(sortedAlnPos, oldNumInplay, newNumInplay, j);
 	unsigned oldInplayPos = 0;
+	cell(Vvec, j) = maxScore;
 	long sMax = INT_MIN/2;
 	for (unsigned x = 0; x < newNumInplay; ++x) {
 	    unsigned i = newInplayAlnIndices[x];
@@ -324,19 +382,19 @@ long SplitAligner::viterbi() {
 	      s = std::max(s,
 			   scoreFromSplice(i, j, oldNumInplay, oldInplayPos));
 	    s += spliceEndScore(ij);
-	    s = std::max(s, Vmat[ij] + Dmat[ij]);
-	    if (restartProb <= 0 && alns[i].qstart == j && s < 0) s = 0;
-	    s += Amat[ij];
+	    s = std::max(s, Vmat[ij] + Smat[ij*2]);
+	    if (alns[i].qstart == j && s < 0) s = 0;
+	    s += Smat[ij*2+1];
 
 	    Vmat[ij + 1] = s;
 	    sMax = std::max(sMax, s + spliceBegScore(ij + 1));
 	}
 	maxScore = std::max(sMax, maxScore);
-	cell(Vvec, j+1) = maxScore;
 	scoreFromJump = std::max(sMax + jumpScore, maxScore + restartScore);
     }
 
-    return (restartProb > 0) ? maxScore : endScore();
+    cell(Vvec, maxEnd) = maxScore;
+    return endScore();
 }
 
 long SplitAligner::endScore() const {
@@ -377,7 +435,7 @@ void SplitAligner::traceBack(long viterbiScore,
   for (;;) {
     --j;
     size_t ij = matrixRowOrigins[i] + j;
-    long score = Vmat[ij + 1] - Amat[ij];
+    long score = Vmat[ij + 1] - Smat[ij*2+1];
     if (restartProb <= 0 && alns[i].qstart == j && score == 0) {
       queryBegs.push_back(j);
       return;
@@ -389,7 +447,7 @@ void SplitAligner::traceBack(long viterbiScore,
     // makes some other kinds of boundary less clean.  What's the best
     // procedure for tied scores?
 
-    bool isStay = (score == Vmat[ij] + Dmat[ij]);
+    bool isStay = (score == Vmat[ij] + Smat[ij*2]);
     if (isStay && alns[i].isForwardStrand()) continue;
 
     long s = score - spliceEndScore(ij);
@@ -417,8 +475,8 @@ int SplitAligner::segmentScore(unsigned alnNum,
   unsigned i = alnNum;
   for (unsigned j = queryBeg; j < queryEnd; ++j) {
     size_t ij = matrixRowOrigins[i] + j;
-    score += Amat[ij];
-    if (j > queryBeg) score += Dmat[ij];
+    score += Smat[ij*2+1];
+    if (j > queryBeg) score += Smat[ij*2];
   }
   return score;
 }
@@ -483,30 +541,69 @@ double SplitAligner::probFromSpliceB(unsigned i, unsigned j,
   return sum;
 }
 
-void SplitAligner::forward() {
-    resizeVector(rescales);
-    cell(rescales, minBeg) = 1.0;
+void SplitAligner::forwardSplit() {
+  unsigned *inplayAlnBeg = &newInplayAlnIndices[0];
+  unsigned *inplayAlnEnd = inplayAlnBeg;
+  unsigned *sortedAlnPtr = &sortedAlnIndices[0];
+  unsigned *sortedAlnEnd = sortedAlnPtr + numAlns;
 
-    resizeMatrix(Fmat);
-    for (unsigned i = 0; i < numAlns; ++i) cell(Fmat, i, dpBeg(i)) = 0.0;
-    double sumProb = 1;
-    double probFromJump = restartProb;
-    double begprob = 1.0;
-    double zF = 0.0;  // sum of probabilities from the forward algorithm
+  std::stable_sort(sortedAlnPtr, sortedAlnEnd,
+		   DpBegLess(&dpBegs[0], &dpEnds[0]));
 
-    stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
-		QbegLess(&dpBegs[0], &rnameAndStrandIds[0], &rBegs[0]));
+  for (unsigned i = 0; i < numAlns; ++i) cell(Fmat, i, dpBeg(i)) = 0.0;
+  double sumOfProbs = 1;
+  double rescale = 1;
+
+  for (unsigned j = minBeg; j < maxEnd; j++) {
+    while (inplayAlnEnd > inplayAlnBeg && dpEnd(inplayAlnEnd[-1]) == j) {
+      --inplayAlnEnd;  // it is no longer "in play"
+    }
+    const unsigned *sortedAlnBeg = sortedAlnPtr;
+    while (sortedAlnPtr < sortedAlnEnd && dpBeg(*sortedAlnPtr) == j) {
+      ++sortedAlnPtr;
+    }
+    mergeInto(inplayAlnBeg, inplayAlnEnd, sortedAlnBeg, sortedAlnPtr,
+	      DpEndLess(&dpBegs[0], &dpEnds[0]));
+    inplayAlnEnd += sortedAlnPtr - sortedAlnBeg;
+
+    cell(rescales, j) = rescale;
+    double probFromJump = sumOfProbs * restartProb;
+    double pSum = 0.0;
+    for (const unsigned *x = inplayAlnBeg; x < inplayAlnEnd; ++x) {
+      size_t ij = matrixRowOrigins[*x] + j;
+      double p =
+	(probFromJump + Fmat[ij] * Sexp[ij*2]) * Sexp[ij*2+1] * rescale;
+      Fmat[ij + 1] = p;
+      pSum += p;
+    }
+    sumOfProbs = pSum + sumOfProbs * rescale;
+    rescale = 1 / (pSum + 1);
+  }
+
+  cell(rescales, maxEnd) = 1 / sumOfProbs;  // makes scaled sumOfProbs equal 1
+}
+
+void SplitAligner::forwardSplice() {
     unsigned sortedAlnPos = 0;
     unsigned oldNumInplay = 0;
     unsigned newNumInplay = 0;
 
+    stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
+		QbegLess(&dpBegs[0], &rnameAndStrandIds[0], &rBegs[0]));
+
+    for (unsigned i = 0; i < numAlns; ++i) cell(Fmat, i, dpBeg(i)) = 0.0;
+    double probFromJump = 0;
+    double begprob = 1.0;
+    double zF = 0.0;  // sum of probabilities from the forward algorithm
+    double rescale = 1;
+
     for (unsigned j = minBeg; j < maxEnd; j++) {
 	updateInplayAlnIndicesF(sortedAlnPos, oldNumInplay, newNumInplay, j);
 	unsigned oldInplayPos = 0;
-	double r = cell(rescales, j);
-	zF /= r;
+	cell(rescales, j) = rescale;
+	zF *= rescale;
 	double pSum = 0.0;
-	double rNew = 1.0;
+	double rNew = 0.0;
 	for (unsigned x = 0; x < newNumInplay; ++x) {
 	    unsigned i = newInplayAlnIndices[x];
 	    size_t ij = matrixRowOrigins[i] + j;
@@ -515,45 +612,77 @@ void SplitAligner::forward() {
 	    if (splicePrior > 0.0)
 	      p += probFromSpliceF(i, j, oldNumInplay, oldInplayPos);
 	    p *= spliceEndProb(ij);
-	    p += Fmat[ij] * Dexp[ij];
-	    if (restartProb <= 0 && alns[i].qstart == j) p += begprob;
-	    p = p * Aexp[ij] / r;
+	    p += Fmat[ij] * Sexp[ij*2];
+	    if (alns[i].qstart == j) p += begprob;
+	    p = p * Sexp[ij*2+1] * rescale;
 
 	    Fmat[ij + 1] = p;
 	    if (alns[i].qend == j+1) zF += p;
 	    pSum += p * spliceBegProb(ij + 1);
 	    rNew += p;
         }
-        begprob /= r;
-        cell(rescales, j+1) = rNew;
-	sumProb = pSum + sumProb / r;
-	probFromJump = pSum * jumpProb + sumProb * restartProb;
+        begprob *= rescale;
+	probFromJump = pSum * jumpProb;
+	rescale = 1 / (rNew + 1);
     }
 
-    if (restartProb > 0) zF = sumProb;
-    //zF /= cell(rescales, maxEnd);
-    cell(rescales, maxEnd) = zF;  // this causes scaled zF to equal 1
+    cell(rescales, maxEnd) = 1 / zF;  // this causes scaled zF to equal 1
 }
 
-void SplitAligner::backward() {
-    resizeMatrix(Bmat);
-    for (unsigned i = 0; i < numAlns; ++i) cell(Bmat, i, dpEnd(i)) = 0.0;
-    double sumProb = (restartProb > 0) ? 1 : 0;
-    double probFromJump = sumProb;
-    double endprob = 1.0;
-    //double zB = 0.0;  // sum of probabilities from the backward algorithm
+void SplitAligner::backwardSplit() {
+  unsigned *inplayAlnBeg = &newInplayAlnIndices[0];
+  unsigned *inplayAlnEnd = inplayAlnBeg;
+  unsigned *sortedAlnPtr = &sortedAlnIndices[0];
+  unsigned *sortedAlnEnd = sortedAlnPtr + numAlns;
 
-    stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
-		QendLess(&dpEnds[0], &rnameAndStrandIds[0], &rEnds[0]));
+  std::stable_sort(sortedAlnPtr, sortedAlnEnd,
+		   DpEndLess(&dpBegs[0], &dpEnds[0]));
+
+  for (unsigned i = 0; i < numAlns; ++i) cell(Bmat, i, dpEnd(i)) = 0.0;
+  double sumOfProbs = 1;
+
+  for (unsigned j = maxEnd; j > minBeg; j--) {
+    while (inplayAlnEnd > inplayAlnBeg && dpBeg(inplayAlnEnd[-1]) == j) {
+      --inplayAlnEnd;  // it is no longer "in play"
+    }
+    const unsigned *sortedAlnBeg = sortedAlnPtr;
+    while (sortedAlnPtr < sortedAlnEnd && dpEnd(*sortedAlnPtr) == j) {
+      ++sortedAlnPtr;
+    }
+    mergeInto(inplayAlnBeg, inplayAlnEnd, sortedAlnBeg, sortedAlnPtr,
+	      DpBegLess(&dpBegs[0], &dpEnds[0]));
+    inplayAlnEnd += sortedAlnPtr - sortedAlnBeg;
+
+    double rescale = cell(rescales, j);
+    double pSum = 0.0;
+    for (const unsigned *x = inplayAlnBeg; x < inplayAlnEnd; ++x) {
+      size_t ij = matrixRowOrigins[*x] + j;
+      double p = (sumOfProbs + Bmat[ij] * Sexp[ij*2]) * Sexp[ij*2-1] * rescale;
+      Bmat[ij - 1] = p;
+      pSum += p;
+    }
+    sumOfProbs = pSum * restartProb + sumOfProbs * rescale;
+  }
+}
+
+void SplitAligner::backwardSplice() {
     unsigned sortedAlnPos = 0;
     unsigned oldNumInplay = 0;
     unsigned newNumInplay = 0;
 
+    stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
+		QendLess(&dpEnds[0], &rnameAndStrandIds[0], &rEnds[0]));
+
+    for (unsigned i = 0; i < numAlns; ++i) cell(Bmat, i, dpEnd(i)) = 0.0;
+    double probFromJump = 0;
+    double endprob = 1.0;
+    //double zB = 0.0;  // sum of probabilities from the backward algorithm
+
     for (unsigned j = maxEnd; j > minBeg; j--) {
 	updateInplayAlnIndicesB(sortedAlnPos, oldNumInplay, newNumInplay, j);
 	unsigned oldInplayPos = 0;
-	double r = cell(rescales, j);
-	//zB /= r;
+	double rescale = cell(rescales, j);
+	//zB *= rescale;
 	double pSum = 0.0;
 	for (unsigned x = 0; x < newNumInplay; ++x) {
 	    unsigned i = newInplayAlnIndices[x];
@@ -563,48 +692,44 @@ void SplitAligner::backward() {
 	    if (splicePrior > 0.0)
 	      p += probFromSpliceB(i, j, oldNumInplay, oldInplayPos);
 	    p *= spliceBegProb(ij);
-	    p += Bmat[ij] * Dexp[ij];
-	    if (restartProb <= 0 && alns[i].qend == j) p += endprob;
-	    p = p * Aexp[ij - 1] / r;
+	    p += Bmat[ij] * Sexp[ij*2];
+	    if (alns[i].qend == j) p += endprob;
+	    p = p * Sexp[ij*2-1] * rescale;
 
 	    Bmat[ij - 1] = p;
 	    //if (alns[i].qstart == j-1) zB += p;
 	    pSum += p * spliceEndProb(ij - 1);
         }
-        endprob /= r;
-	sumProb = pSum * restartProb + sumProb / r;
-	probFromJump = pSum * jumpProb + sumProb;
+        endprob *= rescale;
+	probFromJump = pSum * jumpProb;
     }
-
-    //if (restartProb > 0) zB = sumProb;
-    //zB /= cell(rescales, minBeg);
 }
 
 std::vector<double>
 SplitAligner::marginalProbs(unsigned queryBeg, unsigned alnNum,
 			    unsigned alnBeg, unsigned alnEnd) const {
-    std::vector<double> output;
-    unsigned i = alnNum;
-    unsigned j = queryBeg;
-    for (unsigned pos = alnBeg; pos < alnEnd; ++pos) {
-	size_t ij = matrixRowOrigins[i] + j;
-        if (alns[i].qalign[pos] == '-') {
-            double value = Fmat[ij] * Bmat[ij] * Dexp[ij] / cell(rescales, j);
-            output.push_back(value);
-        } else {
-            double value = Fmat[ij + 1] * Bmat[ij] / Aexp[ij];
-            if (value != value) value = 0.0;
-            output.push_back(value);
-            j++;
-        }
+  std::vector<double> output;
+  unsigned i = alnNum;
+  unsigned j = queryBeg;
+  for (unsigned pos = alnBeg; pos < alnEnd; ++pos) {
+    size_t ij = matrixRowOrigins[i] + j;
+    if (alns[i].qalign[pos] == '-') {
+      double value = Fmat[ij] * Bmat[ij] * Sexp[ij*2] * cell(rescales, j);
+      output.push_back(value);
+    } else {
+      double value = Fmat[ij + 1] * Bmat[ij] / Sexp[ij*2+1];
+      if (value != value) value = 0.0;
+      output.push_back(value);
+      j++;
     }
-    return output;
+  }
+  return output;
 }
 
 // The next routine represents affine gap scores in a cunning way.
-// Amat holds scores at query bases, and at every base that is aligned
+// Aij holds scores at query bases, and at every base that is aligned
 // to a gap it gets a score of insExistenceScore + insExtensionScore.
-// Dmat holds scores between query bases, and between every pair of
+// Dij holds scores between query bases, and between every pair of
 // bases that are both aligned to gaps it gets a score of
 // -insExistenceScore.  This produces suitable affine gap scores, even
 // if we jump from one alignment to another in the middle of a gap.
@@ -616,18 +741,17 @@ void SplitAligner::calcBaseScores(unsigned i) {
   const UnsplitAlignment& a = alns[i];
   const size_t origin = matrixRowOrigins[i];
 
-  int *AmatB = &Amat[origin + dpBeg(i)];
-  int *DmatB = &Dmat[origin + dpBeg(i)];
-  int *AmatS = &Amat[origin + a.qstart];
-  int *AmatE = &Amat[origin + dpEnd(i)];
+  int *matBeg = &Smat[(origin + dpBeg(i)) * 2];
+  int *alnBeg = &Smat[(origin + a.qstart) * 2];
+  int *matEnd = &Smat[(origin + dpEnd(i)) * 2];
 
   int delScore = 0;
   int insCompensationScore = 0;
 
   // treat any query letters before the alignment as insertions:
-  while (AmatB < AmatS) {
-    *AmatB++ = firstInsScore;
-    *DmatB++ = delScore + insCompensationScore;
+  while (matBeg < alnBeg) {
+    *matBeg++ = delScore + insCompensationScore;
+    *matBeg++ = firstInsScore;
     delScore = 0;
     insCompensationScore = tweenInsScore;
   }
@@ -641,8 +765,8 @@ void SplitAligner::calcBaseScores(unsigned i) {
     unsigned char y = *qAlign++;
     int q = qQual ? (*qQual++ - qualityOffset) : (numQualCodes - 1);
     if (x == '-') {  // gap in reference sequence: insertion
-      *AmatB++ = firstInsScore;
-      *DmatB++ = delScore + insCompensationScore;
+      *matBeg++ = delScore + insCompensationScore;
+      *matBeg++ = firstInsScore;
       delScore = 0;
       insCompensationScore = tweenInsScore;
     } else if (y == '-') {  // gap in query sequence: deletion
@@ -652,8 +776,8 @@ void SplitAligner::calcBaseScores(unsigned i) {
     } else {
       assert(q >= 0);
       if (q >= numQualCodes) q = numQualCodes - 1;
-      *AmatB++ = score_mat[x % 64][y % 64][q];
-      *DmatB++ = delScore;
+      *matBeg++ = delScore;
+      *matBeg++ = score_mat[x % 64][y % 64][q];
       delScore = 0;
       insCompensationScore = 0;
     }
@@ -662,14 +786,14 @@ void SplitAligner::calcBaseScores(unsigned i) {
   }
 
   // treat any query letters after the alignment as insertions:
-  while (AmatB < AmatE) {
-    *AmatB++ = firstInsScore;
-    *DmatB++ = delScore + insCompensationScore;
+  while (matBeg < matEnd) {
+    *matBeg++ = delScore + insCompensationScore;
+    *matBeg++ = firstInsScore;
     delScore = 0;
     insCompensationScore = tweenInsScore;
   }
 
-  *DmatB++ = delScore;
+  *matBeg++ = delScore;
 }
 
 void SplitAligner::initRbegsAndEnds() {
@@ -922,24 +1046,24 @@ void SplitAligner::layout(std::vector<UnsplitAlignment>::const_iterator beg,
 
     sortedAlnIndices.resize(numAlns);
     for (unsigned i = 0; i < numAlns; ++i) sortedAlnIndices[i] = i;
-    oldInplayAlnIndices.resize(numAlns);
     newInplayAlnIndices.resize(numAlns);
 
-    rBegs.resize(numAlns);
-    rEnds.resize(numAlns);
-
-    if (splicePrior > 0.0 || !chromosomeIndex.empty()) {
+    if (restartProb <= 0) {
+      oldInplayAlnIndices.resize(numAlns);
+      rBegs.resize(numAlns);
+      rEnds.resize(numAlns);
+      if (splicePrior > 0.0 || !chromosomeIndex.empty()) {
 	initRbegsAndEnds();
-	//initRnameAndStrandIds();
+      }
+      initRnameAndStrandIds();
     }
-    initRnameAndStrandIds();
 
     initDpBounds();
 }
 
 size_t SplitAligner::memory(bool isViterbi, bool isBothSpliceStrands) const {
   size_t numOfStrands = isBothSpliceStrands ? 2 : 1;
-  size_t x = 2 * sizeof(int) + 2 * sizeof(double);
+  size_t x = 2 * sizeof(int) + 2 * sizeof(float);
   if (splicePrior > 0 || !chromosomeIndex.empty()) x += 2 * sizeof(unsigned);
   if (!chromosomeIndex.empty()) x += 2;
   if (isViterbi) x += sizeof(long) * numOfStrands;
@@ -948,10 +1072,13 @@ size_t SplitAligner::memory(bool isViterbi, bool isBothSpliceStrands) const {
 }
 
 void SplitAligner::initMatricesForOneQuery() {
-  resizeMatrix(Amat);
-  resizeMatrix(Dmat);
-  resizeMatrix(Aexp);
-  resizeMatrix(Dexp);
+  size_t doubleMatrixSize = cellsPerDpMatrix() * 2;
+  // The final cell per row is never used, because there's one less
+  // Aij than Dij per candidate alignment.
+  if (Smat.size() < doubleMatrixSize) {
+    Smat.resize(doubleMatrixSize);
+    Sexp.resize(doubleMatrixSize);
+  }
 
   for (unsigned i = 0; i < numAlns; i++) calcBaseScores(i);
 
@@ -967,8 +1094,7 @@ void SplitAligner::initMatricesForOneQuery() {
     for (unsigned i = 0; i < numAlns; ++i) initSpliceSignals(i);
   }
 
-  transform(Amat.begin(), Amat.end(), Aexp.begin(), scaledExp);
-  transform(Dmat.begin(), Dmat.end(), Dexp.begin(), scaledExp);
+  std::transform(&Smat[0], &Smat[0] + doubleMatrixSize, &Sexp[0], scaledExp);
   // if x/scale < about -745, then exp(x/scale) will be exactly 0.0
 }
 
@@ -990,7 +1116,7 @@ double SplitAligner::spliceSignalStrandLogOdds() const {
   assert(rescales.size() == rescalesRev.size());
   double logOdds = 0;
   for (unsigned j = 0; j < rescales.size(); ++j) {
-    logOdds += std::log(rescales[j] / rescalesRev[j]);
+    logOdds += std::log(rescalesRev[j] / rescales[j]);
   }
   return logOdds;
 }


=====================================
src/split/cbrc_split_aligner.hh
=====================================
@@ -74,7 +74,11 @@ public:
     // Call this before viterbi/forward/backward, and after layout
     void initMatricesForOneQuery();
 
-    long viterbi();  // returns the optimal split-alignment score
+    long viterbi() {  // returns the optimal split-alignment score
+      resizeMatrix(Vmat);
+      resizeVector(Vvec);
+      return (restartProb > 0) ? viterbiSplit() : viterbiSplice();
+    }
 
     // Gets the chunks of an optimal split alignment.
     // For each chunk, it gets:
@@ -91,9 +95,18 @@ public:
     int segmentScore(unsigned alnNum,
 		     unsigned queryBeg, unsigned queryEnd) const;
 
-    void forward();
-
-    void backward();
+    void forwardBackward() {
+      resizeVector(rescales);
+      resizeMatrix(Fmat);
+      resizeMatrix(Bmat);
+      if (restartProb > 0) {
+	forwardSplit();
+	backwardSplit();
+      } else {
+	forwardSplice();
+	backwardSplice();
+      }
+    }
 
     // Returns one probability per column, for a segment of an alignment
     std::vector<double> marginalProbs(unsigned queryBeg, unsigned alnNum,
@@ -139,12 +152,22 @@ private:
     std::vector<unsigned> dpBegs;  // dynamic programming begin coords
     std::vector<unsigned> dpEnds;  // dynamic programming end coords
     std::vector<size_t> matrixRowOrigins;  // layout of ragged matrices
-    std::vector<int> Amat;  // scores at query bases, for each candidate
-    std::vector<int> Dmat;  // scores between query bases, for each candidate
+
+    std::vector<int> Smat;
+    // Smat holds position-specific substitution, insertion, and
+    // deletion scores for the candidate alignments of one query
+    // sequence to a genome.  These scores, for each candidate
+    // alignment i, are called Aij and Dij in [Frith&Kawaguchi 2015].
+    // Aij holds scores at query bases, in Smat[i][1,3,5,...,2n-1].
+    // Dij holds scores between query bases, in Smat[i][0,2,4,...,2n].
+
     std::vector<long> Vmat;  // DP matrix for Viterbi algorithm
     std::vector<long> Vvec;  // DP vector for Viterbi algorithm
-    std::vector<double> Aexp;
-    std::vector<double> Dexp;
+
+    std::vector<float> Sexp;
+    // Sexp holds exp(Smat / t): these values are called A'ij and D'ij
+    // in [Frith&Kawaguchi 2015].
+
     std::vector<double> Fmat;  // DP matrix for Forward algorithm
     std::vector<double> Bmat;  // DP matrix for Backward algorithm
     std::vector<double> rescales;  // the usual scaling for numerical stability
@@ -227,6 +250,14 @@ private:
 				 unsigned& oldNumInplay,
 				 unsigned& newNumInplay, unsigned j);
 
+    long viterbiSplit();
+    long viterbiSplice();
+
+    void forwardSplit();
+    void backwardSplit();
+    void forwardSplice();
+    void backwardSplice();
+
     unsigned findScore(unsigned j, long score) const;
     unsigned findSpliceScore(unsigned i, unsigned j, long score) const;
     long scoreFromSplice(unsigned i, unsigned j, unsigned oldNumInplay,
@@ -264,9 +295,9 @@ private:
     void resizeMatrix(T& m) const {
       // This reserves size for a ragged matrix, which is actually
       // stored in a flat vector.  There are numAlns rows, and row i
-      // has dpEnd(i) - dpBeg(i) + 1 cells.  (The final cell per row
-      // is used in some matrices but not others.)
-      m.resize(cellsPerDpMatrix());
+      // has dpEnd(i) - dpBeg(i) + 1 cells.
+      size_t s = cellsPerDpMatrix();
+      if (m.size() < s) m.resize(s);
     }
 
     double probFromSpliceF(unsigned i, unsigned j, unsigned oldNumInplay,


=====================================
src/split/last-split.cc
=====================================
@@ -184,13 +184,11 @@ static void doOneQuery(std::vector<cbrc::UnsplitAlignment>::const_iterator beg,
   sa.initMatricesForOneQuery();
 
   if (opts.direction != 0) {
-    sa.forward();
-    sa.backward();
+    sa.forwardBackward();
   }
   if (opts.direction != 1) {
     sa.flipSpliceSignals();
-    sa.forward();
-    sa.backward();
+    sa.forwardBackward();
     sa.flipSpliceSignals();
   }
 


=====================================
src/version.hh
=====================================
@@ -1 +1 @@
-"1047"
+"1060"



View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/a03c3caa8895e6816352644f2d8854847c148dc1

-- 
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/a03c3caa8895e6816352644f2d8854847c148dc1
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200325/ca05e9fa/attachment-0001.html>


More information about the debian-med-commit mailing list