[med-svn] [Git][med-team/last-align][upstream] New upstream version 1243
Nilesh Patra (@nilesh)
gitlab at salsa.debian.org
Fri May 28 18:07:29 BST 2021
Nilesh Patra pushed to branch upstream at Debian Med / last-align
Commits:
cf76fa7e by Nilesh Patra at 2021-05-28T22:21:44+05:30
New upstream version 1243
- - - - -
20 changed files:
- bin/last-train
- doc/last-cookbook.rst
- doc/last-split.rst
- src/Alignment.cc
- src/Centroid.cc
- src/Centroid.hh
- src/LastalArguments.cc
- src/makefile
- src/split/cbrc_unsplit_alignment.cc
- src/split/cbrc_unsplit_alignment.hh
- src/split/last-split-main.cc
- src/split/last-split.cc
- src/split/last-split.hh
- + test/alli.fa
- + test/huma.fa
- test/last-split-test.out
- test/last-split-test.sh
- test/last-test.out
- test/last-test.sh
- + test/split1.maf
Changes:
=====================================
bin/last-train
=====================================
@@ -299,7 +299,7 @@ def balancedEndProb(*args):
return rootOfIncreasingFunction(probImbalance,
lowerBound, upperBound, args)
-def gapProbsFromCounts(counts, opts):
+def gapProbsFromCounts(counts, opts, maxGapGrowProb):
matches, deletes, inserts, delOpens, insOpens, alignments = counts
gaps = deletes + inserts
gapOpens = delOpens + insOpens
@@ -329,6 +329,8 @@ def gapProbsFromCounts(counts, opts):
print("# delExtendProb: %g" % delGrowProb)
print("# insExtendProb: %g" % insGrowProb)
print()
+ delGrowProb = min(delGrowProb, maxGapGrowProb)
+ insGrowProb = min(insGrowProb, maxGapGrowProb)
return matchProb, (delOpenProb, delGrowProb), (insOpenProb, insGrowProb)
def gapRatiosFromProbs(matchProb, delProbs, insProbs):
@@ -366,13 +368,12 @@ def matScoresFromProbs(scale, matchRatio, matProbs, rowProbs, colProbs):
return [[scoreFromLetterProbs(scale, matchRatio, matProbs[i][j], x, y)
for j, y in enumerate(colProbs)] for i, x in enumerate(rowProbs)]
-def gapCostsFromProbRatios(scale, isBigScale, firstGapRatio, gapExtendRatio):
+def gapCostsFromProbRatios(scale, firstGapRatio, gapExtendRatio):
# The next addition gets the alignment parameter from the path
# parameters, as in Supplementary section 3.1 of [Fri19]:
gapExtendRatio += firstGapRatio
- # This isBigScale aims to avoid ALP failures, and improve speed
- firstGapCost = max(costFromProb(scale, firstGapRatio), 1 + isBigScale)
- gapExtendCost = max(costFromProb(scale, gapExtendRatio), 1 + isBigScale)
+ firstGapCost = max(costFromProb(scale, firstGapRatio), 1)
+ gapExtendCost = max(costFromProb(scale, gapExtendRatio), 1)
return firstGapCost, gapExtendCost
def imbalanceFromGap(scale, firstGapCost, gapExtendCost):
@@ -412,11 +413,11 @@ def balancedScale(imbalanceFunc, nearScale, args):
oldUpper = newUpper
return 0.0
-def scoresAndScale(originalScale, matParams, delRatios, insRatios, isInner):
+def scoresAndScale(originalScale, matParams, delRatios, insRatios):
while True:
matScores = matScoresFromProbs(originalScale, *matParams)
- delCosts = gapCostsFromProbRatios(originalScale, isInner, *delRatios)
- insCosts = gapCostsFromProbRatios(originalScale, isInner, *insRatios)
+ delCosts = gapCostsFromProbRatios(originalScale, *delRatios)
+ insCosts = gapCostsFromProbRatios(originalScale, *insRatios)
args = matScores, delCosts, insCosts
scale = balancedScale(scoreImbalance, originalScale, args)
if scale > 0:
@@ -623,7 +624,7 @@ def normalizedFreqs(freqs):
s = sum(x)
return [i / s for i in x]
-def codonScoresAndScale(originalScale, matParams, delRatios, insRatios, junk):
+def codonScoresAndScale(originalScale, matParams, delRatios, insRatios):
matchRatio, matProbs, rowFreqs, colFreqs = matParams
rowProbs = normalizedFreqs(rowFreqs)
colProbs = normalizedFreqs(colFreqs)
@@ -680,7 +681,7 @@ def writeGapCosts(opts, delCosts, insCosts, isLastFormat, outFile):
print("# frameshiftCosts del-1,del-2,ins+1,ins+2:",
frameshiftCosts, file=outFile)
-def probsFromFile(opts, lastalArgs, codonMatches, lines):
+def probsFromFile(opts, lastalArgs, maxGapGrowProb, codonMatches, lines):
print("#", *lastalArgs)
print()
sys.stdout.flush()
@@ -689,7 +690,7 @@ def probsFromFile(opts, lastalArgs, codonMatches, lines):
gapProbs = frameshiftProbsFromCounts(gapCounts, opts)
matProbs = codonMatProbsFromCounts(matCounts, opts)
else:
- gapProbs = gapProbsFromCounts(gapCounts, opts)
+ gapProbs = gapProbsFromCounts(gapCounts, opts, maxGapGrowProb)
matProbs = matProbsFromCounts(matCounts, opts)
return matProbs, gapProbs
@@ -779,6 +780,9 @@ def doTraining(opts, args):
if opts.scale:
outerScale = opts.scale / math.log(2)
+ # minimum possible (integer) gap extend cost = 1
+ maxGapGrowProb = math.exp(-1 / outerScale)
+
innerScale = outerScale * scaleIncrease
oldParameters = []
@@ -789,8 +793,8 @@ def doTraining(opts, args):
if opts.codon:
matProbs, gapProbs = initialCodonProbs(opts)
else:
- matProbs, gapProbs = probsFromFile(opts, lastalArgs, codonMatches,
- proc.stdout)
+ matProbs, gapProbs = probsFromFile(opts, lastalArgs, maxGapGrowProb,
+ codonMatches, proc.stdout)
while True:
matchRatio, delRatios, insRatios = gapRatiosFunc(*gapProbs)
@@ -801,7 +805,7 @@ def doTraining(opts, args):
rowProbs = [freqText(i) for i in rowProbs]
colProbs = [freqText(i) for i in colProbs]
matParams = matchRatio, matProbs, rowProbs, colProbs
- ss = scoresAndScaleFunc(innerScale, matParams, delRatios, insRatios, 1)
+ ss = scoresAndScaleFunc(innerScale, matParams, delRatios, insRatios)
matScores, delCosts, insCosts, scale = ss
writeGapCosts(opts, delCosts, insCosts, False, None)
print()
@@ -827,10 +831,10 @@ def doTraining(opts, args):
proc.stdin.close()
if not opts.codon:
proc = lastSplitProcess(opts, proc)
- matProbs, gapProbs = probsFromFile(opts, lastalArgs, codonMatches,
- proc.stdout)
+ matProbs, gapProbs = probsFromFile(opts, lastalArgs, maxGapGrowProb,
+ codonMatches, proc.stdout)
- ss = scoresAndScaleFunc(outerScale, matParams, delRatios, insRatios, 0)
+ ss = scoresAndScaleFunc(outerScale, matParams, delRatios, insRatios)
matScores, delCosts, insCosts, scale = ss
if opts.X: print("#last -X", opts.X)
if opts.R: print("#last -R", opts.R)
=====================================
doc/last-cookbook.rst
=====================================
@@ -294,22 +294,20 @@ The result so far is asymmetric: each part of the chimp genome is
aligned to at most one part of the human genome, but not vice-versa.
We can get one-to-one alignments like this::
- maf-swap humchi1.maf | last-split > humchi2.maf
+ last-split -r -m1e-5 humchi1.maf | last-postmask > humchi2.maf
-Here, last-split_ gets parts of the humchi1 alignments. It uses the
-humchi1 *per-base* mismap probabilities to get the humchi2
-*per-alignment* mismap probabilities.
+Here, last-split_ gets parts of the humchi1 alignments. The ``-r``
+reverses the roles of the genomes, so it finds a unique best alignment
+for each part of human. It uses the humchi1 *per-base* mismap
+probabilities to get the humchi2 *per-alignment* mismap probabilities.
-Then we can discard less-confident alignments, and convert_ to a
-compact tabular format::
+Here we've also discarded less-confident alignments: ``-m1e-5`` omits
+alignments with `mismap probability`_ > 10^-5, and last-postmask_
+discards alignments caused by simple sequence.
- last-postmask humchi2.maf | maf-convert -n tab | awk -F= '$2 <= 1e-5' > humchi.tab
-
-last-postmask_ discards alignments caused by simple sequence. The
-``awk`` command gets alignments with `mismap probability`_ ≤ 10^-5.
Finally, we can make a dotplot_::
- last-dotplot humchi.tab humchi.png
+ last-dotplot humchi2.maf humchi2.png
**To go faster** with minor accuracy loss: replace ``-uNEAR`` with
``-uRY32`` and/or `mask repeats`_.
@@ -404,7 +402,6 @@ core is indicated by "~" symbols, and it contains exact matches only.
.. _mismap probability:
.. _last-split: doc/last-split.rst
.. _last-train: doc/last-train.rst
-.. _convert:
.. _maf-convert: doc/maf-convert.rst
.. _scoring scheme: doc/last-matrices.rst
.. _seeding scheme:
=====================================
doc/last-split.rst
=====================================
@@ -177,6 +177,11 @@ Options
alignments have "p" lines from ``lastal -j``, in which case
the default is ``MAF+``).
+-r, --reverse
+ Reverse the roles of the 2 sequences in each alignment: use the
+ 1st (top) sequence as the "query" and the 2nd as the
+ "reference".
+
-g, --genome=NAME
Do spliced alignment, and read splice signals (GT, AG, etc)
from the named genome. NAME should be the name of a lastdb
@@ -288,6 +293,7 @@ Limitations
last-split does not support:
+* DNA-versus-protein alignments.
* Generalized affine gap costs.
To do
=====================================
src/Alignment.cc
=====================================
@@ -12,20 +12,6 @@
using namespace cbrc;
-static void addExpectedCounts(double *totalCounts, const ExpectedCount &ec) {
- for( unsigned i = 0; i < scoreMatrixRowSize; ++i ){
- for( unsigned j = 0; j < scoreMatrixRowSize; ++j ){
- *totalCounts += ec.emit[i][j];
- ++totalCounts;
- }
- }
- totalCounts[0] += ec.toMatch;
- totalCounts[1] += ec.delNext + ec.delInit; // deleted letter count
- totalCounts[2] += ec.insNext + ec.insInit; // inserted letter count
- totalCounts[3] += ec.delInit; // deletion open/close count
- totalCounts[4] += ec.insInit; // insertion open/close count
-}
-
static void addSeedCounts(const uchar *seq1, const uchar *seq2, size_t size,
double *counts) {
for (size_t i = 0; i < size; ++i) {
@@ -291,6 +277,15 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
GappedXdropAligner& aligner = centroid.aligner();
GreedyXdropAligner &greedyAligner = aligners.greedyAligner;
+ double *subsCounts[scoreMatrixRowSize];
+ double *tranCounts;
+ if (outputType == 7) {
+ double *ec = &extras.expectedCounts[0];
+ for (int i = 0; i < scoreMatrixRowSize; ++i)
+ subsCounts[i] = ec + i * scoreMatrixRowSize;
+ tranCounts = ec + scoreMatrixRowSize * scoreMatrixRowSize;
+ }
+
if( frameSize ){
assert( !isGreedy );
assert( !globality );
@@ -323,14 +318,7 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
if (outputType < 4) return;
fxa.backward(isForward, probMat, gap);
getColumnCodes(fxa, columnCodes, chunks);
- if (outputType == 7) {
- double *ec = &extras.expectedCounts[0];
- double *subsCounts[scoreMatrixRowSize];
- for (int i = 0; i < scoreMatrixRowSize; ++i)
- subsCounts[i] = ec + i * scoreMatrixRowSize;
- double *tranCounts = ec + scoreMatrixRowSize * scoreMatrixRowSize;
- fxa.count(isForward, gap, subsCounts, tranCounts);
- }
+ if (outputType == 7) fxa.count(isForward, gap, subsCounts, tranCounts);
} else {
assert(!isFullScore);
assert(outputType < 4);
@@ -429,17 +417,15 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
extras.fullScore += s / scale;
}
if (outputType < 4) return;
- centroid.backward(seq1, seq2, start2, isForward, probMat, gap, globality);
+ centroid.backward(isForward, probMat, gap, globality);
if (outputType > 4 && outputType < 7) { // gamma-centroid / LAMA alignment
centroid.dp(outputType, gamma);
centroid.traceback(chunks, outputType, gamma);
}
getColumnCodes(centroid, columnCodes, chunks, isForward);
if (outputType == 7) {
- ExpectedCount ec;
- centroid.computeExpectedCounts(seq1, seq2, start2, isForward,
- probMat, gap, alph.size, ec);
- addExpectedCounts(&extras.expectedCounts[0], ec);
+ centroid.addExpectedCounts(start2, isForward, probMat, gap, alph.size,
+ subsCounts, tranCounts);
}
}
}
=====================================
src/Centroid.cc
=====================================
@@ -5,7 +5,6 @@
#include "GappedXdropAlignerInl.hh"
#include <algorithm>
-#include <cassert>
#include <cmath> // for exp
#include <cfloat> // for DBL_MAX
@@ -19,19 +18,6 @@ namespace{
namespace cbrc{
- ExpectedCount::ExpectedCount ()
- {
- double d0 = 0;
- toMatch = d0;
- delInit = d0;
- delNext = d0;
- insInit = d0;
- insNext = d0;
-
- for (int n=0; n<scoreMatrixRowSize; n++)
- for (int m=0; m<scoreMatrixRowSize; m++) emit[n][m] = d0;
- }
-
void Centroid::setPssm( const ScoreMatrixRow* pssm, size_t qsize, double T,
const OneQualityExpMatrix& oqem,
const uchar* sequenceBeg, const uchar* qualityBeg ) {
@@ -83,255 +69,239 @@ namespace cbrc{
}
}
- void Centroid::initForwardMatrix(){
- size_t n = xa.scoreEndIndex( numAntidiagonals );
- if ( fM.size() < n ) {
- fM.resize( n );
- fD.resize( n );
- fI.resize( n );
- }
- fM[xdropPadLen-1] = 1;
- }
-
- void Centroid::initBackwardMatrix(){
- size_t n = xa.scoreEndIndex( numAntidiagonals );
- bM.assign( n, 0.0 );
- bD.assign( n, 0.0 );
- bI.assign( n, 0.0 );
- }
-
double Centroid::forward(const uchar *seq1, const uchar *seq2,
size_t start2, bool isExtendFwd,
const const_dbl_ptr *substitutionProbs,
const GapCosts &gapCosts, int globality) {
- const ExpMatrixRow *pssm = pssmExp.empty() ? 0 : pssmExp2 + start2;
+ seq1ptr = seq1;
+ seq2ptr = seq2;
+ pssmPtr = pssmExp.empty() ? 0 : pssmExp2 + start2;
const int seqIncrement = isExtendFwd ? 1 : -1;
- numAntidiagonals = xa.numAntidiagonals();
- scale.assign(numAntidiagonals + 2, 1.0);
- initForwardMatrix();
const double delInit = gapCosts.delProbPieces[0].openProb;
const double delNext = gapCosts.delProbPieces[0].growProb;
const double insInit = gapCosts.insProbPieces[0].openProb;
const double insNext = gapCosts.insProbPieces[0].growProb;
- double Z = 0.0; // partion function of forward values
-
- for( size_t k = 0; k < numAntidiagonals; ++k ){ // loop over antidiagonals
- const size_t seq1beg = xa.seq1start( k );
- const size_t seq2pos = k - seq1beg;
- const double scale2 = scale[k];
- const double scale1 = scale[k+1];
- double sum_f = 0.0; // sum of forward values
-
- const double scaledDelNext = scale1 * delNext;
- const double scaledInsNext = scale1 * insNext;
-
- const size_t scoreEnd = xa.scoreEndIndex( k );
- double* fM0 = &fM[ scoreEnd ];
- double* fD0 = &fD[ scoreEnd ];
- double* fI0 = &fI[ scoreEnd ];
+ initForward();
- const size_t horiBeg = xa.hori( k, seq1beg );
- const size_t vertBeg = xa.vert( k, seq1beg );
- const size_t diagBeg = xa.diag( k, seq1beg );
- const double* fD1 = &fD[ horiBeg ];
- const double* fI1 = &fI[ vertBeg ];
- const double* fM2 = &fM[ diagBeg ];
+ size_t antidiagonal = 0;
+ size_t seq1beg = 0;
+ size_t diagPos = xdropPadLen - 1;
+ size_t horiPos = xdropPadLen * 2 - 1;
+ size_t thisPos = xdropPadLen * 2;
- const double* fM0last = fM0 + xa.numCellsAndPads( k ) - 1;
-
- const uchar* s1 = isExtendFwd ? seq1 + seq1beg : seq1 - seq1beg;
+ double sumOfEdgeProbRatios = 0;
+ double sumOfProbRatios = 0;
+ double logSumOfProbRatios = 0;
+ while (1) {
+ double *fM0 = &fM[thisPos];
+ double *fD0 = &fD[thisPos];
+ double *fI0 = &fI[thisPos];
for (int i = 0; i < xdropPadLen; ++i) {
*fM0++ = *fD0++ = *fI0++ = 0.0;
}
-
- if (!pssm) {
- const uchar* s2 = isExtendFwd ? seq2 + seq2pos : seq2 - seq2pos;
-
- while (1) {
- const unsigned letter1 = *s1;
- const unsigned letter2 = *s2;
- const double matchProb = substitutionProbs[letter1][letter2];
-
- const double xM = *fM2;
- const double xD = *fD1;
- const double xI = *fI1;
- const double xSum = (xM * scale2 + xD + xI) * scale1;
-
- *fD0 = xSum * delInit + xD * scaledDelNext;
- *fI0 = xSum * insInit + xI * scaledInsNext;
- *fM0 = xSum * matchProb;
- sum_f += xSum;
- if (globality && matchProb <= 0) Z += xSum; // xxx
-
- if (fM0 == fM0last) break;
- fM0++; fD0++; fI0++;
- fM2++; fD1++; fI1++;
+ thisPos += xdropPadLen;
+
+ const double *fD1 = &fD[horiPos];
+ const double *fI1 = &fI[horiPos + 1];
+ const double *fM2 = &fM[diagPos];
+
+ ++antidiagonal;
+ const size_t nextPos = xa.scoreEndIndex(antidiagonal);
+ const int numCells = nextPos - thisPos;
+ const uchar *s1 = seq1ptr;
+
+ if (!pssmPtr) {
+ const uchar *s2 = seq2ptr;
+ for (int i = 0; i < numCells; ++i) {
+ const double matchProb = substitutionProbs[*s1][*s2];
+ const double xD = fD1[i];
+ const double xI = fI1[i];
+ const double xSum = fM2[i] + xD + xI;
+ fD0[i] = xSum * delInit + xD * delNext;
+ fI0[i] = xSum * insInit + xI * insNext;
+ fM0[i] = xSum * matchProb;
+ sumOfProbRatios += xSum;
s1 += seqIncrement;
s2 -= seqIncrement;
}
} else {
- const ExpMatrixRow* p2 = isExtendFwd ? pssm + seq2pos : pssm - seq2pos;
-
- while (1) {
- const unsigned letter1 = *s1;
- const double *matchProbs = *p2;
- const double matchProb = matchProbs[letter1];
-
- const double xM = *fM2;
- const double xD = *fD1;
- const double xI = *fI1;
- const double xSum = (xM * scale2 + xD + xI) * scale1;
-
- *fD0 = xSum * delInit + xD * scaledDelNext;
- *fI0 = xSum * insInit + xI * scaledInsNext;
- *fM0 = xSum * matchProb;
- sum_f += xSum;
- if (globality && matchProb <= 0) Z += xSum; // xxx
-
- if (fM0 == fM0last) break;
- fM0++; fD0++; fI0++;
- fM2++; fD1++; fI1++;
+ const ExpMatrixRow *p2 = pssmPtr;
+ for (int i = 0; i < numCells; ++i) {
+ const double matchProb = (*p2)[*s1];
+ const double xD = fD1[i];
+ const double xI = fI1[i];
+ const double xSum = fM2[i] + xD + xI;
+ fD0[i] = xSum * delInit + xD * delNext;
+ fI0[i] = xSum * insInit + xI * insNext;
+ fM0[i] = xSum * matchProb;
+ sumOfProbRatios += xSum;
s1 += seqIncrement;
p2 -= seqIncrement;
}
}
- if( !globality ) Z += sum_f;
- scale[k+2] = 1.0 / (sum_f + 1.0); // seems ugly
- Z *= scale[k+2]; // scaling
- } // k
+ if (globality) {
+ const int n = numCells - 1;
+ if (substitutionProbs[0][*seq2ptr] <= 0) {
+ sumOfEdgeProbRatios += fM2[0] + fD1[0] + fI1[0];
+ }
+ if (n > 0 && substitutionProbs[*(s1 - seqIncrement)][0] <= 0) {
+ sumOfEdgeProbRatios += fM2[n] + fD1[n] + fI1[n];
+ }
+ }
+
+ if (antidiagonal == numAntidiagonals) break;
- //std::cout << "# Z=" << Z << std::endl;
- assert( Z > 0.0 );
- scale[ numAntidiagonals + 1 ] /= Z; // this causes scaled Z to equal 1
- return logPartitionFunction();
+ diagPos = horiPos;
+ horiPos = thisPos - 1;
+ thisPos = nextPos;
+
+ const size_t newSeq1beg = xa.seq1start(antidiagonal);
+ if (newSeq1beg > seq1beg) {
+ seq1beg = newSeq1beg;
+ seq1ptr += seqIncrement;
+ ++diagPos;
+ ++horiPos;
+ } else {
+ seq2ptr += seqIncrement;
+ if (pssmPtr) pssmPtr += seqIncrement;
+ }
+
+ if (antidiagonal % rescaleStep == 0) {
+ const double scale = 1 / sumOfProbRatios;
+ rescales[antidiagonal / rescaleStep - 1] = scale;
+ rescaleFwdProbs(xa.scoreEndIndex(antidiagonal - 2), thisPos, scale);
+ logSumOfProbRatios += log(sumOfProbRatios);
+ sumOfEdgeProbRatios *= scale;
+ sumOfProbRatios = 1;
+ }
+ }
+
+ if (globality) {
+ assert(sumOfEdgeProbRatios > 0);
+ sumOfProbRatios = sumOfEdgeProbRatios;
+ }
+ rescaledSumOfProbRatios = sumOfProbRatios;
+ return logSumOfProbRatios + log(sumOfProbRatios);
}
// added by M. Hamada
// compute posterior probabilities while executing backward algorithm
- void Centroid::backward(const uchar* seq1, const uchar* seq2,
- size_t start2, bool isExtendFwd,
+ void Centroid::backward(bool isExtendFwd,
const const_dbl_ptr *substitutionProbs,
const GapCosts& gapCosts, int globality) {
- const ExpMatrixRow *pssm = pssmExp.empty() ? 0 : pssmExp2 + start2;
const int seqIncrement = isExtendFwd ? 1 : -1;
- mD.assign(numAntidiagonals + 2, 0.0);
- mI.assign(numAntidiagonals + 2, 0.0);
- initBackwardMatrix();
const double delInit = gapCosts.delProbPieces[0].openProb;
const double delNext = gapCosts.delProbPieces[0].growProb;
const double insInit = gapCosts.insProbPieces[0].openProb;
const double insNext = gapCosts.insProbPieces[0].growProb;
- double scaledUnit = 1.0;
-
- for( size_t k = numAntidiagonals; k-- > 0; ){
- const size_t seq1beg = xa.seq1start( k );
- const size_t seq2pos = k - seq1beg;
- const double scale2 = scale[k];
- const double scale1 = scale[k+1];
- scaledUnit *= scale[k+2];
-
- const double scaledDelNext = scale1 * delNext;
- const double scaledInsNext = scale1 * insNext;
-
- const size_t scoreEnd = xa.scoreEndIndex( k );
- const double* bM0 = &bM[ scoreEnd + xdropPadLen ];
- const double* bD0 = &bD[ scoreEnd + xdropPadLen ];
- const double* bI0 = &bI[ scoreEnd + xdropPadLen ];
-
- const size_t horiBeg = xa.hori( k, seq1beg );
- const size_t vertBeg = xa.vert( k, seq1beg );
- const size_t diagBeg = xa.diag( k, seq1beg );
- double* bD1 = &bD[ horiBeg ];
- double* bI1 = &bI[ vertBeg ];
- double* bM2 = &bM[ diagBeg ];
-
- const double* fD1 = &fD[ horiBeg ];
- const double* fI1 = &fI[ vertBeg ];
-
- const double* bM0last = bM0 + xa.numCellsAndPads( k ) - xdropPadLen - 1;
-
- double* mDout = &mD[ seq1beg ];
- double* mIout = &mI[ seq2pos ];
-
- const uchar *s1 = isExtendFwd ? seq1 + seq1beg : seq1 - seq1beg;
-
- if (!pssm) {
- const uchar *s2 = isExtendFwd ? seq2 + seq2pos : seq2 - seq2pos;
-
- while (1) {
- const unsigned letter1 = *s1;
- const unsigned letter2 = *s2;
- const double matchProb = substitutionProbs[letter1][letter2];
-
- const double yM = *bM0;
- const double yD = *bD0;
- const double yI = *bI0;
-
+ size_t antidiagonal = numAntidiagonals - 1;
+ size_t seq1beg = xa.seq1start(antidiagonal);
+ size_t oldPos = xa.scoreEndIndex(numAntidiagonals);
+ initBackward(oldPos);
+ double scaledUnit = 1 / rescaledSumOfProbRatios;
+
+ while (1) {
+ const size_t newPos = xa.scoreEndIndex(antidiagonal);
+ const double *bM0 = &bM[newPos + xdropPadLen];
+ const double *bD0 = &bD[newPos + xdropPadLen];
+ const double *bI0 = &bI[newPos + xdropPadLen];
+
+ const double *fD0 = &fD[newPos + xdropPadLen];
+ const double *fI0 = &fI[newPos + xdropPadLen];
+
+ const size_t vertPos = xa.vert(antidiagonal, seq1beg);
+ const size_t diagPos = xa.diag(antidiagonal, seq1beg);
+ double *bD1 = &bD[vertPos - 1];
+ double *bI1 = &bI[vertPos];
+ double *bM2 = &bM[diagPos];
+
+ const size_t seq2pos = antidiagonal - seq1beg;
+ double *mDout = &mD[seq1beg + 1];
+ double *mIout = &mI[seq2pos + 1];
+
+ const int numCells = oldPos - newPos - xdropPadLen;
+ const uchar *s1 = seq1ptr;
+
+ // !!! careful: values written into pad cells may be wrong
+ // !!! (overwrite each other, wrong scaling)
+
+ if (!pssmPtr) {
+ const uchar *s2 = seq2ptr;
+
+ for (int i = 0; i < numCells; ++i) {
+ const double matchProb = substitutionProbs[*s1][*s2];
+ const double yM = bM0[i];
+ const double yD = bD0[i];
+ const double yI = bI0[i];
double ySum = yM * matchProb + yD * delInit + yI * insInit;
if (!globality || matchProb <= 0) ySum += scaledUnit;
// xxx matchProb should be 0 only at delimiters, but will be
// 0 for non-delimiters with severe mismatch scores
- ySum *= scale1;
-
- *bM2 = ySum * scale2;
- *bD1 = ySum + yD * scaledDelNext;
- *bI1 = ySum + yI * scaledInsNext;
+ bM2[i] = ySum;
+ bD1[i] = ySum + yD * delNext;
+ bI1[i] = ySum + yI * insNext;
- *mDout += (*fD1) * (*bD1);
- *mIout += (*fI1) * (*bI1);
+ *mDout += fD0[i] * yD;
+ *mIout += fI0[i] * yI;
- if (bM0 == bM0last) break;
mDout++; mIout--;
- bM2++; bD1++; bI1++;
- bM0++; bD0++; bI0++;
- fD1++; fI1++;
s1 += seqIncrement;
s2 -= seqIncrement;
}
} else {
- const ExpMatrixRow *p2 = isExtendFwd ? pssm + seq2pos : pssm - seq2pos;
-
- while (1) {
- const unsigned letter1 = *s1;
- const double matchProb = (*p2)[letter1];
-
- const double yM = *bM0;
- const double yD = *bD0;
- const double yI = *bI0;
+ const ExpMatrixRow *p2 = pssmPtr;
+ for (int i = 0; i < numCells; ++i) {
+ const double matchProb = (*p2)[*s1];
+ const double yM = bM0[i];
+ const double yD = bD0[i];
+ const double yI = bI0[i];
double ySum = yM * matchProb + yD * delInit + yI * insInit;
if (!globality || matchProb <= 0) ySum += scaledUnit; // xxx
- ySum *= scale1;
-
- *bM2 = ySum * scale2;
- *bD1 = ySum + yD * scaledDelNext;
- *bI1 = ySum + yI * scaledInsNext;
+ bM2[i] = ySum;
+ bD1[i] = ySum + yD * delNext;
+ bI1[i] = ySum + yI * insNext;
- *mDout += (*fD1) * (*bD1);
- *mIout += (*fI1) * (*bI1);
+ *mDout += fD0[i] * yD;
+ *mIout += fI0[i] * yI;
- if (bM0 == bM0last) break;
mDout++; mIout--;
- bM2++; bD1++; bI1++;
- bM0++; bD0++; bI0++;
- fD1++; fI1++;
s1 += seqIncrement;
p2 -= seqIncrement;
}
}
- }
- //std::cout << "# bM[0]=" << bM[0] << std::endl;
+ if (antidiagonal == 0) break;
+
+ oldPos = newPos;
+
+ if ((antidiagonal + 2) % rescaleStep == 0 &&
+ antidiagonal + 2 < numAntidiagonals) {
+ const double scale = rescales[antidiagonal / rescaleStep];
+ rescaleBckProbs(diagPos, newPos, scale);
+ scaledUnit *= scale;
+ }
+
+ --antidiagonal;
+ const size_t newSeq1beg = xa.seq1start(antidiagonal);
+ if (newSeq1beg < seq1beg) {
+ seq1beg = newSeq1beg;
+ seq1ptr -= seqIncrement;
+ } else {
+ seq2ptr -= seqIncrement;
+ if (pssmPtr) pssmPtr -= seqIncrement;
+ }
+ }
}
double Centroid::dp_centroid( double gamma ){
@@ -510,14 +480,6 @@ namespace cbrc{
ambiguityCodes.push_back(asciiProbability(mI[i]));
}
- double Centroid::logPartitionFunction() const{
- double x = 0.0;
- for( size_t k = 0; k < numAntidiagonals; ++k ){
- x -= std::log( scale[k+2] );
- }
- return x;
- }
-
static void countUncertainLetters(double *counts, double alignProb,
unsigned alphabetSize,
const double *probRatios,
@@ -537,14 +499,12 @@ namespace cbrc{
}
}
- void Centroid::computeExpectedCounts(const uchar *seq1, const uchar *seq2,
- size_t start2, bool isExtendFwd,
- const const_dbl_ptr *substitutionProbs,
- const GapCosts &gapCosts,
- unsigned alphabetSize,
- ExpectedCount &c) const {
- const ExpMatrixRow *pssm = pssmExp.empty() ? 0 : pssmExp2 + start2;
-
+ void Centroid::addExpectedCounts(size_t start2, bool isExtendFwd,
+ const const_dbl_ptr *substitutionProbs,
+ const GapCosts &gapCosts,
+ unsigned alphabetSize,
+ const dbl_ptr *substitutionCounts,
+ double *transitionCounts) {
const double *letterProbs = 0;
if (!letterProbsPerPosition.empty()) {
letterProbs = &letterProbsPerPosition[0] + start2 * alphabetSize;
@@ -554,114 +514,98 @@ namespace cbrc{
int alphabetSizeIncrement = alphabetSize;
if (!isExtendFwd) alphabetSizeIncrement *= -1;
- double delInitCount = 0; double delNextCount = 0;
- double insInitCount = 0; double insNextCount = 0;
+ size_t antidiagonal = 0;
+ size_t seq1beg = 0;
+ size_t vertPos = xdropPadLen * 2;
+ size_t thisPos = xdropPadLen * 3;
- for( size_t k = 0; k < numAntidiagonals; ++k ){ // loop over antidiagonals
- const size_t seq1beg = xa.seq1start( k );
- const size_t seq2pos = k - seq1beg;
- const double scale2 = scale[k];
- const double scale1 = scale[k+1];
+ double alignedLetterPairCount = 0;
+ double delNextCount = 0;
+ double insNextCount = 0;
- const size_t scoreEnd = xa.scoreEndIndex( k );
- const double* bM0 = &bM[ scoreEnd + xdropPadLen ];
- const double* bD0 = &bD[ scoreEnd + xdropPadLen ];
- const double* bI0 = &bI[ scoreEnd + xdropPadLen ];
-
- const size_t horiBeg = xa.hori( k, seq1beg );
- const size_t vertBeg = xa.vert( k, seq1beg );
- const size_t diagBeg = xa.diag( k, seq1beg );
- const double* fD1 = &fD[ horiBeg ];
- const double* fI1 = &fI[ vertBeg ];
- const double* fM2 = &fM[ diagBeg ];
+ while (1) {
+ const double *bM0 = &bM[thisPos];
+ const double *bD0 = &bD[thisPos];
+ const double *bI0 = &bI[thisPos];
+ const double *fM0 = &fM[thisPos];
+ const double *fD1 = &fD[vertPos - 1];
+ const double *fI1 = &fI[vertPos];
double dNextCount = 0;
double iNextCount = 0;
- const double* bM0last = bM0 + xa.numCellsAndPads( k ) - xdropPadLen - 1;
-
- const uchar* s1 = isExtendFwd ? seq1 + seq1beg : seq1 - seq1beg;
-
- if (!pssm) {
- const uchar* s2 = isExtendFwd ? seq2 + seq2pos : seq2 - seq2pos;
-
- while (1) {
- const unsigned letter1 = *s1;
- const unsigned letter2 = *s2;
- const double matchProb = substitutionProbs[letter1][letter2];
-
- const double yM = *bM0;
- const double yD = *bD0;
- const double yI = *bI0;
-
- const double xM = *fM2;
- const double xD = *fD1;
- const double xI = *fI1;
- const double xSum = (xM * scale2 + xD + xI) * scale1;
-
- const double alignProb = xSum * yM * matchProb;
- c.emit[letter1][letter2] += alignProb;
- c.toMatch += alignProb;
- delInitCount += xSum * yD;
- dNextCount += xD * yD;
- insInitCount += xSum * yI;
- iNextCount += xI * yI;
-
- if (bM0 == bM0last) break;
- fM2++; fD1++; fI1++;
- bM0++; bD0++; bI0++;
+ ++antidiagonal;
+ const size_t nextPos = xa.scoreEndIndex(antidiagonal);
+ const int numCells = nextPos - thisPos;
+ const uchar *s1 = seq1ptr;
+
+ if (!letterProbs) {
+ const uchar *s2 = seq2ptr;
+ for (int i = 0; i < numCells; ++i) {
+ const double alignProb = fM0[i] * bM0[i];
+ substitutionCounts[*s1][*s2] += alignProb;
+ alignedLetterPairCount += alignProb;
+ dNextCount += fD1[i] * bD0[i];
+ iNextCount += fI1[i] * bI0[i];
s1 += seqIncrement;
s2 -= seqIncrement;
}
} else {
- const ExpMatrixRow* p2 = isExtendFwd ? pssm + seq2pos : pssm - seq2pos;
- const size_t a2 = seq2pos * alphabetSize;
- const double* lp2 = isExtendFwd ? letterProbs + a2 : letterProbs - a2;
-
- while (1) { // inner most loop
+ const double *lp2 = letterProbs;
+ for (int i = 0; i < numCells; ++i) {
+ const double alignProb = fM0[i] * bM0[i];
const unsigned letter1 = *s1;
- const double matchProb = (*p2)[letter1];
-
- const double yM = *bM0;
- const double yD = *bD0;
- const double yI = *bI0;
-
- const double xM = *fM2;
- const double xD = *fD1;
- const double xI = *fI1;
- const double xSum = (xM * scale2 + xD + xI) * scale1;
-
- const double alignProb = xSum * yM * matchProb;
- countUncertainLetters(c.emit[letter1], alignProb,
+ countUncertainLetters(substitutionCounts[letter1], alignProb,
alphabetSize, substitutionProbs[letter1], lp2);
- c.toMatch += alignProb;
- delInitCount += xSum * yD;
- dNextCount += xD * yD;
- insInitCount += xSum * yI;
- iNextCount += xI * yI;
-
- if (bM0 == bM0last) break;
- fM2++; fD1++; fI1++;
- bM0++; bD0++; bI0++;
+ alignedLetterPairCount += alignProb;
+ dNextCount += fD1[i] * bD0[i];
+ iNextCount += fI1[i] * bI0[i];
s1 += seqIncrement;
- p2 -= seqIncrement;
lp2 -= alphabetSizeIncrement;
}
}
- delNextCount += dNextCount * scale1;
- insNextCount += iNextCount * scale1;
+ if ((antidiagonal + 1) % rescaleStep == 0 &&
+ antidiagonal + 1 < numAntidiagonals) {
+ const double mul = rescales[antidiagonal / rescaleStep];
+ dNextCount *= mul;
+ iNextCount *= mul;
+ }
+
+ delNextCount += dNextCount;
+ insNextCount += iNextCount;
+
+ if (antidiagonal == numAntidiagonals) break;
+
+ vertPos = thisPos;
+ thisPos = nextPos + xdropPadLen;
+
+ const size_t newSeq1beg = xa.seq1start(antidiagonal);
+ if (newSeq1beg > seq1beg) {
+ seq1beg = newSeq1beg;
+ seq1ptr += seqIncrement;
+ ++vertPos;
+ } else {
+ seq2ptr += seqIncrement;
+ if (letterProbs) letterProbs += alphabetSizeIncrement;
+ }
}
- const double delInit = gapCosts.delProbPieces[0].openProb;
- const double delNext = gapCosts.delProbPieces[0].growProb;
- const double insInit = gapCosts.insProbPieces[0].openProb;
- const double insNext = gapCosts.insProbPieces[0].growProb;
+ double delCount = 0;
+ double insCount = 0;
+ for (size_t i = 0; i < numAntidiagonals + 2; ++i) {
+ delCount += mD[i];
+ insCount += mI[i];
+ }
+
+ delNextCount *= gapCosts.delProbPieces[0].growProb;
+ insNextCount *= gapCosts.insProbPieces[0].growProb;
- c.delInit += delInitCount * delInit;
- c.delNext += delNextCount * delNext;
- c.insInit += insInitCount * insInit;
- c.insNext += insNextCount * insNext;
+ transitionCounts[0] += alignedLetterPairCount;
+ transitionCounts[1] += delCount; // deleted letter count
+ transitionCounts[2] += insCount; // inserted letter count
+ transitionCounts[3] += delCount - delNextCount; // delete open/close count
+ transitionCounts[4] += insCount - insNextCount; // insert open/close count
}
} // end namespace cbrc
=====================================
src/Centroid.hh
=====================================
@@ -18,24 +18,15 @@
namespace cbrc{
- struct ExpectedCount{
- public:
- double emit[scoreMatrixRowSize][scoreMatrixRowSize];
- double toMatch;
- double delInit;
- double delNext;
- double insInit;
- double insNext;
- public:
- ExpectedCount ();
- };
-
/**
* (1) Forward and backward algorithm on the DP region given by Xdrop algorithm
* (2) \gamma-centroid decoding
*/
class Centroid{
typedef const double *const_dbl_ptr;
+ typedef double *dbl_ptr;
+
+ enum { rescaleStep = 16 };
public:
GappedXdropAligner& aligner() { return xa; }
@@ -63,8 +54,7 @@ namespace cbrc{
const const_dbl_ptr *substitutionProbs,
const GapCosts &gapCosts, int globality);
- void backward(const uchar *seq1, const uchar *seq2,
- size_t start2, bool isExtendFwd,
+ void backward(bool isExtendFwd,
const const_dbl_ptr *substitutionProbs,
const GapCosts &gapCosts, int globality);
@@ -101,11 +91,11 @@ namespace cbrc{
size_t seq2end, size_t seq2beg) const;
// Added by MH (2008/10/10) : compute expected counts for transitions and emissions
- void computeExpectedCounts(const uchar* seq1, const uchar* seq2,
- size_t start2, bool isExtendFwd,
- const const_dbl_ptr *substitutionProbs,
- const GapCosts& gapCosts, unsigned alphabetSize,
- ExpectedCount& count) const;
+ void addExpectedCounts(size_t start2, bool isExtendFwd,
+ const const_dbl_ptr *substitutionProbs,
+ const GapCosts &gapCosts, unsigned alphabetSize,
+ const dbl_ptr *substitutionCounts,
+ double *transitionCounts);
private:
typedef double ExpMatrixRow[scoreMatrixRowSize];
@@ -135,16 +125,58 @@ namespace cbrc{
dvec_t X; // DP tables for $gamma$-decoding
- dvec_t scale; // scale[n] is a scaling factor for the n-th anti-diagonal
+ dvec_t rescales;
+
+ double rescaledSumOfProbRatios;
+
+ const uchar *seq1ptr;
+ const uchar *seq2ptr;
+ const ExpMatrixRow *pssmPtr;
double bestScore;
size_t bestAntiDiagonal;
size_t bestPos1;
- void initForwardMatrix();
- void initBackwardMatrix();
+ void initForward() {
+ numAntidiagonals = xa.numAntidiagonals();
+ assert(numAntidiagonals > 0);
- double logPartitionFunction() const; // a.k.a. full score, forward score
+ size_t totalNumOfCells = xa.scoreEndIndex(numAntidiagonals);
+ if (fM.size() < totalNumOfCells) {
+ fM.resize(totalNumOfCells);
+ fD.resize(totalNumOfCells);
+ fI.resize(totalNumOfCells);
+ fM[xdropPadLen - 1] = 1;
+ }
+
+ size_t numOfRescales = (numAntidiagonals - 1) / rescaleStep;
+ if (rescales.size() < numOfRescales) rescales.resize(numOfRescales);
+ }
+
+ void initBackward(size_t totalNumOfCells) {
+ bM.assign(totalNumOfCells, 0.0);
+ bD.assign(totalNumOfCells, 0.0);
+ bI.assign(totalNumOfCells, 0.0);
+
+ mD.assign(numAntidiagonals + 2, 0.0);
+ mI.assign(numAntidiagonals + 2, 0.0);
+ }
+
+ void rescaleFwdProbs(size_t beg, size_t end, double scale) {
+ for (size_t i = beg; i < end; ++i) {
+ fM[i] *= scale;
+ fD[i] *= scale;
+ fI[i] *= scale;
+ }
+ }
+
+ void rescaleBckProbs(size_t beg, size_t end, double scale) {
+ for (size_t i = beg; i < end; ++i) {
+ bM[i] *= scale;
+ bD[i] *= scale;
+ bI[i] *= scale;
+ }
+ }
void updateScore(double score, size_t antiDiagonal, size_t cur) {
if (bestScore < score) {
=====================================
src/LastalArguments.cc
=====================================
@@ -619,8 +619,10 @@ void LastalArguments::writeCommented( std::ostream& stream ) const{
stream << " B="; writeIntList(stream, insGrowCosts);
if( gapPairCost > 0 )
stream << " c=" << gapPairCost;
- if( isTranslated() )
- stream << " F="; writeIntList(stream, frameshiftCosts);
+ if (isTranslated()) {
+ stream << " F=";
+ writeIntList(stream, frameshiftCosts);
+ }
stream << " e=" << minScoreGapped;
stream << " d=" << minScoreGapless;
stream << " x=" << maxDropGapped;
=====================================
src/makefile
=====================================
@@ -143,7 +143,7 @@ ScoreMatrixData.hh: ../data/*.mat
../build/mat-inc.sh ../data/*.mat > $@
VERSION1 = git describe --dirty
-VERSION2 = echo ' (HEAD -> main, tag: 1219) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
+VERSION2 = echo ' (HEAD -> main, tag: 1243, refs/keep-around/3243658001e5a2afe5329f23aab272c355465293) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
VERSION = \"`test -e ../.git && $(VERSION1) || $(VERSION2)`\"
=====================================
src/split/cbrc_unsplit_alignment.cc
=====================================
@@ -142,12 +142,13 @@ void flipMafStrands(StringIt linesBeg, StringIt linesEnd) {
}
}
-static void canonicalizeMafStrands(StringIt linesBeg, StringIt linesEnd) {
+static void canonicalizeMafStrands(StringIt linesBeg, StringIt linesEnd,
+ unsigned rankOfQrySeq) {
unsigned s = 0;
for (StringIt i = linesBeg; i < linesEnd; ++i) {
const char *c = i->c_str();
if (*c == 's') ++s;
- if (s == 2) {
+ if (s == rankOfQrySeq) {
char strand;
c = skipWord(c);
c = skipWord(c);
@@ -162,8 +163,11 @@ static void canonicalizeMafStrands(StringIt linesBeg, StringIt linesEnd) {
err("bad MAF data");
}
-void UnsplitAlignment::init() {
- canonicalizeMafStrands(linesBeg, linesEnd);
+void UnsplitAlignment::init(bool isTopSeqQuery) {
+ const unsigned rankOfQrySeq = 2 - isTopSeqQuery;
+ const unsigned rankOfRefSeq = isTopSeqQuery + 1;
+
+ canonicalizeMafStrands(linesBeg, linesEnd, rankOfQrySeq);
qQual = 0; // in case the input lacks sequence quality data
@@ -189,13 +193,13 @@ void UnsplitAlignment::init() {
if (!f || f >= g) err("bad MAF line: " + *i);
(*i)[e - c] = 0; // write a terminator for the sequence name
if (g < lineEnd) (*i)[g - c] = 0; // trim trailing whitespace
- if (s == 1) {
+ if (s == rankOfRefSeq) {
rstart = start;
rend = start + len;
qstrand = (strand == '-') * 2;
rname = i->c_str() + (d - c);
ralign = i->c_str() + (f - c);
- } else if (s == 2) {
+ } else if (s == rankOfQrySeq) {
qstart = start;
qend = start + len;
if (strand == '-') qstrand = 3 - qstrand;
@@ -203,9 +207,9 @@ void UnsplitAlignment::init() {
qalign = i->c_str() + (f - c);
}
} else if (*c == 'q') {
- if (s == 1)
+ if (s == rankOfRefSeq)
err("I can't handle quality data for the genomic sequence");
- if (s == 2) {
+ if (s == rankOfQrySeq) {
f = skipSpace(skipWord(skipWord(c)));
if (!f || f >= g) err("bad MAF line: " + *i);
if (g < lineEnd) (*i)[g - c] = 0; // trim trailing whitespace
=====================================
src/split/cbrc_unsplit_alignment.hh
=====================================
@@ -31,9 +31,10 @@ public:
const char *qalign;
const char *qQual;
UnsplitAlignment(){}
- UnsplitAlignment(StringIt linesBegIn, StringIt linesEndIn)
- : linesBeg(linesBegIn), linesEnd(linesEndIn) { init(); }
- void init();
+ UnsplitAlignment(StringIt linesBegIn,
+ StringIt linesEndIn, bool isTopSeqQuery)
+ : linesBeg(linesBegIn), linesEnd(linesEndIn) { init(isTopSeqQuery); }
+ void init(bool isTopSeqQuery);
bool isForwardStrand() const { return qstrand < 2; }
bool isFlipped() const { return qstrand % 2; }
};
=====================================
src/split/last-split-main.cc
=====================================
@@ -36,6 +36,7 @@ static void run(int argc, char* argv[]) {
LastSplitOptions opts;
opts.format = 0;
+ opts.isTopSeqQuery = false;
opts.direction = 1;
opts.cis = 0.004;
opts.trans = 1e-05;
@@ -62,6 +63,7 @@ come from different parts of the genome.\n\
Options:\n\
-h, --help show this help message and exit\n\
-f, --format=FMT output format: MAF, MAF+ (default: depends on input)\n\
+ -r, --reverse reverse the roles of the 2 sequences in each alignment\n\
-g, --genome=NAME lastdb genome name\n\
-d, --direction=D RNA direction: 0=reverse, 1=forward, 2=mixed (default="
+ cbrc::stringify(opts.direction) + ")\n\
@@ -82,11 +84,12 @@ Options:\n\
-V, --version show version information and exit\n\
";
- const char sOpts[] = "hf:g:d:c:t:M:S:m:s:nb:vV";
+ const char sOpts[] = "hf:rg:d:c:t:M:S:m:s:nb:vV";
static struct option lOpts[] = {
{ "help", no_argument, 0, 'h' },
{ "format", required_argument, 0, 'f' },
+ { "reverse", no_argument, 0, 'r' },
{ "genome", required_argument, 0, 'g' },
{ "direction",required_argument, 0, 'd' },
{ "cis", required_argument, 0, 'c' },
@@ -111,6 +114,9 @@ Options:\n\
case 'f':
opts.format = parseOutputFormat(optarg);
break;
+ case 'r':
+ opts.isTopSeqQuery = true;
+ break;
case 'g':
opts.isSplicedAlignment = true;
opts.genome = optarg;
=====================================
src/split/last-split.cc
=====================================
@@ -51,6 +51,19 @@ static int scoreFromProb(double prob, double scale) {
return std::floor(scale * std::log(prob) + 0.5);
}
+static void transpose(std::vector< std::vector<int> > &matrix) {
+ size_t rows = matrix.size();
+ size_t cols = matrix[0].size();
+ std::vector< std::vector<int> > m(cols);
+ for (size_t i = 0; i < cols; ++i) {
+ m[i].resize(rows);
+ for (size_t j = 0; j < rows; ++j) {
+ m[i][j] = matrix[j][i];
+ }
+ }
+ m.swap(matrix);
+}
+
// Defines an ordering, for sorting.
static bool less(const cbrc::UnsplitAlignment& a,
const cbrc::UnsplitAlignment& b) {
@@ -253,7 +266,8 @@ static void doOneBatch(std::vector<std::string>& mafLines,
mafs.reserve(mafEnds.size() - 1); // saves memory: no excess capacity
for (unsigned i = 1; i < mafEnds.size(); ++i)
mafs.push_back(cbrc::UnsplitAlignment(mafLines.begin() + mafEnds[i-1],
- mafLines.begin() + mafEnds[i]));
+ mafLines.begin() + mafEnds[i],
+ opts.isTopSeqQuery));
sort(mafs.begin(), mafs.end(), less);
std::vector<cbrc::UnsplitAlignment>::const_iterator b = mafs.begin();
@@ -319,7 +333,8 @@ void lastSplit(LastSplitOptions& opts) {
int score;
ls >> word >> name;
while (ls >> score) row.push_back(score);
- if (word == "#" && name.size() == 1 && !row.empty() && ls.eof()) {
+ if (word == "#" && name.size() == 1 &&
+ row.size() == colNames.size() && ls.eof()) {
rowNames.push_back(std::toupper(name[0]));
scoreMatrix.push_back(row);
} else {
@@ -375,6 +390,12 @@ void lastSplit(LastSplitOptions& opts) {
int qualityOffset =
(sequenceFormat == 0) ? 0 : (sequenceFormat == 3) ? 64 : 33;
printParameters(opts);
+ if (opts.isTopSeqQuery) {
+ transpose(scoreMatrix);
+ std::swap(rowNames, colNames);
+ std::swap(gapExistenceCost, insExistenceCost);
+ std::swap(gapExtensionCost, insExtensionCost);
+ }
sa.setParams(-gapExistenceCost, -gapExtensionCost,
-insExistenceCost, -insExtensionCost,
-jumpCost, -restartCost, scale, qualityOffset);
@@ -389,7 +410,7 @@ void lastSplit(LastSplitOptions& opts) {
}
}
if (state == 1) { // we are reading alignments
- if (startsWith(line, "# batch")) {
+ if (startsWith(line, "# batch") && !opts.isTopSeqQuery) {
addMaf(mafEnds, mafLines);
doOneBatch(mafLines, mafEnds, sa, opts, isAlreadySplit);
mafLines.clear();
=====================================
src/split/last-split.hh
=====================================
@@ -19,6 +19,7 @@
struct LastSplitOptions {
int format;
+ bool isTopSeqQuery;
std::string genome;
int direction;
double cis;
=====================================
test/alli.fa
=====================================
@@ -0,0 +1,3 @@
+>NW_017707804.1
+AACACCACAATGAGATACCACCTCACAGCAGTCAGAATGGCTACAATTAA
+CAAGTCAGGAAAGGACAGAT
=====================================
test/huma.fa
=====================================
@@ -0,0 +1,3 @@
+>57:130722274
+aaaaccacaatgagagtcagagtggtgattattaaaagtcgagaaatgat
+agat
=====================================
test/last-split-test.out
=====================================
@@ -7,6 +7,7 @@ come from different parts of the genome.
Options:
-h, --help show this help message and exit
-f, --format=FMT output format: MAF, MAF+ (default: depends on input)
+ -r, --reverse reverse the roles of the 2 sequences in each alignment
-g, --genome=NAME lastdb genome name
-d, --direction=D RNA direction: 0=reverse, 1=forward, 2=mixed (default=1)
-c, --cis=PROB cis-splice probability per base (default=0.004)
@@ -43885,3 +43886,80 @@ s chr6 42658730 94 + 170805979 CTGAAATGCA-GCGGCATTTTA--TTGA---TGAAAACAA
s spliceWithGap 2662 98 - 7262 CTGAAATGCATGCAG-ATTTGAGTTTGAGGCTGAAAACAGTGAACT-TTTCAGCAGACATTAGAACTGGATGCCTCAACCTCTGCTGTTCTTGATCATAG
p !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+#
+# a=31 b=1 A=21 B=3 e=190 d=94 x=189 y=48 z=189 D=1e+09 E=0.516058
+# R=01 u=0 s=2 S=1 M=0 T=0 m=100 l=1 n=100 k=1 w=1000 t=4.75495 j=3 Q=0
+# /home/mcfrith/data/genome/calMil1/last/calMil1-uMAM8
+# Reference sequences=21204 normal letters=936953443
+# lambda=0.214243 K=0.246122
+#
+# A C G T M S K W R Y B D H V
+# A 5 -7 -3 -6 2 -5 -4 2 3 -6 -5 1 1 1
+# C -8 5 -7 -2 2 2 -4 -4 -7 3 1 -5 1 0
+# G -2 -7 5 -8 -4 2 2 -4 3 -7 0 1 -5 1
+# T -6 -3 -7 5 -4 -5 2 2 -6 3 1 1 1 -5
+# M 2 2 -5 -4 2 0 -4 0 0 0 -1 -1 1 1
+# S -4 2 2 -4 0 2 0 -4 0 0 1 -1 -1 1
+# K -4 -5 2 2 -4 0 2 0 0 0 1 1 -1 -1
+# W 2 -5 -5 2 0 -5 0 2 0 0 -1 1 1 -1
+# R 3 -7 3 -7 0 0 0 0 3 -7 -2 1 -1 1
+# Y -7 3 -7 3 0 0 0 0 -7 3 1 -1 1 -2
+# B -5 1 1 1 -1 1 1 -1 -1 1 1 0 0 0
+# D 1 -5 1 0 -1 -1 1 1 1 -2 -1 1 -1 0
+# H 0 1 -5 1 1 -1 -1 1 -2 1 0 -1 1 -1
+# V 1 1 1 -5 1 1 -1 -1 1 -1 0 0 0 1
+#
+# Coordinates are 0-based. For - strand matches, coordinates
+# in the reverse complement of the 2nd sequence are used.
+#
+# name start alnSize strand seqSize alignment
+#
+# m=1 s=190
+#
+# Query sequences=195
+# m=1 s=190
+#
+a score=226 mismap=0.2
+s KI635862 2418202 89 + 12728579 CGCTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC-GAGCAGCCGG-ATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 248781029 91 - 248956422 CACTGCAACCTCCACCTCCCGGGTTCAAGCGATTCTCCTGCCTCAGGCTCCCGAGTAGCTGGGATTACAGGCGCGCGCCACCATGCCCAGC
+
+a score=211 mismap=0.5
+s KI635889 4462115 58 + 6429418 TACCTATAGATTGGTTGACAGCTTTCATGCACAAATTTTCCACCAGAATCTTTCCTGG
+s chr1 248943367 58 - 248956422 TGCCTATGGACTGGTTGACAGCCTTCATGCACAGGTTCTCCACCAGAGCCTTTCTTGG
+
+a score=271 mismap=1.3e-08
+s KI635901 1311390 127 + 5048387 TCTTTTTTTCCTTCTGTCAACACTTTTAGCTGAAGCCATAAAAACGACAAGTCCCGTCATAATCCCATACCCACTCTTTCAGTCCAATGCTGAAGATCTTAGTGTTGAAGGTTTACCCGAAGGTATT
+s chr1 248791606 123 - 248956422 TCCCTttttctttttttt----CTTATAGCTCGAGCTGTAAAAGCCAAAGGTCCGGTGATGATCCCATACCCTTTTTTCCAGTCTCATGTTGAAGATTTTTATGTAGAAGGCCTtcccaaaggaatt
+
+a score=392 mismap=3.2e-06
+s KI635917 1140607 159 + 4339253 ACCTTGTTcttgttccttctgcttcttcttcttctccatcttaatcAGCTTTGCATTGCGCAGTTTGGCCTTTCCAATGCCTCCGGCCTGACGAATGGATTCCAGCAGACTAGCCCTTCCATCCGAGGGTTTCACAACCTCCTTGGGTGCCCCCTGGAC
+s chr1 186314 156 + 248956422 ACCTTGCTCctgctccttctg---cttcttcttctccAGCTTTCGCTCCTTCATGCTGCGCAGCTTGGCCTTGCCGATGCCCCCAGCTTGGCGGATGGACTCTAGCAGAGTGGCCCGGCCACCGGAGGGGTCGACCACTTCCCTGGGAGCTCCCTGGAC
+
+a score=520 mismap=0.2
+s KI635917 1143406 188 + 4339253 TCTGTGCTGAACGTGGGCAGTTCAGGCATGTTGCTTCCTGGCACGGAGGGCGCGATGCCAGGTCCCAGGTCAGCACTGTACATGAGGTCATCTGCAATCCCAGGCAGATCCGGTAGGTAGGATGGGACGTCAATCTCTGGGACCTGACCCAGGTCTGGTACATAGAAGTAATTCTCAGTAGTCTGAAA
+s chr1 16876 185 + 248956422 TCAGTGTGGAAGGTGGGCAGTTCTGGAATGGTGC---CAGGGGCAGAGGGGGCAATGCCGGGGCCCAGGTCGGCAATGTACATGAGGTCGTTGGCAATGCCGGGCAGGTCAGGCAGGTAGGATGGAACATCAATCTCAGGCACCTGGCCCAGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCAA
+
+a score=340 mismap=1e-09
+s KI635917 1144102 140 + 4339253 ACCCGTCTTTCTAGTTGCTCTCGCGTGGTGATTGAGAGCGGGGCATCAAACGGCTTCTCTTCCTTCTCAGTCTCCAGTGCCATGTGTGTCTTTGTCACAGCTCCTGCTAACGGATCCAAGAGAACGTACTTCTTGTAGCT
+s chr1 17230 140 + 248956422 ACCTGCTGTTCCAGCTGCTCTCTCTTGCTGATGGACAAGGGGGCATCAAACAGCTTCTCCTCTGTCTCTGCCCCCAGCATCACATGGGTCTTTGTTACAGCACCAGCCAGGGGGTCCAGGAAGACATACTTCTTCTACCT
+
+a score=334 mismap=1e-09
+s KI635917 1144725 145 + 4339253 TACGGATTCTCAGTAGTGTTGAACAGCAGCAGGGAGCTGATGGAGCTAATGTTCCGAGGAAGGCTTCCCAACCCCTCCTCCGTCTCATCCTCCTCTCGAGACTTGGTGTTCAGACACACGGGGTAATGCTTTGCCTTTTCCTGAG
+s chr1 17602 145 + 248956422 TACAGGTTCTCGGTGGTGTTGAAGAGCAGCAAGGAGCTGACAGAGCTGATGTTGCTGGGAAGACCCCCAAGTCCCTCTTCTGCATCGTCCTCGGGCTCCGGCTTGGTGCTCACGCACACAGGAAAGTCCTTCAGCTTCTCCTGAG
+
+a score=340 mismap=1e-09
+s KI635917 1146887 159 + 4339253 CACACCTGCACAGCCTTATCATCCAGAGGCTTCAGCTTGCTGTGGATCTTGTAGCGTGGGCGCTTCTGAGAAGCAGGATCATCCGCGCCCGAGAATATGGAAGTGTAATCCTGTAGCCGGTCAGGTGCAGGGTACTTAGCACTGGAAAAGACCTGCAGA
+s chr1 17909 159 + 248956422 CAGACCTGCAGGGCCCGCTCGTCCAGGGGGCGGTGCTTGCTCTGGATCCTGTGGCGGGGGCGTCTCTGCAGGCCAGGGTCCTGGGCGCCCGTGAAGATGGAGCCATATTCCTGCAGGCGCCCTGGAGCAGGGTACTTGGCACTGGAGAACACCTGTGGA
+
+a score=240 mismap=1.6e-05
+s KI635917 1147176 117 + 4339253 TACTTTTGTGGCTTTCTTGCTGCCTTTGATCCTCTCGATCTTGGCCTGTGCCAGATTGATGCGGTCACTGATGGACTGCAGGCGAGTCTGATTCCTCTCCACGTTTTGGGAAACCCT
+s chr1 18264 117 + 248956422 TACCTTGATGGCCTTCTTGCTGCCCTTGATCTTCTCAATCTTGGCCTGGGCCAAGGAGACCTTCTCTCCAATGGCCTGCACCTGGCTCCGGCTCTGCTCTACCTGCTGGGAGATCCT
+
+a score=252 mismap=1e-06
+s KI635917 1161884 119 + 4339253 ACCTCGTGAAGACATCATTGGAGATTTTTTGGAGATACTGAAGAGCATCAGTGATCTGGTGAATGGACTCTTCCCGGCGCAGGTCCGGCTGGATTATTGGCACCGTGTACACCTGTCCT
+s chr1 24735 119 + 248956422 ACCTGCTGAAGATGTCTCCAGAGACCTTCTGCAGGTACTGCAGGGCATCCGCCATCTGCTGGACGGCCTCCTCTCGCCGCAGGTCTGGCTGGATGAAGGGCACGGCATAGGTCTGACCT
+
+a score=330 mismap=0.79
+s KI636063 409336 105 + 789923 AAAATATGGAACGCTTCACGAATTTGCGTGTCATCCTTGCGCAGGGGCCATGCTAATCTTCTCTGTATCGTTCCAATTTTAGTATATGTGCTGCCGAAGCGAGCA
+s chr1 157784 102 + 248956422 aaaataTGGAATGCTTCACAAATTTGCATGTCATTCTTTCACAGAGGCCGTGCCAA---TCTCTCTATTGTTCCAACTTAAGTATGTGTGCTACTGAGGCAAGCA
+
=====================================
test/last-split-test.sh
=====================================
@@ -33,4 +33,7 @@ maf=SRR359290-1k.maf
last-split -fMAF 102.maf
last-split -d1 -fMAF+ spliceWithGap.maf
+
+ last-split -r split1.maf
+
} | diff -u last-split-test.out -
=====================================
test/last-test.out
=====================================
@@ -4792,3 +4792,39 @@ s LTR22C#LTR/ERVK 0 505 + 509 TGTAGGGGNTCGGTCAGGGTGGTGGGAGAAATTGTAAAAATAAATTATA
140 chrM 3279 28 + 16571 chrM 10177 28 - 16775 28 EG2=1.5e+05 E=8.5e-05
# batch 1
# Query sequences=2
+#
+# a=15 b=3 A=15 B=3 e=79 d=40 x=78 y=46 z=78 D=1e+06 E=1.42772e+10
+# R=01 u=0 s=2 S=0 M=0 T=0 m=10 l=1 n=10 k=1 w=1000 t=4.5512 j=7 Q=0
+# /tmp/last-test
+# Reference sequences=1 normal letters=70
+# lambda=0.203841 K=0.140698
+#
+# A C G T M S K W R Y B D H V
+# A 5 -5 -5 -5 2 -5 -5 2 2 -5 -5 1 1 1
+# C -5 5 -5 -5 2 2 -5 -5 -5 2 1 -5 1 1
+# G -5 -5 5 -5 -5 2 2 -5 2 -5 1 1 -5 1
+# T -5 -5 -5 5 -5 -5 2 2 -5 2 1 1 1 -5
+# M 2 2 -5 -5 2 0 -5 0 0 0 -1 -1 1 1
+# S -5 2 2 -5 0 2 0 -5 0 0 1 -1 -1 1
+# K -5 -5 2 2 -5 0 2 0 0 0 1 1 -1 -1
+# W 2 -5 -5 2 0 -5 0 2 0 0 -1 1 1 -1
+# R 2 -5 2 -5 0 0 0 0 2 -5 -1 1 -1 1
+# Y -5 2 -5 2 0 0 0 0 -5 2 1 -1 1 -1
+# B -5 1 1 1 -1 1 1 -1 -1 1 1 0 0 0
+# D 1 -5 1 1 -1 -1 1 1 1 -1 0 1 0 0
+# H 1 1 -5 1 1 -1 -1 1 -1 1 0 0 1 0
+# V 1 1 1 -5 1 1 -1 -1 1 -1 0 0 0 1
+#
+# Coordinates are 0-based. For - strand matches, coordinates
+# in the reverse complement of the 2nd sequence are used.
+#
+# name start alnSize strand seqSize alignment
+#
+# batch 0
+a score=92 EG2=1e+09 E=8e-07 fullScore=102
+s NW_017707804.1 0 70 + 70 AACACCACAATGAGATACCACCTCACAGCAGTCAGAATGGCTACAATTAACAAGTCAGGAAAGGACAGAT
+s 57:130722274 0 54 + 54 AAAACCACAATGAGA---------------GTCAGAGTGGTGATTATTAA-AAGTCGAGAAATGATAGAT
+p $&'~~~~~~~~~~~~(.0233.,,,,,+--&)*+,,,-.-+++++---)'%(+253--/111/0/,,+(%
+c 20.6 0.00414 1.97 1.05 1.13 4.98 0.158 2.83 0.966 0.0011 9.56 0.997 0.0153 0.0037 1.07 6.54 51.8 16.4 0.361 3 0.27
+
+# Query sequences=1
=====================================
test/last-test.sh
=====================================
@@ -236,6 +236,10 @@ trap 'rm -f $db*' EXIT
lastdb -uRY8-8 -B1 $db hg19-M.fa
lastdb -uRY8 -B1 $db hg19-M.fa
lastal -fTAB -q8 -b4 $db galGal3-M-32.fa
+
+ # tricky Forward-Backward bug that happened once
+ lastdb $db alli.fa
+ lastal -j7 -r5 -q5 -a15 -b3 $db huma.fa
} 2>&1 |
grep -v version | diff -u last-test.out -
=====================================
test/split1.maf
=====================================
@@ -0,0 +1,201 @@
+#
+# a=31 b=1 A=21 B=3 e=190 d=94 x=189 y=48 z=189 D=1e+09 E=0.516058
+# R=01 u=0 s=2 S=1 M=0 T=0 m=100 l=1 n=100 k=1 w=1000 t=4.75495 j=3 Q=0
+# /home/mcfrith/data/genome/calMil1/last/calMil1-uMAM8
+# Reference sequences=21204 normal letters=936953443
+# lambda=0.214243 K=0.246122
+#
+# A C G T M S K W R Y B D H V
+# A 5 -7 -3 -6 2 -5 -4 2 3 -6 -5 1 1 1
+# C -8 5 -7 -2 2 2 -4 -4 -7 3 1 -5 1 0
+# G -2 -7 5 -8 -4 2 2 -4 3 -7 0 1 -5 1
+# T -6 -3 -7 5 -4 -5 2 2 -6 3 1 1 1 -5
+# M 2 2 -5 -4 2 0 -4 0 0 0 -1 -1 1 1
+# S -4 2 2 -4 0 2 0 -4 0 0 1 -1 -1 1
+# K -4 -5 2 2 -4 0 2 0 0 0 1 1 -1 -1
+# W 2 -5 -5 2 0 -5 0 2 0 0 -1 1 1 -1
+# R 3 -7 3 -7 0 0 0 0 3 -7 -2 1 -1 1
+# Y -7 3 -7 3 0 0 0 0 -7 3 1 -1 1 -2
+# B -5 1 1 1 -1 1 1 -1 -1 1 1 0 0 0
+# D 1 -5 1 0 -1 -1 1 1 1 -2 -1 1 -1 0
+# H 0 1 -5 1 1 -1 -1 1 -2 1 0 -1 1 -1
+# V 1 1 1 -5 1 1 -1 -1 1 -1 0 0 0 1
+#
+# Coordinates are 0-based. For - strand matches, coordinates
+# in the reverse complement of the 2nd sequence are used.
+#
+# name start alnSize strand seqSize alignment
+#
+# m=1 s=190
+#
+# Query sequences=195
+a score=211 mismap=0.00255
+s KI635889 4462115 58 + 6429418 TACCTATAGATTGGTTGACAGCTTTCATGCACAAATTTTCCACCAGAATCTTTCCTGG
+s chr1 248943367 58 - 248956422 TGCCTATGGACTGGTTGACAGCCTTCATGCACAGGTTCTCCACCAGAGCCTTTCTTGG
+p #$)-1356899::::::::::::::::::::::::::::::::::::9888640,+(%
+
+a score=331 mismap=1e-10
+s KI635917 1140607 159 + 4339253 ACCTTGTTcttgttccttctgcttcttcttcttctccatcttaatcAGCTTTGCATTGCGCAGTTTGGCCTTTCCAATGCCTCCGGCCTGACGAATGGATTCCAGCAGACTAGCCCTTCCATCCGAGGGTTTCACAACCTCCTTGGGTGCCCCCTGGAC
+s chr1 15793 155 + 248956422 ACCTTGCTCctgctccttctgctgctg---cttctccaGCTTTCGCTCCTTCATGCTGCGCAGCTTGGCCTTGCCGATGCCCCCAGCTTGGCGGATGGACTCTAGCAGAGTGG-CCAGCCACCGGAGGGGTCAACCACTTCCCTGGGAGCTCCCTGGAC
+p %*.2578<>?BDFJMOPPQQQQQQQQQQQQUY\^___________```bbccccdejnsw{~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{zwtpkgbb``^^^^^^^^^^^^^^^^]]\\\\[[[[ZZXUTROKGFDBA>;73.*%
+
+a score=520 mismap=1e-10
+s KI635917 1143406 188 + 4339253 TCTGTGCTGAACGTGGGCAGTTCAGGCATGTTGCTTCCTGGCACGGAGGGCGCGATGCCAGGTCCCAGGTCAGCACTGTACATGAGGTCATCTGCAATCCCAGGCAGATCCGGTAGGTAGGATGGGACGTCAATCTCTGGGACCTGACCCAGGTCTGGTACATAGAAGTAATTCTCAGTAGTCTGAAA
+s chr1 16876 185 + 248956422 TCAGTGTGGAAGGTGGGCAGTTCTGGAATGGTGC---CAGGGGCAGAGGGGGCAATGCCGGGGCCCAGGTCGGCAATGTACATGAGGTCGTTGGCAATGCCGGGCAGGTCAGGCAGGTAGGATGGAACATCAATCTCAGGCACCTGGCCCAGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCAA
+p #$%()**+.1237<@EILOPQQQQQQQQQQQQQQQQQRRSTTUXY]begiimoquz}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}xsojigc_[VRMID?;:740+'&$$$$$$#"""
+
+a score=340 mismap=1e-10
+s KI635917 1144102 140 + 4339253 ACCCGTCTTTCTAGTTGCTCTCGCGTGGTGATTGAGAGCGGGGCATCAAACGGCTTCTCTTCCTTCTCAGTCTCCAGTGCCATGTGTGTCTTTGTCACAGCTCCTGCTAACGGATCCAAGAGAACGTACTTCTTGTAGCT
+s chr1 17230 140 + 248956422 ACCTGCTGTTCCAGCTGCTCTCTCTTGCTGATGGACAAGGGGGCATCAAACAGCTTCTCCTCTGTCTCTGCCCCCAGCATCACATGGGTCTTTGTTACAGCACCAGCCAGGGGGTCCAGGAAGACATACTTCTTCTACCT
+p #%%&&&''+.12589=BEHJKLLLMOPQTWXYYZZ[\\\afjosx|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|xsonljihfecbbba`_]ZVURONMLIHFC?;62-((&$$#
+
+a score=334 mismap=1e-10
+s KI635917 1144725 145 + 4339253 TACGGATTCTCAGTAGTGTTGAACAGCAGCAGGGAGCTGATGGAGCTAATGTTCCGAGGAAGGCTTCCCAACCCCTCCTCCGTCTCATCCTCCTCTCGAGACTTGGTGTTCAGACACACGGGGTAATGCTTTGCCTTTTCCTGAG
+s chr1 17602 145 + 248956422 TACAGGTTCTCGGTGGTGTTGAAGAGCAGCAAGGAGCTGACAGAGCTGATGTTGCTGGGAAGACCCCCAAGTCCCTCTTCTGCATCGTCCTCGGGCTCCGGCTTGGTGCTCACGCACACAGGAAAGTCCTTCAGCTTCTCCTGAG
+p $(*+-/37;?ABEHINRW[_cegglpuy}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}yxvuttssrplhdcccccccccccccccba`^[ZZZXVRNLJFEEDCBBBA>>===<::862.*%
+
+a score=340 mismap=1e-10
+s KI635917 1146887 159 + 4339253 CACACCTGCACAGCCTTATCATCCAGAGGCTTCAGCTTGCTGTGGATCTTGTAGCGTGGGCGCTTCTGAGAAGCAGGATCATCCGCGCCCGAGAATATGGAAGTGTAATCCTGTAGCCGGTCAGGTGCAGGGTACTTAGCACTGGAAAAGACCTGCAGA
+s chr1 17909 159 + 248956422 CAGACCTGCAGGGCCCGCTCGTCCAGGGGGCGGTGCTTGCTCTGGATCCTGTGGCGGGGGCGTCTCTGCAGGCCAGGGTCCTGGGCGCCCGTGAAGATGGAGCCATATTCCTGCAGGCGCCCTGGAGCAGGGTACTTGGCACTGGAGAACACCTGTGGA
+p #$%).1456667888889;=>@AAAAAAAAAAABGKPTWZ[\`eimqstx{~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~zvrmhd_^\XTPKGB><:6652/+'%$#
+
+a score=240 mismap=7.38e-06
+s KI635917 1147176 117 + 4339253 TACTTTTGTGGCTTTCTTGCTGCCTTTGATCCTCTCGATCTTGGCCTGTGCCAGATTGATGCGGTCACTGATGGACTGCAGGCGAGTCTGATTCCTCTCCACGTTTTGGGAAACCCT
+s chr1 18264 117 + 248956422 TACCTTGATGGCCTTCTTGCTGCCCTTGATCTTCTCAATCTTGGCCTGGGCCAAGGAGACCTTCTCTCCAATGGCCTGCACCTGGCTCCGGCTCTGCTCTACCTGCTGGGAGATCCT
+p $'()*++-159;=AEJNQRSTTTTTTTTTTTTTTTTTTTTTTTTTTSSQQPOLHGGFFFFFFFFFFFFFFFFFFFFEEECA@@@@@@@@@@??????>>=<;987777642.-+*($
+
+a score=252 mismap=4.27e-07
+s KI635917 1161884 119 + 4339253 ACCTCGTGAAGACATCATTGGAGATTTTTTGGAGATACTGAAGAGCATCAGTGATCTGGTGAATGGACTCTTCCCGGCGCAGGTCCGGCTGGATTATTGGCACCGTGTACACCTGTCCT
+s chr1 24735 119 + 248956422 ACCTGCTGAAGATGTCTCCAGAGACCTTCTGCAGGTACTGCAGGGCATCCGCCATCTGCTGGACGGCCTCCTCTCGCCGCAGGTCTGGCTGGATGAAGGGCACGGCATAGGTCTGACCT
+p #%&&''+/35667778889:>ACDEFIKLNOOSUVZ]^___``````````````````````````````````````````_]ZYWSOKFB=987666542/.-,,+*))))('&%#
+
+a score=192 mismap=0.109
+s KI635862 2418195 96 + 12728579 CACAGGCCGCTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCCGA-GCAGCC-GGATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 248924753 98 - 248956422 CATAGCTCACTGCAGCCTCGATCTCCCAAGCTCAAGTGATCCTCCCGCCTCAGCCTCCCATGTAGCTGGGACTACAGGTGTGTGACACCATGACTGGC
+p #$%%&&'(()*****************************************************************************))(''&&%%$#
+
+a score=201 mismap=0.031
+s KI635862 2418214 77 + 12728579 GAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC--GAGCAGCCGGATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 248920976 79 - 248956422 GAACTCCTGGCCTCAAGCGATCCTCCCACCTCAGCCTCCCAAAGTGTTGGGATTATAGACATGAGCCACTGCACCTGGC
+p %),.//00000000000000000000000000000000000000000000000000000000000///////..-*)($
+
+a score=214 mismap=0.00137
+s KI635862 2418214 80 + 12728579 GAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC--GAGCAGCCGGATAACAGGCGCGCGCCACCGCGCCCGGCGAG
+s chr1 248916719 82 - 248956422 GAACTCCTGACCTCAGGTGATCCACCCGCCTCAGCCTCCCAAAGTGTTGGGATTACAGGCACGAGCCACTGCATCCAGCCAG
+p %*.269;<==========================================================<<:87410/.+*($$#
+
+a score=214 mismap=0.00209
+s KI635862 2418214 77 + 12728579 GAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCCGAGCAGCC--GGATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 248879450 79 - 248956422 GAATTCCTGGCCTCAAGTGATCTGCCCGCCTCAGCCTCCCAAAATGCTGGGATCACAGGCATGAGCCACCACGCCCGGC
+p %(+,158:;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;::862.*%
+
+a score=198 mismap=0.0234
+s KI635862 2418209 85 + 12728579 GCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC--GAGCAGCCGGATAACAGGCGCGCGCCACCGCGCCCGGCGAG
+s chr1 248845257 87 - 248956422 GTCCCGAACTCCTGACCTTAGGTGATCCACCAGCATCGGCCTCCCAAAGTGCTGGGATTACAGGTGTGAGCCACTGTGTCCAGCCAG
+p #$&'(+.001111111111111111111111111111111111111111111111111111111111111111100//..-*)'$$#
+
+a score=190 mismap=0.228
+s KI635862 2418217 74 + 12728579 CTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCCG-AGCAGCCGG-ATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 248845129 76 - 248956422 CTCCTGGGTTCAAGCGATTCTCCTGCCTCAGCCTCCCTAGTAGCTGGTATTACAGGTGCCTGCCACCACACCCAGC
+p $%&''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''&&%$#
+
+a score=194 mismap=0.127
+s KI635862 2418218 73 + 12728579 TCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC-GAGCAGCC-GGATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 248826215 75 - 248956422 TCCTGGGTTCAAGCGATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGTGTGTGCCACCACACCCGGC
+p $'()))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))('$
+
+a score=192 mismap=0.12
+s KI635862 2418202 89 + 12728579 CGCTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC-GAGCAGCC-GGATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 248811964 91 - 248956422 CACTGCAGCCTCCACCTGCCAGGCTCAAGCAATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGATTACAGGTAAGTGCCACCACACCCAGC
+p #$'())))))))))))))*****************************************************************))))(&%#
+
+a score=191 mismap=0.124
+s KI635862 2418206 85 + 12728579 GCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC-GAGCAGCC-GGATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 248811595 87 - 248956422 GCAACCTCCACCTTCTGGGTTCAAGCAATTCTCCTGCCTCAGCCTCCCGAGTAGTTGGGATTACAGGCATGTGCCACCATGCCCAGC
+p "##$$$$$$%%'(())))))*********************************************************))))))(&%#
+
+a score=207 mismap=0.00555
+s KI635862 2418202 89 + 12728579 CGCTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC-GAGCAGCC-GGATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 248806800 91 - 248956422 CACTGCAACCTTCACCTTCTGGGTTCAAGCAATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGATTACAGGTGCCCGCCACCATACCCAGC
+p #%(,.///000001145567777777777777777777777777777777777777777777777777777777777776530/..-+(&$
+
+a score=192 mismap=0.117
+s KI635862 2418195 96 + 12728579 CACAGGCCGCTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTC-CGAGCAGCCGG-ATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 248803921 98 - 248956422 CATAGCTCACTGCAGCCTTGACCTCCTAGGCTAAAGCAATCCTCCCACCTTAGCCTCTCCAGTAGCTGGAACTACAGGCATGCATCACCATGTCCAGC
+p #$$%&&'(()*******************************************************************************)))(('&%#
+
+a score=330 mismap=0.75
+s KI636063 409336 105 + 789923 AAAATATGGAACGCTTCACGAATTTGCGTGTCATCCTTGCGCAGGGGCCATGCTAATCTTCTCTGTATCGTTCCAATTTTAGTATATGTGCTGCCGAAGCGAGCA
+s chr1 157784 102 + 248956422 aaaataTGGAATGCTTCACAAATTTGCATGTCATTCTTTCACAGAGGCCGTGCCAA---TCTCTCTATTGTTCCAACTTAAGTATGTGTGCTACTGAGGCAAGCA
+p !"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""!
+
+a score=203 mismap=0.0127
+s KI635862 2418202 85 + 12728579 CGCTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTC-CGAGCAGCC-GGATAACAGGCGCGCGCCACCGCGCC
+s chr1 248798099 87 - 248956422 CACTGCAGCCTCAACCTCCTGGGCTCAAGCAATCCTCCTGCCTCAGCCTCACAAGTAGCTGGGACTACAGGTGCTTGTCACCACACC
+p #%)-01233333333333333333333333333333333333333333333333333333333333333333322211110.*)'&$
+
+a score=201 mismap=0.0167
+s KI635862 2418211 79 + 12728579 CCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC--GAGCAGCCGGATAACAGGCGCGCGCCACCGCGCCCGG
+s chr1 248796986 81 - 248956422 CTAGAACTTTTGACCTCAGGTGATCCGCCCGCCTCAGCCTCCCAAAGTGCTGGGATTACAGGCGTGATCCACCGCGCCCAG
+p #%),/11222222222222222222222222222222222222222222222222222222222222222222221/-)%#
+
+a score=209 mismap=0.00211
+s KI635862 2418209 87 + 12728579 GCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC--GAGCAGCCGGATAACAGGCGCGCGCCACCGCGCCCGGCGAGGG
+s chr1 248792490 89 - 248956422 GTCTAGAACTCCTGACCTTGGGTGATCCACCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACTGCGCCCAGCCTGGG
+p #$'(-158:;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;:9740,+($$$#"
+
+a score=271 mismap=6.21e-09
+s KI635901 1311390 127 + 5048387 TCTTTTTTTCCTTCTGTCAACACTTTTAGCTGAAGCCATAAAAACGACAAGTCCCGTCATAATCCCATACCCACTCTTTCAGTCCAATGCTGAAGATCTTAGTGTTGAAGGTTTACCCGAAGGTATT
+s chr1 248791606 123 - 248956422 TCCCTttttctttttttt----CTTATAGCTCGAGCTGTAAAAGCCAAAGGTCCGGTGATGATCCCATACCCTTTTTTCCAGTCTCATGTTGAAGATTTTTATGTAGAAGGCCTtcccaaaggaatt
+p "##$&'''''''''''''''''+./048:;<<=ACEFGKNQRSSSSTTWX\_abbdefikloqrrrssssssssssssssssssrrrrrrrrqolid`^\XXWVTQQOMIEA@?>>=;8752.)('$
+
+a score=202 mismap=0.0217
+s KI635862 2418204 87 + 12728579 CTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC-GAGCAGCC-GGATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 248790144 89 - 248956422 CTGCAACCTCCACCTCCTGGGTTCAGGCAATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGATTACAGGTGCCTGCCACCATGCCCAGC
+p $')*+++++++,-/011111111111111111111111111111111111111111111111111111111111111111000/.+(&$
+
+a score=191 mismap=0.145
+s KI635862 2418202 87 + 12728579 CGCTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCT-CCGAGCAGCC-GGATAACAGGCGCGCGCCACCGCGCCCG
+s chr1 167211 89 + 248956422 CACTGCAACCTCTGCCTCCTGGGTTCAAGCGATTCTCCTGCCTTGGCCTCCCGAATAGCTGGGATTACAGACATGCGCCACCACACCCG
+p "#%&&&'''''''''()))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))((('&$
+
+a score=192 mismap=0.162
+s KI635862 2418214 77 + 12728579 GAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC--GAGCAGCCGGATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 248786969 79 - 248956422 GAACTCCTGACCTCAAGTGATCCACCCATCTCGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACTGAGCCCAGC
+p $&((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((('&%#
+
+a score=194 mismap=0.0422
+s KI635862 2418197 97 + 12728579 CAGGCCGCTGCAGCC-CAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC-GAGCAGCCG-GATAACAGGCGCGCGCCACCGCGCCCGGCGAG
+s chr1 248786816 100 - 248956422 CAGTTCACTGCAACCTCTGCCCCCCTGGGTTCAAACAATTCTCCTGTCTCAGCCTCCTGAGTAGCTGAGATTACAGGCATGTGCCACCATGCCCGGCTAG
+p "###$%%&&&''''''''''')*,-.....................................................................-+($$#
+
+a score=199 mismap=0.0162
+s KI635862 2418217 87 + 12728579 CTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC-GAGCAGCCGG-ATAACAGGCGCGCGCCACCGCGCCCGGCGAGGGTTGCGGCT
+s chr1 248781694 89 - 248956422 CTTTTGGGTTCAAGGGATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGATTACAGGCATGTGCCACCACACCCGGCTAATGTTGTAGTT
+p #$%&)*++,/12222222222222222222222222222222222222222222222222222222222222221/-,+++**(&%$#"
+
+a score=226 mismap=0.000106
+s KI635862 2418202 89 + 12728579 CGCTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC-GAGCAGCCGG-ATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 248781029 91 - 248956422 CACTGCAACCTCCACCTCCCGGGTTCAAGCGATTCTCCTGCCTCAGGCTCCCGAGTAGCTGGGATTACAGGCGCGCGCCACCATGCCCAGC
+p #%),./0000000226:<>>@@ABEGHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGFC@;75530,('$
+
+a score=194 mismap=0.0709
+s KI635862 2418197 94 + 12728579 CAGGCCGCTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTCC-GAGCAGCCGG-ATAACAGGCGCGCGCCACCGCGCCCGGC
+s chr1 175660 96 + 248956422 CAGCTCACTGCAATCTCTGCCCCCTGGGTTCAAGCAATTCTCCTTCCTCAGCCTCCTGAGTAGCTGGGATTACAGGCATGCACCACCACGCCTGGC
+p "###$%%&''''''''''''))++,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,+))'$
+
+a score=211 mismap=0.00255
+s KI635889 4462115 58 + 6429418 TACCTATAGATTGGTTGACAGCTTTCATGCACAAATTTTCCACCAGAATCTTTCCTGG
+s chr1 248772848 58 - 248956422 TGCCTATGGACTGGTTGACAGCCTTCATGCACAGGTTCTCCACCAGAGCCTTTCTTGG
+p #$)-1356899::::::::::::::::::::::::::::::::::::9888640,+(%
+
+a score=392 mismap=1e-10
+s KI635917 1140607 159 + 4339253 ACCTTGTTcttgttccttctgcttcttcttcttctccatcttaatcAGCTTTGCATTGCGCAGTTTGGCCTTTCCAATGCCTCCGGCCTGACGAATGGATTCCAGCAGACTAGCCCTTCCATCCGAGGGTTTCACAACCTCCTTGGGTGCCCCCTGGAC
+s chr1 186314 156 + 248956422 ACCTTGCTCctgctccttctg---cttcttcttctccAGCTTTCGCTCCTTCATGCTGCGCAGCTTGGCCTTGCCGATGCCCCCAGCTTGGCGGATGGACTCTAGCAGAGTGGCCCGGCCACCGGAGGGGTCGACCACTTCCCTGGGAGCTCCCTGGAC
+p %*.2578;>?BDEHJKKKKKKKKKPUY^bgkortuuuuuuuuuuuuvvwxxyyyz{~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~zvvuutsrpponkgcbaa``___^]\YVUROKGFDBA>;73.*%
+
+a score=513 mismap=1e-10
+s KI635917 1143406 188 + 4339253 TCTGTGCTGAACGTGGGCAGTTCAGGCATGTTGCTTCCTGGCACGGAGGGCGCGATGCCAGGTCCCAGGTCAGCACTGTACATGAGGTCATCTGCAATCCCAGGCAGATCCGGTAGGTAGGATGGGACGTCAATCTCTGGGACCTGACCCAGGTCTGGTACATAGAAGTAATTCTCAGTAGTCTGAAA
+s chr1 187398 185 + 248956422 TCAGTGTGGAAGGTGGGCAGTTCTGGAATGGTGC---CAGGGGCAGAGGGGGCAATGCCGGGGCCCAGGTCGGCAATGTACATGAGGTCGTTGGCAATGCCGGACAGGTCAGGCAGGTAGGATGGAACATCAATCTCAGGCACCTGGCCCAGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCAA
+p #$%()**+.1237<@EILOPQQQQQQQQQQQQQQQQQRRSTTUXY]begiimoquz}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}xsojigc_[VRMID?;:740+'&$$$$$$#"""
+
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/cf76fa7e236ed8adc40b6c2c1f9011646941aebd
--
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/cf76fa7e236ed8adc40b6c2c1f9011646941aebd
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210528/f5851781/attachment-0001.htm>
More information about the debian-med-commit
mailing list