[med-svn] [Git][med-team/last-align][upstream] New upstream version 1642
Charles Plessy (@plessy)
gitlab at salsa.debian.org
Tue Sep 2 05:21:42 BST 2025
Charles Plessy pushed to branch upstream at Debian Med / last-align
Commits:
9291a16b by Charles Plessy at 2025-09-02T11:50:44+09:00
New upstream version 1642
- - - - -
26 changed files:
- bin/last-train
- bin/maf-linked
- data/MAM4.seed
- data/MAM8.seed
- data/YASS.seed
- doc/last-cookbook.rst
- doc/last-seeds.rst
- doc/lastdb.rst
- doc/maf-linked.rst
- src/Alignment.cc
- src/Alignment.hh
- src/AlignmentPot.hh
- src/AlignmentWrite.cc
- src/LastdbArguments.hh
- src/lastal.cc
- src/makefile
- src/split/cbrc_split_aligner.cc
- src/split/cbrc_split_aligner.hh
- src/split/cbrc_unsplit_alignment.cc
- src/split/cbrc_unsplit_alignment.hh
- src/split/mcf_last_splitter.cc
- src/split/mcf_last_splitter.hh
- test/last-test.sh
- + test/maf-linked-test.sh
- + test/maf-linked-test.txt
- + util/plot-score-matrix
Changes:
=====================================
bin/last-train
=====================================
@@ -9,6 +9,7 @@
from __future__ import division, print_function
import gzip
+import itertools
import math
import optparse
import os
@@ -1582,6 +1583,11 @@ def scoresAndScale(originalScale, matParams, delRatios, insRatios):
"increasing the scale")
originalScale *= 1.1
+def isAlmostSame(oldScores, newScores, slop):
+ old = itertools.chain(oldScores[0], oldScores[1], *oldScores[2])
+ new = itertools.chain(newScores[0], newScores[1], *newScores[2])
+ return all(abs(i - j) <= slop for i, j in zip(old, new))
+
### Routines for codons & frameshifts:
def initialCodonSubstitutionProbs(matchProb):
@@ -1652,7 +1658,7 @@ def bestAminoPerCodon(matProbs, rowProbs):
for column in zip(*matProbs):
z = zip(column, rowProbs)
m = min((-x[0] / x[1], i) for i, x in enumerate(z))
- yield m[1]
+ yield m[1] # xxx stop codons wrong for initialCodonSubstitutionProbs
def freqText(probability):
p = 100 * probability
@@ -1845,12 +1851,19 @@ def writeGapCosts(opts, delCosts, insCosts, isLastFormat, outFile):
print("# frameshiftCosts del-1,del-2,ins+1,ins+2:",
frameshiftCosts, file=outFile)
-def probsFromFile(opts, lastalArgs, maxGapGrowProb, codonMatches, lines):
+def readProbs(opts, trial, lastalArgs, maxGapGrowProb, codonMatches, lines):
print("#", *lastalArgs)
print()
sys.stdout.flush()
+
# For the 1st iteration, use a % identity cutoff closer to 100:
- pid = opts.pid if lastalArgs[-1] == "-p-" else opts.pid * 0.8 + 20
+ if trial < 0:
+ pid = opts.pid * 0.8 + 20
+ elif trial < 1 and opts.codon:
+ pid = 100
+ else:
+ pid = opts.pid
+
matCounts, gapCounts = countsFromLastOutput(opts, pid, codonMatches, lines)
if opts.codon:
gapProbs = frameshiftProbsFromCounts(gapCounts, opts)
@@ -1960,10 +1973,10 @@ def doTraining(opts, args):
if opts.codon:
matProbs, gapProbs = initialCodonProbs(opts)
else:
- matProbs, gapProbs = probsFromFile(opts, lastalArgs, maxGapGrowProb,
- codonMatches, proc.stdout)
+ matProbs, gapProbs = readProbs(opts, -1, lastalArgs, maxGapGrowProb,
+ codonMatches, proc.stdout)
- while True:
+ for trial in itertools.count():
matchRatio, delRatios, insRatios = gapRatiosFunc(*gapProbs)
rowProbs = [sum(i) for i in matProbs]
colProbs = [sum(i) for i in zip(*matProbs)]
@@ -1985,7 +1998,8 @@ def doTraining(opts, args):
if any(isCloseEnough(i, parameters) for i in oldParameters):
break
else:
- if parameters in oldParameters:
+ slop = max(len(oldParameters) - 20, 0) # stop it running forever
+ if any(isAlmostSame(i, parameters, slop) for i in oldParameters):
break
oldParameters.append(parameters)
lastalArgs = fixedLastalArgs(opts, alphabet)
@@ -1997,8 +2011,8 @@ def doTraining(opts, args):
proc.stdin.close()
if opts.postmask and not opts.codon:
proc = process(["last-postmask"], proc.stdout)
- matProbs, gapProbs = probsFromFile(opts, lastalArgs, maxGapGrowProb,
- codonMatches, proc.stdout)
+ matProbs, gapProbs = readProbs(opts, trial, lastalArgs, maxGapGrowProb,
+ codonMatches, proc.stdout)
ss = scoresAndScaleFunc(outerScale, matParams, delRatios, insRatios)
matScores, delCosts, insCosts, scale, rowFreqs, colFreqs = ss
=====================================
bin/maf-linked
=====================================
@@ -29,11 +29,12 @@ def alignmentInput(lines):
seqName = fields[1]
beg = int(fields[2])
span = int(fields[3])
+ seqLen = int(fields[5])
end = beg + span
if fields[4] == "-":
- end = int(fields[5]) - beg
+ end = seqLen - beg
beg = end - span
- r = seqName, beg, end
+ r = seqName, seqLen, beg, end
ranges.append(r)
else:
if alnLines:
@@ -43,10 +44,18 @@ def alignmentInput(lines):
if alnLines:
yield ranges, alnLines
+def isCoveringMuchSequence(alns, nums):
+ n = nums[0]
+ for s in range(2):
+ seqLen = alns[n][0][s][1]
+ coverage = sum(alns[i][0][s][3] - alns[i][0][s][2] for i in nums)
+ if coverage * 2 >= seqLen: return True
+ return False
+
def isNearInAllSeqs(iRanges, jRanges, maxDistance):
for i, j in zip(iRanges, jRanges):
- iSeq, iBeg, iEnd = i
- jSeq, jBeg, jEnd = j
+ iSeq, iLen, iBeg, iEnd = i
+ jSeq, jLen, jBeg, jEnd = j
if iSeq != jSeq:
return False
if jBeg > iEnd + maxDistance or iBeg > jEnd + maxDistance:
@@ -59,13 +68,13 @@ def linkOneAln(args, alns, i):
jIndexEnd = min(i + maxRankDifference + 1, len(alns))
iRanges, iLines, iRank, iLinks = alns[i]
- iSeq, iBeg, iEnd = iRanges[0]
+ iSeq, iLen, iBeg, iEnd = iRanges[0]
j = i
while j > jIndexBeg:
j -= 1
jRanges, jLines, jRank, jLinks = alns[j]
- jSeq, jBeg, jEnd = jRanges[0]
+ jSeq, jLen, jBeg, jEnd = jRanges[0]
if jSeq < iSeq or jEnd + args.distance < iBeg:
break
if isNearInAllSeqs(iRanges, jRanges, args.distance):
@@ -75,7 +84,7 @@ def linkOneAln(args, alns, i):
j = i + 1
while j < jIndexEnd:
jRanges, jLines, jRank, jLinks = alns[j]
- jSeq, jBeg, jEnd = jRanges[0]
+ jSeq, jLen, jBeg, jEnd = jRanges[0]
if iSeq < jSeq or iEnd + args.distance < jBeg:
break
if isNearInAllSeqs(iRanges, jRanges, args.distance):
@@ -107,10 +116,10 @@ def sortKey1(aln):
def main(args):
alns = sorted(alignmentInput(openFile(args.infile)), key=sortKey1)
alns = sorted(aln + (rank, []) for rank, aln in enumerate(alns))
- for i, x in enumerate(alns):
+ for i in range(len(alns)):
linkOneAln(args, alns, i)
for c in connectedComponents(alns):
- if len(c) >= args.count:
+ if len(c) >= args.count or isCoveringMuchSequence(alns, c):
for i in c:
ranges, lines, rank, links = alns[i]
print(*lines, sep="")
=====================================
data/MAM4.seed
=====================================
@@ -2,8 +2,6 @@
# uses about half as much memory. [From Frith & Noé 2014 NAR 42:e59
# Table S11, row 12.]
-#lastdb -U100
-
1 A C G T
0 ACGT
T AG CT
=====================================
data/MAM8.seed
=====================================
@@ -3,8 +3,6 @@
# mammal genomes). [From Frith & Noé 2014 NAR 42:e59 Table S12, row
# 15.]
-#lastdb -U100
-
1 A C G T
0 ACGT
T AG CT
=====================================
data/YASS.seed
=====================================
@@ -2,8 +2,6 @@
# similarities. It is a good compromise for both protein-coding and
# non protein-coding DNA (L Noé & G Kucherov, NAR 2005 33:W540-W543).
-#lastdb -U100
-
1 A C G T
0 ACGT
T AG CT
=====================================
doc/last-cookbook.rst
=====================================
@@ -260,14 +260,15 @@ Aligning human & chimp genomes
------------------------------
The aim of genome-genome alignment is discussed in `our paper`_. This
-recipe seems to work well::
+recipe gets one-to-one alignments::
- lastdb -P8 -c -uRY128 humdb human.fa
+ lastdb -P8 -c -U400 -uRY128 humdb human.fa
last-train -P8 --revsym -C2 humdb chimp.fa > hc.train
- lastal -P8 -D1e9 -C2 --split-f=MAF+ -p hc.train humdb chimp.fa > many-to-one.maf
+ lastal -P8 -D1e9 -C2 --split-f=MAF+ -p hc.train humdb chimp.fa | last-split -r | maf-linked - > out.maf
-``-c`` prevents ``lastal`` using huge amounts of memory and time on
-many ways of aligning centromeric repeats.
+``-c -U400`` suppresses tandem repeats with repeat-unit length ≤ 400.
+(The default is 100). This prevents ``lastal`` using huge amounts of
+memory and time on many ways of aligning centromeric repeats.
``-uRY128`` makes it faster but less sensitive: it will miss tiny
rearranged fragments. To find such fragments, try ``-uRY4``.
@@ -281,46 +282,55 @@ T→C on the other strand). This is usually appropriate for
genome-genome comparison (but maybe not for mitochondria with
asymmetric "heavy" and "light" strands).
-``--split-f=MAF+`` has the same effect as ``--split``, and also makes
-it show `per-base mismap probabilities`_: the probability that each
-query (chimp) base should be aligned to a different part of the
-reference (human).
+``--split-f=MAF+`` has the same effect as ``--split`` (get a unique
+best alignment for each part of chimp), and gets `per-base mismap
+probabilities`_: the probability that each chimp base should be
+aligned to a different part of human.
-The result so far is asymmetric: each part of the chimp genome is
-aligned to at most one part of the human genome, but not vice-versa.
-We can get one-to-one alignments like this::
+``last-split -r`` cuts these alignments down to a unique best
+alignment for each part of human. It uses the *per-base* mismap
+probabilities to get the final *per-alignment* mismap probabilities.
+(If you won't use the mismap probabilities, you can replace
+``--split-f=MAF+`` with ``--split``.)
- last-split -r many-to-one.maf > one-to-one.maf
+maf-linked_ removes isolated alignments, that are not "linked" to
+other alignments (i.e. nearby in both genomes). This removes
+alignments between non-homologous insertions of homologous transposons
+(Frith2022_).
-Here, last-split_ gets parts of the many-to-one alignments. The ``-r``
-reverses the roles of the genomes, so it finds a unique best alignment
-for each part of human. It uses the input *per-base* mismap
-probabilities to get the output *per-alignment* mismap probabilities.
+Finally, we can make a dotplot_ (out.png)::
-Finally, we can make a dotplot_ (one-to-one.png)::
-
- last-dotplot one-to-one.maf
+ last-dotplot out.maf
Aligning more distantly related genomes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Human versus old-world monkey: the human-chimp recipe seems good.
+Human versus old-world monkey (~7% base-pair substitutions): the
+human-chimp recipe without ``-U400`` seems good. (But if you use
+``-uRY4``, ``-U400`` may reduce the memory usage.)
+
+Human versus new-world monkey (~12% base-pair substitutions):
+``RY128`` misses ~10% of related base-pairs. ``RY4`` is more
+sensitive. No need for ``-U400``.
-Human versus new-world monkey: ``RY128`` misses a bit more homology.
-It's still ok, but ``RY4`` is better.
+Human versus prosimian (~20% base-pair substitutions): ``RY4`` misses
+~15% of related base-pairs. For finer sensitivity, run ``lastdb``
+with default ``-u``::
-For more distant genomes, ``RY128`` misses a lot of homology. It's
-fine for whole-genome dotplots of e.g. human versus crocodile, but not
-human versus gar. For finer sensitivity, run ``lastdb`` with default
-``-u``::
+ lastdb -P8 -c humdb human.fa
+ last-train -P8 --revsym -C2 humdb lemur.fa > hl.train
+ lastal -P8 -m2 -D1e9 -C2 --split-f=MAF+ -p hl.train humdb lemur.fa | last-split -r | maf-linked - > out.maf
- lastdb -P8 -c mydb genome.fa
+``-m2`` halves the time and memory use, and slightly reduces sensitivity.
-To make it even more sensitive (but slow and memory-consuming), use
-``-uMAM4``. Yet more slow-and-sensitive is ``-uMAM8``.
+This recipe is okay for more distantly-related genomes too. For
+higher sensitivity, omit ``-m2``. To make it even more sensitive (but
+slow and memory-consuming), add ``lastdb`` option ``-uMAM4``. Yet
+more slow-and-sensitive is ``-uMAM8``.
-It may be useful to run maf-linked_ on the one-to-one alignments, to
-exclude non-homologous insertions of homologous transposons.
+For whole genome dotplots, high sensitivity is not necessary.
+``RY128`` is fine for e.g. human versus crocodile, though not for
+human versus fish.
Moar faster
-----------
@@ -403,4 +413,5 @@ core is indicated by "~" symbols, and it contains exact matches only.
.. _mask repeats: https://github.com/mcfrith/last-rna/blob/master/last-long-reads.md
.. _MAF: http://genome.ucsc.edu/FAQ/FAQformat.html#format5
.. _ASCII: https://en.wikipedia.org/wiki/ASCII
+.. _Frith2022: https://doi.org/10.1093/molbev/msac068
.. _our paper: https://doi.org/10.1186/s13059-015-0670-9
=====================================
doc/last-seeds.rst
=====================================
@@ -99,9 +99,6 @@ And these patterns::
11TT010T01TT0001T
11TT10T1T101TT
-It sets this lastdb default:
--U100
-
MAM8
----
@@ -126,9 +123,6 @@ And these patterns::
111100T011TTT00T0TT01T
1T1T10T1101101
-It sets this lastdb default:
--U100
-
MURPHY10
--------
@@ -196,9 +190,6 @@ And this pattern::
1T1001100101
-It sets this lastdb default:
--U100
-
RY4-9 (abbreviation: RY4)
-------------------------
=====================================
doc/lastdb.rst
=====================================
@@ -106,15 +106,6 @@ Advanced Options
100 for DNA and 50 for protein, which prevents non-homologous
alignments.
- For DNA, however, the default is 400 if you:
-
- * specify ``-c`` AND
- * don't specify AT-rich tantan AND
- * choose a seeding scheme other than MAM4, MAM8, or the default (YASS).
-
- This avoids hugely redundant alignments of human centromeric
- repeats.
-
-w STEP
Allow initial matches to start only at every STEP-th position in
each of the sequences given to lastdb (positions 0, STEP,
=====================================
doc/maf-linked.rst
=====================================
@@ -18,7 +18,8 @@ alignments between non-homologous insertions of homologous transposons
It considers two alignments to be "linked" if, in both sequences, they
are separated by at most D base-pairs and by at most T other
alignments. It keeps groups of at least C alignments that are linked
-directly or indirectly.
+directly or indirectly. It also keeps groups of < C alignments if
+they cover half (or more) of either sequence.
Options
-------
=====================================
src/Alignment.cc
=====================================
@@ -7,9 +7,6 @@
#include <assert.h>
-// make C++ tolerable:
-#define IT(type) std::vector<type>::iterator
-
using namespace cbrc;
static void addSeedCounts(BigPtr seq1, const uchar *seq2, size_t size,
@@ -36,20 +33,21 @@ void Alignment::makeXdrop( Aligners &aligners, bool isGreedy, bool isFullScore,
const uchar* qual1, const uchar* qual2,
const Alphabet& alph, AlignmentExtras& extras,
double gamma, int outputType ){
+ std::vector<char> &columnAmbiguityCodes = extras.columnAmbiguityCodes;
if (probMatrix) score = seed.score; // else keep the old score
if (outputType > 3 && !isFullScore) extras.fullScore = seed.score;
+ blocks.clear();
+ columnAmbiguityCodes.clear();
if( outputType == 7 ){
const int numOfTransitions = frameSize ? 9 : 5;
std::vector<double> &ec = extras.expectedCounts;
- ec.resize(scoreMatrixRowSize * scoreMatrixRowSize + numOfTransitions);
+ ec.assign(scoreMatrixRowSize * scoreMatrixRowSize + numOfTransitions, 0.0);
addSeedCounts(seq1 + seed.beg1(), seq2 + seed.beg2(), seed.size, &ec[0]);
}
// extend a gapped alignment in the left/reverse direction from the seed:
- blocks.clear();
- std::vector<char>& columnAmbiguityCodes = extras.columnAmbiguityCodes;
- extend( blocks, columnAmbiguityCodes, aligners, isGreedy, isFullScore,
+ extend( aligners, isGreedy, isFullScore,
seq1, seq2, seed.beg1(), seed.beg2(), false, globality,
scoreMatrix, smMax, smMin, probMatrix, scale, maxDrop, gap,
frameSize, pssm2, sm2qual, qual1, qual2, alph,
@@ -60,16 +58,29 @@ void Alignment::makeXdrop( Aligners &aligners, bool isGreedy, bool isFullScore,
// convert left-extension coordinates to sequence coordinates:
size_t seedBeg1 = seed.beg1();
size_t seedBeg2 = aaToDna(seed.beg2(), frameSize);
- for( IT(SegmentPair) i = blocks.begin(); i < blocks.end(); ++i ){
- i->start1 = seedBeg1 - i->start1 - i->size;
- // careful: i->start2 might be -1 (reverse frameshift)
- i->start2 = dnaToAa( seedBeg2 - i->start2, frameSize ) - i->size;
+ for (size_t i = 0; i < blocks.size(); ++i) {
+ size_t s = blocks[i].size;
+ blocks[i].start1 = seedBeg1 - blocks[i].start1 - s;
+ // careful: start2 might be -1 (reverse frameshift)
+ blocks[i].start2 = dnaToAa(seedBeg2 - blocks[i].start2, frameSize) - s;
}
+ bool isMergeSeedRev = !blocks.empty() && isNext(blocks.back(), seed);
+ if (isMergeSeedRev) {
+ blocks.back().size += seed.size;
+ } else {
+ blocks.push_back(seed);
+ }
+
+ if (outputType > 3) { // set the un-ambiguity of the core to a max value:
+ columnAmbiguityCodes.insert(columnAmbiguityCodes.end(), seed.size, 126);
+ }
+
+ size_t middle = blocks.size();
+ size_t codesMid = columnAmbiguityCodes.size();
+
// extend a gapped alignment in the right/forward direction from the seed:
- std::vector<SegmentPair> forwardBlocks;
- std::vector<char> forwardAmbiguities;
- extend( forwardBlocks, forwardAmbiguities, aligners, isGreedy, isFullScore,
+ extend( aligners, isGreedy, isFullScore,
seq1, seq2, seed.end1(), seed.end2(), true, globality,
scoreMatrix, smMax, smMin, probMatrix, scale, maxDrop, gap,
frameSize, pssm2, sm2qual, qual1, qual2, alph,
@@ -80,48 +91,30 @@ void Alignment::makeXdrop( Aligners &aligners, bool isGreedy, bool isFullScore,
// convert right-extension coordinates to sequence coordinates:
size_t seedEnd1 = seed.end1();
size_t seedEnd2 = aaToDna(seed.end2(), frameSize);
- for( IT(SegmentPair) i = forwardBlocks.begin(); i < forwardBlocks.end();
- ++i ){
- i->start1 = seedEnd1 + i->start1;
- // careful: i->start2 might be -1 (reverse frameshift)
- i->start2 = dnaToAa( seedEnd2 + i->start2, frameSize );
+ for (size_t i = middle; i < blocks.size(); ++i) {
+ blocks[i].start1 = seedEnd1 + blocks[i].start1;
+ // careful: start2 might be -1 (reverse frameshift)
+ blocks[i].start2 = dnaToAa(seedEnd2 + blocks[i].start2, frameSize);
}
- bool isMergeSeedReverse = !blocks.empty() && isNext( blocks.back(), seed );
- bool isMergeSeedForward =
- !forwardBlocks.empty() && isNext( seed, forwardBlocks.back() );
+ bool isMergeSeedFwd = blocks.size() > middle && isNext(seed, blocks.back());
- if( seed.size == 0 && !isMergeSeedReverse && !isMergeSeedForward ){
- // unusual, weird case: give up
- score = -INF;
- return;
+ if (isMergeSeedFwd) {
+ blocks[middle - 1].size += blocks.back().size;
+ blocks.pop_back();
}
- // splice together the two extensions and the seed (a bit messy):
-
- blocks.reserve( blocks.size() + forwardBlocks.size() +
- 1 - isMergeSeedReverse - isMergeSeedForward );
-
- if( isMergeSeedReverse ) blocks.back().size += seed.size;
- else blocks.push_back(seed);
+ reverse(blocks.begin() + middle, blocks.end());
+ reverse(columnAmbiguityCodes.begin() + codesMid, columnAmbiguityCodes.end());
- if( isMergeSeedForward ){
- blocks.back().size += forwardBlocks.back().size;
- forwardBlocks.pop_back();
- }
-
- size_t oldSize = blocks.size();
- blocks.insert( blocks.end(), forwardBlocks.rbegin(), forwardBlocks.rend() );
- for (size_t i = oldSize; i < blocks.size(); ++i)
+ for (size_t i = middle; i < blocks.size(); ++i)
blocks[i - 1].score = blocks[i].score;
- if( outputType > 3 ){ // set the un-ambiguity of the core to a max value:
- columnAmbiguityCodes.insert( columnAmbiguityCodes.end(), seed.size, 126 );
+ if (seed.size == 0 && !isMergeSeedRev && !isMergeSeedFwd) {
+ // unusual, weird case: give up
+ score = -INF;
+ blocks[0].score = -1;
}
-
- columnAmbiguityCodes.insert( columnAmbiguityCodes.end(),
- forwardAmbiguities.rbegin(),
- forwardAmbiguities.rend() );
}
// cost of the gap between x and y
@@ -217,13 +210,13 @@ bool Alignment::hasGoodSegment(BigSeq seq1, const uchar *seq2,
}
static void getColumnCodes(const Centroid& centroid, std::vector<char>& codes,
- const std::vector<SegmentPair>& chunks,
+ const SegmentPair *chunks, size_t chunkCount,
bool isForward) {
- for (size_t i = 0; i < chunks.size(); ++i) {
+ for (size_t i = 0; i < chunkCount; ++i) {
const SegmentPair& x = chunks[i];
centroid.getMatchAmbiguities(codes, x.end1(), x.end2(), x.size);
size_t j = i + 1;
- bool isNext = (j < chunks.size());
+ bool isNext = (j < chunkCount);
size_t end1 = isNext ? chunks[j].end1() : 0;
size_t end2 = isNext ? chunks[j].end2() : 0;
// ASSUMPTION: if there is an insertion adjacent to a deletion,
@@ -240,15 +233,15 @@ static void getColumnCodes(const Centroid& centroid, std::vector<char>& codes,
static void getColumnCodes(const FrameshiftXdropAligner &fxa,
std::vector<char> &codes,
- const std::vector<SegmentPair> &chunks) {
- for (size_t i = 0; i < chunks.size(); ++i) {
+ const SegmentPair *chunks, size_t chunkCount) {
+ for (size_t i = 0; i < chunkCount; ++i) {
const SegmentPair &x = chunks[i];
for (size_t k = x.size; k-- > 0;) {
double p = fxa.matchProb(x.beg1() + k, x.beg2() + k * 3);
codes.push_back(asciiProbability(p));
}
size_t j = i + 1;
- bool isNext = (j < chunks.size());
+ bool isNext = (j < chunkCount);
size_t end1 = isNext ? chunks[j].end1() : 0;
size_t end2 = isNext ? chunks[j].beg2() + chunks[j].size * 3 : 0;
size_t n1 = x.beg1() - end1;
@@ -257,9 +250,7 @@ static void getColumnCodes(const FrameshiftXdropAligner &fxa,
}
}
-void Alignment::extend( std::vector< SegmentPair >& chunks,
- std::vector< char >& columnCodes,
- Aligners &aligners, bool isGreedy, bool isFullScore,
+void Alignment::extend( Aligners &aligners, bool isGreedy, bool isFullScore,
BigSeq seq1, const uchar* seq2,
size_t start1, size_t start2,
bool isForward, int globality,
@@ -276,6 +267,8 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
Centroid ¢roid = aligners.centroid;
GappedXdropAligner& aligner = centroid.aligner();
GreedyXdropAligner &greedyAligner = aligners.greedyAligner;
+ std::vector<char> &columnCodes = extras.columnAmbiguityCodes;
+ size_t blocksBeg = blocks.size();
double *subsCounts[scoreMatrixRowSize];
double *tranCounts;
@@ -306,7 +299,7 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
const uchar *f2 = seq2 + dnaToAa(frame2, frameSize);
aligner.alignFrame(s1, s2, f1, f2, isForward, sm, gap, maxDrop);
while (aligner.getNextChunkFrame(end1, end2, size, gapCost, gap))
- chunks.push_back(SegmentPair(end1 - size, end2 - size * 3, size,
+ blocks.push_back(SegmentPair(end1 - size, end2 - size * 3, size,
gapCost));
if (!probMat) return;
FrameshiftXdropAligner &fxa = aligners.frameshiftAligner;
@@ -316,7 +309,8 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
score += s / scale;
if (outputType < 4) return;
fxa.backward(isForward, probMat, gap);
- getColumnCodes(fxa, columnCodes, chunks);
+ getColumnCodes(fxa, columnCodes, blocks.data() + blocksBeg,
+ blocks.size() - blocksBeg);
if (outputType == 7) fxa.count(isForward, gap, subsCounts, tranCounts);
} else {
assert(!isFullScore);
@@ -330,7 +324,7 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
while (aligner.getNextChunk3(end1, end2, size,
del.openCost, del.growCost, gap.pairCost,
gap.frameshiftCost))
- chunks.push_back(SegmentPair(end1 - size, end2 - size * 3, size));
+ blocks.push_back(SegmentPair(end1 - size, end2 - size * 3, size));
}
return;
@@ -384,21 +378,21 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
size_t end1, end2, size;
if( isGreedy ){
while( greedyAligner.getNextChunk( end1, end2, size ) )
- chunks.push_back( SegmentPair( end1 - size, end2 - size, size ) );
+ blocks.push_back( SegmentPair( end1 - size, end2 - size, size ) );
}
#if defined __SSE4_1__ || defined __ARM_NEON
else if (isSimdMatrix && !pssm2 && !sm2qual) {
while (aligner.getNextChunkDna(end1, end2, size,
del.openCost, del.growCost,
ins.openCost, ins.growCost))
- chunks.push_back(SegmentPair(end1 - size, end2 - size, size));
+ blocks.push_back(SegmentPair(end1 - size, end2 - size, size));
}
#endif
else {
while( aligner.getNextChunk( end1, end2, size,
del.openCost, del.growCost,
ins.openCost, ins.growCost, gap.pairCost ) )
- chunks.push_back( SegmentPair( end1 - size, end2 - size, size ) );
+ blocks.push_back( SegmentPair( end1 - size, end2 - size, size ) );
}
}
@@ -421,10 +415,11 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
centroid.dp(outputType, gamma);
size_t beg1, beg2, length;
while (centroid.traceback(beg1, beg2, length, outputType, gamma)) {
- chunks.push_back(SegmentPair(beg1, beg2, length));
+ blocks.push_back(SegmentPair(beg1, beg2, length));
}
}
- getColumnCodes(centroid, columnCodes, chunks, isForward);
+ getColumnCodes(centroid, columnCodes, blocks.data() + blocksBeg,
+ blocks.size() - blocksBeg, isForward);
if (outputType == 7) {
centroid.addExpectedCounts(start2, isForward, probMat, gap, alph.size,
subsCounts, tranCounts);
=====================================
src/Alignment.hh
=====================================
@@ -40,11 +40,12 @@ struct AlignmentText {
AlignmentText() {}
AlignmentText(size_t queryNumIn, size_t queryBegIn, size_t queryEndIn,
- char strandIn, double scoreIn,
+ size_t queryLengthIn, char strandIn, double scoreIn,
size_t alnSizeIn, size_t matchesIn, char *textIn) :
strandNum(queryNumIn * 2 + (strandIn == '-')),
- queryBeg(queryBegIn), queryEnd(queryEndIn), score(scoreIn),
- alnSize(alnSizeIn), matches(matchesIn), text(textIn) {}
+ queryBeg(strandIn == '-' ? queryLengthIn - queryEndIn : queryBegIn),
+ queryEnd(strandIn == '-' ? queryLengthIn - queryBegIn : queryEndIn),
+ score(scoreIn), alnSize(alnSizeIn), matches(matchesIn), text(textIn) {}
size_t queryNum() const { return strandNum / 2; }
@@ -134,9 +135,7 @@ struct Alignment{
size_t end1() const{ return blocks.back().end1(); }
size_t end2() const{ return blocks.back().end2(); }
- void extend( std::vector< SegmentPair >& chunks,
- std::vector< char >& columnCodes,
- Aligners &aligners, bool isGreedy, bool isFullScore,
+ void extend( Aligners &aligners, bool isGreedy, bool isFullScore,
BigSeq seq1, const uchar* seq2, size_t start1, size_t start2,
bool isForward, int globality,
const ScoreMatrixRow* sm, int smMax, int smMin,
=====================================
src/AlignmentPot.hh
=====================================
@@ -24,16 +24,16 @@ struct AlignmentPot{
// highest-scoring alignment
void eraseSuboptimal();
- // sort the alignments in descending order of score
- void sort() { std::sort( items.begin(), items.end(), moreScore ); }
+ // sort the alignments in ascending order of score
+ void sort() { std::sort(items.begin(), items.end(), lessScore); }
// data:
std::vector<Alignment> items;
- static bool moreScore( const Alignment& x, const Alignment& y ){
+ static bool lessScore(const Alignment &x, const Alignment &y) {
// Try to break ties, so that alignments come in a consistent
// order. This makes it easier to compare different results.
- return x.score != y.score ? x.score > y.score : lessBeg( x, y );
+ return x.score != y.score ? x.score < y.score : !lessBeg(x, y);
}
static bool lessBeg( const Alignment& x, const Alignment& y ){
=====================================
src/AlignmentWrite.cc
=====================================
@@ -37,6 +37,11 @@ const char aminoTriplets[] =
"***" "***"
"Xaa" "xaa";
+static char *copyString(char *out, const char *s, size_t sizeOf) {
+ memcpy(out, s, sizeOf - 1);
+ return out + sizeOf - 1;
+}
+
// This writes a "size_t" integer into a char buffer ending at "end".
// It writes backwards from the end, because that's easier & faster.
static char *writeSize(char *end, size_t x) {
@@ -194,13 +199,19 @@ static char* writeTags( const LastEvaluer& evaluer, double queryLength,
double epa = evaluer.evaluePerArea( score );
double area = evaluer.area( score, queryLength );
*out++ = separator;
- out += std::sprintf( out, "EG2=%.2g", 1e18 * epa );
+ static const char aString[] = "EG2=";
+ out = copyString(out, aString, sizeof aString);
+ out += std::sprintf(out, "%.2g", 1e18 * epa);
*out++ = separator;
- out += std::sprintf( out, "E=%.2g", area * epa );
+ static const char eString[] = "E=";
+ out = copyString(out, eString, sizeof eString);
+ out += std::sprintf(out, "%.2g", area * epa);
}
if( fullScore > 0 ){
*out++ = separator;
- out += std::sprintf( out, "fullScore=%.3g", fullScore );
+ static const char fString[] = "fullScore=";
+ out = copyString(out, fString, sizeof fString);
+ out += std::sprintf(out, "%.3g", fullScore);
}
*out++ = '\n';
return out;
@@ -257,7 +268,8 @@ AlignmentText Alignment::writeTab(const MultiSequence& seq1,
w.copy(tags, tagLen);
w << '\0';
- return AlignmentText(seqNum2, alnBeg2, alnEnd2, strand2, score, 0, 0, text);
+ return AlignmentText(seqNum2, alnBeg2, alnEnd2, size2, strand2, score,
+ 0, 0, text);
}
static void putLeft(Writer &w, const std::string &t, size_t width) {
@@ -275,7 +287,9 @@ static void putRight(Writer &w, const IntText &t, size_t width) {
// Write an "a" line
static char *writeMafLineA(char *out, double score, const LastEvaluer& evaluer,
double queryLength, double fullScore) {
- out += std::sprintf(out, "a score=");
+ if (fullScore < -1) return out; // no "a" line at all
+ static const char s[] = "a score=";
+ out = copyString(out, s, sizeof s);
out += std::sprintf(out, scoreFormat(fullScore >= 0), score);
return writeTags(evaluer, queryLength, score, fullScore, ' ', out);
}
@@ -329,12 +343,14 @@ static void writeMafLineC(std::vector<char> &cLine,
if (x != i) c += counts[x * scoreMatrixRowSize + j];
if (y != j) c += counts[i * scoreMatrixRowSize + y];
if (x != i && y != j) c += counts[x * scoreMatrixRowSize + y];
- e += std::sprintf(e, " %.3g", c);
+ *e++ = ' ';
+ e += std::sprintf(e, "%.3g", c);
}
}
for (size_t i = 0; i < numOfTransitions; ++i) {
- e += std::sprintf(e, " %.3g", counts[numOfSubstitutions + i]);
+ *e++ = ' ';
+ e += std::sprintf(e, "%.3g", counts[numOfSubstitutions + i]);
}
*e++ = '\n';
@@ -443,7 +459,8 @@ AlignmentText Alignment::writeMaf(const MultiSequence& seq1,
if (!cLine.empty()) w.copy(&cLine[0], cLine.size());
w << '\n' << '\0'; // blank line afterwards
- return AlignmentText(seqNum2, alnBeg2, alnEnd2, strand2, score, 0, 0, text);
+ return AlignmentText(seqNum2, alnBeg2, alnEnd2, size2, strand2, score,
+ 0, 0, text);
}
AlignmentText Alignment::writeBlastTab(const MultiSequence& seq1,
@@ -539,7 +556,7 @@ AlignmentText Alignment::writeBlastTab(const MultiSequence& seq1,
if (isExtraColumns) w << t << s2 << t << s1 << t << sc;
w << '\n' << '\0';
- return AlignmentText(seqNum2, alnBeg2, alnEnd2, strand2, score,
+ return AlignmentText(seqNum2, alnBeg2, alnEnd2, size2, strand2, score,
alnSize, matches, text);
}
@@ -572,6 +589,7 @@ size_t Alignment::numColumns(size_t frameSize, bool isCodon) const {
const SegmentPair& x = blocks[i - 1];
// length of unaligned chunk of top sequence (gaps in bottom sequence):
+ assert(x.end1() <= y.beg1());
num += (y.beg1() - x.end1()) * aaLen;
// length of unaligned chunk of bottom sequence (gaps in top sequence):
=====================================
src/LastdbArguments.hh
=====================================
@@ -34,10 +34,7 @@ struct LastdbArguments{
tantanSetting = maxRepeatUnit ? (isAddStops ? 3 : 1) : 0;
}
if (maxRepeatUnit < 0) {
- maxRepeatUnit = (tantanSetting == 2) ? 100
- : isProteinish ? 50
- : isCaseSensitive ? 400
- : 100;
+ maxRepeatUnit = (isProteinish && tantanSetting != 2) ? 50 : 100;
}
}
=====================================
src/lastal.cc
=====================================
@@ -53,6 +53,7 @@ struct LastAligner { // data that changes between queries
LastSplitter splitter;
std::vector<int> qualityPssm;
std::vector<AlignmentText> textAlns;
+ std::vector<char *> alignmentTextLines;
std::vector< std::vector<countT> > matchCounts; // used if outputType == 0
countT numOfNormalLetters;
countT numOfSequences;
@@ -587,8 +588,8 @@ struct Dispatcher{
};
static bool isCollatedAlignments() {
- return args.outputFormat == 'b' || args.outputFormat == 'B' ||
- args.cullingLimitForFinalAlignments + 1 || numOfVolumes > 1;
+ return args.outputFormat == 'b' || args.outputFormat == 'B' || args.isSplit
+ || args.cullingLimitForFinalAlignments + 1 || numOfVolumes > 1;
}
static void writeAlignment(LastAligner &aligner, const MultiSequence &qrySeqs,
@@ -599,7 +600,7 @@ static void writeAlignment(LastAligner &aligner, const MultiSequence &qrySeqs,
alph, queryAlph,
translationType, geneticCode.getCodonToAmino(),
evaluer, args.outputFormat, extras);
- if (isCollatedAlignments() || aligners.size() > 1 || args.isSplit) {
+ if (isCollatedAlignments() || aligners.size() > 1) {
aligner.textAlns.push_back(a);
} else {
std::cout << a.text;
@@ -785,16 +786,14 @@ void alignGapped(LastAligner &aligner, AlignmentPot &gappedAlns,
LOG2( "redone gapless alignments=" << gaplessAlns.size() );
- for( size_t i = 0; i < gaplessAlns.size(); ++i ){
- SegmentPair& sp = gaplessAlns.get(i);
-
- if( SegmentPairPot::isMarked(sp) ) continue;
+ Alignment aln;
+ AlignmentExtras extras; // not used
- Alignment aln;
- AlignmentExtras extras; // not used
+ for (size_t i = 0; i < gaplessAlns.size(); ++i) {
+ SegmentPair &sp = gaplessAlns.get(i);
+ if (SegmentPairPot::isMarked(sp)) continue;
aln.seed = sp;
-
- shrinkToLongestIdenticalRun( aln.seed, dis );
+ shrinkToLongestIdenticalRun(aln.seed, dis);
// do gapped extension from each end of the seed:
aln.makeXdrop(aligner.engines, args.isGreedy, args.scoreType,
@@ -804,7 +803,7 @@ void alignGapped(LastAligner &aligner, AlignmentPot &gappedAlns,
qryData.frameSize, dis.p, dis.t, dis.i, dis.j, alph, extras);
++gappedExtensionCount;
- if( aln.score < args.minScoreGapped ) continue;
+ if (aln.score < args.minScoreGapped) continue;
if (args.scoreType == 0 &&
!aln.isOptimal(dis.a, dis.b, args.globality, dis.m, dis.d, gapCosts,
@@ -814,18 +813,18 @@ void alignGapped(LastAligner &aligner, AlignmentPot &gappedAlns,
continue;
}
- gaplessAlns.markAllOverlaps( aln.blocks );
- gaplessAlns.markTandemRepeats( aln.seed, args.maxRepeatDistance );
+ gaplessAlns.markAllOverlaps(aln.blocks);
+ gaplessAlns.markTandemRepeats(aln.seed, args.maxRepeatDistance);
if (phase == Phase::gapped) gappedAlns.add(aln);
else SegmentPairPot::markAsGood(sp);
++gappedAlignmentCount;
- if( gappedAlignmentCount >= args.maxAlignmentsPerQueryStrand ) break;
+ if (gappedAlignmentCount >= args.maxAlignmentsPerQueryStrand) break;
}
- LOG2( "gapped extensions=" << gappedExtensionCount );
- LOG2( "gapped alignments=" << gappedAlignmentCount );
+ LOG2("gapped extensions=" << gappedExtensionCount);
+ LOG2("gapped alignments=" << gappedAlignmentCount);
}
// Redo gapped extensions, but keep the old alignment scores
@@ -840,21 +839,24 @@ static void alignPostgapped(LastAligner &aligner, AlignmentPot &gappedAlns,
0, 0, gapCosts, dis.d,
frameSize, dis.p, dis.t, dis.i, dis.j, alph, extras);
}
+ erase_if(gappedAlns.items, AlignmentPot::isMarked);
}
// Print the gapped alignments, after optionally calculating match
// probabilities and re-aligning using the gamma-centroid algorithm
void alignFinish(LastAligner &aligner, const MultiSequence &qrySeqs,
- const SeqData &qryData, const AlignmentPot &gappedAlns,
+ const SeqData &qryData, std::vector<Alignment> &alignments,
const SubstitutionMatrices &matrices, const Dispatcher &dis) {
- for( size_t i = 0; i < gappedAlns.size(); ++i ){
- const Alignment& aln = gappedAlns.items[i];
- AlignmentExtras extras;
- if (args.scoreType != 0) extras.fullScore = -1; // score is fullScore
+ Alignment probAln;
+ AlignmentExtras extras;
+ if (args.scoreType != 0) extras.fullScore = -1; // score is fullScore
+ if (args.isSplit && !args.splitOpts.no_split) extras.fullScore = -2;
+
+ while (!alignments.empty()) {
+ const Alignment &aln = alignments.back();
if( args.outputType < 4 ){
writeAlignment(aligner, qrySeqs, qryData, aln, extras);
} else { // calculate match probabilities:
- Alignment probAln;
probAln.seed = aln.seed;
probAln.makeXdrop(aligner.engines, args.isGreedy, args.scoreType,
dis.a, dis.b, args.globality,
@@ -867,6 +869,7 @@ void alignFinish(LastAligner &aligner, const MultiSequence &qrySeqs,
probAln.score = aln.score;
writeAlignment(aligner, qrySeqs, qryData, probAln, extras);
}
+ alignments.pop_back();
}
}
@@ -1001,10 +1004,12 @@ static void printAlignments(const std::vector<AlignmentText> &textAlns) {
}
}
+static void freeAlignmentTexts(AlignmentText *textAlns, size_t textAlnCount) {
+ for (size_t i = 0; i < textAlnCount; ++i) delete[] textAlns[i].text;
+}
+
static void clearAlignments(std::vector<AlignmentText> &textAlns) {
- for (size_t i = 0; i < textAlns.size(); ++i) {
- delete[] textAlns[i].text;
- }
+ freeAlignmentTexts(textAlns.data(), textAlns.size());
textAlns.clear();
}
@@ -1048,33 +1053,36 @@ void scan(LastAligner &aligner, const MultiSequence &qrySeqs,
makeQualityPssm(qryData, matrices, maskMode > 0);
Dispatcher dis0(Phase::gapless, qryData, matrices);
- SegmentPairPot gaplessAlns;
- alignGapless(aligner, gaplessAlns, qrySeqs, qryData, dis0);
- if( args.outputType == 1 ) return; // we just want gapless alignments
- if( gaplessAlns.size() == 0 ) return;
-
- if (maskMode == 1 || (maskMode == 2 && args.scoreType == 0))
- unmaskLowercase(qryData, matrices);
-
size_t qryLen = qryData.padLen;
AlignmentPot gappedAlns;
Centroid ¢roid = aligner.engines.centroid;
- if (args.scoreType != 0 && dis0.p) {
- const OneQualityExpMatrix &m =
- (maskMode < 2) ? matrices.oneQualExp : matrices.oneQualExpMasked;
- centroid.setPssm(dis0.p, qryLen, args.temperature, m, dis0.b, dis0.j);
- }
+ {
+ SegmentPairPot gaplessAlns;
+ alignGapless(aligner, gaplessAlns, qrySeqs, qryData, dis0);
+ if (args.outputType == 1) return; // we just want gapless alignments
+ if (gaplessAlns.size() == 0) return;
+
+ if (maskMode == 1 || (maskMode == 2 && args.scoreType == 0))
+ unmaskLowercase(qryData, matrices);
+
+ if (args.scoreType != 0 && dis0.p) {
+ const OneQualityExpMatrix &m =
+ (maskMode < 2) ? matrices.oneQualExp : matrices.oneQualExpMasked;
+ centroid.setPssm(dis0.p, qryLen, args.temperature, m, dis0.b, dis0.j);
+ }
+
+ if (args.maxDropFinal != args.maxDropGapped) {
+ alignGapped(aligner, gappedAlns, gaplessAlns, qryData, matrices,
+ Phase::pregapped);
+ erase_if(gaplessAlns.items, SegmentPairPot::isNotMarkedAsGood);
+ }
- if (args.maxDropFinal != args.maxDropGapped) {
alignGapped(aligner, gappedAlns, gaplessAlns, qryData, matrices,
- Phase::pregapped);
- erase_if( gaplessAlns.items, SegmentPairPot::isNotMarkedAsGood );
+ Phase::gapped);
}
- alignGapped(aligner, gappedAlns, gaplessAlns, qryData, matrices,
- Phase::gapped);
- if( gappedAlns.size() == 0 ) return;
+ if (gappedAlns.size() == 0) return;
Dispatcher dis3(Phase::postgapped, qryData, matrices);
@@ -1112,7 +1120,7 @@ void scan(LastAligner &aligner, const MultiSequence &qrySeqs,
}
if (!isCollatedAlignments()) gappedAlns.sort(); // sort by score
- alignFinish(aligner, qrySeqs, qryData, gappedAlns, matrices, dis3);
+ alignFinish(aligner, qrySeqs, qryData, gappedAlns.items, matrices, dis3);
}
static void tantanMaskOneQuery(const SeqData &qryData) {
@@ -1181,19 +1189,21 @@ void translateAndScan(LastAligner &aligner, MultiSequence &qrySeqs,
}
}
-static void splitAlignments(LastSplitter &splitter,
- std::vector<AlignmentText> &textAlns,
- bool isQryQual) {
- if (!args.isSplit) return;
+static bool lessForSplitting(const AlignmentText &x, const AlignmentText &y) {
+ return x.queryBeg < y.queryBeg; // assumes we have just 1 query sequence
+}
+static void setupSplitAlignments(LastAligner &aligner, AlignmentText *textAlns,
+ size_t textAlnCount, bool isQryQual) {
unsigned linesPerMaf =
3 + isQryQual + (args.outputType > 3) + (args.outputType > 6);
- splitter.reserve(textAlns.size());
- std::vector<char *> linePtrs((linesPerMaf + 1) * textAlns.size());
+ LastSplitter &splitter = aligner.splitter;
+ splitter.reserve(textAlnCount);
+ aligner.alignmentTextLines.resize((linesPerMaf + 1) * textAlnCount);
- char **beg = linePtrs.empty() ? 0 : &linePtrs[0];
- for (size_t i = 0; i < textAlns.size(); ++i) {
+ char **beg = aligner.alignmentTextLines.data();
+ for (size_t i = 0; i < textAlnCount; ++i) {
char **end = beg;
char *text = textAlns[i].text;
for (unsigned j = 0; j < linesPerMaf; ++j) {
@@ -1205,11 +1215,38 @@ static void splitAlignments(LastSplitter &splitter,
splitter.addMaf(beg, end, false);
beg = end + 1;
}
+}
- splitter.split(args.splitOpts, splitParams, false);
+static void splitAlignments(LastAligner &aligner, bool isQryQual) {
+ std::vector<AlignmentText> &textAlns = aligner.textAlns;
+ setupSplitAlignments(aligner, textAlns.data(), textAlns.size(), isQryQual);
+ aligner.splitter.split(args.splitOpts, splitParams, false);
clearAlignments(textAlns);
}
+static void splitOneQuery(LastAligner &aligner, bool isQryQual) {
+ AlignmentText *textAlns = aligner.textAlns.data();
+ size_t textAlnCount = aligner.textAlns.size();
+
+ std::sort(textAlns, textAlns + textAlnCount, lessForSplitting);
+
+ size_t i = 0;
+ size_t j = 0;
+ size_t maxEnd = args.splitOpts.isSplicedAlignment ? -1 : 0;
+ while (j < textAlnCount) {
+ maxEnd = std::max(maxEnd, textAlns[j].queryEnd);
+ ++j;
+ if (j == textAlnCount || textAlns[j].queryBeg >= maxEnd) {
+ setupSplitAlignments(aligner, textAlns + i, j - i, isQryQual);
+ aligner.splitter.splitOneQuery(args.splitOpts, splitParams);
+ freeAlignmentTexts(textAlns + i, j - i);
+ i = j;
+ }
+ }
+
+ aligner.textAlns.clear();
+}
+
static void alignOneQuery(LastAligner &aligner, MultiSequence &qrySeqs,
size_t qryNum, size_t chunkQryNum,
size_t finalCullingLimit, bool isFirstVolume) {
@@ -1264,10 +1301,10 @@ static void alignOneQuery(LastAligner &aligner, MultiSequence &qrySeqs,
args.isQueryStrandMatrix ? revMatrices : fwdMatrices);
if (numOfVolumes < 2) {
+ if (args.isSplit) splitOneQuery(aligner, qrySeqs.qualsPerLetter());
if (isCollatedAlignments()) {
sort(textAlns.begin() + oldNumOfAlns, textAlns.end());
}
- splitAlignments(aligner.splitter, textAlns, qrySeqs.qualsPerLetter());
}
}
@@ -1290,9 +1327,8 @@ static size_t alignSomeQueries(size_t chunkNum, unsigned volume) {
if (isMultiVolume && volume + 1 == numOfVolumes) {
std::vector<AlignmentText> &textAlns = aligner.textAlns;
cullFinalAlignments(textAlns, 0, args.cullingLimitForFinalAlignments);
+ if (args.isSplit) splitAlignments(aligner, qrySeqsGlobal.qualsPerLetter());
sort(textAlns.begin(), textAlns.end());
- splitAlignments(aligner.splitter, textAlns,
- qrySeqsGlobal.qualsPerLetter());
}
return beg;
}
=====================================
src/makefile
=====================================
@@ -95,7 +95,7 @@ ScoreMatrixData.hh: ../data/*.mat
../build/mat-inc.sh ../data/*.mat > $@
VERSION1 = git describe --dirty
-VERSION2 = echo ' (HEAD -> main, tag: 1609) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
+VERSION2 = echo ' (HEAD -> main, tag: 1642) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
VERSION = \"`test -e ../.git && $(VERSION1) || $(VERSION2)`\"
=====================================
src/split/cbrc_split_aligner.cc
=====================================
@@ -25,35 +25,49 @@ namespace cbrc {
static int myMax(const int *b, int s) { return *std::max_element(b, b + s); }
-// Orders candidate alignments by increasing DP start coordinate.
-// Breaks ties by decreasing DP end coordinate.
-struct DpBegLess {
- DpBegLess(const unsigned *b, const unsigned *e) : dpBegs(b), dpEnds(e) {}
+struct BegLess { // Order by increasing begin value, then decreasing end value
+ BegLess(const size_t *b, const size_t *e) : begs(b), ends(e) {}
bool operator()(unsigned a, unsigned b) const {
- return
- dpBegs[a] != dpBegs[b] ? dpBegs[a] < dpBegs[b] : dpEnds[a] > dpEnds[b];
+ return begs[a] != begs[b] ? begs[a] < begs[b] : ends[a] > ends[b];
}
- const unsigned *dpBegs;
- const unsigned *dpEnds;
+ const size_t *begs;
+ const size_t *ends;
};
-// Orders candidate alignments by decreasing DP end coordinate.
-// Breaks ties by increasing DP start coordinate.
-struct DpEndLess {
- DpEndLess(const unsigned *b, const unsigned *e) : dpBegs(b), dpEnds(e) {}
+struct BegLessStable {
+ BegLessStable(const size_t *b, const size_t *e) : begs(b), ends(e) {}
bool operator()(unsigned a, unsigned b) const {
- return
- dpEnds[a] != dpEnds[b] ? dpEnds[a] > dpEnds[b] : dpBegs[a] < dpBegs[b];
+ return begs[a] != begs[b] ? begs[a] < begs[b]
+ : ends[a] != ends[b] ? ends[a] > ends[b] : a < b;
+ }
+ const size_t *begs;
+ const size_t *ends;
+};
+
+struct EndLess { // Order by decreasing end value, then increasing begin value
+ EndLess(const size_t *b, const size_t *e) : begs(b), ends(e) {}
+ bool operator()(unsigned a, unsigned b) const {
+ return ends[a] != ends[b] ? ends[a] > ends[b] : begs[a] < begs[b];
}
- const unsigned *dpBegs;
- const unsigned *dpEnds;
+ const size_t *begs;
+ const size_t *ends;
+};
+
+struct EndLessStable {
+ EndLessStable(const size_t *b, const size_t *e) : begs(b), ends(e) {}
+ bool operator()(unsigned a, unsigned b) const {
+ return ends[a] != ends[b] ? ends[a] > ends[b]
+ : begs[a] != begs[b] ? begs[a] < begs[b] : a < b;
+ }
+ const size_t *begs;
+ const size_t *ends;
};
// Orders candidate alignments by increasing DP start coordinate.
// Breaks ties by chromosome & strand, then by increasing genomic
// start coordinate.
struct QbegLess {
- QbegLess(const unsigned *b, const unsigned *r, const unsigned *rb)
+ QbegLess(const size_t *b, const unsigned *r, const unsigned *rb)
: dpBegs(b), rnameAndStrandIds(r), rBegs(rb) {}
bool operator()(unsigned a, unsigned b) const {
@@ -64,7 +78,7 @@ struct QbegLess {
: rBegs[a] < rBegs[b];
}
- const unsigned *dpBegs;
+ const size_t *dpBegs;
const unsigned *rnameAndStrandIds;
const unsigned *rBegs;
};
@@ -73,7 +87,7 @@ struct QbegLess {
// Breaks ties by chromosome & strand, then by decreasing genomic end
// coordinate.
struct QendLess {
- QendLess(const unsigned *e, const unsigned *r, const unsigned *re)
+ QendLess(const size_t *e, const unsigned *r, const unsigned *re)
: dpEnds(e), rnameAndStrandIds(r), rEnds(re) {}
bool operator()(unsigned a, unsigned b) const {
@@ -84,7 +98,7 @@ struct QendLess {
: rEnds[a] > rEnds[b];
}
- const unsigned *dpEnds;
+ const size_t *dpEnds;
const unsigned *rnameAndStrandIds;
const unsigned *rEnds;
};
@@ -193,7 +207,7 @@ static unsigned spliceEndSignalRev(mcf::BigSeq seq, size_t pos,
return 15 - (n1 * 4 + n2); // reverse-complement
}
-unsigned SplitAligner::findScore(bool isGenome, unsigned j, long score) const {
+unsigned SplitAligner::findScore(bool isGenome, size_t j, long score) const {
for (unsigned i = 0; i < numAlns; ++i) {
if (dpBeg(i) >= j || dpEnd(i) < j) continue;
size_t ij = matrixRowOrigins[i] + j;
@@ -203,7 +217,7 @@ unsigned SplitAligner::findScore(bool isGenome, unsigned j, long score) const {
}
unsigned SplitAligner::findSpliceScore(const SplitAlignerParams ¶ms,
- unsigned i, unsigned j,
+ unsigned i, size_t j,
long score) const {
assert(params.splicePrior > 0.0);
const bool isGenome = params.isGenome();
@@ -225,7 +239,7 @@ unsigned SplitAligner::findSpliceScore(const SplitAlignerParams ¶ms,
}
long SplitAligner::scoreFromSplice(const SplitAlignerParams ¶ms,
- unsigned i, unsigned j,
+ unsigned i, size_t j,
unsigned oldNumInplay,
unsigned& oldInplayPos) const {
const unsigned maxSpliceDist = params.maxSpliceDist;
@@ -261,7 +275,7 @@ long SplitAligner::scoreFromSplice(const SplitAlignerParams ¶ms,
void SplitAligner::updateInplayAlnIndicesF(unsigned& sortedAlnPos,
unsigned& oldNumInplay,
unsigned& newNumInplay,
- unsigned j) { // query coordinate
+ size_t j) { // query coordinate
oldInplayAlnIndices.swap(newInplayAlnIndices);
oldNumInplay = newNumInplay;
@@ -295,7 +309,7 @@ void SplitAligner::updateInplayAlnIndicesF(unsigned& sortedAlnPos,
void SplitAligner::updateInplayAlnIndicesB(unsigned& sortedAlnPos,
unsigned& oldNumInplay,
unsigned& newNumInplay,
- unsigned j) { // query coordinate
+ size_t j) { // query coordinate
oldInplayAlnIndices.swap(newInplayAlnIndices);
oldNumInplay = newNumInplay;
@@ -330,15 +344,12 @@ long SplitAligner::viterbiSplit(const SplitAlignerParams ¶ms) {
const int restartScore = params.restartScore;
unsigned *inplayAlnBeg = &newInplayAlnIndices[0];
unsigned *inplayAlnEnd = inplayAlnBeg;
- unsigned *sortedAlnPtr = &sortedAlnIndices[0];
- unsigned *sortedAlnEnd = sortedAlnPtr + numAlns;
-
- std::stable_sort(sortedAlnPtr, sortedAlnEnd,
- DpBegLess(&dpBegs[0], &dpEnds[0]));
+ const unsigned *sortedAlnPtr = &sortedAlnIndices[0];
+ const unsigned *sortedAlnEnd = sortedAlnPtr + numAlns;
long maxScore = 0;
- for (unsigned j = minBeg; j < maxEnd; j++) {
+ for (size_t j = minBeg; j < maxEnd; j++) {
while (inplayAlnEnd > inplayAlnBeg && dpEnd(inplayAlnEnd[-1]) == j) {
--inplayAlnEnd; // it is no longer "in play"
}
@@ -347,7 +358,7 @@ long SplitAligner::viterbiSplit(const SplitAlignerParams ¶ms) {
++sortedAlnPtr;
}
mergeInto(inplayAlnBeg, inplayAlnEnd, sortedAlnBeg, sortedAlnPtr,
- DpEndLess(&dpBegs[0], &dpEnds[0]));
+ EndLess(&dpBegs[0], &dpEnds[0]));
inplayAlnEnd += sortedAlnPtr - sortedAlnBeg;
cell(Vvec, j) = maxScore;
@@ -373,13 +384,10 @@ long SplitAligner::viterbiSplice(const SplitAlignerParams ¶ms) {
unsigned oldNumInplay = 0;
unsigned newNumInplay = 0;
- stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
- QbegLess(&dpBegs[0], &rnameAndStrandIds[0], &rBegs[0]));
-
long maxScore = 0;
long scoreFromJump = restartScore;
- for (unsigned j = minBeg; j < maxEnd; j++) {
+ for (size_t j = minBeg; j < maxEnd; j++) {
updateInplayAlnIndicesF(sortedAlnPos, oldNumInplay, newNumInplay, j);
unsigned oldInplayPos = 0;
cell(Vvec, j) = maxScore;
@@ -424,11 +432,10 @@ unsigned SplitAligner::findEndScore(long score) const {
void SplitAligner::traceBack(const SplitAlignerParams ¶ms,
long viterbiScore,
- std::vector<unsigned>& alnNums,
- std::vector<unsigned>& queryBegs,
- std::vector<unsigned>& queryEnds) const {
+ std::vector<AlignmentPart> &alnParts) const {
const bool isGenome = params.isGenome();
- unsigned i, j;
+ unsigned i;
+ size_t j;
if (params.isSpliced()) {
i = findEndScore(viterbiScore);
assert(i < numAlns);
@@ -442,15 +449,15 @@ void SplitAligner::traceBack(const SplitAlignerParams ¶ms,
assert(i < numAlns);
}
- alnNums.push_back(i);
- queryEnds.push_back(j);
+ size_t queryEnd = j;
for (;;) {
--j;
size_t ij = matrixRowOrigins[i] + j;
long score = Vmat[ij + 1] - Smat[ij*2+1];
if (params.isSpliced() && alns[i].qstart == j && score == 0) {
- queryBegs.push_back(j);
+ AlignmentPart ap = {i, j, queryEnd};
+ alnParts.push_back(ap);
return;
}
@@ -466,27 +473,28 @@ void SplitAligner::traceBack(const SplitAlignerParams ¶ms,
long s = score - spliceEndScore(isGenome, ij);
long t = s - params.restartScore;
if (t == cell(Vvec, j)) {
- queryBegs.push_back(j);
+ AlignmentPart ap = {i, j, queryEnd};
+ alnParts.push_back(ap);
if (t == 0) return;
while (t == cell(Vvec, j-1)) --j;
i = findScore(isGenome, j, t);
} else {
if (isStay) continue;
- queryBegs.push_back(j);
+ AlignmentPart ap = {i, j, queryEnd};
+ alnParts.push_back(ap);
unsigned k = findScore(isGenome, j, s - params.jumpScore);
i = (k < numAlns) ? k : findSpliceScore(params, i, j, score);
}
assert(i < numAlns);
- alnNums.push_back(i);
- queryEnds.push_back(j);
+ queryEnd = j;
}
}
int SplitAligner::segmentScore(unsigned alnNum,
- unsigned queryBeg, unsigned queryEnd) const {
+ size_t queryBeg, size_t queryEnd) const {
int score = 0;
unsigned i = alnNum;
- for (unsigned j = queryBeg; j < queryEnd; ++j) {
+ for (size_t j = queryBeg; j < queryEnd; ++j) {
size_t ij = matrixRowOrigins[i] + j;
score += Smat[ij*2+1];
if (j > queryBeg) score += Smat[ij*2];
@@ -495,7 +503,7 @@ int SplitAligner::segmentScore(unsigned alnNum,
}
double SplitAligner::probFromSpliceF(const SplitAlignerParams ¶ms,
- unsigned i, unsigned j,
+ unsigned i, size_t j,
unsigned oldNumInplay,
unsigned& oldInplayPos) const {
const unsigned maxSpliceDist = params.maxSpliceDist;
@@ -529,7 +537,7 @@ double SplitAligner::probFromSpliceF(const SplitAlignerParams ¶ms,
}
double SplitAligner::probFromSpliceB(const SplitAlignerParams ¶ms,
- unsigned i, unsigned j,
+ unsigned i, size_t j,
unsigned oldNumInplay,
unsigned& oldInplayPos) const {
const unsigned maxSpliceDist = params.maxSpliceDist;
@@ -566,16 +574,13 @@ void SplitAligner::forwardSplit(const SplitAlignerParams ¶ms) {
const double restartProb = params.restartProb;
unsigned *inplayAlnBeg = &newInplayAlnIndices[0];
unsigned *inplayAlnEnd = inplayAlnBeg;
- unsigned *sortedAlnPtr = &sortedAlnIndices[0];
- unsigned *sortedAlnEnd = sortedAlnPtr + numAlns;
-
- std::stable_sort(sortedAlnPtr, sortedAlnEnd,
- DpBegLess(&dpBegs[0], &dpEnds[0]));
+ const unsigned *sortedAlnPtr = &sortedAlnIndices[0];
+ const unsigned *sortedAlnEnd = sortedAlnPtr + numAlns;
double sumOfProbs = 1;
double rescale = 1;
- for (unsigned j = minBeg; j < maxEnd; j++) {
+ for (size_t j = minBeg; j < maxEnd; j++) {
while (inplayAlnEnd > inplayAlnBeg && dpEnd(inplayAlnEnd[-1]) == j) {
--inplayAlnEnd; // it is no longer "in play"
}
@@ -584,7 +589,7 @@ void SplitAligner::forwardSplit(const SplitAlignerParams ¶ms) {
++sortedAlnPtr;
}
mergeInto(inplayAlnBeg, inplayAlnEnd, sortedAlnBeg, sortedAlnPtr,
- DpEndLess(&dpBegs[0], &dpEnds[0]));
+ EndLess(&dpBegs[0], &dpEnds[0]));
inplayAlnEnd += sortedAlnPtr - sortedAlnBeg;
cell(rescales, j) = rescale;
@@ -620,7 +625,7 @@ void SplitAligner::forwardSplice(const SplitAlignerParams ¶ms) {
double zF = 0.0; // sum of probabilities from the forward algorithm
double rescale = 1;
- for (unsigned j = minBeg; j < maxEnd; j++) {
+ for (size_t j = minBeg; j < maxEnd; j++) {
updateInplayAlnIndicesF(sortedAlnPos, oldNumInplay, newNumInplay, j);
unsigned oldInplayPos = 0;
cell(rescales, j) = rescale;
@@ -659,12 +664,11 @@ void SplitAligner::backwardSplit(const SplitAlignerParams ¶ms) {
unsigned *sortedAlnPtr = &sortedAlnIndices[0];
unsigned *sortedAlnEnd = sortedAlnPtr + numAlns;
- std::stable_sort(sortedAlnPtr, sortedAlnEnd,
- DpEndLess(&dpBegs[0], &dpEnds[0]));
+ std::sort(sortedAlnPtr, sortedAlnEnd, EndLessStable(&dpBegs[0], &dpEnds[0]));
double sumOfProbs = 1;
- for (unsigned j = maxEnd; j > minBeg; j--) {
+ for (size_t j = maxEnd; j > minBeg; j--) {
while (inplayAlnEnd > inplayAlnBeg && dpBeg(inplayAlnEnd[-1]) == j) {
--inplayAlnEnd; // it is no longer "in play"
}
@@ -673,7 +677,7 @@ void SplitAligner::backwardSplit(const SplitAlignerParams ¶ms) {
++sortedAlnPtr;
}
mergeInto(inplayAlnBeg, inplayAlnEnd, sortedAlnBeg, sortedAlnPtr,
- DpBegLess(&dpBegs[0], &dpEnds[0]));
+ BegLess(&dpBegs[0], &dpEnds[0]));
inplayAlnEnd += sortedAlnPtr - sortedAlnBeg;
double rescale = cell(rescales, j);
@@ -703,7 +707,7 @@ void SplitAligner::backwardSplice(const SplitAlignerParams ¶ms) {
double endprob = 1.0;
//double zB = 0.0; // sum of probabilities from the backward algorithm
- for (unsigned j = maxEnd; j > minBeg; j--) {
+ for (size_t j = maxEnd; j > minBeg; j--) {
updateInplayAlnIndicesB(sortedAlnPos, oldNumInplay, newNumInplay, j);
unsigned oldInplayPos = 0;
double rescale = cell(rescales, j);
@@ -735,27 +739,26 @@ void SplitAligner::backwardSplice(const SplitAlignerParams ¶ms) {
}
}
-std::vector<double>
-SplitAligner::marginalProbs(unsigned queryBeg, unsigned alnNum,
- unsigned alnBeg, unsigned alnEnd) const {
- std::vector<double> output;
- unsigned i = alnNum;
- unsigned j = queryBeg;
+void SplitAligner::marginalProbs(double *output,
+ size_t queryBeg, unsigned alnNum,
+ unsigned alnBeg, unsigned alnEnd) const {
+ const char *qalign = alns[alnNum].qalign;
+ size_t ij = matrixRowOrigins[alnNum] + queryBeg;
+ size_t rescalesOffset = matrixRowOrigins[alnNum] + minBeg;
+
for (unsigned pos = alnBeg; pos < alnEnd; ++pos) {
- size_t ij = matrixRowOrigins[i] + j;
+ double value;
if (Bmat[ij] > DBL_MAX) { // can happen for spliced alignment
- output.push_back(0);
- } else if (alns[i].qalign[pos] == '-') {
- double value = Fmat[ij] * Bmat[ij] * Sexp[ij*2] * cell(rescales, j);
- output.push_back(value);
+ value = 0;
+ } else if (qalign[pos] == '-') {
+ value = Fmat[ij] * Bmat[ij] * Sexp[ij*2] * rescales[ij - rescalesOffset];
} else {
- double value = Fmat[ij + 1] * Bmat[ij] / Sexp[ij*2+1];
- if (value != value) value = 0.0;
- output.push_back(value);
- j++;
+ value = Fmat[ij + 1] * Bmat[ij] / Sexp[ij*2+1];
+ if (value != value) value = 0;
+ ++ij;
}
+ output[pos - alnBeg] = value;
}
- return output;
}
// The next routine represents affine gap scores in a cunning way.
@@ -845,7 +848,7 @@ void SplitAligner::initRbegsAndEnds() {
void SplitAligner::initSpliceCoords(unsigned i) {
const UnsplitAlignment& a = alns[i];
- unsigned j = dpBeg(i);
+ size_t j = dpBeg(i);
unsigned k = a.rstart;
cell(spliceBegCoords, i, j) = k;
@@ -897,15 +900,15 @@ void SplitAligner::initSpliceSignals(const SplitAlignerParams ¶ms,
const unsigned *endCoords = &spliceEndCoords[rowBeg];
unsigned char *begSignals = &spliceBegSignals[rowBeg];
unsigned char *endSignals = &spliceEndSignals[rowBeg];
- unsigned dpLen = dpEnd(i) - dpBeg(i);
+ size_t dpLen = dpEnd(i) - dpBeg(i);
if (a.isForwardStrand()) {
- for (unsigned j = 0; j <= dpLen; ++j) {
+ for (size_t j = 0; j <= dpLen; ++j) {
begSignals[j] = spliceBegSignalFwd(seq, seqBeg+begCoords[j], toUnmasked);
endSignals[j] = spliceEndSignalFwd(seq, seqBeg+endCoords[j], toUnmasked);
}
} else {
- for (unsigned j = 0; j <= dpLen; ++j) {
+ for (size_t j = 0; j <= dpLen; ++j) {
begSignals[j] = spliceBegSignalRev(seq, seqEnd-begCoords[j], toUnmasked);
endSignals[j] = spliceEndSignalRev(seq, seqEnd-endCoords[j], toUnmasked);
}
@@ -1093,6 +1096,14 @@ void SplitAligner::layout(const SplitAlignerParams ¶ms,
}
initDpBounds(params);
+
+ if (params.isSpliced()) {
+ stable_sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
+ QbegLess(&dpBegs[0], &rnameAndStrandIds[0], &rBegs[0]));
+ } else {
+ sort(sortedAlnIndices.begin(), sortedAlnIndices.end(),
+ BegLessStable(&dpBegs[0], &dpEnds[0]));
+ }
}
size_t SplitAligner::memory(const SplitAlignerParams ¶ms,
@@ -1165,7 +1176,7 @@ double SplitAligner::spliceSignalStrandLogOdds() const {
// XXX if Bmat overflowed to inf, then I think this is unreliable
assert(rescales.size() == rescalesRev.size());
double logOdds = 0;
- for (unsigned j = 0; j < rescales.size(); ++j) {
+ for (size_t j = 0; j < rescales.size(); ++j) {
logOdds += std::log(rescalesRev[j] / rescales[j]);
}
return logOdds;
=====================================
src/split/cbrc_split_aligner.hh
=====================================
@@ -21,6 +21,12 @@
namespace cbrc {
+struct AlignmentPart {
+ unsigned alnNum;
+ size_t queryBeg;
+ size_t queryEnd;
+};
+
struct SplitAlignerParams {
static int scoreFromProb(double prob, double scale) {
return floor(scale * log(prob) + 0.5);
@@ -134,6 +140,8 @@ public:
void layout(const SplitAlignerParams ¶ms,
const UnsplitAlignment *beg, const UnsplitAlignment *end);
+ size_t maxQueryEnd() const { return maxEnd; }
+
// The number of cells in each dynamic programming matrix
size_t cellsPerDpMatrix() const
{ return matrixRowOrigins[numAlns-1] + dpEnd(numAlns-1) + 1; }
@@ -162,13 +170,10 @@ public:
// 3. The chunk's end coordinate in the query sequence
// It gets the chunks in reverse order, from query end to query start.
void traceBack(const SplitAlignerParams ¶ms, long viterbiScore,
- std::vector<unsigned>& alnNums,
- std::vector<unsigned>& queryBegs,
- std::vector<unsigned>& queryEnds) const;
+ std::vector<AlignmentPart> &alnParts) const;
// Calculates the alignment score for a segment of an alignment
- int segmentScore(unsigned alnNum,
- unsigned queryBeg, unsigned queryEnd) const;
+ int segmentScore(unsigned alnNum, size_t queryBeg, size_t queryEnd) const;
void exponentiateScores(const SplitAlignerParams ¶ms) {
size_t s = cellsPerDpMatrix() * 2;
@@ -192,8 +197,8 @@ public:
}
// Returns one probability per column, for a segment of an alignment
- std::vector<double> marginalProbs(unsigned queryBeg, unsigned alnNum,
- unsigned alnBeg, unsigned alnEnd) const;
+ void marginalProbs(double *output, size_t queryBeg, unsigned alnNum,
+ unsigned alnBeg, unsigned alnEnd) const;
// Toggles between forward and reverse-complement splice signals
void flipSpliceSignals(const SplitAlignerParams ¶ms);
@@ -206,7 +211,7 @@ public:
// Gets the 2 genome bases immediately downstream of queryPos in
// alnNum, and writes them into the buffer pointed to by "out"
void spliceBegSignal(char *out, const SplitAlignerParams ¶ms,
- unsigned alnNum, unsigned queryPos,
+ unsigned alnNum, size_t queryPos,
bool isSenseStrand) const {
const UnsplitAlignment &a = alns[alnNum];
params.spliceBegSignal(out, a.rname, a.isForwardStrand(), isSenseStrand,
@@ -216,7 +221,7 @@ public:
// Gets the 2 genome bases immediately upstream of queryPos in
// alnNum, and writes them into the buffer pointed to by "out"
void spliceEndSignal(char *out, const SplitAlignerParams ¶ms,
- unsigned alnNum, unsigned queryPos,
+ unsigned alnNum, size_t queryPos,
bool isSenseStrand) const {
const UnsplitAlignment &a = alns[alnNum];
params.spliceEndSignal(out, a.rname, a.isForwardStrand(), isSenseStrand,
@@ -226,10 +231,10 @@ public:
private:
unsigned numAlns; // the number of candidate alignments (for 1 query)
const UnsplitAlignment *alns; // the candidates
- unsigned minBeg; // the minimum query start coordinate of any candidate
- unsigned maxEnd; // the maximum query end coordinate of any candidate
- std::vector<unsigned> dpBegs; // dynamic programming begin coords
- std::vector<unsigned> dpEnds; // dynamic programming end coords
+ size_t minBeg; // the minimum query start coordinate of any candidate
+ size_t maxEnd; // the maximum query end coordinate of any candidate
+ std::vector<size_t> dpBegs; // dynamic programming begin coords
+ std::vector<size_t> dpEnds; // dynamic programming end coords
std::vector<size_t> matrixRowOrigins; // layout of ragged matrices
size_t maxCellsPerMatrix;
@@ -295,11 +300,11 @@ private:
void updateInplayAlnIndicesF(unsigned& sortedAlnPos,
unsigned& oldNumInplay,
- unsigned& newNumInplay, unsigned j);
+ unsigned& newNumInplay, size_t j);
void updateInplayAlnIndicesB(unsigned& sortedAlnPos,
unsigned& oldNumInplay,
- unsigned& newNumInplay, unsigned j);
+ unsigned& newNumInplay, size_t j);
long viterbiSplit(const SplitAlignerParams ¶ms);
long viterbiSplice(const SplitAlignerParams ¶ms);
@@ -309,38 +314,38 @@ private:
void forwardSplice(const SplitAlignerParams ¶ms);
void backwardSplice(const SplitAlignerParams ¶ms);
- unsigned findScore(bool isGenome, unsigned j, long score) const;
+ unsigned findScore(bool isGenome, size_t j, long score) const;
unsigned findSpliceScore(const SplitAlignerParams ¶ms,
- unsigned i, unsigned j, long score) const;
+ unsigned i, size_t j, long score) const;
long scoreFromSplice(const SplitAlignerParams ¶ms,
- unsigned i, unsigned j, unsigned oldNumInplay,
+ unsigned i, size_t j, unsigned oldNumInplay,
unsigned& oldInplayPos) const;
long endScore() const;
unsigned findEndScore(long score) const;
// "dp" means "dynamic programming":
- unsigned dpBeg(unsigned i) const { return dpBegs[i]; }
- unsigned dpEnd(unsigned i) const { return dpEnds[i]; }
+ size_t dpBeg(unsigned i) const { return dpBegs[i]; }
+ size_t dpEnd(unsigned i) const { return dpEnds[i]; }
template<typename T> T&
- cell(std::vector<T>& v, unsigned j) const
+ cell(std::vector<T>& v, size_t j) const
{ return v[j - minBeg]; }
template<typename T> const T&
- cell(const std::vector<T>& v, unsigned j) const
+ cell(const std::vector<T>& v, size_t j) const
{ return v[j - minBeg]; }
// cell j in row i of a ragged matrix
template<typename T> T&
- cell(std::vector<T>& v, unsigned i, unsigned j) const
+ cell(std::vector<T>& v, unsigned i, size_t j) const
{ return v[matrixRowOrigins[i] + j]; }
// cell j in row i of a ragged matrix
template<typename T> const T&
- cell(const std::vector<T>& v, unsigned i, unsigned j) const
+ cell(const std::vector<T>& v, unsigned i, size_t j) const
{ return v[matrixRowOrigins[i] + j]; }
- long cell(const long *v, unsigned i, unsigned j) const
+ long cell(const long *v, unsigned i, size_t j) const
{ return v[matrixRowOrigins[i] + j]; }
template<typename T>
@@ -357,11 +362,11 @@ private:
}
double probFromSpliceF(const SplitAlignerParams ¶ms,
- unsigned i, unsigned j, unsigned oldNumInplay,
+ unsigned i, size_t j, unsigned oldNumInplay,
unsigned& oldInplayPos) const;
double probFromSpliceB(const SplitAlignerParams ¶ms,
- unsigned i, unsigned j, unsigned oldNumInplay,
+ unsigned i, size_t j, unsigned oldNumInplay,
unsigned& oldInplayPos) const;
void calcBaseScores(const SplitAlignerParams ¶ms, unsigned i);
=====================================
src/split/cbrc_unsplit_alignment.cc
=====================================
@@ -13,6 +13,11 @@
#include <stdexcept>
#include <string>
+struct IntText {
+ char text[32];
+ int length;
+};
+
static void err(const std::string& s) {
throw std::runtime_error(s);
}
@@ -29,18 +34,29 @@ static bool isDigit(char c) {
return c >= '0' && c <= '9';
}
+static void writeSize(IntText &it, size_t x) {
+ char *end = it.text + sizeof it.text;
+ char *beg = end;
+ do {
+ --beg;
+ *beg = '0' + x % 10;
+ } while (x /= 10);
+ it.length = end - beg;
+}
+
namespace cbrc {
-static const char *readUint(const char *c, unsigned &x) {
+static const char *readSize(const char *c, size_t &x) {
if (!c) return 0;
+ size_t maxVal = -1;
// faster than std::strtoul
while (isSpace(*c)) ++c;
if (!isDigit(*c)) return 0;
- unsigned z = *c++ - '0';
+ size_t z = *c++ - '0';
while (isDigit(*c)) {
- if (z > UINT_MAX / 10) return 0;
- unsigned digit = *c++ - '0';
+ if (z > maxVal / 10) return 0;
+ size_t digit = *c++ - '0';
z = z * 10 + digit;
if (z < digit) return 0;
}
@@ -100,15 +116,15 @@ static void reverseComplement(char *beg, char *end) {
}
static void parseSeqLine(char *line, char *lineEnd, const char *&seqName,
- unsigned &start, unsigned &span, char &strand,
- unsigned &seqLen, char *&aln, char *&alnEnd) {
+ size_t &start, size_t &span, char &strand,
+ size_t &seqLen, char *&aln, char *&alnEnd) {
seqName = skipSpace(skipWord(line));
const char *nameEnd = skipWord(seqName);
const char *s = nameEnd;
- s = readUint(s, start);
- s = readUint(s, span);
+ s = readSize(s, start);
+ s = readSize(s, span);
s = readChar(s, strand);
- s = readUint(s, seqLen);
+ s = readSize(s, seqLen);
s = skipSpace(s);
if (!s || s >= lineEnd) err(std::string("bad MAF line: ") + line);
aln = line + (s - line);
@@ -129,8 +145,8 @@ void UnsplitAlignment::init(bool isTopSeqQuery) {
const unsigned rankOfRefSeq = isTopSeqQuery + 1;
char refStrand, qryStrand;
- unsigned refSpan, qrySpan;
- unsigned refSeqLen, qrySeqLen;
+ size_t refSpan, qrySpan;
+ size_t refSeqLen, qrySeqLen;
char *refAln;
char *refAlnEnd;
char *qryAln;
@@ -177,11 +193,11 @@ void UnsplitAlignment::init(bool isTopSeqQuery) {
qQual = qual;
}
-static unsigned seqPosFromAlnPos(unsigned alnPos, const char *aln) {
+static size_t seqPosFromAlnPos(size_t alnPos, const char *aln) {
return alnPos - std::count(aln, aln + alnPos, '-');
}
-static unsigned nthBasePrefix(const char* sequenceWithGapsBeg, unsigned n) {
+static unsigned nthBasePrefix(const char* sequenceWithGapsBeg, size_t n) {
for (unsigned i = 0; /* noop */; ++i)
if (sequenceWithGapsBeg[i] != '-') {
if (n > 0) --n;
@@ -189,7 +205,7 @@ static unsigned nthBasePrefix(const char* sequenceWithGapsBeg, unsigned n) {
}
}
-static unsigned nthBaseSuffix(const char *sequenceWithGapsEnd, unsigned n) {
+static unsigned nthBaseSuffix(const char *sequenceWithGapsEnd, size_t n) {
for (unsigned i = 0; /* noop */; ++i)
if (*(sequenceWithGapsEnd - 1 - i) != '-') {
if (n > 0) --n;
@@ -198,7 +214,7 @@ static unsigned nthBaseSuffix(const char *sequenceWithGapsEnd, unsigned n) {
}
void mafSliceBeg(const char* rAln, const char* qAln,
- unsigned qBeg, unsigned& qSliceBeg, unsigned& alnBeg) {
+ size_t qBeg, size_t& qSliceBeg, unsigned& alnBeg) {
if (qSliceBeg < qBeg) {
qSliceBeg = qBeg;
alnBeg = 0;
@@ -211,8 +227,8 @@ void mafSliceBeg(const char* rAln, const char* qAln,
}
void mafSliceEnd(const char* rAln, const char* qAln,
- unsigned qEnd, unsigned& qSliceEnd, unsigned& alnEnd) {
- unsigned alnLength = strlen(qAln);
+ size_t qEnd, size_t& qSliceEnd, unsigned& alnEnd) {
+ size_t alnLength = strlen(qAln);
if (qSliceEnd > qEnd) {
qSliceEnd = qEnd;
alnEnd = alnLength;
@@ -253,8 +269,11 @@ static void sprintRight(char*& dest, const char*& src, int width) {
}
static void sprintReplaceRight(char *&dest, const char *&oldIn,
- const char *newIn, int width) {
- sprintRight(dest, newIn, width);
+ const IntText &it, int width) {
+ memset(dest, ' ', width - it.length);
+ dest += width;
+ memcpy(dest - it.length, it.text + sizeof it.text - it.length, it.length);
+ *dest++ = ' ';
oldIn = skipWord(oldIn);
}
@@ -267,38 +286,41 @@ static char asciiFromProb(double probRight) {
}
size_t mafSlice(std::vector<char> &outputText, const UnsplitAlignment &aln,
- unsigned alnBeg, unsigned alnEnd, const double *probs) {
- char begTexts[2][32];
- char lenTexts[2][32];
+ size_t alnBeg, size_t alnEnd, const double *probs) {
+ IntText begTexts[2];
+ IntText lenTexts[2];
int w[6] = {0};
- unsigned numOfLines = 1; // for an extra "p" line at the end
+ unsigned numOfLines = !!probs;
int j = 0;
for (char **i = aln.linesBeg; i < aln.linesEnd; ++i) {
const char *c = i[0];
numOfLines += (*c == 's' || *c == 'q' || *c == 'p');
if (*c == 's') {
- unsigned beg = 0;
- unsigned len = 0;
+ size_t beg = 0;
+ size_t len = 0;
c = updateFieldWidth(c, w[0]);
c = updateFieldWidth(c, w[1]);
++c; // skip over the string terminator
- c = readUint(c, beg);
- c = readUint(c, len);
+ c = readSize(c, beg);
+ c = readSize(c, len);
c = updateFieldWidth(c, w[4]);
c = updateFieldWidth(c, w[5]);
c = skipSpace(c);
- unsigned begPos = seqPosFromAlnPos(alnBeg, c);
- unsigned endPos = seqPosFromAlnPos(alnEnd, c);
- unsigned newBeg = aln.isFlipped() ? beg + len - endPos : beg + begPos;
- unsigned newLen = endPos - begPos;
- w[2] = std::max(w[2], sprintf(begTexts[j], "%u", newBeg));
- w[3] = std::max(w[3], sprintf(lenTexts[j], "%u", newLen));
+ size_t begPos = seqPosFromAlnPos(alnBeg, c);
+ size_t endPos = seqPosFromAlnPos(alnEnd, c);
+ size_t newBeg = aln.isFlipped() ? beg + len - endPos : beg + begPos;
+ size_t newLen = endPos - begPos;
+ writeSize(begTexts[j], newBeg);
+ writeSize(lenTexts[j], newLen);
++j;
}
}
- unsigned alnLen = alnEnd - alnBeg;
+ w[2] = std::max(begTexts[0].length, begTexts[1].length);
+ w[3] = std::max(lenTexts[0].length, lenTexts[1].length);
+
+ size_t alnLen = alnEnd - alnBeg;
size_t lineLength = w[0] + w[1] + w[2] + w[3] + w[4] + w[5] + alnLen + 7;
size_t outputSize = outputText.size();
outputText.insert(outputText.end(), numOfLines * lineLength, 0);
@@ -340,12 +362,14 @@ size_t mafSlice(std::vector<char> &outputText, const UnsplitAlignment &aln,
}
}
- const char *in = "p";
- sprintLeft(out, in, w[0] + w[1] + w[2] + w[3] + w[4] + w[5] + 5);
- std::transform(probs, probs + alnLen, out, asciiFromProb);
- if (aln.isFlipped()) std::reverse(out, out + alnLen);
- out += alnLen;
- *out++ = '\n';
+ if (probs) {
+ const char *in = "p";
+ sprintLeft(out, in, w[0] + w[1] + w[2] + w[3] + w[4] + w[5] + 5);
+ std::transform(probs, probs + alnLen, out, asciiFromProb);
+ if (aln.isFlipped()) std::reverse(out, out + alnLen);
+ out += alnLen;
+ *out++ = '\n';
+ }
return lineLength;
}
=====================================
src/split/cbrc_unsplit_alignment.hh
=====================================
@@ -19,11 +19,11 @@ public:
char **linesBeg;
char **linesEnd;
const char *qname;
- unsigned qstart;
- unsigned qend;
+ size_t qstart;
+ size_t qend;
char qstrand;
- unsigned rstart;
- unsigned rend;
+ size_t rstart;
+ size_t rend;
const char *rname;
const char *ralign;
const char *qalign;
@@ -40,13 +40,13 @@ public:
// Appends an extra "p" line for "probs".
// Returns the line length (including a newline).
size_t mafSlice(std::vector<char> &outputText, const UnsplitAlignment &aln,
- unsigned alnBeg, unsigned alnEnd, const double *probs);
+ size_t alnBeg, size_t alnEnd, const double *probs);
void mafSliceBeg(const char* rAln, const char* qAln,
- unsigned qBeg, unsigned& qSliceBeg, unsigned& alnBeg);
+ size_t qBeg, size_t& qSliceBeg, unsigned& alnBeg);
void mafSliceEnd(const char* rAln, const char* qAln,
- unsigned qEnd, unsigned& qSliceEnd, unsigned& alnEnd);
+ size_t qEnd, size_t& qSliceEnd, unsigned& alnEnd);
double pLinesToErrorProb(const char *line1, const char *line2);
=====================================
src/split/mcf_last_splitter.cc
=====================================
@@ -12,11 +12,14 @@
#include <algorithm>
#include <stdexcept>
+static char *copyString(char *out, const char *s, size_t sizeOf) {
+ memcpy(out, s, sizeOf - 1);
+ return out + sizeOf - 1;
+}
+
// Defines an ordering, for sorting.
-static bool less(const cbrc::UnsplitAlignment& a,
- const cbrc::UnsplitAlignment& b) {
- int qnameCmp = strcmp(a.qname, b.qname);
- if (qnameCmp != 0 ) return qnameCmp < 0;
+static bool lessForOneQuery(const cbrc::UnsplitAlignment& a,
+ const cbrc::UnsplitAlignment& b) {
if (a.qstart != b.qstart ) return a.qstart < b.qstart;
if (a.qend != b.qend ) return a.qend < b.qend;
if (a.qstrand != b.qstrand) return a.qstrand < b.qstrand;
@@ -29,6 +32,12 @@ static bool less(const cbrc::UnsplitAlignment& a,
return a.linesBeg < b.linesBeg; // stabilizes the sort
}
+static bool less(const cbrc::UnsplitAlignment& a,
+ const cbrc::UnsplitAlignment& b) {
+ int qnameCmp = strcmp(a.qname, b.qname);
+ return qnameCmp ? qnameCmp < 0 : lessForOneQuery(a, b);
+}
+
static int whichPairedSequence(const char *name) {
size_t s = strlen(name);
if (s > 1 && name[s-2] == '/') {
@@ -38,39 +47,41 @@ static int whichPairedSequence(const char *name) {
throw std::runtime_error("query sequence name should end in /1 or /2");
}
-static int printSense(char *out, double senseStrandLogOdds) {
+static char *printSense(char *out, double senseStrandLogOdds) {
double b = senseStrandLogOdds / log(2.0);
if (b < 0.1 && b > -0.1) b = 0;
else if (b > 10) b = floor(b + 0.5);
else if (b < -10) b = ceil(b - 0.5);
+ static const char tagString[] = " sense=";
+ out = copyString(out, tagString, sizeof tagString);
int precision = (b < 10 && b > -10) ? 2 : 3;
- return sprintf(out, " sense=%.*g", precision, b);
+ return out + sprintf(out, "%.*g", precision, b);
}
namespace mcf {
-static void doOneSlice(SliceData &sd, unsigned &qSliceBeg, unsigned &qSliceEnd,
+static void doOneSlice(SliceData &sd, cbrc::AlignmentPart &ap,
const cbrc::SplitAligner &sa,
- const cbrc::UnsplitAlignment &a, unsigned alnNum) {
+ const cbrc::UnsplitAlignment &a) {
sd.score = -1;
- if (qSliceBeg >= a.qend || qSliceEnd <= a.qstart) {
+ if (ap.queryBeg >= a.qend || ap.queryEnd <= a.qstart) {
return; // this can happen for spliced alignment!
}
- unsigned qSliceBegOld = qSliceBeg;
- unsigned qSliceEndOld = qSliceEnd;
- cbrc::mafSliceBeg(a.ralign, a.qalign, a.qstart, qSliceBeg, sd.alnBeg);
- cbrc::mafSliceEnd(a.ralign, a.qalign, a.qend, qSliceEnd, sd.alnEnd);
+ size_t qSliceBegOld = ap.queryBeg;
+ size_t qSliceEndOld = ap.queryEnd;
+ cbrc::mafSliceBeg(a.ralign, a.qalign, a.qstart, ap.queryBeg, sd.alnBeg);
+ cbrc::mafSliceEnd(a.ralign, a.qalign, a.qend, ap.queryEnd, sd.alnEnd);
- if (qSliceBeg >= qSliceEnd) {
+ if (ap.queryBeg >= ap.queryEnd) {
return; // I think this can happen for spliced alignment
}
sd.score =
- sa.segmentScore(alnNum, qSliceBegOld, qSliceEndOld) -
- sa.segmentScore(alnNum, qSliceBegOld, qSliceBeg) -
- sa.segmentScore(alnNum, qSliceEnd, qSliceEndOld);
+ sa.segmentScore(ap.alnNum, qSliceBegOld, qSliceEndOld) -
+ sa.segmentScore(ap.alnNum, qSliceBegOld, ap.queryBeg) -
+ sa.segmentScore(ap.alnNum, ap.queryEnd, qSliceEndOld);
}
void LastSplitter::doOneAlignmentPart(const LastSplitOptions &opts,
@@ -78,85 +89,97 @@ void LastSplitter::doOneAlignmentPart(const LastSplitOptions &opts,
bool isAlreadySplit,
const cbrc::UnsplitAlignment &a,
unsigned numOfParts, unsigned partNum,
- const SliceData &sd, unsigned alnNum,
- unsigned qSliceBeg, unsigned qSliceEnd,
+ const SliceData &sd,
+ cbrc::AlignmentPart ap,
int rnaStrand, bool isSenseStrand,
double senseStrandLogOdds) {
if (sd.score < opts.score) return;
- std::vector<double> p;
+ size_t alnLen = sd.alnEnd - sd.alnBeg;
+ columnProbabilities.resize(alnLen * (rnaStrand/2 + 1));
+ double *probs = columnProbabilities.data();
+ double *probsRev = probs + alnLen * rnaStrand/2;
+
if (rnaStrand != 0) {
- p = sa.marginalProbs(qSliceBeg, alnNum, sd.alnBeg, sd.alnEnd);
+ sa.marginalProbs(probs, ap.queryBeg, ap.alnNum, sd.alnBeg, sd.alnEnd);
}
- std::vector<double> pRev;
if (rnaStrand != 1) {
sa.flipSpliceSignals(params);
- pRev = sa.marginalProbs(qSliceBeg, alnNum, sd.alnBeg, sd.alnEnd);
+ sa.marginalProbs(probsRev, ap.queryBeg, ap.alnNum, sd.alnBeg, sd.alnEnd);
sa.flipSpliceSignals(params);
}
- if (rnaStrand == 0) p.swap(pRev);
if (rnaStrand == 2) {
double reverseProb = 1 / (1 + exp(senseStrandLogOdds));
// the exp might overflow to inf, but that should be OK
double forwardProb = 1 - reverseProb;
- for (unsigned i = 0; i < p.size(); ++i) {
- p[i] = forwardProb * p[i] + reverseProb * pRev[i];
+ for (size_t i = 0; i < alnLen; ++i) {
+ probs[i] = forwardProb * probs[i] + reverseProb * probsRev[i];
}
}
- assert(!p.empty());
- double mismap = 1.0 - *std::max_element(p.begin(), p.end());
+ assert(alnLen > 0);
+ double mismap = 1.0 - *std::max_element(probs, probs + alnLen);
mismap = std::max(mismap, 1e-10);
if (mismap > opts.mismap) return;
- int mismapPrecision = 3;
-
- std::vector<char> slice;
- size_t lineLen = cbrc::mafSlice(slice, a, sd.alnBeg, sd.alnEnd, &p[0]);
- const char *sliceBeg = &slice[0];
- const char *sliceEnd = sliceBeg + slice.size();
- const char *pLine = sliceEnd - lineLen;
- const char *secondLastLine = pLine - lineLen;
-
- if (isAlreadySplit && secondLastLine[0] == 'p') {
- size_t backToSeq = sd.alnEnd - sd.alnBeg + 1;
- mismap = cbrc::pLinesToErrorProb(pLine - backToSeq, sliceEnd - backToSeq);
- if (mismap > opts.mismap) return;
- mismapPrecision = 2;
+
+ bool isSplitProbs = (isAlreadySplit && a.linesEnd[-1][0] == 'p');
+ bool isAlignProbs = (a.linesEnd[-1-isAlreadySplit][0] == 'p');
+ int format = opts.format ? opts.format : "mM"[isAlignProbs];
+ int mismapPrecision = 3 - isSplitProbs;
+ if (format == 'm' && !isSplitProbs) probs = 0;
+
+ bool isCopyFirstLine = (opts.no_split && a.linesBeg[0][0] == 'a');
+ size_t firstLineSize = a.linesBeg[1] - a.linesBeg[0] - 1;
+ size_t aLineSpace = firstLineSize * isCopyFirstLine + 128;
+ size_t outputSize = outputText.size();
+ outputText.resize(outputSize + aLineSpace);
+ size_t lineLen = cbrc::mafSlice(outputText, a, sd.alnBeg, sd.alnEnd, probs);
+
+ char *out = &outputText[outputSize];
+ const char *sliceBeg = out + aLineSpace;
+ const char *sliceEnd = &outputText[0] + outputText.size();
+
+ if (isSplitProbs) {
+ size_t off = sd.alnEnd - sd.alnBeg + 1;
+ mismap = cbrc::pLinesToErrorProb(sliceEnd - lineLen - off, sliceEnd - off);
+ if (mismap > opts.mismap) {
+ outputText.resize(outputSize);
+ return;
+ }
}
- bool isLastalProbs = (*(secondLastLine - isAlreadySplit * lineLen) == 'p');
- if (opts.format == 'm' || (opts.format == 0 && !isLastalProbs)) {
+ if (format == 'm') {
while (*(sliceEnd - lineLen) == 'p') sliceEnd -= lineLen;
}
- const char *aLineOld = a.linesBeg[0];
- size_t aLineOldSize = a.linesBeg[1] - a.linesBeg[0] - 1;
- std::vector<char> aLine(aLineOldSize + 128);
- char *out = &aLine[0];
- if (opts.no_split && aLineOld[0] == 'a') {
- memcpy(out, aLineOld, aLineOldSize);
- out += aLineOldSize;
+ if (isCopyFirstLine) {
+ memcpy(out, a.linesBeg[0], firstLineSize);
+ out += firstLineSize;
} else {
- out += sprintf(out, "a score=%d", sd.score);
+ static const char sString[] = "a score=";
+ out = copyString(out, sString, sizeof sString);
+ out += sprintf(out, "%d", sd.score);
}
- out += sprintf(out, " mismap=%.*g", mismapPrecision, mismap);
- if (rnaStrand == 2) out += printSense(out, senseStrandLogOdds);
+ static const char mString[] = " mismap=";
+ out = copyString(out, mString, sizeof mString);
+ out += sprintf(out, "%.*g", mismapPrecision, mismap);
+ if (rnaStrand == 2) out = printSense(out, senseStrandLogOdds);
if (!opts.genome.empty() && !opts.no_split) {
if (partNum > 0) {
out = strcpy(out, isSenseStrand ? " acc=" : " don=") + 5;
- sa.spliceEndSignal(out, params, alnNum, qSliceBeg, isSenseStrand);
+ sa.spliceEndSignal(out, params, ap.alnNum, ap.queryBeg, isSenseStrand);
out += 2;
}
if (partNum + 1 < numOfParts) {
out = strcpy(out, isSenseStrand ? " don=" : " acc=") + 5;
- sa.spliceBegSignal(out, params, alnNum, qSliceEnd, isSenseStrand);
+ sa.spliceBegSignal(out, params, ap.alnNum, ap.queryEnd, isSenseStrand);
out += 2;
}
}
*out++ = '\n';
- outputText.insert(outputText.end(), &aLine[0], out);
- outputText.insert(outputText.end(), sliceBeg, sliceEnd);
+ memmove(out, sliceBeg, sliceEnd - sliceBeg);
+ outputText.resize(out + (sliceEnd - sliceBeg) - &outputText[0]);
if (opts.no_split && a.linesEnd[-1][0] == 'c') {
outputText.insert(outputText.end(), a.linesEnd[-1], a.linesEnd[0]);
@@ -171,12 +194,15 @@ void LastSplitter::doOneQuery(const LastSplitOptions &opts,
bool isAlreadySplit,
const cbrc::UnsplitAlignment *beg,
const cbrc::UnsplitAlignment *end) {
+ assert(beg < end);
int rnaStrand = opts.direction;
if (rnaStrand == 3) rnaStrand = 2 - whichPairedSequence(beg->qname);
if (rnaStrand == 4) rnaStrand = whichPairedSequence(beg->qname) - 1;
sa.layout(params, beg, end);
- if (opts.verbose) std::cerr << "\tcells=" << sa.cellsPerDpMatrix();
+ if (opts.verbose) std::cerr << beg->qname << '\t' << beg->qstart << '\t'
+ << sa.maxQueryEnd() << '\t' << (end - beg)
+ << "\tcells=" << sa.cellsPerDpMatrix();
size_t bytes = sa.memory(params, rnaStrand == 2);
if (bytes > opts.bytes) {
if (opts.verbose) std::cerr << "\n";
@@ -188,19 +214,14 @@ void LastSplitter::doOneQuery(const LastSplitOptions &opts,
long viterbiScore = LONG_MIN;
long viterbiScoreRev = LONG_MIN;
- std::vector<unsigned> alnNums;
- std::vector<unsigned> queryBegs;
- std::vector<unsigned> queryEnds;
if (opts.no_split) {
unsigned numOfParts = end - beg;
- alnNums.resize(numOfParts);
- queryBegs.resize(numOfParts);
- queryEnds.resize(numOfParts);
+ alignmentParts.resize(numOfParts);
for (unsigned i = 0; i < numOfParts; ++i) {
- alnNums[i] = i;
- queryBegs[i] = beg[i].qstart;
- queryEnds[i] = beg[i].qend;
+ alignmentParts[i].alnNum = i;
+ alignmentParts[i].queryBeg = beg[i].qstart;
+ alignmentParts[i].queryEnd = beg[i].qend;
}
} else {
if (rnaStrand != 0) {
@@ -213,23 +234,22 @@ void LastSplitter::doOneQuery(const LastSplitOptions &opts,
sa.flipSpliceSignals(params);
if (opts.verbose) std::cerr << "\t" << viterbiScoreRev;
}
+ alignmentParts.clear();
if (viterbiScore >= viterbiScoreRev) {
- sa.traceBack(params, viterbiScore, alnNums, queryBegs, queryEnds);
+ sa.traceBack(params, viterbiScore, alignmentParts);
} else {
sa.flipSpliceSignals(params);
- sa.traceBack(params, viterbiScoreRev, alnNums, queryBegs, queryEnds);
+ sa.traceBack(params, viterbiScoreRev, alignmentParts);
sa.flipSpliceSignals(params);
}
- std::reverse(alnNums.begin(), alnNums.end());
- std::reverse(queryBegs.begin(), queryBegs.end());
- std::reverse(queryEnds.begin(), queryEnds.end());
+ std::reverse(alignmentParts.begin(), alignmentParts.end());
}
- unsigned numOfParts = alnNums.size();
+ unsigned numOfParts = alignmentParts.size();
slices.resize(numOfParts);
for (unsigned k = 0; k < numOfParts; ++k) {
- unsigned i = alnNums[k];
- doOneSlice(slices[k], queryBegs[k], queryEnds[k], sa, beg[i], i);
+ unsigned i = alignmentParts[k].alnNum;
+ doOneSlice(slices[k], alignmentParts[k], sa, beg[i]);
}
sa.exponentiateScores(params);
@@ -251,19 +271,26 @@ void LastSplitter::doOneQuery(const LastSplitOptions &opts,
if (opts.verbose) std::cerr << "\n";
for (unsigned k = 0; k < numOfParts; ++k) {
- unsigned i = alnNums[k];
+ unsigned i = alignmentParts[k].alnNum;
doOneAlignmentPart(opts, params, isAlreadySplit, beg[i], numOfParts, k,
- slices[k], i, queryBegs[k], queryEnds[k],
+ slices[k], alignmentParts[k],
rnaStrand, isSenseStrand, senseStrandLogOdds);
}
}
+void LastSplitter::splitOneQuery(const LastSplitOptions &opts,
+ const cbrc::SplitAlignerParams ¶ms) {
+ sort(mafs.begin(), mafs.end(), lessForOneQuery);
+ doOneQuery(opts, params, false, mafs.data(), mafs.data() + mafs.size());
+ mafs.clear();
+}
+
void LastSplitter::split(const LastSplitOptions &opts,
const cbrc::SplitAlignerParams ¶ms,
bool isAlreadySplit) {
sort(mafs.begin(), mafs.end(), less);
- const cbrc::UnsplitAlignment *beg = mafs.empty() ? 0 : &mafs[0];
+ const cbrc::UnsplitAlignment *beg = mafs.data();
const cbrc::UnsplitAlignment *end = beg + mafs.size();
const cbrc::UnsplitAlignment *mid = beg;
size_t qendMax = 0;
@@ -273,8 +300,6 @@ void LastSplitter::split(const LastSplitOptions &opts,
++mid;
if (mid == end || strcmp(mid->qname, beg->qname) != 0 ||
(mid->qstart >= qendMax && !opts.isSplicedAlignment)) {
- if (opts.verbose) std::cerr << beg->qname << '\t' << beg->qstart << '\t'
- << qendMax << '\t' << (mid - beg);
doOneQuery(opts, params, isAlreadySplit, beg, mid);
beg = mid;
qendMax = 0;
=====================================
src/split/mcf_last_splitter.hh
=====================================
@@ -38,6 +38,9 @@ public:
void split(const LastSplitOptions &opts,
const cbrc::SplitAlignerParams ¶ms, bool isAlreadySplit);
+ void splitOneQuery(const LastSplitOptions &opts,
+ const cbrc::SplitAlignerParams ¶ms);
+
bool isOutputEmpty() const { return outputText.empty(); }
void printOutput() const
@@ -48,7 +51,9 @@ public:
private:
cbrc::SplitAligner sa;
std::vector<cbrc::UnsplitAlignment> mafs;
+ std::vector<cbrc::AlignmentPart> alignmentParts;
std::vector<SliceData> slices;
+ std::vector<double> columnProbabilities;
std::vector<char> outputText;
void doOneQuery(const LastSplitOptions &opts,
@@ -60,8 +65,7 @@ private:
const cbrc::SplitAlignerParams ¶ms,
bool isAlreadySplit, const cbrc::UnsplitAlignment &a,
unsigned numOfParts, unsigned partNum,
- const SliceData &sd, unsigned alnNum,
- unsigned qSliceBeg, unsigned qSliceEnd,
+ const SliceData &sd, cbrc::AlignmentPart ap,
int rnaStrand, bool isSenseStrand,
double senseStrandLogOdds);
};
=====================================
test/last-test.sh
=====================================
@@ -292,6 +292,7 @@ rm f.* r.*
./last-train-test.sh
./maf-convert-test.sh
./maf-cut-test.sh
+./maf-linked-test.sh
./maf-swap-test.sh
# Test: lastdb, lastal, last-split, maf-sort, maf-join
=====================================
test/maf-linked-test.sh
=====================================
@@ -0,0 +1,11 @@
+#! /bin/sh
+
+cd $(dirname $0)
+
+PATH=../bin:$PATH
+
+{
+ maf-linked homSap-monDom.maf
+ maf-linked -c2 -d100000 homSap-monDom.maf
+ maf-linked 90089.maf
+} | diff -u maf-linked-test.txt -
=====================================
test/maf-linked-test.txt
=====================================
@@ -0,0 +1,100 @@
+# LAST version 1584
+#
+# a=29 b=1 A=28 B=1 e=185 d=116 x=184 y=52 z=184 H=100 E=3.87001
+# R=01 u=2 s=2 S=1 M=0 T=0 m=10 l=1 n=10 C=2 k=1 w=1000 t=5.22312 j=3 Q=0
+# /home/mcfrith/data/genome/mammal/homSap/last/homSap-cMAM4
+# Reference sequences=195 normal letters=2934876451
+# lambda=0.207774 K=0.191089
+#
+# A C G T M S K W R Y B D H V
+# A 4 -6 -2 -6 2 -4 -4 1 2 -6 -5 1 0 1
+# C -6 6 -6 -2 2 3 -3 -4 -6 2 1 -4 1 0
+# G -2 -6 6 -6 -3 3 2 -4 2 -6 0 1 -4 1
+# T -6 -2 -6 4 -4 -4 2 1 -6 2 1 0 1 -5
+# M 2 2 -3 -4 2 0 -4 0 1 -1 -2 -1 0 1
+# S -4 3 3 -4 0 3 0 -4 0 0 1 -1 -1 1
+# K -4 -3 2 2 -4 0 2 0 -1 1 1 0 -1 -2
+# W 1 -4 -4 1 0 -4 0 1 0 0 -1 0 0 -1
+# R 2 -6 2 -6 1 0 -1 0 2 -6 -2 1 -1 1
+# Y -6 2 -6 2 -1 0 1 0 -6 2 1 -1 1 -2
+# B -5 1 0 1 -2 1 1 -1 -2 1 1 -1 0 -1
+# D 1 -4 1 0 -1 -1 0 0 1 -1 -1 0 0 0
+# H 0 1 -4 1 0 -1 -1 0 -1 1 0 0 0 -1
+# V 1 0 1 -5 1 1 -2 -1 1 -2 -1 0 -1 1
+#
+# Coordinates are 0-based. For - strand matches, coordinates
+# in the reverse complement of the 2nd sequence are used.
+#
+# name start alnSize strand seqSize alignment
+#
+# m=1 s=185
+#
+# m=1 s=185
+#
+# Query sequences=13 normal letters=3554259256
+a score=355 mismap=0.63
+s chr1 47567 300 + 248956422 TtttttccTTACTTTCCATCACCAAGTAACTCTTCTGATATTTTTTCTCTTGAGAAAATTAATATGACTCATAGATCTGGTTCCCAAGAGAAATCAATGGAGGCCTGGTTACAAGGATCTAAGAAGCATCAATGGGTCACTAACATCTAGTGGTACTAATTAACTCTGTTAATCATTGGGAAGAAAATGTATATATACTTTTGTCTTGGAGCTGATTCTACTAGAAAGCAGAAATCAAAATGATCAGTTTCCCAGTGT----CACTACTGCACACCCTGGAACAGAACAGGTAGGTCAGAAAAA
+s NC_077227.1 171526596 292 + 760810273 TTCtctctctctgttttATCACTAAGCAACCTTTCTAATGTTTTTCCTCTTGGGAAAACTAGTGTGACTTCAAGAAACAATTACCAAGGGAATTAATCATAAGCTAGGTATTAGAAGTCTAAGAAGCATCATTGGGTCACTGACAGTCAGAGCAACTAAGTATGTTGCTTTCTCTTAGGGAAGAACTTAGTTTAACTTTTTTTCTTTAGAGGTTATTAAATTACATTTCTGAATT------------TTTCTCATTCTTAAATACAATTTCATATTCTGTGATAGAACAGGTAGGTGAGAGGAA
+
+a score=2697 mismap=0.01
+s chr1 62902 978 + 248956422 TCGTTTCTTTCAGGTTGCTTCAGAGTCTTCCCTTCTATCTGATTCAGTGGACCAAGTAAATGACTCTCTGGTAACAGAATTTGTATTACTTGGACTTGCACAATCCTTGGAAATGCAGTTTTTCCTttttctcttcttctcttTATTCTATGTGGGAATTATCCTGGGAAAACTCTTCATTGTGTTCACAGTGATCTTTGATCCTCACTTACACTCCCCCATGTATATTCTGCTGGCCAACCTATCGCTCATTGACTTGAGCCTTTCATCTACCACAGTTCCTAGGTTGATCTACGATCTTTTTACTGATTGTAA---AGTTATTTCCTTCCATAATTGCATGATACAAAAGTTCTTTATCCATGTTATGGGAGGAGTTGAAATGGTGCTGCTGATAGTCATGGCATATGATAGGTACACTGCGATCTGCAAGCCTCTCCACTATCCAACTATTATGAATCCCAAAATGTGCATGTTTTTGGTAGCAGCAGCTTGGGTCATTGGGGTGATTCATGCTATGTCTCAGTTTGTTTTTGTCATAAATTTACCCTTCTGTGGCCCTAATAATGTGGGGAGCTTTTATTGTGATTTTCCTCGGGTTATTAAACTTGCATGCATGGACACTTATGGGCTAGAATTTGTGGTCACTGCCAACAGTGGATTCATATCGATGGGCACCTTCTTTTTCTTAATTGTATCATACATTTTTATTCTGGTCACTGTCCAACGACATTCCTCAAATGATTTATCCAAAGCATTCTTCACTTCGTCGGCTCACATCACCGTAGTGGTTTTGTTTTTTGCTCCATGCATGTTTCTCTACGTGTGGCCTTTCCCTACTAAGTCATTGGATAAATTTTTTGCCATCATGAACTTTGTTGTCACCCCTGTCGTAAATCCTGCCATCTATACTTTAAGGAACAAAGATATGAAGTTTGCAATGAGAAGGCTGAATCAACATATTTTAAATTCTATGGAGAC
+s NC_077227.1 171564230 981 + 760810273 TCTTTACCCTTAGGTCATTTTAGCAACTGTGGAACCAACTGAGTCAATGGATGGAGTAAATAATTCTGTGGTAAATGAGTTTGTCCTTTTGGGTCTCTCTGCTTCTTGGGAGATGAAGATTCTACTcttttttttcttctctgtgttttttgtgGGAATCATGTTTGGAAACctcttcattgtattcactgtgatttttgACCCTCATTTACACTCCCCCATGTATTTCTTGTTGGCCAATCTTTCTCTCATTGACCTGGGTCTGTCTGCTACCACAGTACCAAGGATGATAGTGGACCTTTTCAGTGAATGTAATATAGTCATCTCCTTCCCAGGTTGCATGACACAGATGTTCTTTATCCATGTCATGGGAGGAACTGAGATGGTTCTGCTCATAGTCATGGCATATGACAGATATACAGCAATTTGTAAGCCTCTCCACTATTTGACAATTATGAATCCAAAAATGTGTATTTGCTTTGTAGTGACTGCCTGGATAGTTGGAGTGATCCATGCGATCTCTCAGTTTGTTTTTGTTATAAATTTGCCTTTCTGTGGTCCAAATAAAGTGGAGAGCTTTTATTGTGACTTTCCTCGGGTCATAAAACTTGCGTGCATAGACACCTATAGCCTGGAGTTTGTGGTTATAGCCAATAGTGGTTTTATCTCCATTGGGACTTTTTTCCTATTGATTATCTCCTACATTTTCATCCTGGCCTCTGTTCGACAACATTCTGTCGGTGATTTGTCCAAAGCCTTCTTTACTTTGTCAGCTCACATCACTGTGGTGATTTTATTTTTTGGTCCATGCATGTTTCTTTATGTGTGGCCATTTCCTACATTGTCACTTGATAAATTCTTTGCTATTGTTGACTTTGTTGTCACCCCTGTCTTGAATCCTGCTATCTATACATTAAGGAACAAGGATATGAAGATGGCAATGAGGAGACTGAGCAGCCAGATTGTGAGTTCTAGGGAGAC
+
+a score=601 mismap=0.64
+s chr1 69075 254 + 248956422 GAAACGAATAACTCTATGGTGACTGAATTCATTTTTCTGGGTCTCTCTGATTCTCAGGAACTCCAGACCTTCCTATTTATGTTGTTTTTTGTATTCTATGGAGGAATCGTGTTTGGAAACCTTCTTATTGTCATAACAGTGGTATCTGACTCCCACCTTCACT---CTCCCATGTACTTCCTGCTAGCCAACCTCTCACTCATTGATCTGTCTCTGTCTTCAGTCACAGCCCCCAAGATGATTACTGACTTTTTCAG
+s NC_077227.1 171835366 257 + 760810273 GAGATGAATAACTCTATGGTGAATGAATTCATTTTGTTGGGACTCACCAATTCTTGGGAACTTGAGATTTTCTTTTTtgtggttttctttCTGGCCTATACATTAATCTTGACTGGGAACTTTCTCATTATACTCATGGTGACTCTTGACTCCCATCTGCATTCAACCCCCATGTACTTCCTCCTGGCCAACCTCTCTCTTTTTGACATGTCTCTTTCCACTGTTACTGTCCCCAAAATGATCATAGACTTCTTCAG
+
+# LAST version 1584
+#
+# a=29 b=1 A=28 B=1 e=185 d=116 x=184 y=52 z=184 H=100 E=3.87001
+# R=01 u=2 s=2 S=1 M=0 T=0 m=10 l=1 n=10 C=2 k=1 w=1000 t=5.22312 j=3 Q=0
+# /home/mcfrith/data/genome/mammal/homSap/last/homSap-cMAM4
+# Reference sequences=195 normal letters=2934876451
+# lambda=0.207774 K=0.191089
+#
+# A C G T M S K W R Y B D H V
+# A 4 -6 -2 -6 2 -4 -4 1 2 -6 -5 1 0 1
+# C -6 6 -6 -2 2 3 -3 -4 -6 2 1 -4 1 0
+# G -2 -6 6 -6 -3 3 2 -4 2 -6 0 1 -4 1
+# T -6 -2 -6 4 -4 -4 2 1 -6 2 1 0 1 -5
+# M 2 2 -3 -4 2 0 -4 0 1 -1 -2 -1 0 1
+# S -4 3 3 -4 0 3 0 -4 0 0 1 -1 -1 1
+# K -4 -3 2 2 -4 0 2 0 -1 1 1 0 -1 -2
+# W 1 -4 -4 1 0 -4 0 1 0 0 -1 0 0 -1
+# R 2 -6 2 -6 1 0 -1 0 2 -6 -2 1 -1 1
+# Y -6 2 -6 2 -1 0 1 0 -6 2 1 -1 1 -2
+# B -5 1 0 1 -2 1 1 -1 -2 1 1 -1 0 -1
+# D 1 -4 1 0 -1 -1 0 0 1 -1 -1 0 0 0
+# H 0 1 -4 1 0 -1 -1 0 -1 1 0 0 0 -1
+# V 1 0 1 -5 1 1 -2 -1 1 -2 -1 0 -1 1
+#
+# Coordinates are 0-based. For - strand matches, coordinates
+# in the reverse complement of the 2nd sequence are used.
+#
+# name start alnSize strand seqSize alignment
+#
+# m=1 s=185
+#
+# m=1 s=185
+#
+# Query sequences=13 normal letters=3554259256
+a score=355 mismap=0.63
+s chr1 47567 300 + 248956422 TtttttccTTACTTTCCATCACCAAGTAACTCTTCTGATATTTTTTCTCTTGAGAAAATTAATATGACTCATAGATCTGGTTCCCAAGAGAAATCAATGGAGGCCTGGTTACAAGGATCTAAGAAGCATCAATGGGTCACTAACATCTAGTGGTACTAATTAACTCTGTTAATCATTGGGAAGAAAATGTATATATACTTTTGTCTTGGAGCTGATTCTACTAGAAAGCAGAAATCAAAATGATCAGTTTCCCAGTGT----CACTACTGCACACCCTGGAACAGAACAGGTAGGTCAGAAAAA
+s NC_077227.1 171526596 292 + 760810273 TTCtctctctctgttttATCACTAAGCAACCTTTCTAATGTTTTTCCTCTTGGGAAAACTAGTGTGACTTCAAGAAACAATTACCAAGGGAATTAATCATAAGCTAGGTATTAGAAGTCTAAGAAGCATCATTGGGTCACTGACAGTCAGAGCAACTAAGTATGTTGCTTTCTCTTAGGGAAGAACTTAGTTTAACTTTTTTTCTTTAGAGGTTATTAAATTACATTTCTGAATT------------TTTCTCATTCTTAAATACAATTTCATATTCTGTGATAGAACAGGTAGGTGAGAGGAA
+
+a score=2697 mismap=0.01
+s chr1 62902 978 + 248956422 TCGTTTCTTTCAGGTTGCTTCAGAGTCTTCCCTTCTATCTGATTCAGTGGACCAAGTAAATGACTCTCTGGTAACAGAATTTGTATTACTTGGACTTGCACAATCCTTGGAAATGCAGTTTTTCCTttttctcttcttctcttTATTCTATGTGGGAATTATCCTGGGAAAACTCTTCATTGTGTTCACAGTGATCTTTGATCCTCACTTACACTCCCCCATGTATATTCTGCTGGCCAACCTATCGCTCATTGACTTGAGCCTTTCATCTACCACAGTTCCTAGGTTGATCTACGATCTTTTTACTGATTGTAA---AGTTATTTCCTTCCATAATTGCATGATACAAAAGTTCTTTATCCATGTTATGGGAGGAGTTGAAATGGTGCTGCTGATAGTCATGGCATATGATAGGTACACTGCGATCTGCAAGCCTCTCCACTATCCAACTATTATGAATCCCAAAATGTGCATGTTTTTGGTAGCAGCAGCTTGGGTCATTGGGGTGATTCATGCTATGTCTCAGTTTGTTTTTGTCATAAATTTACCCTTCTGTGGCCCTAATAATGTGGGGAGCTTTTATTGTGATTTTCCTCGGGTTATTAAACTTGCATGCATGGACACTTATGGGCTAGAATTTGTGGTCACTGCCAACAGTGGATTCATATCGATGGGCACCTTCTTTTTCTTAATTGTATCATACATTTTTATTCTGGTCACTGTCCAACGACATTCCTCAAATGATTTATCCAAAGCATTCTTCACTTCGTCGGCTCACATCACCGTAGTGGTTTTGTTTTTTGCTCCATGCATGTTTCTCTACGTGTGGCCTTTCCCTACTAAGTCATTGGATAAATTTTTTGCCATCATGAACTTTGTTGTCACCCCTGTCGTAAATCCTGCCATCTATACTTTAAGGAACAAAGATATGAAGTTTGCAATGAGAAGGCTGAATCAACATATTTTAAATTCTATGGAGAC
+s NC_077227.1 171564230 981 + 760810273 TCTTTACCCTTAGGTCATTTTAGCAACTGTGGAACCAACTGAGTCAATGGATGGAGTAAATAATTCTGTGGTAAATGAGTTTGTCCTTTTGGGTCTCTCTGCTTCTTGGGAGATGAAGATTCTACTcttttttttcttctctgtgttttttgtgGGAATCATGTTTGGAAACctcttcattgtattcactgtgatttttgACCCTCATTTACACTCCCCCATGTATTTCTTGTTGGCCAATCTTTCTCTCATTGACCTGGGTCTGTCTGCTACCACAGTACCAAGGATGATAGTGGACCTTTTCAGTGAATGTAATATAGTCATCTCCTTCCCAGGTTGCATGACACAGATGTTCTTTATCCATGTCATGGGAGGAACTGAGATGGTTCTGCTCATAGTCATGGCATATGACAGATATACAGCAATTTGTAAGCCTCTCCACTATTTGACAATTATGAATCCAAAAATGTGTATTTGCTTTGTAGTGACTGCCTGGATAGTTGGAGTGATCCATGCGATCTCTCAGTTTGTTTTTGTTATAAATTTGCCTTTCTGTGGTCCAAATAAAGTGGAGAGCTTTTATTGTGACTTTCCTCGGGTCATAAAACTTGCGTGCATAGACACCTATAGCCTGGAGTTTGTGGTTATAGCCAATAGTGGTTTTATCTCCATTGGGACTTTTTTCCTATTGATTATCTCCTACATTTTCATCCTGGCCTCTGTTCGACAACATTCTGTCGGTGATTTGTCCAAAGCCTTCTTTACTTTGTCAGCTCACATCACTGTGGTGATTTTATTTTTTGGTCCATGCATGTTTCTTTATGTGTGGCCATTTCCTACATTGTCACTTGATAAATTCTTTGCTATTGTTGACTTTGTTGTCACCCCTGTCTTGAATCCTGCTATCTATACATTAAGGAACAAGGATATGAAGATGGCAATGAGGAGACTGAGCAGCCAGATTGTGAGTTCTAGGGAGAC
+
+a score=9115 mismap=8.94e-10 sense=6.5 acc=AG
+s chr7 121348853 1697 + 159345973 ACAAAGCAAAACCAAGAGTATTTATTTTCATTATTTTAAAAAACTGCAACATTTATTTCACAATCCCTGGAACTGTAAAATAACTATTCTTTTAATTTACAGAATTGAGGGAAGTATTTTAAATGTCAAAATACACATGCTGAATGTGATTTAGCATATAAGCACCAAAAATCCCAAAAGTATACAAA-GATTGTTTAAACATATAAGGTGCATTTATTCatgaattaattaCATTCAGTTTGAAGGAAGAAACAGTACTCCAAAGTTCATCAATATTTAAGAAATTATCCGAATAGCTTCCAACTTGGTTTACAGCTAAATAACAGATACTGCTTTTAGGTTTTTTTATACAATGCATCTAAACTTTAGGTTCGAAATTTAAGTGCTGCCTTACACTGTGAATGGCAGTTTGATCACTAAATTTATGTTCCAGTTTGAGATACTTCATACACAACATATATTTAATGCCCTGAAAACTTCTGCTTAAAAACTGAGTTATTTAGTGAACATCTGAAGGTAACAAAAAAAGATAGATCTTCTATATTTTTATGTAGAATTGGGTATAAAGTCATTTGGGGTCAAACACAATTATTTTTTAAATGTTATCAGTCCGCCCAGTAATAAGAACTTTTGTAAATATGCCACTATAGCTTCGGGCAATAATTGCTATTATGAAGTTAGATAAACACAGGATTAATTTCTTGGTTTACAAATCAAAATTATTGTGATATTAAAGCATGCCTCCTTCCAAACCACTTGGTCAATGCTGATAACAATTAAGAAAAACGTATGCTAGTATTGCTCTAAATATACTGACTTAGCTGTCCTGAATGACCTAATTAGGTTGGGGAGGGGACTTGGCCTGGTTCTCCAGTTCTGTGAAGCTGCCACCTCCAGCCTCACTCTAAATAGCTTATACAAAATCAAGTTATTCCCATAGCTAACACAAATCCAGCAAGCTGTGGTCCTAAATGCCATCTACATTTAtatatttacttttttactgTGTACATTAAGAATTTTGTTGTGCTTTTCACATATCCGATGTATGTTTTCATATTAAACATTCCTCAGTTTCTTGGGAAGGAAAGAAAAAGATCATGAAAAGAGGTACATCAGGAGAGTCGGACTGTAAAGTAACATTACACACACAACTAGCCAAATCTCTTTTTAATTTAAAAGATAATGTCATCACAACGCAACATATAgaaacataaaagaaaataaaGTATCCACCCTAAAATCCCTATTATTCCATGATATTTTCATAGCAACTAGTATatatatcaatatatttttCACAAACCATTCAGTTACACATTGTGTATGCTTTTAGAATTTAAAATATGTATTACCATAGCAGTCTAAACATTGTACTTTTAGGGAAATGTGCGGAGACATGGTTCAGTGATTTGATCATTGTGATGATTATTTACAATGCTATCTATTTACGTTAGCAATAAATAATCCTGATATCAAAAGAGACAATACTCATGCACACACCAAGATGGCTGCATGCTCTTAAAATATTTACACATAGCTCTGCCATTTatagctctcccattaagaGTGAAAGTGCGCTTTCTTCAATTCTC-TCCACATTTCCATTAGTCTTGCTTCTGGGGGATGCATCCTTCCATTTCTACAACTTCAGGCCATCCTTCATATTTGTTTGTATCCTTATTGTTCTTTATGTG
+s 90089 47 1655 - 2478 ACAAAGCAAAACCAAGAGTATTTATTTTCATTATTTTAAAAAGCC-CAACATTTATTTCA--ATCC-TGGAACTGTAAAATAACTATTCTTTTAATTTACAGAATTGAGG-AAGTATTTTAA-TGTCAAAATACACATGCTGAATGTGATTTAGCATATAAGCACCAAAAATCCCAAAAGTATACAAAAGATTGTTTAAACATATAAGGTGCATTTATTCATGAAT-AATTACATTCAGTTTGAAGGAAGAAACAGTACTCCAAAGTTCATCAATATTTAAGAAATTATCCGAATAGCTTCCAACTTGGTTTACAGCTAAATAACAGATACTGCTTTTAGG--TTTTTATACAATGCATCTAAACTTTAGGTTCGAAATTTAAGTGCTGCCTTACACTGTGAATGGCAGTTTGATCACTAAATTTATGTTCCAGTTTGAGATACTTCATACACAACATATATTTAATGCCCTGAAAACTTCTGCTTAAAAACTGAGTTATTTAGTGAACATCTGAAGGTAACAAAAA--GATAGATCTTCTATA-TTTTATGTAGAATTGGGTATAAAGTCATTTGGGGTCAAACACAATTA-TTTTTAAATGTTATCAGTCCGCCCAGTAATAAGAACTTTTGTAAATATGCCACTATAGCTTCGGGCAATAATTGC---TATGAAGTTAGATAAACACAGGATTAATTTCTTGGTTTACAAATCAAAATTATTGTGATATTAAAGCATGCCTCCTTCCAAACCACTTGGTCAATGCTGATAACAATTAAGAAAAACGTA---TAGTATTGCTCTAAATATACTGACTTAGCTGTCCTGAATGACCTAATTAGGTT--GGA-GGGACTTGGCCTGGTTCTCCAGTTC--TGAAGCTGCCACCTCCAGCCTCACTCTAAATAGCTTATACAAAATCAAATTATTCCCATAGTTAACACAAATCCAGCAAGCTGTGGTCCTAAATGCCATCTACATTTATATATTTAC-TTTTTACTGTGTACATTAAGAATTTTGTTGTGCTTTTCACATATCCGATGTATGTTTTCATATTAAACATTCCTCAGTTTCTTGGGAAGGAAAG-AAAAGATCATGAAAAGAGGTACATCAGGAGAGTCGGACTGTAAAGTAACATT--ACACACAACTAGCCAAATCTCTTTTTAATTTAAAAGATAATGTCATCACAACGCAACATatagaaacat-aaagaaaATAAAGTATCCA-CCTAAAATCCCTATTATTCCATGATATTTTCATAGCAACTAGTATatatatcaatatatttttCACAAACCATTCAGTTACACATTGTGTATGC-TTTAGAATTTAAAATATGTATTACCATAGCAGTTCAAACATTGTACTTTTA-GGAAATGT--GGAGACATGGTTCAGTGATTTGATCATTGTGATGATTATTTACAATGCTATCTATTTACGTTAGCAATAAATAATCCTGATATC-AAAGAGACAATACTCATGCACACACCAAGATGGCTGCATG-TTTT-AAATATTTACACATAGCTCTGCCATTTatagctctcccattaagaG-G-AAGTGCGC-TTCTTCAATTCTCTTCCACATTTCCATTAGTCTTGCTTCT-GGGGATGCATCCTTCCATTTCT---GCTTCAGGCCATCCTTCATATTTGTTTGTATCCTTATTGTTCTTTATGTG
+p ',28=DJIHHJNNNNQQRRRLJLRJHHJRRNNRIGGG520/.''%&*=BIOLJLQC@=%"%"36%""7KMNNRRRRJHHJRNNRRRNNQJHHJNNLJLRRRRNNNNG9%""69KRQA><;%""ANPRJHHJRQQPQQRRRRRNNRQQRRLJLRRRQQPQNNRRRNNIGGGIRLJLJHHJROML@=&$""9 at MNRLJLLJLRQQPQNNNNRRRRLJLRNNRRRO?<$#<?LMQRRNNRRRLJLRNNNNNNQLJLRRRRRRRNNLJLRNNRRRRNNQQLJLNNRLJLNNRRNNRNNRRRRNNNNNNRNNNNLJLRRRRRRLJLRNNRRRRRRRRRRJHHJPB;I=~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~1414KMQRRRNNRJGEA!!#$'BIQRRRNNNNLJLRQQLJLRRRRLJLJHHJRRLJLPPPNMLIC!!"$%(?ADRRNNRRRRRRNNRLJLRRRNNRNNRNNRJHHJRRLJLQQRRNNRRRQQRRRNNRLJLRNNRNNNJI9"$&"$&?CLRNNRRRRLJLPPQQNNRNNNNLJLQNNNNLJLRRLJLRRJHHJNNRNNQQRRQQNNLJLRRRRRNNQNNNNNNLJLNNRRNNNNRRNNRRRRRRRNNRNNNNNNQIGGGIRPD=#7?#7FLONNRRQQRLJLPPQQRRRRRNNRRRRRRNNRRNNRRRNNRNNNNRMLF?"%"$+""$':PNNNNNNRNNNNQQNNRRK@="%"%?DQRRRNNRNNRNNRRNNRQRQQQRLJLRRRRNNQQQRJHHJRQ?<:HKOMMLJLRQPK>AMMPPPLJLRNNRRRNNRRRQQNNRNNRLJLRRNNRRRRRRRLJKOOOOOKJKO;!!"$%(DPPQPQQRRRNNNNRNNJHHJRNNQQRRJHHJQQQQQQQNNRRRRRRRRJHHJRRQQNNLJLRRNNNNRRRRLJLQNNLJLNNNNJGF;!!#$';HRRRRRJHHJQQNNRRRRRRRNNPPPQRRNNRRRRRLJLRRNNRQG;!"!"#$%&)8IRRRNNLJLQPPPIGGGINNLJLJHHJRQRNNRRRRRRQQQNNRRRNNRQQPQQRLJLQIB""#&:DEFIRLJLRRRRLG@""%7HGHJRLJLRRNNRNNNNRRRQQQJHHJRRRRRRNNRRRRPONMMMNOPRNMPOOPIGGGIQQQLJLNNRNNRRRNNQQPQQNNQPQQRPN9""$'BNMNLJLJHHJQQQRRRNNRNNRRRRQOF.**>ADQRNNRRRRJHHI9""%468521#0$2EKMRRRNNNNRRRRRRLJLRRRRRNNQQQRRRRNNRLJLRRNNRRRRRRRQRLJLRRRNNRRRNNRLJLRNNRNNRRQQPNB""#&:DFIRNNRRRRRRRRQOOOOOMNNNRRRNNRRRRQH?###++""$'AFHJLQQPQQRRRRQQRRNNRLJLQQRRQPPPKJLRNNMFB?+3("#&:MNNL8""%8CGONNJ=:83##25IKMLJLNNRNNRRRRNNRRNMNB!!#$';OQRRRNNNNNNRLJI@;"%4"%>CPRNNNNRRNNNNRRQQLJLRLJLRRRRNNNNRNNRNNQLJKKF@:5
+p zz{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{o{{{{{{{{{{{{{{ii{{{{U{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{p{{{{{{{{{{{p{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{p{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{ii{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{ii{{{{{{{{{{{{{{{m{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{m{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{aaa{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{ZZZ{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{zzziizzzhzzzzzzzzzzzzzzzzzzzzzzzzziizzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzozzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzozzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzNNzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzvvvvvvvvvvvvvvvvvnttttoooooooolkkkkkkUbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb`_\\\[[[YXXXXXXXXXXXXXXXXXXXXXXXXXXXXWWWWWWVVVVVVVUUUUUTTTTTTTTTTTTTTTTTTTTTTTTSSSSSSSSSSSSSSSRRRRRRRRRRRRRRRRRRRKLKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKJJJJJJJJJJJEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEEEEEE??>>>>>>>>>>>=<;;;;;:::::9999999999999999999998888888888877777777777777777777777777777777766666666664444444444444444444444444333333333333333333
+
+a score=493 mismap=0.014 sense=6.5 acc=GC don=GT
+s chr7 121351142 114 + 159345973 CTGTTCAAAAGGGCTTTTTGTCTTAATGCCCTTCCCACCACAGAAGACCCAGTTGTCTCTAAAACCAAGATTAGTAATAGATGTGCTCCCCAAATCAGCAATGAGCCGCCGTGC
+s 90089 1702 101 - 2478 CTGTTCAAA-GGGCTTTTTA--TTAATGCCCTTCCCACCACAGAAGACCCAGTTGTCTCTAAA-CC-AGATTAGTAATAGATGTGCT-CCCAAATCAGCAATGAGCC-------
+p &,1578&$""57:J9643*'*'~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~1..#$7ENNRRRNNRRRRRQQQG8""$'>?BRRRRRNNOIC,)$''()%+$
+p 3333333310000,,,,,,,,,,,,,,,,,))))))))((('''''''&&&&&&&&%%%%%%%%%$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
+
=====================================
util/plot-score-matrix
=====================================
@@ -0,0 +1,72 @@
+#! /bin/sh
+# Author: Martin C. Frith 2025
+
+progName=$(basename $0)
+
+help="Usage: $progName substitution-matrix-file [output.pdf]
+
+Draw an image of a substitution matrix, for example, last-train
+output. If the matrix is codons versus amino acids, the standard
+genetic code is shown by dots."
+
+if test $# -lt 1 || test $# -gt 2
+then echo "$help" && exit 1
+fi
+
+if test "$1" = --help || test "$1" = -h
+then echo "$help" && exit 0
+fi
+
+matrixFile=$1
+pdf=${2-$matrixFile.pdf}
+
+geneticCode="\
+FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
+TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"
+
+tmp=plot-score-matrix.$$
+
+awk '/^ *[a-zA-Z*]/' "$matrixFile" | sed 's/ [^ ][^ ]*//65' > $tmp.m
+
+lineCount=$(wc -l < $tmp.m)
+
+columnCount=$(head -n1 $tmp.m | wc -w)
+
+for i in $(seq 64)
+do echo "$geneticCode" | cut -c$i | paste -s -
+done |
+ sort -k2 | awk '{print $1, NR}' | sort > $tmp.x
+
+awk 'NR > 1 {print $1, '"$lineCount"' - NR + 1}' $tmp.m | sort |
+ join -o1.2,2.2 $tmp.x - > $tmp.g
+
+las=$(head -n1 $tmp.m | awk '{print (length($1) > 1) ? 2 : 1}')
+
+{
+ cat <<EOF
+colors <- "Heat 2"
+#colors <- "PinkYl"
+#colors <- "OrYel"
+margin <- 1.8
+m <- as.matrix(read.table("$tmp.m", check.names=FALSE))
+m <- m[nrow(m):1,]
+x <- 1:ncol(m)
+y <- 1:nrow(m)
+width <- min(7, 1 + 0.14 * ncol(m))
+pdf("$pdf", width=width, height=width*nrow(m)/ncol(m), pointsize=8)
+par(mar=c(0, margin * ncol(m) / nrow(m), margin, 0))
+par(mgp=c(3, 0.25, 0))
+par(las=$las)
+image(x, y, t(m), col=hcl.colors(12, colors), axes=FALSE, xlab="", ylab="")
+axis(2, y, rownames(m), FALSE, family="mono")
+axis(3, x, colnames(m), FALSE, family="mono")
+EOF
+
+test $columnCount = 64 && cat <<EOF
+points(read.table("$tmp.g"), pch=16)
+EOF
+} | R --vanilla -s
+
+rm $tmp.?
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/9291a16b57d8cbb657c94241213655a60afd80fa
--
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/9291a16b57d8cbb657c94241213655a60afd80fa
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20250902/2f426ab4/attachment-0001.htm>
More information about the debian-med-commit
mailing list