[med-svn] [Git][med-team/last-align][upstream] New upstream version 1296
Nilesh Patra (@nilesh)
gitlab at salsa.debian.org
Sun May 29 14:56:46 BST 2022
Nilesh Patra pushed to branch upstream at Debian Med / last-align
Commits:
a87dec4f by Nilesh Patra at 2022-05-29T19:11:55+05:30
New upstream version 1296
- - - - -
23 changed files:
- bin/maf-cut
- bin/parallel-fasta
- bin/parallel-fastq
- build/seed-doc.sh
- data/RY16-8.seed
- data/RY32-10.seed
- data/RY4-6.seed
- data/RY8-8.seed
- doc/last-cookbook.rst
- doc/last-map-probs.rst
- doc/last-parallel.rst
- doc/last-seeds.rst
- doc/maf-cut.rst
- src/CyclicSubsetSeed.cc
- src/CyclicSubsetSeed.hh
- src/LastalArguments.cc
- src/fileMap.cc
- src/lastdb.cc
- src/makefile
- test/last-test.out
- test/last-test.sh
- + test/maf-cut-test.out
- + test/maf-cut-test.sh
Changes:
=====================================
bin/maf-cut
=====================================
@@ -3,9 +3,10 @@
from __future__ import print_function
+import argparse
+import collections
import gzip
import itertools
-import optparse
import signal
import sys
@@ -56,13 +57,28 @@ def findTheSpecifiedSequence(seqName, mafLines):
return fields
return None
+def addMafData(dataPerSeq, mafLines, alnBeg, alnEnd):
+ for line in mafLines:
+ if line[0] == "s":
+ fields = line.split()
+ seqName = fields[1] # xxx omit theSpecifiedSequence ?
+ strand = fields[4]
+ seqLen = int(fields[5])
+ oldSeq = fields[6]
+ beg = int(fields[2]) + alnBeg - oldSeq.count("-", 0, alnBeg)
+ end = beg + alnEnd - alnBeg - oldSeq.count("-", alnBeg, alnEnd)
+ if strand == "-":
+ beg, end = seqLen - end, seqLen - beg
+ seqData = seqLen, beg, end
+ dataPerSeq[seqName].append(seqData)
+
def cutMafRecords(mafLines, alnBeg, alnEnd):
for line in mafLines:
fields = line.split()
if line[0] == "s":
oldSeq = fields[6]
newSeq = oldSeq[alnBeg:alnEnd]
- beg = int(fields[2]) + alnBeg - oldSeq[:alnBeg].count("-")
+ beg = int(fields[2]) + alnBeg - oldSeq.count("-", 0, alnBeg)
span = len(newSeq) - newSeq.count("-")
yield fields[:2] + [str(beg), str(span)] + fields[4:6] + [newSeq]
elif line[0] == "q":
@@ -82,15 +98,7 @@ def printMafLine(fieldWidths, fields):
formatParams = itertools.chain.from_iterable(zip(fieldWidths, fields))
print("%*s %-*s %*s %*s %*s %*s %*s" % tuple(formatParams))
-def cutOneMaf(cutRange, mafLines):
- seqName, cutBeg, cutEnd = cutRange
- sLineFields = findTheSpecifiedSequence(seqName, mafLines)
- if not sLineFields:
- return
- alnBeg, alnEnd = alignmentRange(cutBeg, cutEnd, sLineFields)
- if alnBeg >= alnEnd:
- return
- mafRecords = list(cutMafRecords(mafLines, alnBeg, alnEnd))
+def printOneMaf(mafRecords):
fieldWidths = list(mafFieldWidths(mafRecords))
for fields in mafRecords:
if fields[0] == "s":
@@ -103,15 +111,75 @@ def cutOneMaf(cutRange, mafLines):
print(" ".join(fields))
print()
-def mafCutOneFile(cutRange, lines):
- mafLines = []
+def nameFromTitleLine(line):
+ return line[1:].split()[0]
+
+def doOneFastaSequence(dataPerSeq, topLine, seqLines):
+ alnData = dataPerSeq.get(nameFromTitleLine(topLine))
+ if alnData:
+ seq = "".join(i.rstrip() for i in seqLines)
+ seqLen = len(seq)
+ alnData = [i for i in alnData if i[0] == seqLen]
+ if alnData:
+ beg = min(i[1] for i in alnData)
+ end = max(i[2] for i in alnData)
+ print(topLine, end="")
+ print(seq[beg:end])
+
+def doOneFastqSequence(dataPerSeq, lines):
+ alnData = dataPerSeq.get(nameFromTitleLine(lines[0]))
+ if alnData:
+ seqLen = len(lines[1].rstrip())
+ alnData = [i for i in alnData if i[0] == seqLen]
+ if alnData:
+ beg = min(i[1] for i in alnData)
+ end = max(i[2] for i in alnData)
+ print(lines[0], end="")
+ print(lines[1][beg:end])
+ print(lines[2], end="")
+ print(lines[3][beg:end])
+
+def printFastaParts(dataPerSeq, topLine, lines):
+ seqLines = []
for line in lines:
- if line.isspace():
- cutOneMaf(cutRange, mafLines)
- mafLines = []
- elif line[0] != "#":
- mafLines.append(line)
- cutOneMaf(cutRange, mafLines)
+ if line[0] == ">":
+ doOneFastaSequence(dataPerSeq, topLine, seqLines)
+ topLine = line
+ seqLines = []
+ else:
+ seqLines.append(line)
+ doOneFastaSequence(dataPerSeq, topLine, seqLines)
+
+def printFastqParts(dataPerSeq, topLine, lines):
+ b = [topLine]
+ for i, x in enumerate(lines):
+ b.append(x)
+ if i % 4 == 2:
+ doOneFastqSequence(dataPerSeq, b)
+ b = []
+
+def printSequenceParts(dataPerSeq, lines):
+ for line in lines:
+ if line[0] == ">":
+ printFastaParts(dataPerSeq, line, lines)
+ elif line[0] == "@":
+ printFastqParts(dataPerSeq, line, lines)
+ else:
+ raise RuntimeError("can't read the sequence data")
+ break
+
+def cutOneMaf(seqName, cutBeg, cutEnd, dataPerSeq, mafLines):
+ sLineFields = findTheSpecifiedSequence(seqName, mafLines)
+ if not sLineFields:
+ return
+ alnBeg, alnEnd = alignmentRange(cutBeg, cutEnd, sLineFields)
+ if alnBeg >= alnEnd:
+ return
+ if dataPerSeq is None:
+ mafRecords = list(cutMafRecords(mafLines, alnBeg, alnEnd))
+ printOneMaf(mafRecords)
+ else:
+ addMafData(dataPerSeq, mafLines, alnBeg, alnEnd)
def wantedRange(cutSpecification):
seqName = None
@@ -120,20 +188,32 @@ def wantedRange(cutSpecification):
beg, end = cutSpecification.rsplit("-", 1)
return seqName, int(beg), int(end)
-def mafCut(opts, args):
- cutRange = wantedRange(args[0])
- if len(args) > 1:
- for i in args[1:]:
- mafCutOneFile(cutRange, myOpen(i))
- else:
- mafCutOneFile(cutRange, sys.stdin)
+def mafCut(args):
+ seqName, cutBeg, cutEnd = wantedRange(args.segment)
+ dataPerSeq = collections.defaultdict(list) if args.sequences else None
+
+ mafLines = []
+ for line in myOpen(args.alignments):
+ if line.isspace():
+ cutOneMaf(seqName, cutBeg, cutEnd, dataPerSeq, mafLines)
+ mafLines = []
+ elif line[0] != "#":
+ mafLines.append(line)
+ cutOneMaf(seqName, cutBeg, cutEnd, dataPerSeq, mafLines)
+
+ if dataPerSeq:
+ for fileName in args.sequences:
+ printSequenceParts(dataPerSeq, myOpen(fileName))
if __name__ == "__main__":
signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message
- usage = "%prog chrN:start-end alignments.maf"
description = "Get parts of MAF-format alignments."
- op = optparse.OptionParser(usage=usage, description=description)
- opts, args = op.parse_args()
- if not args:
- op.error("please give me a cut specification and MAF alignments")
- mafCut(opts, args)
+ ap = argparse.ArgumentParser(description=description)
+ ap.add_argument("segment",
+ help="zero-based sequence range, e.g. chr7:123400-123500")
+ ap.add_argument("alignments", nargs="?", default="-",
+ help="file of sequence alignments in MAF format")
+ ap.add_argument("sequences", nargs="*", default=None, # xxx need default?
+ help="file of sequences in fasta or fastq format")
+ args = ap.parse_args()
+ mafCut(args)
=====================================
bin/parallel-fasta
=====================================
@@ -1,8 +1,3 @@
#! /bin/sh
-parallel --gnu --version > /dev/null || exit 1
-
-parallel --gnu --minversion 20130222 > /dev/null ||
-echo $(basename $0): warning: old version of parallel, might be slow 1>&2
-
-exec parallel --gnu --pipe --recstart '>' "$@"
+exec parallel --round --recstart '>' "$@"
=====================================
bin/parallel-fastq
=====================================
@@ -1,9 +1,3 @@
#! /bin/sh
-parallel --gnu --version > /dev/null || exit 1
-
-parallel --gnu --minversion 20130222 > /dev/null ||
-echo $(basename $0): warning: old version of parallel, might be slow 1>&2
-
-# use a record size of 8 lines, so that paired sequences stay together:
-exec parallel --gnu --pipe -L8 "$@"
+exec parallel --round -L8 "$@"
=====================================
build/seed-doc.sh
=====================================
@@ -56,14 +56,14 @@ do
grep '^# ' $i | cut -d' ' -f2-
echo It uses this seed alphabet::
echo
- awk '!/^#/ && NF > 1' $i | sed 's/^/ /'
+ awk '!/^#/ && NF > 1 && length($1) == 1' $i | sed 's/^/ /'
echo
- if [ $(awk 'NF == 1' $i | wc -l) = 1 ]
+ if [ $(awk '!/^#/ && length($1) > 1 || NF == 1' $i | wc -w) = 1 ]
then echo And this pattern::
else echo And these patterns::
fi
echo
- awk 'NF == 1' $i | sed 's/^/ /'
+ awk '!/^#/ && length($1) > 1 || NF == 1' $i | sed 's/^/ /'
echo
grep -q '^#lastdb' $i && {
echo It sets this lastdb default:
=====================================
data/RY16-8.seed
=====================================
@@ -3,7 +3,7 @@
#abbreviation RY16
-#lastal -r6 -q18 -a21 -b9
+#lastal -m2 -r6 -q18 -a21 -b9
R A G
Y C T
=====================================
data/RY32-10.seed
=====================================
@@ -3,7 +3,7 @@
#abbreviation RY32
-#lastal -r6 -q18 -a21 -b9
+#lastal -m2 -r6 -q18 -a21 -b9
R A G
Y C T
=====================================
data/RY4-6.seed
=====================================
@@ -4,7 +4,7 @@
#abbreviation RY4
-#lastal -r6 -q18 -a21 -b9
+#lastal -m2 -r6 -q18 -a21 -b9
R A G
Y C T
=====================================
data/RY8-8.seed
=====================================
@@ -3,7 +3,7 @@
#abbreviation RY8
-#lastal -r6 -q18 -a21 -b9
+#lastal -m2 -r6 -q18 -a21 -b9
R A G
Y C T
=====================================
doc/last-cookbook.rst
=====================================
@@ -145,47 +145,53 @@ you expect the rates to be roughly the same.
Aligning high-indel-error long DNA reads to a genome
----------------------------------------------------
-Suppose we have DNA reads in either FASTA or FASTQ_ format. This is
-sensitive but slow::
+Suppose we have DNA reads in either FASTA or FASTQ_ format. We can
+align them to a genome like this::
- lastdb -P8 -uNEAR mydb genome.fa
+ lastdb -P8 -uRY4 mydb genome.fa
last-train -P8 -Q0 mydb reads.fastq > reads.train
lastal -P8 -p reads.train mydb reads.fastq | last-split > out.maf
``-P8`` makes it faster by running 8 parallel threads, adjust as
appropriate for your computer. This has no effect on the results.
-``-uNEAR`` selects a `seeding scheme`_ that's better at finding
-alignments with few substitutions and/or many gaps.
+``-uRY4`` selects a `seeding scheme`_ that reduces the run time and
+memory use, but also reduces sensitivity.
``-Q0`` makes it discard the fastq_ quality information (or you can
keep-but-ignore it with ``-Qkeep``).
-last-split_ finds a unique best alignment for each part of each read.
-It gives each alignment a `mismap probability`_, which is high if that
-part of the read is almost equally similar to several parts of the
-genome.
+last-split_ cuts the output of ``lastal`` down to a unique best
+alignment for each part of each read. It gives each alignment a
+`mismap probability`_, which is high if that part of the read is
+almost equally similar to several parts of the genome.
Here we didn't suppress alignments caused by simple sequence (like
``cacacacacacacacacacacaca``), so as not to hide anything from
last-split_. You can discard such alignments with last-postmask_
(though they may help to explain each part of a DNA read).
-You can go faster by sacrificing a bit of sensitivity. It depends on
-your aim, e.g. slow-and-sensitive seems necessary to find intricate
-rearrangements of repeats. Suggested ways to go faster:
+To make it more sensitive but slow, replace ``RY4`` with ``NEAR``:
+good for smaller data. (``-uNEAR`` is suitable for finding alignments
+with few substitutions and/or many gaps.)
-* `Mask repeats`_. This has often worked well.
+Aligning low-error long DNA reads to a genome
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We can do this the same way as for high-error reads, but perhaps
+accelerate more aggressively. ``RY8`` reduces the run time and memory
+use even more than ``RY4``. (This is because ``RY8`` uses ~1/8 of the
+seeds, i.e. initial matches, whereas ``RY4`` uses ~1/4). ``RY16`` is
+faster still, and ``RY32`` is the fastest of these options.
-* Add lastal_ option ``-k8`` (or ``-k16`` etc). This makes it faster,
- by only finding initial matches starting at every 8th (or 16th etc)
- position in the reads.
+Also, lastal_ option ``-C2`` may reduce run time with little effect on
+accuracy.
-* Replace ``-uNEAR`` with ``-uRY32`` (or ``-uRY16``, ``-uRY8``,
- ``-uRY4``). This makes it check for initial matches starting at
- only ~1/32 (or ~1/16 etc) of positions, in both reads and genome.
- Compared to ``-k``: this harms sensitivity slightly more, but
- reduces memory use and makes lastdb faster.
+Aligning potentially-spliced RNA or cDNA long reads to a genome
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+See here_. (For low-error reads, you can probably omit ``-d90`` and
+``-m20``.)
Which genome version to use?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -198,22 +204,7 @@ like "analysis set".
You can use multiple genomes, which will be treated like one big
genome::
- lastdb -P8 -uNEAR mydb human.fa virus.fa other-genomes.fa
-
-Aligning low-error long DNA reads to a genome
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-We can do this the same way as for high-error reads, but perhaps
-accelerate more aggressively (e.g. ``-uRY32``).
-
-If repeats are not masked, lastal_ option ``-C2`` may reduce run time
-with little effect on accuracy.
-
-Aligning potentially-spliced RNA or cDNA long reads to a genome
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-See here_. (For low-error reads, you can probably omit ``-d90`` and
-``-m20``.)
+ lastdb -P8 -uRY4 mydb human.fa virus.fa other-genomes.fa
Aligning Illumina DNA reads to a genome
---------------------------------------
@@ -235,7 +226,7 @@ didn't use this option.)
This recipe may be excessively slow-and-sensitive. Adding lastal_
option ``-C2`` may make it faster with negligible accuracy loss. You
-can accelerate with e.g. ``-uRY16`` or ``-k16`` as above.
+can accelerate with e.g. ``-uRY16`` as above.
Finding very short DNA alignments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -306,7 +297,7 @@ Finally, we can make a dotplot_::
last-dotplot humchi2.maf humchi2.png
**To go faster** with minor accuracy loss: replace ``-uNEAR`` with
-``-uRY32`` and/or `mask repeats`_.
+``-uRY32``.
To squeeze out the last 0.000...1% of accuracy: add ``-m50`` to the
lastal_ options.
@@ -328,7 +319,7 @@ consider using 5-byte LAST (lastdb5_ and lastal5_). Ordinary (4-byte)
LAST can't handle so much sequence at once, so lastdb_ splits it into
"volumes", which may be inefficient. 5-byte LAST avoids voluming, but
uses more memory. So lastdb5_ works well with a memory-reducing
-option: ``-uRY`` or ``-w`` or ``-W``.
+option such as ``-uRY`` or ``-w``.
Moar faster
-----------
=====================================
doc/last-map-probs.rst
=====================================
@@ -49,16 +49,6 @@ Details
(e.g. because there are no lines starting with "# batch"), it might
need too much memory.
-Using multiple CPUs
--------------------
-
-This will run the pipeline on all your CPU cores::
-
- parallel-fastq "lastal -Q1 -D1000 hu | last-map-probs" < reads.fastq > myalns.maf
-
-It requires GNU parallel to be installed
-(http://www.gnu.org/software/parallel/).
-
Limitations
-----------
=====================================
doc/last-parallel.rst
=====================================
@@ -2,7 +2,7 @@ Running LAST in parallel
========================
You can make LAST faster by running it on multiple CPUs / cores. The
-easiest way is with lastal's -P option::
+easiest way is with lastal_'s -P option::
lastal -P4 my-index queries.fasta > out.maf
@@ -15,7 +15,7 @@ This works by aligning different query sequences in different threads
Dealing with very long query sequences
--------------------------------------
-lastal aligns one "batch" of queries at a time, so if the batch has
+lastal_ aligns one "batch" of queries at a time, so if the batch has
only one query you won't get any parallelization. This can be fixed
by increasing the batch size, with option -i::
@@ -31,50 +31,52 @@ If you have a multi-command "pipeline", such as::
lastal -P4 my-index queries.fasta | last-split > out.maf
-then the -P option may help, because lastal is often the slowest step,
-but it would be nice to parallelize the whole thing. Unfortunately,
-last-split doesn't have a -P option, and even if it did, the pipe
-between the commands would become a bottleneck.
+then the ``-P`` option may help, because lastal is often the slowest
+step, but it would be nice to parallelize the whole thing.
-You can use parallel-fasta and parallel-fastq (which accompany LAST,
-but require `GNU parallel`_ to be installed). These commands read
-sequence data, split it into blocks (with a whole number of sequences
-per block), and run the blocks in parallel through any command or
-pipeline you specify, using all your CPU cores. Here are some
-examples.
+You can use ``parallel-fasta`` and ``parallel-fastq``, which accompany
+LAST, but require `GNU Parallel`_ to be installed.
Instead of this::
- lastal mydb queries.fa > myalns.maf
+ lastal -P8 -p my.train mydb seqs.fasta | last-split > out.maf
try this::
- parallel-fasta "lastal mydb" < queries.fa > myalns.maf
+ parallel-fasta -j8 "lastal -p my.train mydb | last-split" < seqs.fasta > out.maf
Instead of this::
- lastal -Q1 db q.fastq | last-split > out.maf
+ lastal -P8 -p my.train mydb seqs.fastq.gz | last-split | gzip > out.maf.gz
try this::
- parallel-fastq "lastal -Q1 db | last-split" < q.fastq > out.maf
+ zcat seqs.fastq.gz | parallel-fastq -j8 "lastal -p my.train mydb | last-split | gzip" > out.maf.gz
-Instead of this::
- bzcat queries.fa.bz2 | lastal mydb > myalns.maf
+Notes:
-try this::
+* ``parallel-fasta`` and ``parallel-fastq`` simply run GNU Parallel
+ with a few options for fasta or fastq: you can specify other GNU
+ Parallel options (like ``-j8``).
- bzcat queries.fa.bz2 | parallel-fasta "lastal mydb" > myalns.maf
+* The ``gzip`` example above runs ``gzip`` several times in parallel,
+ and concatenates the results. Fortunately, that is valid ``gz``
+ format.
-Notes:
+* ``lastal`` reads the ``mydb`` files into shared memory, so this
+ memory use does not get mutiplied 8 times.
-* parallel-fasta and parallel-fastq simply execute GNU parallel with a
- few options for fasta or fastq: you can specify other GNU parallel
- options to control the number of simultaneous jobs, use remote
- computers, get the output in the same order as the input, etc.
+* ``parallel-fastq`` assumes that each fastq record is 4 lines, so
+ there should be no line wrapping or blank lines. (Actually, it
+ assumes one "record" is 8 lines, to keep paired reads together.)
-* parallel-fastq assumes that each fastq record is 4 lines, so there
- should be no line wrapping or blank lines.
+* In the current version, ``parallel-fasta`` and ``parallel-fastq``
+ use GNU Parallel's ``--round`` option. This avoids the overhead of
+ restarting the command for each input chunk, but it prevents keeping
+ the order of sequences, and stashes the output in large temporary
+ files. If necessary, use ``$TMPDIR`` or ``--tmpdir`` to choose a
+ temp directory with enough space.
+.. _lastal: doc/lastal.rst
.. _GNU parallel: http://www.gnu.org/software/parallel/
=====================================
doc/last-seeds.rst
=====================================
@@ -206,7 +206,7 @@ And these patterns::
YYYYRY
It sets this lastal default:
--r6 -q18 -a21 -b9
+-m2 -r6 -q18 -a21 -b9
RY8-8 (abbreviation: RY8)
-------------------------
@@ -254,7 +254,7 @@ And these patterns::
YYYYYYYR
It sets this lastal default:
--r6 -q18 -a21 -b9
+-m2 -r6 -q18 -a21 -b9
RY16-8 (abbreviation: RY16)
---------------------------
@@ -286,7 +286,7 @@ And these patterns::
YYRRYRYY
It sets this lastal default:
--r6 -q18 -a21 -b9
+-m2 -r6 -q18 -a21 -b9
RY32-10 (abbreviation: RY32)
----------------------------
@@ -334,5 +334,5 @@ And these patterns::
YYYYYYRRRY
It sets this lastal default:
--r6 -q18 -a21 -b9
+-m2 -r6 -q18 -a21 -b9
=====================================
doc/maf-cut.rst
=====================================
@@ -17,4 +17,14 @@ You can omit the sequence name::
Then, the range applies to the topmost sequence in each alignment.
+``maf-cut`` can also get parts of sequences::
+
+ maf-cut chr7:1234000-1235000 alignments.maf sequences.fasta
+
+This first applies ``maf-cut`` to ``alignments.maf`` in the usual way
+(without printing anything). For each sequence, it gets the minimum
+start coordinate and maximum end coordinate in the resulting
+alignments. It prints this segment of each sequence in
+``sequences.fasta``. (fastq format is ok too.)
+
.. _MAF: http://genome.ucsc.edu/FAQ/FAQformat.html#format5
=====================================
src/CyclicSubsetSeed.cc
=====================================
@@ -86,47 +86,50 @@ n ACGT\n\
" + patterns;
}
-bool CyclicSubsetSeed::nextPattern( std::istream& in,
- std::vector< std::string >& seedAlphabet,
- std::string& pattern ){
- while ( getline( in, pattern ) ){
- std::istringstream iss( pattern );
- std::string x, y;
- iss >> x;
- if( x.empty() || x[0] == '#' ) continue;
- iss >> y;
- if( y.empty() ) return true;
- if( x.size() > 1 ) ERR( "bad seed line: " + pattern );
- seedAlphabet.push_back( pattern );
- }
- return false;
+static int lineType(const std::string &line) {
+ std::istringstream s(line);
+ std::string x, y;
+ s >> x >> y;
+ return (x.empty() || x[0] == '#') ? 0 // blank or comment line
+ : (x.size() == 1 && !y.empty()) ? 1 // seed-alphabet line
+ : 2; // seed-pattern line
}
-static size_t findSeedLetter( const std::vector< std::string >& seedAlphabet,
- char seedLetter ){
+static const char *letterGroups(const std::vector<std::string> &seedAlphabet,
+ char seedLetter) {
// go backwards, so that newer definitions override older ones:
- for( size_t j = seedAlphabet.size(); j > 0; ){
- --j;
- const std::string& s = seedAlphabet[j];
- assert( !s.empty() );
- if( s[0] == seedLetter ) return j;
+ for (size_t i = seedAlphabet.size(); i-- > 0; ) {
+ const char *s = seedAlphabet[i].c_str();
+ if (s[0] == seedLetter) return s + 2;
}
- ERR( "unknown symbol in seed pattern: " + stringify(seedLetter) );
+ ERR("unknown symbol in seed pattern: " + stringify(seedLetter));
}
-void CyclicSubsetSeed::init( const std::vector< std::string >& seedAlphabet,
- const std::string& pattern,
- bool isMaskLowercase,
- const uchar letterCode[],
- const std::string &mainSequenceAlphabet ){
- clear();
- for( size_t i = 0; i < pattern.size(); ++i ){
- char seedLetter = pattern[i];
- if( std::isspace(seedLetter) ) continue;
- size_t j = findSeedLetter( seedAlphabet, seedLetter );
- std::istringstream iss( seedAlphabet[j] );
- iss >> seedLetter;
- appendPosition( iss, isMaskLowercase, letterCode, mainSequenceAlphabet );
+void CyclicSubsetSeed::addPatterns(std::vector<CyclicSubsetSeed> &patterns,
+ const std::string &text,
+ bool isMaskLowercase,
+ const uchar letterCode[],
+ const std::string &mainSequenceAlphabet) {
+ std::vector<std::string> seedAlphabet;
+ std::string line, word;
+ std::istringstream textStream(text);
+
+ while (getline(textStream, line)) {
+ int type = lineType(line);
+ if (type == 1) {
+ seedAlphabet.push_back(line);
+ } else if (type == 2) {
+ std::istringstream lineStream(line);
+ while (lineStream >> word) {
+ CyclicSubsetSeed pat;
+ for (size_t i = 0; i < word.size(); ++i) {
+ std::istringstream s(letterGroups(seedAlphabet, word[i]));
+ pat.appendPosition(s, isMaskLowercase, letterCode,
+ mainSequenceAlphabet);
+ }
+ patterns.push_back(pat);
+ }
+ }
}
}
=====================================
src/CyclicSubsetSeed.hh
=====================================
@@ -72,21 +72,16 @@ public:
// codes. Finally, "@" allows any match or transition.
static std::string stringFromDnaPatterns(std::string patterns);
- // Reads lines from "in" until it finds a pattern line. Any seed
- // alphabet lines are appended to "seedAlphabet". If it finds a
- // pattern line, it stores it in "pattern" and returns true, else it
- // returns false. Blank lines and comment lines starting with # are
- // ignored.
- static bool nextPattern( std::istream& in,
- std::vector< std::string >& seedAlphabet,
- std::string& pattern );
-
- void clear() {
- subsetLists.clear();
- subsetMaps.clear();
- originalSubsetMaps.clear();
- numOfSubsetsPerPosition.clear();
- }
+ // Read subset-seed patterns from text and add them to patterns.
+ // The text should start with lines defining a seed alphabet,
+ // followed by lines with seed patterns. Blank lines and comment
+ // lines starting with # are ignored. Seed letters are
+ // case-sensitive.
+ static void addPatterns(std::vector<CyclicSubsetSeed> &patterns,
+ const std::string &text,
+ bool isMaskLowercase,
+ const uchar letterCode[],
+ const std::string &mainSequenceAlphabet);
void swap( CyclicSubsetSeed& x ){
subsetLists.swap( x.subsetLists );
@@ -95,13 +90,6 @@ public:
numOfSubsetsPerPosition.swap( x.numOfSubsetsPerPosition );
}
- // Seed letters are case-sensitive.
- void init( const std::vector< std::string >& seedAlphabet,
- const std::string& pattern,
- bool isMaskLowercase,
- const uchar letterCode[],
- const std::string& mainSequenceAlphabet );
-
// "inputLine" should be a grouping of sequence letters.
void appendPosition( std::istream& inputLine,
bool isMaskLowercase,
=====================================
src/LastalArguments.cc
=====================================
@@ -172,7 +172,7 @@ Miscellaneous options (default settings):\n\
-C: omit gapless alignments in >= C others with > score-per-length (off)\n\
-P: number of parallel threads ("
+ stringify(numOfThreads) + ")\n\
--i: query batch size (8 KiB, unless there is > 1 thread or lastdb volume)\n\
+-i: query batch size (64M if multi-volume, else 32M if multi-thread, else 8K)\n\
-M: find minimum-difference alignments (faster but cruder)\n\
-T: type of alignment: 0=local, 1=overlap ("
+ stringify(globality) + ")\n\
@@ -524,7 +524,10 @@ void LastalArguments::setDefaultsFromAlphabet( bool isDna, bool isProtein,
else if( outputType == 0 )
batchSize = 0x1000000; // 16 Mbytes
else if( !isVolumes )
- batchSize = 0x800000; // 8 Mbytes
+ batchSize = 0x2000000; // 32 Mbytes
+ // 32 Mbytes gave quite good load-balancing when aligning human
+ // nanopore reads with 16 threads. Sometimes 64M (and even 128M)
+ // was better. With WindowMasker, smaller batches were fine.
else if( inputFormat == sequenceFormat::prb )
batchSize = 0x2000000; // 32 Mbytes (?)
else
=====================================
src/fileMap.cc
=====================================
@@ -25,8 +25,7 @@ static void err( const std::string& s ) {
// random access can be horribly slow (at least on two Linux 2.6
// systems).
static void primeMemory( void* begin, size_t bytes ){
- // use "static" to stop the compiler optimizing the whole function away:
- static unsigned z = 0;
+ unsigned z = 0;
size_t stepSize = 1024;
const char* x = static_cast<char*>(begin);
const char* y = x + (bytes / stepSize) * stepSize;
@@ -34,6 +33,8 @@ static void primeMemory( void* begin, size_t bytes ){
z += *x;
x += stepSize;
}
+ volatile unsigned dontOptimizeMeAway = z;
+ dontOptimizeMeAway = dontOptimizeMeAway; // ??? prevents compiler warning
}
namespace cbrc{
=====================================
src/lastdb.cc
=====================================
@@ -51,49 +51,36 @@ bool isDubiousDna( const Alphabet& alph, const MultiSequence& multi ){
else return false;
}
-static void addSeeds( std::vector< CyclicSubsetSeed >& seeds,
- const std::string& seedText,
- const LastdbArguments& args, const Alphabet& alph ){
- std::istringstream iss( seedText );
- std::vector< std::string > seedAlphabet;
- std::string pattern;
- while( CyclicSubsetSeed::nextPattern( iss, seedAlphabet, pattern ) ){
- CyclicSubsetSeed s;
- s.init(seedAlphabet, pattern, args.isCaseSensitive, alph.encode,
- alph.letters);
- seeds.push_back(s);
- }
-}
-
// Set up the seed pattern(s)
static void makeSubsetSeeds( std::vector< CyclicSubsetSeed >& seeds,
const std::string& seedText,
const LastdbArguments& args,
const Alphabet& alph ){
const std::string& a = alph.letters;
+ bool isCaseSens = args.isCaseSensitive;
if( !args.subsetSeedFile.empty() ){
- addSeeds( seeds, seedText, args, alph );
+ CyclicSubsetSeed::addPatterns(seeds, seedText, isCaseSens, alph.encode, a);
}
else if (!args.dnaSeedPatterns.empty()) {
for (size_t x = 0; x < args.dnaSeedPatterns.size(); ++x) {
const std::string &p = args.dnaSeedPatterns[x];
std::string s = CyclicSubsetSeed::stringFromDnaPatterns(p);
- addSeeds(seeds, s, args, alph);
+ CyclicSubsetSeed::addPatterns(seeds, s, isCaseSens, alph.encode, a);
}
}
else if( !args.seedPatterns.empty() ){
for( unsigned x = 0; x < args.seedPatterns.size(); ++x ){
const std::string& p = args.seedPatterns[x];
std::string s = CyclicSubsetSeed::stringFromPatterns( p, a );
- addSeeds( seeds, s, args, alph );
+ CyclicSubsetSeed::addPatterns(seeds, s, isCaseSens, alph.encode, a);
}
}
else{
std::string s = (alph.letters == alph.dna)
? CyclicSubsetSeed::stringFromName( "YASS" )
: CyclicSubsetSeed::stringFromPatterns( "1", a );
- addSeeds( seeds, s, args, alph );
+ CyclicSubsetSeed::addPatterns(seeds, s, isCaseSens, alph.encode, a);
}
if( seeds.empty() ) ERR( "no seed patterns" );
=====================================
src/makefile
=====================================
@@ -143,7 +143,7 @@ ScoreMatrixData.hh: ../data/*.mat
../build/mat-inc.sh ../data/*.mat > $@
VERSION1 = git describe --dirty
-VERSION2 = echo ' (HEAD -> main, tag: 1282) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
+VERSION2 = echo ' (HEAD -> main, tag: 1296) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
VERSION = \"`test -e ../.git && $(VERSION1) || $(VERSION2)`\"
=====================================
test/last-test.out
=====================================
@@ -5235,8 +5235,8 @@ s LTR22C#LTR/ERVK 0 505 + 509 TGTAGGGGNTCGGTCAGGGTGGTGGGAGAAATTGTAAAAATAAATTATA
# Query sequences=1 normal letters=485
#
-# a=21 b=4 A=21 B=4 e=114 d=70 x=113 y=49 z=113 D=1e+06 E=2.92797e+07
-# R=01 u=0 s=2 S=0 M=0 T=0 m=10 l=1 n=10 k=1 w=1000 t=4.88281 j=3 Q=0
+# a=21 b=4 A=21 B=4 e=114 d=77 x=113 y=49 z=113 D=1e+06 E=2.92797e+07
+# R=01 u=0 s=2 S=0 M=0 T=0 m=2 l=1 n=2 k=1 w=1000 t=4.88281 j=3 Q=0
# /tmp/last-test
# Reference sequences=1 normal letters=16571
# lambda=0.201859 K=0.288737
@@ -5275,7 +5275,6 @@ s LTR22C#LTR/ERVK 0 505 + 509 TGTAGGGGNTCGGTCAGGGTGGTGGGAGAAATTGTAAAAATAAATTATA
644 chrM 1742 311 + 16571 chrM 2432 326 + 16775 18,1:0,2,0:2,17,0:4,25,0:4,11,0:2,57,1:0,42,0:1,24,0:3,83,0:4,6,3:0,21 EG2=1e-39 E=5.5e-49
627 chrM 656 291 + 16571 chrM 1303 296 + 16775 78,0:3,35,0:2,6,2:0,20,0:1,19,0:1,55,1:0,57,0:1,18 EG2=3.1e-38 E=1.7e-47
587 chrM 5208 516 + 16571 chrM 5978 522 + 16775 74,0:2,17,2:0,88,0:3,19,3:0,5,5:0,14,0:5,6,3:0,21,0:3,14,1:0,12,2:0,29,0:1,6,0:4,30,0:2,9,0:1,13,1:0,73,0:1,2,0:1,67 EG2=1e-34 E=5.5e-44
-300 chrM 10642 148 + 16571 chrM 11367 148 + 16775 68,1:0,9,0:1,70 EG2=1.4e-09 E=8e-19
288 chrM 8638 216 + 16571 chrM 9353 216 + 16775 216 EG2=1.6e-08 E=9e-18
210 chrM 10518 77 + 16571 chrM 11243 77 + 16775 77 EG2=0.11 E=6.2e-11
154 chrM 14425 84 + 16571 chrM 16456 84 + 16775 84 EG2=9.1e+03 E=5e-06
@@ -5320,18 +5319,3 @@ p $&'~~~~~~~~~~~~(.0233.,,,,,+--&)*+,,,-.-+++++---)'%(+
c 20.6 0.00414 1.97 1.05 1.13 4.98 0.158 2.83 0.966 0.0011 9.56 0.997 0.0153 0.0037 1.07 6.54 51.8 16.4 0.361 3 0.27
# Query sequences=1 normal letters=54
-a score=192 mismap=0.117
-s KI635862 2418195 86 + 12728579 CACAGGCCGCTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTC-CGAGCAGCCGG-ATAACAGGCGCGCGCCAC
-s chr1 248803921 88 - 248956422 CATAGCTCACTGCAGCCTTGACCTCCTAGGCTAAAGCAATCCTCCCACCTTAGCCTCTCCAGTAGCTGGAACTACAGGCATGCATCAC
-p #$$%&&'(()******************************************************************************
-
-a score=330 mismap=0.75
-s KI636063 409336 105 + 789923 AAAATATGGAACGCTTCACGAATTTGCGTGTCATCCTTGCGCAGGGGCCATGCTAATCTTCTCTGTATCGTTCCAATTTTAGTATATGTGCTGCCGAAGCGAGCA
-s chr1 157784 102 + 248956422 aaaataTGGAATGCTTCACAAATTTGCATGTCATTCTTTCACAGAGGCCGTGCCAA---TCTCTCTATTGTTCCAACTTAAGTATGTGTGCTACTGAGGCAAGCA
-p !"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""!
-
-a score=203 mismap=0.0127
-s KI635862 2418239 48 + 12728579 CAGCCTCAGCCTC-CGAGCAGCC-GGATAACAGGCGCGCGCCACCGCGCC
-s chr1 248798136 50 - 248956422 CTGCCTCAGCCTCACAAGTAGCTGGGACTACAGGTGCTTGTCACCACACC
-p 33333333333333333333333333333333333322211110.*)'&$
-
=====================================
test/last-test.sh
=====================================
@@ -245,8 +245,6 @@ trap 'rm -f $db*' EXIT
# tricky Forward-Backward bug that happened once
lastdb $db alli.fa
lastal -j7 -r5 -q5 -a15 -b3 $db huma.fa
-
- maf-cut chr1:152413-158286 split1.maf
} 2>&1 |
grep -v version | diff -u last-test.out -
@@ -261,6 +259,7 @@ rm f.* r.*
./last-split-test.sh
./last-train-test.sh
./maf-convert-test.sh
+./maf-cut-test.sh
./maf-swap-test.sh
# Test: lastdb, lastal, last-split, maf-sort, maf-join
=====================================
test/maf-cut-test.out
=====================================
@@ -0,0 +1,33 @@
+a score=192 mismap=0.117
+s KI635862 2418195 86 + 12728579 CACAGGCCGCTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTC-CGAGCAGCCGG-ATAACAGGCGCGCGCCAC
+s chr1 248803921 88 - 248956422 CATAGCTCACTGCAGCCTTGACCTCCTAGGCTAAAGCAATCCTCCCACCTTAGCCTCTCCAGTAGCTGGAACTACAGGCATGCATCAC
+p #$$%&&'(()******************************************************************************
+
+a score=330 mismap=0.75
+s KI636063 409336 105 + 789923 AAAATATGGAACGCTTCACGAATTTGCGTGTCATCCTTGCGCAGGGGCCATGCTAATCTTCTCTGTATCGTTCCAATTTTAGTATATGTGCTGCCGAAGCGAGCA
+s chr1 157784 102 + 248956422 aaaataTGGAATGCTTCACAAATTTGCATGTCATTCTTTCACAGAGGCCGTGCCAA---TCTCTCTATTGTTCCAACTTAAGTATGTGTGCTACTGAGGCAAGCA
+p !"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""!
+
+a score=203 mismap=0.0127
+s KI635862 2418239 48 + 12728579 CAGCCTCAGCCTC-CGAGCAGCC-GGATAACAGGCGCGCGCCACCGCGCC
+s chr1 248798136 50 - 248956422 CTGCCTCAGCCTCACAAGTAGCTGGGACTACAGGTGCTTGTCACCACACC
+p 33333333333333333333333333333333333322211110.*)'&$
+
+@ 7 87 chrMa 141 -
+TAGTAGGTATGTTTGCTTGTAATATTGAATGTAGGAGCGATAGATAAGAG
++
+?:+4>89ABBB at AB?):B=5=1@@49;;9;<9(22+/+>?060.=<4++2
+@ 21 87 chrMa 80 +
+TTATTATTTATCGTATT
++
+B2BBABBA:1:A8A?=9
+@ 95 87 chrMa 131 +
+TTATTATTTATCGTATTTACGTTTAATATTATAGGTGAATATATTTATTG
++
+A?6BA=*?@>>8:?0;=6>@A8?76 at 7<?@?;99174=9?3216;(:19#
+> 7 87 chrMa 141 -
+TAGTAGGTATGTTTGCTTGTAATATTGAATGTAGGAGCGATAGATAAGAG
+> 21 87 chrMa 80 +
+TTATTATTTATCGTATT
+> 95 87 chrMa 131 +
+TTATTATTTATCGTATTTACGTTTAATATTATAGGTGAATATATTTATTG
=====================================
test/maf-cut-test.sh
=====================================
@@ -0,0 +1,14 @@
+#! /bin/sh
+
+cd $(dirname $0)
+
+PATH=../bin:$PATH
+
+{
+ maf-cut chr1:152413-158286 split1.maf
+
+ maf-cut chrM:150-200 bs100.maf bs100.fastq
+
+ awk '(NR-1) % 4 < 2' bs100.fastq | tr '@' '>' |
+ maf-cut chrM:150-200 bs100.maf -
+} | diff -u maf-cut-test.out -
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/a87dec4f5896357452377c8df6169de6609b2822
--
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/a87dec4f5896357452377c8df6169de6609b2822
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220529/fef2032e/attachment-0001.htm>
More information about the debian-med-commit
mailing list