[med-svn] [Git][med-team/last-align][upstream] New upstream version 1296

Nilesh Patra (@nilesh) gitlab at salsa.debian.org
Sun May 29 14:56:46 BST 2022



Nilesh Patra pushed to branch upstream at Debian Med / last-align


Commits:
a87dec4f by Nilesh Patra at 2022-05-29T19:11:55+05:30
New upstream version 1296
- - - - -


23 changed files:

- bin/maf-cut
- bin/parallel-fasta
- bin/parallel-fastq
- build/seed-doc.sh
- data/RY16-8.seed
- data/RY32-10.seed
- data/RY4-6.seed
- data/RY8-8.seed
- doc/last-cookbook.rst
- doc/last-map-probs.rst
- doc/last-parallel.rst
- doc/last-seeds.rst
- doc/maf-cut.rst
- src/CyclicSubsetSeed.cc
- src/CyclicSubsetSeed.hh
- src/LastalArguments.cc
- src/fileMap.cc
- src/lastdb.cc
- src/makefile
- test/last-test.out
- test/last-test.sh
- + test/maf-cut-test.out
- + test/maf-cut-test.sh


Changes:

=====================================
bin/maf-cut
=====================================
@@ -3,9 +3,10 @@
 
 from __future__ import print_function
 
+import argparse
+import collections
 import gzip
 import itertools
-import optparse
 import signal
 import sys
 
@@ -56,13 +57,28 @@ def findTheSpecifiedSequence(seqName, mafLines):
                 return fields
     return None
 
+def addMafData(dataPerSeq, mafLines, alnBeg, alnEnd):
+    for line in mafLines:
+        if line[0] == "s":
+            fields = line.split()
+            seqName = fields[1]  # xxx omit theSpecifiedSequence ?
+            strand = fields[4]
+            seqLen = int(fields[5])
+            oldSeq = fields[6]
+            beg = int(fields[2]) + alnBeg - oldSeq.count("-", 0, alnBeg)
+            end = beg + alnEnd - alnBeg - oldSeq.count("-", alnBeg, alnEnd)
+            if strand == "-":
+                beg, end = seqLen - end, seqLen - beg
+            seqData = seqLen, beg, end
+            dataPerSeq[seqName].append(seqData)
+
 def cutMafRecords(mafLines, alnBeg, alnEnd):
     for line in mafLines:
         fields = line.split()
         if line[0] == "s":
             oldSeq = fields[6]
             newSeq = oldSeq[alnBeg:alnEnd]
-            beg = int(fields[2]) + alnBeg - oldSeq[:alnBeg].count("-")
+            beg = int(fields[2]) + alnBeg - oldSeq.count("-", 0, alnBeg)
             span = len(newSeq) - newSeq.count("-")
             yield fields[:2] + [str(beg), str(span)] + fields[4:6] + [newSeq]
         elif line[0] == "q":
@@ -82,15 +98,7 @@ def printMafLine(fieldWidths, fields):
     formatParams = itertools.chain.from_iterable(zip(fieldWidths, fields))
     print("%*s %-*s %*s %*s %*s %*s %*s" % tuple(formatParams))
 
-def cutOneMaf(cutRange, mafLines):
-    seqName, cutBeg, cutEnd = cutRange
-    sLineFields = findTheSpecifiedSequence(seqName, mafLines)
-    if not sLineFields:
-        return
-    alnBeg, alnEnd = alignmentRange(cutBeg, cutEnd, sLineFields)
-    if alnBeg >= alnEnd:
-        return
-    mafRecords = list(cutMafRecords(mafLines, alnBeg, alnEnd))
+def printOneMaf(mafRecords):
     fieldWidths = list(mafFieldWidths(mafRecords))
     for fields in mafRecords:
         if fields[0] == "s":
@@ -103,15 +111,75 @@ def cutOneMaf(cutRange, mafLines):
             print(" ".join(fields))
     print()
 
-def mafCutOneFile(cutRange, lines):
-    mafLines = []
+def nameFromTitleLine(line):
+    return line[1:].split()[0]
+
+def doOneFastaSequence(dataPerSeq, topLine, seqLines):
+    alnData = dataPerSeq.get(nameFromTitleLine(topLine))
+    if alnData:
+        seq = "".join(i.rstrip() for i in seqLines)
+        seqLen = len(seq)
+        alnData = [i for i in alnData if i[0] == seqLen]
+        if alnData:
+            beg = min(i[1] for i in alnData)
+            end = max(i[2] for i in alnData)
+            print(topLine, end="")
+            print(seq[beg:end])
+
+def doOneFastqSequence(dataPerSeq, lines):
+    alnData = dataPerSeq.get(nameFromTitleLine(lines[0]))
+    if alnData:
+        seqLen = len(lines[1].rstrip())
+        alnData = [i for i in alnData if i[0] == seqLen]
+        if alnData:
+            beg = min(i[1] for i in alnData)
+            end = max(i[2] for i in alnData)
+            print(lines[0], end="")
+            print(lines[1][beg:end])
+            print(lines[2], end="")
+            print(lines[3][beg:end])
+
+def printFastaParts(dataPerSeq, topLine, lines):
+    seqLines = []
     for line in lines:
-        if line.isspace():
-            cutOneMaf(cutRange, mafLines)
-            mafLines = []
-        elif line[0] != "#":
-            mafLines.append(line)
-    cutOneMaf(cutRange, mafLines)
+        if line[0] == ">":
+            doOneFastaSequence(dataPerSeq, topLine, seqLines)
+            topLine = line
+            seqLines = []
+        else:
+            seqLines.append(line)
+    doOneFastaSequence(dataPerSeq, topLine, seqLines)
+
+def printFastqParts(dataPerSeq, topLine, lines):
+    b = [topLine]
+    for i, x in enumerate(lines):
+        b.append(x)
+        if i % 4 == 2:
+            doOneFastqSequence(dataPerSeq, b)
+            b = []
+
+def printSequenceParts(dataPerSeq, lines):
+    for line in lines:
+        if line[0] == ">":
+            printFastaParts(dataPerSeq, line, lines)
+        elif line[0] == "@":
+            printFastqParts(dataPerSeq, line, lines)
+        else:
+            raise RuntimeError("can't read the sequence data")
+        break
+
+def cutOneMaf(seqName, cutBeg, cutEnd, dataPerSeq, mafLines):
+    sLineFields = findTheSpecifiedSequence(seqName, mafLines)
+    if not sLineFields:
+        return
+    alnBeg, alnEnd = alignmentRange(cutBeg, cutEnd, sLineFields)
+    if alnBeg >= alnEnd:
+        return
+    if dataPerSeq is None:
+        mafRecords = list(cutMafRecords(mafLines, alnBeg, alnEnd))
+        printOneMaf(mafRecords)
+    else:
+        addMafData(dataPerSeq, mafLines, alnBeg, alnEnd)
 
 def wantedRange(cutSpecification):
     seqName = None
@@ -120,20 +188,32 @@ def wantedRange(cutSpecification):
     beg, end = cutSpecification.rsplit("-", 1)
     return seqName, int(beg), int(end)
 
-def mafCut(opts, args):
-    cutRange = wantedRange(args[0])
-    if len(args) > 1:
-        for i in args[1:]:
-            mafCutOneFile(cutRange, myOpen(i))
-    else:
-        mafCutOneFile(cutRange, sys.stdin)
+def mafCut(args):
+    seqName, cutBeg, cutEnd = wantedRange(args.segment)
+    dataPerSeq = collections.defaultdict(list) if args.sequences else None
+
+    mafLines = []
+    for line in myOpen(args.alignments):
+        if line.isspace():
+            cutOneMaf(seqName, cutBeg, cutEnd, dataPerSeq, mafLines)
+            mafLines = []
+        elif line[0] != "#":
+            mafLines.append(line)
+    cutOneMaf(seqName, cutBeg, cutEnd, dataPerSeq, mafLines)
+
+    if dataPerSeq:
+        for fileName in args.sequences:
+            printSequenceParts(dataPerSeq, myOpen(fileName))
 
 if __name__ == "__main__":
     signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
-    usage = "%prog chrN:start-end alignments.maf"
     description = "Get parts of MAF-format alignments."
-    op = optparse.OptionParser(usage=usage, description=description)
-    opts, args = op.parse_args()
-    if not args:
-        op.error("please give me a cut specification and MAF alignments")
-    mafCut(opts, args)
+    ap = argparse.ArgumentParser(description=description)
+    ap.add_argument("segment",
+                    help="zero-based sequence range, e.g. chr7:123400-123500")
+    ap.add_argument("alignments", nargs="?", default="-",
+                    help="file of sequence alignments in MAF format")
+    ap.add_argument("sequences", nargs="*", default=None,  # xxx need default?
+                    help="file of sequences in fasta or fastq format")
+    args = ap.parse_args()
+    mafCut(args)


=====================================
bin/parallel-fasta
=====================================
@@ -1,8 +1,3 @@
 #! /bin/sh
 
-parallel --gnu --version > /dev/null || exit 1
-
-parallel --gnu --minversion 20130222 > /dev/null ||
-echo $(basename $0): warning: old version of parallel, might be slow 1>&2
-
-exec parallel --gnu --pipe --recstart '>' "$@"
+exec parallel --round --recstart '>' "$@"


=====================================
bin/parallel-fastq
=====================================
@@ -1,9 +1,3 @@
 #! /bin/sh
 
-parallel --gnu --version > /dev/null || exit 1
-
-parallel --gnu --minversion 20130222 > /dev/null ||
-echo $(basename $0): warning: old version of parallel, might be slow 1>&2
-
-# use a record size of 8 lines, so that paired sequences stay together:
-exec parallel --gnu --pipe -L8 "$@"
+exec parallel --round -L8 "$@"


=====================================
build/seed-doc.sh
=====================================
@@ -56,14 +56,14 @@ do
     grep '^# ' $i | cut -d' ' -f2-
     echo It uses this seed alphabet::
     echo
-    awk '!/^#/ && NF > 1' $i | sed 's/^/  /'
+    awk '!/^#/ && NF > 1 && length($1) == 1' $i | sed 's/^/  /'
     echo
-    if [ $(awk 'NF == 1' $i | wc -l) = 1 ]
+    if [ $(awk '!/^#/ && length($1) > 1 || NF == 1' $i | wc -w) = 1 ]
 	then echo And this pattern::
 	else echo And these patterns::
     fi
     echo
-    awk 'NF == 1' $i | sed 's/^/  /'
+    awk '!/^#/ && length($1) > 1 || NF == 1' $i | sed 's/^/  /'
     echo
     grep -q '^#lastdb' $i && {
 	echo It sets this lastdb default:


=====================================
data/RY16-8.seed
=====================================
@@ -3,7 +3,7 @@
 
 #abbreviation RY16
 
-#lastal -r6 -q18 -a21 -b9
+#lastal -m2 -r6 -q18 -a21 -b9
 
 R  A G
 Y  C T


=====================================
data/RY32-10.seed
=====================================
@@ -3,7 +3,7 @@
 
 #abbreviation RY32
 
-#lastal -r6 -q18 -a21 -b9
+#lastal -m2 -r6 -q18 -a21 -b9
 
 R  A G
 Y  C T


=====================================
data/RY4-6.seed
=====================================
@@ -4,7 +4,7 @@
 
 #abbreviation RY4
 
-#lastal -r6 -q18 -a21 -b9
+#lastal -m2 -r6 -q18 -a21 -b9
 
 R  A G
 Y  C T


=====================================
data/RY8-8.seed
=====================================
@@ -3,7 +3,7 @@
 
 #abbreviation RY8
 
-#lastal -r6 -q18 -a21 -b9
+#lastal -m2 -r6 -q18 -a21 -b9
 
 R  A G
 Y  C T


=====================================
doc/last-cookbook.rst
=====================================
@@ -145,47 +145,53 @@ you expect the rates to be roughly the same.
 Aligning high-indel-error long DNA reads to a genome
 ----------------------------------------------------
 
-Suppose we have DNA reads in either FASTA or FASTQ_ format.  This is
-sensitive but slow::
+Suppose we have DNA reads in either FASTA or FASTQ_ format.  We can
+align them to a genome like this::
 
-  lastdb -P8 -uNEAR mydb genome.fa
+  lastdb -P8 -uRY4 mydb genome.fa
   last-train -P8 -Q0 mydb reads.fastq > reads.train
   lastal -P8 -p reads.train mydb reads.fastq | last-split > out.maf
 
 ``-P8`` makes it faster by running 8 parallel threads, adjust as
 appropriate for your computer.  This has no effect on the results.
 
-``-uNEAR`` selects a `seeding scheme`_ that's better at finding
-alignments with few substitutions and/or many gaps.
+``-uRY4`` selects a `seeding scheme`_ that reduces the run time and
+memory use, but also reduces sensitivity.
 
 ``-Q0`` makes it discard the fastq_ quality information (or you can
 keep-but-ignore it with ``-Qkeep``).
 
-last-split_ finds a unique best alignment for each part of each read.
-It gives each alignment a `mismap probability`_, which is high if that
-part of the read is almost equally similar to several parts of the
-genome.
+last-split_ cuts the output of ``lastal`` down to a unique best
+alignment for each part of each read.  It gives each alignment a
+`mismap probability`_, which is high if that part of the read is
+almost equally similar to several parts of the genome.
 
 Here we didn't suppress alignments caused by simple sequence (like
 ``cacacacacacacacacacacaca``), so as not to hide anything from
 last-split_.  You can discard such alignments with last-postmask_
 (though they may help to explain each part of a DNA read).
 
-You can go faster by sacrificing a bit of sensitivity.  It depends on
-your aim, e.g. slow-and-sensitive seems necessary to find intricate
-rearrangements of repeats.  Suggested ways to go faster:
+To make it more sensitive but slow, replace ``RY4`` with ``NEAR``:
+good for smaller data.  (``-uNEAR`` is suitable for finding alignments
+with few substitutions and/or many gaps.)
 
-* `Mask repeats`_.  This has often worked well.
+Aligning low-error long DNA reads to a genome
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We can do this the same way as for high-error reads, but perhaps
+accelerate more aggressively.  ``RY8`` reduces the run time and memory
+use even more than ``RY4``.  (This is because ``RY8`` uses ~1/8 of the
+seeds, i.e. initial matches, whereas ``RY4`` uses ~1/4).  ``RY16`` is
+faster still, and ``RY32`` is the fastest of these options.
 
-* Add lastal_ option ``-k8`` (or ``-k16`` etc).  This makes it faster,
-  by only finding initial matches starting at every 8th (or 16th etc)
-  position in the reads.
+Also, lastal_ option ``-C2`` may reduce run time with little effect on
+accuracy.
 
-* Replace ``-uNEAR`` with ``-uRY32`` (or ``-uRY16``, ``-uRY8``,
-  ``-uRY4``).  This makes it check for initial matches starting at
-  only ~1/32 (or ~1/16 etc) of positions, in both reads and genome.
-  Compared to ``-k``: this harms sensitivity slightly more, but
-  reduces memory use and makes lastdb faster.
+Aligning potentially-spliced RNA or cDNA long reads to a genome
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+See here_.  (For low-error reads, you can probably omit ``-d90`` and
+``-m20``.)
 
 Which genome version to use?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -198,22 +204,7 @@ like "analysis set".
 You can use multiple genomes, which will be treated like one big
 genome::
 
-  lastdb -P8 -uNEAR mydb human.fa virus.fa other-genomes.fa
-
-Aligning low-error long DNA reads to a genome
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-We can do this the same way as for high-error reads, but perhaps
-accelerate more aggressively (e.g. ``-uRY32``).
-
-If repeats are not masked, lastal_ option ``-C2`` may reduce run time
-with little effect on accuracy.
-
-Aligning potentially-spliced RNA or cDNA long reads to a genome
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-See here_.  (For low-error reads, you can probably omit ``-d90`` and
-``-m20``.)
+  lastdb -P8 -uRY4 mydb human.fa virus.fa other-genomes.fa
 
 Aligning Illumina DNA reads to a genome
 ---------------------------------------
@@ -235,7 +226,7 @@ didn't use this option.)
 
 This recipe may be excessively slow-and-sensitive.  Adding lastal_
 option ``-C2`` may make it faster with negligible accuracy loss.  You
-can accelerate with e.g. ``-uRY16`` or ``-k16`` as above.
+can accelerate with e.g. ``-uRY16`` as above.
 
 Finding very short DNA alignments
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -306,7 +297,7 @@ Finally, we can make a dotplot_::
   last-dotplot humchi2.maf humchi2.png
 
 **To go faster** with minor accuracy loss: replace ``-uNEAR`` with
-``-uRY32`` and/or `mask repeats`_.
+``-uRY32``.
 
 To squeeze out the last 0.000...1% of accuracy: add ``-m50`` to the
 lastal_ options.
@@ -328,7 +319,7 @@ consider using 5-byte LAST (lastdb5_ and lastal5_).  Ordinary (4-byte)
 LAST can't handle so much sequence at once, so lastdb_ splits it into
 "volumes", which may be inefficient.  5-byte LAST avoids voluming, but
 uses more memory.  So lastdb5_ works well with a memory-reducing
-option: ``-uRY`` or ``-w`` or ``-W``.
+option such as ``-uRY`` or ``-w``.
 
 Moar faster
 -----------


=====================================
doc/last-map-probs.rst
=====================================
@@ -49,16 +49,6 @@ Details
   (e.g. because there are no lines starting with "# batch"), it might
   need too much memory.
 
-Using multiple CPUs
--------------------
-
-This will run the pipeline on all your CPU cores::
-
-  parallel-fastq "lastal -Q1 -D1000 hu | last-map-probs" < reads.fastq > myalns.maf
-
-It requires GNU parallel to be installed
-(http://www.gnu.org/software/parallel/).
-
 Limitations
 -----------
 


=====================================
doc/last-parallel.rst
=====================================
@@ -2,7 +2,7 @@ Running LAST in parallel
 ========================
 
 You can make LAST faster by running it on multiple CPUs / cores.  The
-easiest way is with lastal's -P option::
+easiest way is with lastal_'s -P option::
 
   lastal -P4 my-index queries.fasta > out.maf
 
@@ -15,7 +15,7 @@ This works by aligning different query sequences in different threads
 Dealing with very long query sequences
 --------------------------------------
 
-lastal aligns one "batch" of queries at a time, so if the batch has
+lastal_ aligns one "batch" of queries at a time, so if the batch has
 only one query you won't get any parallelization.  This can be fixed
 by increasing the batch size, with option -i::
 
@@ -31,50 +31,52 @@ If you have a multi-command "pipeline", such as::
 
   lastal -P4 my-index queries.fasta | last-split > out.maf
 
-then the -P option may help, because lastal is often the slowest step,
-but it would be nice to parallelize the whole thing.  Unfortunately,
-last-split doesn't have a -P option, and even if it did, the pipe
-between the commands would become a bottleneck.
+then the ``-P`` option may help, because lastal is often the slowest
+step, but it would be nice to parallelize the whole thing.
 
-You can use parallel-fasta and parallel-fastq (which accompany LAST,
-but require `GNU parallel`_ to be installed).  These commands read
-sequence data, split it into blocks (with a whole number of sequences
-per block), and run the blocks in parallel through any command or
-pipeline you specify, using all your CPU cores.  Here are some
-examples.
+You can use ``parallel-fasta`` and ``parallel-fastq``, which accompany
+LAST, but require `GNU Parallel`_ to be installed.
 
 Instead of this::
 
-  lastal mydb queries.fa > myalns.maf
+  lastal -P8 -p my.train mydb seqs.fasta | last-split > out.maf
 
 try this::
 
-  parallel-fasta "lastal mydb" < queries.fa > myalns.maf
+  parallel-fasta -j8 "lastal -p my.train mydb | last-split" < seqs.fasta > out.maf
 
 Instead of this::
 
-  lastal -Q1 db q.fastq | last-split > out.maf
+  lastal -P8 -p my.train mydb seqs.fastq.gz | last-split | gzip > out.maf.gz
 
 try this::
 
-  parallel-fastq "lastal -Q1 db | last-split" < q.fastq > out.maf
+  zcat seqs.fastq.gz | parallel-fastq -j8 "lastal -p my.train mydb | last-split | gzip" > out.maf.gz
 
-Instead of this::
 
-  bzcat queries.fa.bz2 | lastal mydb > myalns.maf
+Notes:
 
-try this::
+* ``parallel-fasta`` and ``parallel-fastq`` simply run GNU Parallel
+  with a few options for fasta or fastq: you can specify other GNU
+  Parallel options (like ``-j8``).
 
-  bzcat queries.fa.bz2 | parallel-fasta "lastal mydb" > myalns.maf
+* The ``gzip`` example above runs ``gzip`` several times in parallel,
+  and concatenates the results.  Fortunately, that is valid ``gz``
+  format.
 
-Notes:
+* ``lastal`` reads the ``mydb`` files into shared memory, so this
+  memory use does not get mutiplied 8 times.
 
-* parallel-fasta and parallel-fastq simply execute GNU parallel with a
-  few options for fasta or fastq: you can specify other GNU parallel
-  options to control the number of simultaneous jobs, use remote
-  computers, get the output in the same order as the input, etc.
+* ``parallel-fastq`` assumes that each fastq record is 4 lines, so
+  there should be no line wrapping or blank lines.  (Actually, it
+  assumes one "record" is 8 lines, to keep paired reads together.)
 
-* parallel-fastq assumes that each fastq record is 4 lines, so there
-  should be no line wrapping or blank lines.
+* In the current version, ``parallel-fasta`` and ``parallel-fastq``
+  use GNU Parallel's ``--round`` option.  This avoids the overhead of
+  restarting the command for each input chunk, but it prevents keeping
+  the order of sequences, and stashes the output in large temporary
+  files.  If necessary, use ``$TMPDIR`` or ``--tmpdir`` to choose a
+  temp directory with enough space.
 
+.. _lastal: doc/lastal.rst
 .. _GNU parallel: http://www.gnu.org/software/parallel/


=====================================
doc/last-seeds.rst
=====================================
@@ -206,7 +206,7 @@ And these patterns::
   YYYYRY
 
 It sets this lastal default:
--r6 -q18 -a21 -b9
+-m2 -r6 -q18 -a21 -b9
 
 RY8-8 (abbreviation: RY8)
 -------------------------
@@ -254,7 +254,7 @@ And these patterns::
   YYYYYYYR
 
 It sets this lastal default:
--r6 -q18 -a21 -b9
+-m2 -r6 -q18 -a21 -b9
 
 RY16-8 (abbreviation: RY16)
 ---------------------------
@@ -286,7 +286,7 @@ And these patterns::
   YYRRYRYY
 
 It sets this lastal default:
--r6 -q18 -a21 -b9
+-m2 -r6 -q18 -a21 -b9
 
 RY32-10 (abbreviation: RY32)
 ----------------------------
@@ -334,5 +334,5 @@ And these patterns::
   YYYYYYRRRY
 
 It sets this lastal default:
--r6 -q18 -a21 -b9
+-m2 -r6 -q18 -a21 -b9
 


=====================================
doc/maf-cut.rst
=====================================
@@ -17,4 +17,14 @@ You can omit the sequence name::
 
 Then, the range applies to the topmost sequence in each alignment.
 
+``maf-cut`` can also get parts of sequences::
+
+  maf-cut chr7:1234000-1235000 alignments.maf sequences.fasta
+
+This first applies ``maf-cut`` to ``alignments.maf`` in the usual way
+(without printing anything).  For each sequence, it gets the minimum
+start coordinate and maximum end coordinate in the resulting
+alignments.  It prints this segment of each sequence in
+``sequences.fasta``.  (fastq format is ok too.)
+
 .. _MAF: http://genome.ucsc.edu/FAQ/FAQformat.html#format5


=====================================
src/CyclicSubsetSeed.cc
=====================================
@@ -86,47 +86,50 @@ n  ACGT\n\
 " + patterns;
 }
 
-bool CyclicSubsetSeed::nextPattern( std::istream& in,
-				    std::vector< std::string >& seedAlphabet,
-				    std::string& pattern ){
-  while ( getline( in, pattern ) ){
-    std::istringstream iss( pattern );
-    std::string x, y;
-    iss >> x;
-    if( x.empty() || x[0] == '#' ) continue;
-    iss >> y;
-    if( y.empty() ) return true;
-    if( x.size() > 1 ) ERR( "bad seed line: " + pattern );
-    seedAlphabet.push_back( pattern );
-  }
-  return false;
+static int lineType(const std::string &line) {
+  std::istringstream s(line);
+  std::string x, y;
+  s >> x >> y;
+  return (x.empty() || x[0] == '#')    ? 0  // blank or comment line
+    :    (x.size() == 1 && !y.empty()) ? 1  // seed-alphabet line
+    : 2;                                    // seed-pattern line
 }
 
-static size_t findSeedLetter( const std::vector< std::string >& seedAlphabet,
-			      char seedLetter ){
+static const char *letterGroups(const std::vector<std::string> &seedAlphabet,
+				char seedLetter) {
   // go backwards, so that newer definitions override older ones:
-  for( size_t j = seedAlphabet.size(); j > 0; ){
-    --j;
-    const std::string& s = seedAlphabet[j];
-    assert( !s.empty() );
-    if( s[0] == seedLetter ) return j;
+  for (size_t i = seedAlphabet.size(); i-- > 0; ) {
+    const char *s = seedAlphabet[i].c_str();
+    if (s[0] == seedLetter) return s + 2;
   }
-  ERR( "unknown symbol in seed pattern: " + stringify(seedLetter) );
+  ERR("unknown symbol in seed pattern: " + stringify(seedLetter));
 }
 
-void CyclicSubsetSeed::init( const std::vector< std::string >& seedAlphabet,
-			     const std::string& pattern,
-			     bool isMaskLowercase,
-			     const uchar letterCode[],
-			     const std::string &mainSequenceAlphabet ){
-  clear();
-  for( size_t i = 0; i < pattern.size(); ++i ){
-    char seedLetter = pattern[i];
-    if( std::isspace(seedLetter) ) continue;
-    size_t j = findSeedLetter( seedAlphabet, seedLetter );
-    std::istringstream iss( seedAlphabet[j] );
-    iss >> seedLetter;
-    appendPosition( iss, isMaskLowercase, letterCode, mainSequenceAlphabet );
+void CyclicSubsetSeed::addPatterns(std::vector<CyclicSubsetSeed> &patterns,
+				   const std::string &text,
+				   bool isMaskLowercase,
+				   const uchar letterCode[],
+				   const std::string &mainSequenceAlphabet) {
+  std::vector<std::string> seedAlphabet;
+  std::string line, word;
+  std::istringstream textStream(text);
+
+  while (getline(textStream, line)) {
+    int type = lineType(line);
+    if (type == 1) {
+      seedAlphabet.push_back(line);
+    } else if (type == 2) {
+      std::istringstream lineStream(line);
+      while (lineStream >> word) {
+	CyclicSubsetSeed pat;
+	for (size_t i = 0; i < word.size(); ++i) {
+	  std::istringstream s(letterGroups(seedAlphabet, word[i]));
+	  pat.appendPosition(s, isMaskLowercase, letterCode,
+			     mainSequenceAlphabet);
+	}
+	patterns.push_back(pat);
+      }
+    }
   }
 }
 


=====================================
src/CyclicSubsetSeed.hh
=====================================
@@ -72,21 +72,16 @@ public:
   // codes.  Finally, "@" allows any match or transition.
   static std::string stringFromDnaPatterns(std::string patterns);
 
-  // Reads lines from "in" until it finds a pattern line.  Any seed
-  // alphabet lines are appended to "seedAlphabet".  If it finds a
-  // pattern line, it stores it in "pattern" and returns true, else it
-  // returns false.  Blank lines and comment lines starting with # are
-  // ignored.
-  static bool nextPattern( std::istream& in,
-			   std::vector< std::string >& seedAlphabet,
-			   std::string& pattern );
-
-  void clear() {
-    subsetLists.clear();
-    subsetMaps.clear();
-    originalSubsetMaps.clear();
-    numOfSubsetsPerPosition.clear();
-  }
+  // Read subset-seed patterns from text and add them to patterns.
+  // The text should start with lines defining a seed alphabet,
+  // followed by lines with seed patterns.  Blank lines and comment
+  // lines starting with # are ignored.  Seed letters are
+  // case-sensitive.
+  static void addPatterns(std::vector<CyclicSubsetSeed> &patterns,
+			  const std::string &text,
+			  bool isMaskLowercase,
+			  const uchar letterCode[],
+			  const std::string &mainSequenceAlphabet);
 
   void swap( CyclicSubsetSeed& x ){
     subsetLists.swap( x.subsetLists );
@@ -95,13 +90,6 @@ public:
     numOfSubsetsPerPosition.swap( x.numOfSubsetsPerPosition );
   }
 
-  // Seed letters are case-sensitive.
-  void init( const std::vector< std::string >& seedAlphabet,
-	     const std::string& pattern,
-	     bool isMaskLowercase,
-	     const uchar letterCode[],
-	     const std::string& mainSequenceAlphabet );
-
   // "inputLine" should be a grouping of sequence letters.
   void appendPosition( std::istream& inputLine,
 		       bool isMaskLowercase,


=====================================
src/LastalArguments.cc
=====================================
@@ -172,7 +172,7 @@ Miscellaneous options (default settings):\n\
 -C: omit gapless alignments in >= C others with > score-per-length (off)\n\
 -P: number of parallel threads ("
     + stringify(numOfThreads) + ")\n\
--i: query batch size (8 KiB, unless there is > 1 thread or lastdb volume)\n\
+-i: query batch size (64M if multi-volume, else 32M if multi-thread, else 8K)\n\
 -M: find minimum-difference alignments (faster but cruder)\n\
 -T: type of alignment: 0=local, 1=overlap ("
     + stringify(globality) + ")\n\
@@ -524,7 +524,10 @@ void LastalArguments::setDefaultsFromAlphabet( bool isDna, bool isProtein,
     else if( outputType == 0 )
       batchSize = 0x1000000;  // 16 Mbytes
     else if( !isVolumes )
-      batchSize = 0x800000;   // 8 Mbytes
+      batchSize = 0x2000000;   // 32 Mbytes
+    // 32 Mbytes gave quite good load-balancing when aligning human
+    // nanopore reads with 16 threads.  Sometimes 64M (and even 128M)
+    // was better.  With WindowMasker, smaller batches were fine.
     else if( inputFormat == sequenceFormat::prb )
       batchSize = 0x2000000;  // 32 Mbytes (?)
     else


=====================================
src/fileMap.cc
=====================================
@@ -25,8 +25,7 @@ static void err( const std::string& s ) {
 // random access can be horribly slow (at least on two Linux 2.6
 // systems).
 static void primeMemory( void* begin, size_t bytes ){
-  // use "static" to stop the compiler optimizing the whole function away:
-  static unsigned z = 0;
+  unsigned z = 0;
   size_t stepSize = 1024;
   const char* x = static_cast<char*>(begin);
   const char* y = x + (bytes / stepSize) * stepSize;
@@ -34,6 +33,8 @@ static void primeMemory( void* begin, size_t bytes ){
     z += *x;
     x += stepSize;
   }
+  volatile unsigned dontOptimizeMeAway = z;
+  dontOptimizeMeAway = dontOptimizeMeAway;  // ??? prevents compiler warning
 }
 
 namespace cbrc{


=====================================
src/lastdb.cc
=====================================
@@ -51,49 +51,36 @@ bool isDubiousDna( const Alphabet& alph, const MultiSequence& multi ){
   else return false;
 }
 
-static void addSeeds( std::vector< CyclicSubsetSeed >& seeds,
-		      const std::string& seedText,
-		      const LastdbArguments& args, const Alphabet& alph ){
-  std::istringstream iss( seedText );
-  std::vector< std::string > seedAlphabet;
-  std::string pattern;
-  while( CyclicSubsetSeed::nextPattern( iss, seedAlphabet, pattern ) ){
-    CyclicSubsetSeed s;
-    s.init(seedAlphabet, pattern, args.isCaseSensitive, alph.encode,
-	   alph.letters);
-    seeds.push_back(s);
-  }
-}
-
 // Set up the seed pattern(s)
 static void makeSubsetSeeds( std::vector< CyclicSubsetSeed >& seeds,
 			     const std::string& seedText,
 			     const LastdbArguments& args,
 			     const Alphabet& alph ){
   const std::string& a = alph.letters;
+  bool isCaseSens = args.isCaseSensitive;
 
   if( !args.subsetSeedFile.empty() ){
-    addSeeds( seeds, seedText, args, alph );
+    CyclicSubsetSeed::addPatterns(seeds, seedText, isCaseSens, alph.encode, a);
   }
   else if (!args.dnaSeedPatterns.empty()) {
     for (size_t x = 0; x < args.dnaSeedPatterns.size(); ++x) {
       const std::string &p = args.dnaSeedPatterns[x];
       std::string s = CyclicSubsetSeed::stringFromDnaPatterns(p);
-      addSeeds(seeds, s, args, alph);
+      CyclicSubsetSeed::addPatterns(seeds, s, isCaseSens, alph.encode, a);
     }
   }
   else if( !args.seedPatterns.empty() ){
     for( unsigned x = 0; x < args.seedPatterns.size(); ++x ){
       const std::string& p = args.seedPatterns[x];
       std::string s = CyclicSubsetSeed::stringFromPatterns( p, a );
-      addSeeds( seeds, s, args, alph );
+      CyclicSubsetSeed::addPatterns(seeds, s, isCaseSens, alph.encode, a);
     }
   }
   else{
     std::string s = (alph.letters == alph.dna)
       ? CyclicSubsetSeed::stringFromName( "YASS" )
       : CyclicSubsetSeed::stringFromPatterns( "1", a );
-    addSeeds( seeds, s, args, alph );
+    CyclicSubsetSeed::addPatterns(seeds, s, isCaseSens, alph.encode, a);
   }
 
   if( seeds.empty() ) ERR( "no seed patterns" );


=====================================
src/makefile
=====================================
@@ -143,7 +143,7 @@ ScoreMatrixData.hh: ../data/*.mat
 	../build/mat-inc.sh ../data/*.mat > $@
 
 VERSION1 = git describe --dirty
-VERSION2 = echo ' (HEAD -> main, tag: 1282) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
+VERSION2 = echo ' (HEAD -> main, tag: 1296) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
 
 VERSION = \"`test -e ../.git && $(VERSION1) || $(VERSION2)`\"
 


=====================================
test/last-test.out
=====================================
@@ -5235,8 +5235,8 @@ s LTR22C#LTR/ERVK  0 505 + 509 TGTAGGGGNTCGGTCAGGGTGGTGGGAGAAATTGTAAAAATAAATTATA
 
 # Query sequences=1 normal letters=485
 #
-# a=21 b=4 A=21 B=4 e=114 d=70 x=113 y=49 z=113 D=1e+06 E=2.92797e+07
-# R=01 u=0 s=2 S=0 M=0 T=0 m=10 l=1 n=10 k=1 w=1000 t=4.88281 j=3 Q=0
+# a=21 b=4 A=21 B=4 e=114 d=77 x=113 y=49 z=113 D=1e+06 E=2.92797e+07
+# R=01 u=0 s=2 S=0 M=0 T=0 m=2 l=1 n=2 k=1 w=1000 t=4.88281 j=3 Q=0
 # /tmp/last-test
 # Reference sequences=1 normal letters=16571
 # lambda=0.201859 K=0.288737
@@ -5275,7 +5275,6 @@ s LTR22C#LTR/ERVK  0 505 + 509 TGTAGGGGNTCGGTCAGGGTGGTGGGAGAAATTGTAAAAATAAATTATA
 644	chrM	1742	311	+	16571	chrM	2432	326	+	16775	18,1:0,2,0:2,17,0:4,25,0:4,11,0:2,57,1:0,42,0:1,24,0:3,83,0:4,6,3:0,21	EG2=1e-39	E=5.5e-49
 627	chrM	656	291	+	16571	chrM	1303	296	+	16775	78,0:3,35,0:2,6,2:0,20,0:1,19,0:1,55,1:0,57,0:1,18	EG2=3.1e-38	E=1.7e-47
 587	chrM	5208	516	+	16571	chrM	5978	522	+	16775	74,0:2,17,2:0,88,0:3,19,3:0,5,5:0,14,0:5,6,3:0,21,0:3,14,1:0,12,2:0,29,0:1,6,0:4,30,0:2,9,0:1,13,1:0,73,0:1,2,0:1,67	EG2=1e-34	E=5.5e-44
-300	chrM	10642	148	+	16571	chrM	11367	148	+	16775	68,1:0,9,0:1,70	EG2=1.4e-09	E=8e-19
 288	chrM	8638	216	+	16571	chrM	9353	216	+	16775	216	EG2=1.6e-08	E=9e-18
 210	chrM	10518	77	+	16571	chrM	11243	77	+	16775	77	EG2=0.11	E=6.2e-11
 154	chrM	14425	84	+	16571	chrM	16456	84	+	16775	84	EG2=9.1e+03	E=5e-06
@@ -5320,18 +5319,3 @@ p                          $&'~~~~~~~~~~~~(.0233.,,,,,+--&)*+,,,-.-+++++---)'%(+
 c 20.6 0.00414 1.97 1.05 1.13 4.98 0.158 2.83 0.966 0.0011 9.56 0.997 0.0153 0.0037 1.07 6.54 51.8 16.4 0.361 3 0.27
 
 # Query sequences=1 normal letters=54
-a score=192 mismap=0.117
-s KI635862   2418195 86 +  12728579 CACAGGCCGCTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTC-CGAGCAGCCGG-ATAACAGGCGCGCGCCAC
-s chr1     248803921 88 - 248956422 CATAGCTCACTGCAGCCTTGACCTCCTAGGCTAAAGCAATCCTCCCACCTTAGCCTCTCCAGTAGCTGGAACTACAGGCATGCATCAC
-p                                   #$$%&&'(()******************************************************************************
-
-a score=330 mismap=0.75
-s KI636063 409336 105 +    789923 AAAATATGGAACGCTTCACGAATTTGCGTGTCATCCTTGCGCAGGGGCCATGCTAATCTTCTCTGTATCGTTCCAATTTTAGTATATGTGCTGCCGAAGCGAGCA
-s chr1     157784 102 + 248956422 aaaataTGGAATGCTTCACAAATTTGCATGTCATTCTTTCACAGAGGCCGTGCCAA---TCTCTCTATTGTTCCAACTTAAGTATGTGTGCTACTGAGGCAAGCA
-p                                 !"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""!
-
-a score=203 mismap=0.0127
-s KI635862   2418239 48 +  12728579 CAGCCTCAGCCTC-CGAGCAGCC-GGATAACAGGCGCGCGCCACCGCGCC
-s chr1     248798136 50 - 248956422 CTGCCTCAGCCTCACAAGTAGCTGGGACTACAGGTGCTTGTCACCACACC
-p                                   33333333333333333333333333333333333322211110.*)'&$
-


=====================================
test/last-test.sh
=====================================
@@ -245,8 +245,6 @@ trap 'rm -f $db*' EXIT
     # tricky Forward-Backward bug that happened once
     lastdb $db alli.fa
     lastal -j7 -r5 -q5 -a15 -b3 $db huma.fa
-
-    maf-cut chr1:152413-158286 split1.maf
 } 2>&1 |
 grep -v version | diff -u last-test.out -
 
@@ -261,6 +259,7 @@ rm f.* r.*
 ./last-split-test.sh
 ./last-train-test.sh
 ./maf-convert-test.sh
+./maf-cut-test.sh
 ./maf-swap-test.sh
 
 # Test: lastdb, lastal, last-split, maf-sort, maf-join


=====================================
test/maf-cut-test.out
=====================================
@@ -0,0 +1,33 @@
+a score=192 mismap=0.117
+s KI635862   2418195 86 +  12728579 CACAGGCCGCTGCAGCCCAGAACTCCTGGCCTCAAGCGATCCACCAGCCTCAGCCTC-CGAGCAGCCGG-ATAACAGGCGCGCGCCAC
+s chr1     248803921 88 - 248956422 CATAGCTCACTGCAGCCTTGACCTCCTAGGCTAAAGCAATCCTCCCACCTTAGCCTCTCCAGTAGCTGGAACTACAGGCATGCATCAC
+p                                   #$$%&&'(()******************************************************************************
+
+a score=330 mismap=0.75
+s KI636063 409336 105 +    789923 AAAATATGGAACGCTTCACGAATTTGCGTGTCATCCTTGCGCAGGGGCCATGCTAATCTTCTCTGTATCGTTCCAATTTTAGTATATGTGCTGCCGAAGCGAGCA
+s chr1     157784 102 + 248956422 aaaataTGGAATGCTTCACAAATTTGCATGTCATTCTTTCACAGAGGCCGTGCCAA---TCTCTCTATTGTTCCAACTTAAGTATGTGTGCTACTGAGGCAAGCA
+p                                 !"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""!
+
+a score=203 mismap=0.0127
+s KI635862   2418239 48 +  12728579 CAGCCTCAGCCTC-CGAGCAGCC-GGATAACAGGCGCGCGCCACCGCGCC
+s chr1     248798136 50 - 248956422 CTGCCTCAGCCTCACAAGTAGCTGGGACTACAGGTGCTTGTCACCACACC
+p                                   33333333333333333333333333333333333322211110.*)'&$
+
+@ 7 87 chrMa 141 -
+TAGTAGGTATGTTTGCTTGTAATATTGAATGTAGGAGCGATAGATAAGAG
++
+?:+4>89ABBB at AB?):B=5=1@@49;;9;<9(22+/+>?060.=<4++2
+@ 21 87 chrMa 80 +
+TTATTATTTATCGTATT
++
+B2BBABBA:1:A8A?=9
+@ 95 87 chrMa 131 +
+TTATTATTTATCGTATTTACGTTTAATATTATAGGTGAATATATTTATTG
++
+A?6BA=*?@>>8:?0;=6>@A8?76 at 7<?@?;99174=9?3216;(:19#
+> 7 87 chrMa 141 -
+TAGTAGGTATGTTTGCTTGTAATATTGAATGTAGGAGCGATAGATAAGAG
+> 21 87 chrMa 80 +
+TTATTATTTATCGTATT
+> 95 87 chrMa 131 +
+TTATTATTTATCGTATTTACGTTTAATATTATAGGTGAATATATTTATTG


=====================================
test/maf-cut-test.sh
=====================================
@@ -0,0 +1,14 @@
+#! /bin/sh
+
+cd $(dirname $0)
+
+PATH=../bin:$PATH
+
+{
+    maf-cut chr1:152413-158286 split1.maf
+
+    maf-cut chrM:150-200 bs100.maf bs100.fastq
+
+    awk '(NR-1) % 4 < 2' bs100.fastq | tr '@' '>' |
+	maf-cut chrM:150-200 bs100.maf -
+} | diff -u maf-cut-test.out -



View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/a87dec4f5896357452377c8df6169de6609b2822

-- 
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/a87dec4f5896357452377c8df6169de6609b2822
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220529/fef2032e/attachment-0001.htm>


More information about the debian-med-commit mailing list