[med-svn] [Git][med-team/last-align][upstream] New upstream version 1411

Thu Sep 22 08:38:57 BST 2022


Andreas Tille pushed to branch upstream at Debian Med / last-align


Commits:
b6c8fddb by Andreas Tille at 2022-09-22T09:28:02+02:00
New upstream version 1411
- - - - -


14 changed files:

- bin/last-train
- + data/PSEUDO.seed
- data/RY16-11.seed
- data/RY32-12.seed
- data/RY4-9.seed
- data/RY8-10.seed
- doc/last-papers.rst
- doc/last-seeds.rst
- src/LastdbArguments.cc
- src/getoptUtil.hh
- src/lastdb.cc
- src/makefile
- test/last-test.out
- test/last-test.sh


Changes:

=====================================
bin/last-train
=====================================
@@ -708,16 +708,19 @@ def tryToMakeChildProgramsFindable():
     # put it first, to avoid getting older versions of LAST:
     os.environ["PATH"] = d + os.pathsep + os.environ["PATH"]
 
-def readLastalProgName(lastdbIndexName):
+def readLastdbData(lastdbIndexName):
     bitsPerInt = "32"
     with open(lastdbIndexName + ".prj") as f:
         for line in f:
+            if line.startswith("alphabet="):
+                alphabet = line.split("=")[1].strip()
             if line.startswith("integersize="):
                 bitsPerInt = line.split("=")[1].strip()
     suffix = "" if bitsPerInt == "32" else str(int(bitsPerInt) // 8)
-    return "lastal" + suffix
+    lastalProgName = "lastal" + suffix
+    return lastalProgName, alphabet
 
-def fixedLastalArgs(opts, lastalProgName):
+def fixedLastalArgs(opts, lastalProgName, alphabet):
     x = [lastalProgName, "-j7"]
     if opts.D: x.append("-D" + opts.D)
     if opts.E: x.append("-E" + opts.E)
@@ -732,11 +735,13 @@ def fixedLastalArgs(opts, lastalProgName):
     if opts.X: x.append("-X" + opts.X)
     if opts.Q: x.append("-Q" + opts.Q)
     if opts.verbose: x.append("-" + "v" * opts.verbose)
-    if opts.codon:
-        x.append("-K1")
-    else:
+    if len(alphabet) < 20:
         x.append("--split-n")
         x.append("--split-m=0.01")  # xxx ???
+    else:
+        if opts.revsym:
+            raise RuntimeError("--revsym is for DNA only")
+        x.append("-K1")
     return x
 
 def process(args, inStream):
@@ -750,8 +755,15 @@ def versionFromLastal():
 
 def doTraining(opts, args):
     tryToMakeChildProgramsFindable()
-    lastalProgName = readLastalProgName(args[0])
+    lastalProgName, alphabet = readLastdbData(args[0])
     lastalVersion = versionFromLastal()
+
+    if not opts.p and (not opts.Q or opts.Q in ("0", "fastx", "keep")):
+        if not opts.r: opts.r = "5" if len(alphabet) < 20 else "12"
+        if not opts.q: opts.q = "5" if len(alphabet) < 20 else "7"
+        if not opts.a: opts.a = "15"
+        if not opts.b: opts.b = "3"
+
     print("# lastal version:", lastalVersion)
     print("# maximum percent identity:", opts.pid)
 
@@ -767,7 +779,7 @@ def doTraining(opts, args):
         writeScoreMatrixFunc = writeScoreMatrix
         codonMatches = None
 
-        lastalArgs = fixedLastalArgs(opts, lastalProgName)
+        lastalArgs = fixedLastalArgs(opts, lastalProgName, alphabet)
         if opts.r: lastalArgs.append("-r" + opts.r)
         if opts.q: lastalArgs.append("-q" + opts.q)
         if opts.p: lastalArgs.append("-p" + opts.p)
@@ -825,7 +837,7 @@ def doTraining(opts, args):
             if parameters in oldParameters:
                 break
         oldParameters.append(parameters)
-        lastalArgs = fixedLastalArgs(opts, lastalProgName)
+        lastalArgs = fixedLastalArgs(opts, lastalProgName, alphabet)
         lastalArgs.append("-t{0:.6}".format(scale))
         lastalArgs.append("-p-")
         proc = process(lastalArgs + args, subprocess.PIPE)
@@ -895,10 +907,10 @@ if __name__ == "__main__":
     op.add_option_group(og)
 
     og = optparse.OptionGroup(op, "Initial parameter options")
-    og.add_option("-r", metavar="SCORE",
-                  help="match score (default: 6 if Q>=1, else 5)")
-    og.add_option("-q", metavar="COST",
-                  help="mismatch cost (default: 18 if Q>=1, else 5)")
+    og.add_option("-r", metavar="SCORE", help=
+                  "match score   (default:  6 if Q>=1, or 5 if DNA, or 12)")
+    og.add_option("-q", metavar="COST", help=
+                  "mismatch cost (default: 18 if Q>=1, or 5 if DNA, or  7)")
     og.add_option("-p", metavar="NAME", help="match/mismatch score matrix")
     og.add_option("-a", metavar="COST",
                   help="gap existence cost (default: 21 if Q>=1, else 15)")
@@ -954,11 +966,6 @@ if __name__ == "__main__":
         if not opts.A: opts.A = opts.a
         if not opts.B: opts.B = opts.b
         opts.S = None
-    if not opts.p and (not opts.Q or opts.Q in ("0", "fastx", "keep")):
-        if not opts.r: opts.r = "5"
-        if not opts.q: opts.q = "5"
-        if not opts.a: opts.a = "15"
-        if not opts.b: opts.b = "3"
 
     try: lastTrain(opts, args)
     except KeyboardInterrupt: pass  # avoid silly error message


=====================================
data/PSEUDO.seed
=====================================
@@ -0,0 +1,8 @@
+# This seeding scheme slightly increases sensitivity of
+# DNA-versus-protein search for pseudogenes.
+
+1  A C D E F G H I K L M N P Q R S T V W Y *
+0  ACDEFGHIKLMNPQRSTVWY*
+2  ACST DEN FHWY G KQR* P ILMV
+
+1120


=====================================
data/RY16-11.seed
=====================================
@@ -1,5 +1,7 @@
 # This DNA seeding scheme reduces run time and memory use, by only
-# seeking seeds at ~1/16 of positions in each sequence.
+# seeking seeds at ~1/16 of positions in each sequence.  (From "How to
+# optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+# JL Spouge.)
 
 #abbreviation RY16
 


=====================================
data/RY32-12.seed
=====================================
@@ -1,5 +1,7 @@
 # This DNA seeding scheme reduces run time and memory use, by only
-# seeking seeds at ~1/32 of positions in each sequence.
+# seeking seeds at ~1/32 of positions in each sequence.  (From "How to
+# optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+# JL Spouge.)
 
 #abbreviation RY32
 


=====================================
data/RY4-9.seed
=====================================
@@ -1,5 +1,7 @@
 # This DNA seeding scheme reduces run time and memory use, by only
-# seeking seeds at ~1/4 of positions in each sequence.
+# seeking seeds at ~1/4 of positions in each sequence.  (From "How to
+# optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+# JL Spouge.)
 
 #abbreviation RY4
 


=====================================
data/RY8-10.seed
=====================================
@@ -1,5 +1,7 @@
 # This DNA seeding scheme reduces run time and memory use, by only
-# seeking seeds at ~1/8 of positions in each sequence.
+# seeking seeds at ~1/8 of positions in each sequence.  (From "How to
+# optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+# JL Spouge.)
 
 #abbreviation RY8
 


=====================================
doc/last-papers.rst
=====================================
@@ -122,16 +122,23 @@ research to society.
 
     __ https://doi.org/10.1093/bioinformatics/btaa1054
 
-    Describes the ``lastdb -u RY`` sparsity options.
+    Describes the ``lastdb -u RY`` sparsity options, for LAST version < 1407.
 
 15. `Improved DNA-versus-protein homology search for protein fossils`__.
-    Yao Y, Frith MC.
+    Yao Y, Frith MC.  IEEE/ACM Trans Comput Biol Bioinform. 2022
 
-    __ https://doi.org/10.1007/978-3-030-74432-8_11
+    __ https://doi.org/10.1109/TCBB.2022.3177855
 
     Describes "new-style" DNA-versus-protein search with
     ``last-train --codon``.
 
+16. `How to optimally sample a sequence for rapid analysis`__.
+    Frith MC, Shaw J, Spouge JL.
+
+    __ https://doi.org/10.1101/2022.08.18.504476
+
+    Describes the ``lastdb -u RY`` sparsity options, for LAST version >= 1407.
+
 External methods
 ----------------
 


=====================================
doc/last-seeds.rst
=====================================
@@ -159,6 +159,21 @@ And this pattern::
 It sets this lastal default:
 -r6 -q18 -a21 -b9
 
+PSEUDO
+------
+
+This seeding scheme slightly increases sensitivity of
+DNA-versus-protein search for pseudogenes.
+It uses this seed alphabet::
+
+  1  A C D E F G H I K L M N P Q R S T V W Y *
+  0  ACDEFGHIKLMNPQRSTVWY*
+  2  ACST DEN FHWY G KQR* P ILMV
+
+And this pattern::
+
+  1120
+
 YASS
 ----
 
@@ -179,7 +194,9 @@ RY4-9 (abbreviation: RY4)
 -------------------------
 
 This DNA seeding scheme reduces run time and memory use, by only
-seeking seeds at ~1/4 of positions in each sequence.
+seeking seeds at ~1/4 of positions in each sequence.  (From "How to
+optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+JL Spouge.)
 It uses this seed alphabet::
 
   R  A G
@@ -227,7 +244,9 @@ RY8-10 (abbreviation: RY8)
 --------------------------
 
 This DNA seeding scheme reduces run time and memory use, by only
-seeking seeds at ~1/8 of positions in each sequence.
+seeking seeds at ~1/8 of positions in each sequence.  (From "How to
+optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+JL Spouge.)
 It uses this seed alphabet::
 
   R  A G
@@ -275,7 +294,9 @@ RY16-11 (abbreviation: RY16)
 ----------------------------
 
 This DNA seeding scheme reduces run time and memory use, by only
-seeking seeds at ~1/16 of positions in each sequence.
+seeking seeds at ~1/16 of positions in each sequence.  (From "How to
+optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+JL Spouge.)
 It uses this seed alphabet::
 
   R  A G
@@ -323,7 +344,9 @@ RY32-12 (abbreviation: RY32)
 ----------------------------
 
 This DNA seeding scheme reduces run time and memory use, by only
-seeking seeds at ~1/32 of positions in each sequence.
+seeking seeds at ~1/32 of positions in each sequence.  (From "How to
+optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+JL Spouge.)
 It uses this seed alphabet::
 
   R  A G


=====================================
src/LastdbArguments.cc
=====================================
@@ -50,7 +50,7 @@ Main Options:\n\
  -h, --help  show all options and their default settings, and exit\n\
  -p  interpret the sequences as proteins\n\
  -c  soft-mask lowercase letters (in reference *and* query sequences)\n\
- -u  seeding scheme (default: YASS for DNA, else exact-match seeds)\n\
+ -u  seeding scheme (default: YASS if DNA, else PSEUDO if -q, else exact-match)\n\
  -P  number of parallel threads (default: " + stringify(numOfThreads) + ")";
 
   std::string help = usage + "\n\


=====================================
src/getoptUtil.hh
=====================================
@@ -6,8 +6,13 @@
 
 #include <getopt.h>
 
-inline void resetGetopt() {
-  optind = 1;  // xxx ???
+inline void resetGetopt() {  // XXX fragile voodoo
+#ifdef __GLIBC__
+  optind = 0;
+#else
+  optind = 1;
+  //optreset = 1;  // XXX ???
+#endif
 }
 
 #endif


=====================================
src/lastdb.cc
=====================================
@@ -77,9 +77,10 @@ static void makeSubsetSeeds( std::vector< CyclicSubsetSeed >& seeds,
     }
   }
   else{
-    std::string s = (alph.letters == alph.dna)
-      ? CyclicSubsetSeed::stringFromName( "YASS" )
-      : CyclicSubsetSeed::stringFromPatterns( "1", a );
+    std::string s =
+      (alph.letters == alph.dna) ? CyclicSubsetSeed::stringFromName("YASS") :
+      args.isAddStops ? CyclicSubsetSeed::stringFromName("PSEUDO") :
+      CyclicSubsetSeed::stringFromPatterns("1", a);
     CyclicSubsetSeed::addPatterns(seeds, s, isCaseSens, alph.encode, a);
   }
 


=====================================
src/makefile
=====================================
@@ -144,7 +144,7 @@ ScoreMatrixData.hh: ../data/*.mat
 	../build/mat-inc.sh ../data/*.mat > $@
 
 VERSION1 = git describe --dirty
-VERSION2 = echo ' (HEAD -> main, tag: 1407) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
+VERSION2 = echo ' (HEAD -> main, tag: 1411) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
 
 VERSION = \"`test -e ../.git && $(VERSION1) || $(VERSION2)`\"
 


=====================================
test/last-test.out
=====================================
@@ -2709,6 +2709,11 @@ s Q2LCP8 88  53 + 492 MetIleIleLysMetPheGluGluGlyValThrGluGlyLysLysThrLysIleThrA
 s S1_40   3 159 - 579 ATTCTGCTTTCGCTCGTCATTGGCGCCGTTACCTGTTTTCGCCGACTGCCGACAACGCTTGCTGGTCGCATTTATTCCTACGTAAGCATTATCGGCCATTTCAGCTTCCTGGTGTTCGCCACCTACTTGCTGATCCTCTTCCCGCTGACTTTATCGTCG
q S1_40               P|N_kqaM3BuHMQ~H~IT~>`VdH,VHZ'O13q~HfOn\2W]HRScDMkG\Hp<CGror?v4]3bdI\OB\4:UAI?@`MNQHdSKNZAeO6U>~1C7&JI58.YCWaaO5~A@EvMZLKND8QznFq$=SGLkJ=fJYeQ:~G/WL-N;D?HBN-9:
 
+a score=36
+s Q2LCP8 309 22 + 492 SerAlaLeuLeuIleLeuMetLeuAlaGlySerMetSerGluGluLeuMetValAsnSerValTyr
+s S1_50   85 66 - 262 TCCGTTTGGTTTATTTTGCTCATCATAAGCCCCATGGCAGATGACATTTTGGTTACTGCAGAATAT
+q S1_50               <SL;-.;2KW<-=IG?X.Q=4?(G.7;2ECE>6EnQIX3E=/MSL,-\OLQ>:FJ;,4^D9>>:P>
+
 a score=38
 s Q2LCP8 422 28 + 492 GlyValMetIleTyrTyrMetAsnLeuValLysIleIleIleIleAspLys---ValGlnThrHisGluGlnGlyValAlaGluLeu
 s S1_54  177 87 + 662 GGCGCGATGATGCTGCAAGCGCCGCAGGTCGCCCTCATTGTGGATGACGAATTTATTCAAACGCATACCGTCGGCTTTGACGAGCTA


=====================================
test/last-test.sh
=====================================
@@ -23,6 +23,7 @@ db=/tmp/last-test
 trap 'rm -f $db*' EXIT
 
 {
+    lastdb -uMURPHY10 $db /dev/null  # this triggered a getopt reset bug
     lastdb $db /dev/null
     lastdb -D $db
     lastal $db /dev/null



View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/b6c8fddbc13ca80437dfaa097dd69be15d610168

-- 
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/b6c8fddbc13ca80437dfaa097dd69be15d610168
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220922/2763daf6/attachment-0001.htm>