[med-svn] [Git][med-team/last-align][upstream] New upstream version 1411
Andreas Tille (@tille)
gitlab at salsa.debian.org
Thu Sep 22 08:38:57 BST 2022
Andreas Tille pushed to branch upstream at Debian Med / last-align
Commits:
b6c8fddb by Andreas Tille at 2022-09-22T09:28:02+02:00
New upstream version 1411
- - - - -
14 changed files:
- bin/last-train
- + data/PSEUDO.seed
- data/RY16-11.seed
- data/RY32-12.seed
- data/RY4-9.seed
- data/RY8-10.seed
- doc/last-papers.rst
- doc/last-seeds.rst
- src/LastdbArguments.cc
- src/getoptUtil.hh
- src/lastdb.cc
- src/makefile
- test/last-test.out
- test/last-test.sh
Changes:
=====================================
bin/last-train
=====================================
@@ -708,16 +708,19 @@ def tryToMakeChildProgramsFindable():
# put it first, to avoid getting older versions of LAST:
os.environ["PATH"] = d + os.pathsep + os.environ["PATH"]
-def readLastalProgName(lastdbIndexName):
+def readLastdbData(lastdbIndexName):
bitsPerInt = "32"
with open(lastdbIndexName + ".prj") as f:
for line in f:
+ if line.startswith("alphabet="):
+ alphabet = line.split("=")[1].strip()
if line.startswith("integersize="):
bitsPerInt = line.split("=")[1].strip()
suffix = "" if bitsPerInt == "32" else str(int(bitsPerInt) // 8)
- return "lastal" + suffix
+ lastalProgName = "lastal" + suffix
+ return lastalProgName, alphabet
-def fixedLastalArgs(opts, lastalProgName):
+def fixedLastalArgs(opts, lastalProgName, alphabet):
x = [lastalProgName, "-j7"]
if opts.D: x.append("-D" + opts.D)
if opts.E: x.append("-E" + opts.E)
@@ -732,11 +735,13 @@ def fixedLastalArgs(opts, lastalProgName):
if opts.X: x.append("-X" + opts.X)
if opts.Q: x.append("-Q" + opts.Q)
if opts.verbose: x.append("-" + "v" * opts.verbose)
- if opts.codon:
- x.append("-K1")
- else:
+ if len(alphabet) < 20:
x.append("--split-n")
x.append("--split-m=0.01") # xxx ???
+ else:
+ if opts.revsym:
+ raise RuntimeError("--revsym is for DNA only")
+ x.append("-K1")
return x
def process(args, inStream):
@@ -750,8 +755,15 @@ def versionFromLastal():
def doTraining(opts, args):
tryToMakeChildProgramsFindable()
- lastalProgName = readLastalProgName(args[0])
+ lastalProgName, alphabet = readLastdbData(args[0])
lastalVersion = versionFromLastal()
+
+ if not opts.p and (not opts.Q or opts.Q in ("0", "fastx", "keep")):
+ if not opts.r: opts.r = "5" if len(alphabet) < 20 else "12"
+ if not opts.q: opts.q = "5" if len(alphabet) < 20 else "7"
+ if not opts.a: opts.a = "15"
+ if not opts.b: opts.b = "3"
+
print("# lastal version:", lastalVersion)
print("# maximum percent identity:", opts.pid)
@@ -767,7 +779,7 @@ def doTraining(opts, args):
writeScoreMatrixFunc = writeScoreMatrix
codonMatches = None
- lastalArgs = fixedLastalArgs(opts, lastalProgName)
+ lastalArgs = fixedLastalArgs(opts, lastalProgName, alphabet)
if opts.r: lastalArgs.append("-r" + opts.r)
if opts.q: lastalArgs.append("-q" + opts.q)
if opts.p: lastalArgs.append("-p" + opts.p)
@@ -825,7 +837,7 @@ def doTraining(opts, args):
if parameters in oldParameters:
break
oldParameters.append(parameters)
- lastalArgs = fixedLastalArgs(opts, lastalProgName)
+ lastalArgs = fixedLastalArgs(opts, lastalProgName, alphabet)
lastalArgs.append("-t{0:.6}".format(scale))
lastalArgs.append("-p-")
proc = process(lastalArgs + args, subprocess.PIPE)
@@ -895,10 +907,10 @@ if __name__ == "__main__":
op.add_option_group(og)
og = optparse.OptionGroup(op, "Initial parameter options")
- og.add_option("-r", metavar="SCORE",
- help="match score (default: 6 if Q>=1, else 5)")
- og.add_option("-q", metavar="COST",
- help="mismatch cost (default: 18 if Q>=1, else 5)")
+ og.add_option("-r", metavar="SCORE", help=
+ "match score (default: 6 if Q>=1, or 5 if DNA, or 12)")
+ og.add_option("-q", metavar="COST", help=
+ "mismatch cost (default: 18 if Q>=1, or 5 if DNA, or 7)")
og.add_option("-p", metavar="NAME", help="match/mismatch score matrix")
og.add_option("-a", metavar="COST",
help="gap existence cost (default: 21 if Q>=1, else 15)")
@@ -954,11 +966,6 @@ if __name__ == "__main__":
if not opts.A: opts.A = opts.a
if not opts.B: opts.B = opts.b
opts.S = None
- if not opts.p and (not opts.Q or opts.Q in ("0", "fastx", "keep")):
- if not opts.r: opts.r = "5"
- if not opts.q: opts.q = "5"
- if not opts.a: opts.a = "15"
- if not opts.b: opts.b = "3"
try: lastTrain(opts, args)
except KeyboardInterrupt: pass # avoid silly error message
=====================================
data/PSEUDO.seed
=====================================
@@ -0,0 +1,8 @@
+# This seeding scheme slightly increases sensitivity of
+# DNA-versus-protein search for pseudogenes.
+
+1 A C D E F G H I K L M N P Q R S T V W Y *
+0 ACDEFGHIKLMNPQRSTVWY*
+2 ACST DEN FHWY G KQR* P ILMV
+
+1120
=====================================
data/RY16-11.seed
=====================================
@@ -1,5 +1,7 @@
# This DNA seeding scheme reduces run time and memory use, by only
-# seeking seeds at ~1/16 of positions in each sequence.
+# seeking seeds at ~1/16 of positions in each sequence. (From "How to
+# optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+# JL Spouge.)
#abbreviation RY16
=====================================
data/RY32-12.seed
=====================================
@@ -1,5 +1,7 @@
# This DNA seeding scheme reduces run time and memory use, by only
-# seeking seeds at ~1/32 of positions in each sequence.
+# seeking seeds at ~1/32 of positions in each sequence. (From "How to
+# optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+# JL Spouge.)
#abbreviation RY32
=====================================
data/RY4-9.seed
=====================================
@@ -1,5 +1,7 @@
# This DNA seeding scheme reduces run time and memory use, by only
-# seeking seeds at ~1/4 of positions in each sequence.
+# seeking seeds at ~1/4 of positions in each sequence. (From "How to
+# optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+# JL Spouge.)
#abbreviation RY4
=====================================
data/RY8-10.seed
=====================================
@@ -1,5 +1,7 @@
# This DNA seeding scheme reduces run time and memory use, by only
-# seeking seeds at ~1/8 of positions in each sequence.
+# seeking seeds at ~1/8 of positions in each sequence. (From "How to
+# optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+# JL Spouge.)
#abbreviation RY8
=====================================
doc/last-papers.rst
=====================================
@@ -122,16 +122,23 @@ research to society.
__ https://doi.org/10.1093/bioinformatics/btaa1054
- Describes the ``lastdb -u RY`` sparsity options.
+ Describes the ``lastdb -u RY`` sparsity options, for LAST version < 1407.
15. `Improved DNA-versus-protein homology search for protein fossils`__.
- Yao Y, Frith MC.
+ Yao Y, Frith MC. IEEE/ACM Trans Comput Biol Bioinform. 2022
- __ https://doi.org/10.1007/978-3-030-74432-8_11
+ __ https://doi.org/10.1109/TCBB.2022.3177855
Describes "new-style" DNA-versus-protein search with
``last-train --codon``.
+16. `How to optimally sample a sequence for rapid analysis`__.
+ Frith MC, Shaw J, Spouge JL.
+
+ __ https://doi.org/10.1101/2022.08.18.504476
+
+ Describes the ``lastdb -u RY`` sparsity options, for LAST version >= 1407.
+
External methods
----------------
=====================================
doc/last-seeds.rst
=====================================
@@ -159,6 +159,21 @@ And this pattern::
It sets this lastal default:
-r6 -q18 -a21 -b9
+PSEUDO
+------
+
+This seeding scheme slightly increases sensitivity of
+DNA-versus-protein search for pseudogenes.
+It uses this seed alphabet::
+
+ 1 A C D E F G H I K L M N P Q R S T V W Y *
+ 0 ACDEFGHIKLMNPQRSTVWY*
+ 2 ACST DEN FHWY G KQR* P ILMV
+
+And this pattern::
+
+ 1120
+
YASS
----
@@ -179,7 +194,9 @@ RY4-9 (abbreviation: RY4)
-------------------------
This DNA seeding scheme reduces run time and memory use, by only
-seeking seeds at ~1/4 of positions in each sequence.
+seeking seeds at ~1/4 of positions in each sequence. (From "How to
+optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+JL Spouge.)
It uses this seed alphabet::
R A G
@@ -227,7 +244,9 @@ RY8-10 (abbreviation: RY8)
--------------------------
This DNA seeding scheme reduces run time and memory use, by only
-seeking seeds at ~1/8 of positions in each sequence.
+seeking seeds at ~1/8 of positions in each sequence. (From "How to
+optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+JL Spouge.)
It uses this seed alphabet::
R A G
@@ -275,7 +294,9 @@ RY16-11 (abbreviation: RY16)
----------------------------
This DNA seeding scheme reduces run time and memory use, by only
-seeking seeds at ~1/16 of positions in each sequence.
+seeking seeds at ~1/16 of positions in each sequence. (From "How to
+optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+JL Spouge.)
It uses this seed alphabet::
R A G
@@ -323,7 +344,9 @@ RY32-12 (abbreviation: RY32)
----------------------------
This DNA seeding scheme reduces run time and memory use, by only
-seeking seeds at ~1/32 of positions in each sequence.
+seeking seeds at ~1/32 of positions in each sequence. (From "How to
+optimally sample a sequence for rapid analysis" by MC Frith, J Shaw,
+JL Spouge.)
It uses this seed alphabet::
R A G
=====================================
src/LastdbArguments.cc
=====================================
@@ -50,7 +50,7 @@ Main Options:\n\
-h, --help show all options and their default settings, and exit\n\
-p interpret the sequences as proteins\n\
-c soft-mask lowercase letters (in reference *and* query sequences)\n\
- -u seeding scheme (default: YASS for DNA, else exact-match seeds)\n\
+ -u seeding scheme (default: YASS if DNA, else PSEUDO if -q, else exact-match)\n\
-P number of parallel threads (default: " + stringify(numOfThreads) + ")";
std::string help = usage + "\n\
=====================================
src/getoptUtil.hh
=====================================
@@ -6,8 +6,13 @@
#include <getopt.h>
-inline void resetGetopt() {
- optind = 1; // xxx ???
+inline void resetGetopt() { // XXX fragile voodoo
+#ifdef __GLIBC__
+ optind = 0;
+#else
+ optind = 1;
+ //optreset = 1; // XXX ???
+#endif
}
#endif
=====================================
src/lastdb.cc
=====================================
@@ -77,9 +77,10 @@ static void makeSubsetSeeds( std::vector< CyclicSubsetSeed >& seeds,
}
}
else{
- std::string s = (alph.letters == alph.dna)
- ? CyclicSubsetSeed::stringFromName( "YASS" )
- : CyclicSubsetSeed::stringFromPatterns( "1", a );
+ std::string s =
+ (alph.letters == alph.dna) ? CyclicSubsetSeed::stringFromName("YASS") :
+ args.isAddStops ? CyclicSubsetSeed::stringFromName("PSEUDO") :
+ CyclicSubsetSeed::stringFromPatterns("1", a);
CyclicSubsetSeed::addPatterns(seeds, s, isCaseSens, alph.encode, a);
}
=====================================
src/makefile
=====================================
@@ -144,7 +144,7 @@ ScoreMatrixData.hh: ../data/*.mat
../build/mat-inc.sh ../data/*.mat > $@
VERSION1 = git describe --dirty
-VERSION2 = echo ' (HEAD -> main, tag: 1407) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
+VERSION2 = echo ' (HEAD -> main, tag: 1411) ' | sed -e 's/.*tag: *//' -e 's/[,) ].*//'
VERSION = \"`test -e ../.git && $(VERSION1) || $(VERSION2)`\"
=====================================
test/last-test.out
=====================================
@@ -2709,6 +2709,11 @@ s Q2LCP8 88 53 + 492 MetIleIleLysMetPheGluGluGlyValThrGluGlyLysLysThrLysIleThrA
s S1_40 3 159 - 579 ATTCTGCTTTCGCTCGTCATTGGCGCCGTTACCTGTTTTCGCCGACTGCCGACAACGCTTGCTGGTCGCATTTATTCCTACGTAAGCATTATCGGCCATTTCAGCTTCCTGGTGTTCGCCACCTACTTGCTGATCCTCTTCCCGCTGACTTTATCGTCG
q S1_40 P|N_kqaM3BuHMQ~H~IT~>`VdH,VHZ'O13q~HfOn\2W]HRScDMkG\Hp<CGror?v4]3bdI\OB\4:UAI?@`MNQHdSKNZAeO6U>~1C7&JI58.YCWaaO5~A@EvMZLKND8QznFq$=SGLkJ=fJYeQ:~G/WL-N;D?HBN-9:
+a score=36
+s Q2LCP8 309 22 + 492 SerAlaLeuLeuIleLeuMetLeuAlaGlySerMetSerGluGluLeuMetValAsnSerValTyr
+s S1_50 85 66 - 262 TCCGTTTGGTTTATTTTGCTCATCATAAGCCCCATGGCAGATGACATTTTGGTTACTGCAGAATAT
+q S1_50 <SL;-.;2KW<-=IG?X.Q=4?(G.7;2ECE>6EnQIX3E=/MSL,-\OLQ>:FJ;,4^D9>>:P>
+
a score=38
s Q2LCP8 422 28 + 492 GlyValMetIleTyrTyrMetAsnLeuValLysIleIleIleIleAspLys---ValGlnThrHisGluGlnGlyValAlaGluLeu
s S1_54 177 87 + 662 GGCGCGATGATGCTGCAAGCGCCGCAGGTCGCCCTCATTGTGGATGACGAATTTATTCAAACGCATACCGTCGGCTTTGACGAGCTA
=====================================
test/last-test.sh
=====================================
@@ -23,6 +23,7 @@ db=/tmp/last-test
trap 'rm -f $db*' EXIT
{
+ lastdb -uMURPHY10 $db /dev/null # this triggered a getopt reset bug
lastdb $db /dev/null
lastdb -D $db
lastal $db /dev/null
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/b6c8fddbc13ca80437dfaa097dd69be15d610168
--
View it on GitLab: https://salsa.debian.org/med-team/last-align/-/commit/b6c8fddbc13ca80437dfaa097dd69be15d610168
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220922/2763daf6/attachment-0001.htm>
More information about the debian-med-commit
mailing list