[med-svn] [Git][med-team/lamassemble][upstream] New upstream version 1.6.0

Mon May 8 20:06:34 BST 2023


Nilesh Patra pushed to branch upstream at Debian Med / lamassemble


Commits:
e8aa9065 by Nilesh Patra at 2023-05-09T00:29:02+05:30
New upstream version 1.6.0
- - - - -


3 changed files:

- README.md
- lamassemble
- setup.py


Changes:

=====================================
README.md
=====================================
@@ -11,7 +11,7 @@ from huge tandem repeats, wrong parts of the reads may get merged.
 interface](https://mafft.cbrc.jp/alignment/server/index-rawreads.html).
 
 **Usage option 2:** run it on your computer.  You can install it from
-[bioconda][]:
+[Debian Med][] or [bioconda][]:
 
     conda install -c bioconda lamassemble
 
@@ -26,7 +26,7 @@ on your computer, i.e. put them in your `PATH`.  You can install
 
 After installing, you can run it like this:
 
-    lamassemble last-train.mat sequences.fx > consensus.fa
+    lamassemble last-train-file sequences.fx > consensus.fa
 
 * The sequence file may be in fasta or fastq format (it makes no difference).
 * It's OK to use gzipped (`.gz`) files.
@@ -34,19 +34,20 @@ After installing, you can run it like this:
 You need to give it a file made by `last-train`, with the rates of
 insertions, deletions, and substitutions in the reads.
 
-## Included `last-train` files
+## Built-in `last-train` files
 
-The `train` directory has these files:
+These files (with case-insensitive names) probably won't be ideal for
+your data, but they might be good enough:
 
-* `promethion.mat`: from human DNA sequenced with PromethION R9.4,
+* `promethion-2019`: from human DNA sequenced with PromethION R9.4,
   base-called with Guppy 1.4.0 ([De Coster et al. Genome
   Res. 2019](https://www.ncbi.nlm.nih.gov/pubmed/31186302),
   ERR2631604).
 
-* `promethion-rna.mat`: from human RNA (direct RNA), sequenced with
+* `promethion-rna-2019`: from human RNA (direct RNA), sequenced with
   PromethION R9.4, base-called with Albacore.
 
-* `sequel-II-CLR.mat`: from human DNA sequenced with PacBio Sequel II,
+* `sequel-II-CLR-2019`: from human DNA sequenced with PacBio Sequel II,
   Continuous Long Reads ([Wenger et
   al. Nat. Biotechnol. 2019](https://www.ncbi.nlm.nih.gov/pubmed/31406327),
   SRR9972588).
@@ -96,6 +97,11 @@ similarities by increasing option `-m` (and/or decreasing `-W`).
 
 - `-c`, `--consensus`: just make a consensus, of already-aligned sequences.
 
+- `-f FMT`, `--format=FMT`: consensus output format, fasta/fa or
+  fastq/fq.  Fastq shows the error probability of each base (assuming
+  the alignment is correct, so over-optimistic).  The format name is
+  case-insensitive.
+
 - `-g G`, `--gap-max=G`: make the consensus sequence from alignment
   columns with <= G% gaps.
 
@@ -144,3 +150,4 @@ similarities by increasing option `-m` (and/or decreasing `-W`).
   bit, but increase run time.
 
 [bioconda]: https://bioconda.github.io/user/install.html
+[Debian Med]: https://www.debian.org/devel/debian-med/


=====================================
lamassemble
=====================================
@@ -5,7 +5,6 @@
 from __future__ import print_function
 
 import collections
-import functools
 import gzip
 import itertools
 import logging
@@ -48,7 +47,52 @@ def nameAndSeqFromFastx(seqLines):
 
 ##### Routines for getting score parameters from a last-train file:
 
-def parametersFromLastTrain(lines):
+builtinTrainFiles = {
+    "promethion-2019": """
+# scale of score parameters: 4.5512
+# delOpenProb: 0.0369615
+# insOpenProb: 0.0340916
+# delExtendProb: 0.439744
+# insExtendProb: 0.403943
+# probability matrix (query letters = columns, reference letters = rows):
+#   A              C              G              T             
+# A 0.278908       0.00109169     0.012899       0.00107855    
+# C 0.00154506     0.197869       0.00044938     0.00272089    
+# G 0.0272926      0.000508552    0.177789       0.000859018   
+# T 0.00126293     0.00328421     0.000545484    0.291896
+""",
+    "promethion-rna-2019" :"""
+# scale of score parameters: 4.5512
+# delOpenProb: 0.0578071
+# insOpenProb: 0.0297262
+# delExtendProb: 0.438808
+# insExtendProb: 0.380129
+# probability matrix (query letters = columns, reference letters = rows):
+#   A              C              G              T             
+# A 0.246603       0.00134        0.00742782     0.00762015    
+# C 0.00168584     0.221462       0.000388905    0.0143456     
+# G 0.00693157     0.000421241    0.239649       0.000805103   
+# T 0.00803073     0.0131941      0.000617237    0.229477      
+""",
+    "sequel-ii-clr-2019": """
+# scale of score parameters: 4.5512
+# delOpenProb: 0.0329087
+# insOpenProb: 0.0343556
+# delExtendProb: 0.0674455
+# insExtendProb: 0.288643
+# probability matrix (query letters = columns, reference letters = rows):
+#   A              C              G              T             
+# A 0.292742       0.00340172     0.000378204    0.00239594    
+# C 0.000849175    0.20731        0.000104201    0.00037284    
+# G 0.0013137      0.000304338    0.187333       0.00680248    
+# T 0.0135957      0.000785525    0.00475765     0.277554      
+"""
+}
+
+def parametersFromLastTrain(trainFile):
+    builtin = builtinTrainFiles.get(trainFile.lower())
+    lines = builtin.splitlines() if builtin else openFile(trainFile)
+
     alphabetSize = 4
     scale = delOpenProb = delExtendProb = insOpenProb = insExtendProb = -1
     probMatrix = range(alphabetSize)
@@ -242,12 +286,19 @@ def columnScore(priorScores, scoreMatrix, column, fwdBaseIndex):
     revScore = strandScore(scoreMatrix[revBaseIndex], column, "tgca")
     return priorScores[fwdBaseIndex] + fwdScore + revScore
 
+def asciiFromInvProb(invProb):
+    errProb = 1 - 1 / invProb
+    s = int(math.floor(-10 * math.log10(max(errProb, 1e-10))))
+    return chr(min(s + 33, 126))
+
 def consensusCol(priorScores, scoreMatrix, column):
     bases = "acgt"
     column = column.replace("U", "T") # keep
-    func = functools.partial(columnScore, priorScores, scoreMatrix, column)
-    m = max(range(len(bases)), key=func)
-    return bases[m]
+    scores = [columnScore(priorScores, scoreMatrix, column, i)
+              for i in range(len(bases))]
+    j, m = max(enumerate(scores), key=itemgetter(1))
+    invProb = sum(math.exp(i - m) for i in scores)
+    return bases[j], invProb
 
 def consensusSeq(opts, probMatrix, alignedSequences):
     if not alignedSequences:
@@ -259,11 +310,16 @@ def consensusSeq(opts, probMatrix, alignedSequences):
 
     rows = [seq for name, seq in alignedSequences]
     cols = alignmentColumnsForConsensus(opts, rows)
-    return "".join(consensusCol(priorScores, scoreMatrix, i) for i in cols)
+    return [consensusCol(priorScores, scoreMatrix, i) for i in cols]
 
 def printConsensusSeq(opts, probMatrix, alignedSequences):
-    s = consensusSeq(opts, probMatrix, alignedSequences)
-    printFasta(opts.name, s, sys.stdout)
+    c = consensusSeq(opts, probMatrix, alignedSequences)
+    seq = "".join(i[0] for i in c)
+    if opts.format.lower() in ("fastq", "fq"):
+        qual = "".join(asciiFromInvProb(i[1]) for i in c)
+        print("@" + opts.name, seq, "+", qual, sep="\n")
+    else:
+        print(">" + opts.name, seq, sep="\n")
 
 ##### Routines for aligning the sequences:
 
@@ -620,7 +676,7 @@ def main(opts, trainFile, seqFile):
     logLevel = logging.INFO if opts.verbose else logging.WARNING
     logging.basicConfig(format="%(filename)s: %(message)s", level=logLevel)
 
-    scale, probMatrix, gapProbs = parametersFromLastTrain(openFile(trainFile))
+    scale, probMatrix, gapProbs = parametersFromLastTrain(trainFile)
     probMatrix = list(matrixWithComplementSymmetricRows(probMatrix))
 
     fastx = fastxInput(openFile(seqFile))
@@ -657,6 +713,8 @@ if __name__ == "__main__":
                   help="print an alignment, not a consensus")
     op.add_option("-c", "--consensus", action="store_true",
                   help="just make a consensus, of already-aligned sequences")
+    op.add_option("-f", "--format", metavar="FMT", default="fasta", help=
+                  "output format: fasta/fa or fastq/fq (default=%default)")
     op.add_option("-g", "--gap-max", metavar="G", type="float", default=50,
                   help="use alignment columns with <= G% gaps "
                   "(default=%default)")


=====================================
setup.py
=====================================
@@ -1,6 +1,6 @@
 import setuptools
 
-commitInfo = " (HEAD -> master, tag: 1.4.2)".strip("( )").split()
+commitInfo = " (HEAD -> master, tag: 1.6.0)".strip("( )").split()
 version = commitInfo[commitInfo.index("tag:") + 1].rstrip(",")
 
 setuptools.setup(



View it on GitLab: https://salsa.debian.org/med-team/lamassemble/-/commit/e8aa90657e6272f5701c903830c7317830cd2bbf

-- 
View it on GitLab: https://salsa.debian.org/med-team/lamassemble/-/commit/e8aa90657e6272f5701c903830c7317830cd2bbf
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230508/84aec295/attachment-0001.htm>