[med-svn] [Git][med-team/lamassemble][upstream] New upstream version 1.6.0
Nilesh Patra (@nilesh)
gitlab at salsa.debian.org
Mon May 8 20:06:34 BST 2023
Nilesh Patra pushed to branch upstream at Debian Med / lamassemble
Commits:
e8aa9065 by Nilesh Patra at 2023-05-09T00:29:02+05:30
New upstream version 1.6.0
- - - - -
3 changed files:
- README.md
- lamassemble
- setup.py
Changes:
=====================================
README.md
=====================================
@@ -11,7 +11,7 @@ from huge tandem repeats, wrong parts of the reads may get merged.
interface](https://mafft.cbrc.jp/alignment/server/index-rawreads.html).
**Usage option 2:** run it on your computer. You can install it from
-[bioconda][]:
+[Debian Med][] or [bioconda][]:
conda install -c bioconda lamassemble
@@ -26,7 +26,7 @@ on your computer, i.e. put them in your `PATH`. You can install
After installing, you can run it like this:
- lamassemble last-train.mat sequences.fx > consensus.fa
+ lamassemble last-train-file sequences.fx > consensus.fa
* The sequence file may be in fasta or fastq format (it makes no difference).
* It's OK to use gzipped (`.gz`) files.
@@ -34,19 +34,20 @@ After installing, you can run it like this:
You need to give it a file made by `last-train`, with the rates of
insertions, deletions, and substitutions in the reads.
-## Included `last-train` files
+## Built-in `last-train` files
-The `train` directory has these files:
+These files (with case-insensitive names) probably won't be ideal for
+your data, but they might be good enough:
-* `promethion.mat`: from human DNA sequenced with PromethION R9.4,
+* `promethion-2019`: from human DNA sequenced with PromethION R9.4,
base-called with Guppy 1.4.0 ([De Coster et al. Genome
Res. 2019](https://www.ncbi.nlm.nih.gov/pubmed/31186302),
ERR2631604).
-* `promethion-rna.mat`: from human RNA (direct RNA), sequenced with
+* `promethion-rna-2019`: from human RNA (direct RNA), sequenced with
PromethION R9.4, base-called with Albacore.
-* `sequel-II-CLR.mat`: from human DNA sequenced with PacBio Sequel II,
+* `sequel-II-CLR-2019`: from human DNA sequenced with PacBio Sequel II,
Continuous Long Reads ([Wenger et
al. Nat. Biotechnol. 2019](https://www.ncbi.nlm.nih.gov/pubmed/31406327),
SRR9972588).
@@ -96,6 +97,11 @@ similarities by increasing option `-m` (and/or decreasing `-W`).
- `-c`, `--consensus`: just make a consensus, of already-aligned sequences.
+- `-f FMT`, `--format=FMT`: consensus output format, fasta/fa or
+ fastq/fq. Fastq shows the error probability of each base (assuming
+ the alignment is correct, so over-optimistic). The format name is
+ case-insensitive.
+
- `-g G`, `--gap-max=G`: make the consensus sequence from alignment
columns with <= G% gaps.
@@ -144,3 +150,4 @@ similarities by increasing option `-m` (and/or decreasing `-W`).
bit, but increase run time.
[bioconda]: https://bioconda.github.io/user/install.html
+[Debian Med]: https://www.debian.org/devel/debian-med/
=====================================
lamassemble
=====================================
@@ -5,7 +5,6 @@
from __future__ import print_function
import collections
-import functools
import gzip
import itertools
import logging
@@ -48,7 +47,52 @@ def nameAndSeqFromFastx(seqLines):
##### Routines for getting score parameters from a last-train file:
-def parametersFromLastTrain(lines):
+builtinTrainFiles = {
+ "promethion-2019": """
+# scale of score parameters: 4.5512
+# delOpenProb: 0.0369615
+# insOpenProb: 0.0340916
+# delExtendProb: 0.439744
+# insExtendProb: 0.403943
+# probability matrix (query letters = columns, reference letters = rows):
+# A C G T
+# A 0.278908 0.00109169 0.012899 0.00107855
+# C 0.00154506 0.197869 0.00044938 0.00272089
+# G 0.0272926 0.000508552 0.177789 0.000859018
+# T 0.00126293 0.00328421 0.000545484 0.291896
+""",
+ "promethion-rna-2019" :"""
+# scale of score parameters: 4.5512
+# delOpenProb: 0.0578071
+# insOpenProb: 0.0297262
+# delExtendProb: 0.438808
+# insExtendProb: 0.380129
+# probability matrix (query letters = columns, reference letters = rows):
+# A C G T
+# A 0.246603 0.00134 0.00742782 0.00762015
+# C 0.00168584 0.221462 0.000388905 0.0143456
+# G 0.00693157 0.000421241 0.239649 0.000805103
+# T 0.00803073 0.0131941 0.000617237 0.229477
+""",
+ "sequel-ii-clr-2019": """
+# scale of score parameters: 4.5512
+# delOpenProb: 0.0329087
+# insOpenProb: 0.0343556
+# delExtendProb: 0.0674455
+# insExtendProb: 0.288643
+# probability matrix (query letters = columns, reference letters = rows):
+# A C G T
+# A 0.292742 0.00340172 0.000378204 0.00239594
+# C 0.000849175 0.20731 0.000104201 0.00037284
+# G 0.0013137 0.000304338 0.187333 0.00680248
+# T 0.0135957 0.000785525 0.00475765 0.277554
+"""
+}
+
+def parametersFromLastTrain(trainFile):
+ builtin = builtinTrainFiles.get(trainFile.lower())
+ lines = builtin.splitlines() if builtin else openFile(trainFile)
+
alphabetSize = 4
scale = delOpenProb = delExtendProb = insOpenProb = insExtendProb = -1
probMatrix = range(alphabetSize)
@@ -242,12 +286,19 @@ def columnScore(priorScores, scoreMatrix, column, fwdBaseIndex):
revScore = strandScore(scoreMatrix[revBaseIndex], column, "tgca")
return priorScores[fwdBaseIndex] + fwdScore + revScore
+def asciiFromInvProb(invProb):
+ errProb = 1 - 1 / invProb
+ s = int(math.floor(-10 * math.log10(max(errProb, 1e-10))))
+ return chr(min(s + 33, 126))
+
def consensusCol(priorScores, scoreMatrix, column):
bases = "acgt"
column = column.replace("U", "T") # keep
- func = functools.partial(columnScore, priorScores, scoreMatrix, column)
- m = max(range(len(bases)), key=func)
- return bases[m]
+ scores = [columnScore(priorScores, scoreMatrix, column, i)
+ for i in range(len(bases))]
+ j, m = max(enumerate(scores), key=itemgetter(1))
+ invProb = sum(math.exp(i - m) for i in scores)
+ return bases[j], invProb
def consensusSeq(opts, probMatrix, alignedSequences):
if not alignedSequences:
@@ -259,11 +310,16 @@ def consensusSeq(opts, probMatrix, alignedSequences):
rows = [seq for name, seq in alignedSequences]
cols = alignmentColumnsForConsensus(opts, rows)
- return "".join(consensusCol(priorScores, scoreMatrix, i) for i in cols)
+ return [consensusCol(priorScores, scoreMatrix, i) for i in cols]
def printConsensusSeq(opts, probMatrix, alignedSequences):
- s = consensusSeq(opts, probMatrix, alignedSequences)
- printFasta(opts.name, s, sys.stdout)
+ c = consensusSeq(opts, probMatrix, alignedSequences)
+ seq = "".join(i[0] for i in c)
+ if opts.format.lower() in ("fastq", "fq"):
+ qual = "".join(asciiFromInvProb(i[1]) for i in c)
+ print("@" + opts.name, seq, "+", qual, sep="\n")
+ else:
+ print(">" + opts.name, seq, sep="\n")
##### Routines for aligning the sequences:
@@ -620,7 +676,7 @@ def main(opts, trainFile, seqFile):
logLevel = logging.INFO if opts.verbose else logging.WARNING
logging.basicConfig(format="%(filename)s: %(message)s", level=logLevel)
- scale, probMatrix, gapProbs = parametersFromLastTrain(openFile(trainFile))
+ scale, probMatrix, gapProbs = parametersFromLastTrain(trainFile)
probMatrix = list(matrixWithComplementSymmetricRows(probMatrix))
fastx = fastxInput(openFile(seqFile))
@@ -657,6 +713,8 @@ if __name__ == "__main__":
help="print an alignment, not a consensus")
op.add_option("-c", "--consensus", action="store_true",
help="just make a consensus, of already-aligned sequences")
+ op.add_option("-f", "--format", metavar="FMT", default="fasta", help=
+ "output format: fasta/fa or fastq/fq (default=%default)")
op.add_option("-g", "--gap-max", metavar="G", type="float", default=50,
help="use alignment columns with <= G% gaps "
"(default=%default)")
=====================================
setup.py
=====================================
@@ -1,6 +1,6 @@
import setuptools
-commitInfo = " (HEAD -> master, tag: 1.4.2)".strip("( )").split()
+commitInfo = " (HEAD -> master, tag: 1.6.0)".strip("( )").split()
version = commitInfo[commitInfo.index("tag:") + 1].rstrip(",")
setuptools.setup(
View it on GitLab: https://salsa.debian.org/med-team/lamassemble/-/commit/e8aa90657e6272f5701c903830c7317830cd2bbf
--
View it on GitLab: https://salsa.debian.org/med-team/lamassemble/-/commit/e8aa90657e6272f5701c903830c7317830cd2bbf
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230508/84aec295/attachment-0001.htm>
More information about the debian-med-commit
mailing list