[med-svn] [Git][med-team/salmid][upstream] 2 commits: New upstream version 0.122
Andreas Tille
gitlab at salsa.debian.org
Fri Dec 21 16:15:57 GMT 2018
Andreas Tille pushed to branch upstream at Debian Med / salmid
Commits:
f350e1fb by Andreas Tille at 2018-12-21T16:09:22Z
New upstream version 0.122
- - - - -
06240738 by Andreas Tille at 2018-12-21T16:14:47Z
New upstream version 0.1.23
- - - - -
9 changed files:
- README.md
- − __pycache__/version.cpython-34.pyc
- + pyproject.toml
- + salmid/__init__.py
- SalmID.py → salmid/core.py
- invA_mers_dict → salmid/invA_mers_dict
- rpoB_mers_dict → salmid/rpoB_mers_dict
- + salmid/version.py
- − version.py
Changes:
=====================================
README.md
=====================================
@@ -1,3 +1,5 @@
+[![DOI](https://zenodo.org/badge/97020646.svg)](https://zenodo.org/badge/latestdoi/97020646)
+
# SalmID
Rapid tool to check taxonomic ID of single isolate samples. Currently only IDs Salmonella species and subspecies, and some common contaminants (Listeria, Escherichia).
@@ -5,26 +7,34 @@ Rapid tool to check taxonomic ID of single isolate samples. Currently only IDs S
Python 3
## Installation:
-Clone git to your machine:
+The easy way with homebrew ([Linux](http://linuxbrew.sh/) or [MacOS](https://brew.sh/)):
```
-git clone --recursive https://github.com/hcdenbakker/SalmID.git
+brew install brewsci/bio/salmid
```
+Big thanks to [Torsten Seemann](https://tseemann.github.io/) for including this in homebrew!
-Make SalmID executable:
-```
-cd SalmID
-```
+Alternatively download from GitHub:
+```bash
+git clone https://github.com/hcdenbakker/SalmID.git
```
-chmod +x SalmID.py
+
+build a wheel using [poetry](https://poetry.eustace.io/):
+
+```bash
+cd SalmID
+poetry build
```
+and install using `pip`
-Add the SalmID folder to your path
+```bash
+pip install dist/salmid*.whl
+```
To execute:
```
-SalmID.py
+SalmID.py -e .fastq.gz
```
This will perform a SalmID run on all fastq.gz files in the current directory.
```
@@ -38,4 +48,4 @@ This will perform a SalmID run on all files in directory 'directory_with_data' w
## Todo's and thoughts for future releases:
- Provide coverage estimates for genomes in sample based on kmer frequencies
-- Write code to use SalmID on long read (minion, pacbio) platforms
\ No newline at end of file
+- Write code to use SalmID on long read (minion, pacbio) platforms
=====================================
__pycache__/version.cpython-34.pyc deleted
=====================================
Binary files a/__pycache__/version.cpython-34.pyc and /dev/null differ
=====================================
pyproject.toml
=====================================
@@ -0,0 +1,19 @@
+[tool.poetry]
+name = "salmid"
+version = "0.1.23"
+description = "Rapid tool to check taxonomic ID of single isolate samples. Currently only IDs Salmonella species and subspecies, and some common contaminants (Listeria, Escherichia)."
+authors = ["Henk den Bakker <hcd82599 at uga.edu>"]
+license = "MIT"
+include = [ 'salmid/invA_mers_dict', 'salmid/rpoB_mers_dict' ]
+
+[tool.poetry.dependencies]
+python = "^3.5"
+
+[tool.poetry.dev-dependencies]
+
+[tool.poetry.scripts]
+'SalmID.py' = 'salmid.core:main'
+
+[build-system]
+requires = ["poetry>=0.12"]
+build-backend = "poetry.masonry.api"
=====================================
salmid/__init__.py
=====================================
=====================================
SalmID.py → salmid/core.py
=====================================
@@ -9,12 +9,13 @@ import sys
from argparse import ArgumentParser
try:
- from version import SalmID_version
-except:
+ from .version import SalmID_version
+except ImportError:
SalmID_version = "version unknown"
def reverse_complement(sequence):
+ """return the reverse complement of a nucleotide (including IUPAC ambiguous nuceotide codes)"""
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'M': 'K', 'R': 'Y', 'W': 'W',
'S': 'S', 'Y': 'R', 'K': 'M', 'V': 'B', 'H': 'D', 'D': 'H', 'B': 'V'}
return "".join(complement[base] for base in reversed(sequence))
@@ -24,63 +25,74 @@ def parse_args():
"Parse the input arguments, use '-h' for help."
parser = ArgumentParser(description='SalmID - rapid Kmer based Salmonella identifier from sequence data')
# inputs
- parser.add_argument('-v','--version', action='version', version='%(prog)s ' + SalmID_version)
+ parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SalmID_version)
parser.add_argument(
- '-i','--input_file', type=str, required=False, default= 'None', metavar = 'your_fastqgz',
+ '-i', '--input_file', type=str, required=False, default='None', metavar='your_fastqgz',
help='Single fastq.gz file input, include path to file if file is not in same directory ')
parser.add_argument(
- '-e', '--extension', type=str, required=False, default= '.fastq.gz', metavar = 'file_extension',
+ '-e', '--extension', type=str, required=False, default='.fastq.gz', metavar='file_extension',
help='File extension, if specified without "--input_dir", SalmID will attempt to ID all files\n' +
' with this extension in current directory, otherwise files in input directory')
parser.add_argument(
- '-d','--input_dir', type=str, required=False, default='.', metavar = 'directory',
+ '-d', '--input_dir', type=str, required=False, default='.', metavar='directory',
help='Directory which contains data for identification, when not specified files in current directory will be analyzed.')
parser.add_argument(
- '-r', '--report', type=str, required=False, default='percentage', metavar = 'percentage, coverage or taxonomy',
+ '-r', '--report', type=str, required=False, default='percentage', metavar='percentage, coverage or taxonomy',
help='Report either percentage ("percentage") of clade specific kmers recovered, average kmer-coverage ("cov"), or '
'taxonomy (taxonomic species ID, plus observed mean k-mer coverages and expected coverage).')
parser.add_argument(
- '-m', '--mode', type=str, required=False, default='quick', metavar = 'quick or thorough',
+ '-m', '--mode', type=str, required=False, default='quick', metavar='quick or thorough',
help='Quick [quick] or thorough [thorough] mode')
- if len(sys.argv)==1:
+ if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)
return parser.parse_args()
+
def get_av_read_length(file):
+ """Samples the first 100 reads from a fastq file and return the average read length."""
i = 1
n_reads = 0
total_length = 0
if file.endswith(".gz"):
- file_content=io.BufferedReader(gzip.open(file))
+ file_content = io.BufferedReader(gzip.open(file))
else:
- file_content=open(file,"r").readlines()
+ file_content = open(file, "r").readlines()
for line in file_content:
if i % 4 == 2:
total_length += len(line.strip())
- n_reads +=1
+ n_reads += 1
i += 1
if n_reads == 100:
break
- return total_length/100
+ return total_length / 100
def createKmerDict_reads(list_of_strings, kmer):
+ """Count occurence of K-mers in a list of strings
+
+ Args:
+ list_of_strings(list of str): nucleotide sequences as a list of strings
+ kmer(int): length of the K-mer to count
+
+ Returns:
+ dict: dictionary with kmers as keys, counts for each kmer as values"""
kmer_table = {}
for string in list_of_strings:
sequence = string.strip('\n')
- for i in range(len(sequence)-kmer+1):
- new_mer =sequence[i:i+kmer]
- new_mer_rc = reverse_complement(new_mer)
- if new_mer in kmer_table:
- kmer_table[new_mer.upper()] += 1
- else:
- kmer_table[new_mer.upper()] = 1
- if new_mer_rc in kmer_table:
- kmer_table[new_mer_rc.upper()] += 1
- else:
- kmer_table[new_mer_rc.upper()] = 1
+ if len(sequence) >= kmer:
+ for i in range(len(sequence) - kmer + 1):
+ new_mer = sequence[i:i + kmer]
+ new_mer_rc = reverse_complement(new_mer)
+ if new_mer in kmer_table:
+ kmer_table[new_mer.upper()] += 1
+ else:
+ kmer_table[new_mer.upper()] = 1
+ if new_mer_rc in kmer_table:
+ kmer_table[new_mer_rc.upper()] += 1
+ else:
+ kmer_table[new_mer_rc.upper()] = 1
return kmer_table
@@ -96,16 +108,16 @@ def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode):
reads_2 = []
total_reads = 0
if file.endswith(".gz"):
- file_content=io.BufferedReader(gzip.open(file))
+ file_content = io.BufferedReader(gzip.open(file))
else:
- file_content=open(file,"r").readlines()
+ file_content = open(file, "r").readlines()
for line in file_content:
start = int((len(line) - k) // 2)
if i % 4 == 2:
total_reads += 1
if file.endswith(".gz"):
s1 = line[start:k + start].decode()
- line=line.decode()
+ line = line.decode()
else:
s1 = line[start:k + start]
if s1 in kmerDict_1:
@@ -126,15 +138,16 @@ def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode):
else:
kmer_Dict1 = createKmerDict_reads(reads_1, k)
mers_1 = set([key for key in kmer_Dict1])
- mean_1 = sum([kmer_Dict1[key] for key in kmer_Dict1])/len(mers_1)
+ mean_1 = sum([kmer_Dict1[key] for key in kmer_Dict1]) / len(mers_1)
if len(reads_2) == 0:
kmer_Dict2 = {}
else:
kmer_Dict2 = createKmerDict_reads(reads_2, k)
mers_2 = set([key for key in kmer_Dict2])
- mean_2 = sum([kmer_Dict2[key] for key in kmer_Dict2])/len(mers_2)
+ mean_2 = sum([kmer_Dict2[key] for key in kmer_Dict2]) / len(mers_2)
return kmer_Dict1, kmer_Dict2, mean_1, mean_2, total_reads
+
def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers):
'''
Given an iterable (list, set, dictrionary) returns mean coverage for the kmers in iterable
@@ -145,10 +158,11 @@ def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers):
'''
if len(iterable) == 0:
return 0
- return sum([kmer_dict[value] for value in iterable])/len(clade_specific_kmers)
+ return sum([kmer_dict[value] for value in iterable]) / len(clade_specific_kmers)
+
def kmer_lists(query_fastq_gz, k,
- allmers,allmers_rpoB,
+ allmers, allmers_rpoB,
uniqmers_bongori,
uniqmers_I,
uniqmers_IIa,
@@ -165,8 +179,8 @@ def kmer_lists(query_fastq_gz, k,
uniqmers_Listeria_ss_rpoB,
uniqmers_Lmono_rpoB,
mode):
- dict_invA, dict_rpoB, mean_invA, mean_rpoB , total_reads = target_read_kmerizer_multi(query_fastq_gz, k, allmers,
- allmers_rpoB, mode)
+ dict_invA, dict_rpoB, mean_invA, mean_rpoB, total_reads = target_read_kmerizer_multi(query_fastq_gz, k, allmers,
+ allmers_rpoB, mode)
target_mers_invA = set([key for key in dict_invA])
target_mers_rpoB = set([key for key in dict_rpoB])
if target_mers_invA == 0:
@@ -211,17 +225,18 @@ def kmer_lists(query_fastq_gz, k,
S_enterica_rpoB_cov, bongori_invA_cov, I_invA_cov, IIa_invA_cov, IIb_invA_cov,
IIIa_invA_cov, IIIb_invA_cov, IV_invA_cov, VI_invA_cov, VII_invA_cov, VIII_invA_cov]
locus_scores = [p_Listeria_ss, p_Lmono, p_Escherichia, p_bongori_rpoB, p_Senterica, p_bongori,
- p_I, p_IIa,p_IIb, p_IIIa, p_IIIb, p_IV, p_VI, p_VII, p_VIII]
+ p_I, p_IIa, p_IIb, p_IIIa, p_IIIb, p_IV, p_VI, p_VII, p_VIII]
return locus_scores, coverages, total_reads
+
def report_taxon(locus_covs, average_read_length, number_of_reads):
- list_taxa = [ 'Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.',
+ list_taxa = [ 'Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.', # noqa: E201
'Salmonella bongori (rpoB)', 'Salmonella enterica (rpoB)',
'Salmonella bongori (invA)', 'S. enterica subsp. enterica (invA)',
- 'S. enterica subsp. salamae (invA: clade a)','S. enterica subsp. salamae (invA: clade b)',
+ 'S. enterica subsp. salamae (invA: clade a)', 'S. enterica subsp. salamae (invA: clade b)',
'S. enterica subsp. arizonae (invA)', 'S. enterica subsp. diarizonae (invA)',
'S. enterica subsp. houtenae (invA)', 'S. enterica subsp. indica (invA)',
- 'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)']
+ 'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)' ] # noqa: E202
if sum(locus_covs) < 1:
rpoB = ('No rpoB matches!', 0)
invA = ('No invA matches!', 0)
@@ -229,18 +244,18 @@ def report_taxon(locus_covs, average_read_length, number_of_reads):
else:
# given list of scores get taxon
if sum(locus_covs[0:5]) > 0:
- best_rpoB = max(range(len(locus_covs[1:5])), key=lambda x: locus_covs[1:5][x])+1
+ best_rpoB = max(range(len(locus_covs[1:5])), key=lambda x: locus_covs[1:5][x]) + 1
all_rpoB = max(range(len(locus_covs[0:5])), key=lambda x: locus_covs[0:5][x])
if (locus_covs[best_rpoB] != 0) & (all_rpoB == 0):
rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB])
- elif (all_rpoB == 0) & (round(sum(locus_covs[1:5]),1) < 1):
+ elif (all_rpoB == 0) & (round(sum(locus_covs[1:5]), 1) < 1):
rpoB = (list_taxa[0], locus_covs[0])
else:
rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB])
else:
rpoB = ('No rpoB matches!', 0)
if sum(locus_covs[5:]) > 0:
- best_invA = max(range(len(locus_covs[5:])), key=lambda x: locus_covs[5:][x])+5
+ best_invA = max(range(len(locus_covs[5:])), key=lambda x: locus_covs[5:][x]) + 5
invA = (list_taxa[best_invA], locus_covs[best_invA])
else:
invA = ('No invA matches!', 0)
@@ -250,7 +265,6 @@ def report_taxon(locus_covs, average_read_length, number_of_reads):
return rpoB, invA, (average_read_length * number_of_reads) / 5000000
-
def main():
ex_dir = os.path.dirname(os.path.realpath(__file__))
args = parse_args()
@@ -260,7 +274,7 @@ def main():
else:
extension = args.extension
inputdir = args.input_dir
- files = [inputdir + '/'+ f for f in os.listdir(inputdir) if f.endswith(extension)]
+ files = [inputdir + '/' + f for f in os.listdir(inputdir) if f.endswith(extension)]
report = args.report
mode = args.mode
f_invA = open(ex_dir + "/invA_mers_dict", "rb")
@@ -288,7 +302,7 @@ def main():
uniqmers_Escherichia_rpoB = sets_dict['uniqmers_Escherichia']
uniqmers_Listeria_ss_rpoB = sets_dict['uniqmers_Listeria_ss']
uniqmers_Lmono_rpoB = sets_dict['uniqmers_L_mono']
- #todo: run kmer_lists() once, create list of tuples containing data to be used fro different reports
+ # todo: run kmer_lists() once, create list of tuples containing data to be used fro different reports
if report == 'taxonomy':
print('file\trpoB\tinvA\texpected coverage')
for f in files:
@@ -317,55 +331,56 @@ def main():
'\t' + str(round(report[2], 1)))
else:
print(
- 'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica' +
- '(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' +
- ' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp.IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' +
+ 'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica' + # noqa: E122
+ '(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' + # noqa: E122
+ ' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp.IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' + # noqa: E122
'\tsubsp. II (clade VIII : invA)')
if report == 'percentage':
for f in files:
- locus_scores, coverages , reads = kmer_lists( f, 27,
- allmers,allmers_rpoB,
- uniqmers_bongori,
- uniqmers_I,
- uniqmers_IIa,
- uniqmers_IIb,
- uniqmers_IIIa,
- uniqmers_IIIb,
- uniqmers_IV,
- uniqmers_VI,
- uniqmers_VII,
- uniqmers_VIII,
- uniqmers_bongori_rpoB,
- uniqmers_S_enterica_rpoB,
- uniqmers_Escherichia_rpoB,
- uniqmers_Listeria_ss_rpoB,
- uniqmers_Lmono_rpoB,
- mode)
+ locus_scores, coverages, reads = kmer_lists(f, 27,
+ allmers, allmers_rpoB,
+ uniqmers_bongori,
+ uniqmers_I,
+ uniqmers_IIa,
+ uniqmers_IIb,
+ uniqmers_IIIa,
+ uniqmers_IIIb,
+ uniqmers_IV,
+ uniqmers_VI,
+ uniqmers_VII,
+ uniqmers_VIII,
+ uniqmers_bongori_rpoB,
+ uniqmers_S_enterica_rpoB,
+ uniqmers_Escherichia_rpoB,
+ uniqmers_Listeria_ss_rpoB,
+ uniqmers_Lmono_rpoB,
+ mode)
pretty_scores = [str(round(score)) for score in locus_scores]
- print(f.split('/')[-1] +'\t' + '\t'.join(pretty_scores))
+ print(f.split('/')[-1] + '\t' + '\t'.join(pretty_scores))
else:
for f in files:
- locus_scores, coverages , reads = kmer_lists( f, 27,
- allmers,allmers_rpoB,
- uniqmers_bongori,
- uniqmers_I,
- uniqmers_IIa,
- uniqmers_IIb,
- uniqmers_IIIa,
- uniqmers_IIIb,
- uniqmers_IV,
- uniqmers_VI,
- uniqmers_VII,
- uniqmers_VIII,
- uniqmers_bongori_rpoB,
- uniqmers_S_enterica_rpoB,
- uniqmers_Escherichia_rpoB,
- uniqmers_Listeria_ss_rpoB,
- uniqmers_Lmono_rpoB,
- mode)
+ locus_scores, coverages, reads = kmer_lists(f, 27,
+ allmers, allmers_rpoB,
+ uniqmers_bongori,
+ uniqmers_I,
+ uniqmers_IIa,
+ uniqmers_IIb,
+ uniqmers_IIIa,
+ uniqmers_IIIb,
+ uniqmers_IV,
+ uniqmers_VI,
+ uniqmers_VII,
+ uniqmers_VIII,
+ uniqmers_bongori_rpoB,
+ uniqmers_S_enterica_rpoB,
+ uniqmers_Escherichia_rpoB,
+ uniqmers_Listeria_ss_rpoB,
+ uniqmers_Lmono_rpoB,
+ mode)
pretty_covs = [str(round(cov, 1)) for cov in coverages]
print(f.split('/')[-1] + '\t' + '\t'.join(pretty_covs))
+
if __name__ == '__main__':
main()
=====================================
invA_mers_dict → salmid/invA_mers_dict
=====================================
=====================================
rpoB_mers_dict → salmid/rpoB_mers_dict
=====================================
=====================================
salmid/version.py
=====================================
@@ -0,0 +1 @@
+SalmID_version = '0.1.23'
=====================================
version.py deleted
=====================================
@@ -1 +0,0 @@
-SalmID_version = '0.12'
View it on GitLab: https://salsa.debian.org/med-team/salmid/compare/44e471c7d273851232206e6293dfcc14f70ac611...06240738a2c3f2a951322bac4f183c170800d329
--
View it on GitLab: https://salsa.debian.org/med-team/salmid/compare/44e471c7d273851232206e6293dfcc14f70ac611...06240738a2c3f2a951322bac4f183c170800d329
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20181221/093ee769/attachment-0001.html>
More information about the debian-med-commit
mailing list