[med-svn] [Git][med-team/salmid][upstream] 2 commits: New upstream version 0.122

Fri Dec 21 16:15:57 GMT 2018

Andreas Tille pushed to branch upstream at Debian Med / salmid


Commits:
f350e1fb by Andreas Tille at 2018-12-21T16:09:22Z
New upstream version 0.122
- - - - -
06240738 by Andreas Tille at 2018-12-21T16:14:47Z
New upstream version 0.1.23
- - - - -


9 changed files:

- README.md
- − __pycache__/version.cpython-34.pyc
- + pyproject.toml
- + salmid/__init__.py
- SalmID.py → salmid/core.py
- invA_mers_dict → salmid/invA_mers_dict
- rpoB_mers_dict → salmid/rpoB_mers_dict
- + salmid/version.py
- − version.py


Changes:

=====================================
README.md
=====================================
@@ -1,3 +1,5 @@
+[![DOI](https://zenodo.org/badge/97020646.svg)](https://zenodo.org/badge/latestdoi/97020646)
+
 # SalmID
 Rapid tool to check taxonomic ID of single isolate samples. Currently only IDs Salmonella species and subspecies, and some common contaminants (Listeria, Escherichia).
 
@@ -5,26 +7,34 @@ Rapid tool to check taxonomic ID of single isolate samples. Currently only IDs S
 Python 3
 
 ## Installation:
-Clone git to your machine:
+The easy way with homebrew ([Linux](http://linuxbrew.sh/) or [MacOS](https://brew.sh/)):
 ```
-git clone --recursive https://github.com/hcdenbakker/SalmID.git
+brew install brewsci/bio/salmid
 ```
+Big thanks to [Torsten Seemann](https://tseemann.github.io/) for including this in homebrew!
 
-Make SalmID executable:
-```
-cd SalmID
-```
+Alternatively download from GitHub:
 
+```bash
+git clone https://github.com/hcdenbakker/SalmID.git
 ```
-chmod +x SalmID.py
+
+build a wheel using [poetry](https://poetry.eustace.io/):
+
+```bash
+cd SalmID
+poetry build
 ```
 
+and install using `pip`
 
-Add the SalmID folder to your path
+```bash
+pip install dist/salmid*.whl
+```
 
 To execute:
 ```
-SalmID.py
+SalmID.py -e .fastq.gz
 ```
 This will perform a SalmID run on all fastq.gz files in the current directory.
 ```
@@ -38,4 +48,4 @@ This will perform a SalmID run on all files in directory 'directory_with_data' w
 
 ## Todo's and thoughts for future releases:
 - Provide coverage estimates for genomes in sample based on kmer frequencies
-- Write code to use SalmID on long read (minion, pacbio) platforms
\ No newline at end of file
+- Write code to use SalmID on long read (minion, pacbio) platforms


=====================================
__pycache__/version.cpython-34.pyc deleted
=====================================
Binary files a/__pycache__/version.cpython-34.pyc and /dev/null differ


=====================================
pyproject.toml
=====================================
@@ -0,0 +1,19 @@
+[tool.poetry]
+name = "salmid"
+version = "0.1.23"
+description = "Rapid tool to check taxonomic ID of single isolate samples. Currently only IDs Salmonella species and subspecies, and some common contaminants (Listeria, Escherichia)."
+authors = ["Henk den Bakker <hcd82599 at uga.edu>"]
+license = "MIT"
+include = [ 'salmid/invA_mers_dict', 'salmid/rpoB_mers_dict' ]
+
+[tool.poetry.dependencies]
+python = "^3.5"
+
+[tool.poetry.dev-dependencies]
+
+[tool.poetry.scripts]
+'SalmID.py' = 'salmid.core:main'
+
+[build-system]
+requires = ["poetry>=0.12"]
+build-backend = "poetry.masonry.api"


=====================================
salmid/__init__.py
=====================================


=====================================
SalmID.py → salmid/core.py
=====================================
@@ -9,12 +9,13 @@ import sys
 
 from argparse import ArgumentParser
 try:
-	from version import SalmID_version
-except:
+    from .version import SalmID_version
+except ImportError:
     SalmID_version = "version unknown"
 
 
 def reverse_complement(sequence):
+    """return the reverse complement of a nucleotide (including IUPAC ambiguous nuceotide codes)"""
     complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'M': 'K', 'R': 'Y', 'W': 'W',
                             'S': 'S', 'Y': 'R', 'K': 'M', 'V': 'B', 'H': 'D', 'D': 'H', 'B': 'V'}
     return "".join(complement[base] for base in reversed(sequence))
@@ -24,63 +25,74 @@ def parse_args():
     "Parse the input arguments, use '-h' for help."
     parser = ArgumentParser(description='SalmID - rapid Kmer based Salmonella identifier from sequence data')
     # inputs
-    parser.add_argument('-v','--version', action='version', version='%(prog)s ' + SalmID_version)
+    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SalmID_version)
     parser.add_argument(
-        '-i','--input_file', type=str, required=False, default= 'None', metavar = 'your_fastqgz',
+        '-i', '--input_file', type=str, required=False, default='None', metavar='your_fastqgz',
         help='Single fastq.gz file input, include path to file if file is not in same directory ')
     parser.add_argument(
-        '-e', '--extension', type=str, required=False, default= '.fastq.gz', metavar = 'file_extension',
+        '-e', '--extension', type=str, required=False, default='.fastq.gz', metavar='file_extension',
         help='File extension, if specified without "--input_dir", SalmID will attempt to ID all files\n' +
              ' with this extension in current directory, otherwise files in input directory')
 
     parser.add_argument(
-        '-d','--input_dir', type=str, required=False, default='.', metavar = 'directory',
+        '-d', '--input_dir', type=str, required=False, default='.', metavar='directory',
         help='Directory which contains data for identification, when not specified files in current directory will be analyzed.')
     parser.add_argument(
-        '-r', '--report', type=str, required=False, default='percentage', metavar = 'percentage, coverage or taxonomy',
+        '-r', '--report', type=str, required=False, default='percentage', metavar='percentage, coverage or taxonomy',
         help='Report either percentage ("percentage") of clade specific kmers recovered, average kmer-coverage ("cov"), or '
              'taxonomy (taxonomic species ID, plus observed mean k-mer coverages and expected coverage).')
     parser.add_argument(
-        '-m', '--mode', type=str, required=False, default='quick', metavar = 'quick or thorough',
+        '-m', '--mode', type=str, required=False, default='quick', metavar='quick or thorough',
         help='Quick [quick] or thorough [thorough] mode')
-    if len(sys.argv)==1:
+    if len(sys.argv) == 1:
         parser.print_help(sys.stderr)
         sys.exit(1)
     return parser.parse_args()
 
+
 def get_av_read_length(file):
+    """Samples the first 100 reads from a fastq file and return the average read length."""
     i = 1
     n_reads = 0
     total_length = 0
     if file.endswith(".gz"):
-        file_content=io.BufferedReader(gzip.open(file))
+        file_content = io.BufferedReader(gzip.open(file))
     else:
-        file_content=open(file,"r").readlines()
+        file_content = open(file, "r").readlines()
     for line in file_content:
         if i % 4 == 2:
             total_length += len(line.strip())
-            n_reads +=1
+            n_reads += 1
         i += 1
         if n_reads == 100:
             break
-    return total_length/100
+    return total_length / 100
 
 
 def createKmerDict_reads(list_of_strings, kmer):
+    """Count occurence of K-mers in a list of strings
+
+    Args:
+        list_of_strings(list of str): nucleotide sequences as a list of strings
+        kmer(int): length of the K-mer to count
+
+    Returns:
+        dict: dictionary with kmers as keys, counts for each kmer as values"""
     kmer_table = {}
     for string in list_of_strings:
         sequence = string.strip('\n')
-        for i in range(len(sequence)-kmer+1):
-            new_mer =sequence[i:i+kmer]
-            new_mer_rc = reverse_complement(new_mer)
-            if new_mer in kmer_table:
-                kmer_table[new_mer.upper()] += 1
-            else:
-                kmer_table[new_mer.upper()] = 1
-            if new_mer_rc in kmer_table:
-                kmer_table[new_mer_rc.upper()] += 1
-            else:
-                kmer_table[new_mer_rc.upper()] = 1
+        if len(sequence) >= kmer:
+            for i in range(len(sequence) - kmer + 1):
+                new_mer = sequence[i:i + kmer]
+                new_mer_rc = reverse_complement(new_mer)
+                if new_mer in kmer_table:
+                    kmer_table[new_mer.upper()] += 1
+                else:
+                    kmer_table[new_mer.upper()] = 1
+                if new_mer_rc in kmer_table:
+                    kmer_table[new_mer_rc.upper()] += 1
+                else:
+                    kmer_table[new_mer_rc.upper()] = 1
     return kmer_table
 
 
@@ -96,16 +108,16 @@ def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode):
     reads_2 = []
     total_reads = 0
     if file.endswith(".gz"):
-      file_content=io.BufferedReader(gzip.open(file))
+        file_content = io.BufferedReader(gzip.open(file))
     else:
-      file_content=open(file,"r").readlines()
+        file_content = open(file, "r").readlines()
     for line in file_content:
         start = int((len(line) - k) // 2)
         if i % 4 == 2:
             total_reads += 1
             if file.endswith(".gz"):
                 s1 = line[start:k + start].decode()
-                line=line.decode()
+                line = line.decode()
             else:
                 s1 = line[start:k + start]
             if s1 in kmerDict_1:
@@ -126,15 +138,16 @@ def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode):
     else:
         kmer_Dict1 = createKmerDict_reads(reads_1, k)
         mers_1 = set([key for key in kmer_Dict1])
-        mean_1 = sum([kmer_Dict1[key] for key in kmer_Dict1])/len(mers_1)
+        mean_1 = sum([kmer_Dict1[key] for key in kmer_Dict1]) / len(mers_1)
     if len(reads_2) == 0:
         kmer_Dict2 = {}
     else:
         kmer_Dict2 = createKmerDict_reads(reads_2, k)
         mers_2 = set([key for key in kmer_Dict2])
-        mean_2 = sum([kmer_Dict2[key] for key in kmer_Dict2])/len(mers_2)
+        mean_2 = sum([kmer_Dict2[key] for key in kmer_Dict2]) / len(mers_2)
     return kmer_Dict1, kmer_Dict2, mean_1, mean_2, total_reads
 
+
 def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers):
     '''
     Given an iterable (list, set, dictrionary) returns mean coverage for the kmers in iterable
@@ -145,10 +158,11 @@ def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers):
     '''
     if len(iterable) == 0:
         return 0
-    return sum([kmer_dict[value] for value in iterable])/len(clade_specific_kmers)
+    return sum([kmer_dict[value] for value in iterable]) / len(clade_specific_kmers)
+
 
 def kmer_lists(query_fastq_gz, k,
-               allmers,allmers_rpoB,
+               allmers, allmers_rpoB,
                uniqmers_bongori,
                uniqmers_I,
                uniqmers_IIa,
@@ -165,8 +179,8 @@ def kmer_lists(query_fastq_gz, k,
                uniqmers_Listeria_ss_rpoB,
                uniqmers_Lmono_rpoB,
                mode):
-    dict_invA, dict_rpoB, mean_invA, mean_rpoB , total_reads = target_read_kmerizer_multi(query_fastq_gz, k, allmers,
-                                                                            allmers_rpoB, mode)
+    dict_invA, dict_rpoB, mean_invA, mean_rpoB, total_reads = target_read_kmerizer_multi(query_fastq_gz, k, allmers,
+                                                                                         allmers_rpoB, mode)
     target_mers_invA = set([key for key in dict_invA])
     target_mers_rpoB = set([key for key in dict_rpoB])
     if target_mers_invA == 0:
@@ -211,17 +225,18 @@ def kmer_lists(query_fastq_gz, k,
                      S_enterica_rpoB_cov, bongori_invA_cov, I_invA_cov, IIa_invA_cov, IIb_invA_cov,
                      IIIa_invA_cov, IIIb_invA_cov, IV_invA_cov, VI_invA_cov, VII_invA_cov, VIII_invA_cov]
         locus_scores = [p_Listeria_ss, p_Lmono, p_Escherichia, p_bongori_rpoB, p_Senterica, p_bongori,
-                        p_I, p_IIa,p_IIb, p_IIIa, p_IIIb, p_IV, p_VI, p_VII, p_VIII]
+                        p_I, p_IIa, p_IIb, p_IIIa, p_IIIb, p_IV, p_VI, p_VII, p_VIII]
     return locus_scores, coverages, total_reads
 
+
 def report_taxon(locus_covs, average_read_length, number_of_reads):
-    list_taxa = [ 'Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.',
+    list_taxa = [ 'Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.',  # noqa: E201
                   'Salmonella bongori (rpoB)', 'Salmonella enterica (rpoB)',
                   'Salmonella bongori (invA)', 'S. enterica subsp. enterica (invA)',
-                  'S. enterica subsp. salamae (invA: clade a)','S. enterica subsp. salamae (invA: clade b)',
+                  'S. enterica subsp. salamae (invA: clade a)', 'S. enterica subsp. salamae (invA: clade b)',
                   'S. enterica subsp. arizonae (invA)', 'S. enterica subsp. diarizonae (invA)',
                   'S. enterica subsp. houtenae (invA)', 'S. enterica subsp. indica (invA)',
-                  'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)']
+                  'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)' ]  # noqa: E202
     if sum(locus_covs) < 1:
         rpoB = ('No rpoB matches!', 0)
         invA = ('No invA matches!', 0)
@@ -229,18 +244,18 @@ def report_taxon(locus_covs, average_read_length, number_of_reads):
     else:
         # given list of scores get taxon
         if sum(locus_covs[0:5]) > 0:
-            best_rpoB = max(range(len(locus_covs[1:5])), key=lambda x: locus_covs[1:5][x])+1
+            best_rpoB = max(range(len(locus_covs[1:5])), key=lambda x: locus_covs[1:5][x]) + 1
             all_rpoB = max(range(len(locus_covs[0:5])), key=lambda x: locus_covs[0:5][x])
             if (locus_covs[best_rpoB] != 0) & (all_rpoB == 0):
                 rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB])
-            elif (all_rpoB == 0) &  (round(sum(locus_covs[1:5]),1) < 1):
+            elif (all_rpoB == 0) & (round(sum(locus_covs[1:5]), 1) < 1):
                 rpoB = (list_taxa[0], locus_covs[0])
             else:
                 rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB])
         else:
             rpoB = ('No rpoB matches!', 0)
         if sum(locus_covs[5:]) > 0:
-            best_invA = max(range(len(locus_covs[5:])), key=lambda x: locus_covs[5:][x])+5
+            best_invA = max(range(len(locus_covs[5:])), key=lambda x: locus_covs[5:][x]) + 5
             invA = (list_taxa[best_invA], locus_covs[best_invA])
         else:
             invA = ('No invA matches!', 0)
@@ -250,7 +265,6 @@ def report_taxon(locus_covs, average_read_length, number_of_reads):
             return rpoB, invA, (average_read_length * number_of_reads) / 5000000
 
 
-
 def main():
     ex_dir = os.path.dirname(os.path.realpath(__file__))
     args = parse_args()
@@ -260,7 +274,7 @@ def main():
     else:
         extension = args.extension
         inputdir = args.input_dir
-        files = [inputdir + '/'+ f for f in os.listdir(inputdir) if f.endswith(extension)]
+        files = [inputdir + '/' + f for f in os.listdir(inputdir) if f.endswith(extension)]
     report = args.report
     mode = args.mode
     f_invA = open(ex_dir + "/invA_mers_dict", "rb")
@@ -288,7 +302,7 @@ def main():
     uniqmers_Escherichia_rpoB = sets_dict['uniqmers_Escherichia']
     uniqmers_Listeria_ss_rpoB = sets_dict['uniqmers_Listeria_ss']
     uniqmers_Lmono_rpoB = sets_dict['uniqmers_L_mono']
-    #todo: run kmer_lists() once, create list of tuples containing data to be used fro different reports
+    # todo: run kmer_lists() once, create list of tuples containing data to be used fro different reports
     if report == 'taxonomy':
         print('file\trpoB\tinvA\texpected coverage')
         for f in files:
@@ -317,55 +331,56 @@ def main():
                   '\t' + str(round(report[2], 1)))
     else:
         print(
-        'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica' +
-        '(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' +
-        ' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp.IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' +
+        'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica' +  # noqa: E122
+        '(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' +  # noqa: E122
+        ' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp.IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' +  # noqa: E122
         '\tsubsp. II (clade VIII : invA)')
         if report == 'percentage':
             for f in files:
-                locus_scores, coverages , reads = kmer_lists( f, 27,
-                           allmers,allmers_rpoB,
-                           uniqmers_bongori,
-                           uniqmers_I,
-                           uniqmers_IIa,
-                           uniqmers_IIb,
-                           uniqmers_IIIa,
-                           uniqmers_IIIb,
-                           uniqmers_IV,
-                           uniqmers_VI,
-                           uniqmers_VII,
-                           uniqmers_VIII,
-                           uniqmers_bongori_rpoB,
-                           uniqmers_S_enterica_rpoB,
-                           uniqmers_Escherichia_rpoB,
-                           uniqmers_Listeria_ss_rpoB,
-                           uniqmers_Lmono_rpoB,
-                           mode)
+                locus_scores, coverages, reads = kmer_lists(f, 27,
+                                                            allmers, allmers_rpoB,
+                                                            uniqmers_bongori,
+                                                            uniqmers_I,
+                                                            uniqmers_IIa,
+                                                            uniqmers_IIb,
+                                                            uniqmers_IIIa,
+                                                            uniqmers_IIIb,
+                                                            uniqmers_IV,
+                                                            uniqmers_VI,
+                                                            uniqmers_VII,
+                                                            uniqmers_VIII,
+                                                            uniqmers_bongori_rpoB,
+                                                            uniqmers_S_enterica_rpoB,
+                                                            uniqmers_Escherichia_rpoB,
+                                                            uniqmers_Listeria_ss_rpoB,
+                                                            uniqmers_Lmono_rpoB,
+                                                            mode)
                 pretty_scores = [str(round(score)) for score in locus_scores]
-                print(f.split('/')[-1] +'\t' + '\t'.join(pretty_scores))
+                print(f.split('/')[-1] + '\t' + '\t'.join(pretty_scores))
         else:
             for f in files:
-                locus_scores, coverages , reads = kmer_lists( f, 27,
-                           allmers,allmers_rpoB,
-                           uniqmers_bongori,
-                           uniqmers_I,
-                           uniqmers_IIa,
-                           uniqmers_IIb,
-                           uniqmers_IIIa,
-                           uniqmers_IIIb,
-                           uniqmers_IV,
-                           uniqmers_VI,
-                           uniqmers_VII,
-                           uniqmers_VIII,
-                           uniqmers_bongori_rpoB,
-                           uniqmers_S_enterica_rpoB,
-                           uniqmers_Escherichia_rpoB,
-                           uniqmers_Listeria_ss_rpoB,
-                           uniqmers_Lmono_rpoB,
-                           mode)
+                locus_scores, coverages, reads = kmer_lists(f, 27,
+                                                            allmers, allmers_rpoB,
+                                                            uniqmers_bongori,
+                                                            uniqmers_I,
+                                                            uniqmers_IIa,
+                                                            uniqmers_IIb,
+                                                            uniqmers_IIIa,
+                                                            uniqmers_IIIb,
+                                                            uniqmers_IV,
+                                                            uniqmers_VI,
+                                                            uniqmers_VII,
+                                                            uniqmers_VIII,
+                                                            uniqmers_bongori_rpoB,
+                                                            uniqmers_S_enterica_rpoB,
+                                                            uniqmers_Escherichia_rpoB,
+                                                            uniqmers_Listeria_ss_rpoB,
+                                                            uniqmers_Lmono_rpoB,
+                                                            mode)
                 pretty_covs = [str(round(cov, 1)) for cov in coverages]
                 print(f.split('/')[-1] + '\t' + '\t'.join(pretty_covs))
 
+
 if __name__ == '__main__':
     main()
 


=====================================
invA_mers_dict → salmid/invA_mers_dict
=====================================


=====================================
rpoB_mers_dict → salmid/rpoB_mers_dict
=====================================


=====================================
salmid/version.py
=====================================
@@ -0,0 +1 @@
+SalmID_version = '0.1.23'


=====================================
version.py deleted
=====================================
@@ -1 +0,0 @@
-SalmID_version = '0.12'



View it on GitLab: https://salsa.debian.org/med-team/salmid/compare/44e471c7d273851232206e6293dfcc14f70ac611...06240738a2c3f2a951322bac4f183c170800d329

-- 
View it on GitLab: https://salsa.debian.org/med-team/salmid/compare/44e471c7d273851232206e6293dfcc14f70ac611...06240738a2c3f2a951322bac4f183c170800d329
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20181221/093ee769/attachment-0001.html>