[med-svn] [Git][med-team/cat-bat][master] 7 commits: routine-update: New upstream version

Nilesh Patra gitlab at salsa.debian.org
Wed Dec 16 14:05:59 GMT 2020



Nilesh Patra pushed to branch master at Debian Med / cat-bat


Commits:
301fdafd by Nilesh Patra at 2020-12-16T19:30:43+05:30
routine-update: New upstream version

- - - - -
ad70b73a by Nilesh Patra at 2020-12-16T19:30:44+05:30
New upstream version 5.2
- - - - -
75178cd9 by Nilesh Patra at 2020-12-16T19:30:45+05:30
Update upstream source from tag 'upstream/5.2'

Update to upstream version '5.2'
with Debian dir 9323283c25059db0563395852f42f156bc70f8f9
- - - - -
4c49ab28 by Nilesh Patra at 2020-12-16T19:30:45+05:30
routine-update: Standards-Version: 4.5.1

- - - - -
632e7d39 by Nilesh Patra at 2020-12-16T19:31:27+05:30
Fix fuzz

- - - - -
db35db7d by Nilesh Patra at 2020-12-16T19:35:12+05:30
Add myself to uploaders

- - - - -
db36fc38 by Nilesh Patra at 2020-12-16T19:35:16+05:30
Update changelog

- - - - -


15 changed files:

- CAT_pack/about.py
- CAT_pack/add_names.py
- CAT_pack/bins.py
- CAT_pack/check.py
- CAT_pack/contigs.py
- CAT_pack/prepare.py
- CAT_pack/shared.py
- CAT_pack/single_bin.py
- CAT_pack/summarise.py
- CAT_pack/tax.py
- CHANGELOG.md
- README.md
- debian/changelog
- debian/control
- debian/patches/fix_interpreter.patch


Changes:

=====================================
CAT_pack/about.py
=====================================
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
 
 __author__ = 'F. A. Bastiaan von Meijenfeldt'
-__version__ = '5.1.2'
-__date__ = '15 July, 2020'
+__version__ = '5.2'
+__date__ = '20 November, 2020'


=====================================
CAT_pack/add_names.py
=====================================
@@ -129,9 +129,9 @@ def run():
             if line.startswith('#'):
                 line = line.rstrip().split('\t')
 
-                try:
+                if 'lineage' in line:
                     lineage_index = line.index('lineage')
-                except:
+                else:
                     message = ('{0} is not a supported classification file.'
                             ''.format(input_file))
                     shared.give_user_feedback(


=====================================
CAT_pack/bins.py
=====================================
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
 
 import argparse
-import datetime
 import decimal
 import multiprocessing
 import os
@@ -141,7 +140,7 @@ def parse_arguments():
             dest='no_stars',
             required=False,
             action='store_true',
-            help='Suppress marking of suggestive classifications.')
+            help='Suppress marking of suggestive taxonomic assignments.')
     optional.add_argument(
             '--force',
             dest='force',
@@ -314,7 +313,7 @@ def import_bins(bin_folder, bin_suffix, log_file, quiet):
     if len(bin2contigs) == 1:
         message = '1 bin found!'
     else:
-        message = '{0} bins found!'.format(len(bin2contigs))
+        message = '{0:,d} bins found!'.format(len(bin2contigs))
     shared.give_user_feedback(message, log_file, quiet)
 
     return (bin2contigs, contig_names)
@@ -570,7 +569,8 @@ def run():
 
     with open(args.bin2classification_output_file, 'w') as outf1, open(args.ORF2LCA_output_file, 'w') as outf2:
         outf1.write('# bin\tclassification\treason\tlineage\tlineage scores\n')
-        outf2.write('# ORF\tbin\tlineage\tbit-score\n')
+
+        outf2.write('# ORF\tbin\tnumber of hits\tlineage\ttop bit-score\n')
         
         for bin_ in sorted(bin2contigs):
             LCAs_ORFs = []
@@ -586,6 +586,8 @@ def run():
 
                         continue
 
+                    n_hits = len(ORF2hits[ORF])
+
                     (taxid,
                             top_bitscore) = tax.find_LCA_for_ORF(
                                     ORF2hits[ORF],
@@ -593,8 +595,8 @@ def run():
                                     taxid2parent)
                      
                     if taxid.startswith('no taxid found'):
-                        outf2.write('{0}\t{1}\t{2}\t{3}\n'.format(
-                            ORF, bin_, taxid, top_bitscore))
+                        outf2.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
+                            ORF, bin_, n_hits, taxid, top_bitscore))
                     else:
                         lineage = tax.find_lineage(taxid, taxid2parent)
 
@@ -602,14 +604,18 @@ def run():
                             lineage = tax.star_lineage(
                                     lineage, taxids_with_multiple_offspring)
 
-                        outf2.write('{0}\t{1}\t{2}\t{3}\n'.format(
-                            ORF, bin_, ';'.join(lineage[::-1]), top_bitscore))
+                        outf2.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
+                            ORF,
+                            bin_,
+                            n_hits,
+                            ';'.join(lineage[::-1]),
+                            top_bitscore))
                                        
                     LCAs_ORFs.append((taxid, top_bitscore),)
                     
             if len(LCAs_ORFs) == 0:
-                outf1.write('{0}\tunclassified\tno hits to database\n'.format(
-                    bin_))
+                outf1.write('{0}\tno taxid assigned\tno hits to database\n'
+                        ''.format(bin_))
 
                 continue
 
@@ -619,14 +625,14 @@ def run():
                             LCAs_ORFs, taxid2parent, args.f)
 
             if lineages == 'no ORFs with taxids found.':
-                outf1.write('{0}\tunclassified\t'
+                outf1.write('{0}\tno taxid assigned\t'
                         'hits not found in taxonomy files\n'.format(bin_))
 
                 continue
 
             if lineages == 'no lineage whitelisted.':
                 outf1.write(
-                        '{0}\tunclassified\t'
+                        '{0}\tno taxid assigned\t'
                         'no lineage reached minimum bit-score support\n'
                         ''.format(bin_))
 
@@ -650,7 +656,7 @@ def run():
                     # There is only one classification.
                     outf1.write(
                             '{0}\t'
-                            'classified\t'
+                            'taxid assigned\t'
                             'based on {1}/{2} ORFs\t'
                             '{3}\t'
                             '{4}\n'.format(
@@ -663,7 +669,7 @@ def run():
                     # There are multiple classifications.
                     outf1.write(
                             '{0}\t'
-                            'classified ({1}/{2})\t'
+                            'taxid assigned ({1}/{2})\t'
                             'based on {3}/{4} ORFs\t'
                             '{5}\t'
                             '{6}\n'.format(
@@ -676,10 +682,8 @@ def run():
                                 ';'.join(scores[::-1])))
                                    
     message = ('\n-----------------\n\n'
-               '[{0}] BAT is done! {1}/{2} bins classified.'.format(
-                   datetime.datetime.now(),
-                   n_classified_bins,
-                   len(bin2contigs)))
+            '{0} BAT is done! {1:,d}/{2:,d} bins have taxonomy assigned.'
+            ''.format(shared.timestamp(), n_classified_bins, len(bin2contigs)))
     shared.give_user_feedback(message, args.log_file, args.quiet,
             show_time=False)
   


=====================================
CAT_pack/check.py
=====================================
@@ -1,5 +1,6 @@
 #!/usr/bin/env/ python3
 
+import hashlib
 import os
 import subprocess
 import sys
@@ -7,6 +8,38 @@ import sys
 import shared
 
 
+def check_md5_gz(gz_file, md5_file, log_file, quiet):
+    message = 'Checking file integrity via MD5 checksum.'
+    shared.give_user_feedback(message, log_file, quiet)
+
+    with open(md5_file, 'r') as f:
+        md5_exp = f.read().split(' ')[0]
+
+    if md5_exp == '':
+        message = ('WARNING: no MD5 found in {0}. Integrity of {1} can not be '
+                'established.'.format(md5_file, gz_file))
+        shared.give_user_feedback(message, log_file, quiet)
+    else:
+        md5 = hashlib.md5()
+
+        block_size = 4096
+        with open(gz_file, 'rb') as f:
+            for chunk in iter(lambda: f.read(block_size), b''):
+                md5.update(chunk)
+        md5 = md5.hexdigest()
+
+        if md5 != md5_exp:
+            message = 'MD5 of {0} does not check out.'.format(gz_file)
+            shared.give_user_feedback(message, log_file, quiet, error=True)
+
+            sys.exit(1)
+        else:
+            message = 'MD5 of {0} checks out.'.format(gz_file)
+            shared.give_user_feedback(message, log_file, quiet)
+
+    return
+
+
 def check_memory(Gb):
     total_memory = None
     error = False
@@ -138,7 +171,7 @@ def check_bin_folder(bin_folder, bin_suffix, log_file, quiet):
                 'WARNING: a single bin is found. You can run BAT in single '
                 'bin mode, with \'CAT bin\' as opposed to \'CAT bins\' for a '
                 'set of bins. Both modes will give the same results, but you '
-                'might find single mode more convenient for your workflow!')
+                'might find single mode more convenient for your workflow.')
         shared.give_user_feedback(message, log_file, quiet)
 
     return error


=====================================
CAT_pack/contigs.py
=====================================
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
 
 import argparse
-import datetime
 import decimal
 import multiprocessing
 import sys
@@ -130,7 +129,7 @@ def parse_arguments():
             dest='no_stars',
             required=False,
             action='store_true',
-            help='Suppress marking of suggestive classifications.')
+            help='Suppress marking of suggestive taxonomic assignments.')
     optional.add_argument(
             '--force',
             dest='force',
@@ -465,11 +464,12 @@ def run():
     with open(args.contig2classification_output_file, 'w') as outf1, open(args.ORF2LCA_output_file, 'w') as outf2:
         outf1.write(
                 '# contig\tclassification\treason\tlineage\tlineage scores\n')
-        outf2.write('# ORF\tlineage\tbit-score\n')
+
+        outf2.write('# ORF\tnumber of hits\tlineage\ttop bit-score\n')
         
         for contig in sorted(contig_names):
             if contig not in contig2ORFs:
-                outf1.write('{0}\tunclassified\tno ORFs found\n'.format(
+                outf1.write('{0}\tno taxid assigned\tno ORFs found\n'.format(
                     contig))
                 
                 continue
@@ -482,14 +482,16 @@ def run():
                         ORF))
 
                     continue
+
+                n_hits = len(ORF2hits[ORF])
                 
                 (taxid,
                         top_bitscore) = tax.find_LCA_for_ORF(
                                 ORF2hits[ORF], fastaid2LCAtaxid, taxid2parent)
                  
                 if taxid.startswith('no taxid found'):
-                    outf2.write('{0}\t{1}\t{2}\n'.format(
-                        ORF, taxid, top_bitscore))
+                    outf2.write('{0}\t{1}\t{2}\t{3}\n'.format(
+                        ORF, n_hits, taxid, top_bitscore))
                 else:
                     lineage = tax.find_lineage(taxid, taxid2parent)
 
@@ -497,14 +499,14 @@ def run():
                         lineage = tax.star_lineage(
                                 lineage, taxids_with_multiple_offspring)
                     
-                    outf2.write('{0}\t{1}\t{2}\n'.format(
-                        ORF, ';'.join(lineage[::-1]), top_bitscore))
+                    outf2.write('{0}\t{1}\t{2}\t{3}\n'.format(
+                        ORF, n_hits, ';'.join(lineage[::-1]), top_bitscore))
                                    
                 LCAs_ORFs.append((taxid, top_bitscore),)
                 
             if len(LCAs_ORFs) == 0:
-                outf1.write('{0}\tunclassified\tno hits to database\n'.format(
-                    contig))
+                outf1.write('{0}\tno taxid assigned\t'
+                        'no hits to database\n'.format(contig))
 
                 continue
 
@@ -514,14 +516,14 @@ def run():
                             LCAs_ORFs, taxid2parent, args.f)
              
             if lineages == 'no ORFs with taxids found.':
-                outf1.write('{0}\tunclassified\t'
+                outf1.write('{0}\tno taxid assigned\t'
                         'hits not found in taxonomy files\n'.format(contig))
 
                 continue
             
             if lineages == 'no lineage whitelisted.':
                 outf1.write(
-                        '{0}\tunclassified\t'
+                        '{0}\tno taxid assigned\t'
                         'no lineage reached minimum bit-score support\n'
                         ''.format(contig))
 
@@ -540,8 +542,8 @@ def run():
                 if len(lineages) == 1:
                     # There is only one classification.
                     outf1.write(
-                            '{0}'
-                            '\tclassified\t'
+                            '{0}\t'
+                            'taxid assigned\t'
                             'based on {1}/{2} ORFs\t'
                             '{3}\t'
                             '{4}\n'.format(
@@ -554,7 +556,7 @@ def run():
                     # There are multiple classifications.
                     outf1.write(
                             '{0}\t'
-                            'classified ({1}/{2})\t'
+                            'taxid assigned ({1}/{2})\t'
                             'based on {3}/{4} ORFs\t'
                             '{5}\t'
                             '{6}\n'.format(
@@ -566,10 +568,10 @@ def run():
                                 ';'.join(lineage[::-1]),
                                 ';'.join(scores[::-1])))
 
-    message = (
-            '\n-----------------\n\n'
-            '[{0}] CAT is done! {1}/{2} contigs classified.'.format(
-                datetime.datetime.now(),
+    message = ('\n-----------------\n\n'
+            '{0} CAT is done! {1:,d}/{2:,d} contigs have taxonomy assigned.'
+            ''.format(
+                shared.timestamp(),
                 n_classified_contigs,
                 len(contig_names)))
     shared.give_user_feedback(message, args.log_file, args.quiet,


=====================================
CAT_pack/prepare.py
=====================================
@@ -17,7 +17,7 @@ import tax
 
 
 def parse_arguments():
-    date = str(datetime.datetime.now().date())
+    date = datetime.datetime.now().strftime('%Y-%m-%d')
     
     parser = argparse.ArgumentParser(
             prog='CAT prepare',
@@ -125,28 +125,61 @@ def parse_arguments():
 
     # Add extra arguments.
     setattr(args, 'date', date)
-    setattr(args, 'min_mem', 150)
+    setattr(args, 'min_mem', 200)
     shared.expand_arguments(args)
 
     return (args)
 
 
+def memory_bottleneck(args):
+    (total_memory, error) = check.check_memory(args.min_mem)
+    if error:
+        message = (
+                'at least {0:,d}GB of memory is needed for the database '
+                'construction. {1:,d}GB is found on your system. You can try '
+                'to find a machine with more memory, or download '
+                'preconstructed database files from '
+                'tbb.bio.uu.nl/bastiaan/CAT_prepare/.'.format(
+                    args.min_mem, total_memory))
+        shared.give_user_feedback(message, args.log_file, args.quiet,
+                error=True)
+
+        sys.exit(1)
+
+    return
+
+
 def download_taxonomy_files(taxonomy_folder, date, log_file, quiet):
+    url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/'
+    message = ('Downloading and extracting taxonomy files from {0} to '
+            'taxonomy folder.'.format(url))
+    shared.give_user_feedback(message, log_file, quiet)
+
     url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
     tmp_taxonomy_file = '{0}{1}.taxdump.tar.gz'.format(taxonomy_folder, date)
-
-    message = ('Downloading and extracting taxonomy files from {0} to {1}.'
-            ''.format(url, taxonomy_folder))
-    shared.give_user_feedback(message, log_file, quiet)
-    
     try:
         urllib.request.urlretrieve(url, tmp_taxonomy_file)
     except:
-        message = 'download of taxonomy files failed.'
+        message = 'download of {0} failed.'.format(url)
         shared.give_user_feedback(message, log_file, quiet, error=True)
 
         sys.exit(1)
-    
+
+    url = '{0}.md5'.format(url)
+    md5_file = '{0}{1}.taxdump.tar.gz.md5'.format(taxonomy_folder, date)
+    try:
+        urllib.request.urlretrieve(url, md5_file)
+    except:
+        message = 'download of {0} failed.'.format(url)
+        shared.give_user_feedback(message, log_file, quiet, error=True)
+
+        sys.exit(1)
+
+    message = 'Download complete.'
+    shared.give_user_feedback(message, log_file, quiet)
+
+    check.check_md5_gz(tmp_taxonomy_file, md5_file, log_file, quiet)
+
     try:
         with tarfile.open(tmp_taxonomy_file) as tar:
             tar.extractall(taxonomy_folder)
@@ -155,8 +188,8 @@ def download_taxonomy_files(taxonomy_folder, date, log_file, quiet):
         shared.give_user_feedback(message, log_file, quiet, error=True)
 
         sys.exit(1)
-        
-    message = 'Download complete!'
+
+    message = 'Extracting complete.'
     shared.give_user_feedback(message, log_file, quiet)
 
     return
@@ -164,44 +197,69 @@ def download_taxonomy_files(taxonomy_folder, date, log_file, quiet):
     
 def download_prot_accession2taxid_file(
         prot_accession2taxid_file, date, log_file, quiet):
-    url = ('ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/'
-            'prot.accession2taxid.gz')
-
-    message = ('Downloading mapping file from {0} to {1}.'.format(
-        url, prot_accession2taxid_file))
+    url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/'
+    message = 'Downloading mapping file from {0} to taxonomy folder.'.format(
+            url)
     shared.give_user_feedback(message, log_file, quiet)
-    
+
+    url = ('ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/'
+            'prot.accession2taxid.FULL.gz')
     try:
         urllib.request.urlretrieve(url, prot_accession2taxid_file)
     except:
-        message = 'download of prot.accession2taxid.gz failed.'
+        message = 'download of {0} failed.'.format(url)
         shared.give_user_feedback(message, log_file, quiet, error=True)
 
         sys.exit(1)
 
-    message = 'Download complete!'
+    url = '{0}.md5'.format(url)
+    md5_file = '{0}.md5'.format(prot_accession2taxid_file)
+    try:
+        urllib.request.urlretrieve(url, md5_file)
+    except:
+        message = 'download of {0} failed.'.format(url)
+        shared.give_user_feedback(message, log_file, quiet, error=True)
+
+        sys.exit(1)
+
+    message = 'Download complete.'
     shared.give_user_feedback(message, log_file, quiet)
-    
+
+    check.check_md5_gz(prot_accession2taxid_file, md5_file, log_file, quiet)
+
     return
 
 
 def download_nr(nr_file, log_file, quiet):
-    url = 'ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz'
-    
-    message = 'Downloading nr database from {0} to {1}.'.format(url, nr_file)
+    url = 'ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/'
+    message = 'Downloading nr database from {0} to database folder.'.format(
+            url)
     shared.give_user_feedback(message, log_file, quiet)
-    
+
+    url = 'ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz'
     try:
         urllib.request.urlretrieve(url, nr_file)
     except:
-        message = 'download of nr database failed.'
+        message = 'download of {0} failed.'.format(url)
         shared.give_user_feedback(message, log_file, quiet, error=True)
 
         sys.exit(1)
 
-    message = 'Download complete!'
+    url = '{0}.md5'.format(url)
+    md5_file = '{0}.md5'.format(nr_file)
+    try:
+        urllib.request.urlretrieve(url, md5_file)
+    except:
+        message = 'download of {0} failed.'.format(url)
+        shared.give_user_feedback(message, log_file, quiet, error=True)
+
+        sys.exit(1)
+
+    message = 'Download complete.'
     shared.give_user_feedback(message, log_file, quiet)
 
+    check.check_md5_gz(nr_file, md5_file, log_file, quiet)
+
     return
 
 
@@ -213,10 +271,8 @@ def make_diamond_database(
         log_file,
         quiet,
         verbose):
-    message = (
-            'Constructing DIAMOND database {0}.dmnd from {1} using {2} cores. '
-            'Please be patient...'.format(
-                diamond_database_prefix, nr_file, nproc))
+    message = ('Constructing DIAMOND database {0}.dmnd from {1} using {2} '
+            'cores.'.format(diamond_database_prefix, nr_file, nproc))
     shared.give_user_feedback(message, log_file, quiet)
 
     command = [
@@ -236,25 +292,60 @@ def make_diamond_database(
 
         sys.exit(1)
         
-    message = 'DIAMOND database constructed!'
+    message = 'DIAMOND database constructed.'
     shared.give_user_feedback(message, log_file, quiet)
 
     return
-    
-    
-def import_prot_accession2taxid(prot_accession2taxid_file, log_file, quiet):
-    message = 'Loading {0} into memory. Please be patient...'.format(
-            prot_accession2taxid_file)
+
+
+def import_headers_nr(nr_file, log_file, quiet):
+    message = 'Loading file {0}.'.format(nr_file)
+    shared.give_user_feedback(message, log_file, quiet)
+
+    fastaid2prot_accessions = {}
+    prot_accessions_whitelist = set()
+
+    with gzip.open(nr_file, 'rb') as f1:
+        for line in f1:
+            line = line.decode('utf-8')
+
+            if not line.startswith('>'):
+                continue
+
+            line = line.lstrip('>').split('\x01')
+
+            prot_accessions = [i.split(' ')[0] for i in line]
+            fastaid = prot_accessions[0]
+
+            fastaid2prot_accessions[fastaid] = prot_accessions
+            prot_accessions_whitelist.update(prot_accessions)
+
+    return (fastaid2prot_accessions, prot_accessions_whitelist)
+
+
+def import_prot_accession2taxid(
+        prot_accession2taxid_file, prot_accessions_whitelist, log_file, quiet):
+    message = 'Loading file {0}.'.format(prot_accession2taxid_file)
     shared.give_user_feedback(message, log_file, quiet)
     
     prot_accession2taxid = {}
 
     with gzip.open(prot_accession2taxid_file, 'rb') as f1:
-        for line in f1:
+        for n, line in enumerate(f1):
             line = line.decode('utf-8')
-            line = line.split('\t')
 
-            prot_accession2taxid[line[1]] = line[2]
+            line = line.rstrip().split('\t')
+
+            if n == 0:
+                index_1 = line.index('accession.version')
+                index_2 = line.index('taxid')
+
+                continue
+
+            prot_accession = line[index_1]
+
+            if prot_accession in prot_accessions_whitelist:
+                prot_accession2taxid[prot_accession] = line[index_2]
 
     return prot_accession2taxid
 
@@ -264,33 +355,28 @@ def make_fastaid2LCAtaxid_file(
         fastaid2LCAtaxid_file,
         nr_file,
         prot_accession2taxid_file,
+        taxid2parent,
         log_file,
         quiet):
+    (fastaid2prot_accessions,
+            prot_accessions_whitelist) = import_headers_nr(
+                    nr_file, log_file, quiet)
     prot_accession2taxid = import_prot_accession2taxid(
-            prot_accession2taxid_file, log_file, quiet)
-    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)
+            prot_accession2taxid_file, prot_accessions_whitelist,
+            log_file, quiet)
 
-    message = ('Finding LCA of all protein accession numbers in fasta headers '
-            'of {0}. Please be patient...'.format(nr_file))
+    message = 'Finding LCA of all protein accession numbers in fasta headers.'
     shared.give_user_feedback(message, log_file, quiet)
     
+    no_taxid = 0
     corrected = 0
     total = 0
-    with gzip.open(nr_file, 'rb') as f1, open(fastaid2LCAtaxid_file, 'w') as outf1:
-        for line in f1:
-            line = line.decode('utf-8')
-            if not line.startswith('>'):
-                continue
-
-            line = line.lstrip('>').split('\x01')
-
-            accession_numbers = [i.split(' ')[0] for i in line]
-            fastaid = accession_numbers[0]
-            
+    with open(fastaid2LCAtaxid_file, 'w') as outf1:
+        for fastaid, prot_accessions in fastaid2prot_accessions.items():
             list_of_lineages = []
-            for accession_number in accession_numbers:
+            for prot_accession in prot_accessions:
                 try:
-                    taxid = prot_accession2taxid[accession_number]
+                    taxid = prot_accession2taxid[prot_accession]
                     lineage = tax.find_lineage(taxid, taxid2parent)
                     list_of_lineages.append(lineage)
                 except:
@@ -305,35 +391,38 @@ def make_fastaid2LCAtaxid_file(
                 # that are missing in prot.accession2taxid or whose taxid is
                 # missing in nodes.dmp. NOTE that these entries are thus not
                 # present in the output file.
+                no_taxid += 1
+
                 continue
 
             LCAtaxid = tax.find_LCA(list_of_lineages)
 
             outf1.write('{0}\t{1}\n'.format(fastaid, LCAtaxid))
 
-            try:
-                if LCAtaxid != prot_accession2taxid[fastaid]:
-                    corrected += 1
-            except:
+            if (fastaid not in prot_accession2taxid or
+                    LCAtaxid != prot_accession2taxid[fastaid]):
                 # If the fastaid cannot be found in prot.accession2taxid, but
                 # a taxid is given to the fastaid based on secondary accession
-                # numbers, it is counted as a correction as well.
+                # numbers, or if the taxid of the header is different from the
+                # LCA taxid, it is counted as corrected.
                 corrected += 1
 
     message = ('Done! File {0} is created. '
-            '{1} of {2} headers ({3:.1f}%) corrected.'.format(
+            '{1:,d} of {2:,d} headers ({3:.1f}%) corrected. '
+            '{4:,d} headers ({5:.1f}%) do not have a taxid assigned.'.format(
                 fastaid2LCAtaxid_file,
                 corrected,
                 total,
-                corrected / total * 100))
+                corrected / total * 100,
+                no_taxid,
+                no_taxid / total * 100))
     shared.give_user_feedback(message, log_file, quiet)
 
     return
-    
-    
-def find_offspring(nodes_dmp, fastaid2LCAtaxid_file, log_file, quiet):
-    (taxid2parent, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)
 
+
+def find_offspring(
+        nodes_dmp, fastaid2LCAtaxid_file, taxid2parent, log_file, quiet):
     message = 'Searching nr database for taxids with multiple offspring.'
     shared.give_user_feedback(message, log_file, quiet)
 
@@ -398,7 +487,7 @@ def prepare(step_list, args):
     if 'download_prot_accession2taxid_file' in step_list:
         setattr(args,
                 'prot_accession2taxid_file',
-                '{0}{1}.prot.accession2taxid.gz'.format(
+                '{0}{1}.prot.accession2taxid.FULL.gz'.format(
                     args.taxonomy_folder, args.date))
 
         download_prot_accession2taxid_file(
@@ -428,6 +517,11 @@ def prepare(step_list, args):
                 args.quiet,
                 args.verbose)
 
+    if ('make_fastaid2LCAtaxid_file' in step_list
+            or 'make_taxids_with_multiple_offspring_file' in step_list):
+        taxid2parent, taxid2rank = tax.import_nodes(
+                args.nodes_dmp, args.log_file, args.quiet)
+
     if 'make_fastaid2LCAtaxid_file' in step_list:
         setattr(args,
                 'fastaid2LCAtaxid_file',
@@ -439,6 +533,7 @@ def prepare(step_list, args):
                 args.fastaid2LCAtaxid_file,
                 args.nr_file,
                 args.prot_accession2taxid_file,
+                taxid2parent,
                 args.log_file,
                 args.quiet)
 
@@ -451,6 +546,7 @@ def prepare(step_list, args):
         taxid2offspring = find_offspring(
                 args.nodes_dmp,
                 args.fastaid2LCAtaxid_file,
+                taxid2parent,
                 args.log_file,
                 args.quiet)
         write_taxids_with_multiple_offspring_file(
@@ -460,7 +556,7 @@ def prepare(step_list, args):
                 args.quiet)
 
     message = ('\n-----------------\n\n'
-            '[{0}] CAT prepare is done!'.format(datetime.datetime.now()))
+            '{0} CAT prepare is done!'.format(shared.timestamp()))
     shared.give_user_feedback(message, args.log_file, args.quiet,
             show_time=False)
 
@@ -471,7 +567,7 @@ def prepare(step_list, args):
 
     message = (
             '\nSupply the following arguments to CAT or BAT if you want to '
-            'use the constructed database:\n'
+            'use this database:\n'
             '-d / --database_folder {0}\n'
             '-t / --taxonomy_folder {1}'.format(
                 args.database_folder, args.taxonomy_folder))
@@ -545,19 +641,7 @@ def run_fresh(args):
         shared.give_user_feedback(message, args.log_file, args.quiet)
         
     # Check memory.
-    (total_memory, error) = check.check_memory(args.min_mem)
-    if error:
-        message = (
-                'at least {0}GB of memory is needed for a fresh database '
-                'construction. {1}GB is found on your system. You can try to '
-                'find a machine with more memory, or download preconstructed '
-                'database files from '
-                'tbb.bio.uu.nl/bastiaan/CAT_prepare/.'.format(
-                    args.min_mem, total_memory))
-        shared.give_user_feedback(message, args.log_file, args.quiet,
-                error=True)
-
-        sys.exit(1)
+    memory_bottleneck(args)
 
     step_list = ['download_taxonomy_files',
                  'download_prot_accession2taxid_file',
@@ -775,20 +859,8 @@ def run_existing(args):
 
     if 'make_fastaid2LCAtaxid_file' in step_list:
         # Check memory.
-        (total_memory, error) = check.check_memory(args.min_mem)
-        if error:
-            message = (
-                    'at least {0}GB of memory is needed for the database '
-                    'construction. {1}GB is found on your system. You can try '
-                    'to find a machine with more memory, or download '
-                    'preconstructed database files '
-                    'from tbb.bio.uu.nl/bastiaan/CAT_prepare/.'.format(
-                        args.min_mem, total_memory))
-            shared.give_user_feedback(message, args.log_file, args.quiet,
-                    error=True)
-            
-            sys.exit(1)
-            
+        memory_bottleneck(args)
+
     if len(step_list) == 0:
         message = ('All necessary files are found. Existing database does not '
                 'need any more work...')


=====================================
CAT_pack/shared.py
=====================================
@@ -36,6 +36,13 @@ class SuffixAction(argparse.Action):
         setattr(namespace, self.dest, bin_suffix)
 
 
+def timestamp():
+    now = datetime.datetime.now()
+    str_ = '[{0}]'.format(now.strftime('%Y-%m-%d %H:%M:%S'))
+
+    return str_
+
+
 def expand_arguments(args):
     if 'r' in args:
         setattr(args, 'one_minus_r', (100 - args.r) / 100)
@@ -94,7 +101,12 @@ def explore_taxonomy_folder(args):
                 nodes_dmp = '{0}{1}'.format(args.taxonomy_folder, file_)
             elif file_ == 'names.dmp':
                 names_dmp = '{0}{1}'.format(args.taxonomy_folder, file_)
-            elif file_.endswith('prot.accession2taxid.gz'):
+            elif file_.endswith('prot.accession2taxid.FULL.gz'):
+                prot_accession2taxid_file = '{0}{1}'.format(
+                        args.taxonomy_folder, file_)
+            elif (file_.endswith('prot.accession2taxid.gz') and
+                    prot_accession2taxid_file is None):
+                # Legacy prot_accession2taxid_file.
                 prot_accession2taxid_file = '{0}{1}'.format(
                         args.taxonomy_folder, file_)
 
@@ -156,15 +168,13 @@ def print_variables(args, step_list=None):
     return
 
 
-def give_user_feedback(
-        message, log_file=None, quiet=False, show_time=True, error=False):
+def give_user_feedback(message,
+        log_file=None, quiet=False, show_time=True, error=False):
     if error:
         message = 'ERROR: {0}'.format(message)
 
     if show_time:
-        time = datetime.datetime.now()
-
-        message = '[{0}] {1}'.format(time, message)
+        message = '{0} {1}'.format(timestamp(), message)
 
     message = '{0}\n'.format(message)
 
@@ -191,7 +201,7 @@ def run_prodigal(
     message = (
             'Running Prodigal for ORF prediction. Files {0} and {1} will be '
             'generated. Do not forget to cite Prodigal when using CAT or BAT '
-            'in your publication!'.format(proteins_fasta, proteins_gff))
+            'in your publication.'.format(proteins_fasta, proteins_gff))
     give_user_feedback(message, log_file, quiet)
 
     try:
@@ -231,16 +241,16 @@ def run_diamond(args):
     message = (
             'Homology search with DIAMOND is starting. Please be patient. Do '
             'not forget to cite DIAMOND when using CAT or BAT in your '
-            'publication!\n'
-            '\t\t\t\tquery: {0}\n'
-            '\t\t\t\tdatabase: {1}\n'
-            '\t\t\t\tmode: {2}\n'
-            '\t\t\t\tnumber of cores: {3}\n'
-            '\t\t\t\tblock-size (billions of letters): {4}\n'
-            '\t\t\t\tindex-chunks: {5}\n'
-            '\t\t\t\ttmpdir: {6}\n'
-            '\t\t\t\tcompress: {7}\n'
-            '\t\t\t\ttop: {8}'.format(
+            'publication.\n'
+            '\t\t\tquery: {0}\n'
+            '\t\t\tdatabase: {1}\n'
+            '\t\t\tmode: {2}\n'
+            '\t\t\tnumber of cores: {3}\n'
+            '\t\t\tblock-size (billions of letters): {4}\n'
+            '\t\t\tindex-chunks: {5}\n'
+            '\t\t\ttmpdir: {6}\n'
+            '\t\t\tcompress: {7}\n'
+            '\t\t\ttop: {8}'.format(
                 args.proteins_fasta,
                 args.diamond_database,
                 mode,
@@ -304,9 +314,9 @@ def import_contig_names(fasta_file, log_file, quiet):
                 
                 if contig in contig_names:
                     message = (
-                            'it looks like your fasta file contains duplicate '
-                            'headers! The first duplicate encountered is {0}, '
-                            'but there might be more...'.format(contig))
+                            'your fasta file contains duplicate headers. The '
+                            'first duplicate encountered is {0}, but there '
+                            'might be more...'.format(contig))
                     give_user_feedback(message, log_file, quiet, error=True)
                     
                     sys.exit(1)
@@ -369,14 +379,14 @@ def parse_tabular_alignment(
         if not line[0] == ORF:
             # A new ORF is reached.
             ORF = line[0]
-            best_bitscore = decimal.Decimal(line[11])
+            top_bitscore = decimal.Decimal(line[11])
             ORF2hits[ORF] = []
 
             ORF_done = False
 
         bitscore = decimal.Decimal(line[11])
         
-        if bitscore >= one_minus_r * best_bitscore:
+        if bitscore >= one_minus_r * top_bitscore:
             # The hit has a high enough bit-score to be included.
             hit = line[1]
 


=====================================
CAT_pack/single_bin.py
=====================================
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
 
 import argparse
-import datetime
 import decimal
 import multiprocessing
 import sys
@@ -129,7 +128,7 @@ def parse_arguments():
             dest='no_stars',
             required=False,
             action='store_true',
-            help='Suppress marking of suggestive classifications.')
+            help='Suppress marking of suggestive taxonomic assignments.')
     optional.add_argument(
             '--force',
             dest='force',
@@ -466,7 +465,8 @@ def run():
 
     with open(args.bin2classification_output_file, 'w') as outf1, open(args.ORF2LCA_output_file, 'w') as outf2:
         outf1.write('# bin\tclassification\treason\tlineage\tlineage scores\n')
-        outf2.write('# ORF\tbin\tlineage\tbit-score\n')
+
+        outf2.write('# ORF\tbin\tnumber of hits\tlineage\ttop bit-score\n')
 
         # The list contains only a single bin, but I keep the code like this
         # to make the code consistent across bin and bins.
@@ -484,7 +484,9 @@ def run():
                                 ''.format(ORF, bin_))
                         
                         continue
-                    
+
+                    n_hits = len(ORF2hits[ORF])
+
                     (taxid,
                             top_bitscore) = tax.find_LCA_for_ORF(
                                     ORF2hits[ORF],
@@ -492,8 +494,8 @@ def run():
                                     taxid2parent)
                      
                     if taxid.startswith('no taxid found'):
-                        outf2.write('{0}\t{1}\t{2}\t{3}\n'.format(
-                            ORF, bin_, taxid, top_bitscore))
+                        outf2.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
+                            ORF, bin_, n_hits, taxid, top_bitscore))
                     else:
                         lineage = tax.find_lineage(taxid, taxid2parent)
 
@@ -501,14 +503,18 @@ def run():
                             lineage = tax.star_lineage(
                                     lineage, taxids_with_multiple_offspring)
                         
-                        outf2.write('{0}\t{1}\t{2}\t{3}\n'.format(
-                            ORF, bin_, ';'.join(lineage[::-1]), top_bitscore))
+                        outf2.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
+                            ORF,
+                            bin_,
+                            n_hits,
+                            ';'.join(lineage[::-1]),
+                            top_bitscore))
                                     
                     LCAs_ORFs.append((taxid, top_bitscore),)
                     
             if len(LCAs_ORFs) == 0:
-                outf1.write('{0}\tunclassified\tno hits to database\n'.format(
-                    bin_))
+                outf1.write('{0}\tno taxid assigned\tno hits to database\n'
+                        ''.format(bin_))
 
                 continue
             
@@ -518,14 +524,14 @@ def run():
                             LCAs_ORFs, taxid2parent, args.f)
              
             if lineages == 'no ORFs with taxids found.':
-                outf1.write('{0}\tunclassified\t'
+                outf1.write('{0}\tno taxid assigned\t'
                         'hits not found in taxonomy files\n'.format(bin_))
 
                 continue
             
             if lineages == 'no lineage whitelisted.':
                 outf1.write(
-                        '{0}\tunclassified\t'
+                        '{0}\tno taxid assigned\t'
                         'no lineage reached minimum bit-score support\n'
                         ''.format(bin_))
 
@@ -549,7 +555,7 @@ def run():
                     # There is only one classification.
                     outf1.write(
                             '{0}\t'
-                            'classified\t'
+                            'taxid assigned\t'
                             'based on {1}/{2} ORFs\t'
                             '{3}\t'
                             '{4}\n'.format(
@@ -562,7 +568,7 @@ def run():
                     # There are multiple classifications.
                     outf1.write(
                             '{0}\t'
-                            'classified ({1}/{2})\t'
+                            'taxid assigned ({1}/{2})\t'
                             'based on {3}/{4} ORFs\t'
                             '{5}\t'
                             '{6}\n'.format(
@@ -575,9 +581,8 @@ def run():
                                 ';'.join(scores[::-1])))
                                    
     message = ('\n-----------------\n'
-               '[{0}] BAT is done! {1}/1 bin classified.'.format(
-                   datetime.datetime.now(),
-                   n_classified_bins))
+               '{0} BAT is done! {1}/1 bin has taxonomy assigned.'.format(
+                   shared.timestamp(), n_classified_bins))
     shared.give_user_feedback(message, args.log_file, args.quiet,
             show_time=False)
   


=====================================
CAT_pack/summarise.py
=====================================
@@ -153,7 +153,7 @@ def summarise_contigs(args):
                                 '{0} appears to be a BAT classification file. '
                                 'If you want to summarise bin '
                                 'classifications, simply don\'t supply a '
-                                'contigs fasta and everything should be fine!'
+                                'contigs fasta and everything should be fine.'
                                 ''.format(args.input_file))
                         shared.give_user_feedback(
                                 message, args.log_file, args.quiet, error=True)
@@ -182,7 +182,7 @@ def summarise_contigs(args):
             sys.exit(1)
             
     length = {}
-    length['unclassified'] = []
+    length['no taxid assigned'] = []
 
     ORFs = {}
 
@@ -225,8 +225,8 @@ def summarise_contigs(args):
 
                 sys.exit(1)
 
-            if line[1] == 'unclassified':
-                length['unclassified'].append(contig2length[contig])
+            if line[1] == 'no taxid assigned':
+                length['no taxid assigned'].append(contig2length[contig])
 
                 continue
 
@@ -270,14 +270,15 @@ def summarise_contigs(args):
     with open(args.output_file, 'w') as outf1:
         n_contigs = len(contig2length)
         total_length = sum(contig2length.values())
-        n_classified_contigs = n_contigs - len(length['unclassified'])
-        total_classified_length = total_length - sum(length['unclassified'])
+        n_classified_contigs = n_contigs - len(length['no taxid assigned'])
+        total_classified_length = total_length - sum(
+                length['no taxid assigned'])
 
-        outf1.write('# total number of contigs in {0} is {1} representing {2} '
-                'positions.\n'.format(
+        outf1.write('# total number of contigs in {0} is {1:,d} representing '
+                '{2:,d} positions.\n'.format(
                     args.contigs_fasta, n_contigs, total_length))
-        outf1.write('# {0} contigs are classified ({1:.2f}%) representing {2} '
-                'positions ({3:.2f}%) in {4}.\n'.format(
+        outf1.write('# {0:,d} contigs have taxonomy assigned ({1:.2f}%) '
+                'representing {2:,d} positions ({3:.2f}%) in {4}.\n'.format(
                     n_classified_contigs,
                     n_classified_contigs / n_contigs * 100,
                     total_classified_length,
@@ -373,7 +374,7 @@ def summarise_bins(args):
             sys.exit(1)
             
     n_bins = {}
-    n_bins['unclassified'] = 0
+    n_bins['no taxid assigned'] = 0
     
     official_ranks = ['superkingdom', 'phylum', 'class', 'order', 'family',
                       'genus', 'species']
@@ -402,8 +403,8 @@ def summarise_bins(args):
 
             bin_trace.add(bin_)
             
-            if line[1] == 'unclassified':
-                n_bins['unclassified'] += 1
+            if line[1] == 'no taxid assigned':
+                n_bins['no taxid assigned'] += 1
                 
                 continue
 
@@ -427,11 +428,11 @@ def summarise_bins(args):
 
         sys.exit(1)
         
-    n_classified_bins = n - n_bins['unclassified']
+    n_classified_bins = n - n_bins['no taxid assigned']
 
     with open(args.output_file, 'w') as outf1:
-        outf1.write('# total number of bins is {0}, of which {1} ({2:.2f}%) '
-                'are classified.\n'.format(
+        outf1.write('# total number of bins is {0:,d}, of which {1:,d} '
+                '({2:.2f}%) have taxonomy assigned.\n'.format(
                     n, n_classified_bins, n_classified_bins / n * 100))
         outf1.write('#\n')
         outf1.write('# rank\tclade\tnumber of bins\n')


=====================================
CAT_pack/tax.py
=====================================
@@ -6,7 +6,7 @@ import shared
 
 
 def import_nodes(nodes_dmp, log_file, quiet):
-    message = 'Importing file {0}.'.format(nodes_dmp)
+    message = 'Loading file {0}.'.format(nodes_dmp)
     shared.give_user_feedback(message, log_file, quiet)
     
     taxid2parent = {}
@@ -27,7 +27,7 @@ def import_nodes(nodes_dmp, log_file, quiet):
 
 
 def import_names(names_dmp, log_file, quiet):
-    message = 'Importing file {0}.'.format(names_dmp)
+    message = 'Loading file {0}.'.format(names_dmp)
     shared.give_user_feedback(message, log_file, quiet)
 
     taxid2name = {}
@@ -46,7 +46,7 @@ def import_names(names_dmp, log_file, quiet):
 
 
 def import_fastaid2LCAtaxid(fastaid2LCAtaxid_file, all_hits, log_file, quiet):
-    message = 'Importing file {0}.'.format(fastaid2LCAtaxid_file)
+    message = 'Loading file {0}.'.format(fastaid2LCAtaxid_file)
     shared.give_user_feedback(message, log_file, quiet)
 
     fastaid2LCAtaxid = {}
@@ -64,7 +64,7 @@ def import_fastaid2LCAtaxid(fastaid2LCAtaxid_file, all_hits, log_file, quiet):
 
 def import_taxids_with_multiple_offspring(
         taxids_with_multiple_offspring_file, log_file, quiet):
-    message = 'Importing file {0}.'.format(taxids_with_multiple_offspring_file)
+    message = 'Loading file {0}.'.format(taxids_with_multiple_offspring_file)
     shared.give_user_feedback(message, log_file, quiet)
 
     taxids_with_multiple_offspring = set()
@@ -251,7 +251,7 @@ def convert_to_official_names(lineage, taxid2rank, taxid2name, scores=None):
             'genus', 'species']
     lineage_ranks = [taxid2rank[taxid.rstrip('*')] for taxid in lineage]
 
-    official_names = ['not classified'] * 7
+    official_names = ['no support'] * 7
 
     for (i, rank) in enumerate(official_ranks):
         if rank in lineage_ranks:
@@ -282,11 +282,11 @@ def convert_to_official_names(lineage, taxid2rank, taxid2name, scores=None):
     # Fill the official lineage with NAs if a lower classification is present.
     index_lowest_classification = 0
     for (i, name) in enumerate(official_names):
-        if name != 'not classified':
+        if name != 'no support':
             index_lowest_classification = i
             
     for i in range(index_lowest_classification):
-        if official_names[i] == 'not classified':
+        if official_names[i] == 'no support':
             official_names[i] = 'NA'
 
     return official_names


=====================================
CHANGELOG.md
=====================================
@@ -1,5 +1,8 @@
 # Changelog
 
+## 5.2
+`CAT prepare` now uses the latest taxonomy mapping files from NCBI, significantly expanding taxonomic coverage of proteins in nr. File integrity of downloads is assessed based on md5 checksums. The ORF2LCA output file contains a new column for the number of hits the classification is based on. We have made textual changes to the output files to better reflect the meaning of 'classified' and 'not classified' in different contexts.
+
 ## 5.1.2
 Code streamlining.
 


=====================================
README.md
=====================================
@@ -6,7 +6,7 @@
 - [Getting started](#getting-started)
 - [Usage](#usage)
 - [Interpreting the output files](#interpreting-the-output-files)
-- [Marking suggestive classifications with an asterisk](#marking-suggestive-classifications-with-an-asterisk)
+- [Marking suggestive taxonomic assignments with an asterisk](#marking-suggestive-taxonomic-assignments-with-an-asterisk)
 - [Optimising running time, RAM, and disk usage](#optimising-running-time-ram-and-disk-usage)
 - [Examples](#examples)
 
@@ -55,15 +55,15 @@ To get started with CAT and BAT, you will have to get the database files on your
 To download the database files, find the most recent version on [tbb.bio.uu.nl/bastiaan/CAT\_prepare/](https://tbb.bio.uu.nl/bastiaan/CAT_prepare/), download and extract, and you are ready to go!
 
 ```
-$ wget tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20200618.tar.gz
+$ wget tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20201123.tar.gz
 
-$ tar -xvzf CAT_prepare_20200618.tar.gz
+$ tar -xvzf CAT_prepare_20201123.tar.gz
 ```
 
-Make sure that your version of DIAMOND is the same as with which the database is constructed. You can find the DIAMOND version within the database log file:
+Your version of DIAMOND should be the same as the one with which the database was constructed. For this reason, the DIAMOND executable is supplied within the CAT prepare folder. Alternatively, you can find the DIAMOND version used for database construction within the database log file:
 
 ```
-$ grep version 2020-06-18.CAT_prepare.fresh.log
+$ grep version 2020-11-23.CAT_prepare.fresh.log
 ```
 
 ### Generating the database files yourself.
@@ -72,14 +72,14 @@ $ grep version 2020-06-18.CAT_prepare.fresh.log
 $ CAT prepare --fresh
 ```
 
-This will download the taxonomy files from NCBI taxonomy to a taxonomy folder, and the nr database to a database folder. A DIAMOND database is constructed from the nr file. CAT prepare also generates a fastaid2LCAtaxid file, as the first accession numbers in the headers of nr are not necessarily the Last Common Ancestor (LCA) of all accession numbers in it. Moreover, the file taxids\_with\_multiple\_offspring is generated. CAT prepare will typically take a few hours to create a fresh database, and will use up to 100GB of memory.
+This will download the taxonomy files from NCBI taxonomy to a taxonomy folder, and the nr database to a database folder. A DIAMOND database is constructed from the nr file. CAT prepare also generates a fastaid2LCAtaxid file, as the first accession numbers in the headers of nr are not necessarily the Last Common Ancestor (LCA) of all accession numbers in it. Moreover, the file taxids\_with\_multiple\_offspring is generated. CAT prepare will typically take a few hours to create a fresh database, and will use up to 200GB of memory.
 
 If some of the files are already on your system (say the taxonomy files and the nr database) you can run:
 ```
 $ CAT prepare --existing -d {folder containing nr} -t {folder containing taxonomy files}
 ```
 
-CAT prepare will try to assess which files need to be downloaded and created and start from that point. CAT prepare only checks if the necessary files are there, not if they are correctly formatted.
+CAT prepare will assess which files need to be downloaded and created and start from that point. CAT prepare only checks if the necessary files are there, not if they are correctly formatted.
 
 ### Running CAT and BAT.
 The taxonomy folder and database folder created by CAT prepare are needed in subsequent CAT and BAT runs. They only need to be generated/downloaded once or whenever you want to update the nr database.
@@ -157,12 +157,12 @@ The contig2classification and bin2classification output looks like this:
 
 contig or bin | classification | reason | lineage | lineage scores
 --- | --- | --- | --- | ---
-contig\_1 | classified | based on 14/15 ORFs | 1;131567;2;1783272 | 1.00; 1.00; 1.00; 0.78
-contig\_2 | classified (1/2) | based on 10/10 ORFs | 1;131567;2;1783272;1798711;1117;307596;307595;1890422;33071;1416614;1183438\* | 1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;0.23;0.23
-contig\_2 | classified (2/2) | based on 10/10 ORFs | 1;131567;2;1783272;1798711;1117;307596;307595;1890422;33071;33072 | 1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;0.77
-contig\_3 | unclassified | no ORFs found
+contig\_1 | taxid assigned | based on 14/15 ORFs | 1;131567;2;1783272 | 1.00; 1.00; 1.00; 0.78
+contig\_2 | taxid assigned (1/2) | based on 10/10 ORFs | 1;131567;2;1783272;1798711;1117;307596;307595;1890422;33071;1416614;1183438\* | 1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;0.23;0.23
+contig\_2 | taxid assigned (2/2) | based on 10/10 ORFs | 1;131567;2;1783272;1798711;1117;307596;307595;1890422;33071;33072 | 1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;0.77
+contig\_3 | no taxid assigned | no ORFs found
 
-Where the lineage scores represent the fraction of bit-score support for each classification. **Contig\_2 has two classifications.** This can happen if the *f* parameter is chosen below 0.5. For an explanation of the **starred classification**, see [Marking suggestive classifications with an asterisk](#marking-suggestive-classifications-with-an-asterisk).
+Where the lineage scores represent the fraction of bit-score support for each classification. **Contig\_2 has two classifications.** This can happen if the *f* parameter is chosen below 0.5. For an explanation of the **starred classification**, see [Marking suggestive taxonomic assignments with an asterisk](#marking-suggestive-taxonomic-assignments-with-an-asterisk).
 
 To add names to the taxonomy id's in either output file, run:
 
@@ -192,8 +192,8 @@ $ CAT summarise -i {named BAT classification file} -o {output file}
 
 CAT summarise currently does not support classification files wherein some contigs / MAGs have multiple classifications (as contig\_2 above).
 
-## Marking suggestive classifications with an asterisk
-When we want to confidently go down to the lowest taxonomic level possible for an classification, an important assumption is that on that level conflict between classifications could have arisen. Namely, if there were conflicting classifications, the algorithm would have made the classification more conservative by moving up a level. Since it did not, we can trust the low-level classification. However, it is not always possible for conflict to arise, because in some cases no other sequences from the clade are present in the database. This is true for example for the family Dehalococcoidaceae, which in our databases is the sole representative of the order Dehalococcoidales. Thus, here we cannot confidently state that an classification on the family level is more correct than an classification on the order level. For these cases, CAT and BAT mark the lineage with asterisks, starting from the lowest level classification up to the level where conflict could have arisen because the clade contains multiple taxa with database entries. The user is advised to examine starred taxa more carefully, for example by analysing sequence identity between predicted ORFs and hits, or move up the lineage to a confident classification (i.e. the first classification without an asterisk).
+## Marking suggestive taxonomic assignments with an asterisk
+When we want to confidently go down to the lowest taxonomic level possible for a classification, an important assumption is that on that level conflict between classifications could have arisen. Namely, if there were conflicting classifications, the algorithm would have made the classification more conservative by moving up a level. Since it did not, we can trust the low-level classification. However, it is not always possible for conflict to arise, because in some cases no other sequences from the clade are present in the database. This is true for example for the family Dehalococcoidaceae, which in our databases is the sole representative of the order Dehalococcoidales. Thus, here we cannot confidently state that a classification on the family level is more correct than a classification on the order level. For these cases, CAT and BAT mark the lineage with asterisks, starting from the lowest level classification up to the level where conflict could have arisen because the clade contains multiple taxa with database entries. The user is advised to examine starred taxa more carefully, for example by analysing sequence identity between predicted ORFs and hits, or move up the lineage to a confident classification (i.e. the first classification without an asterisk).
 
 If you do not want the asterisks in your output files, you can add the `--no_stars` flag to CAT or BAT.
 


=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+cat-bat (5.2-1) unstable; urgency=medium
+
+  * New upstream version
+  * Standards-Version: 4.5.1 (routine-update)
+
+ -- Nilesh Patra <npatra974 at gmail.com>  Wed, 16 Dec 2020 19:31:44 +0530
+
 cat-bat (5.1.2-1) unstable; urgency=medium
 
   * New upstream version


=====================================
debian/control
=====================================
@@ -1,13 +1,13 @@
 Source: cat-bat
 Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
+Uploaders: Andreas Tille <tille at debian.org>, Nilesh Patra <npatra974 at gmail.com>
 Section: science
 Priority: optional
 Build-Depends: debhelper-compat (= 13),
                dh-python,
                python3,
                diamond-aligner
-Standards-Version: 4.5.0
+Standards-Version: 4.5.1
 Vcs-Browser: https://salsa.debian.org/med-team/cat-bat
 Vcs-Git: https://salsa.debian.org/med-team/cat-bat.git
 Homepage: https://github.com/dutilh/CAT


=====================================
debian/patches/fix_interpreter.patch
=====================================
@@ -8,5 +8,5 @@ Description: Fix typo in interpreter line
 -#!/usr/bin/env/ python3
 +#!/usr/bin/env python3
  
+ import hashlib
  import os
- import subprocess



View it on GitLab: https://salsa.debian.org/med-team/cat-bat/-/compare/e860a73ba7c64a5a844845c0c1318cf3907e0dbe...db36fc38e711c5cfeb0e217253368f7d9d0b4b29

-- 
View it on GitLab: https://salsa.debian.org/med-team/cat-bat/-/compare/e860a73ba7c64a5a844845c0c1318cf3907e0dbe...db36fc38e711c5cfeb0e217253368f7d9d0b4b29
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201216/97ab1665/attachment-0001.html>


More information about the debian-med-commit mailing list