[med-svn] [Git][med-team/cat-bat][master] 9 commits: New upstream version 6.0.1

Nilesh Patra (@nilesh) gitlab at salsa.debian.org
Sat Nov 1 17:19:46 GMT 2025



Nilesh Patra pushed to branch master at Debian Med / cat-bat


Commits:
1f154708 by Nilesh Patra at 2025-11-01T22:08:27+05:30
New upstream version 6.0.1
- - - - -
6a0478fc by Nilesh Patra at 2025-11-01T22:08:29+05:30
Update upstream source from tag 'upstream/6.0.1'

Update to upstream version '6.0.1'
with Debian dir 24906706d0917c8739b12cb676f53f1c017c9df1
- - - - -
3d1a54e7 by Nilesh Patra at 2025-11-01T22:08:31+05:30
Bump Standards-Version to 4.7.2 (no changes needed)

- - - - -
3f85bab0 by Nilesh Patra at 2025-11-01T22:08:34+05:30
Drop Redundant "Rules-Requires-Root: no"

- - - - -
582aadcc by Nilesh Patra at 2025-11-01T22:08:48+05:30
Drop myself from uploaders

- - - - -
991fa8ec by Nilesh Patra at 2025-11-01T22:10:04+05:30
Drop patches; fixed upstream

- - - - -
3b6f0d27 by Nilesh Patra at 2025-11-01T22:24:20+05:30
Update links to match CAT entry point

- - - - -
0fc67419 by Nilesh Patra at 2025-11-01T22:33:39+05:30
Update manpage as per updated binary name

- - - - -
a7a747dd by Nilesh Patra at 2025-11-01T22:33:39+05:30
Upload to unstable

- - - - -


24 changed files:

- CAT_pack/CAT → CAT_pack/CAT_pack
- CAT_pack/about.py
- CAT_pack/add_names.py
- CAT_pack/bins.py
- CAT_pack/check.py
- CAT_pack/contigs.py
- CAT_pack/download.py
- CAT_pack/prepare.py
- + CAT_pack/reads.py
- CAT_pack/shared.py
- CAT_pack/summarise.py
- CAT_pack/tax.py
- CHANGELOG.md
- README.md
- debian/changelog
- debian/control
- debian/createmanpages
- debian/links
- + debian/man/CAT.1
- debian/CAT.1 → debian/man/CAT_pack.1
- debian/manpages
- − debian/patches/fix_interpreter.patch
- − debian/patches/series
- + tests/data/PATRIC.weighted_mean_genome_size.txt


Changes:

=====================================
CAT_pack/CAT → CAT_pack/CAT_pack
=====================================
@@ -6,6 +6,7 @@ import about
 import add_names
 import bins
 import contigs
+import reads
 import download
 import prepare
 import summarise
@@ -13,10 +14,10 @@ import summarise
 
 def usage():
     message = (
-        "usage: CAT (download | prepare | contigs | bins | add_names | "
-        "summarise) [-v / --version] [-h / --help]\n"
-        "CAT: error: one of the arguments "
-        "download prepare contigs bins add_names summarise "
+        "usage: CAT_pack (download | prepare | contigs | bins | reads "
+        "| add_names | summarise) [-v / --version] [-h / --help]\n"
+        "CAT_pack: error: one of the arguments "
+        "download prepare contigs bins reads add_names summarise "
         "is required"
     )
     
@@ -26,8 +27,16 @@ def usage():
     
     
 def version():
-    message = ("CAT v{0} ({1}) by {2}.".format(
-        about.__version__, about.__date__, about.__author__))
+    message = ("CAT_pack pack v{0} ({1})".format(
+        about.__version__, about.__date__))
+    if len(about.__authors__) == 1:
+        message += " by {0}".format(about.__authors__[0])
+    elif len(about.__authors__) == 2:
+        message += " by {0}".format(" and ".join(about.__authors__))
+    elif len(about.__authors__) > 2:
+        message += " by {0}, and {1}".format(
+            ", ".join(about.__authors__[:-1]), about.__authors__[-1])
+    message += "."
 
     sys.stdout.write("{0}\n".format(message))
 
@@ -36,16 +45,19 @@ def version():
     
 def help():
     message = (
-        "usage: CAT (prepare | contigs | bin | bins | add_names | summarise) "
-        "[-v / --version] [-h / --help]\n\n"
-        "Run Contig Annotation Tool (CAT) or "
-        "Bin Annotation Tool (BAT).\n\n"
+        "usage: CAT_pack (download | prepare | contigs | bins | reads "
+        "| add_names | summarise) [-v / --version] [-h / --help]\n\n"
+        "Run Contig Annotation Tool (CAT), "
+        "Bin Annotation Tool (BAT), or " 
+        "Read Annotation Tool (RAT).\n\n"
         "Required choice:\n"
         "  download\t\tDownload and preprocess data from NCBI nr or GTDB.\n"
         "  prepare\t\tConstruct database files.\n"
         "  contigs\t\tRun CAT.\n"
         "  bins\t\t\tRun BAT.\n"
-        "  add_names\t\tAdd taxonomic names to CAT or BAT output files.\n"
+        "  reads\t\t\tRun RAT.\n"
+        "  add_names\t\tAdd taxonomic names to CAT, BAT, or RAT output files."
+        "\n"
         "  summarise\t\tSummarise a named CAT or BAT classification file."
         "\n\n"
         "Optional arguments:\n"
@@ -69,6 +81,8 @@ def main():
         contigs.run()
     elif sys.argv[1] == "bins":
         bins.run()
+    elif sys.argv[1] == "reads":
+        reads.run()
     elif sys.argv[1] == "add_names":
         add_names.run()
     elif sys.argv[1] == "summarise":


=====================================
CAT_pack/about.py
=====================================
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
 
-__author__ = "F. A. Bastiaan von Meijenfeldt"
-__version__ = "5.3"
-__date__ = "4 November, 2023"
+__authors__ = ["F. A. Bastiaan von Meijenfeldt", "Nikos Pappas", "Ernestina Hauptfeld"]
+__version__ = "6.0"
+__date__ = "1 March, 2024"


=====================================
CAT_pack/add_names.py
=====================================
@@ -11,9 +11,10 @@ import tax
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
-        prog="CAT add_names",
-        description="Add taxonomic names to CAT or BAT output files.",
-        usage="CAT add_names -i FILE -o FILE -t DIR [options] [-h / --help]",
+        prog="CAT_pack add_names",
+        description="Add taxonomic names to CAT, BAT, or RAT output files.",
+        usage=("CAT_pack add_names -i FILE -o FILE -t DIR [options] "
+            "[-h / --help]"),
         add_help=False)
     
     required = parser.add_argument_group("Required arguments")
@@ -22,7 +23,7 @@ def parse_arguments():
         "input_file",
         True,
         help_=("Path to input file. Can be classification or ORF2LCA output "
-               "file from CAT or BAT."))
+            "file from CAT, BAT or RAT."))
     shared.add_argument(required, "output_file", True)
     shared.add_argument(required, "taxonomy_folder", True)
 
@@ -38,7 +39,7 @@ def parse_arguments():
     extra_args = [arg for (i, arg) in enumerate(extra_args) if
                   (i, arg) != (0, "add_names")]
     if len(extra_args) > 0:
-        sys.exit("error: too much arguments supplied:\n{0}".format(
+        sys.exit("error: too many arguments supplied:\n{0}".format(
             "\n".join(extra_args)))
 
     # Add extra arguments.
@@ -50,7 +51,7 @@ def parse_arguments():
 def run():
     args = parse_arguments()
 
-    message = "# CAT v{0}.".format(about.__version__)
+    message = "# CAT_pack v{0}.".format(about.__version__)
     shared.give_user_feedback(
         message, args.log_file, args.quiet, show_time=False)
 
@@ -84,29 +85,29 @@ def run():
     with open(args.input_file, "r") as f1:
         for line in f1:
             if line.startswith("#"):
-                line = line.rstrip().split("\t")
+                line = line[2:].rstrip().split("\t")
 
                 if "lineage" in line:
                     lineage_index = line.index("lineage")
                 else:
                     message = ("{0} is not a supported classification file."
-                               "".format(args.input_file))
+                            "".format(args.input_file))
                     shared.give_user_feedback(
                         message, args.log_file, args.quiet, error=True)
 
                     sys.exit(1)
-                    
-                try:
-                    scores_index = line.index("lineage scores")
-                except:
-                    scores_index = None
 
+                scores_index = None
+                for i in range(len(line)):
+                    if line[i].startswith("lineage scores"):
+                        scores_index = i
+                    
                 full_length = len(line)
 
                 break
         else:
             message = ("{0} is not a supported classification file."
-                       "".format(args.input_file))
+                    "".format(args.input_file))
             shared.give_user_feedback(message, log_file, quiet, error=True)
 
             sys.exit(1)
@@ -118,7 +119,7 @@ def run():
             if line.startswith("#"):
                 if args.only_official:
                     outf1.write("{0}\tsuperkingdom\tphylum\tclass\torder\t"
-                                "family\tgenus\tspecies\n".format(line))
+                            "family\tgenus\tspecies\n".format(line))
                 else:
                     outf1.write("{0}\tfull lineage names\n".format(line))
                     
@@ -141,7 +142,7 @@ def run():
             
             lineage = line[lineage_index].split(";")
 
-            if scores_index is not None and not args.exclude_scores:
+            if scores_index and not args.exclude_scores:
                 scores = line[scores_index].split(";")
             else:
                 scores = None
@@ -164,5 +165,5 @@ def run():
 
 
 if __name__ == "__main__":
-    sys.exit("Run \'CAT add_names\' to add taxonomic names to CAT or BAT "
-             "output files.")
+    sys.exit("Run \'CAT_pack add_names\' to add taxonomic names to CAT, BAT "
+            "or RAT output files.")
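
The reworked header handling above strips the leading "# ", then looks the lineage column up by exact name and the scores column by prefix. A minimal standalone sketch of that lookup, using the header style written by write_unmapped2classification() in reads.py further down in this diff (the exact header of a given classification file may differ):

    # Sample header in the style written elsewhere in this diff.
    header = "# sequence\tclassification\treason\tlineage\tlineage scores\n"

    line = header[2:].rstrip().split("\t")  # drop the leading "# "
    lineage_index = line.index("lineage")   # exact match -> 3

    scores_index = None
    for i in range(len(line)):
        # startswith() also tolerates extra text after "lineage scores".
        if line[i].startswith("lineage scores"):
            scores_index = i                # -> 4

    print(lineage_index, scores_index)      # 3 4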


=====================================
CAT_pack/bins.py
=====================================
@@ -14,9 +14,10 @@ import tax
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
-        prog="CAT bins",
+        prog="CAT_pack bins",
         description="Run Bin Annotation Tool (BAT).",
-        usage="CAT bins -b DIR / FILE -d DIR -t DIR [options] [-h / --help]",
+        usage=("CAT_pack bins -b DIR / FILE -d DIR -t DIR [options] "
+            "[-h / --help]"),
         add_help=False
     )
 
@@ -51,7 +52,7 @@ def parse_arguments():
     extra_args = [arg for (i, arg) in enumerate(extra_args) if
                   (i, arg) != (0, "bins")]
     if len(extra_args) > 0:
-        sys.exit("error: too much arguments supplied:\n{0}".format(
+        sys.exit("error: too many arguments supplied:\n{0}".format(
             "\n".join(extra_args)))
         
     # Check experimental features.
@@ -110,8 +111,9 @@ def import_bins(bin_folder, bin_suffix, log_file, quiet):
                     if contig in contig2bin:
                         message = (
                             "BAT has encountered {0} twice, in {1} and in "
-                            "{2}. Fasta headers should be unique across bins, "
-                            "please remove or rename duplicates."
+                            "{2}. Fasta headers (the part before the first "
+                            "space in the >line) should be unique across "
+                            "bins, please remove or rename duplicates."
                             "".format(contig, contig2bin[contig], bin_)
                         )
                         shared.give_user_feedback(
@@ -156,7 +158,7 @@ def make_concatenated_fasta(
 def run():
     args = parse_arguments()
 
-    message = "# CAT v{0}.".format(about.__version__)
+    message = "# CAT_pack v{0}.".format(about.__version__)
     shared.give_user_feedback(
         message, args.log_file, args.quiet, show_time=False
     )
@@ -567,4 +569,4 @@ def run():
 
 
 if __name__ == "__main__":
-    sys.exit("Run \'CAT bins\' to run Bin Annotation Tool (BAT).")
+    sys.exit("Run \'CAT_pack bins\' to run Bin Annotation Tool (BAT).")


=====================================
CAT_pack/check.py
=====================================
@@ -1,4 +1,4 @@
- #!/usr/bin/env/ python3
+#!/usr/bin/env python3
 
 import hashlib
 import os
@@ -136,6 +136,49 @@ def check_diamond_binaries(path_to_diamond, log_file, quiet):
     return error
 
 
+def check_bwa_binaries(path_to_bwa, log_file, quiet):
+    error = False
+
+    try:
+        p = subprocess.Popen([path_to_bwa],
+                             stderr=subprocess.PIPE)
+        for line in p.stderr:
+            line=line.decode("utf-8")
+            if line.startswith('Version'):
+                output = line.rstrip()
+                message = 'bwa found: {0}.'.format(output)
+                shared.give_user_feedback(message, log_file, quiet)
+    except OSError:
+        message = ('can not find bwa. Please check whether it is '
+                'installed or the path to the binaries is provided.')
+        shared.give_user_feedback(message, log_file, quiet, error=True)
+
+        error = True
+
+    return error
+
+
+def check_samtools_binaries(path_to_samtools, log_file, quiet):
+    error = False
+
+    try:
+        p = subprocess.Popen([path_to_samtools, '--version'],
+                             stdout=subprocess.PIPE)
+        c = p.communicate()
+        output = c[0].decode().split('\n')[0].rstrip()
+
+        message = 'samtools found: {0}.'.format(output)
+        shared.give_user_feedback(message, log_file, quiet)
+    except OSError:
+        message = ('can not find samtools. Please check whether it is '
+                'installed or the path to the binaries is provided.')
+        shared.give_user_feedback(message, log_file, quiet, error=True)
+
+        error = True
+
+    return error
+
+
 def check_bin_folder(bin_folder, bin_suffix, log_file, quiet):
     error = False
 
@@ -343,4 +386,4 @@ def check_whether_ORFs_are_based_on_contigs(
             
             
 if __name__ == "__main__":
-    sys.exit("Run \'CAT\' to run CAT or BAT.")
+    sys.exit("Run \'CAT_pack\' to run CAT, BAT, or RAT.")


=====================================
CAT_pack/contigs.py
=====================================
@@ -12,9 +12,9 @@ import tax
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
-        prog="CAT contigs",
+        prog="CAT_pack contigs",
         description="Run Contig Annotation Tool (CAT).",
-        usage="CAT contigs -c FILE -d DIR -t DIR [options] [-h / --help]",
+        usage="CAT_pack contigs -c FILE -d DIR -t DIR [options] [-h / --help]",
         add_help=False
     )
     
@@ -48,7 +48,7 @@ def parse_arguments():
     extra_args = [arg for (i, arg) in enumerate(extra_args) if
                   (i, arg) != (0, "contigs")]
     if len(extra_args) > 0:
-        sys.exit("error: too much arguments supplied:\n{0}".format(
+        sys.exit("error: too many arguments supplied:\n{0}".format(
             "\n".join(extra_args)))
         
     # Check experimental features.
@@ -77,7 +77,7 @@ def parse_arguments():
 def run():
     args = parse_arguments()
 
-    message = "# CAT v{0}.".format(about.__version__)
+    message = "# CAT_pack v{0}.".format(about.__version__)
     shared.give_user_feedback(
         message, args.log_file, args.quiet, show_time=False)
 
@@ -441,4 +441,4 @@ def run():
 
 
 if __name__ == "__main__":
-    sys.exit("Run \'CAT contigs\' to run Contig Annotation Tool (CAT).")
+    sys.exit("Run \'CAT_pack contigs\' to run Contig Annotation Tool (CAT).")


=====================================
CAT_pack/download.py
=====================================
@@ -1,7 +1,10 @@
+#!/usr/bin/env python3
+
 import argparse
 from collections import namedtuple
 import datetime
 import hashlib
+import os
 import pathlib
 import shutil
 import sys
@@ -17,13 +20,14 @@ def parse_arguments():
     date = datetime.datetime.now().strftime("%Y-%m-%d")
 
     parser = argparse.ArgumentParser(
-        prog="CAT download",
+        prog="CAT_pack download",
         description=(
             "Download and preprocess sequence and taxonomy information. "
             "Currently supports the NCBI non-redundant (nr) database "
-            "and GTDB."
+            "and the GTDB database."
         ),
-        usage="CAT download --db (nr | gtdb) -o DIR [options] [-h / --help]",
+        usage=("CAT_pack download --db (nr | GTDB) -o DIR [options] "
+            "[-h / --help]"),
         add_help=False,
     )
 
@@ -160,8 +164,8 @@ def process_nr(output_dir, log_file, quiet, prefix, cleanup):
     message = (
         "\n-----------------\n\n"
         "Done!\n\n"
-        "A CAT database can be build with:\n\n"
-        "CAT prepare \\\n"
+        "A CAT_pack database can be build with:\n\n"
+        "CAT_pack prepare \\\n"
         "--db_fasta {0} \\\n"
         "--names {1} \\\n"
         "--nodes {2} \\\n"
@@ -198,7 +202,8 @@ fastaRecord = namedtuple(
 ## FUNCTIONS.
 def get_gtdb_latest_version():
     """Read the version number from the VERSION file."""
-    version_url = "https://data.gtdb.ecogenomic.org/releases/latest/VERSION.txt"
+    version_url = (
+            "https://data.gtdb.ecogenomic.org/releases/latest/VERSION.txt")
 
     with urllib.request.urlopen(version_url) as f:
         version_data = f.read().decode()
@@ -467,9 +472,8 @@ def extract_duplicates(proteins_dir, gid2taxid, acc2taxid_fp, log_file, quiet):
                 seq_counter += 1
 
             if file_counter % 1000 == 0 and file_counter != 0:
-                message = "Parsed {0} sequences from {1} files.".format(
-                    seq_counter, file_counter
-                )
+                message = "Parsed {0:,d} sequences from {1:,d} files.".format(
+                    seq_counter, file_counter)
                 shared.give_user_feedback(message, log_file, quiet)
         # This else is part of the outter for-loop.
         # It executes when the for loop finishes.
@@ -481,10 +485,10 @@ def extract_duplicates(proteins_dir, gid2taxid, acc2taxid_fp, log_file, quiet):
             redundants = sum(map(len, [v for v in multiplets.values()]))
             
             message = (
-                "    Total files: {0:>12}\n"
-                "{1}Total sequences: {2:>12}\n"
-                "{3}     Multiplets: {4:>12}\n"
-                "{5}of which unique: {6:>12}"
+                "    Total files: {0:>12,d}\n"
+                "{1}Total sequences: {2:>12,d}\n"
+                "{3}     Multiplets: {4:>12,d}\n"
+                "{5}of which unique: {6:>12,d}"
                 "".format(
                     file_counter,
                     padding,
@@ -516,12 +520,16 @@ def write_singletons(
                     skipped += 1
                     
             if file_counter % 1000 == 0 and file_counter != 0:
-                message = ("Written {0} sequences from {1} files ({2} skipped)."
-                        "".format(seq_counter, file_counter, skipped))
+                message = ("Written {0:,d} sequences from {1:,d} files "
+                        "({2:,d} skipped).".format(
+                            seq_counter, file_counter, skipped)
+                        )
                 shared.give_user_feedback(message, log_file, quiet)
         else:
-            message = ("Written {0} sequences from {1} files ({2} skipped)."
-                    "".format(seq_counter, file_counter, skipped))
+            message = ("Written {0:,d} sequences from {1:,d} files "
+                    "({2:,d} skipped).".format(
+                        seq_counter, file_counter, skipped)
+                    )
             shared.give_user_feedback(message, log_file, quiet)
             
     return
@@ -554,7 +562,7 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False):
     # This needs to be checked for future versions.
     version = get_gtdb_latest_version()
     
-    message = "CAT will download files from GTDB {0}.".format(version)
+    message = "CAT_pack will download files from GTDB {0}.".format(version)
     shared.give_user_feedback(message, log_file, quiet)
     
     gtdb_urls = [
@@ -607,7 +615,8 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False):
                 
                 return prefix == abs_directory
             
-            def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+            def safe_extract(
+                    tar, path=".", members=None, *, numeric_owner=False):
                 for member in tar.getmembers():
                     member_path = os.path.join(path, member.name)
                     if not is_within_directory(path, member_path):
@@ -710,8 +719,8 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False):
     message = (
         "\n-----------------\n\n"
         "Done!\n\n"
-        "A CAT database can be build with:\n\n"
-        "CAT prepare \\\n"
+        "A CAT_pack database can be build with:\n\n"
+        "CAT_pack prepare \\\n"
         "--db_fasta {0} \\\n"
         "--names {1} \\\n"
         "--nodes {2} \\\n"
@@ -738,7 +747,7 @@ def run():
     if args.no_log:
         log_file = None
     else:
-        log_fname = "{0}.CAT_download.log".format(args.date)
+        log_fname = "{0}.CAT_pack_download.log".format(args.date)
         log_file = args.output_dir / pathlib.Path(log_fname)
 
     setattr(args, "log_file", log_file)
@@ -751,12 +760,12 @@ def run():
             prefix=args.date,
             cleanup=args.cleanup,
         )
-    elif args.db == "gtdb":
+    elif args.db == "GTDB":
         process_gtdb(args.output_dir, args.log_file, args.quiet, args.cleanup)
         
     return
 
 
 if __name__ == "__main__":
-    sys.exit("Run \'CAT download\' to download and preprocess data from "
-             "NCBI nr or GTDB.")
+    sys.exit("Run \'CAT_pack download\' to download and preprocess data from "
+            "NCBI nr or GTDB.")
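
A side note on the progress-message hunks above: the new "{0:,d}" format spec is standard Python and adds thousands separators to the counts. For example:

    # The ",d" format spec groups digits with commas.
    print("Parsed {0:,d} sequences from {1:,d} files.".format(1234567, 1000))
    # Parsed 1,234,567 sequences from 1,000 files.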


=====================================
CAT_pack/prepare.py
=====================================
@@ -18,10 +18,10 @@ def parse_arguments():
     date = datetime.datetime.now().strftime("%Y-%m-%d")
 
     parser = argparse.ArgumentParser(
-        prog="CAT prepare",
-        description="Construct CAT/BAT database files.",
+        prog="CAT_pack prepare",
+        description="Construct CAT/BAT/RAT database files.",
         usage=(
-            "CAT prepare --db_fasta FILE "
+            "CAT_pack prepare --db_fasta FILE "
             "--acc2tax FILE "
             "--names FILE "
             "--nodes FILE "
@@ -44,7 +44,7 @@ def parse_arguments():
         optional,
         "common_prefix",
         False,
-        default="{0}_CAT".format(date),
+        default="{0}_CAT_pack".format(date),
         help_="Prefix for all files to be created."
     )
     shared.add_argument(optional, "quiet", False)
@@ -80,7 +80,7 @@ def memory_bottleneck(args):
             "construction (e.g. nr). {1}GB is found on your system. You can "
             "try to find a machine with more memory if you run into issues or "
             "download preconstructed database files from "
-            "tbb.bio.uu.nl/bastiaan/CAT_prepare/.".format(
+            "tbb.bio.uu.nl/tina/CAT_pack_prepare/.".format(
                 args.min_mem, total_memory)
         )
         shared.give_user_feedback(
@@ -246,8 +246,8 @@ def make_fastaid2LCAtaxid_file(
 
     message = (
         "Done! File {0} is created. "
-        "{1:,d} of {2:,d} headers ({3:.1f}%) corrected. "
-        "{4:,d} headers ({5:.1f}%) do not have a taxid assigned.".format(
+        "{1:,d} of {2:,d} headers ({3:.2f}%) corrected. "
+        "{4:,d} headers ({5:.2f}%) do not have a taxid assigned.".format(
             fastaid2LCAtaxid_file,
             corrected,
             total,
@@ -303,17 +303,22 @@ def write_taxids_with_multiple_offspring_file(
 
 
 def prepare(step_list, args):
-    shared.print_variables(args, step_list)
-    memory_bottleneck(args)
-
     # This is the root dir.
     db_dir = pathlib.Path(args.db_dir).resolve()
     db_dir.mkdir(exist_ok=True)
+
     if not args.no_log:
         log_fname = "{0}.log".format(args.common_prefix)
         log_path = db_dir / pathlib.Path(log_fname)
+
         setattr(args, "log_file", log_path)
 
+        with open(args.log_file, "w") as outf1:
+            pass
+
+    shared.print_variables(args, step_list)
+    memory_bottleneck(args)
+
     # It should contain...
     # ... 1. a taxonomy folder with names and nodes.
     tax_db = db_dir / pathlib.Path("tax")
@@ -410,13 +415,13 @@ def prepare(step_list, args):
             args.quiet
         )
 
-    message = "\n-----------------\n\n{0} CAT prepare is done!".format(
+    message = "\n-----------------\n\n{0} CAT_pack prepare is done!".format(
         shared.timestamp())
     shared.give_user_feedback(
         message, args.log_file, args.quiet, show_time=False)
 
     message = (
-        "\nSupply the following arguments to CAT or BAT if you want to "
+        "\nSupply the following arguments to CAT, BAT, or RAT if you want to "
         "use this database:\n"
         "-d / --database_folder {0}\n"
         "-t / --taxonomy_folder {1}".format(cat_db, tax_db)
@@ -444,7 +449,7 @@ def run():
         message = (
             "Nothing to do here! All files exist. "
             "Please provide a new location or remove one of the files "
-            "created by CAT to launch a build."
+            "created by CAT_pack to launch a build."
         )
         shared.give_user_feedback(
             message, args.log_file, args.quiet, show_time=True)
@@ -455,4 +460,4 @@ def run():
 
 
 if __name__ == "__main__":
-    sys.exit("Run \'CAT prepare\' to construct a CAT/BAT database.")
+    sys.exit("Run \'CAT_pack prepare\' to construct a CAT/BAT/RAT database.")


=====================================
CAT_pack/reads.py
=====================================
@@ -0,0 +1,1365 @@
+#!/usr/bin/env python3
+
+import os
+import subprocess
+import argparse
+import datetime
+import sys
+import decimal
+
+import about
+
+import check
+import shared
+import tax
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+            prog='CAT_pack reads',
+            description='Run Read Annotation Tool (RAT).',
+            # @Tina: Should the explanation on how to use RAT be in the description? Right now the 'Run Read Annotation Tool' is at a little weird position. Also if you forget one argument (say -c) you'll not only get the help message but also the entire usage which is a little overkill.
+            # @Tina: I do like this explanation of complete and partial workflows... Maybe we should add it to the other CAT pack as well?
+            usage='CAT_pack reads -c -t [options] [-h / --help]\n\n'
+            'Complete RAT workflow (perform read mapping, run CAT, BAT, and RAT): '
+            'Supply contigs, reads, database folder, taxonomy folder, and bin folder.\n\n'
+            'Partial workflows:\n'
+            'If you have already mapped your reads, you can supply the sorted mapping '
+            'file and no read mapping will be performed.\n'
+            'If you have already run CAT and/or BAT, you can supply the output '
+            'files (contig2classification, bin2classification) and the path to the '
+            'taxonomy folder instead.\n'
+            'If you prefer not to use bin classification, do not supply the path '
+            'to a bin folder.',
+            add_help=False)
+    
+    required = parser.add_argument_group('Required arguments')
+    shared.add_argument(required, 'contigs_fasta', True)
+    shared.add_argument(required, 'taxonomy_folder', True)
+    shared.add_argument(required, 'mode', True)
+    
+    optional = parser.add_argument_group('Optional arguments')
+    shared.add_argument(optional, 'out_prefix', False, default='./out.RAT')
+    shared.add_argument(optional, 'read_file1', False)
+    shared.add_argument(optional, 'read_file2', False)
+    shared.add_argument(optional, 'bam_file1', False)
+    shared.add_argument(optional, 'bam_file2', False)
+    shared.add_argument(optional, 'alignment_unmapped', False)
+    shared.add_argument(optional, 'bin_fasta_or_folder', False, default=False)
+    shared.add_argument(optional, 'bin_suffix', False)
+    shared.add_argument(optional, 'contig2classification', False)
+    shared.add_argument(optional, 'bin2classification', False)
+    shared.add_argument(optional, 'read2classification', False)
+    shared.add_argument(optional, 'unmapped2classification', False)
+
+    shared.add_argument(optional, 'mapping_quality', False, default=2)
+    shared.add_argument(optional, 'path_to_bwa', False, default='bwa')
+    shared.add_argument(optional, 'path_to_samtools', False, default='samtools')
+
+    shared.add_argument(optional, 'force', False)
+    shared.add_argument(optional, 'quiet', False)
+    shared.add_argument(optional, 'verbose', False)
+    shared.add_argument(optional, 'no_log', False)
+    shared.add_argument(optional, 'help', False)
+    
+    CAT_args = parser.add_argument_group('CAT/BAT-specific arguments')
+    shared.add_argument(CAT_args, 'database_folder', False)
+    shared.add_argument(optional, "proteins_fasta", False)
+    shared.add_argument(optional, "alignment_file", False)
+    shared.add_argument(CAT_args, 'r', False, default=decimal.Decimal(10))
+    shared.add_argument(CAT_args, 'f', False, default=decimal.Decimal(0.5))
+    shared.add_argument(CAT_args, 'path_to_prodigal', False, default="prodigal")
+    shared.add_argument(CAT_args, 'path_to_diamond', False, default="diamond")
+    shared.add_argument(CAT_args, 'no_stars', False)
+    shared.add_argument(CAT_args, 'IkwId', False)
+    
+    dmnd_args = parser.add_argument_group('DIAMOND specific optional arguments')
+    shared.add_all_diamond_arguments(dmnd_args)
+                          
+    (args, extra_args) = parser.parse_known_args()
+    extra_args = [arg for (i, arg) in enumerate(extra_args) if
+        (i, arg) != (0, 'reads')]
+    if len(extra_args) > 0:
+        sys.exit('error: too many arguments supplied:\n{0}'.format(
+            '\n'.join(extra_args)))
+
+    # Add extra arguments.
+    shared.expand_arguments(args)
+    
+    if not args.read_file1:
+        sys.exit('error: you have to supply read files!')
+    
+    #check if the mode is correct
+    for c in [m for m in args.mode]:
+        if c not in ['m', 'c', 'r']:
+            sys.exit('Unknown letter "{}" in the mode argument. Allowed letters ' 
+                     'are "m" for MAGs, "c" for contigs and "r" for reads. '
+                     'Exiting.'.format(c))
+    
+    if 'm' in args.mode:
+        if not args.bin_folder:
+            sys.exit('error: "m" was supplied to mode but no bin_folder. If '
+                     'you want to include MAGs in your profile, please submit '
+                     'a bin folder (and suffix if necessary)')
+        if args.bin_folder and not args.bin2classification and not args.database_folder:
+            sys.exit('error: please provide either a bin2classification file or '
+                     'the path to the CAT_pack database_folder for bin classification!')
+        if args.bin2classification and not args.bin_folder:
+            sys.exit('error: bin2classification file but no bin_folder supplied. '
+                     'RAT requires -b/--bin_folder to include bins in annotation!')
+    
+    if 'c' in args.mode:
+        if not args.contigs_fasta:
+            sys.exit('error: no contigs_fasta supplied.')
+    
+
+        if not args.contig2classification and not args.database_folder:
+            sys.exit('error: please provide either a contig2classification file '
+                     'or the path to the CAT_pack database_folder!')
+            
+    if 'r' in args.mode and not 'c' in args.mode and not 'm' in args.mode:
+        sys.exit('error: we do not recommend annotating all reads directly '
+                 'with diamond. Please include c or m in the mode argument.')
+    
+    
+    return args
+
+
+
+
+
+def run():
+    args = parse_arguments()
+
+    message = '# CAT_pack v{0}.\n'.format(about.__version__)
+    shared.give_user_feedback(message, args.log_file, args.quiet,
+        show_time=False)
+    message = '# Running command: {0}\n\n'.format(' '.join(sys.argv[1:]))
+    shared.give_user_feedback(message, args.log_file, args.quiet,
+        show_time=False)
+    
+    # Checks
+    
+    
+    
+    errors = []
+    
+    errors.append(
+            check.check_input_file(args.contigs_fasta, args.log_file, args.quiet))
+    
+    if args.read_file1:
+        errors.append(
+            check.check_input_file(args.read_file1, args.log_file, args.quiet))
+    if args.read_file2:
+        errors.append(
+            check.check_input_file(args.read_file2, args.log_file, args.quiet))
+    if args.bam_file1:
+        errors.append(
+            check.check_input_file(args.bam_file1, args.log_file, args.quiet))
+    if args.contig2classification:
+        errors.append(
+            check.check_input_file(args.contig2classification, args.log_file, args.quiet))
+    if args.bin2classification:
+        errors.append(
+            check.check_input_file(args.bin2classification, args.log_file, args.quiet))
+    
+    
+    
+    if not args.force:
+        if args.read_file1:
+            outf_bwamem='{0}.{1}.bwamem'.format(args.out_prefix+'.'+
+                                        os.path.split(args.contigs_fasta)[-1], 
+                                        os.path.split(args.read_file1)[-1])
+            
+            errors.append(
+                    check.check_output_file(
+                        outf_bwamem, args.log_file, args.quiet))
+            errors.append(
+                    check.check_output_file(
+                        outf_bwamem + '.bam', args.log_file, args.quiet))
+            errors.append(
+                    check.check_output_file(
+                        outf_bwamem + '.sorted', args.log_file, args.quiet))
+        if not args.contig2classification:
+            outf_cat_protein_faa='{0}.CAT.predicted_proteins.faa'.format(args.out_prefix)
+            outf_cat_c2c='{0}.CAT.contig2classification.txt'.format(args.out_prefix)
+            outf_cat_alignment='{0}.CAT.alignment.diamond'.format(args.out_prefix)
+            errors.append(
+                    check.check_output_file(
+                        outf_cat_protein_faa, args.log_file, args.quiet))
+            errors.append(
+                    check.check_output_file(
+                        outf_cat_c2c, args.log_file, args.quiet))
+            errors.append(
+                    check.check_output_file(
+                        outf_cat_alignment, args.log_file, args.quiet))
+            
+        if 'm' in args.mode:    
+            if args.bin_folder and not args.bin2classification:
+                outf_bat_protein_faa='{0}.BAT.{1}.predicted_proteins.faa'.format(args.out_prefix,
+                                                                             'concatenated')
+                outf_bat_b2c='{0}.BAT.bin2classification.txt'.format(args.out_prefix)
+                outf_bat_alignment='{0}.BAT.{1}.alignment.diamond'.format(args.out_prefix,
+                                                                             'concatenated')
+                errors.append(
+                        check.check_output_file(
+                            outf_bat_protein_faa, args.log_file, args.quiet))
+                errors.append(
+                        check.check_output_file(
+                            outf_bat_b2c, args.log_file, args.quiet))
+                errors.append(
+                        check.check_output_file(
+                            outf_bat_alignment, args.log_file, args.quiet))
+
+        
+    errors.append(
+            check.check_samtools_binaries(
+                args.path_to_samtools, args.log_file, args.quiet))
+    if not args.bam_file1:
+        errors.append(
+            check.check_bwa_binaries(
+                args.path_to_bwa, args.log_file, args.quiet))
+    if not args.contig2classification or (args.bin_folder and not 
+                                          args.bin2classification):
+        errors.append(
+            check.check_prodigal_binaries(
+                args.path_to_prodigal, args.log_file, args.quiet))
+        errors.append(
+            check.check_diamond_binaries(
+                args.path_to_diamond, args.log_file, args.quiet))
+    
+    if True in errors:
+        sys.exit(1)
+    
+    if args.read_file1:
+        reads_files=[args.read_file1]
+        if args.read_file2:
+            reads_files.append(args.read_file2)
+        else:
+            message = (
+                'WARNING: only one read file supplied! Currently RAT does not '
+                'support interlaced read files. If you are working with '
+                'paired-end reads, please provide a reverse read-file!' )
+            shared.give_user_feedback(message, args.log_file, args.quiet, 
+                                      show_time=False)
+    
+    # First: run bwa mem, samtools view and samtools sort if there is no bam file
+    bam_files=[]
+    if not args.bam_file1:
+        message = (
+                '\n'
+                'RAT is running. Mapping reads against assembly with bwa mem.\n')
+        shared.give_user_feedback(message, args.log_file, args.quiet,
+                show_time=False)
+            
+        
+
+        shared.run_bwa_mem(args.path_to_bwa, args.path_to_samtools,
+                              args.contigs_fasta, reads_files, args.out_prefix,
+                              args.nproc, args.log_file)
+
+            
+            
+        bam_files.append('{0}.{1}.bwamem.sorted'.format(args.out_prefix+'.'
+                        ''+os.path.split(args.contigs_fasta)[-1], 
+                        os.path.split(args.read_file1)[-1]))
+    else:
+        bam_files.append(args.bam_file1)
+        if args.bam_file2:
+            bam_files.append(args.bam_file2)
+            message = (
+                'WARNING: you provided two bam files. Consider mapping forward '
+                'and reverse reads together or repeating the mapping step with '
+                'RAT!')
+            shared.give_user_feedback(message, args.log_file, args.quiet, 
+                                      show_time=False)
+
+    
+    # Run CAT or process CAT output file
+    if 'c' in args.mode:
+        if args.contig2classification:
+            message = (
+                    'contig2classification file supplied. Processing contig '
+                    'classifications.')
+            shared.give_user_feedback(message, args.log_file, args.quiet,
+                    show_time=True)
+            c2c=process_CAT_table(args.contig2classification, args.nodes_dmp, 
+                                  args.log_file, args.quiet)
+        else:
+            message = (
+                    'No contig2classification file supplied. Running CAT on '
+                    'contigs.')
+            shared.give_user_feedback(message, args.log_file, args.quiet,
+                    show_time=True)
+            
+            print(args.path_to_prodigal)
+            
+            shared.run_CAT(args, args.contigs_fasta, args.database_folder, 
+                           args.taxonomy_folder, args.log_file, args.quiet, 
+                           args.nproc, args.f, args.r, args.out_prefix)
+            c2c=process_CAT_table('{0}.CAT.contig2classification.txt'
+                                  ''.format(args.out_prefix), 
+                                  args.nodes_dmp, args.log_file, args.quiet)
+    
+    
+    contig2bin={}
+    b2c={}
+    # Process bin folder
+
+    
+    if 'm' in args.mode:
+        
+        if args.bin_folder:
+            message = 'Bin folder supplied. Processing bin folder.'
+            shared.give_user_feedback(message, args.log_file, args.quiet,
+                    show_time=True)
+    
+            bins=process_bin_folder(args.bin_folder, args.bin_suffix)
+            if bins=={}:
+                message = ('No files found with suffix {} in folder "{}".' 
+                           'Please check command.'
+                           ''.format(args.bin_suffix, args.bin_folder))
+                shared.give_user_feedback(message, args.log_file, args.quiet,
+                        show_time=True)
+                sys.exit(1)
+            contig2bin=invert_bin_dict(bins)
+    
+    
+        # Run BAT on folder if bin folder but not BAT_file is supplied
+        # Or process BAT file if it is supplied
+        
+        
+        if args.bin_folder and args.bin2classification:
+            message = (
+                    'bin2classification file supplied. Processing bin '
+                    'classifications.')
+            shared.give_user_feedback(message, args.log_file, args.quiet,
+                    show_time=True)
+            b2c=process_CAT_table(args.bin2classification, args.nodes_dmp, 
+                                  args.log_file, args.quiet)
+        
+        elif args.bin_folder and not args.bin2classification:
+            message = ('No bin2classification file supplied. Running BAT on bin '
+                       'folder.')
+            shared.give_user_feedback(message, args.log_file, args.quiet,
+                    show_time=True)
+            
+            # If CAT was not run, run BAT
+            if not 'c' in args.mode:
+                shared.run_BAT(args, args.bin_folder, args.database_folder, args.taxonomy_folder,
+                           args.log_file, args.quiet, args.nproc, args.f, args.r, 
+                           args.out_prefix, args.bin_suffix)
+                
+            # If CAT was run, use the CAT output files
+            else:
+                message = ('Calculating bin annotations from previous CAT run.')
+                shared.give_user_feedback(message, args.log_file, args.quiet,
+                    show_time=True)
+                CAT_protein_fasta=('{0}.CAT.predicted_proteins.faa'
+                                   ''.format(args.out_prefix))
+                CAT_diamond_alignment=('{0}.CAT.alignment.diamond'
+                                       ''.format(args.out_prefix))
+                shared.run_BAT(args, args.bin_folder, args.database_folder, args.taxonomy_folder,
+                           args.log_file, args.quiet, args.nproc, args.f, args.r, 
+                           CAT_protein_fasta, CAT_diamond_alignment,
+                           args.out_prefix, args.bin_suffix)
+            b2c=process_CAT_table('{0}.BAT.bin2classification.txt'.format(args.out_prefix), 
+                                  args.nodes_dmp, args.log_file, args.quiet)
+            
+        elif not args.bin_folder:
+            message = 'No bin folder supplied. No bin classification will be made.'
+            
+            shared.give_user_feedback(message, args.log_file, args.quiet,
+                    show_time=False)
+    
+    
+    
+    # Process BAM files and grab unmapped reads
+    message = 'Processing mapping file(s).\n'
+    shared.give_user_feedback(message, args.log_file, args.quiet,
+            show_time=True)
+    # @Tina: we could just supply the first argument as a list, that will save you the conditional.
+    if len(bam_files)==2:
+        reads, unmapped_reads, sum_of_reads, paired = process_bam_file(bam_files[0], 
+                                                                       bam_files[1], 
+                                                                       mapping_quality=args.mapping_quality)
+    else:
+        reads, unmapped_reads, sum_of_reads, paired = process_bam_file(bam_files[0], 
+                                                                       mapping_quality=args.mapping_quality)
+    
+    
+    # Get lengths of the contigs from contigs file    
+    contig_length_dict, sum_of_nucleotides = get_contig_lengths(args.contigs_fasta)
+    # make contig_dict
+    contigs, reads, max_primary=make_contig_dict(reads, bam_files, paired, contig2bin)
+
+
+    # If direct_mapping is chosen, map unclassified contigs and unmapped reads
+    # against NR
+    
+    if 'r' in args.mode:
+        setattr(args,'read2classification',True)
+        message = ('Chosen mode: {0}. Classifying unclassified contigs and'
+                    ' unmapped reads with diamond if no classification file is'
+                    ' supplied.'.format(args.mode))
+        shared.give_user_feedback(message, args.log_file, args.quiet,
+            show_time=True)
+        
+        
+        # unmapped_reads is a dictionary that stores the unmapped forward and
+        # reverse reads and looks like this:
+            # unmapped_reads={'fw': [read_id1, read_id2, read_id3], 
+            #                 'rev': [read_id1, read_id2, read_id3]}
+        # I need to mark all the read ids as fw or rev, pull out the sequences,
+        # and then write the input fasta as one document for all reads and 
+        # contigs I want to classify
+        if not args.alignment_unmapped and not args.unmapped2classification:
+            message = ('No unmapped2classification file supplied .Grabbing '
+                       'unmapped and unclassified sequences...')
+            shared.give_user_feedback(message, args.log_file, args.quiet,
+                show_time=True)
+            all_unclassified=list()
+            
+            uncl_unm_fasta='{0}.unclassified_unmapped.fasta'.format(args.out_prefix)
+            unclassified_contigs = get_unclassified_contigs(contig2bin,
+                                                            c2c, b2c)
+            all_unclassified+=list(unclassified_contigs)
+            make_unclassified_seq_fasta(args.contigs_fasta, unclassified_contigs,
+                                        uncl_unm_fasta, 'fasta', 'w')
+            
+            message = ('Contigs written! Appending forward reads...')
+            shared.give_user_feedback(message, args.log_file, args.quiet,
+                show_time=True)
+            make_unclassified_seq_fasta(reads_files[0], unmapped_reads['fw'],
+                                        uncl_unm_fasta, 'fastq', 'a','_1')
+            all_unclassified+=['{}_1'.format(i) for i in unmapped_reads['fw']]
+            
+            message = ('Appending reverse reads...')
+            shared.give_user_feedback(message, args.log_file, args.quiet,
+                show_time=True)
+            make_unclassified_seq_fasta(reads_files[1], unmapped_reads['rev'],
+                                        uncl_unm_fasta, 'fastq', 'a','_2')
+            all_unclassified+=['{}_2'.format(i) for i in unmapped_reads['rev']]
+            # Run diamond on unclassified contigs and unmapped reads
+            setattr(args,
+                    'alignment_file',
+                    '{0}.unclassified_unmapped.alignment.diamond'.format(args.out_prefix))
+            shared.run_diamond(args, blast='blastx', prot_fasta=uncl_unm_fasta,
+                                top=11)
+        else:
+            setattr(args,
+                    'alignment_file',
+                    args.alignment_unmapped)
+        if args.alignment_unmapped and not args.unmapped2classification:
+            all_unclassified=list()
+            unclassified_contigs = get_unclassified_contigs(contig2bin,
+                                                            c2c, b2c)
+            all_unclassified+=list(unclassified_contigs)
+            all_unclassified+=['{}_1'.format(i) for i in unmapped_reads['fw']]
+            all_unclassified+=['{}_2'.format(i) for i in unmapped_reads['rev']]
+       
+            
+        # Now, the diamond alignment has to be parsed with a CAT function
+        if not args.unmapped2classification:
+    
+            taxid2parent, taxid2rank = tax.import_nodes(args.nodes_dmp, 
+                                                        args.log_file,
+                                                        args.quiet)
+            shared.explore_database_folder(args)
+            seq2hits, all_hits = shared.parse_tabular_alignment(args.alignment_file,
+                                                                decimal.Decimal(1-args.r),
+                                                                args.log_file,
+                                                           args.quiet)
+            fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid(args.fastaid2LCAtaxid_file, 
+                                                           all_hits, args.log_file,
+                                                           args.quiet)
+            taxids_with_multiple_offspring = tax.import_taxids_with_multiple_offspring(
+                args.taxids_with_multiple_offspring_file,
+                args.log_file,
+                args.quiet)
+            message = ('Finding lineages for unclassified/unmapped sequences...')
+            shared.give_user_feedback(message, args.log_file, args.quiet,
+                show_time=True)
+            write_unmapped2classification(seq2hits, 
+                                          all_unclassified,
+                                          fastaid2LCAtaxid, 
+                                          args.out_prefix, 
+                                          taxid2parent,
+                                          taxid2rank,
+                                          taxids_with_multiple_offspring,
+                                          args.no_stars)
+            u2c=process_CAT_table('{0}.unmapped2classification.txt'.format(args.out_prefix), 
+                                  args.nodes_dmp, 
+                                  args.log_file, 
+                                  args.quiet)
+        else:
+            message = ('Loading unmapped2classification...')
+            shared.give_user_feedback(message, args.log_file, args.quiet,
+                show_time=True)
+            u2c=process_CAT_table(args.unmapped2classification, 
+                                  args.nodes_dmp, 
+                                  args.log_file, 
+                                  args.quiet)
+    else:
+        u2c={}
+            
+    message = 'Writing output tables.'
+    shared.give_user_feedback(message, args.log_file, args.quiet,
+            show_time=True)   
+    
+    make_tax_table(c2c,
+                   contigs, 
+                   unmapped_reads,
+                   contig_length_dict,
+                   contig2bin, 
+                   b2c, 
+                   sum_of_nucleotides, 
+                   sum_of_reads, 
+                   args.out_prefix,
+                   u2c)
+        
+    if 'm' in args.mode:
+        make_bin_table(contig2bin, 
+                       contigs, 
+                       contig_length_dict, 
+                       sum_of_reads, 
+                       args.out_prefix)
+    
+       
+    
+    # Finally: Optional: Classify reads and write per_read table
+    if args.read2classification:
+        reads=classify_reads(reads, c2c, b2c, u2c)
+        write_read_table(reads, args.out_prefix, max_primary)
+
+    
+    message = (
+            '\n-----------------\n\n'
+            '[{0}] RAT is done!'.format(
+                datetime.datetime.now()))
+    shared.give_user_feedback(message, args.log_file, args.quiet,
+            show_time=False)
+    return
+
+
+
+
+
+def write_unmapped2classification(seq2hits,
+                                  all_seqs,
+                                  fastaid2LCAtaxid, 
+                                  out_prefix,
+                                  taxid2parent, 
+                                  taxid2rank, 
+                                  taxids_with_multiple_offspring,
+                                  no_stars):
+    
+    
+    
+    with open('{0}.unmapped2classification.txt'.format(out_prefix), 'w') as outf:
+        n_classified_contigs = 0
+        outf.write(
+                '# sequence\tclassification\treason\tlineage\tlineage scores\n')
+        
+        for seq in sorted(all_seqs):
+            if seq not in seq2hits:
+                outf.write('{0}\tno taxid assigned\tno hits found\n'.format(
+                    seq))
+                continue
+            
+            n_hits = len(seq2hits[seq])
+            LCAs_ORFs = []
+            
+            
+            (taxid,
+                    top_bitscore) = tax.find_LCA_for_ORF(
+                            seq2hits[seq], fastaid2LCAtaxid, taxid2parent)
+             
+            if not taxid.startswith('no taxid found'):
+                lineage = tax.find_lineage(taxid, taxid2parent)
+    
+                if not no_stars:
+                    lineage = tax.star_lineage(
+                            lineage, taxids_with_multiple_offspring)
+                               
+            LCAs_ORFs.append((taxid, top_bitscore))
+                    
+            if len(LCAs_ORFs) == 0:
+                outf.write('{0}\tno taxid assigned\t'
+                        'no hits to database\n'.format(seq))
+    
+                continue
+            
+            (lineages,
+                    lineages_scores,
+                    based_on_n_ORFs) = tax.find_weighted_LCA(
+                            LCAs_ORFs, taxid2parent, 0.5)
+             
+            if lineages == 'no ORFs with taxids found.':
+                outf.write('{0}\tno taxid assigned\t'
+                        'hits not found in taxonomy files\n'.format(seq))
+    
+                continue
+            
+            if lineages == 'no lineage whitelisted.':
+                outf.write(
+                        '{0}\tno taxid assigned\t'
+                        'no lineage reached minimum bit-score support\n'
+                        ''.format(seq))
+    
+                continue
+        
+            n_classified_contigs += 1
+
+            for (i, lineage) in enumerate(lineages):
+                if not no_stars:
+                    lineage = tax.star_lineage(
+                            lineage, taxids_with_multiple_offspring)
+                scores = ['{0:.2f}'.format(score) for
+                        score in lineages_scores[i]]
+                
+                if len(lineages) == 1:
+                    # There is only one classification.
+                    outf.write(
+                            '{0}\t'
+                            'taxid assigned\t'
+                            'based on {1} hits\t'
+                            '{2}\t'
+                            '{3}\n'.format(
+                                seq,
+                                n_hits,
+                                ';'.join(lineage[::-1]),
+                                ';'.join(scores[::-1])))
+     
+    return
+
+
+
+
+def classify_reads(read_dict, contig2classification, bin2classification, unmapped2classification):
+    # worked=0
+    for read in read_dict:
+        # If the read is mapped to a contig:
+        if read_dict[read]['contig']!=[]:
+            for r in range(len(read_dict[read]['contig'])):
+                # If there is bin classification, store it
+                if bin2classification:
+                    try:
+                        if (read_dict[read]['bin'][r]=='unbinned' or
+                            len(bin2classification[read_dict[read]['bin'][r]])<1):
+                            
+                            read_dict[read]['taxon_bin'].append('')
+                           
+                        else:
+                            
+                            read_dict[read]['taxon_bin'].append(';'.join(bin2classification[read_dict[read]['bin'][r]][0]))
+                        # worked+=1
+                    except IndexError:
+                        
+                        sys.exit(1)
+                
+                # If there is a contig classification, store it
+                if contig2classification:
+                    contig=read_dict[read]['contig'][r]
+                    if (contig2classification[contig]==[] or
+                        len(contig2classification[contig][0])<2 or 
+                        (len(contig2classification[contig][0])==2 and 
+                        contig2classification[contig][0][1]=='131567')):
+                        read_dict[read]['taxon_contig'].append('')
+                    else:
+                        read_dict[read]['taxon_contig'].append(';'.join(contig2classification[contig][0]))
+                    
+                # If there is no contig classification, but a dm classification, store it
+                if contig in unmapped2classification:
+                    if (unmapped2classification[contig]==[] or 
+                    len(unmapped2classification[contig][0])<2 or 
+                    (len(unmapped2classification[contig][0])==2 and 
+                    unmapped2classification[contig][0][1]=='131567')):
+                        read_dict[read]['taxon_contig_dm'].append('')
+                    else:
+                        read_dict[read]['taxon_contig_dm'].append(';'.join(unmapped2classification[contig][0]))
+        # If the read is not mapped:
+        else:
+            
+            fw='{}_1'.format(read)
+            rev='{}_2'.format(read)
+            if fw in unmapped2classification:
+                if (unmapped2classification[fw]==[] or
+                len(unmapped2classification[fw][0])<2 or 
+                (len(unmapped2classification[fw][0])==2 and 
+                 unmapped2classification[fw][0][1]=='131567')):
+                    read_dict[read]['taxon_read_dm'].append('')
+                else:
+                    read_dict[read]['taxon_read_dm'].append('fw: '+';'.join(unmapped2classification[fw][0]))
+            if rev in unmapped2classification:
+                if (unmapped2classification[rev]==[] or
+                len(unmapped2classification[rev][0])<2 or 
+                (len(unmapped2classification[rev][0])==2 and 
+                 unmapped2classification[rev][0][1]=='131567')):
+                    read_dict[read]['taxon_read_dm'].append('')
+                else:
+                    read_dict[read]['taxon_read_dm'].append('rev: '+';'.join(unmapped2classification[rev][0]))
+    
+    return read_dict
+
+
+
+def write_read_table(read_dict, sample_name, max_primary):
+    # Takes the read dictionary with bin lineage and contig lineage and writes
+    # annotation for each read and each tier of annotation into a file
+    read_table=sample_name + '.read2classification.txt'
+    
+    with open(read_table, 'w') as outf:
+        outf.write('## command: {}\n'.format(' '.join(sys.argv)))
+        outf.write('# read\tclassification\tbin classification\t'
+                   'contig classification\tdirect_mapping_contig\t'
+                   'direct_mapping_read\n'
+                   #'[aligner] unclassified contig\t[aligner] unclassified read\n'
+                   )
+        for read in read_dict:
+
+            for c in range(len(read_dict[read]['contig'])):
+                try:
+                    b_taxon=read_dict[read]['taxon_bin'][c]
+                except IndexError:
+                    b_taxon=''
+                try:    
+                    c_taxon=read_dict[read]['taxon_contig'][c]
+                except IndexError:    
+                    c_taxon=''
+                if b_taxon=='' and c_taxon=='':
+                    try:
+                        c_dm_taxon=read_dict[read]['taxon_contig_dm'][c]
+                        r_dm_taxon=''
+                    except IndexError:
+                        c_dm_taxon=''
+                        r_dm_taxon=''
+                    
+                else:
+                    c_dm_taxon=''
+                    r_dm_taxon=''
+                assigned='taxid assigned'
+                if b_taxon=='' and c_taxon=='' and c_dm_taxon=='':
+                    assigned='no taxid assigned'
+                outf.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n'.format(read,
+                           assigned, b_taxon, c_taxon, c_dm_taxon,
+                           r_dm_taxon))
+            if len(read_dict[read]['contig'])<max_primary:
+                if len(read_dict[read]['taxon_read_dm'])>0:
+                    for c in range(len(read_dict[read]['taxon_read_dm'])):
+                
+                        try:
+                            r_dm_taxon=read_dict[read]['taxon_read_dm'][c]
+                            
+                        except IndexError:
+                            r_dm_taxon=''
+                        assigned='taxid assigned'
+                        if r_dm_taxon=='':
+                            assigned='no taxid assigned'
+                        outf.write('{0}\t{1}\t\t\t\t{2}\n'.format(read,
+                           assigned, r_dm_taxon))
+                else:
+                    n=max_primary-len(read_dict[read]['taxon_read_dm'])
+                    outf.write(n*'{0}\t{1}\n'.format(read, 'no taxid assigned'))
+    return
+
+
+
+def make_bin_table(contig2bin, 
+                   contig_dict, 
+                   contig_length_dict, 
+                   total_reads, 
+                   out_prefix):
+    # Include lineage?
+    bin_data={}
+    bin_file='{0}.bin.reads.txt'.format(out_prefix)
+    
+    with open(bin_file, 'w') as op:
+        op.write('## command: {}\n'.format(' '.join(sys.argv)))
+        op.write('# bin\tnumber of reads\tfraction of reads\tbin length\t'
+                 'corrected fraction\n')
+        for contig in contig_dict:
+            if contig in contig2bin:
+                bin_id=contig2bin[contig]
+            else:
+                bin_id='unbinned'
+            if not bin_id in bin_data:
+                bin_data[bin_id]={
+                    'n_reads': 0,
+                    'n_nucleotides': 0
+                    }
+            # if contig is in contig_dict, then reads have mapped to it, otherwise
+            # no reads have mapped to it
+            try:
+                bin_data[bin_id]['n_reads']+=contig_dict[contig]
+                bin_data[bin_id]['n_nucleotides']+=contig_length_dict[contig]
+            except KeyError:
+                pass
+            
+        divisor=get_proper_fraction(bin_data, total_reads)
+        for bin_id in bin_data:
+            read_fraction=bin_data[bin_id]['n_reads']/total_reads
+            if bin_data[bin_id]['n_nucleotides']!=0:
+                corrected_frac=(read_fraction/bin_data[bin_id]['n_nucleotides'])/divisor
+            else:
+                corrected_frac=0
+            op.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(bin_id, 
+                                                bin_data[bin_id]['n_reads'], 
+                                                read_fraction, 
+                                                bin_data[bin_id]['n_nucleotides'], 
+                                                corrected_frac))
+    return
+
+
+
+
+
+def make_tax_table(c2c_dict, 
+                   contig_dict,
+                   unmapped_reads,
+                   contig_length_dict, 
+                   contig2bin, 
+                   bin2classification, 
+                   sum_of_nucleotides, 
+                   total_reads, 
+                   sample_name,
+                   u2c_dict=None):
+    """
+    Makes a dictionary that keeps all the necessary info in the same place
+    Writes contig table
+    Writes table with all classifications including subspecies
+    """
+    RAT_contig_header=('# contig\tnumber of reads\tfraction of reads\t'
+                       'contig_length\tAverage Coverage\tlineage\t'
+                       'lineage ranks\tlineage scores\n')
+                       
+    RAT_file_header=('# lineage\tnumber of reads\tfraction of reads\t'
+                     'taxon length\tlineage ranks\n')
+    
+    taxon_dict={}
+    unclassified={
+            'contigs': set(),
+            'n_reads': 0,
+            'n_nucleotides': 0
+            }
+
+    for contig in contig_dict:
+        
+        # If the contig name is '*', that means that a read is unmapped. 
+        # If direct_mapping is enabled, unmapped reads have been run through
+        # diamond and an unmapped2classification dictionary has been supplied
+        # to this function. The reads get assigned a taxon if DIAMOND spits
+        # out a taxon; otherwise they are counted as unmapped.
+        
+        if contig=='*':
+            if u2c_dict:
+                unmapped=0
+                tmp=['fw', 'rev']
+                for direction in tmp:
+                    for read in unmapped_reads[direction]:
+                        u2c_read_id='{0}_{1}'.format(read, 
+                                                     tmp.index(direction)+1)
+                        if u2c_read_id in u2c_dict:
+                            if (u2c_dict[u2c_read_id]==[] or len(u2c_dict[u2c_read_id][0])<2 
+                                or (len(u2c_dict[u2c_read_id][0])==2 and 
+                                u2c_dict[u2c_read_id][0][1]=='131567')):
+                                unmapped+=1
+                            else:
+                                taxon=';'.join(u2c_dict[u2c_read_id][0])
+                                if taxon not in taxon_dict:
+                                    taxon_dict[taxon]={
+                                            'n_reads': 0,
+                                            'n_nucleotides': 0,
+                                            'ranks': u2c_dict[u2c_read_id][1]
+                                            }
+                                taxon_dict[taxon]['n_reads']+=1
+                                # taxon_dict[taxon]['n_nucleotides']+=contig_length_dict[contig]
+            else:
+                unmapped=contig_dict[contig]
+        
+        # If the contig is in contig2bin, that means that it is binned and therefore
+        # will be assigned the lineage of the bin and not the contig
+        elif (contig in contig2bin and 
+              len(bin2classification[contig2bin[contig]])>0):
+        
+            taxon=';'.join(bin2classification[contig2bin[contig]][0])
+         
+            if taxon not in taxon_dict:
+                taxon_dict[taxon]={
+                        'n_reads': 0,
+                        'n_nucleotides': 0,
+                        'ranks': bin2classification[contig2bin[contig]][1]
+                        }
+            # add number of reads that map to the contig to the taxon and
+            # add nucleotides of the contig to the taxon
+            taxon_dict[taxon]['n_reads']+=contig_dict[contig]
+            taxon_dict[taxon]['n_nucleotides']+=contig_length_dict[contig]
+            
+        # If C2C[contig] is an empty list, it's unclassified
+        # If the length of the list of classified tax_ids is shorter than 2, the
+        # contig is not classified at superkingdom level and is therefore,
+        # for our purposes, unclassified.
+        # We check u2c to see whether it was classified in the direct_mapping
+        # step, assign the lineage from u2c or unclassified depending on
+        # result
+        elif (c2c_dict[contig]==[] or len(c2c_dict[contig][0])<2 
+              or (len(c2c_dict[contig][0])==2 and 
+              c2c_dict[contig][0][1]=='131567')):
+            if u2c_dict:
+                if contig not in u2c_dict or (u2c_dict[contig]==[] or len(u2c_dict[contig][0])<2 
+                  or (len(u2c_dict[contig][0])==2 and 
+                  u2c_dict[contig][0][1]=='131567')):
+                    unclassified['n_reads']+=contig_dict[contig]
+                    unclassified['n_nucleotides']+=contig_length_dict[contig]
+                else:
+                    taxon=';'.join(u2c_dict[contig][0])
+                    if taxon not in taxon_dict:
+                        taxon_dict[taxon]={
+                                'n_reads': 0,
+                                'n_nucleotides': 0,
+                                'ranks': u2c_dict[contig][1]
+                                }
+                    taxon_dict[taxon]['n_reads']+=contig_dict[contig]
+                    taxon_dict[taxon]['n_nucleotides']+=contig_length_dict[contig]
+                
+            
+        
+        # The other possibility is that the contig is unbinned, but classified
+        else:
+            taxon=';'.join(c2c_dict[contig][0])
+            if taxon not in taxon_dict:
+                taxon_dict[taxon]={
+                        'n_reads': 0,
+                        'n_nucleotides': 0,
+                        'ranks': c2c_dict[contig][1]
+                        }
+            
+            # add number of reads that map to the contig to the taxon and
+            # add nucleotides of the contig to the taxon
+            taxon_dict[taxon]['n_reads']+=contig_dict[contig]
+            taxon_dict[taxon]['n_nucleotides']+=contig_length_dict[contig]
+    
+    unmapped_line='{0}\t{1}\t{2}\n'.format('unmapped', unmapped,
+                                           unmapped/total_reads)
+    unclassified_line='{0}\t{1}\t{2}\t{3}\t'.format('unclassified',
+                       unclassified['n_reads'],
+                       unclassified['n_reads']/total_reads,
+                       unclassified['n_nucleotides'])
+    
+    
+    # write the output tables for contigs:
+    with open('{0}.contig.abundance.txt'.format(sample_name), 'w') as outf_contig:
+        outf_contig.write('## command: {}\n'.format(' '.join(sys.argv)))
+        outf_contig.write(RAT_contig_header)
+        # @Tina: not to work with decimals instead of floats.
+        # get the divisor to make corrected fraction add up to 1
+
+        divisor=get_proper_fraction(contig_dict, 
+                                    total_reads, 
+                                    unclassified, 
+                                    contig_length_dict)
+        outf_contig.write(unmapped_line)
+        for contig in contig_dict:
+            if not contig=='*':
+                read_frac=contig_dict[contig]/total_reads
+                corrected_frac=(read_frac/contig_length_dict[contig])/divisor
+                # If contig is binned, then get the lineage of the bin and not the contig
+                try:
+                    if contig in contig2bin:
+                        lineage=';'.join(bin2classification[contig2bin[contig]][0])
+                        rank_list=';'.join(bin2classification[contig2bin[contig]][1])
+                        scores=';'.join(bin2classification[contig2bin[contig]][2])
+                        
+                    else:
+                        lineage=';'.join(c2c_dict[contig][0])
+                        rank_list=';'.join(c2c_dict[contig][1])
+                        scores=';'.join(c2c_dict[contig][2])
+                        
+                except IndexError:
+                    lineage=''
+                    rank_list=''
+                    scores=''
+                outf_contig.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format(
+                        contig, contig_dict[contig], read_frac, 
+                        contig_length_dict[contig], corrected_frac, lineage, 
+                        rank_list, scores))
+    
+    
+    # write output table for complete lineage
+    with open('{0}.complete.abundance.txt'.format(sample_name), 'w+') as outf_complete:
+        outf_complete.write('## command: {}\n'.format(' '.join(sys.argv)))
+        outf_complete.write(RAT_file_header)
+        outf_complete.write(unmapped_line)
+        outf_complete.write(unclassified_line+'\n')
+
+        # if unclassified['n_reads']!=0:
+        #     outf_complete.write('{0}\n'.format(((unclassified['n_reads']/total_reads)/unclassified['n_nucleotides'])/divisor))
+        # else:
+        #     outf_complete.write('0\n')
+        
+        for taxon in taxon_dict:
+            read_frac=taxon_dict[taxon]['n_reads']/total_reads
+
+            outf_complete.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
+                    taxon, 
+                    taxon_dict[taxon]['n_reads'],
+                    read_frac, 
+                    taxon_dict[taxon]['n_nucleotides'],
+                    # add average genome size
+                    ';'.join(taxon_dict[taxon]['ranks'])))
+    
+    
+    return 
+
+
+
+
+def get_proper_fraction(rank_dict, 
+                        total_reads, 
+                        unclassified={}, 
+                        contig_length_dict={}):
+    divisor=0
+    if unclassified and unclassified['n_reads']!=0:
+        divisor+=(unclassified['n_reads']/total_reads)/unclassified['n_nucleotides']
+    try:
+        for taxon in rank_dict:
+            if not taxon=='*':
+                if rank_dict[taxon]['n_nucleotides']!=0:
+                    divisor+=(rank_dict[taxon]['n_reads']/total_reads)/rank_dict[taxon]['n_nucleotides']
+    except TypeError:
+        for contig in rank_dict:
+            if not contig=='*':
+                divisor+=(rank_dict[contig]/total_reads)/contig_length_dict[contig]
+    return divisor
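+
+# Worked example (illustrative numbers only, not part of the module): two taxa
+# with 80 and 20 reads out of 100 total and lengths 1000 and 500 nucleotides
+# give per-nucleotide fractions 0.0008 and 0.0004, so divisor == 0.0012 and the
+# corrected fractions (read_frac / length / divisor) become 2/3 and 1/3, which
+# sum to 1.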
+
+
+
+def process_CAT_table(CAT_output_table, 
+                      nodes_dmp, 
+                      log_file, 
+                      quiet):
+    """
+    Takes a CAT contig2classification or BAT bin2classification table and the
+    taxonomy nodes file as input; returns a dictionary that contains the
+    complete lineage for each contig/bin as a list of tuples of the format
+    [(tax_id1, tax_id2, ...),
+     (rank1, rank2, ...),
+     (score1, score2, ...)].
+    Note that the bin id contains the file extension!
+    """
+    c2c={}
+    
+    # get rank for each taxid
+    taxid2rank=tax.import_nodes(nodes_dmp, log_file, quiet)[1]
+    
+    with open(CAT_output_table, 'r') as contigs:
+        for contig in contigs:
+            
+            # if it is not the first line, get contig id, lineage and lineage scores
+            if not contig.startswith('# '):
+                contig_id=contig.strip().split('\t')[0]
+                c2c[contig_id]=[]
+                
+                # Build a list of tuples for the lineage:
+                # (tax_id1,tax_id2...), (rank1,rank2,...), (score1,score2...)]
+                if len(contig.split('\t'))>3:
+                    tax_ids=contig.split('\t')[3].strip().split(';')
+                    scores=contig.split('\t')[4].strip().split(';')
+                    c2c[contig_id]=[tuple([t for t in tax_ids]),
+                                   tuple([taxid2rank[t.strip('*')] for t in tax_ids]),
+                                   tuple([s for s in scores])]
+                                        
+    return c2c
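+
+# Example (hypothetical input line; actual ranks depend on the supplied
+# nodes.dmp). A classification line such as
+#   contig_1<tab>taxid assigned<tab>based on 12 hits<tab>1;131567;2<tab>1.00;1.00;0.95
+# would be stored as
+#   c2c['contig_1'] == [('1', '131567', '2'),
+#                       ('no rank', 'no rank', 'superkingdom'),
+#                       ('1.00', '1.00', '0.95')]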
+            
+
+
+
+
+def process_bam_file(BAM_fw_file, BAM_rev_file=False, path_to_samtools='samtools', mapping_quality=2):
+    """
+    Processes the bam file(s) by picking out the primary alignments and
+    storing the contigs that the forward and reverse reads map to. If only
+    one file is given, it checks whether the file is paired or not before it
+    counts the unmapped reads.
+    Keeps track of unmapped reads: for a paired read file it checks the flag
+    to see whether the unmapped read is the first or the second in the pair;
+    for single reads it just fills up the fw key of the dictionary, and if
+    there is a reverse file, that file only adds to the rev key.
+    """
+    read_dict={}
+    unmapped_reads={'fw': set(), 'rev': set()}
+    
+    # open alignment file with samtools, so that we don't need pysam dependency
+    cmd=[path_to_samtools, 'view', BAM_fw_file]
+    proc=subprocess.Popen(cmd, stdout=subprocess.PIPE)
+
+    # check whether it is a paired bam file by checking the flags for "read paired"
+    # flag and setting paired to true if it finds it
+    paired=False
+
+
+    # build read_dictionary with list of 0-2 contigs that each read/mate maps to
+    for mapping in proc.stdout:
+        mapping=mapping.decode("utf-8").rstrip().split('\t')
+        read_id, flag, contig, score=mapping[0], int(mapping[1]), mapping[2], int(mapping[4])
+        
+        if not paired and bin(flag)[-1]=='1':
+            paired=True
+        
+        if read_id not in read_dict:
+            read_dict[read_id]={}
+            read_dict[read_id]['contig']=[]
+            read_dict[read_id]['bin']=[]
+            read_dict[read_id]['taxon_bin']=[]
+            read_dict[read_id]['taxon_contig']=[]
+            read_dict[read_id]['taxon_contig_dm']=[]
+            read_dict[read_id]['taxon_read_dm']=[]
+        
+        # check whether the score is above the threshold and whether the 
+        # 'supplementary alignment' flag is unchecked, if yes, store contig
+        # @Tina: how does this capture unmapped reads? What if a user gives a quality of 0?
+        if score>=mapping_quality and len(bin(flag))<14:
+            read_dict[read_id]['contig'].append(contig)
+        
+        # store unmapped reads as forward and reverse:
+        elif contig=='*' or (score<mapping_quality and len(bin(flag))<14):
+            # check if read is the "first in pair"
+            if paired:
+                if len(bin(flag))>8 and bin(flag)[-7]=='1':
+                    unmapped_reads['fw'].add(read_id)
+                elif len(bin(flag))>8 and bin(flag)[-7]=='0':
+                    unmapped_reads['rev'].add(read_id)
+            else:
+                unmapped_reads['fw'].add(read_id)
+    
+    # if a reverse alignment file is given, add the contig that the mate maps to
+    if BAM_rev_file:
+        cmd=[path_to_samtools, 'view', BAM_rev_file]
+        proc=subprocess.Popen(cmd, stdout=subprocess.PIPE)
+
+        for read in proc.stdout:
+            read=read.decode("utf-8").rstrip().split('\t')
+            read_id, flag, contig, score=read[0], int(read[1]), read[2], int(read[4])
+            if score>=mapping_quality and len(bin(flag))<14:
+                read_dict[read_id]['contig'].append(contig)
+            elif contig=='*' or (score<mapping_quality and len(bin(flag))<14):
+                unmapped_reads['rev'].add(read_id)
+    
+    # calculate sum of reads as number of reads in the dictionary (unmapped reads
+    # are stored as well) for single read mappings, or as twice the number of reads
+    # in the dictionary for paired-end read mappings
+    if paired or BAM_rev_file:
+        sum_of_reads=2*len(read_dict)
+    else:
+        sum_of_reads=len(read_dict)
+
+    return read_dict, unmapped_reads, sum_of_reads, paired
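+
+# Not part of RAT itself: a minimal sketch of the SAM flag bits that the
+# bin(flag) string checks above correspond to (0x1 read paired, 0x40 first
+# segment in pair, 0x800 supplementary alignment).
+def _describe_sam_flag(flag):
+    return {
+        'paired': bool(flag & 0x1),
+        'first_in_pair': bool(flag & 0x40),
+        'supplementary': bool(flag & 0x800),
+    }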
+
+
+
+def make_contig_dict(read_dict, 
+                     bam_files, 
+                     paired, 
+                     contig2bin):
+    """
+    Takes the read dictionary that stores which read maps where and returns
+    a dictionary that stores how many reads map to each contig. It also adds
+    the bin annotation to the read dict.
+    """
+    contig_dict={}
+    contig_dict['*']=0
+    
+    # check how long the list of contigs per read is allowed to be to calculate
+    # unmapped reads and check that everything is working correctly
+    if len(bam_files)>1 or paired:
+        max_primary=2
+    else:
+        max_primary=1
+    
+    for read in read_dict:
+        # if a read (pair) has multiple primary alignments, exit
+        if len(read_dict[read]['contig'])>max_primary:
+            message = (
+                    'Something is wrong with the alignment. Too many mappings '
+                    'detected. Please check your alignment file or map your reads '
+                    'using RAT.')
+            shared.give_user_feedback(message)
+            sys.exit(1)
+        
+        # otherwise, determine the number of unmapped reads and add it to unmapped
+        # build a dictionary of contigs and how many reads map to them
+        # Add bin info to read dictionary
+        else:
+            contig_dict['*']+=max_primary-len(read_dict[read]['contig'])
+            for contig in read_dict[read]['contig']:
+                if not contig in contig_dict:
+                    contig_dict[contig]=0
+                    
+                contig_dict[contig]+=1
+                if contig2bin:
+                    if contig in contig2bin:
+                        read_dict[read]['bin'].append(contig2bin[contig])
+                    else:
+                        read_dict[read]['bin'].append('unbinned')
+                    
+    return contig_dict, read_dict, max_primary
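+
+# Illustrative example (hypothetical read and contig names): with paired-end
+# input (max_primary == 2), a read pair whose mates both map to contig_A and
+# a second pair that is fully unmapped, i.e.
+#   read_dict == {'read_1': {'contig': ['contig_A', 'contig_A'], ...},
+#                 'read_2': {'contig': [], ...}}
+# yield contig_dict == {'*': 2, 'contig_A': 2}.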
+
+
+
+
+
+def make_unclassified_seq_fasta(seq_fasta, unclassified_seq_ids, 
+                                unclassified_seq_fasta, f_format, f_mode, 
+                                suffix=''):
+    
+    
+    fasta_dict={}
+    if f_format == 'fasta':
+        with shared.optionally_compressed_handle(seq_fasta, 'r') as f1: 
+            for line in f1:
+                if line.startswith('>'):
+                    header = line.rstrip().split(' ')[0].lstrip('>')
+                    if header in fasta_dict:
+                        sys.exit('Duplicate fasta headers in the file!') # This should be prettified.
+                    fasta_dict.setdefault(header, '')
+                else:
+                    fasta_dict[header] += line.rstrip()
+                    
+                    
+    elif f_format=='fastq':
+        with shared.optionally_compressed_handle(seq_fasta, 'r') as f1: 
+            for n, line in enumerate(f1):
+                if n % 4 == 0:
+                    if not line.startswith('@'):
+                        sys.exit('Unknown sequence identifier symbol in line {0}!'.format(n)) # This should be prettified.
+                    header = line.rstrip().split(' ')[0].lstrip('@') 
+                    
+                    if header in fasta_dict:
+                        sys.exit('Duplicate fasta headers in the file!')
+                    
+                    fasta_dict.setdefault(header, '')
+                elif n % 4 == 1:
+                    fasta_dict[header] += line.rstrip()
+    else:
+        sys.exit('Unknown read file format!')
+    
+    with open(unclassified_seq_fasta, f_mode) as outf: 
+        # print(unclassified_seq_ids) 
+        suffices=['/1', '/2', '_1', '_2']
+        for seq in unclassified_seq_ids:
+            if seq in fasta_dict:
+                seq_id=seq
+            else:
+                for s in suffices:
+                    if seq+s in fasta_dict:
+                        seq_id=seq+s
+                        break
+            # print(seq)
+            outf.write('>{0}{1}\n{2}\n'.format(seq_id, suffix,
+                                               fasta_dict[seq_id]))
+    
+    
+
+def get_unclassified_contigs(contig2bin, c2c, b2c):
+    """
+    If the contig is in a bin that is unclassified, or if the contig is not in
+    a bin AND unclassified: put it in a set.
+    """
+    unclassified_contigs=list()
+    for c in c2c:
+        classified=False
+        if c in contig2bin:
+            if b2c[contig2bin[c]]:
+                mag_lineage=b2c[contig2bin[c]][0]
+                if len(mag_lineage)>2 or (len(mag_lineage)==2 and 
+                                          mag_lineage[1]!='131567'):
+                    classified=True
+        if not classified:
+            if c2c[c]:
+                lineage=c2c[c][0]
+                if len(lineage)>2 or (len(lineage)==2 and 
+                                          lineage[1]!='131567'):
+                    classified=True
+        if not classified:
+            unclassified_contigs.append(c)
+    return unclassified_contigs
+        
+
+
+def get_contig_lengths(contig_file):
+    """
+    gets the lengths of all contigs from the contig file. Works for all types
+    of contig IDs, as long as it is a proper file with unique headers, because
+    it checks the length of the strings.
+    """
+    contig_length_dict={}
+    sum_of_nucleotides=0
+    
+    # split the fasta file into contigs and prevent empty strings
+    # @Tina: opening a file like this will load the entire file into memory. I would open it line by line myself. This may get super slow depending on how you implement it, it can also be very fast.
+    with open(contig_file, 'r') as inp:
+        # @Tina: what does the [1:] do here? Also shouldn't the strip() remove the trailing newline? or do you mean rstrip()?
+        contigs=inp.read()[1:].strip().split('\n>')
+        
+    # for each contig, store contig name and the length of sequence without whitespace
+    # for each contig, add its length to the sum of nucleotides in the assembly
+    for c in contigs:
+        c_id, c_seq=c.split('\n')[0].strip(), c.split('\n', 1)[1].strip().replace('\n','')
+        contig_length_dict[c_id]=len(c_seq)
+        sum_of_nucleotides+=len(c_seq)
+    
+    return contig_length_dict, sum_of_nucleotides
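+
+# A line-by-line alternative (sketch only, not called anywhere in RAT) that
+# avoids loading the whole fasta file into memory, along the lines of the
+# note above:
+def _get_contig_lengths_streaming(contig_file):
+    contig_length_dict={}
+    current=None
+    with open(contig_file, 'r') as inp:
+        for line in inp:
+            if line.startswith('>'):
+                current=line[1:].strip()
+                contig_length_dict[current]=0
+            elif current is not None:
+                contig_length_dict[current]+=len(line.strip())
+    return contig_length_dict, sum(contig_length_dict.values())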
+
+
+
+
+
+def process_bin_folder(path_to_folder, bin_suffix):
+    """
+    takes the path to a folder containing metagenomic bins and picks out all
+    nucleotide fasta files. Returns a dictionary that contains the bin ids as
+    keys and a set of all contig ids that belong in that bin as values. 
+    Important: Only contains the binned sequences. Every contig that is not
+    in the output dictionary is unbinned.
+    @Tina: Don't we have a function from the BAT workflow that does this?
+    """
+    bin_dict={}
+    bins=os.listdir(path_to_folder)
+    
+    for b in bins:
+        # grabs all fasta files and gets the bin id
+        if b.split('.')[-1]==bin_suffix.split('.')[-1] and not b.rsplit('.', 1)[0].endswith('unbinned'):
+            bin_id=b.strip()
+            bin_dict[bin_id]=set()
+            
+            # open fasta file and split it into contigs
+            with open(os.path.join(path_to_folder, b)) as binf:
+                contigs=binf.read().split('>')[1:]
+            
+            # add all contig ids to set
+            for contig in contigs:
+                bin_dict[bin_id].add(contig.split()[0])
+                
+    return bin_dict
+
+
+def invert_bin_dict(bin_dict):
+    # WORKS
+
+    """
+    @Tina: this should work as well:
+    d_rev = {v: k for k, v in d.items()}
+    return d_rev
+
+    It's shorter, but of course you should make an extra check to see if the
+    values of the first dictionary are unique. I do like universal functions
+    (e.g. invert_dict as opposed to invert_bin_dict).
+    """
+    
+    contig2bin={}
+    for b in bin_dict:
+        for contig in bin_dict[b]:
+            contig2bin[contig]=b
+    return contig2bin
+
+
+if __name__ == '__main__':
+    sys.exit('Run \'CAT_pack reads\' to run Read Annotation Tool (RAT).')


=====================================
CAT_pack/shared.py
=====================================
@@ -83,11 +83,12 @@ def add_argument(argument_group, dest, required, default=None, help_=None):
             required=required,
             type=str,
             action=PathAction,
+            default=default,
             help=help_,
         )
     elif dest == "db_dir":
         if help_ is None:
-            help_ = ("Path to directory where CAT/BAT database files will "
+            help_ = ("Path to directory where CAT/BAT/RAT database files will "
                     "be created.")
         argument_group.add_argument(
             "--db_dir",
@@ -237,20 +238,20 @@ def add_argument(argument_group, dest, required, default=None, help_=None):
         )
     elif dest == "db":
         if help_ is None:
-            help_ = "Either 'nr' or 'gtdb'."
+            help_ = "Either nr  or GTDB."
         argument_group.add_argument(
             "--db",
             dest="db",
             metavar="",
             required=required,
             type=str,
-            choices=["nr", "gtdb"],
+            choices=["nr", "GTDB"],
             default=None,
             help=help_
         )
     elif dest == "output_dir":
         if help_ is None:
-            help_ = "Path to direcotry where data will be stored."
+            help_ = "Path to directory where data will be stored."
         argument_group.add_argument(
             "-o",
             "--output_dir",
@@ -309,7 +310,7 @@ def add_argument(argument_group, dest, required, default=None, help_=None):
     elif dest == "path_to_prodigal":
         if help_ is None:
             help_ = (
-                "Path to Prodigal binaries. Supply if CAT/BAT cannot find "
+                "Path to Prodigal binaries. Supply if CAT/BAT/RAT cannot find "
                 "Prodigal"
             )
         argument_group.add_argument(
@@ -325,7 +326,7 @@ def add_argument(argument_group, dest, required, default=None, help_=None):
     elif dest == "path_to_diamond":
         if help_ is None:
             help_ = (
-                "Path to DIAMOND binaries. Supply if CAT/BAT cannot find "
+                "Path to DIAMOND binaries. Supply if CAT/BAT/RAT cannot find "
                 "DIAMOND."
             )
         argument_group.add_argument(
@@ -338,6 +339,36 @@ def add_argument(argument_group, dest, required, default=None, help_=None):
             default=default,
             help=help_,
         )
+    elif dest == 'path_to_bwa':
+        if help_ is None:
+            help_ = (
+                'Path to bwa binaries. Supply if RAT cannot find '
+                    'bwa.'
+            )
+        argument_group.add_argument(
+                '--path_to_bwa',
+                dest='path_to_bwa',
+                metavar='',
+                required=required,
+                type=str,
+                action=PathAction,
+                default=default,
+                help=help_)
+    elif dest == 'path_to_samtools':
+        if help_ is None:
+            help_ = (
+                'Path to samtools binaries. Supply if RAT cannot find '
+                    'samtools.'
+            )
+        argument_group.add_argument(
+                '--path_to_samtools',
+                dest='path_to_samtools',
+                metavar='',
+                required=required,
+                type=str,
+                action=PathAction,
+                default=default,
+                help=help_)
     elif dest == "no_stars":
         if help_ is None:
             help_ = "Suppress marking of suggestive taxonomic assignments."
@@ -455,6 +486,135 @@ def add_argument(argument_group, dest, required, default=None, help_=None):
             action="store_true",
             help=help_,
         )
+    elif dest == 'mode':
+        if help_ is None:
+            help_ = ('Classification mode. "mcr": integrate annotations from '
+                     'MAGs, contigs, and reads; "cr": integrate annotations '
+                     'from contigs and reads; "mr": integrate annotations '
+                     'from MAGs and reads.')
+        argument_group.add_argument(
+                '-m',
+                '--mode',
+                dest='mode',
+                metavar='',
+                required=required,
+                type=str,
+                action='store',
+                default=default,
+                help=help_)
+    elif dest == 'read_file1':
+        if help_ is None:
+            help_ = ('Path to (forward) read file. Please note that RAT does not '
+            'currently support interlaced read files. Please supply '
+            'a single read file or two files for paired-end reads.')
+        argument_group.add_argument(
+                '-1',
+                '--read_file1',
+                dest='read_file1',
+                metavar='',
+                required=required,
+                type=str,
+                action=PathAction,
+                help=help_)
+    elif dest == 'read_file2':
+        if help_ is None:
+            help_ = ('Path to reverse read file.')
+        argument_group.add_argument(
+                '-2',
+                '--read_file2',
+                dest='read_file2',
+                metavar='',
+                required=required,
+                type=str,
+                action=PathAction,
+                help=help_)
+    elif dest == 'bam_file1':
+        if help_ is None:
+            help_ = ('Path to sorted mapping file.')
+        argument_group.add_argument(
+                '--bam1',
+                dest='bam_file1',
+                metavar='',
+                required=required,
+                type=str,
+                action=PathAction,
+                help=help_)
+    elif dest == 'bam_file2':
+        if help_ is None:
+            help_ = ('Path to second sorted mapping file (not recommended).')
+        argument_group.add_argument(
+                '--bam2',
+                dest='bam_file2',
+                metavar='',
+                required=required,
+                type=str,
+                action=PathAction,
+                help=help_)
+    elif dest == 'mapping_quality':
+        if help_ is None:
+            help_ = ('Minimum mapping quality phred score (default: 2)')
+        argument_group.add_argument(
+                '--mapping_quality',
+                dest='mapping_quality',
+                metavar='',
+                required=required,
+                type=int,
+                default=default,
+                help=help_)
+    elif dest == 'contig2classification':
+        if help_ is None:
+            help_ = ('Path to contig2classification file.')
+        argument_group.add_argument(
+                '--c2c',
+                dest='contig2classification',
+                metavar='',
+                required=required,
+                type=str,
+                action=PathAction,
+                help=help_)
+    elif dest == 'bin2classification':
+        if help_ is None:
+            help_ = ('Path to bin2classification file.')
+        argument_group.add_argument(
+                '--b2c',
+                dest='bin2classification',
+                metavar='',
+                required=required,
+                type=str,
+                action=PathAction,
+                help=help_)
+    elif dest == 'unmapped2classification':
+        if help_ is None:
+            help_ = ('Path to unmapped2classification file.')
+        argument_group.add_argument(
+                '--u2c',
+                dest='unmapped2classification',
+                metavar='',
+                required=required,
+                type=str,
+                action=PathAction,
+                help=help_)
+    elif dest == 'read2classification':
+        if help_ is None:
+            help_ = ('Include the read classification step.')
+        argument_group.add_argument(
+                '--read2classification',
+                dest='read2classification',
+                required=required,
+                action='store_true',
+                help=help_)
+    elif dest == 'alignment_unmapped':
+        if help_ is None:
+            help_ = ('Path to alignment file of reads and contigs that could '
+                     'not be classified by CAT/BAT.')
+        argument_group.add_argument(
+                '--alignment_unmapped',
+                dest='alignment_unmapped',
+                metavar='',
+                required=required,
+                type=str,
+                action=PathAction,
+                help=help_)
     elif dest == "nproc":
         if help_ is None:
             help_ = "Number of cores to deploy by DIAMOND (default: maximum)."
@@ -591,24 +751,24 @@ def expand_arguments(args):
     if "r" in args:
         setattr(args, "one_minus_r", (100 - args.r) / 100)
 
+    log_file = None
     if "out_prefix" in args:
-        if not args.tmpdir:
-            tmpdir = "{0}/".format(args.out_prefix.rsplit("/", 1)[0])
-
-            setattr(args, "tmpdir", tmpdir)
-
-            # Check out_prefix as the log file needs to be written to a valid
-            # location.
-            error = check.check_out_prefix(args.out_prefix, None, args.quiet)
-            if error:
-                sys.exit(1)
+        # Check out_prefix as the log file needs to be written to a valid
+        # location.
+        error = check.check_out_prefix(args.out_prefix, None, args.quiet)
+        if error:
+            sys.exit(1)
 
+        if not args.no_log:
             log_file = "{0}.log".format(args.out_prefix)
 
             with open(log_file, "w") as outf1:
                 pass
-    else:
-        log_file = None
+
+        if not args.tmpdir:
+            tmpdir = "{0}/".format(args.out_prefix.rsplit("/", 1)[0])
+
+            setattr(args, "tmpdir", tmpdir)
 
     setattr(args, "log_file", log_file)
 
@@ -655,13 +815,20 @@ def expand_arguments(args):
 
         explore_taxonomy_folder(args)
     if "database_folder" in args and not "db_dir" in args:
-        setattr(
-            args,
-            "database_folder",
-            "{0}/".format(args.database_folder.rstrip("/")),
-        )
+        if args.database_folder:
+            setattr(
+                args,
+                "database_folder",
+                "{0}/".format(args.database_folder.rstrip("/")),
+            )
+    
+            explore_database_folder(args)
 
-        explore_database_folder(args)
+    if "bin_fasta_or_folder" in args and args.bin_fasta_or_folder:
+        if os.path.isfile(args.bin_fasta_or_folder):
+            setattr(args, "bin_fasta", args.bin_fasta_or_folder)
+        else:
+            setattr(args, "bin_folder", args.bin_fasta_or_folder)
 
     if "bin_fasta_or_folder" in args:
         if os.path.isfile(args.bin_fasta_or_folder):
@@ -828,7 +995,11 @@ def run_prodigal(
     return
 
 
-def run_diamond(args):
+# @Bastiaan: Here, I have added some arguments, because RAT runs diamond on 
+# the unmapped reads as well, but with blastx. If you call it the way you call
+# it with CAT/BAT, it still does the same thing, but I can run it with blastx
+# on a newly created fasta file and with a top 11.
+def run_diamond(args, blast='blastp', prot_fasta='', top=0):
     if args.sensitive:
         mode = "sensitive"
     else:
@@ -838,7 +1009,12 @@ def run_diamond(args):
         compression = "1"
     else:
         compression = "0"
-
+        
+    if not prot_fasta:
+        prot_fasta=args.proteins_fasta
+    if not top:
+        top=args.top
+        
     message = (
         "Homology search with DIAMOND is starting. Please be patient. Do not "
         "forget to cite DIAMOND when using CAT or BAT in your publication.\n"
@@ -851,7 +1027,8 @@ def run_diamond(args):
         "\t\t\tblock-size (billions of letters): {6}\n"
         "\t\t\tindex-chunks: {7}\n"
         "\t\t\ttmpdir: {8}\n"
-        "\t\t\tcompress: {9}".format(
+        "\t\t\tcompress: {9}\n"
+        "\t\t\tblast flavour: {10}".format(
             args.proteins_fasta,
             args.diamond_database,
             mode,
@@ -861,16 +1038,17 @@ def run_diamond(args):
             args.block_size,
             args.index_chunks,
             args.tmpdir,
-            compression
+            compression,
+            blast
         )
     )
     give_user_feedback(message, args.log_file, args.quiet)
 
     try:
         command = [
-            args.path_to_diamond, "blastp",
+            args.path_to_diamond, blast,
             "-d", args.diamond_database,
-            "-q", args.proteins_fasta,
+            "-q", prot_fasta,
             "--top", str(args.top),
             "--matrix", "BLOSUM62",
             "--evalue", "0.001",
@@ -879,7 +1057,8 @@ def run_diamond(args):
             "--block-size", str(args.block_size),
             "--index-chunks", str(args.index_chunks),
             "--tmpdir", args.tmpdir,
-            "--compress", compression
+            "--compress", compression,
+            
         ]
 
         if not args.verbose:
@@ -890,7 +1069,9 @@ def run_diamond(args):
 
         if args.no_self_hits:
             command += ["--no-self-hits"]
-
+        
+        print(' '.join(command))
+        
         subprocess.check_call(command)
     except:
         message = "DIAMOND finished abnormally."
@@ -908,6 +1089,227 @@ def run_diamond(args):
     return
 
 
+def run_CAT(args,
+        contigs_fasta,
+        database_folder,
+        taxonomy_folder,
+        log_file,
+        quiet,
+        nproc,
+        fraction,
+        CAT_range,
+        path_to_output):
+    message = (
+            'Running CAT.')
+    give_user_feedback(message, log_file, quiet, show_time=True)
+
+    try:
+        command = ['CAT_pack', 'contigs',
+                '-c', contigs_fasta,
+                '-d', database_folder,
+                '-t', taxonomy_folder,
+                '-o', path_to_output+'.CAT',
+                '-n', str(nproc),
+                '-f', str(fraction),
+                '-r', str(CAT_range),
+                              
+                ]
+        if args.path_to_prodigal!='prodigal':
+            command.extend(['--path_to_prodigal', args.path_to_prodigal])
+        if args.path_to_diamond!='diamond':
+            command.extend(['--path_to_diamond', args.path_to_diamond])
+        
+        if args.force:
+            command.append('--force')
+        subprocess.check_call(command)
+    except:
+        message = 'CAT finished abnormally.'
+        give_user_feedback(message, log_file, quiet, error=True)
+
+        sys.exit(1)
+
+    message = 'CAT done!\n'
+    give_user_feedback(message, log_file, quiet, show_time=True)
+
+    return
+
+
+def run_BAT(args,
+        bin_folder,
+        database_folder,
+        taxonomy_folder,
+        log_file,
+        quiet,
+        n_proc,
+        fraction,
+        CAT_range,
+        CAT_protein_fasta=0,
+        CAT_diamond_alignment=0,
+        path_to_output='out.BAT',
+        bin_suffix='.fna'):
+    message = (
+            'Running BAT.')
+    give_user_feedback(message, log_file, quiet, show_time=True)
+
+    try:
+        command = ['CAT_pack', 'bins',
+                '-b', bin_folder,
+                '-d', database_folder,
+                '-t', taxonomy_folder,
+                '-o', path_to_output+'.BAT',
+                '-n', str(n_proc),
+                '-f', str(fraction),
+                '-r', str(CAT_range),
+                '-s', bin_suffix]
+        if CAT_protein_fasta:
+            command.append('-p')
+            command.append(CAT_protein_fasta)
+        if CAT_diamond_alignment:
+            command.append('-a')
+            command.append(CAT_diamond_alignment)
+        if args.path_to_prodigal!='prodigal':
+            command.extend(['--path_to_prodigal', args.path_to_prodigal])
+        if args.path_to_diamond!='diamond':
+            command.extend(['--path_to_diamond', args.path_to_diamond])
+        if args.force:
+            command.append('--force')
+        subprocess.check_call(command)
+    except:
+        message = 'BAT finished abnormally.'
+        give_user_feedback(message, log_file, quiet, error=True)
+
+        sys.exit(1)
+
+    message = 'BAT done!\n'
+    give_user_feedback(message, log_file, quiet, show_time=True)
+
+    return
+
+
+def run_bwa_mem(
+        path_to_bwa,
+        path_to_samtools,
+        contigs_fasta,
+        read_file,
+        out_prefix,
+        nproc,
+        log_file):
+
+    # tested with bwa 0.7.17
+    # tested with samtools 1.11
+    output_file='{0}.{1}.bwamem'.format(out_prefix+
+                                        '.'+os.path.split(contigs_fasta)[-1], 
+                                        os.path.split(read_file[0])[-1])
+    message = (
+            'Running bwa mem for read mapping. File {0}.sorted will be generated. '
+            'Do not forget to cite bwa mem and samtools when using RAT in '
+            'your publication!'.format(output_file))
+    give_user_feedback(message, log_file, show_time=True)
+
+    try:
+        # Run bwa index
+        
+        if check_index(contigs_fasta):
+            message = 'Contigs fasta is already indexed.'
+            give_user_feedback(message, log_file, error=False)
+        
+        else:
+            message = 'Indexing contigs fasta...'
+            give_user_feedback(message, log_file, error=False)
+    
+            command =[
+                    path_to_bwa, 'index',
+                    contigs_fasta
+                    ]
+            subprocess.check_call(command)
+
+        try:
+            # Run bwa mem
+            message = 'Running bwa mem...'
+            give_user_feedback(message, log_file, error=False)
+
+            
+            command =[
+                    path_to_bwa, 'mem',
+                    '-t', str(nproc),
+                    contigs_fasta,
+                    read_file[0]]
+                    
+            if len(read_file)==2:
+                command.append(read_file[1])
+                
+            proc=subprocess.Popen(command, stdout=subprocess.PIPE)
+            with open(output_file, 'wb') as outf:
+                for line in proc.stdout:
+                    outf.write(line)
+
+            try:
+                # Run samtools view and remove bwamem file
+                command =[
+                        path_to_samtools, 'view',
+                        '-b',
+                        '-@', str(nproc),
+                        output_file,
+                        '-o', output_file + '.bam'
+                        ]
+
+                subprocess.check_call(command)
+                os.remove(output_file)
+
+
+                try:
+                    # Run samtools sort and remove sam file
+                    message = 'Sorting bam file...'
+                    give_user_feedback(message, log_file, error=False)
+
+                    command =[
+                        path_to_samtools, 'sort',
+                        '-@', str(nproc),
+                        output_file + '.bam',
+                        '-o', output_file + '.sorted'
+                        ]
+
+                    subprocess.check_call(command)
+                    os.remove(output_file+'.bam')
+
+
+                except:
+                    message = 'samtools sort finished abnormally.'
+                    give_user_feedback(message, log_file, error=True)
+
+                    sys.exit(1)
+            except:
+                message = 'samtools view finished abnormally.'
+                give_user_feedback(message, log_file, error=True)
+
+                sys.exit(1)
+
+
+        except:
+            message = 'Bwa mem finished abnormally.'
+            give_user_feedback(message, log_file, error=True)
+
+            sys.exit(1)
+    except:
+        message = 'Bwa index finished abnormally.'
+        give_user_feedback(message, log_file, error=True)
+
+        sys.exit(1)
+
+    message = 'Read mapping done!\n'
+    give_user_feedback(message, log_file,show_time=True)
+
+    return
+
+
+def check_index(path_to_fasta):
+    indexed=True
+    for suffix in ['.amb', '.ann', '.bwt', '.pac', '.sa']:
+        if not os.path.exists('{0}{1}'.format(path_to_fasta, suffix)):
+            indexed=False
+    return indexed
+
+
 def import_contig_names(fasta_file, log_file, quiet):
     message = "Importing contig names from {0}.".format(fasta_file)
     give_user_feedback(message, log_file, quiet)
@@ -921,7 +1323,8 @@ def import_contig_names(fasta_file, log_file, quiet):
 
                 if contig in contig_names:
                     message = (
-                        "your fasta file contains duplicate headers. The "
+                        "your fasta file contains duplicate headers (the part "
+                        "before the first space in the >line). The "
                         "first duplicate encountered is {0}, but there might "
                         "be more...".format(contig)
                     )
@@ -1041,4 +1444,4 @@ def optionally_compressed_handle(file_path, mode):
 
 
 if __name__ == "__main__":
-    sys.exit("Run \'CAT\' to run CAT or BAT.")
+    sys.exit("Run \'CAT_pack\' to run CAT, BAT, or RAT.")


=====================================
CAT_pack/summarise.py
=====================================
@@ -10,9 +10,9 @@ import shared
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
-            prog="CAT summarise",
+            prog="CAT_pack summarise",
             description="Summarise a named CAT or BAT classification file.",
-            usage=("CAT summarise -i FILE -o FILE (-c FILE) "
+            usage=("CAT_pack summarise -i FILE -o FILE (-c FILE) "
                    "[options] [-h / --help]"),
             add_help=False)
     
@@ -44,7 +44,7 @@ def parse_arguments():
     extra_args = [arg for (i, arg) in enumerate(extra_args) if
                   (i, arg) != (0, "summarise")]
     if len(extra_args) > 0:
-        sys.exit("error: too much arguments supplied:\n{0}".format(
+        sys.exit("error: too many arguments supplied:\n{0}".format(
             "\n".join(extra_args)))
 
     # Add extra arguments.
@@ -82,7 +82,7 @@ def import_contig_lengths(contigs_fasta, log_file, quiet):
 
 
 def summarise_contigs(args):
-    message = "# CAT v{0}.".format(about.__version__)
+    message = "# CAT_pack v{0}.".format(about.__version__)
     shared.give_user_feedback(
         message, args.log_file, args.quiet, show_time=False)
 
@@ -142,7 +142,7 @@ def summarise_contigs(args):
                         "official ranks not found in header of {0}. Make sure "
                         "that the CAT classification file is named with "
                         "official ranks with "
-                        "\'CAT add_names --only_official\'."
+                        "\'CAT_pack add_names --only_official\'."
                         "".format(args.input_file)
                     )
                     shared.give_user_feedback(
@@ -227,7 +227,7 @@ def summarise_contigs(args):
 
     if len(doubles) != 0:
         message = (
-            "some contigs have multiple classifications. CAT summarise "
+            "some contigs have multiple classifications. CAT_pack summarise "
             "currently does not allow for this. Contigs with multiple "
             "classifications: {0}.".format(", ".join(list(doubles)))
         )
@@ -300,7 +300,7 @@ def summarise_contigs(args):
     
     
 def summarise_bins(args):
-    message = "# CAT v{0}.".format(about.__version__)
+    message = "# CAT_pack v{0}.".format(about.__version__)
     shared.give_user_feedback(
         message, args.log_file, args.quiet, show_time=False)
     
@@ -357,7 +357,7 @@ def summarise_bins(args):
                         "official ranks not found in header of {0}. Make sure "
                         "that the BAT classification file is named with "
                         "official ranks with "
-                        "\'CAT add_names --only_official\'."
+                        "\'CAT_pack add_names --only_official\'."
                         "".format(args.input_file)
                     )
                     shared.give_user_feedback(
@@ -420,8 +420,9 @@ def summarise_bins(args):
                 
     if len(doubles) != 0:
         message = (
-            "some bins have multiple classifications. CAT summarise currently "
-            "does not allow for this. Bins with multiple classifications: {0}."
+            "some bins have multiple classifications. CAT_pack summarise "
+            "currently does not allow for this. Bins with multiple "
+            "classifications: {0}."
             "".format(", ".join(list(doubles)))
         )
         shared.give_user_feedback(
@@ -464,5 +465,5 @@ def run():
 
 
 if __name__ == "__main__":
-    sys.exit("Run \'CAT summarise\' to summarise a named CAT contig "
-             "classification file or named BAT bin classification file.")
+    sys.exit("Run \'CAT_pack summarise\' to summarise a named CAT contig "
+            "classification file or named BAT bin classification file.")


=====================================
CAT_pack/tax.py
=====================================
@@ -297,4 +297,4 @@ def convert_to_official_names(lineage, taxid2rank, taxid2name, scores=None):
 
 
 if __name__ == "__main__":
-    sys.exit("Run \'CAT\' to run CAT or BAT.")
+    sys.exit("Run \'CAT_pack\' to run CAT, BAT, or RAT.")


=====================================
CHANGELOG.md
=====================================
@@ -1,5 +1,12 @@
 # Changelog
 
+## 6.0.1
+Minor release to register for a Zenodo DOI.
+
+## 6.0
+* Change of entry-point for CAT/BAT/RAT: you can now run `CAT_pack` instead of `CAT`, for example `CAT_pack contigs`.
+* We welcome a new member to the CAT pack: Read Annotation Tool! RAT uses CAT and BAT annotations as well as direct read mappings to reconstruct accurate taxonomic profiles of metagenomes! To try RAT, run `CAT_pack reads`.
+ 
 ## 5.3
 * GTDB support.
 * Sequence databases (NCBI nr or GTDB) can be downloaded with `CAT download`, and CAT databases constructed with `CAT prepare`.


=====================================
README.md
=====================================
@@ -1,94 +1,156 @@
-# CAT and BAT
+# CAT, BAT, and RAT
 
 - [Introduction](#introduction)
 - [Dependencies and where to get them](#dependencies-and-where-to-get-them)
 - [Installation](#installation)
 - [Getting started](#getting-started)
-- [Usage](#usage)
-- [Interpreting the output files](#interpreting-the-output-files)
-- [Marking suggestive taxonomic assignments with an asterisk](#marking-suggestive-taxonomic-assignments-with-an-asterisk)
-- [Optimising running time, RAM, and disk usage](#optimising-running-time-ram-and-disk-usage)
-- [Examples](#examples)
+  - [Downloading preconstructed database files](#downloading-preconstructed-database-files)
+  - [Creating a fresh NCBI nr or GTDB database yourself](#creating-a-fresh-ncbi-nr-or-gtdb-database-yourself)
+  - [Creating a custom database](#creating-a-custom-database)
+  - [Running CAT/BAT/RAT](#running-catbatrat)
+  - [Getting help](#getting-help)
+- [Taxonomic annotation of contigs or MAGs with CAT and BAT](#taxonomic-annotation-of-contigs-or-mags-with-cat-and-bat)
+  - [Interpreting the output files](#interpreting-the-output-files)
+  - [Marking suggestive taxonomic assignments with an asterisk](#marking-suggestive-taxonomic-assignments-with-an-asterisk)
+  - [Optimising running time, RAM, and disk usage](#optimising-running-time-ram-and-disk-usage)
+  - [Examples](#examples)
+- [Estimating the microbial composition with RAT](#estimating-the-microbial-composition-with-rat)
+  - [Output files](#output-files)
+
 
 ## Introduction
-Contig Annotation Tool (CAT) and Bin Annotation Tool (BAT) are pipelines for the taxonomic classification of long DNA sequences and metagenome assembled genomes (MAGs/bins) of both known and (highly) unknown microorganisms, as generated by contemporary metagenomics studies. The core algorithm of both programs involves gene calling, mapping of predicted ORFs against a protein database, and voting-based classification of the entire contig / MAG based on classification of the individual ORFs. CAT and BAT can be run from intermediate steps if files are formated appropriately (see [Usage](#usage)).
+**Contig Annotation Tool (CAT)** and **Bin Annotation Tool (BAT)** are pipelines for the taxonomic classification of long DNA sequences and metagenome assembled genomes (MAGs / bins) of both known and (highly) unknown microorganisms, as generated by contemporary metagenomics studies. The core algorithm of both programs involves gene calling, mapping of predicted ORFs against a protein database, and voting-based classification of the entire contig / MAG based on classification of the individual ORFs. CAT and BAT can be run from intermediate steps if files are formatted appropriately.
 
 A paper describing the algorithm together with extensive benchmarks can be found at https://doi.org/10.1186/s13059-019-1817-x. If you use CAT or BAT in your research, it would be great if you could cite us:
 
 * *von Meijenfeldt FAB, Arkhipova K, Cambuy DD, Coutinho FH, Dutilh BE. Robust taxonomic classification of uncharted microbial sequences and bins with CAT and BAT. Genome Biology. 2019;20:217.*
 
+**Read Annotation Tool (RAT)** estimates the taxonomic composition of metagenomes using CAT and BAT output. A manuscript describing RAT with benchmarks can be found at https://doi.org/10.1101/2023.03.22.533753. If you use RAT in your research, it would be great if you could cite:
+
+* *von Meijenfeldt FAB, Arkhipova K, Cambuy DD, Coutinho FH, Dutilh BE. Robust taxonomic classification of uncharted microbial sequences and bins with CAT and BAT. Genome Biology. 2019;20:217.*
+* *Hauptfeld E, Pappas N, van Iwaarden S, Snoek BL, Aldas-Vargas A, Dutilh BE, von Meijenfeldt FAB. Integration of taxonomic signals from MAGs and contigs improves read annotation and taxonomic profiling of metagenomes. bioRxiv. 2023.*
+
 
 ## Dependencies and where to get them
-Python 3, https://www.python.org/.
+* Python 3, https://www.python.org/.
+
+* DIAMOND, https://github.com/bbuchfink/diamond.
+
+* Prodigal, https://github.com/hyattpd/Prodigal.
 
-DIAMOND, https://github.com/bbuchfink/diamond.
+RAT further requires (not needed for CAT and BAT):
 
-Prodigal, https://github.com/hyattpd/Prodigal.
+* BWA, https://github.com/lh3/bwa.
+
+* SAMtools, http://www.htslib.org/download/.
+
+CAT, BAT, and RAT have been thoroughly tested on Linux systems, and should run on macOS as well.
 
-CAT and BAT have been thoroughly tested on Linux systems, and should run on macOS as well.
 
 ## Installation
-No installation is required. You can run CAT and BAT by supplying the absolute path:
+No installation is required. You can run CAT, BAT, and RAT by supplying the absolute path:
 
 ```
-$ ./CAT_pack/CAT --help
+$ ./CAT_pack/CAT_pack --help
 ```
 
-Alternatively, if you add the files in the CAT\_pack directory to your `$PATH` variable, you can run CAT and BAT from anywhere:
+Alternatively, if you add the files in the CAT\_pack directory to your `$PATH` variable, you can run CAT, BAT, and RAT from anywhere:
 
 ```
-$ CAT --version
+$ CAT_pack --version
 ```
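
A minimal sketch of that `$PATH` setup, assuming the repository was cloned to a hypothetical `/opt/CAT`:

```
$ export PATH="/opt/CAT/CAT_pack:$PATH"   # add this line to ~/.bashrc to keep it across sessions

$ CAT_pack --version
```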
 
-*Special note for Mac users: since the macOS file system is case-insensitive by default, adding the CAT\_pack directory to your `$PATH` variable might replace calls to the standard unix `cat` utility. We advise Mac users to run CAT from its absolute path.*
 
-CAT and BAT can also be installed via Bioconda, thanks to Silas Kieser:
+## Getting started
+To get started with CAT/BAT/RAT, you will have to get the database files on your system.
+You can either download preconstructed database files, or generate them yourself.
+
+### Downloading preconstructed database files
+
+To download the database files, find the most recent version on [tbb.bio.uu.nl/tina/CAT\_pack\_prepare/](https://tbb.bio.uu.nl/tina/CAT_pack_prepare/), download and extract, and you are ready to go!
 
+For NCBI nr:
 ```
-$ conda install -c bioconda cat
+$ wget tbb.bio.uu.nl/tina/CAT_pack_prepare/20231120_CAT_nr.tar.gz
+
+$ tar -xvzf 20231120_CAT_nr.tar.gz
 ```
 
-## Getting started
-To get started with CAT and BAT, you will have to get the database files on your system.
-You can either download preconstructed database files, or generate them yourself.
+For GTDB:
+```
+$ wget tbb.bio.uu.nl/tina/CAT_pack_prepare/20231120_CAT_gtdb.tar.gz
 
-### Downloading preconstructed database files
+$ tar -xvzf 20231120_CAT_gtdb.tar.gz
+```
+
+### Creating a fresh NCBI nr or GTDB database yourself
+
+Instead of using the preconstructed database, you can construct a fresh database yourself. The `download` module can be used to download and process raw data, in preparation for building a new CAT pack database.
+This will ensure that all input dependencies are met and correctly formatted for `CAT_pack prepare`.
 
-To download the database files, find the most recent version on [tbb.bio.uu.nl/bastiaan/CAT\_prepare/](https://tbb.bio.uu.nl/bastiaan/CAT_prepare/), download and extract, and you are ready to go!
+Currently, two databases are supported: NCBI's nr and the Genome Taxonomy Database (GTDB) proteins.
 
+#### NCBI non-redundant protein database (nr)
+
+```
+$ CAT_pack download -db nr -o path/to/nr_data_dir
 ```
-$ wget tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz
 
-$ tar -xvzf CAT_prepare_20210107.tar.gz
+This will download the fasta file with the protein sequences, their mapping to a taxid, and the taxonomy information from NCBI's ftp site.
+
+#### [Genome Taxonomy Database (GTDB)](https://gtdb.ecogenomic.org/) proteins
+
+```
+$ CAT_pack download -db gtdb -o path/to/gtdb_data_dir
 ```
 
-Your version of DIAMOND should be the same as with which the database is constructed. For this reason the DIAMOND executable is supplied within the CAT prepare folder. Alternatively, you can find the DIAMOND version used for database construction within the database log file:
+The files required to build a CAT pack database are provided by the [GTDB downloads page](https://gtdb.ecogenomic.org/downloads).
+
+`CAT_pack download` fetches the necessary files and does some additional processing to get them ready for `CAT_pack prepare`:
+
+  - The taxonomy information from GTDB is transformed into NCBI style `nodes.dmp` and `names.dmp` files.
+  - Protein sequences are extracted from `gtdb_proteins_aa_reps.tar.gz` and are subjected to a round of deduplication.
+The deduplication reduces the redundancy in the DIAMOND database, thus simplifying the alignment process.
+Exact duplicate sequences are identified based on a combination of the MD5sum of the protein sequences and their length.
+Only one representative sequence is kept, with all duplicates encoded in the fasta header.
+This information is later used by `CAT_pack prepare` to assign the LCA of the protein sequence appropriately in the `.fastaid2LCAtaxid` file.
+  - The mapping of all protein sequences to their respective taxonomy is created.
+  - In addition, the newick formatted trees of Bacteria and Archaea are downloaded and - artificially - concatenated under a single `root` node, to produce an `all.tree` file.
+This file is **not** used by the CAT pack but may come in handy for downstream analyses.
+
+When the download and processing of the files is finished successfully, you can build a CAT pack database with `CAT_pack prepare`.
+
+For all available command line options, see
 
 ```
-$ grep version 2021-01-07.CAT_prepare.fresh.log
+$ CAT_pack download -h
+```
+and
+```
+$ CAT_pack prepare -h
 ```
 
-### Preparing a CAT database
+### Creating a custom database
 
-You *must* have the following input ready before you launch a `CAT prepare` run.
+For a custom CAT pack database, you must have the following input ready before you launch a `CAT_pack prepare` run.
 
 1. A fasta file containing all protein sequences you want to include in your database.
 
 2. A `names.dmp` file that contains mappings of taxids to their ranks and scientific names.
 The format must be the same as the NCBI standard `names.dmp` (uses `\t|\t` as field separator).
 
-An example would look like this:
+An example looks like this:
 
 ```
-1	|	root	|	|	scientific name	|
-2	|	Bacteria	|	|	scientific name	|
+1	|	root	|	scientific name	|
+2	|	Bacteria	|	scientific name	|
 562	|	Escherichia	coli	|	scientific name	|
 ```
 
 3. A `nodes.dmp` file that describes the child-parent relationship of the nodes in the taxonomy tree and their (official) rank.
-The format must be the same as the NCBI standard `nodes.dmp` (uses `\t|\t` as the field separator.
+The format must be the same as the NCBI standard `nodes.dmp` (uses `\t|\t` as the field separator).
 
-An example would look like this:
+An example looks like this:
 
 ```
 1	|	1	|	root	|
@@ -104,9 +166,9 @@ An example would look like this:
 For more information on the `nodes.dmp` and `names.dmp` files, see the [NCBI taxdump_readme.txt](https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_readme.txt).
 
 4. A 2-column, tab-separated file containing the mapping of each sequence in the fasta file to a taxid in the taxonomy.
-This *must* contain the header `accession.version	taxid`.
+This file must contain the header `accession.version	taxid`.
 
-An example would look like this
+An example looks like this:
 
 ```
 accession.version	taxid
@@ -114,13 +176,11 @@ protein_1	562
 protein_2	123456
 ```
 
-Once all of the above requirements are met you can run `CAT prepare`.
-All the input  needs to be explicitly specified for `CAT prepare` to work.
-
-E.g.
+Once all of the above requirements are met, you can run `CAT_pack prepare`.
+All the input needs to be explicitly specified for `CAT_pack prepare` to work, for example:
 
 ```
-CAT prepare \
+$ CAT_pack prepare \
 --db_fasta path/to/fasta \
 --names path/to/names.dmp \
 --nodes path/to/nodes.dmp \
@@ -128,15 +188,15 @@ CAT prepare \
 --db_dir path/to/output_dir
 ```
 
-will create the `output_dir` that will look like this
+will create an `output_dir` that will look like this:
 
 ```
 output_dir
-├── 2021-11-17_CAT.log
+├── 2023-11-05_CAT_pack.log
 ├── db
-│   ├── 2021-11-17_CAT.dmnd
-│   ├── 2021-11-17_CAT.fastaid2LCAtaxid
-│   └── 2021-11-17_CAT.taxids_with_multiple_offspring
+│   ├── 2023-11-05_CAT_pack.dmnd
+│   ├── 2023-11-05_CAT_pack.fastaid2LCAtaxid
+│   └── 2023-11-05_CAT_pack.taxids_with_multiple_offspring
 └── tax
     ├── names.dmp
     └── nodes.dmp
@@ -144,90 +204,48 @@ output_dir
 
 Notes:
 
-- Two subdirs are created `db` and `tax` that contain the necessary files.
+- Two subdirectories, `db` and `tax`, are created that contain all the necessary files.
 - The `nodes.dmp` and `names.dmp` in the `tax` directory are copied from their original location.
-This is to ensure that the `-t` flag of the rest of CAT modules works.
-- The default prefix is `<YYYY-MM-DD>_CAT`. You can customize it with the `--common_prefix` option.
+This is to ensure that the `-t` flag of CAT, BAT, and RAT works.
+- The default prefix is `<YYYY-MM-DD>_CAT_pack`. You can customize it with the `--common_prefix` option.
 
 For all available command line options, see
 
 ```
-$ CAT prepare -h
+$ CAT_pack prepare -h
 ```
 
-### Downloading raw data for nr and GTDB
-
-The `download` module can be used to download and process raw data, in preparation for building a new CAT database.
-This will ensure that all input dependencies are met and correctly formatted for `CAT prepare`.
-
-Currently, two databases are supported, NCBI's nr and GTDB proteins.
+### Running CAT/BAT/RAT
+The database files are needed in subsequent CAT/BAT/RAT runs. They only need to be generated or downloaded once, or whenever you want to update the database.
 
-* NCBI non-redundant protein database ( aka `nr`)
-
-Command:
-
-```
-$ CAT download -db nr -o path/to/nr_data_dir
+To run CAT/BAT/RAT, respectively:
 ```
+$ CAT_pack contigs     # Runs CAT.
 
-Download the fasta file with the protein sequences, their mapping to a taxid and the taxonomy information from the NCBI's ftp site.
+$ CAT_pack bins        # Runs BAT.
 
-* [Genome Taxonomy Database](https://gtdb.ecogenomic.org/) proteins
-
-Command:
-
-```
-$ CAT download -db gtdb -o path/to/gtdb_data_dir
+$ CAT_pack reads       # Runs RAT.
 ```
 
-The files required to build a CAT database are provided by the [GTDB downloads page](https://gtdb.ecogenomic.org/downloads).
-
-`CAT download` fetches the necessary files and does some additional processing to get them ready for `CAT prepare`:
-
-  - The taxonomy information, provided for each genome from GTDB, is transformed into the NCBI style `nodes.dmp` and `names.dmp`.
-The species level annotation from GTDB is used as the unique taxid identifier.
-For example, all proteins coming from a representative genome for species `Escherichia coli` are assigned a taxid of `s__Escherichia coli`.
-All proteins from that genome get its taxid.
-  - Fasta files containing protein sequences are extracted from the provided `gtdb_proteins_aa_reps.tar.gz` and are subjected to a round of deduplication.
-This is to reduce the redundancy in the DIAMOND database to be created, thus simplifying the alignment process.
-Exact duplicate sequences are identified based on a combination of the MD5 sum of the protein sequences and their length.
-Only one representative sequence is kept, with information on the rest of the accessions identified as duplicates encoded in the fasta header.
-This information is later used by `CAT prepare` to assign the LCA of the protein sequence appropriately in the `.fastaid2LCAtaxid` file.
-  - The mapping of **all** protein sequences (duplicates or not) to their respective taxonomy is created.
-This is also used by `CAT prepare` for proper LCA identification.
-  - In addition, the newick formatted trees for Bacteria and Archaea are downloaded and - artificially - concatenated under a single `root` node, to produce an `all.tree` file.
-This can come in handy for downstream analyses tools that require a phylogeny to be present to calculate diversity indices based on some metric that takes that information into account.
-This is **not** required for `CAT`.
-
-When the download and processing of the files is finished successfully you can build a CAT database with `CAT prepare`.
-
-For all command line options available see
+### Getting help
+If you are unsure what options a program has, you can always add `--help` to a command. This is a great way to get you started with CAT, BAT, or RAT.
 
 ```
-$ CAT download -h
-```
-
-### Running CAT and BAT.
-The taxonomy folder and database folder created by CAT prepare are needed in subsequent CAT and BAT runs. They only need to be generated/downloaded once or whenever you want to update the database.
-
-To run CAT on a contig set, each header in the contig fasta file (the part after `>` and before the first space) needs to be unique. To run BAT on set of MAGs, each header in a MAG needs to be unique within that MAG. If you are unsure if this is the case, you can just run CAT or BAT, as the appropriate error messages are generated if formatting is incorrect.
+$ CAT_pack --help
 
-### Getting help.
-If you are unsure what options a program has, you can always add `--help` to a command. This is a great way to get you started with CAT and BAT.
+$ CAT_pack contigs --help
 
+$ CAT_pack summarise --help
 ```
-$ CAT --help
 
-$ CAT contigs --help
+If you are unsure about what input files are required, you can just run CAT/BAT/RAT, as the appropriate error messages are generated if formatting is incorrect.
 
-$ CAT summarise --help
-```
 
-## Usage
+## Taxonomic annotation of contigs or MAGs with CAT and BAT
 After you have got the database files on your system, you can run CAT to annotate your contig set:
 
 ```
-$ CAT contigs -c {contigs fasta} -d {database folder} -t {taxonomy folder}
+$ CAT_pack contigs -c {contigs fasta} -d {database folder} -t {taxonomy folder}
 ```
 
 Multiple output files and a log file will be generated. The final classification files will be called `out.CAT.ORF2LCA.txt` and `out.CAT.contig2classification.txt`.
@@ -235,7 +253,7 @@ Multiple output files and a log file will be generated. The final classification
 Alternatively, if you already have a predicted proteins fasta file and/or an alignment table for example from previous runs, you can supply them to CAT, which will then skip the steps that have already been done and start from there:
 
 ```
-$ CAT contigs -c {contigs fasta} -d {database folder} -t {taxonomy folder} -p {predicted proteins fasta} -a {alignment file}
+$ CAT_pack contigs -c {contigs fasta} -d {database folder} -t {taxonomy folder} -p {predicted proteins fasta} -a {alignment file}
 ```
 
 The headers in the predicted proteins fasta file must look like this `>{contig}_{ORFnumber}`, so that CAT can couple contigs to ORFs. The alignment file must be tab-separated, with the queried ORF in the first column, protein accession number in the second, and bit-score in the 12th.
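
A quick, illustrative way to sanity-check such hand-supplied intermediate files (file names here are placeholders):

```
# Headers should follow the >{contig}_{ORFnumber} pattern
$ grep '^>' predicted_proteins.faa | head -n 3

# Column 1: queried ORF, column 2: protein accession, column 12: bit-score
$ awk -F'\t' '{print $1, $2, $12}' alignment.diamond | head -n 3
```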
@@ -243,13 +261,13 @@ The headers in the predicted proteins fasta file must look like this `>{contig}_
 To run BAT on a set of MAGs:
 
 ```
-$ CAT bins -b {bin folder} -d {database folder} -t {taxonomy folder}
+$ CAT_pack bins -b {bin folder} -d {database folder} -t {taxonomy folder}
 ```
 
 Alternatively, BAT can be run on a single MAG:
 
 ```
-$ CAT bin -b {bin fasta} -d {database folder} -t {taxonomy folder}
+$ CAT_pack bins -b {bin fasta} -d {database folder} -t {taxonomy folder}
 ```
 
 Multiple output files and a log file will be generated. The final classification files will be called `out.BAT.ORF2LCA.txt` and `out.BAT.bin2classification.txt`.
@@ -257,134 +275,128 @@ Multiple output files and a log file will be generated. The final classification
 Similarly to CAT, BAT can be run from intermediate steps if gene prediction and alignment have already been carried out once:
 
 ```
-$ CAT bins -b {bin folder} -d {database folder} -t {taxonomy folder} -p {predicted proteins fasta} -a {alignment file}
+$ CAT_pack bins -b {bin folder} -d {database folder} -t {taxonomy folder} -p {predicted proteins fasta} -a {alignment file}
 ```
 
-If BAT is run in single bin mode, you can use these predicted protein and alignment files to classify individual contigs within the MAG with CAT.
+If you have previously run CAT on the set of contigs from which the MAGs originate, you can reuse the predicted protein and alignment files to classify the MAGs:
 
 ```
-$ CAT bin -b {bin fasta} -d {database folder} -t {taxonomy folder}
+$ CAT_pack contigs -c {contigs fasta} -d {database folder} -t {taxonomy folder}
 
-$ CAT contigs -c {bin fasta} -d {database folder} -t {taxonomy folder} -p {predicted proteins fasta} -a {alignment file}
+$ CAT_pack bins -b {bin folder} -d {database folder} -t {taxonomy folder} -p {predicted proteins fasta from contig run} -a {alignment file from contig run}
 ```
+This is a great way to run both CAT and BAT on a set of MAGs without needing to do protein prediction and alignment twice!
 
-You can also do this the other way around; start with contig classification and classify the entire MAG with BAT in single bin mode based on the files generated by CAT.
-
-## Interpreting the output files
+### Interpreting the output files
 The ORF2LCA output looks like this:
 
-ORF | lineage | bit-score
---- | --- | ---
-contig\_1\_ORF1 | 1;131567;2;1783272 | 574.7
+ORF | number of hits (r: 10) | lineage | bit-score
+--- | --- | --- | ---
+contig\_1\_ORF1 | 7 | 1;131567;2;1783272 | 574.7
 
 Where the lineage is the full taxonomic lineage of the classification of the ORF, and the bit-score the top-hit bit-score that is assigned to the ORF for voting. The BAT ORF2LCA output file has an extra column where ORFs are linked to the MAG in which they are found.
 
 The contig2classification and bin2classification output looks like this:
 
-contig or bin | classification | reason | lineage | lineage scores
+contig or bin | classification | reason | lineage | lineage scores (f: 0.3)
 --- | --- | --- | --- | ---
 contig\_1 | taxid assigned | based on 14/15 ORFs | 1;131567;2;1783272 | 1.00; 1.00; 1.00; 0.78
-contig\_2 | taxid assigned (1/2) | based on 10/10 ORFs | 1;131567;2;1783272;1798711;1117;307596;307595;1890422;33071;1416614;1183438\* | 1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;0.23;0.23
+contig\_2 | taxid assigned (1/2) | based on 10/10 ORFs | 1;131567;2;1783272;1798711;1117;307596;307595;1890422;33071;1416614;1183438\* | 1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;0.23;0.23
 contig\_2 | taxid assigned (2/2) | based on 10/10 ORFs | 1;131567;2;1783272;1798711;1117;307596;307595;1890422;33071;33072 | 1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;1.00;0.77
 contig\_3 | no taxid assigned | no ORFs found
 
-Where the lineage scores represent the fraction of bit-score support for each classification. **Contig\_2 has two classifications.** This can happen if the *f* parameter is chosen below 0.5. For an explanation of the **starred classification**, see [Marking suggestive taxonomic assignments with an asterisk](#marking-suggestive-taxonomic-assignments-with-an-asterisk).
+Where the lineage scores represent the fraction of bit-score support for each classification. **contig\_2 has two classifications.** This can happen if the *f* parameter is chosen below 0.5. For an explanation of the **starred classification**, see [Marking suggestive taxonomic assignments with an asterisk](#marking-suggestive-taxonomic-assignments-with-an-asterisk).
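
As a purely illustrative example (numbers made up, not taken from the table above): if a contig has ORFs whose top-hit bit-scores sum to 1000, and the ORFs supporting a given taxon contribute 230 of those bits, that taxon receives a lineage score of 230 / 1000 = 0.23.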
 
-To add names to the taxonomy id's in either output file, run:
+To add names to the taxids in either output file, run:
 
 ```
-$ CAT add_names -i {ORF2LCA / classification file} -o {output file} -t {taxonomy folder}
+$ CAT_pack add_names -i {ORF2LCA / classification file} -o {output file} -t {taxonomy folder}
 ```
 
-This will show you that for example contig\_1 is classified as Terrabacteria group. To only get official levels (*i.e.* superkingdom, phylum, ...):
+This will show you that, for example, contig\_1 is classified as Terrabacteria group. To only get official ranks (*i.e.* superkingdom, phylum, ...):
 
 ```
-$ CAT add_names -i {ORF2LCA / classification file} -o {output file} -t {taxonomy folder} --only_official
+$ CAT_pack add_names -i {ORF2LCA / classification file} -o {output file} -t {taxonomy folder} --only_official
 ```
 
 Or, alternatively:
 
 ```
-$ CAT add_names -i {ORF2LCA / classification file} -o {output file} -t {taxonomy folder} --only_official --exclude_scores
+$ CAT_pack add_names -i {ORF2LCA / classification file} -o {output file} -t {taxonomy folder} --only_official --exclude_scores
 ```
 
 If you have named a CAT or BAT classification file with official names, you can get a summary of the classification, where total length and number of ORFs supporting a taxon are calculated for contigs, and the number of MAGs per encountered taxon for MAG classification:
 
 ```
-$ CAT summarise -c {contigs fasta} -i {named CAT classification file} -o {output file}
+$ CAT_pack summarise -c {contigs fasta} -i {named CAT classification file} -o {output file}
 
-$ CAT summarise -i {named BAT classification file} -o {output file}
+$ CAT_pack summarise -i {named BAT classification file} -o {output file}
 ```
 
-CAT summarise currently does not support classification files wherein some contigs / MAGs have multiple classifications (as contig\_2 above).
+`CAT_pack summarise` currently does not support classification files wherein some contigs / MAGs have multiple classifications (as contig\_2 above).
 
-## Marking suggestive taxonomic assignments with an asterisk
+### Marking suggestive taxonomic assignments with an asterisk
 When we want to confidently go down to the lowest taxonomic level possible for a classification, an important assumption is that on that level conflict between classifications could have arisen. Namely, if there were conflicting classifications, the algorithm would have made the classification more conservative by moving up a level. Since it did not, we can trust the low-level classification. However, it is not always possible for conflict to arise, because in some cases no other sequences from the clade are present in the database. This is true for example for the family Dehalococcoidaceae, which in our databases is the sole representative of the order Dehalococcoidales. Thus, here we cannot confidently state that a classification on the family level is more correct than a classification on the order level. For these cases, CAT and BAT mark the lineage with asterisks, starting from the lowest level classification up to the level where conflict could have arisen because the clade contains multiple taxa with database entries. The user is advised to examine starred taxa more carefully, for example by analysing sequence identity between predicted ORFs and hits, or to move up the lineage to a confident classification (i.e. the first classification without an asterisk).
 
 If you do not want the asterisks in your output files, you can add the `--no_stars` flag to CAT or BAT.
 
-## Optimising running time, RAM, and disk usage
+### Optimising running time, RAM, and disk usage
 CAT and BAT may take a while to run, and may use quite a lot of RAM and disk space. Depending on what you value most, you can tune CAT and BAT to maximize one and minimize others. The classification algorithm itself is fast and is friendly on memory and disk space. The most expensive step is alignment with DIAMOND, hence tuning alignment parameters will have the highest impact:
 
 - The `-n / --nproc` argument allows you to choose the number of cores to deploy.
 - You can choose to run DIAMOND in sensitive mode with the `--sensitive` flag. This will increase sensitivity but will make alignment considerably slower.
 - Setting the `--block_size` parameter lower will decrease memory and temporary disk space usage. Setting it higher will increase performance.
-- For high memory machines, it is adviced to set `--index_chunks` to 1. This parameter has no effect on temprary disk space usage.
+- For high-memory machines, it is advised to set `--index_chunks` to 1 (currently the default). This parameter has no effect on temporary disk space usage.
 - You can specify the location of temporary DIAMOND files with the `--tmpdir` argument.
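
As a hedged illustration combining the options above (paths, core count, and block size are arbitrary examples):

```
$ CAT_pack contigs -c contigs.fasta -d db/ -t tax/ -n 32 --sensitive --block_size 4 --index_chunks 1 --tmpdir /scratch/diamond_tmp
```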
-- You can set the DIAMOND --top parameter (see below).
 
-### Setting the DIAMOND --top parameter
-You can speed up DIAMOND considerably, and at the same time greatly reduce disk usage, by setting the DIAMOND `--top` parameter to lower values. This will govern hits within range of the best hit that are written to the alignment file.
-
-You have to be very carefull to 1) not confuse this parameter with the `r / --range` parameter, which does a similar cut-off but *after* alignment and 2) be aware that if you want to run CAT or BAT again afterwards with different values of the `-r / --range` parameter, your options will be limited to the range you have chosen with `--top` earlier, because all hits that fall outside this range will not be included in the alignment file. **Importantly**, CAT and BAT currently do not warn you if you choose `-r / --range` in a second run higher than `--top` in a previous one, **so it's up to you to remember this!**
-
-If you have understood all this, or you do not plan to tune `-r / --range` at all afterwards, you can add the `--I_know_what_Im_doing` flag and enjoy a huge speedup with much smaller alignment files! For CAT you can for example set `--top 11` and for BAT `--top 6`.
-
-## Examples
+### Examples
 Getting help for running the prepare utility:
 
 ```
-$ CAT prepare --help
+$ CAT_pack prepare --help
 ```
 
-First, create a fresh database. Next, run CAT on a contig set with default parameter settings deploying 16 cores for DIAMOND alignment. Finally, name the contig classification output with official names, and create a summary:
+Run CAT on a contig set with default parameter settings, deploying 16 cores for DIAMOND alignment. Name the contig classification output with official names, and create a summary:
 
 ```
-$ CAT prepare --fresh -d CAT_database/ -t CAT_taxonomy/
 
-$ CAT contigs -c contigs.fasta -d CAT_database/ -t CAT_taxonomy/ -n 16 --out_prefix first_CAT_run
+$ CAT_pack contigs -c contigs.fasta -d db/ -t tax/ -n 16 --out_prefix first_CAT_run
 
-$ CAT add_names -i first_CAT_run.contig2classification.txt -o first_CAT_run.contig2classification.official_names.txt -t CAT_taxonomy/ --only_official
+$ CAT_pack add_names -i first_CAT_run.contig2classification.txt -o first_CAT_run.contig2classification.official_names.txt -t tax/ --only_official
 
-$ CAT summarise -c contigs.fasta -i first_CAT_run.contig2classification.official_names.txt -o CAT_first_run.summary.txt
+$ CAT_pack summarise -c contigs.fasta -i first_CAT_run.contig2classification.official_names.txt -o CAT_first_run.summary.txt
 ```
 
-Run the classification algorithm again with custom parameter settings, and name the contig classification output with all names in the lineage, excluding the scores:
+Run BAT on the set of MAGs that were binned from these contigs, reusing the protein predictions and DIAMOND alignment file generated previously during the contig classification:
 
 ```
-$ CAT contigs --range 5 --fraction 0.1 -c contigs.fasta -d CAT_database/ -t CAT_taxonomy/ -p first_CAT_run.predicted_proteins.fasta -a first_CAT_run.alignment.diamond -o second_CAT_run
-
-$ CAT add_names -i second_CAT_run.contig2classification.txt -o second_CAT_run.contig2classification.names.txt -t CAT_taxonomy/ --exclude_scores
+$ CAT_pack bins -b bins/ -d db/ -t tax/ -p first_CAT_run.predicted_proteins.faa -a first_CAT_run.alignment.diamond -o first_BAT_run
 ```
 
-First, run BAT on a set of MAGs with custom parameter settings, suppressing verbosity and not writing a log file. Next, add names to the ORF2LCA output file:
+Run the contig classification algorithm again with custom parameter settings, and name the output with all names in the lineage, excluding the scores:
 
 ```
-$ CAT bins -r 10 -f 0.1 -b ../bins/ -s .fa -d CAT_database/ -t CAT_taxonomy/ -o BAT_run --quiet --no_log
+$ CAT_pack contigs --range 5 --fraction 0.1 -c contigs.fasta -d db/ -t tax/ -p first_CAT_run.predicted_proteins.faa -a first_CAT_run.alignment.diamond -o second_CAT_run
+
+$ CAT_pack add_names -i second_CAT_run.contig2classification.txt -o second_CAT_run.contig2classification.names.txt -t tax/ --exclude_scores
 
-$ CAT add_names -i BAT_run.ORF2LCA.txt -o BAT_run.ORF2LCA.names.txt -t CAT_taxonomy/
 ```
 
-### Identifying contamination/mis-binned contigs within a MAG.
+Run BAT on the set of MAGs with custom parameter settings, suppressing verbosity and not writing a log file. Next, add names to the ORF2LCA output file:
+
+```
+$ CAT_pack bins -r 3 -f 0.1 -b bins/ -s .fa -d db/ -t tax/ -p first_CAT_run.predicted_proteins.faa -a first_CAT_run.alignment.diamond -o second_BAT_run --quiet --no_log
 
-We often use the combination of CAT/BAT to explore possible contamination within a MAG.
+$ CAT_pack add_names -i second_BAT_run.ORF2LCA.txt -o second_BAT_run.ORF2LCA.names.txt -t tax/
+```
 
-Run BAT on a single MAG. Next, classify the contigs within the MAG individually without generating new protein files or DIAMOND alignments.
+#### Identifying contamination/mis-binned contigs within a MAG
+We often use the combination of CAT and BAT to explore possible contamination within a MAG.
 
 ```
-$ CAT bin -b ../bins/interesting_MAG.fasta -d CAT_database/ -t CAT_taxonomy/ -o BAT.interesting_MAG
+$ CAT_pack contigs -c ../bins/interesting_MAG.fasta -d db/ -t tax/ -o CAT.interesting_MAG
 
-$ CAT contigs -c ../bins/interesting_MAG.fasta -d CAT_database/ -t CAT_taxonomy/ -p BAT.interesting_MAG.predicted_proteins.faa -a BAT.interesting_MAG.alignment.diamond -o CAT.interesting_MAG
+$ CAT_pack bins -b ../bins/interesting_MAG.fasta -d db/ -t tax/ -p CAT.interesting_MAG.predicted_proteins.faa -a CAT.interesting_MAG.alignment.diamond -o BAT.interesting_MAG
 ```
 
 Contigs that have a different taxonomic signal than the MAG classification are probably contamination.
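
To inspect the diverging contigs, a possible follow-up (file names follow the `-o` prefixes used above; the taxon to filter on is just an example) is to name the CAT output and scan it:

```
$ CAT_pack add_names -i CAT.interesting_MAG.contig2classification.txt -o CAT.interesting_MAG.contig2classification.names.txt -t tax/ --only_official

$ grep -v 'Bacteroidetes' CAT.interesting_MAG.contig2classification.names.txt | head
```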
@@ -392,9 +404,71 @@ Contigs that have a different taxonomic signal than the MAG classification are p
 Alternatively, you can look at contamination from the MAG perspective, by setting the *f* parameter to a low value:
 
 ```
-$ CAT bin -f 0.01 -b ../bins/interesting_MAG.fasta -d CAT_database/ -t CAT_taxonomy/ -o BAT.interesting_MAG
+$ CAT_pack bins -f 0.01 -b ../bins/interesting_MAG.fasta -d db/ -t tax/ -o BAT.interesting_MAG
 
-$ CAT add_names -i BAT.interesting_MAG.bin2classification.txt -o BAT.interesting_MAG.bin2classification.names.txt -t CAT_taxonomy/
+$ CAT_pack add_names -i BAT.interesting_MAG.bin2classification.txt -o BAT.interesting_MAG.bin2classification.names.txt -t tax/
 ```
 
 BAT will output any taxonomic signal with at least 1% support. Low scoring diverging signals are clear signs of contamination!
+
+
+## Estimating the microbial composition with RAT
+RAT estimates the taxonomic composition of metagenomes by integrating taxonomic signals from MAGs, contigs, and reads. RAT has been part of the CAT pack since version 6.0.
+To use RAT, you need the CAT pack database files (see [Getting started](#getting-started) for more information).
+
+RAT makes an integrated profile using MAGs/bins, contigs, and reads. To specify which elements should be integrated, use the `--mode` argument. Possible letters for `--mode` are `m` (for MAGs), `c` (for contigs), and `r` (for reads). All combinations of the three letters are possible, except `r` alone.
+To run RAT's complete workflow, specify the mode, read files, contig files, bin folder, and database files:
+
+```
+$ CAT_pack reads --mode mcr -b bin_folder/ -c contigs.fasta -1 forward_reads.fq.gz -2 reverse_reads.fq.gz -d db/ -t tax/
+```
+
+RAT currently supports single read files as well as paired-end read files; interlaced read files are not supported. RAT will run CAT and BAT on the contigs and MAGs, map the reads back to the contigs, and then try to annotate any unmapped reads separately.
+If you already have a sorted mapping file, you can supply it and RAT will skip the mapping step:
+
+```
+
+$ CAT_pack reads --mode mcr -b bin_folder/ -c contigs.fasta --bam1 mapping_file_sorted.bam -1 forward_reads.fq.gz -2 reverse_reads.fq.gz -d db/ -t tax/
+```
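
If you still need to produce such a sorted mapping file, a minimal sketch with BWA and SAMtools (the RAT dependencies listed above; thread counts and file names are arbitrary) could be:

```
$ bwa index contigs.fasta

$ bwa mem -t 16 contigs.fasta forward_reads.fq.gz reverse_reads.fq.gz | samtools sort -@ 16 -o mapping_file_sorted.bam -

$ samtools index mapping_file_sorted.bam
```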
+
+If CAT and/or BAT have already been run on your data, you can supply the output files to RAT to skip the CAT and BAT runs:
+
+```
+
+$ CAT_pack reads --mode mcr -b bin_folder/ -c contigs.fasta -1 forward_reads.fq.gz -2 reverse_reads.fq.gz -d db/ -t tax/ --c2c CAT_contig2classification_file.txt --b2c BAT_bin2classification_file.txt
+```
+
+Similarly, if a previous RAT run crashed after the unmapped reads have already been aligned to the database with DIAMOND, you can supply the intermediate files to continue the run:
+
+```
+
+$ CAT_pack reads --mode mcr -b bin_folder/ -c contigs.fasta -1 forward_reads.fq.gz -2 reverse_reads.fq.gz -d db/ -t tax/ --c2c CAT_contig2classification_file.txt --b2c BAT_bin2classification_file.txt --alignment_unmapped unmapped_alignment_file.diamond
+```
+
+After a RAT run is finished, you can run `add_names` on the abundance files (only for RAT runs with the nr database):
+
+```
+
+$ CAT_pack add_names -i RAT.complete_abundance_file.txt -o RAT.complete_abundance_file_with_names.txt -t tax/
+```
+
+Similar to CAT and BAT, the paths to all dependencies can be supplied via an argument:
+
+```
+
+$ CAT_pack reads --mode mcr -b bin_folder/ -c contigs.fasta -1 forward_reads.fq.gz -2 reverse_reads.fq.gz -d db/ -t tax/ --path_to_samtools /path/to/samtools
+```
+
+### Output files
+The RAT output consists of:
+
+- A log file.
+- All CAT output files for the contig fasta.
+- All BAT output files for the MAGs (except DIAMOND alignment and protein fasta).
+- A table that contains the abundance of each MAG.
+- A table that contains all detected taxa and their abundance in the sample.
+- A table that contains the lineage for each read, as well as the step in which the annotation was made (optional when `r` is not included in `--mode`).
+- A table that contains the abundance of each contig in the contig fasta.
+- A fasta containing the sequences of all unmapped reads and contigs that could not be annotated by CAT.
+- The DIAMOND alignment of unmapped reads and unannotated contigs.
+- A table that contains the annotations for unmapped reads and (previously) unannotated contigs.


=====================================
debian/changelog
=====================================
@@ -1,3 +1,16 @@
+cat-bat (6.0.1-1) unstable; urgency=medium
+
+  * Team Upload.
+  * New upstream version 6.0.1
+  * Update links to match CAT entry point
+  * Update manpage as per updated binary name
+  * Drop patches; fixed upstream
+  * Bump Standards-Version to 4.7.2 (no changes needed)
+  * Drop Redundant "Rules-Requires-Root: no"
+  * Drop myself from uploaders
+
+ -- Nilesh Patra <nilesh at debian.org>  Sat, 01 Nov 2025 22:08:51 +0530
+
 cat-bat (5.3-2) unstable; urgency=medium
 
   * Team upload.


=====================================
debian/control
=====================================
@@ -1,17 +1,16 @@
 Source: cat-bat
 Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>, Nilesh Patra <nilesh at debian.org>
+Uploaders: Andreas Tille <tille at debian.org>
 Section: science
 Priority: optional
 Build-Depends: debhelper-compat (= 13),
                dh-sequence-python3,
                python3,
                diamond-aligner
-Standards-Version: 4.6.2
+Standards-Version: 4.7.2
 Vcs-Browser: https://salsa.debian.org/med-team/cat-bat
 Vcs-Git: https://salsa.debian.org/med-team/cat-bat.git
 Homepage: https://github.com/dutilh/CAT
-Rules-Requires-Root: no
 
 Package: cat-bat
 Architecture: any


=====================================
debian/createmanpages
=====================================
@@ -1,19 +1,14 @@
 #!/bin/sh
-MANDIR=debian
+MANDIR=debian/man
 mkdir -p $MANDIR
 
 VERSION=`dpkg-parsechangelog | awk '/^Version:/ {print $2}' | sed -e 's/^[0-9]*://' -e 's/-.*//' -e 's/[+~]dfsg$//'`
 NAME=`grep "^Description:" debian/control | sed 's/^Description: *//' | head -n1`
 PROGNAME=`grep "^Package:" debian/control | sed 's/^Package: *//' | head -n1`
 
-AUTHOR=".SH AUTHOR\n \
-This manpage was written by $DEBFULLNAME for the Debian distribution and\n \
-can be used for any other usage of the program.\
-"
-
 # If program name is different from package name or title should be
 # different from package short description change this here
-progname=CAT
+progname=CAT_pack
 help2man --no-info --no-discard-stderr --help-option=" -h" \
          --name="$NAME" \
             --version-string="$VERSION" ${progname} > $MANDIR/${progname}.1


=====================================
debian/links
=====================================
@@ -1,2 +1,2 @@
-usr/share/cat-bat/CAT	usr/bin/CAT
-
+usr/share/cat-bat/CAT_pack	usr/bin/CAT_pack
+usr/share/cat-bat/CAT_pack	usr/bin/CAT


=====================================
debian/man/CAT.1
=====================================
@@ -0,0 +1,4 @@
+.\" DO NOT MODIFY THIS FILE!  It was generated by help2man 1.47.15.
+.TH CAT "1" "May 2025" "CAT 6.0.1" "User Commands"
+.SH NAME
+CAT \- this is essentially a symlink to CAT_pack starting with version 6.0.1. Please refer to the manpage of CAT_pack.


=====================================
debian/CAT.1 → debian/man/CAT_pack.1
=====================================
@@ -1,29 +1,30 @@
-.\" DO NOT MODIFY THIS FILE!  It was generated by help2man 1.47.15.
-.TH CAT "1" "May 2020" "CAT 5.0.4" "User Commands"
+.\" DO NOT MODIFY THIS FILE!  It was generated by help2man 1.49.3.
+.TH CAT_PACK "1" "November 2025" "CAT_pack 6.0.1" "User Commands"
 .SH NAME
-CAT \- tool for taxonomic classification of contigs and metagenome-assembled genomes (MAGs)
-.SH SYNOPSIS
-.B CAT
-(prepare | contigs | bin | bins | add_names | summarise) [\-v / \fB\-\-version]\fR [\-h / \fB\-\-help]\fR
+CAT_pack \- taxonomic classification of contigs and metagenome-assembled genomes (MAGs)
 .SH DESCRIPTION
-Run Contig Annotation Tool (CAT) or Bin Annotation Tool (BAT).
-.SH OPTIONS
+usage: CAT_pack (download | prepare | contigs | bins | reads | add_names | summarise) [\-v / \fB\-\-version]\fR [\-h / \fB\-\-help]\fR
+.PP
+Run Contig Annotation Tool (CAT), Bin Annotation Tool (BAT), or Read Annotation Tool (RAT).
 .SS "Required choice:"
 .TP
+download
+Download and preprocess data from NCBI nr or GTDB.
+.TP
 prepare
-Download database files and construct databases.
+Construct database files.
 .TP
 contigs
 Run CAT.
 .TP
-bin
-Run BAT on a single bin.
-.TP
 bins
-Run BAT on a set of bins.
+Run BAT.
+.TP
+reads
+Run RAT.
 .TP
 add_names
-Add taxonomic names to CAT or BAT output files.
+Add taxonomic names to CAT, BAT, or RAT output files.
 .TP
 summarise
 Summarise a named CAT or BAT classification file.
@@ -34,6 +35,4 @@ Print version information and exit.
 .TP
 \fB\-h\fR, \fB\-\-help\fR
 Show this help message and exit.
-.SH AUTHOR
- This manpage was written by Andreas Tille for the Debian distribution and
- can be used for any other usage of the program.
+


=====================================
debian/manpages
=====================================
@@ -1 +1 @@
-debian/*.1
+debian/man/*.1


=====================================
debian/patches/fix_interpreter.patch deleted
=====================================
@@ -1,13 +0,0 @@
-Author: Andreas Tille <tille at debian.org>
-Last-Update: Mon, 18 May 2020 17:05:20 +0200
-Description: Fix typo in interpreter line
-Forwarded: https://github.com/dutilh/CAT/issues/103
-
---- a/CAT_pack/check.py
-+++ b/CAT_pack/check.py
-@@ -1,4 +1,4 @@
-- #!/usr/bin/env/ python3
-+#!/usr/bin/env python3
- 
- import hashlib
- import os


=====================================
debian/patches/series deleted
=====================================
@@ -1 +0,0 @@
-fix_interpreter.patch


=====================================
tests/data/PATRIC.weighted_mean_genome_size.txt
=====================================
The diff for this file was not included because it is too large.


View it on GitLab: https://salsa.debian.org/med-team/cat-bat/-/compare/32e95cbf3aa9e46ca6962fa9e6ae5ff29265d854...a7a747ddb5e406a847b63ac13b230a9793a69000

-- 
You're receiving this email because of your account on salsa.debian.org.




More information about the debian-med-commit mailing list