[med-svn] [Git][med-team/cat-bat][upstream] New upstream version 5.2.2
Nilesh Patra
gitlab at salsa.debian.org
Mon Jan 18 14:18:35 GMT 2021
Nilesh Patra pushed to branch upstream at Debian Med / cat-bat
Commits:
aa8acaa0 by Nilesh Patra at 2021-01-18T19:46:02+05:30
New upstream version 5.2.2
- - - - -
12 changed files:
- CAT_pack/CAT
- CAT_pack/about.py
- CAT_pack/add_names.py
- CAT_pack/bins.py
- CAT_pack/check.py
- CAT_pack/contigs.py
- CAT_pack/prepare.py
- CAT_pack/shared.py
- CAT_pack/single_bin.py
- CAT_pack/summarise.py
- CHANGELOG.md
- README.md
Changes:
=====================================
CAT_pack/CAT
=====================================
@@ -39,7 +39,7 @@ def help():
'Run Contig Annotation Tool (CAT) or '
'Bin Annotation Tool (BAT).\n\n'
'Required choice:\n'
- ' prepare\t\tDownload database files and construct databases.\n'
+ ' prepare\t\tConstruct database files.\n'
' contigs\t\tRun CAT.\n'
' bin\t\t\tRun BAT on a single bin.\n'
' bins\t\t\tRun BAT on a set of bins.\n'
=====================================
CAT_pack/about.py
=====================================
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
__author__ = 'F. A. Bastiaan von Meijenfeldt'
-__version__ = '5.2.1'
-__date__ = '6 January, 2021'
+__version__ = '5.2.2'
+__date__ = '15 January, 2021'
=====================================
CAT_pack/add_names.py
=====================================
@@ -17,70 +17,18 @@ def parse_arguments():
add_help=False)
required = parser.add_argument_group('Required arguments')
-
- required.add_argument(
- '-i',
- '--input_file',
- dest='input_file',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help=('Path to input file. Can be either classification output '
- 'file or ORF2LCA output file.'))
- required.add_argument(
- '-o',
- '--output_file',
- dest='output_file',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help='Path to output file.')
- required.add_argument(
- '-t',
- '--taxonomy_folder',
- dest='taxonomy_folder',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help='Path to folder that contains taxonomy files.')
-
+ shared.add_argument(required, 'input_file', True,
+ help_=('Path to input file. Can be classification or ORF2LCA '
+ 'output file from CAT or BAT.'))
+ shared.add_argument(required, 'output_file', True)
+ shared.add_argument(required, 'taxonomy_folder', True)
+
optional = parser.add_argument_group('Optional arguments')
-
- optional.add_argument(
- '--only_official',
- dest='only_official',
- required=False,
- action='store_true',
- help=('Only output official rank names (i.e., superkingdom, '
- 'phylum, class, order, family, genus, species).'))
- optional.add_argument(
- '--exclude_scores',
- dest='exclude_scores',
- required=False,
- action='store_true',
- help=('Do not include bit-score support scores in the lineage of '
- 'a classification output file.'))
- optional.add_argument(
- '--force',
- dest='force',
- required=False,
- action='store_true',
- help='Force overwrite existing files.')
- optional.add_argument(
- '-q',
- '--quiet',
- dest='quiet',
- required=False,
- action='store_true',
- help='Suppress verbosity.')
- optional.add_argument(
- '-h',
- '--help',
- action='help',
- help='Show this help message and exit.')
+ shared.add_argument(optional, 'only_official', False)
+ shared.add_argument(optional, 'exclude_scores', False)
+ shared.add_argument(optional, 'force', False)
+ shared.add_argument(optional, 'quiet', False)
+ shared.add_argument(optional, 'help', False)
(args, extra_args) = parser.parse_known_args()
@@ -113,6 +61,10 @@ def run():
check.check_output_file(
args.output_file, args.log_file, args.quiet))
+ errors.append(
+ check.check_in_and_output_file(
+ args.input_file, args.output_file, args.log_file, args.quiet))
+
if True in errors:
sys.exit(1)
=====================================
CAT_pack/bins.py
=====================================
@@ -18,230 +18,43 @@ def parse_arguments():
description='Run Bin Annotation Tool (BAT) on a set of bins.',
usage='CAT bins -b -d -t [options] [-h / --help]',
add_help=False)
-
- required = parser.add_argument_group('Required arguments')
- required.add_argument(
- '-b',
- '--bin_folder',
- dest='bin_folder',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help='Path to directory containing bins.')
- required.add_argument(
- '-d',
- '--database_folder',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help='Path to folder that contains database files.')
- required.add_argument(
- '-t',
- '--taxonomy_folder',
- dest='taxonomy_folder',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help='Path to folder that contains taxonomy files.')
+ required = parser.add_argument_group('Required arguments')
+ shared.add_argument(required, 'bin_folder', True)
+ shared.add_argument(required, 'database_folder', True)
+ shared.add_argument(required, 'taxonomy_folder', True)
optional = parser.add_argument_group('Optional arguments')
-
- optional.add_argument(
- '-s',
- '--bin_suffix',
- dest='bin_suffix',
- metavar='',
- required=False,
- type=str,
- default='.fna',
- help='Suffix of bins in bin folder (default: .fna).')
- optional.add_argument(
- '-r',
- '--range',
- dest='r',
- metavar='',
- required=False,
- type=float,
- choices = [i for i in range(50)],
- action=shared.DecimalAction,
- default=decimal.Decimal(5),
- help='r parameter [0-49] (default: 5).')
- optional.add_argument(
- '-f',
- '--fraction',
- dest='f',
- metavar='',
- required=False,
- type=float,
- choices = [i / 100 for i in range(0, 100)],
- action=shared.DecimalAction,
- default=decimal.Decimal(0.3),
- help='f parameter [0-0.99] (default: 0.3).')
- optional.add_argument(
- '-o',
- '--out_prefix',
- dest='out_prefix',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- default='./out.BAT',
- help='Prefix for output files (default: out.BAT).')
- optional.add_argument(
- '-p',
- '--proteins_fasta',
- dest='proteins_fasta',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- help=('Path to concatenated predicted proteins fasta file '
- 'generated during an earlier run of BAT. If supplied, BAT '
- 'will skip the protein prediction step.'))
- optional.add_argument(
- '-a',
- '--diamond_alignment',
- dest='alignment_file',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- help=('Path to alignment table generated during an earlier run of '
- 'BAT. If supplied, BAT will skip the alignment step and '
- 'directly classify the bins. A concatenated predicted '
- 'proteins fasta file should also be supplied with argument '
- '[-p / --proteins].'))
- optional.add_argument(
- '--path_to_prodigal',
- dest='path_to_prodigal',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- default='prodigal',
- help=('Path to Prodigal binaries. Supply if BAT can not find '
- 'Prodigal.'))
- optional.add_argument(
- '--path_to_diamond',
- dest='path_to_diamond',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- default='diamond',
- help=('Path to DIAMOND binaries. Supply if BAT can not find '
- 'DIAMOND.'))
- optional.add_argument(
- '--no_stars',
- dest='no_stars',
- required=False,
- action='store_true',
- help='Suppress marking of suggestive taxonomic assignments.')
- optional.add_argument(
- '--force',
- dest='force',
- required=False,
- action='store_true',
- help='Force overwrite existing files.')
- optional.add_argument(
- '-q',
- '--quiet',
- dest='quiet',
- required=False,
- action='store_true',
- help='Suppress verbosity.')
- optional.add_argument(
- '--verbose',
- dest='verbose',
- required=False,
- action='store_true',
- help='Increase verbosity.')
- optional.add_argument(
- '--no_log',
- dest='no_log',
- required=False,
- action='store_true',
- help='Suppress log file.')
- optional.add_argument(
- '-h',
- '--help',
- action='help',
- help='Show this help message and exit.')
- optional.add_argument(
- '--I_know_what_Im_doing',
- dest='IkwId',
- required=False,
- action='store_true',
- help='Flag for experimental features.')
-
+ shared.add_argument(optional, 'bin_suffix', False, default='.fna')
+ shared.add_argument(optional, 'r', False, default=decimal.Decimal(5))
+ shared.add_argument(optional, 'f', False, default=decimal.Decimal(0.3))
+ shared.add_argument(optional, 'out_prefix', False, default='./out.BAT')
+ shared.add_argument(optional, 'proteins_fasta', False,
+ help_=(
+ 'Path to concatenated predicted proteins fasta file '
+ 'generated during an earlier run of BAT on the same bins. If '
+ 'supplied, BAT will skip the protein prediction step.'))
+ shared.add_argument(optional, 'alignment_file', False,
+ help_=(
+ 'Path to alignment table generated during an earlier run of '
+ 'BAT on the same bins. If supplied, BAT will skip the '
+ 'alignment step and directly classify the bins. A '
+ 'concatenated predicted proteins fasta file should also be '
+ 'supplied with argument [-p / --proteins].'))
+ shared.add_argument(optional, 'path_to_prodigal', False,
+ default='prodigal')
+ shared.add_argument(optional, 'path_to_diamond', False, default='diamond')
+ shared.add_argument(optional, 'no_stars', False)
+ shared.add_argument(optional, 'force', False)
+ shared.add_argument(optional, 'quiet', False)
+ shared.add_argument(optional, 'verbose', False)
+ shared.add_argument(optional, 'no_log', False)
+ shared.add_argument(optional, 'help', False)
+ shared.add_argument(optional, 'IkwId', False)
+
specific = parser.add_argument_group('DIAMOND specific optional arguments')
-
- specific.add_argument(
- '-n',
- '--nproc',
- dest='nproc',
- metavar='',
- required=False,
- type=int,
- default=multiprocessing.cpu_count(),
- help='Number of cores to deploy by DIAMOND (default: maximum).')
- specific.add_argument(
- '--sensitive',
- dest='sensitive',
- required=False,
- action='store_true',
- help='Run DIAMOND in sensitive mode (default: not enabled).')
- specific.add_argument(
- '--block_size',
- dest='block_size',
- metavar='',
- required=False,
- type=float,
- default=2.0,
- help=('DIAMOND block-size parameter (default: 2.0). Lower numbers '
- 'will decrease memory and temporary disk space usage.'))
- specific.add_argument(
- '--index_chunks',
- dest='index_chunks',
- metavar='',
- required=False,
- type=int,
- default=4,
- help=('DIAMOND index-chunks parameter (default: 4). Set to 1 on '
- 'high memory machines. The parameter has no effect on '
- 'temporary disk space usage.'))
- specific.add_argument(
- '--tmpdir',
- dest='tmpdir',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- help=('Directory for temporary DIAMOND files (default: directory '
- 'to which output files are written).'))
- specific.add_argument(
- '--compress',
- dest='compress',
- required=False,
- action='store_true',
- help='Compress DIAMOND alignment file.')
- specific.add_argument(
- '--top',
- dest='top',
- metavar='',
- required=False,
- type=float,
- choices = [i for i in range(51)],
- default=50,
- help=('DIAMOND top parameter [0-50] (default: 50). Governs hits '
- 'within range of best hit that are written to the alignment '
- 'file. This is not the [-r / --range] parameter! Can only be '
- 'set with the [--I_know_what_Im_doing] flag, see README.md.'))
-
+ shared.add_all_diamond_arguments(specific)
+
(args, extra_args) = parser.parse_known_args()
extra_args = [arg for (i, arg) in enumerate(extra_args) if
=====================================
CAT_pack/check.py
=====================================
@@ -276,6 +276,18 @@ def check_input_file(input_file, log_file, quiet):
return error
+def check_in_and_output_file(input_file, output_file, log_file, quiet):
+ error = False
+
+ if input_file == output_file:
+ message = 'input file and output file can not be the same.'
+ shared.give_user_feedback(message, log_file, quiet, error=True)
+
+ error = True
+
+ return error
+
+
def check_top(top, r, log_file, quiet):
error = False
=====================================
CAT_pack/contigs.py
=====================================
@@ -2,7 +2,6 @@
import argparse
import decimal
-import multiprocessing
import sys
import about
@@ -19,218 +18,30 @@ def parse_arguments():
add_help=False)
required = parser.add_argument_group('Required arguments')
-
- required.add_argument(
- '-c',
- '--contigs_fasta',
- dest='contigs_fasta',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help='Path to contigs fasta file.')
- required.add_argument(
- '-d',
- '--database_folder',
- dest='database_folder',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help='Path to folder that contains database files.')
- required.add_argument(
- '-t',
- '--taxonomy_folder',
- dest='taxonomy_folder',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help='Path to folder that contains taxonomy files.')
-
+ shared.add_argument(required, 'contigs_fasta', True)
+ shared.add_argument(required, 'database_folder', True)
+ shared.add_argument(required, 'taxonomy_folder', True)
+
optional = parser.add_argument_group('Optional arguments')
-
- optional.add_argument(
- '-r',
- '--range',
- dest='r',
- metavar='',
- required=False,
- type=float,
- choices = [i for i in range(50)],
- action=shared.DecimalAction,
- default=decimal.Decimal(10),
- help='r parameter [0-49] (default: 10).')
- optional.add_argument(
- '-f',
- '--fraction',
- dest='f',
- metavar='',
- required=False,
- type=float,
- choices = [i / 100 for i in range(0, 100)],
- action=shared.DecimalAction,
- default=decimal.Decimal(0.5),
- help='f parameter [0-0.99] (default: 0.5).')
- optional.add_argument(
- '-o',
- '--out_prefix',
- dest='out_prefix',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- default='./out.CAT',
- help='Prefix for output files (default: out.CAT).')
- optional.add_argument(
- '-p',
- '--proteins_fasta',
- dest='proteins_fasta',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- help=('Path to predicted proteins fasta file. If supplied, CAT '
- 'will skip the protein prediction step.'))
- optional.add_argument(
- '-a',
- '--diamond_alignment',
- dest='alignment_file',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- help=('Path to alignment table. If supplied, CAT will skip the '
- 'alignment step and directly classify the contigs. A '
- 'predicted proteins fasta file should also be supplied with '
- 'argument [-p / --proteins].'))
- optional.add_argument(
- '--path_to_prodigal',
- dest='path_to_prodigal',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- default='prodigal',
- help=('Path to Prodigal binaries. Supply if CAT can not find '
- 'Prodigal.'))
- optional.add_argument(
- '--path_to_diamond',
- dest='path_to_diamond',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- default='diamond',
- help=('Path to DIAMOND binaries. Supply if CAT can not find '
- 'DIAMOND.'))
- optional.add_argument(
- '--no_stars',
- dest='no_stars',
- required=False,
- action='store_true',
- help='Suppress marking of suggestive taxonomic assignments.')
- optional.add_argument(
- '--force',
- dest='force',
- required=False,
- action='store_true',
- help='Force overwrite existing files.')
- optional.add_argument(
- '-q',
- '--quiet',
- dest='quiet',
- required=False,
- action='store_true',
- help='Suppress verbosity.')
- optional.add_argument(
- '--verbose',
- dest='verbose',
- required=False,
- action='store_true',
- help='Increase verbosity.')
- optional.add_argument(
- '--no_log',
- dest='no_log',
- required=False,
- action='store_true',
- help='Suppress log file.')
- optional.add_argument(
- '-h',
- '--help',
- action='help',
- help='Show this help message and exit.')
- optional.add_argument(
- '--I_know_what_Im_doing',
- dest='IkwId',
- required=False,
- action='store_true',
- help='Flag for experimental features.')
+ shared.add_argument(optional, 'r', False, default=decimal.Decimal(10))
+ shared.add_argument(optional, 'f', False, default=decimal.Decimal(0.5))
+ shared.add_argument(optional, 'out_prefix', False, default='./out.CAT')
+ shared.add_argument(optional, 'proteins_fasta', False)
+ shared.add_argument(optional, 'alignment_file', False)
+ shared.add_argument(optional, 'path_to_prodigal', False,
+ default='prodigal')
+ shared.add_argument(optional, 'path_to_diamond', False, default='diamond')
+ shared.add_argument(optional, 'no_stars', False)
+ shared.add_argument(optional, 'force', False)
+ shared.add_argument(optional, 'quiet', False)
+ shared.add_argument(optional, 'verbose', False)
+ shared.add_argument(optional, 'no_log', False)
+ shared.add_argument(optional, 'help', False)
+ shared.add_argument(optional, 'IkwId', False)
specific = parser.add_argument_group('DIAMOND specific optional arguments')
+ shared.add_all_diamond_arguments(specific)
- specific.add_argument(
- '-n',
- '--nproc',
- dest='nproc',
- metavar='',
- required=False,
- type=int,
- default=multiprocessing.cpu_count(),
- help='Number of cores to deploy by DIAMOND (default: maximum).')
- specific.add_argument(
- '--sensitive',
- dest='sensitive',
- required=False,
- action='store_true',
- help='Run DIAMOND in sensitive mode (default: not enabled).')
- specific.add_argument(
- '--block_size',
- dest='block_size',
- metavar='',
- required=False,
- type=float,
- default=2.0,
- help=('DIAMOND block-size parameter (default: 2.0). Lower numbers '
- 'will decrease memory and temporary disk space usage.'))
- specific.add_argument(
- '--index_chunks',
- dest='index_chunks',
- metavar='',
- required=False,
- type=int,
- default=4,
- help=('DIAMOND index-chunks parameter (default: 4). Set to 1 on '
- 'high memory machines. The parameter has no effect on '
- 'temporary disk space usage.'))
- specific.add_argument(
- '--tmpdir',
- dest='tmpdir',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- help=('Directory for temporary DIAMOND files (default: directory '
- 'to which output files are written).'))
- specific.add_argument(
- '--compress',
- dest='compress',
- required=False,
- action='store_true',
- help='Compress DIAMOND alignment file.')
- specific.add_argument(
- '--top',
- dest='top',
- metavar='',
- required=False,
- type=float,
- choices = [i for i in range(51)],
- default=50,
- help=('DIAMOND top parameter [0-50] (default: 50). Governs hits '
- 'within range of best hit that are written to the alignment '
- 'file. This is not the [-r / --range] parameter! Can only be '
- 'set with the [--I_know_what_Im_doing] flag, see README.md.'))
-
(args, extra_args) = parser.parse_known_args()
extra_args = [arg for (i, arg) in enumerate(extra_args) if
=====================================
CAT_pack/prepare.py
=====================================
@@ -27,9 +27,7 @@ def parse_arguments():
add_help=False)
required_choice = parser.add_argument_group('Required choice')
-
group = required_choice.add_mutually_exclusive_group(required=True)
-
group.add_argument(
'--fresh',
dest='fresh',
@@ -44,77 +42,30 @@ def parse_arguments():
'files that do not exist yet.'))
optional = parser.add_argument_group('Optional arguments')
+ shared.add_argument(
+ optional,
+ 'database_folder',
+ False,
+ default='./CAT_database.{0}'.format(date),
+ help_=('Name of folder to which database files will be written '
+ '(default: CAT_database.{date}).'))
+ shared.add_argument(
+ optional,
+ 'taxonomy_folder',
+ False,
+ default='./CAT_taxonomy.{0}'.format(date),
+ help_=('Name of folder to which taxonomy files will be downloaded '
+ '(default: CAT_taxonomy.{date}).'))
+ shared.add_argument(optional, 'path_to_diamond', False, default='diamond')
+ shared.add_argument(optional, 'quiet', False)
+ shared.add_argument(optional, 'verbose', False)
+ shared.add_argument(optional, 'no_log', False)
+ shared.add_argument(optional, 'help', False)
- optional.add_argument(
- '-d',
- '--database_folder',
- dest='database_folder',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- default='./{0}_CAT_database'.format(date),
- help=('Name of folder to which database files will be written '
- '(default: {date}_CAT_database).'))
- optional.add_argument(
- '-t',
- '--taxonomy_folder',
- dest='taxonomy_folder',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- default='./{0}_taxonomy'.format(date),
- help=('Name of folder to which taxonomy files will be downloaded '
- '(default: {date}_taxonomy).'))
- optional.add_argument(
- '--path_to_diamond',
- dest='path_to_diamond',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- default='diamond',
- help=('Path to DIAMOND binaries. Supply if CAT prepare can not '
- 'find DIAMOND.'))
- optional.add_argument(
- '-q',
- '--quiet',
- dest='quiet',
- required=False,
- action='store_true',
- help='Suppress verbosity.')
- optional.add_argument(
- '--verbose',
- dest='verbose',
- required=False,
- action='store_true',
- help='Increase verbostity.')
- optional.add_argument(
- '--no_log',
- dest='no_log',
- required=False,
- action='store_true',
- help='Suppress log file.')
- optional.add_argument(
- '-h',
- '--help',
- action='help',
- help='Show this help message and exit.')
-
specific = parser.add_argument_group('DIAMOND specific optional arguments')
+ shared.add_argument(specific, 'nproc', False,
+ default=multiprocessing.cpu_count())
- specific.add_argument(
- '-n',
- '--nproc',
- dest='nproc',
- metavar='',
- required=False,
- type=int,
- default=multiprocessing.cpu_count(),
- help=('Number of cores to deploy by DIAMOND makedb '
- '(default: maximum).'))
-
(args, extra_args) = parser.parse_known_args()
extra_args = [arg for (i, arg) in enumerate(extra_args) if
=====================================
CAT_pack/shared.py
=====================================
@@ -4,6 +4,7 @@ import argparse
import datetime
import decimal
import gzip
+import multiprocessing
import os
import subprocess
import sys
@@ -43,6 +44,404 @@ def timestamp():
return str_
+def add_argument(argument_group, dest, required, default=None, help_=None):
+ if dest == 'contigs_fasta':
+ if help_ is None:
+ help_ = 'Path to contigs fasta file.'
+ argument_group.add_argument(
+ '-c',
+ '--contigs_fasta',
+ dest='contigs_fasta',
+ metavar='',
+ required=required,
+ type=str,
+ action=PathAction,
+ help=help_)
+ elif dest == 'bin_fasta':
+ if help_ is None:
+ help_ = 'Path to bin fasta file.'
+ argument_group.add_argument(
+ '-b',
+ '--bin_fasta',
+ dest='bin_fasta',
+ metavar='',
+ required=required,
+ type=str,
+ action=PathAction,
+ help=help_)
+ elif dest == 'bin_folder':
+ if help_ is None:
+ help_ = 'Path to directory containing bins.'
+ argument_group.add_argument(
+ '-b',
+ '--bin_folder',
+ dest='bin_folder',
+ metavar='',
+ required=required,
+ type=str,
+ action=PathAction,
+ help=help_)
+ elif dest == 'database_folder':
+ if help_ is None:
+ help_ = 'Path to folder that contains database files.'
+ argument_group.add_argument(
+ '-d',
+ '--database_folder',
+ dest='database_folder',
+ metavar='',
+ required=required,
+ type=str,
+ action=PathAction,
+ default=default,
+ help=help_)
+ elif dest == 'taxonomy_folder':
+ if help_ is None:
+ help_ = 'Path to folder that contains taxonomy files.'
+ argument_group.add_argument(
+ '-t',
+ '--taxonomy_folder',
+ dest='taxonomy_folder',
+ metavar='',
+ required=required,
+ type=str,
+ action=PathAction,
+ default=default,
+ help=help_)
+ elif dest == 'bin_suffix':
+ if help_ is None:
+ help_ = ('Suffix of bins in bin folder (default: {0}).'
+ ''.format(default))
+ argument_group.add_argument(
+ '-s',
+ '--bin_suffix',
+ dest='bin_suffix',
+ metavar='',
+ required=required,
+ type=str,
+ default=default,
+ help=help_)
+ elif dest == 'r':
+ if help_ is None:
+ help_ = 'r parameter [0-49] (default: {0:.0f}).'.format(default)
+ argument_group.add_argument(
+ '-r',
+ '--range',
+ dest='r',
+ metavar='',
+ required=required,
+ type=float,
+ choices = [i for i in range(50)],
+ action=DecimalAction,
+ default=default,
+ help=help_)
+ elif dest == 'f':
+ if help_ is None:
+ help_ = ('f parameter [0-0.99] (default: {0:.2f}).'
+ ''.format(default))
+ argument_group.add_argument(
+ '-f',
+ '--fraction',
+ dest='f',
+ metavar='',
+ required=required,
+ type=float,
+ choices = [i / 100 for i in range(0, 100)],
+ action=DecimalAction,
+ default=default,
+ help=help_)
+ elif dest == 'out_prefix':
+ if help_ is None:
+ help_ = 'Prefix for output files (default: {0}).'.format(default)
+ argument_group.add_argument(
+ '-o',
+ '--out_prefix',
+ dest='out_prefix',
+ metavar='',
+ required=required,
+ type=str,
+ action=PathAction,
+ default=default,
+ help=help_)
+ elif dest == 'proteins_fasta':
+ if help_ is None:
+ help_ = ('Path to predicted proteins fasta file. If supplied, the '
+ 'protein prediction step is skipped.')
+ argument_group.add_argument(
+ '-p',
+ '--proteins_fasta',
+ dest='proteins_fasta',
+ metavar='',
+ required=required,
+ type=str,
+ action=PathAction,
+ help=help_)
+ elif dest == 'alignment_file':
+ if help_ is None:
+ help_ = (
+ 'Path to alignment table. If supplied, the alignment '
+ 'step is skipped and classification is carried out '
+ 'directly. A predicted proteins fasta file should also be '
+ 'supplied with argument [-p / --proteins].')
+ argument_group.add_argument(
+ '-a',
+ '--diamond_alignment',
+ dest='alignment_file',
+ metavar='',
+ required=required,
+ type=str,
+ action=PathAction,
+ help=help_)
+ elif dest == 'path_to_prodigal':
+ if help_ is None:
+ help_ = ('Path to Prodigal binaries. Supply if CAT/BAT cannot '
+ 'find Prodigal.')
+ argument_group.add_argument(
+ '--path_to_prodigal',
+ dest='path_to_prodigal',
+ metavar='',
+ required=required,
+ type=str,
+ action=PathAction,
+ default=default,
+ help=help_)
+ elif dest == 'path_to_diamond':
+ if help_ is None:
+ help_ = ('Path to DIAMOND binaries. Supply if CAT/BAT cannot find '
+ 'DIAMOND.')
+ argument_group.add_argument(
+ '--path_to_diamond',
+ dest='path_to_diamond',
+ metavar='',
+ required=required,
+ type=str,
+ action=PathAction,
+ default=default,
+ help=help_)
+ elif dest == 'no_stars':
+ if help_ is None:
+ help_ = 'Suppress marking of suggestive taxonomic assignments.'
+ argument_group.add_argument(
+ '--no_stars',
+ dest='no_stars',
+ required=required,
+ action='store_true',
+ help=help_)
+ elif dest == 'force':
+ if help_ is None:
+ help_ = 'Force overwrite existing files.'
+ argument_group.add_argument(
+ '--force',
+ dest='force',
+ required=required,
+ action='store_true',
+ help=help_)
+ elif dest == 'quiet':
+ if help_ is None:
+ help_ = 'Suppress verbosity.'
+ argument_group.add_argument(
+ '-q',
+ '--quiet',
+ dest='quiet',
+ required=required,
+ action='store_true',
+ help=help_)
+ elif dest == 'verbose':
+ if help_ is None:
+ help_ = 'Increase verbosity.'
+ argument_group.add_argument(
+ '--verbose',
+ dest='verbose',
+ required=required,
+ action='store_true',
+ help=help_)
+ elif dest == 'no_log':
+ if help_ is None:
+ help_ = 'Suppress log file.'
+ argument_group.add_argument(
+ '--no_log',
+ dest='no_log',
+ required=required,
+ action='store_true',
+ help=help_)
+ elif dest == 'help':
+ if help_ is None:
+ help_ = 'Show this help message and exit.'
+ argument_group.add_argument(
+ '-h',
+ '--help',
+ action='help',
+ help=help_)
+ elif dest == 'IkwId':
+ if help_ is None:
+ help_ = 'Flag for experimental features.'
+ argument_group.add_argument(
+ '--I_know_what_Im_doing',
+ dest='IkwId',
+ required=required,
+ action='store_true',
+ help=help_)
+ elif dest == 'input_file':
+ if help_ is None:
+ help_ = 'Path to input file.'
+ argument_group.add_argument(
+ '-i',
+ '--input_file',
+ dest='input_file',
+ metavar='',
+ required=required,
+ type=str,
+ action=PathAction,
+ help=help_)
+ elif dest == 'output_file':
+ if help_ is None:
+ help_ = 'Path to output file.'
+ argument_group.add_argument(
+ '-o',
+ '--output_file',
+ dest='output_file',
+ metavar='',
+ required=required,
+ type=str,
+ action=PathAction,
+ help=help_)
+ elif dest == 'only_official':
+ if help_ is None:
+ help_ = ('Only output official taxonomic ranks (superkingdom, '
+ 'phylum, class, order, family, genus, species).')
+ argument_group.add_argument(
+ '--only_official',
+ dest='only_official',
+ required=required,
+ action='store_true',
+ help=help_)
+ elif dest == 'exclude_scores':
+ if help_ is None:
+ help_ = ('Do not include bit-score support scores in the lineage '
+ 'of a classification output file.')
+ argument_group.add_argument(
+ '--exclude_scores',
+ dest='exclude_scores',
+ required=required,
+ action='store_true',
+ help=help_)
+ elif dest == 'nproc':
+ if help_ is None:
+ help_ = 'Number of cores to deploy by DIAMOND (default: maximum).'
+ argument_group.add_argument(
+ '-n',
+ '--nproc',
+ dest='nproc',
+ metavar='',
+ required=required,
+ type=int,
+ default=default,
+ help=help_)
+ elif dest == 'sensitive':
+ if help_ is None:
+ help_ = 'Run DIAMOND in sensitive mode (default: not enabled).'
+ argument_group.add_argument(
+ '--sensitive',
+ dest='sensitive',
+ required=required,
+ action='store_true',
+ help=help_)
+ elif dest == 'no_self_hits':
+ if help_ is None:
+ help_ = ('Do not report identical self hits by DIAMOND (default: '
+ 'not enabled).')
+ argument_group.add_argument(
+ '--no_self_hits',
+ dest='no_self_hits',
+ required=required,
+ action='store_true',
+ help=help_)
+ elif dest == 'block_size':
+ if help_ is None:
+ help_ = (
+ 'DIAMOND block-size parameter (default: {0}). Lower '
+ 'numbers will decrease memory and temporary disk space '
+ 'usage.'.format(default))
+ argument_group.add_argument(
+ '--block_size',
+ dest='block_size',
+ metavar='',
+ required=required,
+ type=float,
+ default=default,
+ help=help_)
+ elif dest == 'index_chunks':
+ if help_ is None:
+ help_ = (
+ 'DIAMOND index-chunks parameter (default: {0}). Set to '
+ '1 on high memory machines. The parameter has no effect '
+ 'on temporary disk space usage.'.format(default))
+ argument_group.add_argument(
+ '--index_chunks',
+ dest='index_chunks',
+ metavar='',
+ required=required,
+ type=int,
+ default=default,
+ help=help_)
+ elif dest == 'tmpdir':
+ if help_ is None:
+ help_ = ('Directory for temporary DIAMOND files (default: '
+ 'directory to which output files are written).')
+ argument_group.add_argument(
+ '--tmpdir',
+ dest='tmpdir',
+ metavar='',
+ required=required,
+ type=str,
+ action=PathAction,
+ help=help_)
+ elif dest == 'compress':
+ if help_ is None:
+ help_ = 'Compress DIAMOND alignment file (default: not enabled).'
+ argument_group.add_argument(
+ '--compress',
+ dest='compress',
+ required=required,
+ action='store_true',
+ help=help_)
+ elif dest == 'top':
+ if help_ is None:
+ help_ = (
+ 'DIAMOND top parameter [0-50] (default: {0}). Governs '
+ 'hits within range of best hit that are written to the '
+ 'alignment file. This is not the [-r / --range] '
+ 'parameter! Can only be set with the '
+ '[--I_know_what_Im_doing] flag, see README.md.'
+ ''.format(default))
+ argument_group.add_argument(
+ '--top',
+ dest='top',
+ metavar='',
+ required=required,
+ type=float,
+ choices = [i for i in range(51)],
+ default=default,
+ help=help_)
+ else:
+ sys.exit('Unknown parser dest {0}.'.format(dest))
+
+ return
+
+
+def add_all_diamond_arguments(argument_group):
+ add_argument(argument_group, 'nproc', False,
+ default=multiprocessing.cpu_count())
+ add_argument(argument_group, 'sensitive', False)
+ add_argument(argument_group, 'no_self_hits', False)
+ add_argument(argument_group, 'block_size', False, default=2.0)
+ add_argument(argument_group, 'index_chunks', False, default=4)
+ add_argument(argument_group, 'tmpdir', False)
+ add_argument(argument_group, 'compress', False)
+ add_argument(argument_group, 'top', False, default=50)
+
+ return
+
+
def expand_arguments(args):
if 'r' in args:
setattr(args, 'one_minus_r', (100 - args.r) / 100)
@@ -55,9 +454,9 @@ def expand_arguments(args):
if 'no_log' in args and not args.no_log:
if 'fresh' in args and args.fresh:
- log_file = './{0}.CAT_prepare.fresh.log'.format(args.date)
+ log_file = './CAT_prepare.{0}.fresh.log'.format(args.date)
elif 'fresh' in args and not args.fresh:
- log_file = './{0}.CAT_prepare.existing.log'.format(args.date)
+ log_file = './CAT_prepare.{0}.existing.log'.format(args.date)
else:
# Check out_prefix as the log file needs to be written to a valid
# location.
@@ -245,21 +644,23 @@ def run_diamond(args):
'\t\t\tquery: {0}\n'
'\t\t\tdatabase: {1}\n'
'\t\t\tmode: {2}\n'
- '\t\t\tnumber of cores: {3}\n'
- '\t\t\tblock-size (billions of letters): {4}\n'
- '\t\t\tindex-chunks: {5}\n'
- '\t\t\ttmpdir: {6}\n'
- '\t\t\tcompress: {7}\n'
- '\t\t\ttop: {8}'.format(
+ '\t\t\ttop: {3}\n'
+ '\t\t\tno-self-hits: {4}\n'
+ '\t\t\tnumber of cores: {5}\n'
+ '\t\t\tblock-size (billions of letters): {6}\n'
+ '\t\t\tindex-chunks: {7}\n'
+ '\t\t\ttmpdir: {8}\n'
+ '\t\t\tcompress: {9}'.format(
args.proteins_fasta,
args.diamond_database,
mode,
+ args.top,
+ args.no_self_hits,
args.nproc,
args.block_size,
args.index_chunks,
args.tmpdir,
- compression,
- args.top))
+ compression))
give_user_feedback(message, args.log_file, args.quiet)
try:
@@ -284,6 +685,9 @@ def run_diamond(args):
if args.sensitive:
command += ['--sensitive']
+ if args.no_self_hits:
+ command += ['--no-self-hits']
+
subprocess.check_call(command)
except:
message = 'DIAMOND finished abnormally.'
=====================================
CAT_pack/single_bin.py
=====================================
@@ -17,219 +17,32 @@ def parse_arguments():
description='Run Bin Annotation Tool (BAT) on a single bin.',
usage='CAT bin -b -d -t [options] [-h / --help]',
add_help=False)
-
- required = parser.add_argument_group('Required arguments')
- required.add_argument(
- '-b',
- '--bin_fasta',
- dest='bin_fasta',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help='Path to bin fasta file.')
- required.add_argument(
- '-d',
- '--database_folder',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help='Path to folder that contains database files.')
- required.add_argument(
- '-t',
- '--taxonomy_folder',
- dest='taxonomy_folder',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help='Path to folder that contains taxonomy files.')
+ required = parser.add_argument_group('Required arguments')
+ shared.add_argument(required, 'bin_fasta', True)
+ shared.add_argument(required, 'database_folder', True)
+ shared.add_argument(required, 'taxonomy_folder', True)
optional = parser.add_argument_group('Optional arguments')
-
- optional.add_argument(
- '-r',
- '--range',
- dest='r',
- metavar='',
- required=False,
- type=float,
- choices = [i for i in range(50)],
- action=shared.DecimalAction,
- default=decimal.Decimal(5),
- help='r parameter [0-49] (default: 5).')
- optional.add_argument(
- '-f',
- '--fraction',
- dest='f',
- metavar='',
- required=False,
- type=float,
- choices = [i / 100 for i in range(0, 100)],
- action=shared.DecimalAction,
- default=decimal.Decimal(0.3),
- help='f parameter [0-0.99] (default: 0.3).')
- optional.add_argument(
- '-o',
- '--out_prefix',
- dest='out_prefix',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- default='./out.BAT',
- help='Prefix for output files (default: out.BAT).')
- optional.add_argument(
- '-p',
- '--proteins_fasta',
- dest='proteins_fasta',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- help=('Path to predicted proteins fasta file. If supplied, BAT '
- 'will skip the protein prediction step.'))
- optional.add_argument(
- '-a',
- '--diamond_alignment',
- dest='alignment_file',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- help=('Path to alignment table. If supplied, BAT will skip the '
- 'alignment step and directly classify the bin. A predicted '
- 'proteins fasta file should also be supplied with argument '
- '[-p / --proteins].'))
- optional.add_argument(
- '--path_to_prodigal',
- dest='path_to_prodigal',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- default='prodigal',
- help=('Path to Prodigal binaries. Supply if BAT can not '
- 'find Prodigal.'))
- optional.add_argument(
- '--path_to_diamond',
- dest='path_to_diamond',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- default='diamond',
- help=('Path to DIAMOND binaries. Supply if BAT can not find '
- 'DIAMOND.'))
- optional.add_argument(
- '--no_stars',
- dest='no_stars',
- required=False,
- action='store_true',
- help='Suppress marking of suggestive taxonomic assignments.')
- optional.add_argument(
- '--force',
- dest='force',
- required=False,
- action='store_true',
- help='Force overwrite existing files.')
- optional.add_argument(
- '-q',
- '--quiet',
- dest='quiet',
- required=False,
- action='store_true',
- help='Suppress verbosity.')
- optional.add_argument(
- '--verbose',
- dest='verbose',
- required=False,
- action='store_true',
- help='Increase verbosity.')
- optional.add_argument(
- '--no_log',
- dest='no_log',
- required=False,
- action='store_true',
- help='Suppress log file.')
- optional.add_argument(
- '-h',
- '--help',
- action='help',
- help='Show this help message and exit.')
- optional.add_argument(
- '--I_know_what_Im_doing',
- dest='IkwId',
- required=False,
- action='store_true',
- help='Flag for experimental features.')
-
+ shared.add_argument(optional, 'r', False, default=decimal.Decimal(5))
+ shared.add_argument(optional, 'f', False, default=decimal.Decimal(0.3))
+ shared.add_argument(optional, 'out_prefix', False, default='./out.BAT')
+ shared.add_argument(optional, 'proteins_fasta', False)
+ shared.add_argument(optional, 'alignment_file', False)
+ shared.add_argument(optional, 'path_to_prodigal', False,
+ default='prodigal')
+ shared.add_argument(optional, 'path_to_diamond', False, default='diamond')
+ shared.add_argument(optional, 'no_stars', False)
+ shared.add_argument(optional, 'force', False)
+ shared.add_argument(optional, 'quiet', False)
+ shared.add_argument(optional, 'verbose', False)
+ shared.add_argument(optional, 'no_log', False)
+ shared.add_argument(optional, 'help', False)
+ shared.add_argument(optional, 'IkwId', False)
+
specific = parser.add_argument_group('DIAMOND specific optional arguments')
-
- specific.add_argument(
- '-n',
- '--nproc',
- dest='nproc',
- metavar='',
- required=False,
- type=int,
- default=multiprocessing.cpu_count(),
- help='Number of cores to deploy by DIAMOND (default: maximum).')
- specific.add_argument(
- '--sensitive',
- dest='sensitive',
- required=False,
- action='store_true',
- help='Run DIAMOND in sensitive mode (default: not enabled).')
- specific.add_argument(
- '--block_size',
- dest='block_size',
- metavar='',
- required=False,
- type=float,
- default=2.0,
- help=('DIAMOND block-size parameter (default: 2.0). Lower numbers '
- 'will decrease memory and temporary disk space usage.'))
- specific.add_argument(
- '--index_chunks',
- dest='index_chunks',
- metavar='',
- required=False,
- type=int,
- default=4,
- help=('DIAMOND index-chunks parameter (default: 4). Set to 1 on '
- 'high memory machines. The parameter has no effect on '
- 'temporary disk space usage.'))
- specific.add_argument(
- '--tmpdir',
- dest='tmpdir',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- help=('Directory for temporary DIAMOND files (default: directory '
- 'to which output files are written).'))
- specific.add_argument(
- '--compress',
- dest='compress',
- required=False,
- action='store_true',
- help='Compress DIAMOND alignment file.')
- specific.add_argument(
- '--top',
- dest='top',
- metavar='',
- required=False,
- type=float,
- choices = [i for i in range(51)],
- default=50,
- help=('DIAMOND top parameter [0-50] (default: 50). Governs hits '
- 'within range of best hit that are written to the alignment '
- 'file. This is not the [-r / --range] parameter! Can only be '
- 'set with the [--I_know_what_Im_doing] flag, see README.md.'))
-
+ shared.add_all_diamond_arguments(specific)
+
(args, extra_args) = parser.parse_known_args()
extra_args = [arg for (i, arg) in enumerate(extra_args) if
=====================================
CAT_pack/summarise.py
=====================================
@@ -16,60 +16,23 @@ def parse_arguments():
add_help=False)
required = parser.add_argument_group('Required arguments')
-
- required.add_argument(
- '-i',
- '--input_file',
- dest='input_file',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help=(
+ shared.add_argument(required, 'input_file', True,
+ help_=(
'Path to named CAT contig classification file or BAT bin '
'classification file. Currently only official ranks are '
'supported, and only classification files containing a single '
- 'classification per contig / bin.'))
- required.add_argument(
- '-o',
- '--output_file',
- dest='output_file',
- metavar='',
- required=True,
- type=str,
- action=shared.PathAction,
- help='Path to output file.')
+ 'classification per contig / bin. If you want to summarise a '
+ 'contig classification file, you have to supply the contigs '
+ 'fasta file with argument [-c / --contigs_fasta].'))
+ shared.add_argument(required, 'output_file', True)
optional = parser.add_argument_group('Optional arguments')
-
- optional.add_argument(
- '-c',
- '--contigs_fasta',
- dest='contigs_fasta',
- metavar='',
- required=False,
- type=str,
- action=shared.PathAction,
- help=('Path to contigs fasta file. This is required if you want '
- 'to summarise a contig classification file.'))
- optional.add_argument(
- '--force',
- dest='force',
- required=False,
- action='store_true',
- help='Force overwrite existing files.')
- optional.add_argument(
- '-q',
- '--quiet',
- dest='quiet',
- required=False,
- action='store_true',
- help='Suppress verbosity.')
- optional.add_argument(
- '-h',
- '--help',
- action='help',
- help='Show this help message and exit.')
+ shared.add_argument(optional, 'contigs_fasta', False,
+ help_=('Path to contigs fasta file. Required if you want to '
+ 'summarise a contig classification file.'))
+ shared.add_argument(optional, 'force', False)
+ shared.add_argument(optional, 'quiet', False)
+ shared.add_argument(optional, 'help', False)
(args, extra_args) = parser.parse_known_args()
@@ -128,6 +91,10 @@ def summarise_contigs(args):
check.check_output_file(
args.output_file, args.log_file, args.quiet))
+ errors.append(
+ check.check_in_and_output_file(
+ args.input_file, args.output_file, args.log_file, args.quiet))
+
if True in errors:
sys.exit(1)
@@ -323,6 +290,10 @@ def summarise_bins(args):
check.check_output_file(
args.output_file, args.log_file, args.quiet))
+ errors.append(
+ check.check_in_and_output_file(
+ args.input_file, args.output_file, args.log_file, args.quiet))
+
if True in errors:
sys.exit(1)
=====================================
CHANGELOG.md
=====================================
@@ -1,5 +1,8 @@
# Changelog
+## 5.2.2
+We have added the DIAMOND-specific `--no_self_hits` flag. We have also added some extra checks and removed redundancy from the parser code. Databases constructed by `CAT prepare` now have a slightly different naming scheme.
+
## 5.2.1
Minor bug fix for `CAT prepare`.
=====================================
README.md
=====================================
@@ -55,15 +55,15 @@ To get started with CAT and BAT, you will have to get the database files on your
To download the database files, find the most recent version on [tbb.bio.uu.nl/bastiaan/CAT\_prepare/](https://tbb.bio.uu.nl/bastiaan/CAT_prepare/), download and extract, and you are ready to go!
```
-$ wget tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20201123.tar.gz
+$ wget tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz
-$ tar -xvzf CAT_prepare_20201123.tar.gz
+$ tar -xvzf CAT_prepare_20210107.tar.gz
```
Your version of DIAMOND should be the same as the one with which the database was constructed. For this reason the DIAMOND executable is supplied within the CAT prepare folder. Alternatively, you can find the DIAMOND version used for database construction within the database log file:
```
-$ grep version 2020-11-23.CAT_prepare.fresh.log
+$ grep version 2021-01-07.CAT_prepare.fresh.log
```
### Generating the database files yourself.
View it on GitLab: https://salsa.debian.org/med-team/cat-bat/-/commit/aa8acaa0a0454b1a099a39a806827cd1e0d9215b
--
View it on GitLab: https://salsa.debian.org/med-team/cat-bat/-/commit/aa8acaa0a0454b1a099a39a806827cd1e0d9215b
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210118/6a24605b/attachment-0001.html>
More information about the debian-med-commit mailing list