[med-svn] [Git][med-team/busco][master] 4 commits: New upstream version 5.2.2
Nilesh Patra (@nilesh)
gitlab at salsa.debian.org
Sun Jul 25 15:35:18 BST 2021
Nilesh Patra pushed to branch master at Debian Med / busco
Commits:
eb09048f by Nilesh Patra at 2021-07-25T19:57:02+05:30
New upstream version 5.2.2
- - - - -
b82e77ea by Nilesh Patra at 2021-07-25T19:57:09+05:30
Update upstream source from tag 'upstream/5.2.2'
Update to upstream version '5.2.2'
with Debian dir 60bf48fbfcfc448b6af6a472c445470864626fb7
- - - - -
851f7574 by Nilesh Patra at 2021-07-25T20:00:00+05:30
Refresh patch
- - - - -
d7282378 by Nilesh Patra at 2021-07-25T20:01:04+05:30
Interim changelog entry
- - - - -
14 changed files:
- CHANGELOG
- debian/changelog
- debian/patches/fix-and-disable-tests.patch
- src/busco/AutoLineage.py
- src/busco/BuscoConfig.py
- src/busco/BuscoRunner.py
- src/busco/_version.py
- src/busco/analysis/BuscoAnalysis.py
- src/busco/analysis/GenomeAnalysis.py
- src/busco/busco_tools/Toolset.py
- src/busco/busco_tools/hmmer.py
- src/busco/run_BUSCO.py
- tests/unittests/BuscoConfig_unittests.py
- tests/unittests/run_BUSCO_unittests.py
Changes:
=====================================
CHANGELOG
=====================================
@@ -1,3 +1,11 @@
+5.2.2
+- Issue #390 fixed
+- Issue #423 fixed
+- Issue #464 fixed
+- Issue #467 fixed
+- Issue #470 fixed
+- Add --tar option to compress some output folders
+
5.2.1
- Minor bug fixes
=====================================
debian/changelog
=====================================
@@ -1,6 +1,6 @@
-busco (5.2.1-1) UNRELEASED; urgency=medium
+busco (5.2.2-1) UNRELEASED; urgency=medium
- * New upstream version 5.2.1
+ * New upstream version 5.2.2
* Update manpage
* Install config.ini file as an example
* Add Depends on python3-biopython and python3-pandas
@@ -8,7 +8,7 @@ busco (5.2.1-1) UNRELEASED; urgency=medium
* d/p/fix-and-disable-tests.patch: Add patch to fix test execution
* Add autopkgtests
- -- Nilesh Patra <nilesh at debian.org> Fri, 09 Jul 2021 22:09:38 +0530
+ -- Nilesh Patra <nilesh at debian.org> Sun, 25 Jul 2021 19:57:23 +0530
busco (5.0.0-1) unstable; urgency=medium
=====================================
debian/patches/fix-and-disable-tests.patch
=====================================
@@ -73,7 +73,7 @@ Last-Update: 2021-07-02
self, mock_config_manager, fake_modedict, mock_cleanup, *args
--- a/tests/unittests/BuscoConfig_unittests.py
+++ b/tests/unittests/BuscoConfig_unittests.py
-@@ -353,7 +353,7 @@
+@@ -355,7 +355,7 @@
config.configure()
config._check_evalue()
@@ -82,7 +82,7 @@ Last-Update: 2021-07-02
def test_evalue_default(self, mock_logger):
self.test_params["evalue"] = 0.001
config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params)
-@@ -399,7 +399,7 @@
+@@ -401,7 +401,7 @@
)
@patch(
@@ -91,7 +91,7 @@ Last-Update: 2021-07-02
)
def test_batch_mode_true(self, *args):
config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params)
-@@ -409,10 +409,10 @@
+@@ -411,10 +411,10 @@
config.set.assert_has_calls(calls)
@patch(
@@ -104,7 +104,7 @@ Last-Update: 2021-07-02
)
def test_batch_mode_false_with_file(self, *args):
config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params)
-@@ -420,10 +420,10 @@
+@@ -422,10 +422,10 @@
config._check_batch_mode()
@patch(
@@ -117,7 +117,7 @@ Last-Update: 2021-07-02
)
def test_batch_mode_false_with_error(self, *args):
config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params)
-@@ -441,14 +441,14 @@
+@@ -443,14 +443,14 @@
with self.assertRaises(BuscoConfig.BatchFatalError):
config._check_required_input_exists()
=====================================
src/busco/AutoLineage.py
=====================================
@@ -84,7 +84,7 @@ class AutoSelectLineage:
"""
root_runners = self.run_lineages_list(self.all_lineages)
self.get_best_match_lineage(root_runners)
- if self.virus_check():
+ if self.virus_check() and not self.config.getboolean("busco_run", "auto-lineage-euk"):
self.virus_pipeline = True
self.run_virus_datasets()
self.get_best_match_lineage(type(self).runners)
@@ -135,7 +135,7 @@ class AutoSelectLineage:
max_ind = inds[max_mask]
return max_ind
- def evaluate(self, runners):
+ def evaluate(self, runners, use_percent=False):
"""
Evaluate output scores from all BUSCO runs. Lineage with the highest number of complete (single + multiple)
copy BUSCOs is assigned as the best_match_lineage.
@@ -147,9 +147,9 @@ class AutoSelectLineage:
"""
self.collate_results(runners)
- max_ind = self.get_max_ind(np.array(self.s_buscos) + np.array(self.d_buscos))
+ max_ind = self.get_max_ind(np.array(self.s_percents) + np.array(self.d_percents)) if use_percent else self.get_max_ind(np.array(self.s_buscos) + np.array(self.d_buscos))
if len(max_ind) > 1:
- max_ind2 = self.get_max_ind(np.array(self.f_buscos)[max_ind])
+ max_ind2 = self.get_max_ind(np.array(self.f_percents)[max_ind]) if use_percent else self.get_max_ind(np.array(self.f_buscos)[max_ind])
max_ind = max_ind[max_ind2]
if len(max_ind) > 1:
if ((self.s_buscos[max_ind[0]] == 0.0)
@@ -170,10 +170,12 @@ class AutoSelectLineage:
self.d_buscos = [runner.analysis.hmmer_runner.multi_copy for runner in runners]
self.f_buscos = [runner.analysis.hmmer_runner.only_fragments for runner in runners]
self.s_percents = [runner.analysis.hmmer_runner.s_percent for runner in runners]
+ self.d_percents = [runner.analysis.hmmer_runner.d_percent for runner in runners]
+ self.f_percents = [runner.analysis.hmmer_runner.f_percent for runner in runners]
return
- def get_best_match_lineage(self, runners):
- max_ind = self.evaluate(runners)
+ def get_best_match_lineage(self, runners, use_percent=False):
+ max_ind = self.evaluate(runners, use_percent)
self.selected_runner = runners[int(max_ind)]
self.best_match_lineage_dataset = self.selected_runner.config.get("busco_run", "lineage_dataset")
runners.pop(int(max_ind))
@@ -199,7 +201,8 @@ class AutoSelectLineage:
"Certain mollicute clades use a different genetic code to the rest of bacteria. They are not part "
"of the BUSCO placement tree and need to be tested separately. For more information, see the user "
"guide.")
- self.check_mollicutes()
+ use_percent = self.selected_runner.mode == "proteins"
+ self.check_mollicutes(use_percent)
if os.path.basename(self.selected_runner.config.get("busco_run", "lineage_dataset")).startswith("bacteria"):
logger.info("Bacteria domain is a better match than the mollicutes subclade. Continuing to tree placement.")
self.run_busco_placer()
@@ -223,10 +226,10 @@ class AutoSelectLineage:
def set_best_match_lineage(self):
AnalysisRunner.selected_dataset = os.path.basename(self.best_match_lineage_dataset)
- def check_mollicutes(self):
+ def check_mollicutes(self, use_percent=False):
runners = self.run_lineages_list(["mollicutes"])
runners.append(self.selected_runner)
- self.get_best_match_lineage(runners)
+ self.get_best_match_lineage(runners, use_percent=use_percent)
return
def run_busco_placer(self): # todo: revisit structure of this method after cleaning BuscoPlacer
=====================================
src/busco/BuscoConfig.py
=====================================
@@ -36,6 +36,7 @@ class BaseConfig(ConfigParser):
"use_augustus": False,
"long": False,
"batch_mode": False,
+ "tar": False,
}
DEPENDENCY_SECTIONS = {
@@ -79,6 +80,7 @@ class BaseConfig(ConfigParser):
"limit",
"use_augustus",
"batch_mode",
+ "tar",
]
def __init__(self):
@@ -682,16 +684,9 @@ class BuscoConfigMain(BuscoConfig, BaseConfig):
def _check_value_constraints(self):
"""
- Load default values into config if not provided in config file or on the command line.
:return:
"""
- # for param in list(type(self).DEFAULT_ARGS_VALUES.keys()):
- # try:
- # self.get("busco_run", param)
- # except NoOptionError:
- # self.set("busco_run", param, str(type(self).DEFAULT_ARGS_VALUES[param]))
- # Set auto-lineage to True if either auto-lineage-prok or auto-lineage-euk is selected
if self.getboolean("busco_run", "auto-lineage-prok") or self.getboolean(
"busco_run", "auto-lineage-euk"
):
=====================================
src/busco/BuscoRunner.py
=====================================
@@ -28,6 +28,9 @@ logger = BuscoLogger.get_logger(__name__)
class SingleRunner:
+
+ all_runners = []
+
def __init__(self, config_manager):
self.start_time = time.time()
self.config_manager = config_manager
@@ -65,6 +68,7 @@ class SingleRunner:
asl.set_best_match_lineage()
lineage_dataset = asl.best_match_lineage_dataset
runner = asl.selected_runner
+ type(self).all_runners.extend(asl.runners)
asl.reset()
return lineage_dataset, runner
@@ -121,10 +125,13 @@ class SingleRunner:
else:
self.runner.run_analysis()
AnalysisRunner.selected_dataset = lineage_basename
+ type(self).all_runners.append(self.runner)
+
+ if self.config.getboolean("busco_run", "tar"):
+ self.compress_folders()
self.runner.finish(time.time() - self.start_time)
- except BuscoError as e:
- self.log_error(e)
+ except BuscoError:
raise
except ToolException as e:
@@ -147,6 +154,34 @@ class SingleRunner:
)
raise BatchFatalError(e)
+ def compress_folders(self):
+ for runner in type(self).all_runners:
+ folders_to_compress = [
+ runner.analysis.hmmer_runner.single_copy_sequences_folder,
+ runner.analysis.hmmer_runner.multi_copy_sequences_folder,
+ runner.analysis.hmmer_runner.fragmented_sequences_folder,
+ runner.analysis.hmmer_runner.output_folder,
+ ]
+ if self.config.getboolean("busco_run", "use_augustus"):
+ folders_to_compress.append(
+ runner.analysis.augustus_runner.pred_genes_dir_initial,
+ runner.analysis.augustus_runner.pred_genes_dir_rerun,
+ runner.analysis.augustus_runner.gff_dir,
+ runner.analysis.gff2gb_runner.gb_folder,
+ )
+ for folder in folders_to_compress:
+ try:
+ shutil.make_archive(
+ folder,
+ "gztar",
+ os.path.dirname(folder),
+ os.path.basename(folder),
+ )
+ shutil.rmtree(folder)
+ except OSError:
+ raise
+ # logger.warning("Unable to compress folder {}".format(folder))
+
class BatchRunner:
@@ -276,6 +311,8 @@ class AnalysisRunner:
self.mode = "prok_tran"
elif self.domain == "eukaryota":
self.mode = "euk_tran"
+ elif self.domain == "viruses":
+ self.mode = "prok_genome" # Suggested by Mose - Prodigal may perform better on viruses than BLAST + HMMER.
else:
raise BatchFatalError("Unrecognized mode {}".format(self.mode))
analysis_type = type(self).mode_dict[self.mode]
@@ -505,6 +542,10 @@ class AnalysisRunner:
if not self.config.getboolean("busco_run", "auto-lineage"):
auto_lineage_line = "\nConsider using the auto-lineage mode to select a more specific lineage."
final_output_results.append(auto_lineage_line)
+ with open(
+ self.analysis.hmmer_runner.short_summary_file, "a"
+ ) as short_summary_file:
+ short_summary_file.write(positive_parasitic_line)
except OSError:
pass
=====================================
src/busco/_version.py
=====================================
@@ -6,4 +6,4 @@ Copyright (c) 2016-2021, Evgeny Zdobnov (ez at ezlab.org)
Licensed under the MIT license. See LICENSE.md file.
"""
-__version__ = "5.2.1"
+__version__ = "5.2.2"
=====================================
src/busco/analysis/BuscoAnalysis.py
=====================================
@@ -119,6 +119,12 @@ class BuscoAnalysis(metaclass=ABCMeta):
)
if self.restart and self.hmmer_runner.check_previous_completed_run():
logger.info("Skipping HMMER run as output already processed")
+ elif self.restart and os.path.exists(
+ "{}.tar.gz".format(self.hmmer_runner.output_folder)
+ ):
+ raise BuscoError(
+ "Restart mode incompatible with a previously compressed (--tar) run. Please decompress the HMMER results folder and try again."
+ )
elif len(os.listdir(self.hmmer_runner.results_dir)) > 0:
raise BuscoError(
"HMMER results directory not empty. If you are running in restart mode, make sure you are "
@@ -190,11 +196,14 @@ class BuscoAnalysis(metaclass=ABCMeta):
self._check_dataset_integrity()
if not os.stat(self.input_file).st_size > 0:
raise BuscoError("Input file is empty.")
- with open(self.input_file) as f:
- for line in f:
- if line.startswith(">"):
- self._check_fasta_header(line)
- self._check_seq_uniqueness(line)
+ try:
+ with open(self.input_file) as f:
+ for line in f:
+ if line.startswith(">"):
+ self._check_fasta_header(line)
+ self._check_seq_uniqueness(line)
+ except UnicodeDecodeError as ude:
+ raise BuscoError(ude.msg)
return
def _check_seq_uniqueness(self, line):
=====================================
src/busco/analysis/GenomeAnalysis.py
=====================================
@@ -586,8 +586,10 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
]
return hmmer_result_dict_new, matched_genes_new
- @log("Validating exons and removing overlapping matches", logger)
def exons_to_df(self, records):
+ if self._mode == "genome":
+ logger.info("Validating exons and removing overlapping matches")
+
df = self.metaeuk_runner.records_to_df(records)
df["Start"] = df["Start"].astype(int)
df["Stop"] = df["Stop"].astype(int)
=====================================
src/busco/busco_tools/Toolset.py
=====================================
@@ -69,7 +69,8 @@ class Job(Process):
try:
process = subprocess.run(
self.cmd_line,
- capture_output=True, # stdout and stderr streams are stored and written to file after job completion
+ stdout=subprocess.PIPE, # stdout and stderr streams are stored and written to file after job completion
+ stderr=subprocess.PIPE,
cwd=self.cwd,
shell=False,
timeout=self.timeout,
=====================================
src/busco/busco_tools/hmmer.py
=====================================
@@ -101,6 +101,7 @@ class HMMERRunner(BaseRunner):
self.mode = mode
self.is_fragment = {}
+ self.matched_genes_fragment = {}
self.single_copy_buscos = {}
self.multi_copy_buscos = {}
=====================================
src/busco/run_BUSCO.py
=====================================
@@ -152,7 +152,8 @@ def _parse_args():
required=False,
metavar="SEQUENCE_FILE",
help="Input sequence file in FASTA format. "
- "Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set.",
+ "Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set. "
+ "Also possible to use a path to a directory containing multiple input files.",
)
optional.add_argument(
@@ -385,6 +386,14 @@ def _parse_args():
help="Continue a run that had already partially completed.",
)
+ optional.add_argument(
+ "--tar",
+ dest="tar",
+ action="store_true",
+ required=False,
+ help="Compress some subdirectories with many files to save space",
+ )
+
optional.add_argument(
"--update-data",
dest="update-data",
=====================================
tests/unittests/BuscoConfig_unittests.py
=====================================
@@ -43,6 +43,7 @@ class TestBuscoConfig(unittest.TestCase):
"download_path": None,
"update-data": False,
"version": "==SUPPRESS==",
+ "tar": False,
}
self.test_params = {
@@ -80,6 +81,7 @@ class TestBuscoConfig(unittest.TestCase):
"limit",
"use_augustus",
"batch_mode",
+ "tar",
],
"etraining": ["path", "command"],
"gff2gbSmallDNA.pl": ["path", "command"],
=====================================
tests/unittests/run_BUSCO_unittests.py
=====================================
@@ -116,6 +116,7 @@ class TestParams(unittest.TestCase):
"download_path": None,
"update-data": False,
"version": "==SUPPRESS==",
+ "tar": False,
}
self.assertDictEqual(params, correct_parse)
@@ -173,6 +174,7 @@ class TestParams(unittest.TestCase):
"download_path": None,
"update-data": False,
"version": "==SUPPRESS==",
+ "tar": False,
}
self.assertDictEqual(params, correct_parse)
@@ -223,6 +225,7 @@ class TestParams(unittest.TestCase):
"--augustus",
"--update-data",
"--offline",
+ "--tar",
]
command_str = " ".join(
[" ".join([key, str(value)]) for key, value in arg_values.items()]
@@ -261,6 +264,7 @@ class TestParams(unittest.TestCase):
"quiet": True,
"update-data": True,
"version": "==SUPPRESS==",
+ "tar": True,
}
self.assertDictEqual(params, correct_parse)
View it on GitLab: https://salsa.debian.org/med-team/busco/-/compare/f74f6e63cd60bc848648ddfe8e0878d152e7685b...d72823781030d21c44d37fbb82e024b717d2742b
--
View it on GitLab: https://salsa.debian.org/med-team/busco/-/compare/f74f6e63cd60bc848648ddfe8e0878d152e7685b...d72823781030d21c44d37fbb82e024b717d2742b
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210725/3743474c/attachment-0001.htm>
More information about the debian-med-commit mailing list