[med-svn] [Git][med-team/busco][master] 4 commits: New upstream version 5.2.2

Nilesh Patra (@nilesh) gitlab at salsa.debian.org
Sun Jul 25 15:35:18 BST 2021



Nilesh Patra pushed to branch master at Debian Med / busco


Commits:
eb09048f by Nilesh Patra at 2021-07-25T19:57:02+05:30
New upstream version 5.2.2
- - - - -
b82e77ea by Nilesh Patra at 2021-07-25T19:57:09+05:30
Update upstream source from tag 'upstream/5.2.2'

Update to upstream version '5.2.2'
with Debian dir 60bf48fbfcfc448b6af6a472c445470864626fb7
- - - - -
851f7574 by Nilesh Patra at 2021-07-25T20:00:00+05:30
Refresh patch

- - - - -
d7282378 by Nilesh Patra at 2021-07-25T20:01:04+05:30
Interim changelog entry

- - - - -


14 changed files:

- CHANGELOG
- debian/changelog
- debian/patches/fix-and-disable-tests.patch
- src/busco/AutoLineage.py
- src/busco/BuscoConfig.py
- src/busco/BuscoRunner.py
- src/busco/_version.py
- src/busco/analysis/BuscoAnalysis.py
- src/busco/analysis/GenomeAnalysis.py
- src/busco/busco_tools/Toolset.py
- src/busco/busco_tools/hmmer.py
- src/busco/run_BUSCO.py
- tests/unittests/BuscoConfig_unittests.py
- tests/unittests/run_BUSCO_unittests.py


Changes:

=====================================
CHANGELOG
=====================================
@@ -1,3 +1,11 @@
+5.2.2
+- Issue #390 fixed
+- Issue #423 fixed
+- Issue #464 fixed
+- Issue #467 fixed
+- Issue #470 fixed
+- Add --tar option to compress some output folders
+
 5.2.1
 - Minor bug fixes
 


=====================================
debian/changelog
=====================================
@@ -1,6 +1,6 @@
-busco (5.2.1-1) UNRELEASED; urgency=medium
+busco (5.2.2-1) UNRELEASED; urgency=medium
 
-  * New upstream version 5.2.1
+  * New upstream version 5.2.2
   * Update manpage
   * Install config.ini file as an example
   * Add Depends on python3-biopython and python3-pandas
@@ -8,7 +8,7 @@ busco (5.2.1-1) UNRELEASED; urgency=medium
   * d/p/fix-and-disable-tests.patch: Add patch to fix test execution
   * Add autopkgtests
 
- -- Nilesh Patra <nilesh at debian.org>  Fri, 09 Jul 2021 22:09:38 +0530
+ -- Nilesh Patra <nilesh at debian.org>  Sun, 25 Jul 2021 19:57:23 +0530
 
 busco (5.0.0-1) unstable; urgency=medium
 


=====================================
debian/patches/fix-and-disable-tests.patch
=====================================
@@ -73,7 +73,7 @@ Last-Update: 2021-07-02
          self, mock_config_manager, fake_modedict, mock_cleanup, *args
 --- a/tests/unittests/BuscoConfig_unittests.py
 +++ b/tests/unittests/BuscoConfig_unittests.py
-@@ -353,7 +353,7 @@
+@@ -355,7 +355,7 @@
              config.configure()
              config._check_evalue()
  
@@ -82,7 +82,7 @@ Last-Update: 2021-07-02
      def test_evalue_default(self, mock_logger):
          self.test_params["evalue"] = 0.001
          config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params)
-@@ -399,7 +399,7 @@
+@@ -401,7 +401,7 @@
          )
  
      @patch(
@@ -91,7 +91,7 @@ Last-Update: 2021-07-02
      )
      def test_batch_mode_true(self, *args):
          config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params)
-@@ -409,10 +409,10 @@
+@@ -411,10 +411,10 @@
          config.set.assert_has_calls(calls)
  
      @patch(
@@ -104,7 +104,7 @@ Last-Update: 2021-07-02
      )
      def test_batch_mode_false_with_file(self, *args):
          config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params)
-@@ -420,10 +420,10 @@
+@@ -422,10 +422,10 @@
          config._check_batch_mode()
  
      @patch(
@@ -117,7 +117,7 @@ Last-Update: 2021-07-02
      )
      def test_batch_mode_false_with_error(self, *args):
          config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params)
-@@ -441,14 +441,14 @@
+@@ -443,14 +443,14 @@
          with self.assertRaises(BuscoConfig.BatchFatalError):
              config._check_required_input_exists()
  


=====================================
src/busco/AutoLineage.py
=====================================
@@ -84,7 +84,7 @@ class AutoSelectLineage:
         """
         root_runners = self.run_lineages_list(self.all_lineages)
         self.get_best_match_lineage(root_runners)
-        if self.virus_check():
+        if self.virus_check() and not self.config.getboolean("busco_run", "auto-lineage-euk"):
             self.virus_pipeline = True
             self.run_virus_datasets()
             self.get_best_match_lineage(type(self).runners)
@@ -135,7 +135,7 @@ class AutoSelectLineage:
         max_ind = inds[max_mask]
         return max_ind
 
-    def evaluate(self, runners):
+    def evaluate(self, runners, use_percent=False):
         """
         Evaluate output scores from all BUSCO runs. Lineage with the highest number of complete (single + multiple)
         copy BUSCOs is assigned as the best_match_lineage.
@@ -147,9 +147,9 @@ class AutoSelectLineage:
         """
         self.collate_results(runners)
 
-        max_ind = self.get_max_ind(np.array(self.s_buscos) + np.array(self.d_buscos))
+        max_ind = self.get_max_ind(np.array(self.s_percents) + np.array(self.d_percents)) if use_percent else self.get_max_ind(np.array(self.s_buscos) + np.array(self.d_buscos))
         if len(max_ind) > 1:
-            max_ind2 = self.get_max_ind(np.array(self.f_buscos)[max_ind])
+            max_ind2 = self.get_max_ind(np.array(self.f_percents)[max_ind]) if use_percent else self.get_max_ind(np.array(self.f_buscos)[max_ind])
             max_ind = max_ind[max_ind2]
             if len(max_ind) > 1:
                 if ((self.s_buscos[max_ind[0]] == 0.0)
@@ -170,10 +170,12 @@ class AutoSelectLineage:
         self.d_buscos = [runner.analysis.hmmer_runner.multi_copy for runner in runners]
         self.f_buscos = [runner.analysis.hmmer_runner.only_fragments for runner in runners]
         self.s_percents = [runner.analysis.hmmer_runner.s_percent for runner in runners]
+        self.d_percents = [runner.analysis.hmmer_runner.d_percent for runner in runners]
+        self.f_percents = [runner.analysis.hmmer_runner.f_percent for runner in runners]
         return
 
-    def get_best_match_lineage(self, runners):
-        max_ind = self.evaluate(runners)
+    def get_best_match_lineage(self, runners, use_percent=False):
+        max_ind = self.evaluate(runners, use_percent)
         self.selected_runner = runners[int(max_ind)]
         self.best_match_lineage_dataset = self.selected_runner.config.get("busco_run", "lineage_dataset")
         runners.pop(int(max_ind))
@@ -199,7 +201,8 @@ class AutoSelectLineage:
                 "Certain mollicute clades use a different genetic code to the rest of bacteria. They are not part "
                 "of the BUSCO placement tree and need to be tested separately. For more information, see the user "
                 "guide.")
-            self.check_mollicutes()
+            use_percent = self.selected_runner.mode == "proteins"
+            self.check_mollicutes(use_percent)
             if os.path.basename(self.selected_runner.config.get("busco_run", "lineage_dataset")).startswith("bacteria"):
                 logger.info("Bacteria domain is a better match than the mollicutes subclade. Continuing to tree placement.")
                 self.run_busco_placer()
@@ -223,10 +226,10 @@ class AutoSelectLineage:
     def set_best_match_lineage(self):
         AnalysisRunner.selected_dataset = os.path.basename(self.best_match_lineage_dataset)
 
-    def check_mollicutes(self):
+    def check_mollicutes(self, use_percent=False):
         runners = self.run_lineages_list(["mollicutes"])
         runners.append(self.selected_runner)
-        self.get_best_match_lineage(runners)
+        self.get_best_match_lineage(runners, use_percent=use_percent)
         return
 
     def run_busco_placer(self):  # todo: revisit structure of this method after cleaning BuscoPlacer


=====================================
src/busco/BuscoConfig.py
=====================================
@@ -36,6 +36,7 @@ class BaseConfig(ConfigParser):
         "use_augustus": False,
         "long": False,
         "batch_mode": False,
+        "tar": False,
     }
 
     DEPENDENCY_SECTIONS = {
@@ -79,6 +80,7 @@ class BaseConfig(ConfigParser):
         "limit",
         "use_augustus",
         "batch_mode",
+        "tar",
     ]
 
     def __init__(self):
@@ -682,16 +684,9 @@ class BuscoConfigMain(BuscoConfig, BaseConfig):
 
     def _check_value_constraints(self):
         """
-        Load default values into config if not provided in config file or on the command line.
         :return:
         """
-        # for param in list(type(self).DEFAULT_ARGS_VALUES.keys()):
-        #     try:
-        #         self.get("busco_run", param)
-        #     except NoOptionError:
-        #         self.set("busco_run", param, str(type(self).DEFAULT_ARGS_VALUES[param]))
 
-        # Set auto-lineage to True if either auto-lineage-prok or auto-lineage-euk is selected
         if self.getboolean("busco_run", "auto-lineage-prok") or self.getboolean(
             "busco_run", "auto-lineage-euk"
         ):


=====================================
src/busco/BuscoRunner.py
=====================================
@@ -28,6 +28,9 @@ logger = BuscoLogger.get_logger(__name__)
 
 
 class SingleRunner:
+
+    all_runners = []
+
     def __init__(self, config_manager):
         self.start_time = time.time()
         self.config_manager = config_manager
@@ -65,6 +68,7 @@ class SingleRunner:
         asl.set_best_match_lineage()
         lineage_dataset = asl.best_match_lineage_dataset
         runner = asl.selected_runner
+        type(self).all_runners.extend(asl.runners)
         asl.reset()
         return lineage_dataset, runner
 
@@ -121,10 +125,13 @@ class SingleRunner:
             else:
                 self.runner.run_analysis()
                 AnalysisRunner.selected_dataset = lineage_basename
+            type(self).all_runners.append(self.runner)
+
+            if self.config.getboolean("busco_run", "tar"):
+                self.compress_folders()
             self.runner.finish(time.time() - self.start_time)
 
-        except BuscoError as e:
-            self.log_error(e)
+        except BuscoError:
             raise
 
         except ToolException as e:
@@ -147,6 +154,34 @@ class SingleRunner:
             )
             raise BatchFatalError(e)
 
+    def compress_folders(self):
+        """Archive each runner's bulky output folders as .tar.gz and delete the originals.
+
+        Raises OSError if a folder cannot be archived or removed.
+        """
+        for runner in type(self).all_runners:
+            hmmer_runner = runner.analysis.hmmer_runner
+            folders_to_compress = [
+                hmmer_runner.single_copy_sequences_folder,
+                hmmer_runner.multi_copy_sequences_folder,
+                hmmer_runner.fragmented_sequences_folder,
+                hmmer_runner.output_folder,
+            ]
+            if self.config.getboolean("busco_run", "use_augustus"):
+                augustus_runner = runner.analysis.augustus_runner
+                # list.append() takes exactly one argument; extend() adds all four.
+                folders_to_compress.extend([
+                    augustus_runner.pred_genes_dir_initial,
+                    augustus_runner.pred_genes_dir_rerun,
+                    augustus_runner.gff_dir,
+                    runner.analysis.gff2gb_runner.gb_folder,
+                ])
+            # OSError from archiving or removal propagates to the caller unchanged.
+            for folder in folders_to_compress:
+                parent, name = os.path.dirname(folder), os.path.basename(folder)
+                shutil.make_archive(folder, "gztar", parent, name)
+                shutil.rmtree(folder)
+
 
 class BatchRunner:
 
@@ -276,6 +311,8 @@ class AnalysisRunner:
                 self.mode = "prok_tran"
             elif self.domain == "eukaryota":
                 self.mode = "euk_tran"
+            elif self.domain == "viruses":
+                self.mode = "prok_genome"  # Suggested by Mose - Prodigal may perform better on viruses than BLAST + HMMER.
             else:
                 raise BatchFatalError("Unrecognized mode {}".format(self.mode))
         analysis_type = type(self).mode_dict[self.mode]
@@ -505,6 +542,10 @@ class AnalysisRunner:
                     if not self.config.getboolean("busco_run", "auto-lineage"):
                         auto_lineage_line = "\nConsider using the auto-lineage mode to select a more specific lineage."
                         final_output_results.append(auto_lineage_line)
+                    with open(
+                        self.analysis.hmmer_runner.short_summary_file, "a"
+                    ) as short_summary_file:
+                        short_summary_file.write(positive_parasitic_line)
 
         except OSError:
             pass


=====================================
src/busco/_version.py
=====================================
@@ -6,4 +6,4 @@ Copyright (c) 2016-2021, Evgeny Zdobnov (ez at ezlab.org)
 Licensed under the MIT license. See LICENSE.md file.
 
 """
-__version__ = "5.2.1"
+__version__ = "5.2.2"


=====================================
src/busco/analysis/BuscoAnalysis.py
=====================================
@@ -119,6 +119,12 @@ class BuscoAnalysis(metaclass=ABCMeta):
         )
         if self.restart and self.hmmer_runner.check_previous_completed_run():
             logger.info("Skipping HMMER run as output already processed")
+        elif self.restart and os.path.exists(
+            "{}.tar.gz".format(self.hmmer_runner.output_folder)
+        ):
+            raise BuscoError(
+                "Restart mode incompatible with a previously compressed (--tar) run. Please decompress the HMMER results folder and try again."
+            )
         elif len(os.listdir(self.hmmer_runner.results_dir)) > 0:
             raise BuscoError(
                 "HMMER results directory not empty. If you are running in restart mode, make sure you are "
@@ -190,11 +196,14 @@ class BuscoAnalysis(metaclass=ABCMeta):
         self._check_dataset_integrity()
         if not os.stat(self.input_file).st_size > 0:
             raise BuscoError("Input file is empty.")
-        with open(self.input_file) as f:
-            for line in f:
-                if line.startswith(">"):
-                    self._check_fasta_header(line)
-                    self._check_seq_uniqueness(line)
+        try:
+            with open(self.input_file) as f:
+                for line in f:
+                    if line.startswith(">"):
+                        self._check_fasta_header(line)
+                        self._check_seq_uniqueness(line)
+        except UnicodeDecodeError as ude:
+            raise BuscoError(ude.msg)
         return
 
     def _check_seq_uniqueness(self, line):


=====================================
src/busco/analysis/GenomeAnalysis.py
=====================================
@@ -586,8 +586,10 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
                 ]
         return hmmer_result_dict_new, matched_genes_new
 
-    @log("Validating exons and removing overlapping matches", logger)
     def exons_to_df(self, records):
+        if self._mode == "genome":
+            logger.info("Validating exons and removing overlapping matches")
+
         df = self.metaeuk_runner.records_to_df(records)
         df["Start"] = df["Start"].astype(int)
         df["Stop"] = df["Stop"].astype(int)


=====================================
src/busco/busco_tools/Toolset.py
=====================================
@@ -69,7 +69,8 @@ class Job(Process):
                 try:
                     process = subprocess.run(
                         self.cmd_line,
-                        capture_output=True,  # stdout and stderr streams are stored and written to file after job completion
+                        stdout=subprocess.PIPE,  # stdout and stderr streams are stored and written to file after job completion
+                        stderr=subprocess.PIPE,
                         cwd=self.cwd,
                         shell=False,
                         timeout=self.timeout,


=====================================
src/busco/busco_tools/hmmer.py
=====================================
@@ -101,6 +101,7 @@ class HMMERRunner(BaseRunner):
         self.mode = mode
 
         self.is_fragment = {}
+        self.matched_genes_fragment = {}
 
         self.single_copy_buscos = {}
         self.multi_copy_buscos = {}


=====================================
src/busco/run_BUSCO.py
=====================================
@@ -152,7 +152,8 @@ def _parse_args():
         required=False,
         metavar="SEQUENCE_FILE",
         help="Input sequence file in FASTA format. "
-        "Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set.",
+        "Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set. "
+        "Also possible to use a path to a directory containing multiple input files.",
     )
 
     optional.add_argument(
@@ -385,6 +386,14 @@ def _parse_args():
         help="Continue a run that had already partially completed.",
     )
 
+    optional.add_argument(
+        "--tar",
+        dest="tar",
+        action="store_true",
+        required=False,
+        help="Compress some subdirectories with many files to save space",
+    )
+
     optional.add_argument(
         "--update-data",
         dest="update-data",


=====================================
tests/unittests/BuscoConfig_unittests.py
=====================================
@@ -43,6 +43,7 @@ class TestBuscoConfig(unittest.TestCase):
             "download_path": None,
             "update-data": False,
             "version": "==SUPPRESS==",
+            "tar": False,
         }
 
         self.test_params = {
@@ -80,6 +81,7 @@ class TestBuscoConfig(unittest.TestCase):
                 "limit",
                 "use_augustus",
                 "batch_mode",
+                "tar",
             ],
             "etraining": ["path", "command"],
             "gff2gbSmallDNA.pl": ["path", "command"],


=====================================
tests/unittests/run_BUSCO_unittests.py
=====================================
@@ -116,6 +116,7 @@ class TestParams(unittest.TestCase):
             "download_path": None,
             "update-data": False,
             "version": "==SUPPRESS==",
+            "tar": False,
         }
         self.assertDictEqual(params, correct_parse)
 
@@ -173,6 +174,7 @@ class TestParams(unittest.TestCase):
             "download_path": None,
             "update-data": False,
             "version": "==SUPPRESS==",
+            "tar": False,
         }
         self.assertDictEqual(params, correct_parse)
 
@@ -223,6 +225,7 @@ class TestParams(unittest.TestCase):
             "--augustus",
             "--update-data",
             "--offline",
+            "--tar",
         ]
         command_str = " ".join(
             [" ".join([key, str(value)]) for key, value in arg_values.items()]
@@ -261,6 +264,7 @@ class TestParams(unittest.TestCase):
             "quiet": True,
             "update-data": True,
             "version": "==SUPPRESS==",
+            "tar": True,
         }
         self.assertDictEqual(params, correct_parse)
 



View it on GitLab: https://salsa.debian.org/med-team/busco/-/compare/f74f6e63cd60bc848648ddfe8e0878d152e7685b...d72823781030d21c44d37fbb82e024b717d2742b

-- 
View it on GitLab: https://salsa.debian.org/med-team/busco/-/compare/f74f6e63cd60bc848648ddfe8e0878d152e7685b...d72823781030d21c44d37fbb82e024b717d2742b
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210725/3743474c/attachment-0001.htm>


More information about the debian-med-commit mailing list