[med-svn] [Git][med-team/busco][upstream] New upstream version 5.5.0

Andreas Tille (@tille) gitlab at salsa.debian.org
Mon Sep 11 11:32:18 BST 2023



Andreas Tille pushed to branch upstream at Debian Med / busco


Commits:
25d070d5 by Andreas Tille at 2023-09-11T12:27:42+02:00
New upstream version 5.5.0
- - - - -


15 changed files:

- CHANGELOG
- src/busco/AutoLineage.py
- src/busco/BuscoConfig.py
- src/busco/BuscoRunner.py
- src/busco/_version.py
- src/busco/analysis/BuscoAnalysis.py
- src/busco/analysis/GenomeAnalysis.py
- src/busco/busco_tools/augustus.py
- src/busco/busco_tools/base.py
- src/busco/busco_tools/hmmer.py
- src/busco/busco_tools/metaeuk.py
- + src/busco/busco_tools/miniprot.py
- src/busco/run_BUSCO.py
- tests/unittests/BuscoConfig_unittests.py
- tests/unittests/run_BUSCO_unittests.py


Changes:

=====================================
CHANGELOG
=====================================
@@ -1,3 +1,6 @@
+5.5.0
+- Add miniprot pipeline (beta test)
+
 5.4.7
 - Fix bug in overlap handling (Issue #653): this fix also updated the way negative strand coordinates are reported,
 i.e. <gene_id>:<start>-<stop> instead of <gene_id>:<low>-<high>


=====================================
src/busco/AutoLineage.py
=====================================
@@ -256,6 +256,10 @@ class AutoSelectLineage:
                     protein_seqs_dir = self.selected_runner.analysis.augustus_runner.extracted_prot_dir
                     protein_seqs = [os.path.join(protein_seqs_dir, f) for f in os.listdir(protein_seqs_dir)
                                     if f.split(".")[-2] == "faa"]
+                elif self.config.getboolean("busco_run", "use_miniprot"):
+                    protein_seqs_dir = self.selected_runner.analysis.miniprot_align_runner.translated_proteins_folder
+                    protein_seqs = [os.path.join(protein_seqs_dir, f) for f in os.listdir(protein_seqs_dir)
+                                    if f.endswith("faa")]
                 else:
                     protein_seqs = self.selected_runner.analysis.metaeuk_runner.combined_pred_protein_seqs
         elif "tran" in self.selected_runner.mode:


=====================================
src/busco/BuscoConfig.py
=====================================
@@ -32,6 +32,7 @@ class BaseConfig(ConfigParser, metaclass=ABCMeta):
         "auto-lineage-euk": False,
         "update-data": False,
         "use_augustus": False,
+        "use_miniprot": False,
         "batch_mode": False,
         "tar": False,
     }
@@ -93,6 +94,7 @@ class BaseConfig(ConfigParser, metaclass=ABCMeta):
         "tar",
         "download_base_url",
         "use_augustus",
+        "use_miniprot",
         "download_base_url",
     }
 
@@ -122,6 +124,7 @@ class BaseConfig(ConfigParser, metaclass=ABCMeta):
         "evalue",
         "limit",
         "use_augustus",
+        "use_miniprot",
         "batch_mode",
         "tar",
         "contig_break",
@@ -212,6 +215,8 @@ class BaseConfig(ConfigParser, metaclass=ABCMeta):
             elif domain == "eukaryota":
                 if self.getboolean("busco_run", "use_augustus"):
                     mode = "euk_genome_aug"
+                elif self.getboolean("busco_run", "use_miniprot"):
+                    mode = "euk_genome_min"
                 else:
                     mode = "euk_genome_met"
             else:
@@ -237,7 +242,6 @@ class BaseConfig(ConfigParser, metaclass=ABCMeta):
             self.specific_params = type(self).AUGUSTUS_ARGS
             self.specific_params.update(type(self).BLAST_ARGS)
             self.specific_params.update(type(self).BBTOOLS_ARGS)
-
         elif mode == "euk_genome_met":
             self.specific_params = type(self).METAEUK_ARGS
             self.specific_params.update(type(self).BBTOOLS_ARGS)
@@ -245,7 +249,7 @@ class BaseConfig(ConfigParser, metaclass=ABCMeta):
             self.specific_params = type(self).BLAST_ARGS
         elif mode == "euk_tran":
             self.specific_params = type(self).METAEUK_ARGS
-        elif mode == "prok_genome":
+        elif mode in ["prok_genome", "euk_genome_min"]:
             self.specific_params = type(self).BBTOOLS_ARGS
         else:
             self.specific_params = {}


=====================================
src/busco/BuscoRunner.py
=====================================
@@ -3,6 +3,7 @@ from busco.analysis.BuscoAnalysis import BuscoAnalysis
 from busco.analysis.GenomeAnalysis import (
     GenomeAnalysisEukaryotesAugustus,
     GenomeAnalysisEukaryotesMetaeuk,
+    GenomeAnalysisEukaryotesMiniprot,
 )
 from busco.analysis.TranscriptomeAnalysis import (
     TranscriptomeAnalysisProkaryotes,
@@ -331,6 +332,7 @@ class BatchRunner:
 class AnalysisRunner:
 
     mode_dict = {
+        "euk_genome_min": GenomeAnalysisEukaryotesMiniprot,
         "euk_genome_met": GenomeAnalysisEukaryotesMetaeuk,
         "euk_genome_aug": GenomeAnalysisEukaryotesAugustus,
         "prok_genome": GenomeAnalysisProkaryotes,
@@ -441,6 +443,8 @@ class AnalysisRunner:
                 gene_predictor = "prodigal"
             elif self.config.getboolean("busco_run", "use_augustus"):
                 gene_predictor = "augustus"
+            elif self.config.getboolean("busco_run", "use_miniprot"):
+                gene_predictor = "miniprot"
             else:
                 gene_predictor = "metaeuk"
             self.summary["parameters"]["gene_predictor"] = gene_predictor
@@ -894,15 +898,23 @@ class AnalysisRunner:
 
 class SmartBox:
     def __init__(self):
-        self.width = None
-
-    def wrap_header(self, header_text):
-        if len(header_text) < 80:
-            self.width = max(50, len(header_text.expandtabs()))
+        self.width = 50
+
+    def define_width(self, header_text, body_text):
+        lines = body_text.split("\n")
+        lens = [len(x) for x in lines]
+        max_ind = lens.index(max(lens))
+        longest_line = lines[max_ind]
+        if len(header_text) > len(longest_line):
+            longest_line = header_text
+        if len(longest_line) < 80:
+            self.width = max(50, len(longest_line.expandtabs()))
         else:
             self.width = 50
-            header_text = self.wrap_long_line(header_text)
 
+    def wrap_header(self, header_text):
+        if len(header_text) > 80:
+            header_text = self.wrap_long_line(header_text)
         return header_text
 
     def wrap_long_line(self, line):
@@ -955,7 +967,8 @@ class SmartBox:
         return "-" * self.width
 
     def create_results_box(self, header_text, body_text):
-        header = self.wrap_header(header_text)  # Called first to define width
+        self.define_width(header_text, body_text)  # Called first to define width
+        header = self.wrap_header(header_text)
         box_lines = list(["\n"])
         box_lines.append("\t{}".format(self.add_horizontal()))
         framed_header = self.add_vertical(header)


=====================================
src/busco/_version.py
=====================================
@@ -6,4 +6,4 @@ Copyright (c) 2016-2023, Evgeny Zdobnov (ez at ezlab.org)
 Licensed under the MIT license. See LICENSE.md file.
 
 """
-__version__ = "5.4.7"
+__version__ = "5.5.0"


=====================================
src/busco/analysis/BuscoAnalysis.py
=====================================
@@ -132,14 +132,23 @@ class BuscoAnalysis(metaclass=ABCMeta):
             self.hmmer_runner.run()
         self.hmmer_runner.process_output()
         self.validate_output()
-        self.hmmer_runner.filter()
-        self.hmmer_runner.consolidate_busco_lists()
+        self.filter_results()
+        self.consolidate_busco_lists()
         output = self.hmmer_runner.create_output_content()
         self.hmmer_runner.write_hmmer_results(output)
-        self.hmmer_runner.record_results()
+        self.record_results()
         self.hmmer_runner.produce_hmmer_summary()
         return
 
+    def record_results(self):
+        self.hmmer_runner.record_results()
+
+    def filter_results(self):
+        self.hmmer_runner.filter()
+
+    def consolidate_busco_lists(self):
+        self.hmmer_runner.consolidate_busco_lists()
+
     def validate_output(
         self,
     ):  # Transparent method that can be overwritten by child classes


=====================================
src/busco/analysis/GenomeAnalysis.py
=====================================
@@ -14,6 +14,7 @@ from busco.analysis.BuscoAnalysis import BuscoAnalysis
 from busco.analysis.Analysis import NucleotideAnalysis, BLASTAnalysis
 from busco.busco_tools.prodigal import ProdigalRunner
 from busco.busco_tools.metaeuk import MetaeukRunner
+from busco.busco_tools.miniprot import MiniprotIndexRunner, MiniprotAlignRunner
 from busco.busco_tools.bbtools import BBToolsRunner
 from busco.busco_tools.augustus import (
     AugustusRunner,
@@ -34,6 +35,9 @@ import pandas as pd
 from collections import defaultdict
 import subprocess
 from busco.Exceptions import BuscoError
+from multiprocessing import Pool
+from itertools import repeat, chain
+import numpy as np
 
 logger = BuscoLogger.get_logger(__name__)
 
@@ -366,7 +370,6 @@ class GenomeAnalysisEukaryotesAugustus(BLASTAnalysis, GenomeAnalysisEukaryotes):
         self.optimize_augustus_runner.run()
         return
 
-
 class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
     def __init__(self):
         super().__init__()
@@ -793,8 +796,8 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
             end_trim2 = 0
 
         if (
-            hmmer_match_details1[gene_id1]["score"]
-            > hmmer_match_details2[gene_id2]["score"]
+            hmmer_match_details1[gene_id1][0]["score"]  # todo: perhaps revisit - assuming only one match per gene
+            > hmmer_match_details2[gene_id2][0]["score"]
         ):
             priority_match = hmmer_match_details1
             secondary_match = hmmer_match_details2
@@ -814,8 +817,8 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
             priority_gene_trim = (start_trim2, end_trim2)
             secondary_gene_trim = (start_trim1, end_trim1)
 
-        priority_env_coords = iter(priority_match[priority_gene_id]["env_coords"])
-        secondary_env_coords = iter(secondary_match[secondary_gene_id]["env_coords"])
+        priority_env_coords = iter(priority_match[priority_gene_id][i]["env_coords"][0] for i in range(len(priority_match[priority_gene_id])))
+        secondary_env_coords = iter(secondary_match[secondary_gene_id][i]["env_coords"][0] for i in range(len(secondary_match[secondary_gene_id])))
         priority_used_exons, priority_unused_exons = self.find_unused_exons(
             priority_env_coords, priority_exons, priority_gene_trim
         )
@@ -988,3 +991,223 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
         except OSError:
             pass
         super().cleanup()
+
+
+class GenomeAnalysisEukaryotesMiniprot(GenomeAnalysisEukaryotes):
+    def __init__(self):
+        super().__init__()
+        self.miniprot_index_runner = None
+        self.miniprot_align_runner = None
+        self.gene_details = {}
+        self.gene_update_mapping = defaultdict(dict)
+        self.cpus = int(self.config.get("busco_run", "cpu"))
+        self.filtered_records = defaultdict(list)
+        self.filtered_busco_hits = []
+
+    def init_tools(self):
+        super().init_tools()
+        self.miniprot_index_runner = MiniprotIndexRunner()
+        self.miniprot_align_runner = MiniprotAlignRunner()
+
+    def reset(self):
+        super().reset()
+        self.miniprot_index_runner.reset()
+        self.miniprot_align_runner.reset()
+
+    def run_analysis(self):
+        """This function calls all needed steps for running the analysis."""
+        super().run_analysis()
+        incomplete_buscos = None
+        try:
+            self.run_miniprot(incomplete_buscos)
+            self.gene_details.update(self.miniprot_align_runner.gene_details)
+            self.sequences_aa.update(self.miniprot_align_runner.sequences_aa)
+            self.hmmer_runner.miniprot_pipeline = True
+            self.run_hmmer(
+                self.miniprot_align_runner.output_sequences,
+                busco_ids=incomplete_buscos
+            )
+        except NoRerunFile:
+            raise NoGenesError("Miniprot")
+
+        self.hmmer_runner.write_buscos_to_file(self.sequences_aa)#, self.sequences_nt)
+
+    def run_miniprot(self, incomplete_buscos):
+        self.miniprot_index_runner.configure_runner()
+        if self.restart and self.miniprot_index_runner.check_previous_completed_run():
+            logger.info("Skipping Miniprot indexing run as already run")
+        else:
+            self.restart = False
+            self.config.set("busco_run", "restart", str(self.restart))
+            self.miniprot_index_runner.run()
+
+        self.miniprot_align_runner.configure_runner(incomplete_buscos)
+        if self.restart and self.miniprot_align_runner.check_previous_completed_run():
+            logger.info("Skipping Miniprot aligning run as already run")
+        else:
+            self.restart = False
+            self.config.set("busco_run", "restart", str(self.restart))
+            self.miniprot_align_runner.run()
+        self.miniprot_align_runner.parse_output()
+        self.miniprot_align_runner.write_protein_sequences_per_busco()
+
+    def filter_results(self):
+        for record in self.gene_details:
+            if record in self.hmmer_runner.hmmer_records:
+                self.filtered_records[record].append(self.gene_details[record])
+                self.filtered_records[record][-1].update({"length": sum(region["hmm_len"] for region in self.hmmer_runner.hmmer_records[record]),
+                                                      "bitscore": self.filtered_records[record][-1]["score"]})
+                self.filtered_busco_hits.append(record.split("|")[0].split("_")[0])
+        return
+
+    def apply_label2(self, gene_match1, gene_match2, min_complete):
+        protein_length1 = self.gene_details[gene_match1]["protein_length"]
+        protein_length2 = self.gene_details[gene_match2]["protein_length"]
+
+        output1 = self.apply_label1([gene_match1], min_complete, by_rate=True)
+        label_length = {output1: protein_length1}
+        output2 = self.apply_label1([gene_match2], min_complete, by_rate=True)
+        label_length.update({output2: protein_length2})
+
+        if list(label_length.keys()) == ["Single"]:
+            gene_label = "Single"
+        elif list(label_length.keys()) == ["Fragmented"]:
+            gene_label = "Fragmented"
+        elif list(label_length.keys()) == ["Duplicated"]:
+            gene_label = "Duplicated"
+        elif set(label_length.keys()) == {"Single", "Fragmented"}:
+            if label_length["Fragmented"] > label_length["Single"] + (1.5):
+                gene_label = "Fragmented"
+            else:
+                gene_label = "Single"
+        elif set(label_length.keys()) == {"Single", "Duplicated"}:
+            if label_length["Duplicated"] > label_length["Single"] + (1.5):
+                gene_label = "Duplicated"
+            else:
+                gene_label = "Single"
+        elif set(label_length.keys()) == {"Fragmented", "Duplicated"}:
+            if label_length["Fragmented"] > label_length["Duplicated"] + (1.5):
+                gene_label = "Fragmented"
+            else:
+                gene_label = "Duplicated"
+        else:
+            gene_label = "Duplicated"
+        return gene_label
+
+    def apply_label1(self, gene_matches, min_complete, by_rate=False):
+        if len(gene_matches) == 0:
+            gene_label = "Missing"
+        elif len(gene_matches) == 1:
+            if (self.gene_details[gene_matches[0]]["gene_end"] - self.gene_details[gene_matches[0]]["gene_start"])/(self.gene_details[gene_matches[0]]["protein_length"]**int(by_rate)) >= min_complete:
+                gene_label = "Single"
+            else:
+                gene_label = "Fragmented"
+        else:
+            complete_regions = []
+            fragmented_regions = []
+            for gene_match in gene_matches:
+                details = self.gene_details[gene_match]
+                if details["gene_end"] - details["gene_start"] >= min_complete:
+                    complete_regions.append(
+                        (gene_match, details["gene_start"], details["gene_end"]))
+                else:
+                    fragmented_regions.append(
+                        (gene_match, details["gene_start"], details["gene_end"]))
+            if len(complete_regions) == 0:
+                gene_label = "Fragmented"
+            elif len(complete_regions) == 1:
+                gene_label = "Single"
+            else:
+                ctgs = [x[0] for x in complete_regions]
+                if len(set(ctgs)) > 1:
+                    gene_label = "Duplicated"
+                else:
+                    regions = [(x[1], x[2]) for x in complete_regions]
+                    clusters = self.get_region_clusters(regions)
+                    if len(clusters) == 1:
+                        gene_label = "Single"
+                    else:
+                        gene_label = "Duplicated"
+
+        return gene_label
+
+    def determine_busco_label(self, busco_id, gene_matches, ixl, min_complete):
+        gene_label = ""
+        if len(gene_matches) == 0:
+            gene_label = "Missing"
+        elif len(gene_matches) == 1:
+            gene_label = self.apply_label1(gene_matches, min_complete)
+        else:
+            if (ixl[0] - ixl[1])/(ixl[1] + 1e-9) >= 0.2:
+                gene_label = self.apply_label1(gene_matches, min_complete)
+            else:
+                if gene_matches[0] == gene_matches[1]:
+                    gene_label = self.apply_label1(gene_matches, min_complete)
+                else:
+                    gene_label = self.apply_label2(gene_matches[0], gene_matches[1], min_complete)
+        return gene_label
+
+    def consolidate_busco_lists(self):
+        self.single_copy_buscos = defaultdict(lambda: defaultdict(list))
+        self.multi_copy_buscos = defaultdict(lambda: defaultdict(list))
+        self.fragmented_copy_buscos = defaultdict(lambda: defaultdict(list))
+        self.missing_buscos = []
+        self.hmmer_runner.load_buscos()
+        for busco_id in set(self.filtered_busco_hits):
+            min_complete = self.hmmer_runner.cutoff_dict[busco_id]["length"] - 2 * \
+                           self.hmmer_runner.cutoff_dict[busco_id]["sigma"]
+            gene_matches = np.array([x for x in self.filtered_records if x.startswith(busco_id)])
+            ixl = np.array([(self.gene_details[g]["gene_end"] - self.gene_details[g]["gene_start"])*self.gene_details[g]["identity"] for g in gene_matches])
+            sort_order = np.argsort(ixl)[::-1]
+            gene_matches = gene_matches[sort_order]
+            ixl = ixl[sort_order]
+            if len(gene_matches) > 0:
+                output = self.determine_busco_label(busco_id, gene_matches, ixl, min_complete)
+                for gene_match in gene_matches:
+                    gene_id = gene_match#.split("|")[-1]
+                    if output == "Single":
+                        self.single_copy_buscos[busco_id][gene_id].append(self.gene_details[gene_match])
+                    elif output == "Fragmented":
+                        self.fragmented_copy_buscos[busco_id][gene_id].append(self.gene_details[gene_match])
+                    elif output == "Duplicated":
+                        self.multi_copy_buscos[busco_id][gene_id].append(self.gene_details[gene_match])
+            else:
+                self.missing_buscos.append(busco_id)
+        self.hmmer_runner.single_copy_buscos = self.convert_dict(self.single_copy_buscos)
+        self.hmmer_runner.multi_copy_buscos = self.convert_dict(self.multi_copy_buscos)
+        self.hmmer_runner.fragmented_buscos = self.convert_dict(self.fragmented_copy_buscos)
+        self.hmmer_runner.missing_buscos = self.missing_buscos
+
+    @staticmethod
+    def convert_dict(busco_dict):
+        new_dict = {}
+        for key, value in busco_dict.items():
+            new_dict[key] = dict(value)
+        return new_dict
+
+    @staticmethod
+    def get_region_clusters(regions):
+        sorted_regions = sorted(regions, key=lambda x: x[0], reverse=False)
+        clusters = []
+        for (start, stop) in sorted_regions:
+            if not clusters:
+                clusters.append([start, stop])
+            else:
+                last_cluster = clusters[-1]
+                if last_cluster[0] <= start <= last_cluster[1]:
+                    # has overlap
+                    clusters[-1][0] = min(last_cluster[0], start)
+                    clusters[-1][1] = max(last_cluster[1], stop)
+                else:
+                    clusters.append([start, stop])
+        return clusters
+
+    def record_results(self):
+        self.hmmer_runner.record_results(frameshifts=True)
+
+    def cleanup(self):
+        # try:
+        #     self.metaeuk_runner.remove_tmp_files()
+        # except OSError:
+        #     pass
+        super().cleanup()


=====================================
src/busco/busco_tools/augustus.py
=====================================
@@ -259,7 +259,7 @@ class AugustusRunner(BaseRunner):
         return ordered_jobs
 
     def generate_job_args(self):
-        contig_ordinal_inds = defaultdict(int)
+        contig_nominal_inds = defaultdict(int)
         njobs = 0
 
         ordered_jobs = self.sort_jobs()
@@ -272,8 +272,8 @@ class AugustusRunner(BaseRunner):
             contig_tmp_file = "{}.temp".format(
                 contig_name[:100]
             )  # Avoid very long filenames
-            contig_ordinal_inds[busco_group] += 1
-            output_index = contig_ordinal_inds[busco_group]
+            contig_nominal_inds[busco_group] += 1
+            output_index = contig_nominal_inds[busco_group]
             out_filename = os.path.join(
                 self.pred_genes_dir, "{}.out.{}".format(busco_group, output_index)
             )


=====================================
src/busco/busco_tools/base.py
=====================================
@@ -6,6 +6,7 @@ from abc import ABCMeta, abstractmethod
 from busco.BuscoConfig import BuscoConfigAuto
 from busco.Exceptions import BuscoError
 import time
+import gzip
 
 logger = BuscoLogger.get_logger(__name__)
 
@@ -157,6 +158,24 @@ class BaseRunner(Tool, metaclass=ABCMeta):
     def generate_job_args(self):
         pass
 
+    @staticmethod
+    def decompress_refseq_file(gzip_file):  # todo: probably doesn't belong in this class as it is only applicable to metaeuk and miniprot
+        unzipped_filename = gzip_file.split(".gz")[0]
+        if not os.path.exists(unzipped_filename):
+            with gzip.open(gzip_file, "rb") as compressed_file:
+                with open(unzipped_filename, "wb") as decompressed_file:
+                    for line in compressed_file:
+                        decompressed_file.write(line)
+        if os.path.exists(gzip_file):
+            try:
+                os.remove(gzip_file)
+            except OSError:
+                logger.warning(
+                    "Unable to remove compressed refseq file in dataset download"
+                )
+                pass
+        return unzipped_filename
+
     @property
     @abstractmethod
     def output_folder(self):


=====================================
src/busco/busco_tools/hmmer.py
=====================================
@@ -96,6 +96,8 @@ class HMMERRunner(BaseRunner):
         self._already_used_genes = None
         self.missing_buscos = None
 
+        self.miniprot_pipeline = False
+
     def configure_runner(self, input_sequences, busco_ids, mode, gene_details):
         super().configure_runner()
         self.run_number += 1
@@ -255,8 +257,11 @@ class HMMERRunner(BaseRunner):
         merged_dict = defaultdict(lambda: defaultdict(list))
         for hmmer_dict in [self.is_complete, self.is_very_large, self.is_fragment]:
             for busco_id, busco_matches in hmmer_dict.items():
-                merged_dict[busco_id].update(busco_matches)
-        return merged_dict
+                for gene_id, matches in busco_matches.items():
+                    merged_dict[busco_id][gene_id].extend(matches)
+        # for busco_id in merged_dict.keys():
+        #     merged_dict[busco_id] = dict(merged_dict[busco_id])  # convert from defaultdict to dict
+        return dict(merged_dict)
 
     def parse_hmmer_output(self, filename, busco_query):
         """
@@ -268,7 +273,9 @@ class HMMERRunner(BaseRunner):
         :return: Dictionary of (gene_id, total_matched_length) pairs
         :rtype: dict
         """
-        records = defaultdict(dict)
+        records = defaultdict(list)
+        top_hit = None
+        matched_genes = []
 
         with open(filename, "r") as f:
 
@@ -283,27 +290,35 @@ class HMMERRunner(BaseRunner):
                         tlen = int(line[2])
                         bit_score = float(line[7])
 
+                        if self.miniprot_pipeline and top_hit and top_hit != gene_id:  # only load the top result for efficiency
+                            if self._check_overlap(matched_genes, gene_id.split("|", maxsplit=1)[-1]):
+                                continue
+
                         # Extract frame information (present in transcriptome mode)
                         frame = str(line[-1]) if "frame" in str(line[-1]) else None
 
                         # Store bitscore matches for each gene match. If match below cutoff, discard.
                         if bit_score < float(self.cutoff_dict[busco_query]["score"]):
                             continue
-                        if gene_id not in records:
-                            records[gene_id] = {
-                                "tlen": tlen,
-                                "hmm_len": 0,
-                                "env_coords": [],
-                                "score": bit_score,
-                                "frame": frame,
-                            }
+                        records[gene_id].append({
+                            "tlen": tlen,
+                            "hmm_len": 0,
+                            "env_coords": [],
+                            "score": bit_score,
+                            "frame": frame,
+                        })
                         hmm_start = int(line[15])
                         hmm_end = int(line[16])
                         env_start = int(line[19])
                         env_end = int(line[20])
-                        records[gene_id]["hmm_len"] += hmm_end - hmm_start
-                        records[gene_id]["env_coords"].append((env_start, env_end))
-
+                        records[gene_id][-1]["hmm_len"] += hmm_end - hmm_start
+                        records[gene_id][-1]["env_coords"].append((env_start, env_end))
+                        if self.miniprot_pipeline:
+                            hit_busco_seq, hit_gene = gene_id.split("|", maxsplit=1)
+                            if hit_gene not in matched_genes:
+                                matched_genes.append(hit_gene)
+                            if not top_hit:
+                                top_hit = gene_id
                     except IndexError as e:
                         logger.error(
                             "Cannot parse HMMER output file {}".format(filename)
@@ -311,6 +326,22 @@ class HMMERRunner(BaseRunner):
                         raise BuscoError(e)
         return records
 
+    @staticmethod
+    def _check_overlap(matched_genes, gene2):
+        overlaps = []
+        coords2 = gene2.split(":")[-1]
+        for gene1 in matched_genes:
+            coords1 = gene1.split(":")[-1]
+            start1, end1 = coords1.split("-")
+            start2, end2 = coords2.split("-")
+            if int(end2) - int(start2) > int(end1) - int(start1):
+                start1, end1, start2, end2 = start2, end2, start1, end1
+            if int(start1) <= int(start2) <= int(end1) or int(start1) <= int(end2) <= int(end1):
+                overlaps.append(True)
+            else:
+                overlaps.append(False)
+        return any(overlaps)
+
     def _sort_matches(self, matched_record, busco_query):
         """
         The HMMER gene matches are sorted into "complete", "v_large" and "fragmented" matches based on a comparison
@@ -332,8 +363,8 @@ class HMMERRunner(BaseRunner):
 
         # Determine whether matched gene represents a complete, very_large or fragment of a BUSCO
         for gene_id, record in matched_record.items():
-            size = record["hmm_len"]
-            frame = record["frame"]
+            size = sum([record[i]["hmm_len"] for i in range(len(record))])
+            frame = record[0]["frame"]
 
             # Kind of like a z-score, but it is compared with a cutoff value, not a mean
             zeta = (self.cutoff_dict[busco_query]["length"] - size) / self.cutoff_dict[
@@ -353,7 +384,7 @@ class HMMERRunner(BaseRunner):
 
             # Add information about match to dict
             busco_type[gene_id].append(
-                dict({"bitscore": record["score"], "length": size, "frame": frame, "orig gene ID": gene_id})
+                dict({"bitscore": record[0]["score"], "length": size, "frame": frame, "orig gene ID": gene_id})
             )
             # Reference which busco_queries are associated with each gene match
             match_type[gene_id].append(busco_query)
@@ -367,7 +398,7 @@ class HMMERRunner(BaseRunner):
             matched_genes_fragment,
         )
 
-    def process_output(self):
+    def process_output(self, gene_id_lookup=None):
         """
         Load all gene matches from HMMER output and sort into dictionaries depending on match quality
         (complete, v_large, fragment).
@@ -392,11 +423,22 @@ class HMMERRunner(BaseRunner):
             raise ValueError(
                 "HMMER should not be run more than twice in the same Run instance."
             )
-
+        self.gene_id_lookup = gene_id_lookup
         with Pool(self.cpus) as job_pool:
             hmmer_records = job_pool.map(
                 self.load_results_from_file, hmmer_results_files
             )
+        if self.miniprot_pipeline:
+            self.unpack_hmmer_records_miniprot(hmmer_records)
+        else:
+            self.unpack_hmmer_records_default(hmmer_records)
+
+    def unpack_hmmer_records_miniprot(self, hmmer_records):
+        self.hmmer_records = {}
+        for record in hmmer_records:
+            self.hmmer_records.update(record)
+
+    def unpack_hmmer_records_default(self, hmmer_records):
 
         self.is_complete = defaultdict(
             lambda: defaultdict(list), self.is_complete
@@ -459,24 +501,27 @@ class HMMERRunner(BaseRunner):
     def load_results_from_file(self, filename):
         busco_query = str(os.path.basename(filename).split(".")[0])
         matched_record = self.parse_hmmer_output(filename, busco_query)
-        filtered_records = self.remove_overlaps(matched_record)
-        (
-            busco_complete,
-            busco_vlarge,
-            busco_fragment,
-            matched_genes_complete,
-            matched_genes_vlarge,
-            matched_genes_fragment,
-        ) = self._sort_matches(filtered_records, busco_query)
-        return (
-            busco_query,
-            busco_complete,
-            busco_vlarge,
-            busco_fragment,
-            matched_genes_complete,
-            matched_genes_vlarge,
-            matched_genes_fragment,
-        )
+        if self.miniprot_pipeline:
+            return matched_record
+        else:
+            filtered_records = self.remove_overlaps(matched_record)
+            (
+                busco_complete,
+                busco_vlarge,
+                busco_fragment,
+                matched_genes_complete,
+                matched_genes_vlarge,
+                matched_genes_fragment,
+            ) = self._sort_matches(filtered_records, busco_query)
+            return (
+                busco_query,
+                busco_complete,
+                busco_vlarge,
+                busco_fragment,
+                matched_genes_complete,
+                matched_genes_vlarge,
+                matched_genes_fragment,
+            )
 
     def remove_overlaps(self, matched_records):
         seq_ids = []
@@ -484,9 +529,16 @@ class HMMERRunner(BaseRunner):
         high_coords = []
         scores = []
         strands = []
+        record_ids = []
         try:
             for record in matched_records:
-                seq_id, coords = record.split(":")
+                record_ids.append(record)
+                if self.gene_id_lookup is not None:
+                    gene_id = self.gene_id_lookup[int(record)]
+                else:
+                    gene_id = record
+                seq_id, coords = gene_id.split(":")
+                coords = coords.split("_")[0]
                 start_coord, stop_coord = map(int, coords.split("-"))
                 low_coord = min(start_coord, stop_coord)
                 high_coord = max(start_coord, stop_coord)
@@ -498,7 +550,7 @@ class HMMERRunner(BaseRunner):
                 low_coords.append(low_coord)
                 high_coords.append(high_coord)
                 strands.append(strand)
-                scores.append(matched_records[record]["score"])
+                scores.append(matched_records[record][0]["score"])  # multiple matches for same record have the same score
         except ValueError:  # for protein sequences there is no ":<coords>" suffix, so skip the overlap filtering
             return matched_records
 
@@ -509,10 +561,12 @@ class HMMERRunner(BaseRunner):
                 "High coord": high_coords,
                 "Score": scores,
                 "Strand": strands,
+                "Record ID": record_ids,
             }
         )
         results_grouped = records_df.groupby("Sequence")
         entries_to_remove = []
+        record_ids_to_remove = set()
         seq_ids = results_grouped.groups.keys()
         for seq in seq_ids:
             match_finder = self.get_matches(results_grouped, seq)
@@ -530,6 +584,7 @@ class HMMERRunner(BaseRunner):
                         ind_to_remove = idx2
                     else:
                         ind_to_remove = idx1
+                    record_ids_to_remove.add(g1_sorted.loc[ind_to_remove]["Record ID"])
                     record_to_remove = g1_sorted.loc[ind_to_remove]
                     record_start_coord, record_stop_coord = (
                         record_to_remove["Low coord"],
@@ -547,7 +602,7 @@ class HMMERRunner(BaseRunner):
                     )
 
         filtered_records = {
-            i: matched_records[i] for i in matched_records if i not in entries_to_remove
+            i: matched_records[i] for i in matched_records if i not in record_ids_to_remove
         }
 
         return filtered_records
@@ -843,13 +898,16 @@ class HMMERRunner(BaseRunner):
                                 )
                             )
                     elif self.mode == "genome":
-                        scaffold = self.gene_details[gene_id][m]
+                        try:
+                            scaffold = self.gene_details[gene_id][0]
+                        except KeyError:
+                            scaffold = match
                         if self.domain == "eukaryota":
                             location_pattern = ":{}-{}".format(
                                 scaffold["gene_start"], scaffold["gene_end"]
                             )
-                            if gene_id.endswith(location_pattern):
-                                gene_id = gene_id.replace(location_pattern, "")
+                            # if gene_id.endswith(location_pattern):
+                            #     gene_id = gene_id.replace(location_pattern, "")
                         else:  # Remove suffix assigned by Prodigal
                             gene_id = gene_id.rsplit("_", 1)[0]
                         try:
@@ -912,7 +970,7 @@ class HMMERRunner(BaseRunner):
         for busco_group in self.cutoff_dict:
             if not any(
                 busco_group in d
-                for d in [self.is_complete, self.is_very_large, self.is_fragment]
+                for d in [self.single_copy_buscos, self.multi_copy_buscos, self.fragmented_buscos]
             ):
                 output_lines.append("{}\tMissing\n".format(busco_group))
                 self.missing_buscos.append(busco_group)
@@ -1051,26 +1109,31 @@ class HMMERRunner(BaseRunner):
         return sorted_lines
 
     def produce_hmmer_summary(self):
+        frameshift_pattern = "(incl. {} with frameshifts)"
 
         self.hmmer_results_lines.append("***** Results: *****\n\n")
         self.hmmer_results_lines.append(self.one_line_summary_raw)
         self.hmmer_results_lines.append(
-            "{}\tComplete BUSCOs (C)\t\t\t{}\n".format(
-                self.single_copy + self.multi_copy, "   "
+            "{}\tComplete BUSCOs (C)\t{}\t\t{}\n".format(
+                self.single_copy + self.multi_copy, frameshift_pattern.format(self.c_frameshifts) if self.c_frameshifts > 0 else "",
+                "   "
             )
         )
         self.hmmer_results_lines.append(
-            "{}\tComplete and single-copy BUSCOs (S)\t{}\n".format(
-                self.single_copy, "   "
+            "{}\tComplete and single-copy BUSCOs (S)\t{}{}\n".format(
+                self.single_copy, frameshift_pattern.format(self.s_frameshifts) if self.s_frameshifts > 0 else "",
+                "   "
             )
         )
         self.hmmer_results_lines.append(
-            "{}\tComplete and duplicated BUSCOs (D)\t{}\n".format(
-                self.multi_copy, "   "
+            "{}\tComplete and duplicated BUSCOs (D)\t{}{}\n".format(
+                self.multi_copy, frameshift_pattern.format(self.d_frameshifts) if self.d_frameshifts > 0 else "",
+                "   "
             )
         )
         self.hmmer_results_lines.append(
-            "{}\tFragmented BUSCOs (F)\t\t\t{}\n".format(self.only_fragments, "   ")
+            "{}\tFragmented BUSCOs (F)\t{}\t\t{}\n".format(self.only_fragments, frameshift_pattern.format(self.f_frameshifts) if self.f_frameshifts > 0 else "",
+                                                           "   ")
         )
         self.hmmer_results_lines.append(
             "{}\tMissing BUSCOs (M)\t\t\t{}\n".format(
@@ -1097,7 +1160,7 @@ class HMMERRunner(BaseRunner):
 
         return
 
-    def record_results(self):
+    def record_results(self, frameshifts=False):
         self._get_busco_percentages()
         self.one_line_summary_raw = "C:{}%[S:{}%,D:{}%],F:{}%,M:{}%,n:{}\t{}\n".format(
             self.complete_percent,
@@ -1108,6 +1171,25 @@ class HMMERRunner(BaseRunner):
             self.total_buscos,
             "   ",
         )
+        if frameshifts:
+            self.s_frameshifts = 0
+            for x in self.single_copy_buscos.values():
+                for g, details in x.items():
+                    self.s_frameshifts += bool(int(details[0]["frameshift_events"]))  # just add one for each gene_id containing a frameshift
+            self.d_frameshifts = 0
+            for x in self.multi_copy_buscos.values():
+                for g, details in x.items():
+                    self.d_frameshifts += bool(int(details[0]["frameshift_events"]))
+            self.f_frameshifts = 0
+            for x in self.fragmented_buscos.values():
+                for g, details in x.items():
+                    self.f_frameshifts += bool(int(details[0]["frameshift_events"]))
+            self.c_frameshifts = self.s_frameshifts + self.d_frameshifts
+        else:
+            self.s_frameshifts = 0
+            self.d_frameshifts = 0
+            self.f_frameshifts = 0
+            self.c_frameshifts = 0
         self.one_line_summary = "Results:\t{}".format(self.one_line_summary_raw)
 
     @log("{}", logger, attr_name="hmmer_results_lines", apply="join", on_func_exit=True)


=====================================
src/busco/busco_tools/metaeuk.py
=====================================
@@ -6,7 +6,6 @@ from Bio import SeqIO
 import shutil
 from configparser import NoOptionError
 import subprocess
-import gzip
 import pandas as pd
 import numpy as np
 import re
@@ -252,24 +251,6 @@ class MetaeukRunner(BaseRunner):
         self.pred_protein_mod_files.append(self.pred_protein_seqs_modified)
         self.codon_mod_files.append(self.codon_file_modified)
 
-    @staticmethod
-    def decompress_refseq_file(gzip_file):
-        unzipped_filename = gzip_file.split(".gz")[0]
-        if not os.path.exists(unzipped_filename):
-            with gzip.open(gzip_file, "rb") as compressed_file:
-                with open(unzipped_filename, "wb") as decompressed_file:
-                    for line in compressed_file:
-                        decompressed_file.write(line)
-        if os.path.exists(gzip_file):
-            try:
-                os.remove(gzip_file)
-            except OSError:
-                logger.warning(
-                    "Unable to remove compressed refseq file in dataset download"
-                )
-                pass
-        return unzipped_filename
-
     def combine_run_results(self):
         with open(self.combined_pred_protein_seqs, "w") as combined_output:
             for run_result in self.pred_protein_mod_files:


=====================================
src/busco/busco_tools/miniprot.py
=====================================
@@ -0,0 +1,303 @@
+from busco.busco_tools.base import BaseRunner
+from busco.BuscoLogger import BuscoLogger
+import subprocess
+import os
+from pathlib import Path
+import re
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+from collections import defaultdict
+import shutil
+import numpy as np
+import re
+
+logger = BuscoLogger.get_logger(__name__)
class MiniprotRunner(BaseRunner):
    """Shared scaffolding for the miniprot pipeline steps.

    Creates the miniprot output directory tree and exposes the index-file
    path; the concrete indexing/alignment jobs are configured by subclasses
    (``MiniprotIndexRunner`` / ``MiniprotAlignRunner``).
    """

    name = "miniprot"
    cmd = "miniprot"

    def __init__(self):
        super().__init__()
        self._output_folder = os.path.join(self.run_folder, "miniprot_output")
        self.translated_proteins_folder = os.path.join(
            self._output_folder, "translated_proteins"
        )
        self.create_dirs([self._output_folder, self.translated_proteins_folder])
        self.index_file = os.path.join(self._output_folder, "ref.mpi")
        # Filled in later by the subclasses' configure_runner().
        self.refseq_db = None
        self.incomplete_buscos = None
        self._output_basename = None

    def configure_runner(self, *args):
        """Configure the runner and advance the run counter."""
        super().configure_runner(*args)
        self.run_number += 1

    def check_tool_dependencies(self):
        """miniprot has no additional tool dependencies to verify."""

    def configure_job(self, *args):
        """Overridden by child classes."""
        return None

    def generate_job_args(self):
        yield None

    def get_version(self):
        """Return the version string reported by ``miniprot --version``."""
        raw_output = subprocess.check_output(
            [self.cmd, "--version"], stderr=subprocess.STDOUT, shell=False
        )
        return raw_output.decode("utf-8").strip()

    @property
    def output_folder(self):
        return self._output_folder

    def reset(self):
        super().reset()

    def run(self):
        """Run a single miniprot job."""
        super().run()
        self.total = 1
        self.run_jobs()
+
class MiniprotIndexRunner(MiniprotRunner):
    """Builds the miniprot genome index (``miniprot -d ref.mpi <genome>``)."""

    name = "miniprot_index"

    def generate_job_args(self):
        yield "index"

    def configure_job(self, *args):
        """Assemble the indexing command line and return the job."""
        job = self.create_job()
        for parameter in ("-t", str(self.cpus), "-d", self.index_file, self.input_file):
            job.add_parameter(parameter)
        return job
+
+
class MiniprotAlignRunner(MiniprotRunner):
    """Aligns the lineage refseq proteins to the genome with miniprot and
    parses the resulting GFF output.

    After ``parse_output`` the instance holds, keyed by the gene id
    ``"<target>|<contig>:<start>-<end>"``:
      - ``gene_details``: coordinates, strand, score, CIGAR and frameshift info
      - ``sequences_aa``: translated protein sequence (from the ``##STA`` line)
      - ``busco_matches``: BUSCO id -> set of matching gene ids
      - ``exon_coords``: structured numpy array of CDS features per gene
    """

    name = "miniprot_align"

    def __init__(self):
        super().__init__()
        self.output_gff = None

        self.gene_details = defaultdict(dict)
        self.sequences_aa = {}
        self.busco_matches = defaultdict(set)
        self.gene_matches = defaultdict(list)
        self.combined_pred_protein_seqs = os.path.join(
            self._output_folder, "combined_pred_proteins.fas"
        )
        self.output_sequences = []
        self.gene_nominal = 0
        self.gene_lookup = {}
        self.cigar_lookup = {}
        self.nominal_lookup = defaultdict(list)

    def generate_job_args(self):
        yield "align"

    def configure_job(self, *args):
        """Assemble the alignment command line.

        ``--trans`` emits translated protein (##STA) lines and ``--gff``
        selects GFF output with embedded ##PAF records; ``--outs 0.95``
        restricts reported alignments (see the miniprot manual for exact
        semantics of these thresholds).
        """
        miniprot_job = self.create_job()
        miniprot_job.add_parameter("--trans")
        miniprot_job.add_parameter("-u")
        miniprot_job.add_parameter("-I")
        miniprot_job.add_parameter("--outs")
        miniprot_job.add_parameter("0.95")
        miniprot_job.add_parameter("-t")
        miniprot_job.add_parameter(str(self.cpus))
        miniprot_job.add_parameter("--gff")
        miniprot_job.add_parameter(self.index_file)
        miniprot_job.add_parameter(self.refseq_db)
        return miniprot_job

    def configure_runner(self, incomplete_buscos=None):
        """Set log paths, decompress the lineage refseq DB and derive the
        output GFF filename.

        :param incomplete_buscos: BUSCO ids still to search; stored on the
            instance but not otherwise used here.
        """
        super().configure_runner([])
        self.logfile_path_out = os.path.join(
            self.config.get("busco_run", "main_out"),
            "logs",
            "{}_{}_out.log".format(self.name, os.path.basename(self.lineage_dataset)),
        )
        self.logfile_path_err = (
            self.logfile_path_out.rpartition("_out.log")[0] + "_err.log"
        )

        self.incomplete_buscos = incomplete_buscos
        self._output_basename = os.path.join(
            self._output_folder, os.path.basename(self.input_file)
        )
        gzip_refseq = os.path.join(self.lineage_dataset, "refseq_db.faa.gz")
        self.refseq_db = self.decompress_refseq_file(gzip_refseq)
        self.output_gff = Path(self._output_folder).joinpath(
            "{}_{}{}".format(
                Path(self.input_file).stem,
                os.path.basename(self.lineage_dataset),
                ".gff",
            )
        )

    def create_symlink(self):
        """Expose the captured stdout log (the raw miniprot output) under the
        expected ``.gff`` name."""
        if not self.output_gff.exists():
            Path(self.output_gff).symlink_to(self.logfile_path_out)
        return

    def parse_output(self):
        """Parse the miniprot GFF output.

        Each alignment block starts with a ``##PAF`` line, immediately
        followed by a ``##STA`` line holding the translated protein, then
        plain GFF feature lines (mRNA/CDS) for that alignment.
        """
        self.create_symlink()
        self.ata_seq = ""
        self.target_id = ""
        self.contig_id = ""
        self.contig_start = 0
        self.contig_end = 0
        self.strand = ""
        self.score = 0
        self.exon_coords = defaultdict(list)
        self.cigar_seq = ""
        paf_block_started = False
        gene_id = ""

        with open(self.output_gff, "r") as gff:
            for line in gff:
                if line.startswith("##PAF"):
                    paf_block_started = True
                    fields = line.strip().split("\t")[1:]
                    if fields[5] == "*":
                        continue  # unmapped protein
                    self.target_id = fields[0]
                    # Target names look like "<buscoid>_<seq>"; keep the BUSCO id.
                    busco_id = self.target_id.split("_")[0]
                    self.protein_length = int(fields[1])
                    self.protein_start = int(fields[2])
                    self.protein_end = int(fields[3])
                    self.strand = fields[4]
                    self.contig_id = fields[5]
                    self.contig_start = int(fields[7])
                    self.contig_end = int(fields[8])
                    gene_id = "{}|{}:{}-{}".format(
                        self.target_id, self.contig_id, self.contig_start, self.contig_end
                    )
                    # fields[13]/fields[17] hold <tag>:<type>:<value> entries
                    # (alignment score and CIGAR) — positions assumed fixed in
                    # miniprot's ##PAF output; confirm against miniprot docs.
                    self.score = int(fields[13].strip().split(":")[2])
                    self.cigar_seq = str(fields[17].strip().split(":")[2])
                    (
                        part_lengths,
                        exon_lengths,
                        match_lengths,
                        group_types,
                        ngroups,
                        nexons,
                        frameshifts,
                        frameshift_events,
                        frameshift_lengths,
                    ) = self.decode_cigar(self.cigar_seq)
                    sta_line = gff.readline()  # the ##STA line follows each ##PAF
                    sta_seq = sta_line.strip().split("\t")[1]
                    # Strip stop codons. NOTE: the previous re.sub("\*", ...)
                    # used an invalid escape in a non-raw string (SyntaxWarning
                    # on Python >= 3.12); a plain replace is equivalent.
                    self.ata_seq = sta_seq.upper().replace("*", "")

                    self.busco_matches[busco_id].add(gene_id)

                    self.gene_details[gene_id] = {
                        "gene_start": self.contig_start,
                        "gene_end": self.contig_end,
                        "strand": self.strand,
                        "score": self.score,
                        "cigar": self.cigar_seq,
                        "nexons": nexons,
                        "frameshift_events": frameshift_events,
                        "protein_start": self.protein_start,
                        "protein_end": self.protein_end,
                        "protein_length": self.protein_length,
                    }

                    self.sequences_aa[gene_id] = SeqRecord(
                        Seq(self.ata_seq), id=gene_id, description=gene_id
                    )

                elif paf_block_started:
                    fields = line.strip().split("\t")
                    if fields[2] == "CDS":
                        start, stop, score, strand = (
                            fields[3],
                            fields[4],
                            fields[5],
                            fields[6],
                        )
                        self.exon_coords[gene_id].append((start, stop, score, strand))
                    if fields[2] == "mRNA":
                        # GFF attribute column: "key=value;key=value;...".
                        # split("=", 1) keeps values that themselves contain "=".
                        info_dict = dict(
                            v.split("=", 1) for v in fields[8].split()[0].split(";")
                        )
                        identity = float(info_dict["Identity"])
                        self.gene_details[gene_id].update({"identity": identity})
        # Convert the accumulated CDS tuples into structured arrays for
        # downstream numeric processing.
        for item in self.exon_coords:
            self.exon_coords[item] = np.array(
                self.exon_coords[item],
                dtype=[("start", "i4"), ("stop", "i4"), ("score", "f4"), ("strand", "U1")],
            )
        return

    @staticmethod
    def decode_cigar(cigar):
        """Decode a miniprot CIGAR string into length/frameshift statistics.

        Operators handled: M/I/D (codon-level match/insertion/deletion),
        F/G (frameshifts), N (intron), U/V (intron splitting a codon) —
        see the miniprot manual for the exact operator semantics.

        A frameshift is only counted as an event when it is flanked by at
        least 20 matched residues on both sides within the same exon.

        Returns a 9-tuple: (part_lengths, exon_lengths, match_lengths,
        group_types, ngroups, nexons, frameshifts, frameshift_events,
        frameshift_lengths).
        """
        frameshifts = []
        frameshift_events = 0
        frameshift_lengths = 0
        pattern = r"[0-9]+[MIDFGNUV]"
        parts = list(re.finditer(pattern, cigar))
        part_lengths = []
        exon_lengths = []
        exon_length = 0
        match_lengths = {"M": 0, "I": 0, "D": 0, "F": 0, "G": 0, "N": 0, "U": 0, "V": 0}
        group_types = {"M": 0, "I": 0, "D": 0, "F": 0, "G": 0, "N": 0, "U": 0, "V": 0}
        ngroups = 0
        nexons = 0
        for p, part in enumerate(parts):
            ngroups += 1
            # "op" instead of "type" to avoid shadowing the builtin.
            n, op = int(part.group(0)[:-1]), part.group(0)[-1]
            match_lengths[op] += n
            group_types[op] += 1
            if op in ("M", "D"):
                exon_length += n
            elif op in ("U", "V"):
                # Intron with split codon: close the current exon and account
                # for the single split residue.
                part_lengths.append(exon_length)
                exon_lengths.append(exon_length)
                nexons += 1
                part_lengths.append(1)
                exon_length = 0
            elif op == "N":
                # Plain intron: close the current exon.
                part_lengths.append(exon_length)
                exon_lengths.append(exon_length)
                nexons += 1
                exon_length = 0
            elif op in ("F", "G"):
                # Count matched residues left of the frameshift, stopping at
                # the nearest exon boundary.
                q = p - 1
                left_match_cnt = 0
                while q >= 0:
                    part2 = parts[q]
                    n2, op2 = int(part2.group(0)[:-1]), part2.group(0)[-1]
                    if op2 == "M":
                        left_match_cnt += n2
                    elif op2 in ("N", "U", "V"):
                        break
                    q -= 1
                # Same, to the right.
                q = p + 1
                right_match_cnt = 0
                while q < len(parts):
                    part2 = parts[q]
                    n2, op2 = int(part2.group(0)[:-1]), part2.group(0)[-1]
                    if op2 == "M":
                        right_match_cnt += n2
                    elif op2 in ("N", "U", "V"):
                        break
                    q += 1
                if left_match_cnt >= 20 and right_match_cnt >= 20:
                    frameshifts.append(str(n) + op)
                    frameshift_events += 1
                    frameshift_lengths += int(n)

        # Close the final (or only) exon.
        part_lengths.append(exon_length)
        exon_lengths.append(exon_length)
        nexons += 1
        return (
            part_lengths,
            exon_lengths,
            match_lengths,
            group_types,
            ngroups,
            nexons,
            frameshifts,
            frameshift_events,
            frameshift_lengths,
        )

    def write_protein_sequences_per_busco(self):
        """Write one FASTA file per matched BUSCO id containing the translated
        protein sequences of all its candidate genes."""
        for busco_id in self.busco_matches:
            seqs_to_write = []
            output_filename = os.path.join(
                self.translated_proteins_folder, "{}.faa".format(busco_id)
            )
            self.output_sequences.append(output_filename)
            with open(output_filename, "w") as f:
                for g in self.busco_matches[busco_id]:
                    if g in self.sequences_aa:
                        seqs_to_write.append(self.sequences_aa[g])
                SeqIO.write(seqs_to_write, f, "fasta")


=====================================
src/busco/run_BUSCO.py
=====================================
@@ -329,6 +329,14 @@ def _parse_args():
         "single string with no white space, with each argument separated by a comma.",
     )
 
+    optional.add_argument(
+        "--miniprot",
+        dest="use_miniprot",
+        action="store_true",
+        required=False,
+        help="Use miniprot gene predictor for eukaryote runs",
+    )
+
     optional.add_argument(
         "--offline",
         dest="offline",


=====================================
tests/unittests/BuscoConfig_unittests.py
=====================================
@@ -35,6 +35,7 @@ class TestBuscoConfig(unittest.TestCase):
             "metaeuk_parameters": None,
             "metaeuk_rerun_parameters": None,
             "use_augustus": False,
+            "use_miniprot": False,
             "augustus_parameters": None,
             "augustus_species": None,
             "long": False,
@@ -80,6 +81,7 @@ class TestBuscoConfig(unittest.TestCase):
                 "evalue",
                 "limit",
                 "use_augustus",
+                "use_miniprot",
                 "batch_mode",
                 "tar",
                 "contig_break",
@@ -181,6 +183,7 @@ class TestBuscoConfig(unittest.TestCase):
             "restart": False,
             "update-data": False,
             "use_augustus": False,
+            "use_miniprot": False,
         }
         config = BuscoConfig.BuscoConfigMain(
             self.base_config, {"lineage_dataset": "test"}


=====================================
tests/unittests/run_BUSCO_unittests.py
=====================================
@@ -108,6 +108,7 @@ class TestParams(unittest.TestCase):
             "metaeuk_parameters": None,
             "metaeuk_rerun_parameters": None,
             "use_augustus": False,
+            "use_miniprot": False,
             "augustus_parameters": None,
             "augustus_species": None,
             "long": False,
@@ -169,6 +170,7 @@ class TestParams(unittest.TestCase):
             "metaeuk_parameters": None,
             "metaeuk_rerun_parameters": None,
             "use_augustus": False,
+            "use_miniprot": False,
             "augustus_parameters": None,
             "augustus_species": None,
             "long": False,
@@ -259,6 +261,7 @@ class TestParams(unittest.TestCase):
             "force": True,
             "restart": True,
             "use_augustus": True,
+            "use_miniprot": False,
             "help": "==SUPPRESS==",
             "in": input_file,
             "limit": limit,



View it on GitLab: https://salsa.debian.org/med-team/busco/-/commit/25d070d5cb838817f874882bf9f6770926cdbdb0

-- 
View it on GitLab: https://salsa.debian.org/med-team/busco/-/commit/25d070d5cb838817f874882bf9f6770926cdbdb0
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230911/bc041e00/attachment-0001.htm>


More information about the debian-med-commit mailing list