[med-svn] [Git][med-team/busco][upstream] New upstream version 5.5.0
Andreas Tille (@tille)
gitlab at salsa.debian.org
Mon Sep 11 11:32:18 BST 2023
Andreas Tille pushed to branch upstream at Debian Med / busco
Commits:
25d070d5 by Andreas Tille at 2023-09-11T12:27:42+02:00
New upstream version 5.5.0
- - - - -
15 changed files:
- CHANGELOG
- src/busco/AutoLineage.py
- src/busco/BuscoConfig.py
- src/busco/BuscoRunner.py
- src/busco/_version.py
- src/busco/analysis/BuscoAnalysis.py
- src/busco/analysis/GenomeAnalysis.py
- src/busco/busco_tools/augustus.py
- src/busco/busco_tools/base.py
- src/busco/busco_tools/hmmer.py
- src/busco/busco_tools/metaeuk.py
- + src/busco/busco_tools/miniprot.py
- src/busco/run_BUSCO.py
- tests/unittests/BuscoConfig_unittests.py
- tests/unittests/run_BUSCO_unittests.py
Changes:
=====================================
CHANGELOG
=====================================
@@ -1,3 +1,6 @@
+5.5.0
+- Add miniprot pipeline (beta test)
+
5.4.7
- Fix bug in overlap handling (Issue #653): this fix also updated the way negative strand coordinates are reported,
i.e. <gene_id>:<start>-<stop> instead of <gene_id>:<low>-<high>
=====================================
src/busco/AutoLineage.py
=====================================
@@ -256,6 +256,10 @@ class AutoSelectLineage:
protein_seqs_dir = self.selected_runner.analysis.augustus_runner.extracted_prot_dir
protein_seqs = [os.path.join(protein_seqs_dir, f) for f in os.listdir(protein_seqs_dir)
if f.split(".")[-2] == "faa"]
+ elif self.config.getboolean("busco_run", "use_miniprot"):
+ protein_seqs_dir = self.selected_runner.analysis.miniprot_align_runner.translated_proteins_folder
+ protein_seqs = [os.path.join(protein_seqs_dir, f) for f in os.listdir(protein_seqs_dir)
+ if f.endswith("faa")]
else:
protein_seqs = self.selected_runner.analysis.metaeuk_runner.combined_pred_protein_seqs
elif "tran" in self.selected_runner.mode:
=====================================
src/busco/BuscoConfig.py
=====================================
@@ -32,6 +32,7 @@ class BaseConfig(ConfigParser, metaclass=ABCMeta):
"auto-lineage-euk": False,
"update-data": False,
"use_augustus": False,
+ "use_miniprot": False,
"batch_mode": False,
"tar": False,
}
@@ -93,6 +94,7 @@ class BaseConfig(ConfigParser, metaclass=ABCMeta):
"tar",
"download_base_url",
"use_augustus",
+ "use_miniprot",
"download_base_url",
}
@@ -122,6 +124,7 @@ class BaseConfig(ConfigParser, metaclass=ABCMeta):
"evalue",
"limit",
"use_augustus",
+ "use_miniprot",
"batch_mode",
"tar",
"contig_break",
@@ -212,6 +215,8 @@ class BaseConfig(ConfigParser, metaclass=ABCMeta):
elif domain == "eukaryota":
if self.getboolean("busco_run", "use_augustus"):
mode = "euk_genome_aug"
+ elif self.getboolean("busco_run", "use_miniprot"):
+ mode = "euk_genome_min"
else:
mode = "euk_genome_met"
else:
@@ -237,7 +242,6 @@ class BaseConfig(ConfigParser, metaclass=ABCMeta):
self.specific_params = type(self).AUGUSTUS_ARGS
self.specific_params.update(type(self).BLAST_ARGS)
self.specific_params.update(type(self).BBTOOLS_ARGS)
-
elif mode == "euk_genome_met":
self.specific_params = type(self).METAEUK_ARGS
self.specific_params.update(type(self).BBTOOLS_ARGS)
@@ -245,7 +249,7 @@ class BaseConfig(ConfigParser, metaclass=ABCMeta):
self.specific_params = type(self).BLAST_ARGS
elif mode == "euk_tran":
self.specific_params = type(self).METAEUK_ARGS
- elif mode == "prok_genome":
+ elif mode in ["prok_genome", "euk_genome_min"]:
self.specific_params = type(self).BBTOOLS_ARGS
else:
self.specific_params = {}
=====================================
src/busco/BuscoRunner.py
=====================================
@@ -3,6 +3,7 @@ from busco.analysis.BuscoAnalysis import BuscoAnalysis
from busco.analysis.GenomeAnalysis import (
GenomeAnalysisEukaryotesAugustus,
GenomeAnalysisEukaryotesMetaeuk,
+ GenomeAnalysisEukaryotesMiniprot,
)
from busco.analysis.TranscriptomeAnalysis import (
TranscriptomeAnalysisProkaryotes,
@@ -331,6 +332,7 @@ class BatchRunner:
class AnalysisRunner:
mode_dict = {
+ "euk_genome_min": GenomeAnalysisEukaryotesMiniprot,
"euk_genome_met": GenomeAnalysisEukaryotesMetaeuk,
"euk_genome_aug": GenomeAnalysisEukaryotesAugustus,
"prok_genome": GenomeAnalysisProkaryotes,
@@ -441,6 +443,8 @@ class AnalysisRunner:
gene_predictor = "prodigal"
elif self.config.getboolean("busco_run", "use_augustus"):
gene_predictor = "augustus"
+ elif self.config.getboolean("busco_run", "use_miniprot"):
+ gene_predictor = "miniprot"
else:
gene_predictor = "metaeuk"
self.summary["parameters"]["gene_predictor"] = gene_predictor
@@ -894,15 +898,23 @@ class AnalysisRunner:
class SmartBox:
def __init__(self):
- self.width = None
-
- def wrap_header(self, header_text):
- if len(header_text) < 80:
- self.width = max(50, len(header_text.expandtabs()))
+ self.width = 50
+
+ def define_width(self, header_text, body_text):
+ lines = body_text.split("\n")
+ lens = [len(x) for x in lines]
+ max_ind = lens.index(max(lens))
+ longest_line = lines[max_ind]
+ if len(header_text) > len(longest_line):
+ longest_line = header_text
+ if len(longest_line) < 80:
+ self.width = max(50, len(longest_line.expandtabs()))
else:
self.width = 50
- header_text = self.wrap_long_line(header_text)
+ def wrap_header(self, header_text):
+ if len(header_text) > 80:
+ header_text = self.wrap_long_line(header_text)
return header_text
def wrap_long_line(self, line):
@@ -955,7 +967,8 @@ class SmartBox:
return "-" * self.width
def create_results_box(self, header_text, body_text):
- header = self.wrap_header(header_text) # Called first to define width
+ self.define_width(header_text, body_text) # Called first to define width
+ header = self.wrap_header(header_text)
box_lines = list(["\n"])
box_lines.append("\t{}".format(self.add_horizontal()))
framed_header = self.add_vertical(header)
=====================================
src/busco/_version.py
=====================================
@@ -6,4 +6,4 @@ Copyright (c) 2016-2023, Evgeny Zdobnov (ez at ezlab.org)
Licensed under the MIT license. See LICENSE.md file.
"""
-__version__ = "5.4.7"
+__version__ = "5.5.0"
=====================================
src/busco/analysis/BuscoAnalysis.py
=====================================
@@ -132,14 +132,23 @@ class BuscoAnalysis(metaclass=ABCMeta):
self.hmmer_runner.run()
self.hmmer_runner.process_output()
self.validate_output()
- self.hmmer_runner.filter()
- self.hmmer_runner.consolidate_busco_lists()
+ self.filter_results()
+ self.consolidate_busco_lists()
output = self.hmmer_runner.create_output_content()
self.hmmer_runner.write_hmmer_results(output)
- self.hmmer_runner.record_results()
+ self.record_results()
self.hmmer_runner.produce_hmmer_summary()
return
+ def record_results(self):
+ self.hmmer_runner.record_results()
+
+ def filter_results(self):
+ self.hmmer_runner.filter()
+
+ def consolidate_busco_lists(self):
+ self.hmmer_runner.consolidate_busco_lists()
+
def validate_output(
self,
): # Transparent method that can be overwritten by child classes
=====================================
src/busco/analysis/GenomeAnalysis.py
=====================================
@@ -14,6 +14,7 @@ from busco.analysis.BuscoAnalysis import BuscoAnalysis
from busco.analysis.Analysis import NucleotideAnalysis, BLASTAnalysis
from busco.busco_tools.prodigal import ProdigalRunner
from busco.busco_tools.metaeuk import MetaeukRunner
+from busco.busco_tools.miniprot import MiniprotIndexRunner, MiniprotAlignRunner
from busco.busco_tools.bbtools import BBToolsRunner
from busco.busco_tools.augustus import (
AugustusRunner,
@@ -34,6 +35,9 @@ import pandas as pd
from collections import defaultdict
import subprocess
from busco.Exceptions import BuscoError
+from multiprocessing import Pool
+from itertools import repeat, chain
+import numpy as np
logger = BuscoLogger.get_logger(__name__)
@@ -366,7 +370,6 @@ class GenomeAnalysisEukaryotesAugustus(BLASTAnalysis, GenomeAnalysisEukaryotes):
self.optimize_augustus_runner.run()
return
-
class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
def __init__(self):
super().__init__()
@@ -793,8 +796,8 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
end_trim2 = 0
if (
- hmmer_match_details1[gene_id1]["score"]
- > hmmer_match_details2[gene_id2]["score"]
+ hmmer_match_details1[gene_id1][0]["score"] # todo: perhaps revisit - assuming only one match per gene
+ > hmmer_match_details2[gene_id2][0]["score"]
):
priority_match = hmmer_match_details1
secondary_match = hmmer_match_details2
@@ -814,8 +817,8 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
priority_gene_trim = (start_trim2, end_trim2)
secondary_gene_trim = (start_trim1, end_trim1)
- priority_env_coords = iter(priority_match[priority_gene_id]["env_coords"])
- secondary_env_coords = iter(secondary_match[secondary_gene_id]["env_coords"])
+ priority_env_coords = iter(priority_match[priority_gene_id][i]["env_coords"][0] for i in range(len(priority_match[priority_gene_id])))
+ secondary_env_coords = iter(secondary_match[secondary_gene_id][i]["env_coords"][0] for i in range(len(secondary_match[secondary_gene_id])))
priority_used_exons, priority_unused_exons = self.find_unused_exons(
priority_env_coords, priority_exons, priority_gene_trim
)
@@ -988,3 +991,223 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
except OSError:
pass
super().cleanup()
+
+
+class GenomeAnalysisEukaryotesMiniprot(GenomeAnalysisEukaryotes):
+ def __init__(self):
+ super().__init__()
+ self.miniprot_index_runner = None
+ self.miniprot_align_runner = None
+ self.gene_details = {}
+ self.gene_update_mapping = defaultdict(dict)
+ self.cpus = int(self.config.get("busco_run", "cpu"))
+ self.filtered_records = defaultdict(list)
+ self.filtered_busco_hits = []
+
+ def init_tools(self):
+ super().init_tools()
+ self.miniprot_index_runner = MiniprotIndexRunner()
+ self.miniprot_align_runner = MiniprotAlignRunner()
+
+ def reset(self):
+ super().reset()
+ self.miniprot_index_runner.reset()
+ self.miniprot_align_runner.reset()
+
+ def run_analysis(self):
+ """This function calls all needed steps for running the analysis."""
+ super().run_analysis()
+ incomplete_buscos = None
+ try:
+ self.run_miniprot(incomplete_buscos)
+ self.gene_details.update(self.miniprot_align_runner.gene_details)
+ self.sequences_aa.update(self.miniprot_align_runner.sequences_aa)
+ self.hmmer_runner.miniprot_pipeline = True
+ self.run_hmmer(
+ self.miniprot_align_runner.output_sequences,
+ busco_ids=incomplete_buscos
+ )
+ except NoRerunFile:
+ raise NoGenesError("Miniprot")
+
+ self.hmmer_runner.write_buscos_to_file(self.sequences_aa)#, self.sequences_nt)
+
+ def run_miniprot(self, incomplete_buscos):
+ self.miniprot_index_runner.configure_runner()
+ if self.restart and self.miniprot_index_runner.check_previous_completed_run():
+ logger.info("Skipping Miniprot indexing run as already run")
+ else:
+ self.restart = False
+ self.config.set("busco_run", "restart", str(self.restart))
+ self.miniprot_index_runner.run()
+
+ self.miniprot_align_runner.configure_runner(incomplete_buscos)
+ if self.restart and self.miniprot_align_runner.check_previous_completed_run():
+ logger.info("Skipping Miniprot aligning run as already run")
+ else:
+ self.restart = False
+ self.config.set("busco_run", "restart", str(self.restart))
+ self.miniprot_align_runner.run()
+ self.miniprot_align_runner.parse_output()
+ self.miniprot_align_runner.write_protein_sequences_per_busco()
+
+ def filter_results(self):
+ for record in self.gene_details:
+ if record in self.hmmer_runner.hmmer_records:
+ self.filtered_records[record].append(self.gene_details[record])
+ self.filtered_records[record][-1].update({"length": sum(region["hmm_len"] for region in self.hmmer_runner.hmmer_records[record]),
+ "bitscore": self.filtered_records[record][-1]["score"]})
+ self.filtered_busco_hits.append(record.split("|")[0].split("_")[0])
+ return
+
+ def apply_label2(self, gene_match1, gene_match2, min_complete):
+ protein_length1 = self.gene_details[gene_match1]["protein_length"]
+ protein_length2 = self.gene_details[gene_match2]["protein_length"]
+
+ output1 = self.apply_label1([gene_match1], min_complete, by_rate=True)
+ label_length = {output1: protein_length1}
+ output2 = self.apply_label1([gene_match2], min_complete, by_rate=True)
+ label_length.update({output2: protein_length2})
+
+ if list(label_length.keys()) == ["Single"]:
+ gene_label = "Single"
+ elif list(label_length.keys()) == ["Fragmented"]:
+ gene_label = "Fragmented"
+ elif list(label_length.keys()) == ["Duplicated"]:
+ gene_label = "Duplicated"
+ elif set(label_length.keys()) == {"Single", "Fragmented"}:
+ if label_length["Fragmented"] > label_length["Single"] + (1.5):
+ gene_label = "Fragmented"
+ else:
+ gene_label = "Single"
+ elif set(label_length.keys()) == {"Single", "Duplicated"}:
+ if label_length["Duplicated"] > label_length["Single"] + (1.5):
+ gene_label = "Duplicated"
+ else:
+ gene_label = "Single"
+ elif set(label_length.keys()) == {"Fragmented", "Duplicated"}:
+ if label_length["Fragmented"] > label_length["Duplicated"] + (1.5):
+ gene_label = "Fragmented"
+ else:
+ gene_label = "Duplicated"
+ else:
+ gene_label = "Duplicated"
+ return gene_label
+
+ def apply_label1(self, gene_matches, min_complete, by_rate=False):
+ if len(gene_matches) == 0:
+ gene_label = "Missing"
+ elif len(gene_matches) == 1:
+ if (self.gene_details[gene_matches[0]]["gene_end"] - self.gene_details[gene_matches[0]]["gene_start"])/(self.gene_details[gene_matches[0]]["protein_length"]**int(by_rate)) >= min_complete:
+ gene_label = "Single"
+ else:
+ gene_label = "Fragmented"
+ else:
+ complete_regions = []
+ fragmented_regions = []
+ for gene_match in gene_matches:
+ details = self.gene_details[gene_match]
+ if details["gene_end"] - details["gene_start"] >= min_complete:
+ complete_regions.append(
+ (gene_match, details["gene_start"], details["gene_end"]))
+ else:
+ fragmented_regions.append(
+ (gene_match, details["gene_start"], details["gene_end"]))
+ if len(complete_regions) == 0:
+ gene_label = "Fragmented"
+ elif len(complete_regions) == 1:
+ gene_label = "Single"
+ else:
+ ctgs = [x[0] for x in complete_regions]
+ if len(set(ctgs)) > 1:
+ gene_label = "Duplicated"
+ else:
+ regions = [(x[1], x[2]) for x in complete_regions]
+ clusters = self.get_region_clusters(regions)
+ if len(clusters) == 1:
+ gene_label = "Single"
+ else:
+ gene_label = "Duplicated"
+
+ return gene_label
+
+ def determine_busco_label(self, busco_id, gene_matches, ixl, min_complete):
+ gene_label = ""
+ if len(gene_matches) == 0:
+ gene_label = "Missing"
+ elif len(gene_matches) == 1:
+ gene_label = self.apply_label1(gene_matches, min_complete)
+ else:
+ if (ixl[0] - ixl[1])/(ixl[1] + 1e-9) >= 0.2:
+ gene_label = self.apply_label1(gene_matches, min_complete)
+ else:
+ if gene_matches[0] == gene_matches[1]:
+ gene_label = self.apply_label1(gene_matches, min_complete)
+ else:
+ gene_label = self.apply_label2(gene_matches[0], gene_matches[1], min_complete)
+ return gene_label
+
+ def consolidate_busco_lists(self):
+ self.single_copy_buscos = defaultdict(lambda: defaultdict(list))
+ self.multi_copy_buscos = defaultdict(lambda: defaultdict(list))
+ self.fragmented_copy_buscos = defaultdict(lambda: defaultdict(list))
+ self.missing_buscos = []
+ self.hmmer_runner.load_buscos()
+ for busco_id in set(self.filtered_busco_hits):
+ min_complete = self.hmmer_runner.cutoff_dict[busco_id]["length"] - 2 * \
+ self.hmmer_runner.cutoff_dict[busco_id]["sigma"]
+ gene_matches = np.array([x for x in self.filtered_records if x.startswith(busco_id)])
+ ixl = np.array([(self.gene_details[g]["gene_end"] - self.gene_details[g]["gene_start"])*self.gene_details[g]["identity"] for g in gene_matches])
+ sort_order = np.argsort(ixl)[::-1]
+ gene_matches = gene_matches[sort_order]
+ ixl = ixl[sort_order]
+ if len(gene_matches) > 0:
+ output = self.determine_busco_label(busco_id, gene_matches, ixl, min_complete)
+ for gene_match in gene_matches:
+ gene_id = gene_match#.split("|")[-1]
+ if output == "Single":
+ self.single_copy_buscos[busco_id][gene_id].append(self.gene_details[gene_match])
+ elif output == "Fragmented":
+ self.fragmented_copy_buscos[busco_id][gene_id].append(self.gene_details[gene_match])
+ elif output == "Duplicated":
+ self.multi_copy_buscos[busco_id][gene_id].append(self.gene_details[gene_match])
+ else:
+ self.missing_buscos.append(busco_id)
+ self.hmmer_runner.single_copy_buscos = self.convert_dict(self.single_copy_buscos)
+ self.hmmer_runner.multi_copy_buscos = self.convert_dict(self.multi_copy_buscos)
+ self.hmmer_runner.fragmented_buscos = self.convert_dict(self.fragmented_copy_buscos)
+ self.hmmer_runner.missing_buscos = self.missing_buscos
+
+ @staticmethod
+ def convert_dict(busco_dict):
+ new_dict = {}
+ for key, value in busco_dict.items():
+ new_dict[key] = dict(value)
+ return new_dict
+
+ @staticmethod
+ def get_region_clusters(regions):
+ sorted_regions = sorted(regions, key=lambda x: x[0], reverse=False)
+ clusters = []
+ for (start, stop) in sorted_regions:
+ if not clusters:
+ clusters.append([start, stop])
+ else:
+ last_cluster = clusters[-1]
+ if last_cluster[0] <= start <= last_cluster[1]:
+ # has overlap
+ clusters[-1][0] = min(last_cluster[0], start)
+ clusters[-1][1] = max(last_cluster[1], stop)
+ else:
+ clusters.append([start, stop])
+ return clusters
+
+ def record_results(self):
+ self.hmmer_runner.record_results(frameshifts=True)
+
+ def cleanup(self):
+ # try:
+ # self.metaeuk_runner.remove_tmp_files()
+ # except OSError:
+ # pass
+ super().cleanup()
=====================================
src/busco/busco_tools/augustus.py
=====================================
@@ -259,7 +259,7 @@ class AugustusRunner(BaseRunner):
return ordered_jobs
def generate_job_args(self):
- contig_ordinal_inds = defaultdict(int)
+ contig_nominal_inds = defaultdict(int)
njobs = 0
ordered_jobs = self.sort_jobs()
@@ -272,8 +272,8 @@ class AugustusRunner(BaseRunner):
contig_tmp_file = "{}.temp".format(
contig_name[:100]
) # Avoid very long filenames
- contig_ordinal_inds[busco_group] += 1
- output_index = contig_ordinal_inds[busco_group]
+ contig_nominal_inds[busco_group] += 1
+ output_index = contig_nominal_inds[busco_group]
out_filename = os.path.join(
self.pred_genes_dir, "{}.out.{}".format(busco_group, output_index)
)
=====================================
src/busco/busco_tools/base.py
=====================================
@@ -6,6 +6,7 @@ from abc import ABCMeta, abstractmethod
from busco.BuscoConfig import BuscoConfigAuto
from busco.Exceptions import BuscoError
import time
+import gzip
logger = BuscoLogger.get_logger(__name__)
@@ -157,6 +158,24 @@ class BaseRunner(Tool, metaclass=ABCMeta):
def generate_job_args(self):
pass
+ @staticmethod
+ def decompress_refseq_file(gzip_file): # todo: probably doesn't belong in this class as it is only applicable to metaeuk and miniprot
+ unzipped_filename = gzip_file.split(".gz")[0]
+ if not os.path.exists(unzipped_filename):
+ with gzip.open(gzip_file, "rb") as compressed_file:
+ with open(unzipped_filename, "wb") as decompressed_file:
+ for line in compressed_file:
+ decompressed_file.write(line)
+ if os.path.exists(gzip_file):
+ try:
+ os.remove(gzip_file)
+ except OSError:
+ logger.warning(
+ "Unable to remove compressed refseq file in dataset download"
+ )
+ pass
+ return unzipped_filename
+
@property
@abstractmethod
def output_folder(self):
=====================================
src/busco/busco_tools/hmmer.py
=====================================
@@ -96,6 +96,8 @@ class HMMERRunner(BaseRunner):
self._already_used_genes = None
self.missing_buscos = None
+ self.miniprot_pipeline = False
+
def configure_runner(self, input_sequences, busco_ids, mode, gene_details):
super().configure_runner()
self.run_number += 1
@@ -255,8 +257,11 @@ class HMMERRunner(BaseRunner):
merged_dict = defaultdict(lambda: defaultdict(list))
for hmmer_dict in [self.is_complete, self.is_very_large, self.is_fragment]:
for busco_id, busco_matches in hmmer_dict.items():
- merged_dict[busco_id].update(busco_matches)
- return merged_dict
+ for gene_id, matches in busco_matches.items():
+ merged_dict[busco_id][gene_id].extend(matches)
+ # for busco_id in merged_dict.keys():
+ # merged_dict[busco_id] = dict(merged_dict[busco_id]) # convert from defaultdict to dict
+ return dict(merged_dict)
def parse_hmmer_output(self, filename, busco_query):
"""
@@ -268,7 +273,9 @@ class HMMERRunner(BaseRunner):
:return: Dictionary of (gene_id, total_matched_length) pairs
:rtype: dict
"""
- records = defaultdict(dict)
+ records = defaultdict(list)
+ top_hit = None
+ matched_genes = []
with open(filename, "r") as f:
@@ -283,27 +290,35 @@ class HMMERRunner(BaseRunner):
tlen = int(line[2])
bit_score = float(line[7])
+ if self.miniprot_pipeline and top_hit and top_hit != gene_id: # only load the top result for efficiency
+ if self._check_overlap(matched_genes, gene_id.split("|", maxsplit=1)[-1]):
+ continue
+
# Extract frame information (present in transcriptome mode)
frame = str(line[-1]) if "frame" in str(line[-1]) else None
# Store bitscore matches for each gene match. If match below cutoff, discard.
if bit_score < float(self.cutoff_dict[busco_query]["score"]):
continue
- if gene_id not in records:
- records[gene_id] = {
- "tlen": tlen,
- "hmm_len": 0,
- "env_coords": [],
- "score": bit_score,
- "frame": frame,
- }
+ records[gene_id].append({
+ "tlen": tlen,
+ "hmm_len": 0,
+ "env_coords": [],
+ "score": bit_score,
+ "frame": frame,
+ })
hmm_start = int(line[15])
hmm_end = int(line[16])
env_start = int(line[19])
env_end = int(line[20])
- records[gene_id]["hmm_len"] += hmm_end - hmm_start
- records[gene_id]["env_coords"].append((env_start, env_end))
-
+ records[gene_id][-1]["hmm_len"] += hmm_end - hmm_start
+ records[gene_id][-1]["env_coords"].append((env_start, env_end))
+ if self.miniprot_pipeline:
+ hit_busco_seq, hit_gene = gene_id.split("|", maxsplit=1)
+ if hit_gene not in matched_genes:
+ matched_genes.append(hit_gene)
+ if not top_hit:
+ top_hit = gene_id
except IndexError as e:
logger.error(
"Cannot parse HMMER output file {}".format(filename)
@@ -311,6 +326,22 @@ class HMMERRunner(BaseRunner):
raise BuscoError(e)
return records
+ @staticmethod
+ def _check_overlap(matched_genes, gene2):
+ overlaps = []
+ coords2 = gene2.split(":")[-1]
+ for gene1 in matched_genes:
+ coords1 = gene1.split(":")[-1]
+ start1, end1 = coords1.split("-")
+ start2, end2 = coords2.split("-")
+ if int(end2) - int(start2) > int(end1) - int(start1):
+ start1, end1, start2, end2 = start2, end2, start1, end1
+ if int(start1) <= int(start2) <= int(end1) or int(start1) <= int(end2) <= int(end1):
+ overlaps.append(True)
+ else:
+ overlaps.append(False)
+ return any(overlaps)
+
def _sort_matches(self, matched_record, busco_query):
"""
The HMMER gene matches are sorted into "complete", "v_large" and "fragmented" matches based on a comparison
@@ -332,8 +363,8 @@ class HMMERRunner(BaseRunner):
# Determine whether matched gene represents a complete, very_large or fragment of a BUSCO
for gene_id, record in matched_record.items():
- size = record["hmm_len"]
- frame = record["frame"]
+ size = sum([record[i]["hmm_len"] for i in range(len(record))])
+ frame = record[0]["frame"]
# Kind of like a z-score, but it is compared with a cutoff value, not a mean
zeta = (self.cutoff_dict[busco_query]["length"] - size) / self.cutoff_dict[
@@ -353,7 +384,7 @@ class HMMERRunner(BaseRunner):
# Add information about match to dict
busco_type[gene_id].append(
- dict({"bitscore": record["score"], "length": size, "frame": frame, "orig gene ID": gene_id})
+ dict({"bitscore": record[0]["score"], "length": size, "frame": frame, "orig gene ID": gene_id})
)
# Reference which busco_queries are associated with each gene match
match_type[gene_id].append(busco_query)
@@ -367,7 +398,7 @@ class HMMERRunner(BaseRunner):
matched_genes_fragment,
)
- def process_output(self):
+ def process_output(self, gene_id_lookup=None):
"""
Load all gene matches from HMMER output and sort into dictionaries depending on match quality
(complete, v_large, fragment).
@@ -392,11 +423,22 @@ class HMMERRunner(BaseRunner):
raise ValueError(
"HMMER should not be run more than twice in the same Run instance."
)
-
+ self.gene_id_lookup = gene_id_lookup
with Pool(self.cpus) as job_pool:
hmmer_records = job_pool.map(
self.load_results_from_file, hmmer_results_files
)
+ if self.miniprot_pipeline:
+ self.unpack_hmmer_records_miniprot(hmmer_records)
+ else:
+ self.unpack_hmmer_records_default(hmmer_records)
+
+ def unpack_hmmer_records_miniprot(self, hmmer_records):
+ self.hmmer_records = {}
+ for record in hmmer_records:
+ self.hmmer_records.update(record)
+
+ def unpack_hmmer_records_default(self, hmmer_records):
self.is_complete = defaultdict(
lambda: defaultdict(list), self.is_complete
@@ -459,24 +501,27 @@ class HMMERRunner(BaseRunner):
def load_results_from_file(self, filename):
busco_query = str(os.path.basename(filename).split(".")[0])
matched_record = self.parse_hmmer_output(filename, busco_query)
- filtered_records = self.remove_overlaps(matched_record)
- (
- busco_complete,
- busco_vlarge,
- busco_fragment,
- matched_genes_complete,
- matched_genes_vlarge,
- matched_genes_fragment,
- ) = self._sort_matches(filtered_records, busco_query)
- return (
- busco_query,
- busco_complete,
- busco_vlarge,
- busco_fragment,
- matched_genes_complete,
- matched_genes_vlarge,
- matched_genes_fragment,
- )
+ if self.miniprot_pipeline:
+ return matched_record
+ else:
+ filtered_records = self.remove_overlaps(matched_record)
+ (
+ busco_complete,
+ busco_vlarge,
+ busco_fragment,
+ matched_genes_complete,
+ matched_genes_vlarge,
+ matched_genes_fragment,
+ ) = self._sort_matches(filtered_records, busco_query)
+ return (
+ busco_query,
+ busco_complete,
+ busco_vlarge,
+ busco_fragment,
+ matched_genes_complete,
+ matched_genes_vlarge,
+ matched_genes_fragment,
+ )
def remove_overlaps(self, matched_records):
seq_ids = []
@@ -484,9 +529,16 @@ class HMMERRunner(BaseRunner):
high_coords = []
scores = []
strands = []
+ record_ids = []
try:
for record in matched_records:
- seq_id, coords = record.split(":")
+ record_ids.append(record)
+ if self.gene_id_lookup is not None:
+ gene_id = self.gene_id_lookup[int(record)]
+ else:
+ gene_id = record
+ seq_id, coords = gene_id.split(":")
+ coords = coords.split("_")[0]
start_coord, stop_coord = map(int, coords.split("-"))
low_coord = min(start_coord, stop_coord)
high_coord = max(start_coord, stop_coord)
@@ -498,7 +550,7 @@ class HMMERRunner(BaseRunner):
low_coords.append(low_coord)
high_coords.append(high_coord)
strands.append(strand)
- scores.append(matched_records[record]["score"])
+ scores.append(matched_records[record][0]["score"]) # multiple matches for same record have the same score
except ValueError: # for protein sequences there is no ":<coords>" suffix, so skip the overlap filtering
return matched_records
@@ -509,10 +561,12 @@ class HMMERRunner(BaseRunner):
"High coord": high_coords,
"Score": scores,
"Strand": strands,
+ "Record ID": record_ids,
}
)
results_grouped = records_df.groupby("Sequence")
entries_to_remove = []
+ record_ids_to_remove = set()
seq_ids = results_grouped.groups.keys()
for seq in seq_ids:
match_finder = self.get_matches(results_grouped, seq)
@@ -530,6 +584,7 @@ class HMMERRunner(BaseRunner):
ind_to_remove = idx2
else:
ind_to_remove = idx1
+ record_ids_to_remove.add(g1_sorted.loc[ind_to_remove]["Record ID"])
record_to_remove = g1_sorted.loc[ind_to_remove]
record_start_coord, record_stop_coord = (
record_to_remove["Low coord"],
@@ -547,7 +602,7 @@ class HMMERRunner(BaseRunner):
)
filtered_records = {
- i: matched_records[i] for i in matched_records if i not in entries_to_remove
+ i: matched_records[i] for i in matched_records if i not in record_ids_to_remove
}
return filtered_records
@@ -843,13 +898,16 @@ class HMMERRunner(BaseRunner):
)
)
elif self.mode == "genome":
- scaffold = self.gene_details[gene_id][m]
+ try:
+ scaffold = self.gene_details[gene_id][0]
+ except KeyError:
+ scaffold = match
if self.domain == "eukaryota":
location_pattern = ":{}-{}".format(
scaffold["gene_start"], scaffold["gene_end"]
)
- if gene_id.endswith(location_pattern):
- gene_id = gene_id.replace(location_pattern, "")
+ # if gene_id.endswith(location_pattern):
+ # gene_id = gene_id.replace(location_pattern, "")
else: # Remove suffix assigned by Prodigal
gene_id = gene_id.rsplit("_", 1)[0]
try:
@@ -912,7 +970,7 @@ class HMMERRunner(BaseRunner):
for busco_group in self.cutoff_dict:
if not any(
busco_group in d
- for d in [self.is_complete, self.is_very_large, self.is_fragment]
+ for d in [self.single_copy_buscos, self.multi_copy_buscos, self.fragmented_buscos]
):
output_lines.append("{}\tMissing\n".format(busco_group))
self.missing_buscos.append(busco_group)
@@ -1051,26 +1109,31 @@ class HMMERRunner(BaseRunner):
return sorted_lines
def produce_hmmer_summary(self):
+ frameshift_pattern = "(incl. {} with frameshifts)"
self.hmmer_results_lines.append("***** Results: *****\n\n")
self.hmmer_results_lines.append(self.one_line_summary_raw)
self.hmmer_results_lines.append(
- "{}\tComplete BUSCOs (C)\t\t\t{}\n".format(
- self.single_copy + self.multi_copy, " "
+ "{}\tComplete BUSCOs (C)\t{}\t\t{}\n".format(
+ self.single_copy + self.multi_copy, frameshift_pattern.format(self.c_frameshifts) if self.c_frameshifts > 0 else "",
+ " "
)
)
self.hmmer_results_lines.append(
- "{}\tComplete and single-copy BUSCOs (S)\t{}\n".format(
- self.single_copy, " "
+ "{}\tComplete and single-copy BUSCOs (S)\t{}{}\n".format(
+ self.single_copy, frameshift_pattern.format(self.s_frameshifts) if self.s_frameshifts > 0 else "",
+ " "
)
)
self.hmmer_results_lines.append(
- "{}\tComplete and duplicated BUSCOs (D)\t{}\n".format(
- self.multi_copy, " "
+ "{}\tComplete and duplicated BUSCOs (D)\t{}{}\n".format(
+ self.multi_copy, frameshift_pattern.format(self.d_frameshifts) if self.d_frameshifts > 0 else "",
+ " "
)
)
self.hmmer_results_lines.append(
- "{}\tFragmented BUSCOs (F)\t\t\t{}\n".format(self.only_fragments, " ")
+ "{}\tFragmented BUSCOs (F)\t{}\t\t{}\n".format(self.only_fragments, frameshift_pattern.format(self.f_frameshifts) if self.f_frameshifts > 0 else "",
+ " ")
)
self.hmmer_results_lines.append(
"{}\tMissing BUSCOs (M)\t\t\t{}\n".format(
@@ -1097,7 +1160,7 @@ class HMMERRunner(BaseRunner):
return
- def record_results(self):
+ def record_results(self, frameshifts=False):
self._get_busco_percentages()
self.one_line_summary_raw = "C:{}%[S:{}%,D:{}%],F:{}%,M:{}%,n:{}\t{}\n".format(
self.complete_percent,
@@ -1108,6 +1171,25 @@ class HMMERRunner(BaseRunner):
self.total_buscos,
" ",
)
+ if frameshifts:
+ self.s_frameshifts = 0
+ for x in self.single_copy_buscos.values():
+ for g, details in x.items():
+ self.s_frameshifts += bool(int(details[0]["frameshift_events"])) # just add one for each gene_id containing a frameshift
+ self.d_frameshifts = 0
+ for x in self.multi_copy_buscos.values():
+ for g, details in x.items():
+ self.d_frameshifts += bool(int(details[0]["frameshift_events"]))
+ self.f_frameshifts = 0
+ for x in self.fragmented_buscos.values():
+ for g, details in x.items():
+ self.f_frameshifts += bool(int(details[0]["frameshift_events"]))
+ self.c_frameshifts = self.s_frameshifts + self.d_frameshifts
+ else:
+ self.s_frameshifts = 0
+ self.d_frameshifts = 0
+ self.f_frameshifts = 0
+ self.c_frameshifts = 0
self.one_line_summary = "Results:\t{}".format(self.one_line_summary_raw)
@log("{}", logger, attr_name="hmmer_results_lines", apply="join", on_func_exit=True)
=====================================
src/busco/busco_tools/metaeuk.py
=====================================
@@ -6,7 +6,6 @@ from Bio import SeqIO
import shutil
from configparser import NoOptionError
import subprocess
-import gzip
import pandas as pd
import numpy as np
import re
@@ -252,24 +251,6 @@ class MetaeukRunner(BaseRunner):
self.pred_protein_mod_files.append(self.pred_protein_seqs_modified)
self.codon_mod_files.append(self.codon_file_modified)
- @staticmethod
- def decompress_refseq_file(gzip_file):
- unzipped_filename = gzip_file.split(".gz")[0]
- if not os.path.exists(unzipped_filename):
- with gzip.open(gzip_file, "rb") as compressed_file:
- with open(unzipped_filename, "wb") as decompressed_file:
- for line in compressed_file:
- decompressed_file.write(line)
- if os.path.exists(gzip_file):
- try:
- os.remove(gzip_file)
- except OSError:
- logger.warning(
- "Unable to remove compressed refseq file in dataset download"
- )
- pass
- return unzipped_filename
-
def combine_run_results(self):
with open(self.combined_pred_protein_seqs, "w") as combined_output:
for run_result in self.pred_protein_mod_files:
=====================================
src/busco/busco_tools/miniprot.py
=====================================
@@ -0,0 +1,303 @@
import os
import re
import shutil
import subprocess
from collections import defaultdict
from pathlib import Path

import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from busco.busco_tools.base import BaseRunner
from busco.BuscoLogger import BuscoLogger
+
logger = BuscoLogger.get_logger(__name__)  # module-level logger, named after this module
class MiniprotRunner(BaseRunner):
    """Shared scaffolding for running the miniprot aligner.

    Concrete behaviour (building the genome index vs. aligning proteins)
    lives in the subclasses; this base class sets up the common output
    directory layout and the job-management hooks expected by BaseRunner.
    """

    name = "miniprot"
    cmd = "miniprot"

    def __init__(self):
        super().__init__()
        # Directory layout: <run_folder>/miniprot_output/translated_proteins
        out_dir = os.path.join(self.run_folder, "miniprot_output")
        proteins_dir = os.path.join(out_dir, "translated_proteins")
        self._output_folder = out_dir
        self.translated_proteins_folder = proteins_dir
        self.create_dirs([out_dir, proteins_dir])
        # Genome index produced by the index step and consumed by the align step.
        self.index_file = os.path.join(out_dir, "ref.mpi")
        self.refseq_db = None
        self.incomplete_buscos = None
        self._output_basename = None

    def configure_runner(self, *args):
        """Configure the runner and bump the run counter."""
        super().configure_runner(*args)
        self.run_number += 1

    def check_tool_dependencies(self):
        # No additional tool dependencies to verify for miniprot.
        pass

    def configure_job(self, *args):
        """Overridden by child classes."""
        return

    def generate_job_args(self):
        yield

    def get_version(self):
        """Return the version string reported by ``miniprot --version``."""
        raw = subprocess.check_output(
            [self.cmd, "--version"], stderr=subprocess.STDOUT, shell=False
        )
        return raw.decode("utf-8").strip()

    @property
    def output_folder(self):
        return self._output_folder

    def reset(self):
        super().reset()

    def run(self):
        super().run()
        self.total = 1  # a single miniprot invocation per run
        self.run_jobs()
+
class MiniprotIndexRunner(MiniprotRunner):
    """Runs ``miniprot -d`` to build the genome index used by the align step."""

    name = "miniprot_index"

    def generate_job_args(self):
        yield "index"

    def configure_job(self, *args):
        """Assemble the indexing command: miniprot -t <cpus> -d <index> <genome>."""
        job = self.create_job()
        for param in ("-t", str(self.cpus), "-d", self.index_file, self.input_file):
            job.add_parameter(param)
        return job
+
+
class MiniprotAlignRunner(MiniprotRunner):
    """Aligns the lineage refseq proteins to the genome with miniprot and
    parses the resulting GFF/PAF output into per-gene alignment details.
    """

    name = "miniprot_align"

    def __init__(self):
        super().__init__()
        self.output_gff = None  # path to the GFF output (symlink to the stdout log)

        self.gene_details = defaultdict(dict)  # gene_id -> alignment metadata
        self.sequences_aa = {}  # gene_id -> SeqRecord of the translated protein
        self.busco_matches = defaultdict(set)  # busco_id -> set of matching gene_ids
        self.gene_matches = defaultdict(list)
        self.combined_pred_protein_seqs = os.path.join(
            self._output_folder, "combined_pred_proteins.fas"
        )
        self.output_sequences = []
        self.gene_nominal = 0
        self.gene_lookup = {}
        self.cigar_lookup = {}
        self.nominal_lookup = defaultdict(list)

    def generate_job_args(self):
        yield "align"

    def configure_job(self, *args):
        """Assemble the alignment command.

        ``--trans`` emits translated protein sequences (##STA lines),
        ``--outs 0.95`` keeps only near-best secondary alignments, and
        ``--gff`` interleaves ##PAF summary lines with GFF feature lines.
        """
        miniprot_job = self.create_job()
        miniprot_job.add_parameter("--trans")
        miniprot_job.add_parameter("-u")
        miniprot_job.add_parameter("-I")
        miniprot_job.add_parameter("--outs")
        miniprot_job.add_parameter("0.95")
        miniprot_job.add_parameter("-t")
        miniprot_job.add_parameter(str(self.cpus))
        miniprot_job.add_parameter("--gff")

        miniprot_job.add_parameter(self.index_file)
        miniprot_job.add_parameter(self.refseq_db)

        return miniprot_job

    def configure_runner(self, incomplete_buscos=None):
        """Set up log paths, decompress the lineage refseq database and
        derive the GFF output filename for this input/lineage pair.
        """
        super().configure_runner([])
        self.logfile_path_out = os.path.join(
            self.config.get("busco_run", "main_out"),
            "logs",
            "{}_{}_out.log".format(self.name, os.path.basename(self.lineage_dataset)),
        )
        self.logfile_path_err = (
            self.logfile_path_out.rpartition("_out.log")[0] + "_err.log"
        )

        self.incomplete_buscos = incomplete_buscos
        self._output_basename = os.path.join(
            self._output_folder, os.path.basename(self.input_file)
        )
        # refseq_db.faa.gz ships with the lineage dataset; decompress_refseq_file
        # is presumably provided by BaseRunner -- TODO confirm.
        gzip_refseq = os.path.join(self.lineage_dataset, "refseq_db.faa.gz")
        self.refseq_db = self.decompress_refseq_file(gzip_refseq)
        self.output_gff = Path(self._output_folder).joinpath(
            "{}_{}{}".format(
                Path(self.input_file).stem,
                os.path.basename(self.lineage_dataset),
                ".gff",
            )
        )

    def create_symlink(self):
        """Expose the miniprot stdout log (which holds the GFF output) under
        the expected ``.gff`` filename without copying it."""
        if not self.output_gff.exists():
            Path(self.output_gff).symlink_to(self.logfile_path_out)
        return

    def parse_output(self):
        """Parse the miniprot GFF output into per-gene match records.

        The output interleaves ``##PAF`` summary lines (one per
        protein-to-genome alignment, each immediately followed by a ``##STA``
        line holding the translated protein) with plain GFF feature lines
        (mRNA/CDS) describing the same alignment.
        """
        self.create_symlink()
        # Per-alignment state, kept on the instance as in the original design.
        self.ata_seq = ""
        self.target_id = ""
        self.contig_id = ""
        self.contig_start = 0
        self.contig_end = 0
        self.strand = ""
        self.score = 0
        self.exon_coords = defaultdict(list)
        self.cigar_seq = ""
        paf_block_started = False
        gene_id = ""

        with open(self.output_gff, "r") as gff:
            for line in gff:
                if line.startswith("##PAF"):
                    paf_block_started = True
                    fields = line.strip().split("\t")[1:]
                    if fields[5] == "*":
                        # Unmapped protein: no target contig, skip.
                        continue
                    self.target_id = fields[0]
                    # The BUSCO marker id is the prefix of the protein name.
                    busco_id = self.target_id.split("_")[0]
                    self.protein_length = int(fields[1])
                    self.protein_start = int(fields[2])
                    self.protein_end = int(fields[3])
                    self.strand = fields[4]
                    self.contig_id = fields[5]
                    self.contig_start = int(fields[7])
                    self.contig_end = int(fields[8])
                    gene_id = "{}|{}:{}-{}".format(
                        self.target_id, self.contig_id, self.contig_start, self.contig_end
                    )
                    self.score = int(fields[13].strip().split(":")[2])  # AS:i:<score>
                    self.cigar_seq = str(fields[17].strip().split(":")[2])  # cg:Z:<cigar>
                    part_lengths, exon_lengths, match_lengths, group_types, ngroups, nexons, frameshifts, \
                        frameshift_events, frameshift_lengths = self.decode_cigar(self.cigar_seq)
                    # Assumes the ##STA line directly follows its ##PAF line --
                    # TODO confirm this holds for all miniprot versions.
                    sta_line = gff.readline()
                    sta_seq = sta_line.strip().split("\t")[1]
                    # Drop stop-codon symbols from the translated sequence.
                    # (raw string: "\*" in a plain literal is an invalid escape)
                    self.ata_seq = re.sub(r"\*", "", sta_seq.upper())

                    self.busco_matches[busco_id].add(gene_id)

                    self.gene_details[gene_id] = {"gene_start": self.contig_start, "gene_end": self.contig_end,
                                                  "strand": self.strand, "score": self.score,
                                                  "cigar": self.cigar_seq, "nexons": nexons,
                                                  "frameshift_events": frameshift_events,
                                                  "protein_start": self.protein_start,
                                                  "protein_end": self.protein_end,
                                                  "protein_length": self.protein_length}

                    self.sequences_aa[gene_id] = SeqRecord(Seq(self.ata_seq), id=gene_id, description=gene_id)

                elif paf_block_started:
                    # Plain GFF feature lines belonging to the current alignment.
                    fields = line.strip().split("\t")
                    if fields[2] == "CDS":
                        start, stop, score, strand = fields[3], fields[4], fields[5], fields[6]
                        self.exon_coords[gene_id].append((start, stop, score, strand))
                    if fields[2] == "mRNA":
                        # Attribute column is "key1=val1;key2=val2;..."
                        info_dict = dict(v.split("=") for v in fields[8].split()[0].split(";"))
                        identity = float(info_dict["Identity"])
                        self.gene_details[gene_id].update({"identity": identity})
        # Convert collected exon coordinates to structured arrays (numpy parses
        # the string fields into the declared numeric dtypes).
        for item in self.exon_coords:
            self.exon_coords[item] = np.array(
                self.exon_coords[item],
                dtype=[("start", "i4"), ("stop", "i4"), ("score", "f4"), ("strand", "U1")],
            )
        return

    @staticmethod
    def decode_cigar(cigar):
        """Decode a miniprot protein-to-genome CIGAR string.

        Operators (per the miniprot documentation -- confirm against the
        installed version): M match, I insertion, D deletion, F/G frameshift,
        N intron, U/V intron with a split codon.

        Returns a 9-tuple: (part_lengths, exon_lengths, match_lengths,
        group_types, ngroups, nexons, frameshifts, frameshift_events,
        frameshift_lengths).
        """
        frameshifts = []
        frameshift_events = 0
        frameshift_lengths = 0
        pattern = r"[0-9]+[MIDFGNUV]"
        parts = list(re.finditer(pattern, cigar))
        part_lengths = []
        exon_lengths = []
        exon_length = 0
        match_lengths = {"M": 0, "I": 0, "D": 0, "F": 0, "G": 0, "N": 0, "U": 0, "V": 0}
        group_types = {"M": 0, "I": 0, "D": 0, "F": 0, "G": 0, "N": 0, "U": 0, "V": 0}
        ngroups = 0
        nexons = 0
        for p, part in enumerate(parts):
            ngroups += 1
            # 'op' instead of 'type' to avoid shadowing the builtin.
            n, op = int(part.group(0)[:-1]), part.group(0)[-1]
            match_lengths[op] += n
            group_types[op] += 1
            if op in ["M", "D"]:
                # Matched or deleted residues extend the current exon.
                exon_length += n
            elif op in ["U", "V"]:
                # Intron with split codon: close the exon and record the
                # single split-codon residue as its own part.
                part_lengths.append(exon_length)
                exon_lengths.append(exon_length)
                nexons += 1
                part_lengths.append(1)
                exon_length = 0
            elif op == "N":
                # Plain intron: close the current exon.
                part_lengths.append(exon_length)
                exon_lengths.append(exon_length)
                nexons += 1
                exon_length = 0
            elif op in ["F", "G"]:
                # Frameshift candidate: count it only when flanked by at least
                # 20 matched residues on both sides within the same exon.
                # Left search.
                q = p - 1
                left_match_cnt = 0
                while q >= 0:
                    part2 = parts[q]
                    n2, op2 = int(part2.group(0)[:-1]), part2.group(0)[-1]
                    if op2 == "M":
                        left_match_cnt += n2
                    elif op2 in ["N", "U", "V"]:
                        break
                    q -= 1
                # Right search.
                q = p + 1
                right_match_cnt = 0
                while q < len(parts):
                    part2 = parts[q]
                    n2, op2 = int(part2.group(0)[:-1]), part2.group(0)[-1]
                    if op2 == "M":
                        right_match_cnt += n2
                    elif op2 in ["N", "U", "V"]:
                        break
                    q += 1
                if left_match_cnt >= 20 and right_match_cnt >= 20:
                    frameshifts.append(str(n) + op)
                    frameshift_events += 1
                    frameshift_lengths += int(n)

        # Close the final exon. NOTE(review): this runs unconditionally, so a
        # CIGAR ending on an intron operator yields a trailing zero-length
        # exon -- confirm this is intended before changing it.
        part_lengths.append(exon_length)
        exon_lengths.append(exon_length)
        nexons += 1
        return part_lengths, exon_lengths, match_lengths, group_types, ngroups, nexons, frameshifts, frameshift_events, frameshift_lengths

    def write_protein_sequences_per_busco(self):
        """Write one FASTA per BUSCO id holding all matching translated proteins."""
        for busco_id in self.busco_matches:
            output_filename = os.path.join(
                self.translated_proteins_folder, "{}.faa".format(busco_id)
            )
            self.output_sequences.append(output_filename)
            seqs_to_write = [
                self.sequences_aa[g]
                for g in self.busco_matches[busco_id]
                if g in self.sequences_aa
            ]
            with open(output_filename, "w") as f:
                SeqIO.write(seqs_to_write, f, "fasta")
=====================================
src/busco/run_BUSCO.py
=====================================
@@ -329,6 +329,14 @@ def _parse_args():
"single string with no white space, with each argument separated by a comma.",
)
+ optional.add_argument(
+ "--miniprot",
+ dest="use_miniprot",
+ action="store_true",
+ required=False,
+ help="Use miniprot gene predictor for eukaryote runs",
+ )
+
optional.add_argument(
"--offline",
dest="offline",
=====================================
tests/unittests/BuscoConfig_unittests.py
=====================================
@@ -35,6 +35,7 @@ class TestBuscoConfig(unittest.TestCase):
"metaeuk_parameters": None,
"metaeuk_rerun_parameters": None,
"use_augustus": False,
+ "use_miniprot": False,
"augustus_parameters": None,
"augustus_species": None,
"long": False,
@@ -80,6 +81,7 @@ class TestBuscoConfig(unittest.TestCase):
"evalue",
"limit",
"use_augustus",
+ "use_miniprot",
"batch_mode",
"tar",
"contig_break",
@@ -181,6 +183,7 @@ class TestBuscoConfig(unittest.TestCase):
"restart": False,
"update-data": False,
"use_augustus": False,
+ "use_miniprot": False,
}
config = BuscoConfig.BuscoConfigMain(
self.base_config, {"lineage_dataset": "test"}
=====================================
tests/unittests/run_BUSCO_unittests.py
=====================================
@@ -108,6 +108,7 @@ class TestParams(unittest.TestCase):
"metaeuk_parameters": None,
"metaeuk_rerun_parameters": None,
"use_augustus": False,
+ "use_miniprot": False,
"augustus_parameters": None,
"augustus_species": None,
"long": False,
@@ -169,6 +170,7 @@ class TestParams(unittest.TestCase):
"metaeuk_parameters": None,
"metaeuk_rerun_parameters": None,
"use_augustus": False,
+ "use_miniprot": False,
"augustus_parameters": None,
"augustus_species": None,
"long": False,
@@ -259,6 +261,7 @@ class TestParams(unittest.TestCase):
"force": True,
"restart": True,
"use_augustus": True,
+ "use_miniprot": False,
"help": "==SUPPRESS==",
"in": input_file,
"limit": limit,
View it on GitLab: https://salsa.debian.org/med-team/busco/-/commit/25d070d5cb838817f874882bf9f6770926cdbdb0
--
View it on GitLab: https://salsa.debian.org/med-team/busco/-/commit/25d070d5cb838817f874882bf9f6770926cdbdb0
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230911/bc041e00/attachment-0001.htm>
More information about the debian-med-commit
mailing list