[med-svn] [Git][med-team/busco][upstream] New upstream version 5.4.4

Nilesh Patra (@nilesh) gitlab at salsa.debian.org
Sat Dec 31 11:21:28 GMT 2022



Nilesh Patra pushed to branch upstream at Debian Med / busco


Commits:
6e290d93 by Nilesh Patra at 2022-12-31T16:39:46+05:30
New upstream version 5.4.4
- - - - -


10 changed files:

- CHANGELOG
- src/busco/BuscoRunner.py
- src/busco/_version.py
- src/busco/analysis/GenomeAnalysis.py
- src/busco/busco_tools/blast.py
- src/busco/busco_tools/hmmer.py
- src/busco/busco_tools/metaeuk.py
- src/busco/busco_tools/prodigal.py
- test_data/bacteria/expected_log.txt
- test_data/eukaryota/expected_log.txt


Changes:

=====================================
CHANGELOG
=====================================
@@ -1,3 +1,9 @@
+5.4.4
+- Fix bug in tar option (Issue #591)
+- Fix edge case bug in overlap handling (Issue #592)
+- Fix overlap adjustment algorithm and trim reported sequences
+- Fix file open mode (Issue #622)
+
 5.4.3
 - Fix bug in augustus --long pipeline (Issue #586)
 


=====================================
src/busco/BuscoRunner.py
=====================================
@@ -31,7 +31,7 @@ logger = BuscoLogger.get_logger(__name__)
 
 class SingleRunner:
 
-    all_runners = []
+    all_runners = set()
     summary = {
         "parameters": {},
         "lineage_dataset": {},
@@ -81,7 +81,7 @@ class SingleRunner:
             runner = asl.selected_runner
             parent_domain = runner.config.get("busco_run", "domain_run_name")
         finally:
-            type(self).all_runners.extend(asl.runners)
+            type(self).all_runners.update(asl.runners)
             asl.reset()
         return lineage_dataset, runner, parent_domain
 
@@ -108,7 +108,7 @@ class SingleRunner:
         for runner in type(self).all_runners:
             runner.reset()
             runner.analysis.reset()
-        type(self).all_runners = []
+        type(self).all_runners = set()
 
     @log("Input file is {}", logger, attr_name="input_file")
     def run(self):
@@ -146,7 +146,7 @@ class SingleRunner:
 
                 self.runner = AnalysisRunner(self.config)
 
-            type(self).all_runners.append(self.runner)
+            type(self).all_runners.add(self.runner)
 
             if os.path.exists(lineage_results_folder):
                 new_dest = os.path.join(
@@ -165,7 +165,7 @@ class SingleRunner:
 
         except BuscoError:
             if self.runner is not None:
-                type(self).all_runners.append(self.runner)
+                type(self).all_runners.add(self.runner)
             raise
 
         except ToolException as e:


=====================================
src/busco/_version.py
=====================================
@@ -6,4 +6,4 @@ Copyright (c) 2016-2022, Evgeny Zdobnov (ez at ezlab.org)
 Licensed under the MIT license. See LICENSE.md file.
 
 """
-__version__ = "5.4.3"
+__version__ = "5.4.4"


=====================================
src/busco/analysis/GenomeAnalysis.py
=====================================
@@ -454,26 +454,27 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
         # dictionary
 
     def validate_output(self):
-        if len(self.metaeuk_runner.headers_files) < 2:
-            return
+        """Need to run this for both initial and rerun because it is possible that metaeuk matches overlap"""
+
         hmmer_results = self.hmmer_runner.merge_dicts()
 
         if len(hmmer_results) > 0:
-            exon_records = self.get_exon_records(hmmer_results)
+            exon_records = self.get_exon_records(
+                hmmer_results, self.hmmer_runner.run_number
+            )
             df = self.exons_to_df(exon_records)
             overlaps = self.find_overlaps(df)
-            if overlaps:
-                inds_to_remove = self.handle_overlaps(overlaps, df)
-                inds_to_remove = list(set(inds_to_remove))
-                df.drop(inds_to_remove, inplace=True)
+            if len(overlaps) > 0:
+                discarded_exon_lengths = self.handle_overlaps(overlaps, df)
+                # df.drop(inds_to_remove, inplace=True)
                 complete, matched_genes_complete = self.reconstruct_hmmer_results(
-                    df, self.hmmer_runner.is_complete
+                    df, discarded_exon_lengths, self.hmmer_runner.is_complete
                 )
                 v_large, matched_genes_v_large = self.reconstruct_hmmer_results(
-                    df, self.hmmer_runner.is_very_large
+                    df, discarded_exon_lengths, self.hmmer_runner.is_very_large
                 )
                 fragmented, matched_genes_fragmented = self.reconstruct_hmmer_results(
-                    df, self.hmmer_runner.is_fragment
+                    df, discarded_exon_lengths, self.hmmer_runner.is_fragment
                 )
 
                 # Update hmmer runner with new dictionaries
@@ -487,16 +488,17 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
         return
 
     def get_exon_records(
-        self, busco_dict
+        self, busco_dict, run_number
     ):  # Placed in the GenomeAnalysis module because it draws on both hmmer_runner and metaeuk_runner methods
 
         initial_run_results = self.metaeuk_runner.headers_files[0]
-        rerun_results = self.metaeuk_runner.headers_files[1]
+        if run_number == 2:
+            rerun_results = self.metaeuk_runner.headers_files[1]
 
         exon_records = []
         for busco_id, gene_match in busco_dict.items():
             for gene_id, details in gene_match.items():
-                sequence, coords = gene_id.rsplit(":", 1)
+                sequence, coords = details[0]["orig gene ID"].rsplit(":", 1)
                 gene_start, gene_end = coords.split("-")
                 strand = self.gene_details[gene_id][0]["strand"]
                 score = details[0]["bitscore"]
@@ -504,10 +506,11 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
                 # Need to determine run using HMMER results instead of metaeuk results. This is because exons can be
                 # matched identically to different BUSCO IDs, both on the same and on different runs. The presence of a
                 # match in the metaeuk rerun results does not indicate that the HMMER match in question is associated
-                # .with that metaeuk match
+                # with that metaeuk match
                 run_found = (
                     "2"
-                    if os.path.exists(
+                    if run_number == 2
+                    and os.path.exists(
                         os.path.join(
                             self.hmmer_runner.rerun_results_dir,
                             "{}.out".format(busco_id),
@@ -556,7 +559,13 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
                     low_coords, high_coords = self.metaeuk_runner.extract_exon_coords(
                         good_match
                     )
+                    if low_coords[0] > high_coords[0]:  # for negative strand exons the order is reversed
+                        low_coords, high_coords = high_coords, low_coords
+                    trimmed_low = int(gene_id.split(":")[-1].split("-")[0])
+                    trimmed_high = int(gene_id.split(":")[-1].split("-")[1])
                     for i, entry in enumerate(low_coords):
+                        if int(entry) < trimmed_low or int(entry) > trimmed_high:  # don't include exons that were previously removed due to overlaps
+                            continue
                         record = (
                             busco_id,
                             sequence,
@@ -570,8 +579,9 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
                         exon_records.append(record)
         return exon_records
 
-    def reconstruct_hmmer_results(self, df, hmmer_result_dict):
+    def reconstruct_hmmer_results(self, df, discarded_exon_lengths, hmmer_result_dict):
         busco_groups = df.groupby(["Busco id"])
+        inds_to_remove = set(discarded_exon_lengths.keys())
         hmmer_result_dict_new = defaultdict(dict)
         matched_genes_new = defaultdict(list)
         for busco_id, matches in hmmer_result_dict.items():
@@ -583,38 +593,81 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
             for gene_match, busco_gene_group in busco_gene_groups:
                 if gene_match not in matches:
                     continue
-                min_coord = None
-                for idx, row in busco_gene_group.iterrows():
-                    low_coord = row["Start"]
-                    high_coord = row["Stop"]
-                    seq = row["Sequence"]
-                    if min_coord:
-                        min_coord = min(min_coord, low_coord)
-                        max_coord = max(max_coord, high_coord)
-                    else:
-                        min_coord = low_coord
-                        max_coord = high_coord
+                new_gene_start = gene_match.split(":")[-1].split("-")[
+                    0
+                ]  # these two lines are not really used - they just initialize values that will be changed
+                new_gene_stop = gene_match.split(":")[-1].split("-")[1]
+                start_trim = 0
+                end_trim = 0
+                group_indices = set(busco_gene_group.index)
+                intersection = group_indices.intersection(inds_to_remove)
+                if len(intersection) > 0:
+                    if intersection == group_indices:
+                        continue  # remove entire gene - don't add to new dict
+                    ordered_exons = busco_gene_group.sort_values(by="Start").reset_index()
+                    new_indices = ordered_exons.index
+                    seq = ordered_exons.loc[0]["Sequence"]
+
+                    for idx in new_indices:
+                        old_index = ordered_exons.loc[idx]["index"]
+                        if old_index in intersection:
+                            start_trim += discarded_exon_lengths[old_index]
+                        else:
+                            new_gene_start = df.loc[old_index]["Start"]
+                            break
+                    for idx in new_indices[::-1]:
+                        old_index = ordered_exons.loc[idx]["index"]
+                        if old_index in intersection:
+                            end_trim += discarded_exon_lengths[old_index]
+                        else:
+                            new_gene_stop = df.loc[old_index]["Stop"]
+                            break
+                    new_gene_match = "{}:{}-{}".format(
+                        seq, new_gene_start, new_gene_stop
+                    )
+                else:
+                    new_gene_match = gene_match
 
                 details = matches[gene_match]
                 df_strand = busco_gene_group["Strand"].iloc[0]
-                new_gene_match = "{}:{}-{}".format(seq, min_coord, max_coord)
                 hmmer_result_dict_new[busco_id].update({new_gene_match: details})
                 matched_genes_new[new_gene_match].append(busco_id)
                 self.gene_details[new_gene_match] = [
                     {
-                        "gene_start": min_coord,
-                        "gene_end": max_coord,
+                        "gene_start": new_gene_start,
+                        "gene_end": new_gene_stop,
                         "strand": df_strand,
                     }
                 ]
-                self.sequences_aa[new_gene_match] = self.metaeuk_runner.sequences_aa[
-                    gene_match
-                ]
-                self.sequences_nt[new_gene_match] = self.metaeuk_runner.sequences_nt[
-                    gene_match
-                ]
+                if new_gene_match != gene_match:
+                    trimmed_sequence_aa, trimmed_sequence_nt = self.trim_sequence(
+                        gene_match, start_trim, end_trim
+                    )
+                else:
+                    try:
+                        trimmed_sequence_aa = self.metaeuk_runner.sequences_aa[gene_match]
+                        trimmed_sequence_nt = self.metaeuk_runner.sequences_nt[gene_match]
+                    except KeyError:  # happens on the second run if the first run trimmed the sequence already
+                        trimmed_sequence_aa = self.sequences_aa[gene_match]
+                        trimmed_sequence_nt = self.sequences_nt[gene_match]
+                self.sequences_aa[new_gene_match] = trimmed_sequence_aa
+                self.sequences_nt[new_gene_match] = trimmed_sequence_nt
         return hmmer_result_dict_new, matched_genes_new
 
+    def trim_sequence(self, old_gene_match, start_trim, end_trim):
+        old_sequence_aa = self.metaeuk_runner.sequences_aa[old_gene_match]
+        old_sequence_nt = self.metaeuk_runner.sequences_nt[old_gene_match]
+
+        new_sequence_nt = old_sequence_nt[start_trim : len(old_sequence_nt) - end_trim]
+        if start_trim % 3 != 0 and end_trim % 3 != 0:
+            raise BuscoError(
+                "Problem reconstructing amino acid sequence after extracting exons"
+            )
+        new_sequence_aa = old_sequence_aa[
+            int(start_trim / 3) : len(old_sequence_aa) - int(end_trim / 3)
+        ]
+        return new_sequence_aa, new_sequence_nt
+
     def exons_to_df(self, records):
         if self._mode == "genome":
             logger.info("Validating exons and removing overlapping matches")
@@ -624,41 +677,53 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
         df["Stop"] = df["Stop"].astype(int)
         df["Score"] = df["Score"].astype(float)
         df["Run found"] = df["Run found"].astype(int)
-        df.loc[df["Strand"] == "-", ["Start", "Stop"]] = df.loc[
-            df["Strand"] == "-", ["Stop", "Start"]
-        ].values  # reverse coordinates on negative strand
         return df
 
     def find_overlaps(self, df):
-        overlaps = self.metaeuk_runner.test_for_overlaps(df)
-        busco_overlaps = []
-        for overlap in overlaps:
-            match1 = df.loc[overlap[0]]
-            match2 = df.loc[overlap[1]]
-            if (match1["Busco id"] != match2["Busco id"]) and (
-                match1["Start"] % 3 == match2["Start"] % 3
-            ):
-                # check the overlaps are for two different BUSCOs and check overlaps are in the same reading frame
-                busco_overlaps.append(overlap)
-        return busco_overlaps
+        overlaps = self.metaeuk_runner.test_for_overlaps(df, sort=True)
+        logger.info("{} candidate overlapping regions found".format(len(overlaps)))
+        logger.info("{} exons in total".format(len(df)))
+        return overlaps
 
     def handle_overlaps(self, overlaps, df):
-        indices_to_remove = []
-        for overlap_inds in overlaps:
-            bad_inds = self.handle_diff_busco_overlap(overlap_inds, df)
-            indices_to_remove.extend(bad_inds)
-        return indices_to_remove
+        discarded_exon_lengths = {}
+        # Consider three gene matches, A, B and C, with decreasing scores respectively.
+        # If A overlaps B and B overlaps C but A does not overlap C then the order they are treated affects the outcome.
+        for n, overlap_inds in enumerate(overlaps):
+            if (
+                overlap_inds[0] in discarded_exon_lengths
+                or overlap_inds[1] in discarded_exon_lengths
+            ):
+                continue
+            else:
+                # logger.info("Overlap {}:".format(n))
+                bad_inds = self.handle_diff_busco_overlap(overlap_inds, df)
+                for idx in bad_inds:
+                    discarded_exon_lengths[idx] = (
+                        abs(df.iloc[idx]["Stop"] - df.iloc[idx]["Start"]) + 1
+                    )
+        return discarded_exon_lengths
 
     def handle_diff_busco_overlap(self, overlap_inds, df):
         match1 = df.loc[overlap_inds[0]]
         match2 = df.loc[overlap_inds[1]]
         seq = match1["Sequence"]
         busco_match1 = match1["Busco id"]
+        gene_match1 = match1["Orig gene ID"]
         run_match1 = match1["Run found"]
         busco_match2 = match2["Busco id"]
+        gene_match2 = match2["Orig gene ID"]
         run_match2 = match2["Run found"]
-        exons1 = df.loc[(df["Busco id"] == busco_match1) & (df["Sequence"] == seq)]
-        exons2 = df.loc[(df["Busco id"] == busco_match2) & (df["Sequence"] == seq)]
+        exons1 = df.loc[
+            (df["Busco id"] == busco_match1)
+            & (df["Sequence"] == seq)
+            & (df["Orig gene ID"] == gene_match1)
+        ]
+        exons2 = df.loc[
+            (df["Busco id"] == busco_match2)
+            & (df["Sequence"] == seq)
+            & (df["Orig gene ID"] == gene_match2)
+        ]
         hmmer_run_folder1 = (
             self.hmmer_runner.initial_results_dir
             if run_match1 == 1
@@ -730,7 +795,8 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
         # Check if secondary match uses priority match exons
         used_exons = pd.concat([priority_used_exons, secondary_used_exons])
         overlaps = self.metaeuk_runner.test_for_overlaps(used_exons)
-        if overlaps:
+
+        if len(overlaps) > 0:
             # Remove secondary match
             indices_to_remove = secondary_exons.index
             return indices_to_remove
@@ -761,19 +827,28 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
             return indices_to_remove
 
         overlaps = self.metaeuk_runner.test_for_overlaps(exons_to_check)
-        if overlaps:
+        if len(overlaps) > 0:
             for overlap in overlaps:
+                if overlap[0] in indices_to_remove or overlap[1] in indices_to_remove:
+                    continue
                 match1 = exons_to_check.loc[overlap[0]]
-                index_to_remove = (
-                    overlap[0]
-                    if secondary_exons.iloc[0]["Busco id"] == match1["Busco id"]
-                    else overlap[1]
-                )
-                indices_to_remove.append(index_to_remove)
+                if secondary_exons is not None:
+                    index_to_remove = (
+                        overlap[0]
+                        if secondary_exons.iloc[0]["Busco id"] == match1["Busco id"]
+                        else overlap[1]
+                    )  # It is possible that secondary_exons is None so need to check before using iloc
+                else:
+                    match2 = exons_to_check.loc[overlap[1]]
+                    index_to_remove = (
+                        overlap[0] if match1["Score"] < match2["Score"] else overlap[1]
+                    )
+
+                exons_to_remove = secondary_exons if index_to_remove in secondary_exons.index else priority_exons
+                indices_to_remove.extend(list(exons_to_remove.index))
         return indices_to_remove
 
-    @staticmethod
-    def find_unused_exons(env_coords, exons):
+    def find_unused_exons(self, env_coords, exons):
         remaining_hmm_region = 0
         unused_exons = []
         used_exons = []
@@ -815,6 +890,37 @@ class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes):
                 used_exons.append(entry)
             else:
                 unused_exons.append(entry)
+        used_exons, unused_exons = self.adjust_exon_categories(used_exons, unused_exons)
+        return used_exons, unused_exons
+
+    @staticmethod
+    def adjust_exon_categories(used_exons, unused_exons):
+        """
+        Ensure there are no unused exons sandwiched between used exons
+        :param used_exons:
+        :param unused_exons:
+        :return:
+        """
+
+        used_exons_start = [x["Start"] for x in used_exons]
+        used_exons_end = [x["Stop"] for x in used_exons]
+        start = min(used_exons_start)
+        stop = max(used_exons_end)
+        exons_to_remove = set()
+        unused_indices = [exon["index"] for exon in unused_exons]
+        for exon in unused_exons:
+            if not exon["index"] in exons_to_remove and (
+                (exon["Start"] >= start and exon["Start"] < stop)
+                or (exon["Stop"] > start and exon["Stop"] < stop)
+            ):
+                # find exons that either start or stop within the "used" range
+                used_exons.append(exon)
+                exons_to_remove.add(exon["index"])
+        for idx in list(exons_to_remove):
+            idx2 = unused_indices.index(idx)
+            unused_exons.pop(idx2)
+            unused_indices.pop(idx2)
+
         return used_exons, unused_exons
 
     def cleanup(self):


=====================================
src/busco/busco_tools/blast.py
=====================================
@@ -240,7 +240,7 @@ class TBLASTNRunner(BaseRunner):
 
         matched_seqs = []
         busco_ids_retrieved = set()
-        with open(self.ancestral_file, "rU") as anc_file:
+        with open(self.ancestral_file, "r") as anc_file:
 
             for record in SeqIO.parse(anc_file, "fasta"):
                 if any(record.id.startswith(b) for b in self.incomplete_buscos):
@@ -539,7 +539,7 @@ class TBLASTNRunner(BaseRunner):
                 contig_names.append(contig)
 
         # Write sequences that match contig ids
-        with open(self.input_file, "rU") as f:
+        with open(self.input_file, "r") as f:
             for record in SeqIO.parse(f, "fasta"):
                 if record.id in list(set(contig_names)):
                     with open(


=====================================
src/busco/busco_tools/hmmer.py
=====================================
@@ -11,6 +11,7 @@ from busco.BuscoConfig import BuscoConfigAuto
 from busco.Exceptions import BatchFatalError, BuscoError
 import pandas as pd
 import busco
+from multiprocessing import Pool
 
 logger = BuscoLogger.get_logger(__name__)
 
@@ -235,27 +236,6 @@ class HMMERRunner(BaseRunner):
             )
         return
 
-    def process_output(self):
-        self.is_complete = defaultdict(
-            lambda: defaultdict(list), self.is_complete
-        )  # dict of a dict of lists of dicts
-        self.is_fragment = defaultdict(lambda: defaultdict(list), self.is_fragment)
-        self.is_very_large = defaultdict(lambda: defaultdict(list), self.is_very_large)
-        self.matched_genes_complete = defaultdict(list, self.matched_genes_complete)
-        self.matched_genes_vlarge = defaultdict(list, self.matched_genes_vlarge)
-        self.matched_genes_fragment = defaultdict(list, self.matched_genes_fragment)
-
-        self._load_matched_genes()
-
-        self.is_complete = dict(self.is_complete)
-        self.is_fragment = dict(self.is_fragment)
-        self.is_very_large = dict(self.is_very_large)
-        self.matched_genes_complete = dict(self.matched_genes_complete)
-        self.matched_genes_vlarge = dict(self.matched_genes_vlarge)
-        self.matched_genes_fragment = dict(self.matched_genes_fragment)
-
-        return
-
     @staticmethod
     def _get_matched_lengths(nested_dict):
         """
@@ -346,6 +326,9 @@ class HMMERRunner(BaseRunner):
         busco_complete = defaultdict(list)
         busco_vlarge = defaultdict(list)
         busco_fragment = defaultdict(list)
+        matched_genes_complete = defaultdict(list)
+        matched_genes_vlarge = defaultdict(list)
+        matched_genes_fragment = defaultdict(list)
 
         # Determine whether matched gene represents a complete, very_large or fragment of a BUSCO
         for gene_id, record in matched_record.items():
@@ -360,24 +343,31 @@ class HMMERRunner(BaseRunner):
             # gene match can only be either complete, v_large or fragment
             if -2 <= zeta <= 2:
                 busco_type = busco_complete
-                match_type = self.matched_genes_complete
+                match_type = matched_genes_complete
             elif zeta < -2:
                 busco_type = busco_vlarge
-                match_type = self.matched_genes_vlarge
+                match_type = matched_genes_vlarge
             else:
                 busco_type = busco_fragment
-                match_type = self.matched_genes_fragment
+                match_type = matched_genes_fragment
 
             # Add information about match to dict
             busco_type[gene_id].append(
-                dict({"bitscore": record["score"], "length": size, "frame": frame})
+                dict({"bitscore": record["score"], "length": size, "frame": frame, "orig gene ID": gene_id})
             )
             # Reference which busco_queries are associated with each gene match
             match_type[gene_id].append(busco_query)
 
-        return busco_complete, busco_vlarge, busco_fragment
+        return (
+            busco_complete,
+            busco_vlarge,
+            busco_fragment,
+            matched_genes_complete,
+            matched_genes_vlarge,
+            matched_genes_fragment,
+        )
 
-    def _load_matched_genes(self):
+    def process_output(self):
         """
         Load all gene matches from HMMER output and sort into dictionaries depending on match quality
         (complete, v_large, fragment).
@@ -403,14 +393,31 @@ class HMMERRunner(BaseRunner):
                 "HMMER should not be run more than twice in the same Run instance."
             )
 
-        for filename in hmmer_results_files:
-            busco_query = str(os.path.basename(filename).split(".")[0])
-            matched_record = self.parse_hmmer_output(filename, busco_query)
-            filtered_records = self.remove_overlaps(matched_record)
-            busco_complete, busco_vlarge, busco_fragment = self._sort_matches(
-                filtered_records, busco_query
+        with Pool(self.cpus) as job_pool:
+            hmmer_records = job_pool.map(
+                self.load_results_from_file, hmmer_results_files
             )
 
+        self.is_complete = defaultdict(
+            lambda: defaultdict(list), self.is_complete
+        )  # dict of a dict of lists of dicts
+        self.is_fragment = defaultdict(lambda: defaultdict(list), self.is_fragment)
+        self.is_very_large = defaultdict(lambda: defaultdict(list), self.is_very_large)
+        self.matched_genes_complete = defaultdict(list, self.matched_genes_complete)
+        self.matched_genes_vlarge = defaultdict(list, self.matched_genes_vlarge)
+        self.matched_genes_fragment = defaultdict(list, self.matched_genes_fragment)
+
+        for records in hmmer_records:
+            (
+                busco_query,
+                busco_complete,
+                busco_vlarge,
+                busco_fragment,
+                matched_genes_complete,
+                matched_genes_vlarge,
+                matched_genes_fragment,
+            ) = records
+
             # Add all information for this busco_id to the full dictionary
             if len(busco_complete) > 0:
                 self.is_complete[busco_query].update(busco_complete)
@@ -419,8 +426,58 @@ class HMMERRunner(BaseRunner):
             if len(busco_fragment) > 0:
                 self.is_fragment[busco_query].update(busco_fragment)
 
+            for i in range(3):
+                matched_genes_dict_small = [
+                    matched_genes_complete,
+                    matched_genes_vlarge,
+                    matched_genes_fragment,
+                ][i]
+                matched_genes_dict_large = [
+                    self.matched_genes_complete,
+                    self.matched_genes_vlarge,
+                    self.matched_genes_fragment,
+                ][i]
+                for gene_id in matched_genes_dict_small:
+                    if gene_id in matched_genes_dict_large:
+                        matched_genes_dict_large[gene_id].extend(
+                            matched_genes_dict_small[gene_id]
+                        )
+                    else:
+                        matched_genes_dict_large[gene_id] = matched_genes_dict_small[
+                            gene_id
+                        ]
+
+        self.is_complete = dict(self.is_complete)
+        self.is_fragment = dict(self.is_fragment)
+        self.is_very_large = dict(self.is_very_large)
+        self.matched_genes_complete = dict(self.matched_genes_complete)
+        self.matched_genes_vlarge = dict(self.matched_genes_vlarge)
+        self.matched_genes_fragment = dict(self.matched_genes_fragment)
+
         return
 
+    def load_results_from_file(self, filename):
+        busco_query = str(os.path.basename(filename).split(".")[0])
+        matched_record = self.parse_hmmer_output(filename, busco_query)
+        filtered_records = self.remove_overlaps(matched_record)
+        (
+            busco_complete,
+            busco_vlarge,
+            busco_fragment,
+            matched_genes_complete,
+            matched_genes_vlarge,
+            matched_genes_fragment,
+        ) = self._sort_matches(filtered_records, busco_query)
+        return (
+            busco_query,
+            busco_complete,
+            busco_vlarge,
+            busco_fragment,
+            matched_genes_complete,
+            matched_genes_vlarge,
+            matched_genes_fragment,
+        )
+
     @staticmethod
     def remove_overlaps(matched_records):
         seq_ids = []
@@ -1037,34 +1094,6 @@ class HMMERRunner(BaseRunner):
             "   ",
         )
         self.one_line_summary = "Results:\t{}".format(self.one_line_summary_raw)
-        # type(self).summary["results"][
-        #     "one_line_summary"
-        # ] = self.one_line_summary_raw.strip()
-        # type(self).summary["results"]["C"] = self.single_copy + self.multi_copy
-        # type(self).summary["results"]["S"] = self.single_copy
-        # type(self).summary["results"]["D"] = self.multi_copy
-        # type(self).summary["results"]["F"] = self.only_fragments
-        # type(self).summary["results"]["M"] = (
-        #     self.total_buscos - self.single_copy - self.multi_copy - self.only_fragments
-        # )
-        # type(self).summary["parameters"]["mode"] = self.mode
-        #
-        # if self.mode == "genome":
-        #     if self.config.get("busco_run", "domain") in ["prokaryota", "viruses"]:
-        #         gene_predictor = "prodigal"
-        #     elif self.config.getboolean("busco_run", "use_augustus"):
-        #         gene_predictor = "augustus"
-        #     else:
-        #         gene_predictor = "metaeuk"
-        #     type(self).summary["parameters"]["gene_predictor"] = gene_predictor
-        # type(self).summary["lineage_dataset"]["name"] = self.lineage_dataset
-        # type(self).summary["lineage_dataset"][
-        #     "creation_date"
-        # ] = self.dataset_creation_date
-        # type(self).summary["lineage_dataset"][
-        #     "number_species"
-        # ] = self.dataset_nb_species
-        # type(self).summary["lineage_dataset"]["total_buscos"] = self.dataset_nb_buscos
 
     @log("{}", logger, attr_name="hmmer_results_lines", apply="join", on_func_exit=True)
     def _produce_full_hmmer_summary(self):


=====================================
src/busco/busco_tools/metaeuk.py
=====================================
@@ -11,6 +11,8 @@ import pandas as pd
 import numpy as np
 import re
 from busco.Exceptions import BuscoError
+from multiprocessing import Pool
+from itertools import repeat, chain
 
 logger = BuscoLogger.get_logger(__name__)
 
@@ -301,29 +303,70 @@ class MetaeukRunner(BaseRunner):
         return results
 
     @staticmethod
-    def detect_overlap(match1_details, match2_details):
-        return (
-            match1_details["Strand"] == match2_details["Strand"]
-        )  # check overlaps are on the same strand
+    def detect_overlap(results_grouped, seq):
+        overlap_inds = []
+        handled_inds = set()
+        g1 = results_grouped.get_group(seq)
+        g1_sorted = g1.sort_values(
+            "Start"
+        )  # sort to facilitate a single-pass coordinate check
+        for idx1, row1 in g1_sorted.iterrows():
+            start_val = g1_sorted.loc[idx1]["Start"]
+            stop_val = g1_sorted.loc[idx1]["Stop"]
+            matches = g1_sorted[g1_sorted["Start"] >= start_val].loc[
+                g1_sorted["Start"] < stop_val
+            ]  # find entries with a start coordinate between the current exon start and end
+            for idx2 in matches.index.values:
+                if idx2 in handled_inds:
+                    continue
+                match1_details = g1_sorted.loc[idx1]
+                match2_details = g1_sorted.loc[idx2]
+                if idx2 == idx1:  # don't consider overlaps with self
+                    continue
+                elif (
+                    (
+                        (
+                            match1_details["Orig gene ID"] is None  # needed for header-editing step
+                            or match1_details["Orig gene ID"]
+                            != match2_details["Orig gene ID"]
+                        )  # for efficiency skip overlaps that are
+                        # actually the same gene matching multiple BUSCOs, as this will be dealt with in the
+                        # filtering step later
+                    )
+                    and (
+                        match1_details["Strand"]
+                        == match2_details[
+                            "Strand"
+                        ]  # check overlaps are on the same strand
+                    )
+                    and (
+                        match1_details["Start"] % 3
+                        == match2_details["Start"]
+                        % 3  # check overlaps are in the same reading frame
+                    )
+                ):
+                    overlap_inds.append((idx1, idx2))
+            handled_inds.add(idx1)
+        return overlap_inds
 
-    def test_for_overlaps(self, record_df):
+    def test_for_overlaps(self, record_df, sort=False):
         results_grouped = record_df.groupby("Sequence")
-        overlaps = []
-        seq_ids = results_grouped.groups.keys()
-        for seq in seq_ids:
-            g1 = results_grouped.get_group(seq)
-            g1_sorted = g1.sort_values(
-                "Start"
-            )  # sort to facilitate a single-pass coordinate check
-            for idx1, row1 in g1_sorted.iterrows():
-                start_val = g1_sorted.loc[idx1]["Start"]
-                stop_val = g1_sorted.loc[idx1]["Stop"]
-                matches = g1_sorted[g1_sorted["Start"] > start_val].loc[
-                    g1_sorted["Start"] < stop_val
-                ]  # find entries with a start coordinate between the current exon start and end
-                for idx2 in matches.index.values:
-                    if self.detect_overlap(g1_sorted.loc[idx1], g1_sorted.loc[idx2]):
-                        overlaps.append((idx1, idx2))
+        seq_ids = list(results_grouped.groups.keys())
+        with Pool(self.cpus) as job_pool:
+            overlaps = job_pool.starmap(
+                self.detect_overlap, zip(repeat(results_grouped), seq_ids)
+            )
+
+        overlaps = list(chain.from_iterable(overlaps))
+
+        if sort:
+            max_bitscores = []
+            for overlap in overlaps:
+                match1 = record_df.loc[overlap[0]]
+                match2 = record_df.loc[overlap[1]]
+                max_bitscores.append(max(match1["Score"], match2["Score"]))
+            sort_order = np.argsort(max_bitscores)[::-1]  # descending sort
+            overlaps = np.array(overlaps)[sort_order]
 
         return overlaps
 
@@ -443,7 +486,7 @@ class MetaeukRunner(BaseRunner):
 
         matched_seqs = []
         busco_ids_retrieved = set()
-        with open(self.refseq_db, "rU") as refseq_file:
+        with open(self.refseq_db, "r") as refseq_file:
 
             for record in SeqIO.parse(refseq_file, "fasta"):
                 if any(record.id.startswith(b) for b in self.incomplete_buscos):
@@ -619,7 +662,7 @@ class MetaeukRunner(BaseRunner):
         all_records_fna = []
         all_headers = []
         try:
-            with open(self.codon_file, "rU") as f:
+            with open(self.codon_file, "r") as f:
                 for record in SeqIO.parse(f, "fasta"):
                     header_details = self.parse_header(record.id)
                     record.id = header_details["gene_id"]
@@ -627,7 +670,7 @@ class MetaeukRunner(BaseRunner):
                     record.description = header_details["gene_id"]
                     all_records_fna.append(record)
 
-            with open(self.pred_protein_seqs, "rU") as f:
+            with open(self.pred_protein_seqs, "r") as f:
                 for record in SeqIO.parse(f, "fasta"):
                     header_details = self.parse_header(record.id)
                     record.id = header_details["gene_id"]
@@ -652,12 +695,14 @@ class MetaeukRunner(BaseRunner):
 
         if all_headers:
             matches_df = self.records_to_df(all_headers)
-            overlaps = self.test_for_overlaps(matches_df)
+            overlaps = self.test_for_overlaps(matches_df, sort=True)
             inds_to_remove = []
             for overlap in overlaps:
                 match1 = matches_df.loc[overlap[0]]
                 match2 = matches_df.loc[overlap[1]]
-                if match1["Busco id"] == match2["Busco id"]:
+                if match1.name in inds_to_remove or match2.name in inds_to_remove:
+                    continue
+                elif match1["Busco id"] == match2["Busco id"]:
                     if float(match1["Score"]) > float(match2["Score"]):
                         ind_to_remove = match2.name
                     else:


=====================================
src/busco/busco_tools/prodigal.py
=====================================
@@ -267,7 +267,7 @@ class ProdigalRunner(BaseRunner):
             self.logfile_path_err,
         ) = self.get_output_filenames()
 
-        with open(output_filename_fna, "rU") as f:
+        with open(output_filename_fna, "r") as f:
             for record in SeqIO.parse(f, "fasta"):
                 gene_name = record.id
                 self.sequences_nt[gene_name] = record
@@ -279,7 +279,7 @@ class ProdigalRunner(BaseRunner):
                     {"gene_start": gene_start, "gene_end": gene_end, "strand": strand}
                 )
 
-        with open(output_filename_faa, "rU") as f:
+        with open(output_filename_faa, "r") as f:
             for record in SeqIO.parse(f, "fasta"):
                 self.sequences_aa[record.id] = record
 


=====================================
test_data/bacteria/expected_log.txt
=====================================
@@ -1,118 +1,108 @@
-2022-08-11 08:56:42 INFO:	***** Start a BUSCO v5.4.3 analysis, current time: 08/11/2022 08:56:42 *****
-2022-08-11 08:56:42 INFO:	Configuring BUSCO with local environment
-2022-08-11 08:56:42 WARNING:	Running Auto Lineage Selector as no lineage dataset was specified. This will take a little longer than normal. If you know what lineage dataset you want to use, please specify this in the config file or using the -l (--lineage-dataset) flag in the command line.
-2022-08-11 08:56:42 INFO:	Mode is genome
-2022-08-11 08:56:42 INFO:	Downloading information on latest versions of BUSCO data...
-2022-08-11 08:56:54 INFO:	Input file is /busco_wd/test_data/bacteria/genome.fna
-2022-08-11 08:56:54 INFO:	No lineage specified. Running lineage auto selector.
+2022-12-05 16:29:58 INFO:	***** Start a BUSCO v5.4.4 analysis, current time: 12/05/2022 16:29:58 *****
+2022-12-05 16:29:58 INFO:	Configuring BUSCO with local environment
+2022-12-05 16:29:58 WARNING:	Running Auto Lineage Selector as no lineage dataset was specified. This will take a little longer than normal. If you know what lineage dataset you want to use, please specify this in the config file or using the -l (--lineage-dataset) flag in the command line.
+2022-12-05 16:29:58 INFO:	Mode is genome
+2022-12-05 16:29:58 INFO:	Downloading information on latest versions of BUSCO data...
+2022-12-05 16:30:00 INFO:	Input file is /busco_wd/test_data/bacteria/genome.fna
+2022-12-05 16:30:00 INFO:	No lineage specified. Running lineage auto selector.
 
-2022-08-11 08:56:54 INFO:	***** Starting Auto Select Lineage *****
+2022-12-05 16:30:00 INFO:	***** Starting Auto Select Lineage *****
 	This process runs BUSCO on the generic lineage datasets for the domains archaea, bacteria and eukaryota. Once the optimal domain is selected, BUSCO automatically attempts to find the most appropriate BUSCO dataset to use based on phylogenetic placement.
 	--auto-lineage-euk and --auto-lineage-prok are also available if you know your input assembly is, or is not, an eukaryote. See the user guide for more information.
 	A reminder: Busco evaluations are valid when an appropriate dataset is used, i.e., the dataset belongs to the lineage of the species to test. Because of overlapping markers/spurious matches among domains, busco matches in another domain do not necessarily mean that your genome/proteome contains sequences from this domain. However, a high busco score in multiple domains might help you identify possible contaminations.
-2022-08-11 08:56:54 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/archaea_odb10.2021-02-23.tar.gz'
-2022-08-11 08:57:00 INFO:	Decompressing file '/busco_wd/busco_downloads/lineages/archaea_odb10.tar.gz'
-2022-08-11 08:57:00 INFO:	Running BUSCO using lineage dataset archaea_odb10 (prokaryota, 2021-02-23)
-2022-08-11 08:57:00 INFO:	Running 1 job(s) on bbtools, starting at 08/11/2022 08:57:00
-2022-08-11 08:57:02 INFO:	[bbtools]	1 of 1 task(s) completed
-2022-08-11 08:57:02 INFO:	***** Run Prodigal on input to predict and extract genes *****
-2022-08-11 08:57:02 INFO:	Running Prodigal with genetic code 11 in single mode
-2022-08-11 08:57:02 INFO:	Running 1 job(s) on prodigal, starting at 08/11/2022 08:57:02
-2022-08-11 08:57:03 INFO:	[prodigal]	1 of 1 task(s) completed
-2022-08-11 08:57:03 INFO:	Genetic code 11 selected as optimal
-2022-08-11 08:57:03 INFO:	***** Run HMMER on gene sequences *****
-2022-08-11 08:57:03 INFO:	Running 194 job(s) on hmmsearch, starting at 08/11/2022 08:57:03
-2022-08-11 08:57:04 INFO:	[hmmsearch]	20 of 194 task(s) completed
-2022-08-11 08:57:04 INFO:	[hmmsearch]	39 of 194 task(s) completed
-2022-08-11 08:57:04 INFO:	[hmmsearch]	59 of 194 task(s) completed
-2022-08-11 08:57:04 INFO:	[hmmsearch]	78 of 194 task(s) completed
-2022-08-11 08:57:04 INFO:	[hmmsearch]	97 of 194 task(s) completed
-2022-08-11 08:57:04 INFO:	[hmmsearch]	117 of 194 task(s) completed
-2022-08-11 08:57:04 INFO:	[hmmsearch]	136 of 194 task(s) completed
-2022-08-11 08:57:04 INFO:	[hmmsearch]	156 of 194 task(s) completed
-2022-08-11 08:57:04 INFO:	[hmmsearch]	175 of 194 task(s) completed
-2022-08-11 08:57:05 INFO:	[hmmsearch]	194 of 194 task(s) completed
-2022-08-11 08:57:05 INFO:	Results:	C:5.2%[S:5.2%,D:0.0%],F:1.5%,M:93.3%,n:194	   
+2022-12-05 16:30:00 INFO:	Running BUSCO using lineage dataset archaea_odb10 (prokaryota, 2021-02-23)
+2022-12-05 16:30:00 INFO:	Running 1 job(s) on bbtools, starting at 12/05/2022 16:30:00
+2022-12-05 16:30:02 INFO:	[bbtools]	1 of 1 task(s) completed
+2022-12-05 16:30:02 INFO:	***** Run Prodigal on input to predict and extract genes *****
+2022-12-05 16:30:02 INFO:	Running Prodigal with genetic code 11 in single mode
+2022-12-05 16:30:02 INFO:	Running 1 job(s) on prodigal, starting at 12/05/2022 16:30:02
+2022-12-05 16:30:04 INFO:	[prodigal]	1 of 1 task(s) completed
+2022-12-05 16:30:04 INFO:	Genetic code 11 selected as optimal
+2022-12-05 16:30:04 INFO:	***** Run HMMER on gene sequences *****
+2022-12-05 16:30:04 INFO:	Running 194 job(s) on hmmsearch, starting at 12/05/2022 16:30:04
+2022-12-05 16:30:06 INFO:	[hmmsearch]	20 of 194 task(s) completed
+2022-12-05 16:30:06 INFO:	[hmmsearch]	39 of 194 task(s) completed
+2022-12-05 16:30:06 INFO:	[hmmsearch]	59 of 194 task(s) completed
+2022-12-05 16:30:06 INFO:	[hmmsearch]	78 of 194 task(s) completed
+2022-12-05 16:30:06 INFO:	[hmmsearch]	97 of 194 task(s) completed
+2022-12-05 16:30:07 INFO:	[hmmsearch]	117 of 194 task(s) completed
+2022-12-05 16:30:07 INFO:	[hmmsearch]	136 of 194 task(s) completed
+2022-12-05 16:30:07 INFO:	[hmmsearch]	156 of 194 task(s) completed
+2022-12-05 16:30:07 INFO:	[hmmsearch]	175 of 194 task(s) completed
+2022-12-05 16:30:07 INFO:	[hmmsearch]	194 of 194 task(s) completed
+2022-12-05 16:30:09 INFO:	Results:	C:5.2%[S:5.2%,D:0.0%],F:1.5%,M:93.3%,n:194	   
 
-2022-08-11 08:57:05 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz'
-2022-08-11 08:57:06 INFO:	Decompressing file '/busco_wd/busco_downloads/lineages/bacteria_odb10.tar.gz'
-2022-08-11 08:57:07 INFO:	Running BUSCO using lineage dataset bacteria_odb10 (prokaryota, 2020-03-06)
-2022-08-11 08:57:07 INFO:	Running 1 job(s) on bbtools, starting at 08/11/2022 08:57:07
-2022-08-11 08:57:08 INFO:	[bbtools]	1 of 1 task(s) completed
-2022-08-11 08:57:08 INFO:	***** Run Prodigal on input to predict and extract genes *****
-2022-08-11 08:57:08 INFO:	Genetic code 11 selected as optimal
-2022-08-11 08:57:08 INFO:	***** Run HMMER on gene sequences *****
-2022-08-11 08:57:08 INFO:	Running 124 job(s) on hmmsearch, starting at 08/11/2022 08:57:08
-2022-08-11 08:57:09 INFO:	[hmmsearch]	13 of 124 task(s) completed
-2022-08-11 08:57:09 INFO:	[hmmsearch]	38 of 124 task(s) completed
-2022-08-11 08:57:09 INFO:	[hmmsearch]	50 of 124 task(s) completed
-2022-08-11 08:57:09 INFO:	[hmmsearch]	63 of 124 task(s) completed
-2022-08-11 08:57:09 INFO:	[hmmsearch]	75 of 124 task(s) completed
-2022-08-11 08:57:09 INFO:	[hmmsearch]	87 of 124 task(s) completed
-2022-08-11 08:57:09 INFO:	[hmmsearch]	100 of 124 task(s) completed
-2022-08-11 08:57:09 INFO:	[hmmsearch]	112 of 124 task(s) completed
-2022-08-11 08:57:09 INFO:	[hmmsearch]	124 of 124 task(s) completed
-2022-08-11 08:57:09 INFO:	Results:	C:21.0%[S:21.0%,D:0.0%],F:0.8%,M:78.2%,n:124	   
+2022-12-05 16:30:09 INFO:	Running BUSCO using lineage dataset bacteria_odb10 (prokaryota, 2020-03-06)
+2022-12-05 16:30:09 INFO:	Running 1 job(s) on bbtools, starting at 12/05/2022 16:30:09
+2022-12-05 16:30:11 INFO:	[bbtools]	1 of 1 task(s) completed
+2022-12-05 16:30:11 INFO:	***** Run Prodigal on input to predict and extract genes *****
+2022-12-05 16:30:11 INFO:	Genetic code 11 selected as optimal
+2022-12-05 16:30:11 INFO:	***** Run HMMER on gene sequences *****
+2022-12-05 16:30:11 INFO:	Running 124 job(s) on hmmsearch, starting at 12/05/2022 16:30:11
+2022-12-05 16:30:12 INFO:	[hmmsearch]	13 of 124 task(s) completed
+2022-12-05 16:30:13 INFO:	[hmmsearch]	25 of 124 task(s) completed
+2022-12-05 16:30:13 INFO:	[hmmsearch]	38 of 124 task(s) completed
+2022-12-05 16:30:13 INFO:	[hmmsearch]	50 of 124 task(s) completed
+2022-12-05 16:30:13 INFO:	[hmmsearch]	63 of 124 task(s) completed
+2022-12-05 16:30:13 INFO:	[hmmsearch]	75 of 124 task(s) completed
+2022-12-05 16:30:13 INFO:	[hmmsearch]	87 of 124 task(s) completed
+2022-12-05 16:30:13 INFO:	[hmmsearch]	100 of 124 task(s) completed
+2022-12-05 16:30:14 INFO:	[hmmsearch]	112 of 124 task(s) completed
+2022-12-05 16:30:14 INFO:	[hmmsearch]	124 of 124 task(s) completed
+2022-12-05 16:30:16 INFO:	Results:	C:21.0%[S:21.0%,D:0.0%],F:0.8%,M:78.2%,n:124	   
 
-2022-08-11 08:57:09 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/eukaryota_odb10.2020-09-10.tar.gz'
-2022-08-11 08:57:13 INFO:	Decompressing file '/busco_wd/busco_downloads/lineages/eukaryota_odb10.tar.gz'
-2022-08-11 08:57:16 INFO:	Running BUSCO using lineage dataset eukaryota_odb10 (eukaryota, 2020-09-10)
-2022-08-11 08:57:16 INFO:	Running 1 job(s) on bbtools, starting at 08/11/2022 08:57:16
-2022-08-11 08:57:17 INFO:	[bbtools]	1 of 1 task(s) completed
-2022-08-11 08:57:20 INFO:	Running 1 job(s) on metaeuk, starting at 08/11/2022 08:57:20
-2022-08-11 08:57:40 INFO:	[metaeuk]	1 of 1 task(s) completed
-2022-08-11 08:57:40 INFO:	***** Run HMMER on gene sequences *****
-2022-08-11 08:57:40 INFO:	Running 255 job(s) on hmmsearch, starting at 08/11/2022 08:57:40
-2022-08-11 08:57:40 INFO:	[hmmsearch]	26 of 255 task(s) completed
-2022-08-11 08:57:40 INFO:	[hmmsearch]	51 of 255 task(s) completed
-2022-08-11 08:57:40 INFO:	[hmmsearch]	77 of 255 task(s) completed
-2022-08-11 08:57:40 INFO:	[hmmsearch]	102 of 255 task(s) completed
-2022-08-11 08:57:40 INFO:	[hmmsearch]	128 of 255 task(s) completed
-2022-08-11 08:57:41 INFO:	[hmmsearch]	153 of 255 task(s) completed
-2022-08-11 08:57:41 INFO:	[hmmsearch]	179 of 255 task(s) completed
-2022-08-11 08:57:41 INFO:	[hmmsearch]	204 of 255 task(s) completed
-2022-08-11 08:57:41 INFO:	[hmmsearch]	230 of 255 task(s) completed
-2022-08-11 08:57:41 INFO:	[hmmsearch]	255 of 255 task(s) completed
-2022-08-11 08:57:41 INFO:	Results:	C:1.2%[S:1.2%,D:0.0%],F:0.0%,M:98.8%,n:255	   
+2022-12-05 16:30:17 INFO:	Running BUSCO using lineage dataset eukaryota_odb10 (eukaryota, 2020-09-10)
+2022-12-05 16:30:17 INFO:	Running 1 job(s) on bbtools, starting at 12/05/2022 16:30:17
+2022-12-05 16:30:19 INFO:	[bbtools]	1 of 1 task(s) completed
+2022-12-05 16:30:19 INFO:	Running 1 job(s) on metaeuk, starting at 12/05/2022 16:30:19
+2022-12-05 16:31:01 INFO:	[metaeuk]	1 of 1 task(s) completed
+2022-12-05 16:31:02 INFO:	***** Run HMMER on gene sequences *****
+2022-12-05 16:31:02 INFO:	Running 255 job(s) on hmmsearch, starting at 12/05/2022 16:31:02
+2022-12-05 16:31:03 INFO:	[hmmsearch]	26 of 255 task(s) completed
+2022-12-05 16:31:04 INFO:	[hmmsearch]	51 of 255 task(s) completed
+2022-12-05 16:31:04 INFO:	[hmmsearch]	77 of 255 task(s) completed
+2022-12-05 16:31:04 INFO:	[hmmsearch]	102 of 255 task(s) completed
+2022-12-05 16:31:05 INFO:	[hmmsearch]	128 of 255 task(s) completed
+2022-12-05 16:31:05 INFO:	[hmmsearch]	153 of 255 task(s) completed
+2022-12-05 16:31:05 INFO:	[hmmsearch]	179 of 255 task(s) completed
+2022-12-05 16:31:05 INFO:	[hmmsearch]	204 of 255 task(s) completed
+2022-12-05 16:31:06 INFO:	[hmmsearch]	230 of 255 task(s) completed
+2022-12-05 16:31:06 INFO:	[hmmsearch]	255 of 255 task(s) completed
+2022-12-05 16:31:08 INFO:	Validating exons and removing overlapping matches
+2022-12-05 16:31:10 INFO:	0 candidate overlapping regions found
+2022-12-05 16:31:10 INFO:	3 exons in total
+2022-12-05 16:31:10 INFO:	Results:	C:1.2%[S:1.2%,D:0.0%],F:0.0%,M:98.8%,n:255	   
 
-2022-08-11 08:57:41 INFO:	Extracting missing and fragmented buscos from the file refseq_db.faa...
-2022-08-11 08:57:57 INFO:	Running 1 job(s) on metaeuk, starting at 08/11/2022 08:57:57
-2022-08-11 08:58:24 INFO:	[metaeuk]	1 of 1 task(s) completed
-2022-08-11 08:58:24 INFO:	***** Run HMMER on gene sequences *****
-2022-08-11 08:58:24 INFO:	Running 252 job(s) on hmmsearch, starting at 08/11/2022 08:58:24
-2022-08-11 08:58:24 INFO:	[hmmsearch]	26 of 252 task(s) completed
-2022-08-11 08:58:24 INFO:	[hmmsearch]	51 of 252 task(s) completed
-2022-08-11 08:58:24 INFO:	[hmmsearch]	76 of 252 task(s) completed
-2022-08-11 08:58:24 INFO:	[hmmsearch]	101 of 252 task(s) completed
-2022-08-11 08:58:25 INFO:	[hmmsearch]	152 of 252 task(s) completed
-2022-08-11 08:58:25 INFO:	[hmmsearch]	177 of 252 task(s) completed
-2022-08-11 08:58:25 INFO:	[hmmsearch]	202 of 252 task(s) completed
-2022-08-11 08:58:25 INFO:	[hmmsearch]	252 of 252 task(s) completed
-2022-08-11 08:58:25 INFO:	Validating exons and removing overlapping matches
-2022-08-11 08:58:25 INFO:	Results:	C:1.2%[S:1.2%,D:0.0%],F:0.0%,M:98.8%,n:255	   
+2022-12-05 16:31:10 INFO:	Extracting missing and fragmented buscos from the file refseq_db.faa...
+2022-12-05 16:31:26 INFO:	Running 1 job(s) on metaeuk, starting at 12/05/2022 16:31:26
+2022-12-05 16:32:23 INFO:	[metaeuk]	1 of 1 task(s) completed
+2022-12-05 16:32:24 INFO:	***** Run HMMER on gene sequences *****
+2022-12-05 16:32:24 INFO:	Running 252 job(s) on hmmsearch, starting at 12/05/2022 16:32:24
+2022-12-05 16:32:26 INFO:	[hmmsearch]	26 of 252 task(s) completed
+2022-12-05 16:32:26 INFO:	[hmmsearch]	51 of 252 task(s) completed
+2022-12-05 16:32:26 INFO:	[hmmsearch]	76 of 252 task(s) completed
+2022-12-05 16:32:26 INFO:	[hmmsearch]	101 of 252 task(s) completed
+2022-12-05 16:32:27 INFO:	[hmmsearch]	126 of 252 task(s) completed
+2022-12-05 16:32:27 INFO:	[hmmsearch]	152 of 252 task(s) completed
+2022-12-05 16:32:27 INFO:	[hmmsearch]	177 of 252 task(s) completed
+2022-12-05 16:32:27 INFO:	[hmmsearch]	202 of 252 task(s) completed
+2022-12-05 16:32:27 INFO:	[hmmsearch]	227 of 252 task(s) completed
+2022-12-05 16:32:27 INFO:	[hmmsearch]	252 of 252 task(s) completed
+2022-12-05 16:32:29 INFO:	Validating exons and removing overlapping matches
+2022-12-05 16:32:30 INFO:	0 candidate overlapping regions found
+2022-12-05 16:32:30 INFO:	3 exons in total
+2022-12-05 16:32:30 INFO:	Results:	C:1.2%[S:1.2%,D:0.0%],F:0.0%,M:98.8%,n:255	   
 
-2022-08-11 08:58:25 INFO:	bacteria_odb10 selected
+2022-12-05 16:32:30 INFO:	bacteria_odb10 selected
 
-2022-08-11 08:58:25 INFO:	***** Searching tree for chosen lineage to find best taxonomic match *****
+2022-12-05 16:32:30 INFO:	***** Searching tree for chosen lineage to find best taxonomic match *****
 
-2022-08-11 08:58:25 INFO:	Extract markers...
-2022-08-11 08:58:25 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/list_of_reference_markers.bacteria_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 08:58:25 INFO:	Decompressing file '/busco_wd/busco_downloads/placement_files/list_of_reference_markers.bacteria_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 08:58:25 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/tree.bacteria_odb10.2019-12-16.nwk.tar.gz'
-2022-08-11 08:58:31 INFO:	Decompressing file '/busco_wd/busco_downloads/placement_files/tree.bacteria_odb10.2019-12-16.nwk.tar.gz'
-2022-08-11 08:58:31 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/tree_metadata.bacteria_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 08:58:37 INFO:	Decompressing file '/busco_wd/busco_downloads/placement_files/tree_metadata.bacteria_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 08:58:37 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/supermatrix.aln.bacteria_odb10.2019-12-16.faa.tar.gz'
-2022-08-11 08:58:44 INFO:	Decompressing file '/busco_wd/busco_downloads/placement_files/supermatrix.aln.bacteria_odb10.2019-12-16.faa.tar.gz'
-2022-08-11 08:58:45 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.bacteria_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 08:58:50 INFO:	Decompressing file '/busco_wd/busco_downloads/placement_files/mapping_taxids-busco_dataset_name.bacteria_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 08:58:50 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxid-lineage.bacteria_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 08:58:56 INFO:	Decompressing file '/busco_wd/busco_downloads/placement_files/mapping_taxid-lineage.bacteria_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 08:58:56 INFO:	Place the markers on the reference tree...
-2022-08-11 08:58:56 INFO:	Running 1 job(s) on sepp, starting at 08/11/2022 08:58:56
-2022-08-11 08:59:51 INFO:	[sepp]	1 of 1 task(s) completed
-2022-08-11 08:59:51 INFO:	Not enough markers were placed on the tree (11). Root lineage bacteria is kept
-2022-08-11 08:59:51 INFO:	
+2022-12-05 16:32:30 INFO:	Extract markers...
+2022-12-05 16:32:30 INFO:	Place the markers on the reference tree...
+2022-12-05 16:32:30 INFO:	Running 1 job(s) on sepp, starting at 12/05/2022 16:32:30
+2022-12-05 16:33:54 INFO:	[sepp]	1 of 1 task(s) completed
+2022-12-05 16:33:54 INFO:	Not enough markers were placed on the tree (11). Root lineage bacteria is kept
+2022-12-05 16:33:54 INFO:	
 
 	--------------------------------------------------
 	|Results from dataset bacteria_odb10              |
@@ -125,12 +115,12 @@
 	|97	Missing BUSCOs (M)                        |
 	|124	Total BUSCO groups searched               |
 	--------------------------------------------------
-2022-08-11 08:59:51 INFO:	BUSCO analysis done with WARNING(s). Total running time: 177 seconds
+2022-12-05 16:33:54 INFO:	BUSCO analysis done with WARNING(s). Total running time: 234 seconds
 
 ***** Summary of warnings: *****
-2022-08-11 08:56:42 WARNING:busco.BuscoConfig	Running Auto Lineage Selector as no lineage dataset was specified. This will take a little longer than normal. If you know what lineage dataset you want to use, please specify this in the config file or using the -l (--lineage-dataset) flag in the command line.
+2022-12-05 16:29:58 WARNING:busco.BuscoConfig	Running Auto Lineage Selector as no lineage dataset was specified. This will take a little longer than normal. If you know what lineage dataset you want to use, please specify this in the config file or using the -l (--lineage-dataset) flag in the command line.
 
-2022-08-11 08:59:51 INFO:	Results written in /busco_wd/test_bacteria
-2022-08-11 08:59:51 INFO:	For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html
+2022-12-05 16:33:54 INFO:	Results written in /busco_wd/test_bacteria
+2022-12-05 16:33:54 INFO:	For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html
 
-2022-08-11 08:59:51 INFO:	Visit this page https://gitlab.com/ezlab/busco#how-to-cite-busco to see how to cite BUSCO
+2022-12-05 16:33:54 INFO:	Visit this page https://gitlab.com/ezlab/busco#how-to-cite-busco to see how to cite BUSCO


=====================================
test_data/eukaryota/expected_log.txt
=====================================
@@ -1,154 +1,148 @@
-2022-08-11 09:03:07 INFO:	***** Start a BUSCO v5.4.3 analysis, current time: 08/11/2022 09:03:07 *****
-2022-08-11 09:03:07 INFO:	Configuring BUSCO with local environment
-2022-08-11 09:03:07 WARNING:	Running Auto Lineage Selector as no lineage dataset was specified. This will take a little longer than normal. If you know what lineage dataset you want to use, please specify this in the config file or using the -l (--lineage-dataset) flag in the command line.
-2022-08-11 09:03:07 INFO:	Mode is genome
-2022-08-11 09:03:07 INFO:	Downloading information on latest versions of BUSCO data...
-2022-08-11 09:03:19 INFO:	Input file is /busco_wd/test_data/eukaryota/genome.fna
-2022-08-11 09:03:19 INFO:	No lineage specified. Running lineage auto selector.
+2022-12-05 16:33:55 INFO:	***** Start a BUSCO v5.4.4 analysis, current time: 12/05/2022 16:33:55 *****
+2022-12-05 16:33:55 INFO:	Configuring BUSCO with local environment
+2022-12-05 16:33:55 WARNING:	Running Auto Lineage Selector as no lineage dataset was specified. This will take a little longer than normal. If you know what lineage dataset you want to use, please specify this in the config file or using the -l (--lineage-dataset) flag in the command line.
+2022-12-05 16:33:55 INFO:	Mode is genome
+2022-12-05 16:33:55 INFO:	Downloading information on latest versions of BUSCO data...
+2022-12-05 16:33:57 INFO:	Input file is /busco_wd/test_data/eukaryota/genome.fna
+2022-12-05 16:33:57 INFO:	No lineage specified. Running lineage auto selector.
 
-2022-08-11 09:03:19 INFO:	***** Starting Auto Select Lineage *****
+2022-12-05 16:33:57 INFO:	***** Starting Auto Select Lineage *****
 	This process runs BUSCO on the generic lineage datasets for the domains archaea, bacteria and eukaryota. Once the optimal domain is selected, BUSCO automatically attempts to find the most appropriate BUSCO dataset to use based on phylogenetic placement.
 	--auto-lineage-euk and --auto-lineage-prok are also available if you know your input assembly is, or is not, an eukaryote. See the user guide for more information.
 	A reminder: Busco evaluations are valid when an appropriate dataset is used, i.e., the dataset belongs to the lineage of the species to test. Because of overlapping markers/spurious matches among domains, busco matches in another domain do not necessarily mean that your genome/proteome contains sequences from this domain. However, a high busco score in multiple domains might help you identify possible contaminations.
-2022-08-11 09:03:19 INFO:	Running BUSCO using lineage dataset archaea_odb10 (prokaryota, 2021-02-23)
-2022-08-11 09:03:19 INFO:	Running 1 job(s) on bbtools, starting at 08/11/2022 09:03:19
-2022-08-11 09:03:20 INFO:	[bbtools]	1 of 1 task(s) completed
-2022-08-11 09:03:20 INFO:	***** Run Prodigal on input to predict and extract genes *****
-2022-08-11 09:03:20 INFO:	Running Prodigal with genetic code 11 in single mode
-2022-08-11 09:03:20 INFO:	Running 1 job(s) on prodigal, starting at 08/11/2022 09:03:20
-2022-08-11 09:03:21 INFO:	[prodigal]	1 of 1 task(s) completed
-2022-08-11 09:03:21 INFO:	Genetic code 11 selected as optimal
-2022-08-11 09:03:21 INFO:	***** Run HMMER on gene sequences *****
-2022-08-11 09:03:21 INFO:	Running 194 job(s) on hmmsearch, starting at 08/11/2022 09:03:21
-2022-08-11 09:03:22 INFO:	[hmmsearch]	20 of 194 task(s) completed
-2022-08-11 09:03:22 INFO:	[hmmsearch]	39 of 194 task(s) completed
-2022-08-11 09:03:22 INFO:	[hmmsearch]	59 of 194 task(s) completed
-2022-08-11 09:03:22 INFO:	[hmmsearch]	78 of 194 task(s) completed
-2022-08-11 09:03:22 INFO:	[hmmsearch]	97 of 194 task(s) completed
-2022-08-11 09:03:22 INFO:	[hmmsearch]	97 of 194 task(s) completed
-2022-08-11 09:03:22 INFO:	[hmmsearch]	117 of 194 task(s) completed
-2022-08-11 09:03:22 INFO:	[hmmsearch]	136 of 194 task(s) completed
-2022-08-11 09:03:22 INFO:	[hmmsearch]	136 of 194 task(s) completed
-2022-08-11 09:03:22 INFO:	[hmmsearch]	156 of 194 task(s) completed
-2022-08-11 09:03:22 INFO:	[hmmsearch]	175 of 194 task(s) completed
-2022-08-11 09:03:22 INFO:	[hmmsearch]	194 of 194 task(s) completed
-2022-08-11 09:03:22 INFO:	Results:	C:1.0%[S:1.0%,D:0.0%],F:0.5%,M:98.5%,n:194	   
+2022-12-05 16:33:57 INFO:	Running BUSCO using lineage dataset archaea_odb10 (prokaryota, 2021-02-23)
+2022-12-05 16:33:57 INFO:	Running 1 job(s) on bbtools, starting at 12/05/2022 16:33:57
+2022-12-05 16:33:58 INFO:	[bbtools]	1 of 1 task(s) completed
+2022-12-05 16:33:58 INFO:	***** Run Prodigal on input to predict and extract genes *****
+2022-12-05 16:33:58 INFO:	Running Prodigal with genetic code 11 in single mode
+2022-12-05 16:33:58 INFO:	Running 1 job(s) on prodigal, starting at 12/05/2022 16:33:58
+2022-12-05 16:33:59 INFO:	[prodigal]	1 of 1 task(s) completed
+2022-12-05 16:33:59 INFO:	Genetic code 11 selected as optimal
+2022-12-05 16:33:59 INFO:	***** Run HMMER on gene sequences *****
+2022-12-05 16:33:59 INFO:	Running 194 job(s) on hmmsearch, starting at 12/05/2022 16:33:59
+2022-12-05 16:34:00 INFO:	[hmmsearch]	20 of 194 task(s) completed
+2022-12-05 16:34:00 INFO:	[hmmsearch]	39 of 194 task(s) completed
+2022-12-05 16:34:00 INFO:	[hmmsearch]	59 of 194 task(s) completed
+2022-12-05 16:34:00 INFO:	[hmmsearch]	78 of 194 task(s) completed
+2022-12-05 16:34:00 INFO:	[hmmsearch]	97 of 194 task(s) completed
+2022-12-05 16:34:01 INFO:	[hmmsearch]	117 of 194 task(s) completed
+2022-12-05 16:34:01 INFO:	[hmmsearch]	136 of 194 task(s) completed
+2022-12-05 16:34:01 INFO:	[hmmsearch]	156 of 194 task(s) completed
+2022-12-05 16:34:01 INFO:	[hmmsearch]	175 of 194 task(s) completed
+2022-12-05 16:34:01 INFO:	[hmmsearch]	194 of 194 task(s) completed
+2022-12-05 16:34:02 INFO:	Results:	C:1.0%[S:1.0%,D:0.0%],F:0.5%,M:98.5%,n:194	   
 
-2022-08-11 09:03:22 INFO:	Running BUSCO using lineage dataset bacteria_odb10 (prokaryota, 2020-03-06)
-2022-08-11 09:03:22 INFO:	Running 1 job(s) on bbtools, starting at 08/11/2022 09:03:22
-2022-08-11 09:03:23 INFO:	[bbtools]	1 of 1 task(s) completed
-2022-08-11 09:03:23 INFO:	***** Run Prodigal on input to predict and extract genes *****
-2022-08-11 09:03:23 INFO:	Genetic code 11 selected as optimal
-2022-08-11 09:03:23 INFO:	***** Run HMMER on gene sequences *****
-2022-08-11 09:03:23 INFO:	Running 124 job(s) on hmmsearch, starting at 08/11/2022 09:03:23
-2022-08-11 09:03:24 INFO:	[hmmsearch]	13 of 124 task(s) completed
-2022-08-11 09:03:24 INFO:	[hmmsearch]	25 of 124 task(s) completed
-2022-08-11 09:03:24 INFO:	[hmmsearch]	25 of 124 task(s) completed
-2022-08-11 09:03:24 INFO:	[hmmsearch]	50 of 124 task(s) completed
-2022-08-11 09:03:24 INFO:	[hmmsearch]	63 of 124 task(s) completed
-2022-08-11 09:03:24 INFO:	[hmmsearch]	75 of 124 task(s) completed
-2022-08-11 09:03:24 INFO:	[hmmsearch]	87 of 124 task(s) completed
-2022-08-11 09:03:24 INFO:	[hmmsearch]	100 of 124 task(s) completed
-2022-08-11 09:03:24 INFO:	[hmmsearch]	112 of 124 task(s) completed
-2022-08-11 09:03:24 INFO:	[hmmsearch]	124 of 124 task(s) completed
-2022-08-11 09:03:24 WARNING:	BUSCO did not find any match. Make sure to check the log files if this is unexpected.
-2022-08-11 09:03:24 INFO:	Results:	C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:124	   
+2022-12-05 16:34:02 INFO:	Running BUSCO using lineage dataset bacteria_odb10 (prokaryota, 2020-03-06)
+2022-12-05 16:34:02 INFO:	Running 1 job(s) on bbtools, starting at 12/05/2022 16:34:02
+2022-12-05 16:34:03 INFO:	[bbtools]	1 of 1 task(s) completed
+2022-12-05 16:34:03 INFO:	***** Run Prodigal on input to predict and extract genes *****
+2022-12-05 16:34:03 INFO:	Genetic code 11 selected as optimal
+2022-12-05 16:34:03 INFO:	***** Run HMMER on gene sequences *****
+2022-12-05 16:34:03 INFO:	Running 124 job(s) on hmmsearch, starting at 12/05/2022 16:34:03
+2022-12-05 16:34:04 INFO:	[hmmsearch]	13 of 124 task(s) completed
+2022-12-05 16:34:04 INFO:	[hmmsearch]	25 of 124 task(s) completed
+2022-12-05 16:34:04 INFO:	[hmmsearch]	38 of 124 task(s) completed
+2022-12-05 16:34:04 INFO:	[hmmsearch]	50 of 124 task(s) completed
+2022-12-05 16:34:04 INFO:	[hmmsearch]	63 of 124 task(s) completed
+2022-12-05 16:34:04 INFO:	[hmmsearch]	75 of 124 task(s) completed
+2022-12-05 16:34:04 INFO:	[hmmsearch]	87 of 124 task(s) completed
+2022-12-05 16:34:04 INFO:	[hmmsearch]	100 of 124 task(s) completed
+2022-12-05 16:34:04 INFO:	[hmmsearch]	112 of 124 task(s) completed
+2022-12-05 16:34:04 INFO:	[hmmsearch]	124 of 124 task(s) completed
+2022-12-05 16:34:05 WARNING:	BUSCO did not find any match. Make sure to check the log files if this is unexpected.
+2022-12-05 16:34:05 INFO:	Results:	C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:124	   
 
-2022-08-11 09:03:24 INFO:	Running BUSCO using lineage dataset eukaryota_odb10 (eukaryota, 2020-09-10)
-2022-08-11 09:03:24 INFO:	Running 1 job(s) on bbtools, starting at 08/11/2022 09:03:24
-2022-08-11 09:03:25 INFO:	[bbtools]	1 of 1 task(s) completed
-2022-08-11 09:03:25 INFO:	Running 1 job(s) on metaeuk, starting at 08/11/2022 09:03:25
-2022-08-11 09:03:45 INFO:	[metaeuk]	1 of 1 task(s) completed
-2022-08-11 09:03:45 INFO:	***** Run HMMER on gene sequences *****
-2022-08-11 09:03:45 INFO:	Running 255 job(s) on hmmsearch, starting at 08/11/2022 09:03:45
-2022-08-11 09:03:46 INFO:	[hmmsearch]	26 of 255 task(s) completed
-2022-08-11 09:03:46 INFO:	[hmmsearch]	51 of 255 task(s) completed
-2022-08-11 09:03:46 INFO:	[hmmsearch]	77 of 255 task(s) completed
-2022-08-11 09:03:46 INFO:	[hmmsearch]	102 of 255 task(s) completed
-2022-08-11 09:03:46 INFO:	[hmmsearch]	153 of 255 task(s) completed
-2022-08-11 09:03:46 INFO:	[hmmsearch]	179 of 255 task(s) completed
-2022-08-11 09:03:46 INFO:	[hmmsearch]	204 of 255 task(s) completed
-2022-08-11 09:03:46 INFO:	[hmmsearch]	230 of 255 task(s) completed
-2022-08-11 09:03:46 INFO:	[hmmsearch]	255 of 255 task(s) completed
-2022-08-11 09:03:46 INFO:	Results:	C:19.2%[S:19.2%,D:0.0%],F:0.8%,M:80.0%,n:255	   
+2022-12-05 16:34:05 INFO:	Running BUSCO using lineage dataset eukaryota_odb10 (eukaryota, 2020-09-10)
+2022-12-05 16:34:05 INFO:	Running 1 job(s) on bbtools, starting at 12/05/2022 16:34:05
+2022-12-05 16:34:06 INFO:	[bbtools]	1 of 1 task(s) completed
+2022-12-05 16:34:06 INFO:	Running 1 job(s) on metaeuk, starting at 12/05/2022 16:34:06
+2022-12-05 16:34:35 INFO:	[metaeuk]	1 of 1 task(s) completed
+2022-12-05 16:34:37 INFO:	***** Run HMMER on gene sequences *****
+2022-12-05 16:34:37 INFO:	Running 255 job(s) on hmmsearch, starting at 12/05/2022 16:34:37
+2022-12-05 16:34:38 INFO:	[hmmsearch]	26 of 255 task(s) completed
+2022-12-05 16:34:38 INFO:	[hmmsearch]	51 of 255 task(s) completed
+2022-12-05 16:34:39 INFO:	[hmmsearch]	77 of 255 task(s) completed
+2022-12-05 16:34:39 INFO:	[hmmsearch]	102 of 255 task(s) completed
+2022-12-05 16:34:39 INFO:	[hmmsearch]	128 of 255 task(s) completed
+2022-12-05 16:34:39 INFO:	[hmmsearch]	153 of 255 task(s) completed
+2022-12-05 16:34:39 INFO:	[hmmsearch]	179 of 255 task(s) completed
+2022-12-05 16:34:39 INFO:	[hmmsearch]	204 of 255 task(s) completed
+2022-12-05 16:34:40 INFO:	[hmmsearch]	255 of 255 task(s) completed
+2022-12-05 16:34:41 INFO:	Validating exons and removing overlapping matches
+2022-12-05 16:34:42 INFO:	0 candidate overlapping regions found
+2022-12-05 16:34:42 INFO:	51 exons in total
+2022-12-05 16:34:42 INFO:	Results:	C:19.2%[S:19.2%,D:0.0%],F:0.8%,M:80.0%,n:255	   
 
-2022-08-11 09:03:46 INFO:	Extracting missing and fragmented buscos from the file refseq_db.faa...
-2022-08-11 09:04:01 INFO:	Running 1 job(s) on metaeuk, starting at 08/11/2022 09:04:01
-2022-08-11 09:04:16 INFO:	[metaeuk]	1 of 1 task(s) completed
-2022-08-11 09:04:16 INFO:	***** Run HMMER on gene sequences *****
-2022-08-11 09:04:16 INFO:	Running 206 job(s) on hmmsearch, starting at 08/11/2022 09:04:16
-2022-08-11 09:04:17 INFO:	[hmmsearch]	21 of 206 task(s) completed
-2022-08-11 09:04:17 INFO:	[hmmsearch]	42 of 206 task(s) completed
-2022-08-11 09:04:17 INFO:	[hmmsearch]	62 of 206 task(s) completed
-2022-08-11 09:04:17 INFO:	[hmmsearch]	83 of 206 task(s) completed
-2022-08-11 09:04:17 INFO:	[hmmsearch]	104 of 206 task(s) completed
-2022-08-11 09:04:17 INFO:	[hmmsearch]	124 of 206 task(s) completed
-2022-08-11 09:04:17 INFO:	[hmmsearch]	145 of 206 task(s) completed
-2022-08-11 09:04:17 INFO:	[hmmsearch]	186 of 206 task(s) completed
-2022-08-11 09:04:17 INFO:	[hmmsearch]	206 of 206 task(s) completed
-2022-08-11 09:04:18 INFO:	Validating exons and removing overlapping matches
-2022-08-11 09:04:18 INFO:	Results:	C:19.2%[S:19.2%,D:0.0%],F:0.8%,M:80.0%,n:255	   
+2022-12-05 16:34:42 INFO:	Extracting missing and fragmented buscos from the file refseq_db.faa...
+2022-12-05 16:34:57 INFO:	Running 1 job(s) on metaeuk, starting at 12/05/2022 16:34:57
+2022-12-05 16:35:17 INFO:	[metaeuk]	1 of 1 task(s) completed
+2022-12-05 16:35:18 INFO:	***** Run HMMER on gene sequences *****
+2022-12-05 16:35:18 INFO:	Running 206 job(s) on hmmsearch, starting at 12/05/2022 16:35:18
+2022-12-05 16:35:20 INFO:	[hmmsearch]	21 of 206 task(s) completed
+2022-12-05 16:35:20 INFO:	[hmmsearch]	42 of 206 task(s) completed
+2022-12-05 16:35:20 INFO:	[hmmsearch]	62 of 206 task(s) completed
+2022-12-05 16:35:20 INFO:	[hmmsearch]	83 of 206 task(s) completed
+2022-12-05 16:35:20 INFO:	[hmmsearch]	104 of 206 task(s) completed
+2022-12-05 16:35:20 INFO:	[hmmsearch]	124 of 206 task(s) completed
+2022-12-05 16:35:20 INFO:	[hmmsearch]	145 of 206 task(s) completed
+2022-12-05 16:35:20 INFO:	[hmmsearch]	165 of 206 task(s) completed
+2022-12-05 16:35:20 INFO:	[hmmsearch]	186 of 206 task(s) completed
+2022-12-05 16:35:20 INFO:	[hmmsearch]	206 of 206 task(s) completed
+2022-12-05 16:35:22 INFO:	Validating exons and removing overlapping matches
+2022-12-05 16:35:23 INFO:	0 candidate overlapping regions found
+2022-12-05 16:35:23 INFO:	51 exons in total
+2022-12-05 16:35:23 INFO:	Results:	C:19.2%[S:19.2%,D:0.0%],F:0.8%,M:80.0%,n:255	   
 
-2022-08-11 09:04:18 INFO:	eukaryota_odb10 selected
+2022-12-05 16:35:23 INFO:	eukaryota_odb10 selected
 
-2022-08-11 09:04:18 INFO:	***** Searching tree for chosen lineage to find best taxonomic match *****
+2022-12-05 16:35:23 INFO:	***** Searching tree for chosen lineage to find best taxonomic match *****
 
-2022-08-11 09:04:18 INFO:	Extract markers...
-2022-08-11 09:04:18 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/list_of_reference_markers.eukaryota_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 09:04:18 INFO:	Decompressing file '/busco_wd/busco_downloads/placement_files/list_of_reference_markers.eukaryota_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 09:04:18 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/tree.eukaryota_odb10.2019-12-16.nwk.tar.gz'
-2022-08-11 09:04:24 INFO:	Decompressing file '/busco_wd/busco_downloads/placement_files/tree.eukaryota_odb10.2019-12-16.nwk.tar.gz'
-2022-08-11 09:04:24 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/tree_metadata.eukaryota_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 09:04:30 INFO:	Decompressing file '/busco_wd/busco_downloads/placement_files/tree_metadata.eukaryota_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 09:04:30 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/supermatrix.aln.eukaryota_odb10.2019-12-16.faa.tar.gz'
-2022-08-11 09:04:37 INFO:	Decompressing file '/busco_wd/busco_downloads/placement_files/supermatrix.aln.eukaryota_odb10.2019-12-16.faa.tar.gz'
-2022-08-11 09:04:37 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 09:04:42 INFO:	Decompressing file '/busco_wd/busco_downloads/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 09:04:42 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxid-lineage.eukaryota_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 09:04:48 INFO:	Decompressing file '/busco_wd/busco_downloads/placement_files/mapping_taxid-lineage.eukaryota_odb10.2019-12-16.txt.tar.gz'
-2022-08-11 09:04:48 INFO:	Place the markers on the reference tree...
-2022-08-11 09:04:48 INFO:	Running 1 job(s) on sepp, starting at 08/11/2022 09:04:48
-2022-08-11 09:06:54 INFO:	[sepp]	1 of 1 task(s) completed
-2022-08-11 09:06:55 INFO:	Lineage saccharomycetes is selected, supported by 18 markers out of 19
-2022-08-11 09:06:55 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/saccharomycetes_odb10.2020-08-05.tar.gz'
-2022-08-11 09:06:56 INFO:	Decompressing file '/busco_wd/busco_downloads/lineages/saccharomycetes_odb10.tar.gz'
-2022-08-11 09:07:06 INFO:	Running BUSCO using lineage dataset saccharomycetes_odb10 (eukaryota, 2020-08-05)
-2022-08-11 09:07:06 INFO:	Running 1 job(s) on bbtools, starting at 08/11/2022 09:07:06
-2022-08-11 09:07:06 INFO:	[bbtools]	1 of 1 task(s) completed
-2022-08-11 09:07:07 INFO:	Running 1 job(s) on metaeuk, starting at 08/11/2022 09:07:07
-2022-08-11 09:07:10 INFO:	[metaeuk]	1 of 1 task(s) completed
-2022-08-11 09:07:10 INFO:	***** Run HMMER on gene sequences *****
-2022-08-11 09:07:10 INFO:	Running 2137 job(s) on hmmsearch, starting at 08/11/2022 09:07:10
-2022-08-11 09:07:11 INFO:	[hmmsearch]	214 of 2137 task(s) completed
-2022-08-11 09:07:12 INFO:	[hmmsearch]	428 of 2137 task(s) completed
-2022-08-11 09:07:12 INFO:	[hmmsearch]	642 of 2137 task(s) completed
-2022-08-11 09:07:12 INFO:	[hmmsearch]	855 of 2137 task(s) completed
-2022-08-11 09:07:13 INFO:	[hmmsearch]	1069 of 2137 task(s) completed
-2022-08-11 09:07:13 INFO:	[hmmsearch]	1069 of 2137 task(s) completed
-2022-08-11 09:07:13 INFO:	[hmmsearch]	1069 of 2137 task(s) completed
-2022-08-11 09:07:13 INFO:	[hmmsearch]	1283 of 2137 task(s) completed
-2022-08-11 09:07:14 INFO:	[hmmsearch]	1496 of 2137 task(s) completed
-2022-08-11 09:07:14 INFO:	[hmmsearch]	1710 of 2137 task(s) completed
-2022-08-11 09:07:14 INFO:	[hmmsearch]	1924 of 2137 task(s) completed
-2022-08-11 09:07:15 INFO:	[hmmsearch]	2137 of 2137 task(s) completed
-2022-08-11 09:07:17 INFO:	Extracting missing and fragmented buscos from the file refseq_db.faa...
-2022-08-11 09:07:22 INFO:	Running 1 job(s) on metaeuk, starting at 08/11/2022 09:07:22
-2022-08-11 09:07:25 INFO:	[metaeuk]	1 of 1 task(s) completed
-2022-08-11 09:07:26 INFO:	***** Run HMMER on gene sequences *****
-2022-08-11 09:07:26 INFO:	Running 2093 job(s) on hmmsearch, starting at 08/11/2022 09:07:26
-2022-08-11 09:07:27 INFO:	[hmmsearch]	419 of 2093 task(s) completed
-2022-08-11 09:07:27 INFO:	[hmmsearch]	628 of 2093 task(s) completed
-2022-08-11 09:07:28 INFO:	[hmmsearch]	838 of 2093 task(s) completed
-2022-08-11 09:07:29 INFO:	[hmmsearch]	1047 of 2093 task(s) completed
-2022-08-11 09:07:29 INFO:	[hmmsearch]	1256 of 2093 task(s) completed
-2022-08-11 09:07:29 INFO:	[hmmsearch]	1466 of 2093 task(s) completed
-2022-08-11 09:07:30 INFO:	[hmmsearch]	1675 of 2093 task(s) completed
-2022-08-11 09:07:30 INFO:	[hmmsearch]	1884 of 2093 task(s) completed
-2022-08-11 09:07:31 INFO:	[hmmsearch]	2093 of 2093 task(s) completed
-2022-08-11 09:07:33 INFO:	Validating exons and removing overlapping matches
-2022-08-11 09:07:33 INFO:	Results:	C:2.1%[S:2.1%,D:0.0%],F:0.0%,M:97.9%,n:2137	   
+2022-12-05 16:35:23 INFO:	Extract markers...
+2022-12-05 16:35:23 INFO:	Place the markers on the reference tree...
+2022-12-05 16:35:23 INFO:	Running 1 job(s) on sepp, starting at 12/05/2022 16:35:23
+2022-12-05 16:38:57 INFO:	[sepp]	1 of 1 task(s) completed
+2022-12-05 16:38:58 INFO:	Lineage saccharomycetes is selected, supported by 18 markers out of 19
+2022-12-05 16:38:58 INFO:	Running BUSCO using lineage dataset saccharomycetes_odb10 (eukaryota, 2020-08-05)
+2022-12-05 16:38:58 INFO:	Running 1 job(s) on bbtools, starting at 12/05/2022 16:38:58
+2022-12-05 16:38:59 INFO:	[bbtools]	1 of 1 task(s) completed
+2022-12-05 16:38:59 INFO:	Running 1 job(s) on metaeuk, starting at 12/05/2022 16:38:59
+2022-12-05 16:39:06 INFO:	[metaeuk]	1 of 1 task(s) completed
+2022-12-05 16:39:07 INFO:	***** Run HMMER on gene sequences *****
+2022-12-05 16:39:07 INFO:	Running 2137 job(s) on hmmsearch, starting at 12/05/2022 16:39:07
+2022-12-05 16:39:12 INFO:	[hmmsearch]	214 of 2137 task(s) completed
+2022-12-05 16:39:14 INFO:	[hmmsearch]	428 of 2137 task(s) completed
+2022-12-05 16:39:16 INFO:	[hmmsearch]	642 of 2137 task(s) completed
+2022-12-05 16:39:18 INFO:	[hmmsearch]	855 of 2137 task(s) completed
+2022-12-05 16:39:20 INFO:	[hmmsearch]	1069 of 2137 task(s) completed
+2022-12-05 16:39:22 INFO:	[hmmsearch]	1283 of 2137 task(s) completed
+2022-12-05 16:39:23 INFO:	[hmmsearch]	1496 of 2137 task(s) completed
+2022-12-05 16:39:25 INFO:	[hmmsearch]	1710 of 2137 task(s) completed
+2022-12-05 16:39:27 INFO:	[hmmsearch]	1924 of 2137 task(s) completed
+2022-12-05 16:39:31 INFO:	[hmmsearch]	2137 of 2137 task(s) completed
+2022-12-05 16:39:34 INFO:	Validating exons and removing overlapping matches
+2022-12-05 16:39:36 INFO:	0 candidate overlapping regions found
+2022-12-05 16:39:36 INFO:	45 exons in total
+2022-12-05 16:39:36 INFO:	Extracting missing and fragmented buscos from the file refseq_db.faa...
+2022-12-05 16:39:41 INFO:	Running 1 job(s) on metaeuk, starting at 12/05/2022 16:39:41
+2022-12-05 16:39:48 INFO:	[metaeuk]	1 of 1 task(s) completed
+2022-12-05 16:39:50 INFO:	***** Run HMMER on gene sequences *****
+2022-12-05 16:39:50 INFO:	Running 2093 job(s) on hmmsearch, starting at 12/05/2022 16:39:50
+2022-12-05 16:39:54 INFO:	[hmmsearch]	210 of 2093 task(s) completed
+2022-12-05 16:39:55 INFO:	[hmmsearch]	419 of 2093 task(s) completed
+2022-12-05 16:39:56 INFO:	[hmmsearch]	628 of 2093 task(s) completed
+2022-12-05 16:39:57 INFO:	[hmmsearch]	838 of 2093 task(s) completed
+2022-12-05 16:39:58 INFO:	[hmmsearch]	1047 of 2093 task(s) completed
+2022-12-05 16:39:59 INFO:	[hmmsearch]	1256 of 2093 task(s) completed
+2022-12-05 16:40:01 INFO:	[hmmsearch]	1466 of 2093 task(s) completed
+2022-12-05 16:40:02 INFO:	[hmmsearch]	1675 of 2093 task(s) completed
+2022-12-05 16:40:02 INFO:	[hmmsearch]	1884 of 2093 task(s) completed
+2022-12-05 16:40:04 INFO:	[hmmsearch]	2093 of 2093 task(s) completed
+2022-12-05 16:40:06 INFO:	Validating exons and removing overlapping matches
+2022-12-05 16:40:08 INFO:	3 candidate overlapping regions found
+2022-12-05 16:40:08 INFO:	49 exons in total
+2022-12-05 16:40:10 INFO:	Results:	C:2.1%[S:2.1%,D:0.0%],F:0.0%,M:97.9%,n:2137	   
 
-2022-08-11 09:07:33 INFO:	
+2022-12-05 16:40:10 INFO:	
 
 	--------------------------------------------------
 	|Results from generic domain eukaryota_odb10      |
@@ -173,13 +167,13 @@
 	|2091	Missing BUSCOs (M)                        |
 	|2137	Total BUSCO groups searched               |
 	--------------------------------------------------
-2022-08-11 09:07:33 INFO:	BUSCO analysis done with WARNING(s). Total running time: 254 seconds
+2022-12-05 16:40:10 INFO:	BUSCO analysis done with WARNING(s). Total running time: 373 seconds
 
 ***** Summary of warnings: *****
-2022-08-11 09:03:07 WARNING:busco.BuscoConfig	Running Auto Lineage Selector as no lineage dataset was specified. This will take a little longer than normal. If you know what lineage dataset you want to use, please specify this in the config file or using the -l (--lineage-dataset) flag in the command line.
-2022-08-11 09:03:24 WARNING:busco.busco_tools.hmmer	BUSCO did not find any match. Make sure to check the log files if this is unexpected.
+2022-12-05 16:33:55 WARNING:busco.BuscoConfig	Running Auto Lineage Selector as no lineage dataset was specified. This will take a little longer than normal. If you know what lineage dataset you want to use, please specify this in the config file or using the -l (--lineage-dataset) flag in the command line.
+2022-12-05 16:34:05 WARNING:busco.busco_tools.hmmer	BUSCO did not find any match. Make sure to check the log files if this is unexpected.
 
-2022-08-11 09:07:33 INFO:	Results written in /busco_wd/test_eukaryota
-2022-08-11 09:07:33 INFO:	For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html
+2022-12-05 16:40:10 INFO:	Results written in /busco_wd/test_eukaryota
+2022-12-05 16:40:10 INFO:	For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html
 
-2022-08-11 09:07:33 INFO:	Visit this page https://gitlab.com/ezlab/busco#how-to-cite-busco to see how to cite BUSCO
+2022-12-05 16:40:10 INFO:	Visit this page https://gitlab.com/ezlab/busco#how-to-cite-busco to see how to cite BUSCO



View it on GitLab: https://salsa.debian.org/med-team/busco/-/commit/6e290d9343b44c115773574d6c41d27f03e6bf44

-- 
View it on GitLab: https://salsa.debian.org/med-team/busco/-/commit/6e290d9343b44c115773574d6c41d27f03e6bf44
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20221231/fc1ed0ac/attachment-0001.htm>


More information about the debian-med-commit mailing list