[med-svn] [Git][med-team/q2-feature-classifier][upstream] New upstream version 2024.2.0
Andreas Tille (@tille)
gitlab at salsa.debian.org
Sun Feb 18 15:11:04 GMT 2024
Andreas Tille pushed to branch upstream at Debian Med / q2-feature-classifier
Commits:
c639ef80 by Andreas Tille at 2024-02-18T15:35:09+01:00
New upstream version 2024.2.0
- - - - -
9 changed files:
- .github/workflows/ci-dev.yaml
- README.md
- q2_feature_classifier/_blast.py
- q2_feature_classifier/_consensus_assignment.py
- q2_feature_classifier/_skl.py
- q2_feature_classifier/_version.py
- q2_feature_classifier/_vsearch.py
- q2_feature_classifier/classifier.py
- q2_feature_classifier/tests/test_classifier.py
Changes:
=====================================
.github/workflows/ci-dev.yaml
=====================================
@@ -9,4 +9,4 @@ jobs:
ci:
uses: qiime2/distributions/.github/workflows/lib-ci-dev.yaml@dev
with:
- distro: core
\ No newline at end of file
+ distro: amplicon
=====================================
README.md
=====================================
@@ -1,5 +1,5 @@
# q2-feature-classifier
-![](https://github.com/qiime2/q2-feature-classifier/workflows/ci/badge.svg)
+![](https://github.com/qiime2/q2-feature-classifier/workflows/ci-dev/badge.svg)
This is a QIIME 2 plugin. For details on QIIME 2, see https://qiime2.org.
\ No newline at end of file
=====================================
q2_feature_classifier/_blast.py
=====================================
@@ -14,7 +14,9 @@ from q2_types.feature_data import (
FeatureData, Taxonomy, Sequence, DNAFASTAFormat, DNAIterator, BLAST6,
BLAST6Format)
from .types import BLASTDBDirFmtV5, BLASTDB
-from qiime2.plugin import Int, Str, Float, Choices, Range, Bool
+from qiime2.plugin import (
+ Int, Str, Float, Choices, Range, Bool, Threads, get_available_cores
+)
from .plugin_setup import plugin, citations
from ._consensus_assignment import (
min_consensus_param, min_consensus_param_description,
@@ -59,6 +61,9 @@ def blast(query: DNAFASTAFormat,
evalue: float = DEFAULTEVALUE,
output_no_hits: bool = DEFAULTOUTPUTNOHITS,
num_threads: int = DEFAULTNUMTHREADS) -> pd.DataFrame:
+ if num_threads == 0:
+ num_threads = get_available_cores()
+
if reference_reads and blastdb:
raise ValueError('Only one reference_reads or blastdb artifact '
'can be provided as input. Choose one and try '
@@ -118,6 +123,9 @@ def classify_consensus_blast(ctx,
min_consensus=DEFAULTMINCONSENSUS,
unassignable_label=DEFAULTUNASSIGNABLELABEL,
num_threads=DEFAULTNUMTHREADS):
+ if num_threads == 0:
+ num_threads = get_available_cores()
+
search_db = ctx.get_action('feature_classifier', 'blast')
lca = ctx.get_action('feature_classifier', 'find_consensus_annotation')
result, = search_db(query=query, blastdb=blastdb,
@@ -179,7 +187,7 @@ parameters = {'evalue': Float,
'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
'strand': Str % Choices(['both', 'plus', 'minus']),
'output_no_hits': Bool,
- 'num_threads': Int % Range(1, None),
+ 'num_threads': Threads,
}
parameter_descriptions = {
@@ -206,7 +214,8 @@ parameter_descriptions = {
'unclassified sequences, otherwise you may run into '
'errors downstream from missing feature IDs. Set to '
'FALSE to mirror default BLAST search.',
- 'num_threads': 'Number of threads (CPUs) to use in the BLAST search.'
+ 'num_threads': 'Number of threads (CPUs) to use in the BLAST search. '
+ 'Pass 0 to use all available CPUs.',
}
blast6_output = ('search_results', FeatureData[BLAST6])
=====================================
q2_feature_classifier/_consensus_assignment.py
=====================================
@@ -32,19 +32,19 @@ def find_consensus_annotation(search_results: pd.DataFrame,
unassignable_label: str =
DEFAULTUNASSIGNABLELABEL
) -> pd.DataFrame:
- '''Find consensus taxonomy from BLAST6Format alignment summary.
+ """Find consensus taxonomy from BLAST6Format alignment summary.
search_results: pd.dataframe
BLAST6Format search results with canonical headers attached.
reference_taxonomy: pd.Series
Annotations of reference database used for original search.
min_consensus : float
- The minimum fraction of the annotations that a specfic annotation
+ The minimum fraction of the annotations that a specific annotation
must be present in for that annotation to be accepted. Current
lower boundary is 0.51.
unassignable_label : str
The label to apply if no acceptable annotations are identified.
- '''
+ """
# load and convert blast6format results to dict of taxa hits
obs_taxa = _blast6format_df_to_series_of_lists(
search_results, reference_taxonomy,
@@ -85,9 +85,11 @@ plugin.methods.register_function(
def _blast6format_df_to_series_of_lists(
- assignments, ref_taxa,
- unassignable_label=DEFAULTUNASSIGNABLELABEL):
- '''import observed assignments in blast6 format to series of lists.
+ assignments: pd.DataFrame,
+ ref_taxa: pd.Series,
+ unassignable_label: str = DEFAULTUNASSIGNABLELABEL
+) -> pd.Series:
+ """import observed assignments in blast6 format to series of lists.
assignments: pd.DataFrame
Taxonomy observation map in blast format 6. Each line consists of
@@ -99,13 +101,12 @@ def _blast6format_df_to_series_of_lists(
<accession ID> Annotation
The accession IDs in this taxonomy should match the subject-seq-ids in
the "assignment" input.
- '''
- taxa_hits = assignments.set_index('qseqid')['sseqid']
-
+ """
# validate that assignments are present in reference taxonomy
# (i.e., that the correct reference taxonomy was used).
# Note that we drop unassigned labels from this set.
- missing_ids = set(taxa_hits.values) - set(ref_taxa.index) - {'*', ''}
+ missing_ids = \
+ set(assignments['sseqid'].values) - set(ref_taxa.index) - {'*', ''}
if len(missing_ids) > 0:
raise KeyError('Reference taxonomy and search results do not match. '
'The following identifiers were reported in the search '
@@ -115,9 +116,12 @@ def _blast6format_df_to_series_of_lists(
# if vsearch fails to find assignment, it reports '*' as the
# accession ID, so we will add this mapping to the reference taxonomy.
ref_taxa['*'] = unassignable_label
- # map accession IDs to taxonomy
- taxa_hits.replace(ref_taxa, inplace=True)
+ assignments_copy = assignments.copy(deep=True)
+ for index, value in assignments_copy.iterrows():
+ sseqid = assignments_copy.iloc[index]['sseqid']
+ assignments_copy.at[index, 'sseqid'] = ref_taxa.at[sseqid]
# convert to dict of {accession_id: [annotations]}
+ taxa_hits: pd.Series = assignments_copy.set_index('qseqid')['sseqid']
taxa_hits = taxa_hits.groupby(taxa_hits.index).apply(list)
return taxa_hits
@@ -131,7 +135,7 @@ def _compute_consensus_annotations(
----------
query_annotations : pd.Series of lists
Indices are query identifiers, and values are lists of all
- taxonomic annotations associated with that identfier.
+ taxonomic annotations associated with that identifier.
Returns
-------
pd.DataFrame
@@ -191,7 +195,7 @@ def _lca_consensus(annotations, min_consensus, unassignable_label):
annotations : list of lists
Taxonomic annotations to form consensus.
min_consensus : float
- The minimum fraction of the annotations that a specfic annotation
+ The minimum fraction of the annotations that a specific annotation
must be present in for that annotation to be accepted. Current
lower boundary is 0.51.
unassignable_label : str
@@ -211,7 +215,7 @@ def _lca_consensus(annotations, min_consensus, unassignable_label):
# This assumes that a hierarchical taxonomy with even numbers of
# ranks was used.
taxa_comparison = [Counter(rank) for rank in zip(*annotations)]
- # interate rank comparisons in reverse
+ # iterate rank comparisons in reverse
# to find rank with consensus count > threshold
for rank in taxa_comparison[::-1]:
# grab most common label and its count
=====================================
q2_feature_classifier/_skl.py
=====================================
@@ -6,11 +6,55 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
+from dataclasses import dataclass, field
+from functools import cached_property
from itertools import islice, repeat
-from copy import deepcopy
+from typing import Dict, List
from joblib import Parallel, delayed
+
+@dataclass
+class _TaxonNode:
+ # The _TaxonNode is used to build a hierarchy from a list of sorted class
+ # labels. It allows one to quickly find class label indices of taxonomy
+ # labels that satisfy a given taxonomy hierarchy. For example, given the
+ # 'k__Bacteria' taxon, the _TaxonNode.range property will yield all class
+ # label indices where 'k__Bacteria' is a prefix.
+
+ name: str
+ offset_index: int
+ children: Dict[str, "_TaxonNode"] = field(
+ default_factory=dict,
+ repr=False)
+
+ @classmethod
+ def create_tree(cls, classes: List[str], separator: str):
+ if not all(a <= b for a, b in zip(classes, classes[1:])):
+ raise Exception("classes must be in sorted order")
+ root = cls("Unassigned", 0)
+ for class_start_index, label in enumerate(classes):
+ taxons = label.split(separator)
+ node = root
+ for name in taxons:
+ if name not in node.children:
+ node.children[name] = cls(name, class_start_index)
+ node = node.children[name]
+ return root
+
+ @property
+ def range(self) -> range:
+ return range(
+ self.offset_index,
+ self.offset_index + self.num_leaf_nodes)
+
+ @cached_property
+ def num_leaf_nodes(self) -> int:
+ if len(self.children) == 0:
+ return 1
+ return sum(c.num_leaf_nodes for c in self.children.values())
+
+
_specific_fitters = [
['naive_bayes',
[['feat_ext',
@@ -71,25 +115,26 @@ def _predict_chunk_with_conf(pipeline, separator, confidence, chunk):
y = pipeline.classes_[prob_pos.argmax(axis=1)]
+ taxonomy_tree = _TaxonNode.create_tree(pipeline.classes_, separator)
+
results = []
- split_classes = [c.split(separator) for c in pipeline.classes_]
for seq_id, taxon, class_probs in zip(seq_ids, y, prob_pos):
- taxon = taxon.split(separator)
- classes = zip(deepcopy(split_classes), class_probs)
+ split_taxon = taxon.split(separator)
+ accepted_cum_prob = 0.0
+ cum_prob = 0.0
result = []
- for level in taxon:
- classes = [cls for cls in classes if cls[0].pop(0) == level]
- cum_prob = sum(c[1] for c in classes)
+ current = taxonomy_tree
+ for rank in split_taxon:
+ current = current.children[rank]
+ cum_prob = class_probs[current.range].sum()
if cum_prob < confidence:
break
- result.append(level)
- result_confidence = cum_prob
- if result:
- result = separator.join(result)
- results.append((seq_id, result, result_confidence))
+ accepted_cum_prob = cum_prob
+ result.append(rank)
+ if len(result) == 0:
+ results.append((seq_id, "Unassigned", 1.0 - cum_prob))
else:
- results.append((seq_id, 'Unassigned', 1. - cum_prob))
-
+ results.append((seq_id, separator.join(result), accepted_cum_prob))
return results
=====================================
q2_feature_classifier/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2023.9.0, Release-2023.9)"
- git_full = "04ef68622bc5721f234cf220caf508e7e096bc08"
- git_date = "2023-10-03 21:57:16 +0000"
+ git_refnames = " (tag: 2024.2.0, Release-2024.2)"
+ git_full = "5ce76be5b72482a7d033fb9d2c41446edd75851a"
+ git_date = "2024-02-16 21:57:24 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
q2_feature_classifier/_vsearch.py
=====================================
@@ -13,7 +13,7 @@ import pandas as pd
from q2_types.feature_data import (
FeatureData, Taxonomy, Sequence, DNAFASTAFormat, BLAST6, BLAST6Format)
from .plugin_setup import plugin, citations
-from qiime2.plugin import Int, Str, Float, Choices, Range, Bool
+from qiime2.plugin import Int, Str, Float, Choices, Range, Bool, Threads
from ._blast import (_run_command)
from ._consensus_assignment import (DEFAULTUNASSIGNABLELABEL,
min_consensus_param,
@@ -214,7 +214,7 @@ parameters = {'maxaccepts': Int % Range(1, None) | Str % Choices(['all']),
'perc_identity': Float % Range(0.0, 1.0, inclusive_end=True),
'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
'strand': Str % Choices(['both', 'plus']),
- 'threads': Int % Range(1, None),
+ 'threads': Threads,
'maxhits': Int % Range(1, None) | Str % Choices(['all']),
'maxrejects': Int % Range(1, None) | Str % Choices(['all'])}
@@ -252,7 +252,8 @@ parameter_descriptions = {
'lower.',
'query_cov': 'Reject match if query alignment coverage per high-'
'scoring pair is lower.',
- 'threads': 'Number of threads to use for job parallelization.',
+ 'threads': 'Number of threads to use for job parallelization. Pass 0 to '
+ 'use one per available CPU.',
'maxhits': 'Maximum number of hits to show once the search is terminated.',
'maxrejects': 'Maximum number of non-matching target sequences to '
'consider before stopping the search. This option works in '
=====================================
q2_feature_classifier/classifier.py
=====================================
@@ -14,7 +14,9 @@ from itertools import chain, islice
import subprocess
import pandas as pd
-from qiime2.plugin import Int, Str, Float, Bool, Choices, Range
+from qiime2.plugin import (
+ Int, Str, Float, Bool, Choices, Range, Threads, get_available_cores
+)
from q2_types.feature_data import (
FeatureData, Taxonomy, Sequence, DNAIterator, DNAFASTAFormat)
from q2_types.feature_table import FeatureTable, RelativeFrequency
@@ -203,6 +205,13 @@ def classify_sklearn(reads: DNAFASTAFormat, classifier: Pipeline,
pre_dispatch: str = '2*n_jobs', confidence: float = 0.7,
read_orientation: str = 'auto'
) -> pd.DataFrame:
+
+ if n_jobs in (0, -1):
+ n_jobs = get_available_cores()
+ elif n_jobs < -1:
+ n_less = abs(n_jobs + 1)
+ n_jobs = get_available_cores(n_less=n_less)
+
try:
# autotune reads per batch
if reads_per_batch == 'auto':
@@ -234,7 +243,7 @@ def classify_sklearn(reads: DNAFASTAFormat, classifier: Pipeline,
_classify_parameters = {
'reads_per_batch': Int % Range(1, None) | Str % Choices(['auto']),
- 'n_jobs': Int,
+ 'n_jobs': Threads,
'pre_dispatch': Str,
'confidence': Float % Range(
0, 1, inclusive_start=True, inclusive_end=True) | Str % Choices(
@@ -257,7 +266,7 @@ _parameter_descriptions = {
'reads_per_batch': 'Number of reads to process in each batch. If "auto", '
'this parameter is autoscaled to '
'min( number of query sequences / n_jobs, 20000).',
- 'n_jobs': 'The maximum number of concurrently worker processes. If -1 '
+ 'n_jobs': 'The maximum number of concurrent worker processes. If -1 '
'all CPUs are used. If 1 is given, no parallel computing '
'code is used at all, which is useful for debugging. For '
'n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for '
=====================================
q2_feature_classifier/tests/test_classifier.py
=====================================
@@ -16,7 +16,7 @@ import pandas as pd
import skbio
import biom
-from q2_feature_classifier._skl import _specific_fitters
+from q2_feature_classifier._skl import _specific_fitters, _TaxonNode
from q2_feature_classifier.classifier import spec_from_pipeline, \
pipeline_from_spec, populate_class_weight, _autotune_reads_per_batch
@@ -242,3 +242,42 @@ class ClassifierTests(FeatureClassifierTestPluginBase):
def test_autotune_reads_per_batch_more_jobs_than_reads(self):
self.assertEqual(
_autotune_reads_per_batch(self.seq_path, n_jobs=1105), 1)
+
+ def test_TaxonNode_create_tree(self):
+ classes = ['a;b;c', 'a;b;d', 'a;e;f', 'a;e;g']
+ separator = ';'
+ tree = _TaxonNode.create_tree(classes, separator)
+ self.assertEqual(
+ tree.children['a'].children['b'].children['c'].name, 'c')
+ self.assertEqual(
+ tree.children['a'].children['b'].children['d'].name, 'd')
+ self.assertEqual(
+ tree.children['a'].children['e'].children['f'].name, 'f')
+ self.assertEqual(
+ tree.children['a'].children['e'].children['g'].name, 'g')
+
+ def test_TaxonNode_range(self):
+ classes = ['a;b;c', 'a;b;d', 'a;e;f', 'a;e;g']
+ separator = ';'
+ tree = _TaxonNode.create_tree(classes, separator)
+ self.assertEqual(
+ tree.children['a'].children['b'].children['c'].range, range(0, 1))
+ self.assertEqual(
+ tree.children['a'].children['b'].children['d'].range, range(1, 2))
+ self.assertEqual(
+ tree.children['a'].children['e'].children['f'].range, range(2, 3))
+ self.assertEqual(
+ tree.children['a'].children['e'].children['g'].range, range(3, 4))
+ self.assertEqual(
+ tree.children['a'].children['b'].range, range(0, 2))
+ self.assertEqual(
+ tree.children['a'].children['e'].range, range(2, 4))
+
+ def test_TaxonNode_num_leaf_nodes(self):
+ classes = ['a;b;c', 'a;b;d', 'a;e;f', 'a;e;g']
+ separator = ';'
+ tree = _TaxonNode.create_tree(classes, separator)
+ self.assertEqual(tree.num_leaf_nodes, 4)
+ self.assertEqual(tree.children['a'].num_leaf_nodes, 4)
+ self.assertEqual(tree.children['a'].children['b'].num_leaf_nodes, 2)
+ self.assertEqual(tree.children['a'].children['e'].num_leaf_nodes, 2)
View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/-/commit/c639ef80eb28b368122984609d6c81cb506469d3
--
View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/-/commit/c639ef80eb28b368122984609d6c81cb506469d3
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240218/0f02c364/attachment-0001.htm>
More information about the debian-med-commit
mailing list