[med-svn] [Git][med-team/q2-feature-classifier][upstream] New upstream version 2024.2.0
Andreas Tille (@tille)
gitlab at salsa.debian.org
Sun Feb 18 15:11:04 GMT 2024
Andreas Tille pushed to branch upstream at Debian Med / q2-feature-classifier
Commits:
c639ef80 by Andreas Tille at 2024-02-18T15:35:09+01:00
New upstream version 2024.2.0
- - - - -
9 changed files:
- .github/workflows/ci-dev.yaml
- README.md
- q2_feature_classifier/_blast.py
- q2_feature_classifier/_consensus_assignment.py
- q2_feature_classifier/_skl.py
- q2_feature_classifier/_version.py
- q2_feature_classifier/_vsearch.py
- q2_feature_classifier/classifier.py
- q2_feature_classifier/tests/test_classifier.py
Changes:
=====================================
.github/workflows/ci-dev.yaml
=====================================
@@ -9,4 +9,4 @@ jobs:
ci:
uses: qiime2/distributions/.github/workflows/lib-ci-dev.yaml@dev
with:
- distro: core
\ No newline at end of file
+ distro: amplicon
=====================================
README.md
=====================================
@@ -1,5 +1,5 @@
# q2-feature-classifier
-![](https://github.com/qiime2/q2-feature-classifier/workflows/ci/badge.svg)
+![](https://github.com/qiime2/q2-feature-classifier/workflows/ci-dev/badge.svg)
This is a QIIME 2 plugin. For details on QIIME 2, see https://qiime2.org.
\ No newline at end of file
=====================================
q2_feature_classifier/_blast.py
=====================================
@@ -14,7 +14,9 @@ from q2_types.feature_data import (
FeatureData, Taxonomy, Sequence, DNAFASTAFormat, DNAIterator, BLAST6,
BLAST6Format)
from .types import BLASTDBDirFmtV5, BLASTDB
-from qiime2.plugin import Int, Str, Float, Choices, Range, Bool
+from qiime2.plugin import (
+ Int, Str, Float, Choices, Range, Bool, Threads, get_available_cores
+)
from .plugin_setup import plugin, citations
from ._consensus_assignment import (
min_consensus_param, min_consensus_param_description,
@@ -59,6 +61,9 @@ def blast(query: DNAFASTAFormat,
evalue: float = DEFAULTEVALUE,
output_no_hits: bool = DEFAULTOUTPUTNOHITS,
num_threads: int = DEFAULTNUMTHREADS) -> pd.DataFrame:
+ if num_threads == 0:
+ num_threads = get_available_cores()
+
if reference_reads and blastdb:
raise ValueError('Only one reference_reads or blastdb artifact '
'can be provided as input. Choose one and try '
@@ -118,6 +123,9 @@ def classify_consensus_blast(ctx,
min_consensus=DEFAULTMINCONSENSUS,
unassignable_label=DEFAULTUNASSIGNABLELABEL,
num_threads=DEFAULTNUMTHREADS):
+ if num_threads == 0:
+ num_threads = get_available_cores()
+
search_db = ctx.get_action('feature_classifier', 'blast')
lca = ctx.get_action('feature_classifier', 'find_consensus_annotation')
result, = search_db(query=query, blastdb=blastdb,
@@ -179,7 +187,7 @@ parameters = {'evalue': Float,
'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
'strand': Str % Choices(['both', 'plus', 'minus']),
'output_no_hits': Bool,
- 'num_threads': Int % Range(1, None),
+ 'num_threads': Threads,
}
parameter_descriptions = {
@@ -206,7 +214,8 @@ parameter_descriptions = {
'unclassified sequences, otherwise you may run into '
'errors downstream from missing feature IDs. Set to '
'FALSE to mirror default BLAST search.',
- 'num_threads': 'Number of threads (CPUs) to use in the BLAST search.'
+ 'num_threads': 'Number of threads (CPUs) to use in the BLAST search. '
+ 'Pass 0 to use all available CPUs.',
}
blast6_output = ('search_results', FeatureData[BLAST6])
=====================================
q2_feature_classifier/_consensus_assignment.py
=====================================
@@ -32,19 +32,19 @@ def find_consensus_annotation(search_results: pd.DataFrame,
unassignable_label: str =
DEFAULTUNASSIGNABLELABEL
) -> pd.DataFrame:
- '''Find consensus taxonomy from BLAST6Format alignment summary.
+ """Find consensus taxonomy from BLAST6Format alignment summary.
search_results: pd.dataframe
BLAST6Format search results with canonical headers attached.
reference_taxonomy: pd.Series
Annotations of reference database used for original search.
min_consensus : float
- The minimum fraction of the annotations that a specfic annotation
+ The minimum fraction of the annotations that a specific annotation
must be present in for that annotation to be accepted. Current
lower boundary is 0.51.
unassignable_label : str
The label to apply if no acceptable annotations are identified.
- '''
+ """
# load and convert blast6format results to dict of taxa hits
obs_taxa = _blast6format_df_to_series_of_lists(
search_results, reference_taxonomy,
@@ -85,9 +85,11 @@ plugin.methods.register_function(
def _blast6format_df_to_series_of_lists(
- assignments, ref_taxa,
- unassignable_label=DEFAULTUNASSIGNABLELABEL):
- '''import observed assignments in blast6 format to series of lists.
+ assignments: pd.DataFrame,
+ ref_taxa: pd.Series,
+ unassignable_label: str = DEFAULTUNASSIGNABLELABEL
+) -> pd.Series:
+ """import observed assignments in blast6 format to series of lists.
assignments: pd.DataFrame
Taxonomy observation map in blast format 6. Each line consists of
@@ -99,13 +101,12 @@ def _blast6format_df_to_series_of_lists(
<accession ID> Annotation
The accession IDs in this taxonomy should match the subject-seq-ids in
the "assignment" input.
- '''
- taxa_hits = assignments.set_index('qseqid')['sseqid']
-
+ """
# validate that assignments are present in reference taxonomy
# (i.e., that the correct reference taxonomy was used).
# Note that we drop unassigned labels from this set.
- missing_ids = set(taxa_hits.values) - set(ref_taxa.index) - {'*', ''}
+ missing_ids = \
+ set(assignments['sseqid'].values) - set(ref_taxa.index) - {'*', ''}
if len(missing_ids) > 0:
raise KeyError('Reference taxonomy and search results do not match. '
'The following identifiers were reported in the search '
@@ -115,9 +116,12 @@ def _blast6format_df_to_series_of_lists(
# if vsearch fails to find assignment, it reports '*' as the
# accession ID, so we will add this mapping to the reference taxonomy.
ref_taxa['*'] = unassignable_label
- # map accession IDs to taxonomy
- taxa_hits.replace(ref_taxa, inplace=True)
+ assignments_copy = assignments.copy(deep=True)
+ for index, value in assignments_copy.iterrows():
+ sseqid = assignments_copy.iloc[index]['sseqid']
+ assignments_copy.at[index, 'sseqid'] = ref_taxa.at[sseqid]
# convert to dict of {accession_id: [annotations]}
+ taxa_hits: pd.Series = assignments_copy.set_index('qseqid')['sseqid']
taxa_hits = taxa_hits.groupby(taxa_hits.index).apply(list)
return taxa_hits
@@ -131,7 +135,7 @@ def _compute_consensus_annotations(
----------
query_annotations : pd.Series of lists
Indices are query identifiers, and values are lists of all
- taxonomic annotations associated with that identfier.
+ taxonomic annotations associated with that identifier.
Returns
-------
pd.DataFrame
@@ -191,7 +195,7 @@ def _lca_consensus(annotations, min_consensus, unassignable_label):
annotations : list of lists
Taxonomic annotations to form consensus.
min_consensus : float
- The minimum fraction of the annotations that a specfic annotation
+ The minimum fraction of the annotations that a specific annotation
must be present in for that annotation to be accepted. Current
lower boundary is 0.51.
unassignable_label : str
@@ -211,7 +215,7 @@ def _lca_consensus(annotations, min_consensus, unassignable_label):
# This assumes that a hierarchical taxonomy with even numbers of
# ranks was used.
taxa_comparison = [Counter(rank) for rank in zip(*annotations)]
- # interate rank comparisons in reverse
+ # iterate rank comparisons in reverse
# to find rank with consensus count > threshold
for rank in taxa_comparison[::-1]:
# grab most common label and its count
=====================================
q2_feature_classifier/_skl.py
=====================================
@@ -6,11 +6,55 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
+from dataclasses import dataclass, field
+from functools import cached_property
from itertools import islice, repeat
-from copy import deepcopy
+from typing import Dict, List
from joblib import Parallel, delayed
+
+@dataclass
+class _TaxonNode:
+ # The _TaxonNode is used to build a hierarchy from a list of sorted class
+ # labels. It allows one to quickly find class label indices of taxonomy
+ # labels that satisfy a given taxonomy hierarchy. For example, given the
+ # 'k__Bacteria' taxon, the _TaxonNode.range property will yield all class
+ # label indices where 'k__Bacteria' is a prefix.
+
+ name: str
+ offset_index: int
+ children: Dict[str, "_TaxonNode"] = field(
+ default_factory=dict,
+ repr=False)
+
+ @classmethod
+ def create_tree(cls, classes: List[str], separator: str):
+ if not all(a <= b for a, b in zip(classes, classes[1:])):
+ raise Exception("classes must be in sorted order")
+ root = cls("Unassigned", 0)
+ for class_start_index, label in enumerate(classes):
+ taxons = label.split(separator)
+ node = root
+ for name in taxons:
+ if name not in node.children:
+ node.children[name] = cls(name, class_start_index)
+ node = node.children[name]
+ return root
+
+ @property
+ def range(self) -> range:
+ return range(
+ self.offset_index,
+ self.offset_index + self.num_leaf_nodes)
+
+ @cached_property
+ def num_leaf_nodes(self) -> int:
+ if len(self.children) == 0:
+ return 1
+ return sum(c.num_leaf_nodes for c in self.children.values())
+
+
_specific_fitters = [
['naive_bayes',
[['feat_ext',
@@ -71,25 +115,26 @@ def _predict_chunk_with_conf(pipeline, separator, confidence, chunk):
y = pipeline.classes_[prob_pos.argmax(axis=1)]
+ taxonomy_tree = _TaxonNode.create_tree(pipeline.classes_, separator)
+
results = []
- split_classes = [c.split(separator) for c in pipeline.classes_]
for seq_id, taxon, class_probs in zip(seq_ids, y, prob_pos):
- taxon = taxon.split(separator)
- classes = zip(deepcopy(split_classes), class_probs)
+ split_taxon = taxon.split(separator)
+ accepted_cum_prob = 0.0
+ cum_prob = 0.0
result = []
- for level in taxon:
- classes = [cls for cls in classes if cls[0].pop(0) == level]
- cum_prob = sum(c[1] for c in classes)
+ current = taxonomy_tree
+ for rank in split_taxon:
+ current = current.children[rank]
+ cum_prob = class_probs[current.range].sum()
if cum_prob < confidence:
break
- result.append(level)
- result_confidence = cum_prob
- if result:
- result = separator.join(result)
- results.append((seq_id, result, result_confidence))
+ accepted_cum_prob = cum_prob
+ result.append(rank)
+ if len(result) == 0:
+ results.append((seq_id, "Unassigned", 1.0 - cum_prob))
else:
- results.append((seq_id, 'Unassigned', 1. - cum_prob))
-
+ results.append((seq_id, separator.join(result), accepted_cum_prob))
return results
=====================================
q2_feature_classifier/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2023.9.0, Release-2023.9)"
- git_full = "04ef68622bc5721f234cf220caf508e7e096bc08"
- git_date = "2023-10-03 21:57:16 +0000"
+ git_refnames = " (tag: 2024.2.0, Release-2024.2)"
+ git_full = "5ce76be5b72482a7d033fb9d2c41446edd75851a"
+ git_date = "2024-02-16 21:57:24 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
q2_feature_classifier/_vsearch.py
=====================================
@@ -13,7 +13,7 @@ import pandas as pd
from q2_types.feature_data import (
FeatureData, Taxonomy, Sequence, DNAFASTAFormat, BLAST6, BLAST6Format)
from .plugin_setup import plugin, citations
-from qiime2.plugin import Int, Str, Float, Choices, Range, Bool
+from qiime2.plugin import Int, Str, Float, Choices, Range, Bool, Threads
from ._blast import (_run_command)
from ._consensus_assignment import (DEFAULTUNASSIGNABLELABEL,
min_consensus_param,
@@ -214,7 +214,7 @@ parameters = {'maxaccepts': Int % Range(1, None) | Str % Choices(['all']),
'perc_identity': Float % Range(0.0, 1.0, inclusive_end=True),
'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
'strand': Str % Choices(['both', 'plus']),
- 'threads': Int % Range(1, None),
+ 'threads': Threads,
'maxhits': Int % Range(1, None) | Str % Choices(['all']),
'maxrejects': Int % Range(1, None) | Str % Choices(['all'])}
@@ -252,7 +252,8 @@ parameter_descriptions = {
'lower.',
'query_cov': 'Reject match if query alignment coverage per high-'
'scoring pair is lower.',
- 'threads': 'Number of threads to use for job parallelization.',
+ 'threads': 'Number of threads to use for job parallelization. Pass 0 to '
+ 'use one per available CPU.',
'maxhits': 'Maximum number of hits to show once the search is terminated.',
'maxrejects': 'Maximum number of non-matching target sequences to '
'consider before stopping the search. This option works in '
=====================================
q2_feature_classifier/classifier.py
=====================================
@@ -14,7 +14,9 @@ from itertools import chain, islice
import subprocess
import pandas as pd
-from qiime2.plugin import Int, Str, Float, Bool, Choices, Range
+from qiime2.plugin import (
+ Int, Str, Float, Bool, Choices, Range, Threads, get_available_cores
+)
from q2_types.feature_data import (
FeatureData, Taxonomy, Sequence, DNAIterator, DNAFASTAFormat)
from q2_types.feature_table import FeatureTable, RelativeFrequency
@@ -203,6 +205,13 @@ def classify_sklearn(reads: DNAFASTAFormat, classifier: Pipeline,
pre_dispatch: str = '2*n_jobs', confidence: float = 0.7,
read_orientation: str = 'auto'
) -> pd.DataFrame:
+
+ if n_jobs in (0, -1):
+ n_jobs = get_available_cores()
+ elif n_jobs < -1:
+ n_less = abs(n_jobs + 1)
+ n_jobs = get_available_cores(n_less=n_less)
+
try:
# autotune reads per batch
if reads_per_batch == 'auto':
@@ -234,7 +243,7 @@ def classify_sklearn(reads: DNAFASTAFormat, classifier: Pipeline,
_classify_parameters = {
'reads_per_batch': Int % Range(1, None) | Str % Choices(['auto']),
- 'n_jobs': Int,
+ 'n_jobs': Threads,
'pre_dispatch': Str,
'confidence': Float % Range(
0, 1, inclusive_start=True, inclusive_end=True) | Str % Choices(
@@ -257,7 +266,7 @@ _parameter_descriptions = {
'reads_per_batch': 'Number of reads to process in each batch. If "auto", '
'this parameter is autoscaled to '
'min( number of query sequences / n_jobs, 20000).',
- 'n_jobs': 'The maximum number of concurrently worker processes. If -1 '
+ 'n_jobs': 'The maximum number of concurrent worker processes. If -1 '
'all CPUs are used. If 1 is given, no parallel computing '
'code is used at all, which is useful for debugging. For '
'n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for '
=====================================
q2_feature_classifier/tests/test_classifier.py
=====================================
@@ -16,7 +16,7 @@ import pandas as pd
import skbio
import biom
-from q2_feature_classifier._skl import _specific_fitters
+from q2_feature_classifier._skl import _specific_fitters, _TaxonNode
from q2_feature_classifier.classifier import spec_from_pipeline, \
pipeline_from_spec, populate_class_weight, _autotune_reads_per_batch
@@ -242,3 +242,42 @@ class ClassifierTests(FeatureClassifierTestPluginBase):
def test_autotune_reads_per_batch_more_jobs_than_reads(self):
self.assertEqual(
_autotune_reads_per_batch(self.seq_path, n_jobs=1105), 1)
+
+ def test_TaxonNode_create_tree(self):
+ classes = ['a;b;c', 'a;b;d', 'a;e;f', 'a;e;g']
+ separator = ';'
+ tree = _TaxonNode.create_tree(classes, separator)
+ self.assertEqual(
+ tree.children['a'].children['b'].children['c'].name, 'c')
+ self.assertEqual(
+ tree.children['a'].children['b'].children['d'].name, 'd')
+ self.assertEqual(
+ tree.children['a'].children['e'].children['f'].name, 'f')
+ self.assertEqual(
+ tree.children['a'].children['e'].children['g'].name, 'g')
+
+ def test_TaxonNode_range(self):
+ classes = ['a;b;c', 'a;b;d', 'a;e;f', 'a;e;g']
+ separator = ';'
+ tree = _TaxonNode.create_tree(classes, separator)
+ self.assertEqual(
+ tree.children['a'].children['b'].children['c'].range, range(0, 1))
+ self.assertEqual(
+ tree.children['a'].children['b'].children['d'].range, range(1, 2))
+ self.assertEqual(
+ tree.children['a'].children['e'].children['f'].range, range(2, 3))
+ self.assertEqual(
+ tree.children['a'].children['e'].children['g'].range, range(3, 4))
+ self.assertEqual(
+ tree.children['a'].children['b'].range, range(0, 2))
+ self.assertEqual(
+ tree.children['a'].children['e'].range, range(2, 4))
+
+ def test_TaxonNode_num_leaf_nodes(self):
+ classes = ['a;b;c', 'a;b;d', 'a;e;f', 'a;e;g']
+ separator = ';'
+ tree = _TaxonNode.create_tree(classes, separator)
+ self.assertEqual(tree.num_leaf_nodes, 4)
+ self.assertEqual(tree.children['a'].num_leaf_nodes, 4)
+ self.assertEqual(tree.children['a'].children['b'].num_leaf_nodes, 2)
+ self.assertEqual(tree.children['a'].children['e'].num_leaf_nodes, 2)
View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/-/commit/c639ef80eb28b368122984609d6c81cb506469d3
--
View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/-/commit/c639ef80eb28b368122984609d6c81cb506469d3
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240218/0f02c364/attachment-0001.htm>
More information about the debian-med-commit
mailing list