[med-svn] [Git][med-team/q2-feature-classifier][master] 6 commits: routine-update: New upstream version
Mohd Bilal (@rmb)
gitlab at salsa.debian.org
Wed Sep 7 14:36:42 BST 2022
Mohd Bilal pushed to branch master at Debian Med / q2-feature-classifier
Commits:
48abbdd1 by Mohammed Bilal at 2022-09-07T11:11:48+00:00
routine-update: New upstream version
- - - - -
49dba318 by Mohammed Bilal at 2022-09-07T11:11:49+00:00
New upstream version 2022.8.0
- - - - -
99813c05 by Mohammed Bilal at 2022-09-07T11:11:51+00:00
Update upstream source from tag 'upstream/2022.8.0'
Update to upstream version '2022.8.0'
with Debian dir e405bafec4ac00757b0b71ef2e27dee72a0b6213
- - - - -
54b5921c by Mohammed Bilal at 2022-09-07T11:20:58+00:00
drop patch
Not needed anymore
- - - - -
792870e6 by Mohammed Bilal at 2022-09-07T11:22:08+00:00
Update dependency on qiime/q2-* >= 2022.8.0
- - - - -
e8969670 by Mohammed Bilal at 2022-09-07T11:23:47+00:00
Upload to unstable
- - - - -
14 changed files:
- ci/recipe/meta.yaml
- debian/changelog
- debian/control
- − debian/patches/series
- − debian/patches/vsearch.patch
- q2_feature_classifier/_blast.py
- q2_feature_classifier/_consensus_assignment.py
- q2_feature_classifier/_version.py
- q2_feature_classifier/_vsearch.py
- q2_feature_classifier/tests/__init__.py
- + q2_feature_classifier/tests/data/blast6-format.tsv
- + q2_feature_classifier/tests/data/query-seqs.fasta
- q2_feature_classifier/tests/data/se-dna-sequences.fasta
- q2_feature_classifier/tests/test_consensus_assignment.py
Changes:
=====================================
ci/recipe/meta.yaml
=====================================
@@ -18,13 +18,12 @@ requirements:
run:
- python {{ python }}
- - scikit-learn ==0.24.1
+ - scikit-learn {{ scikit_learn }}
- joblib
- - scikit-bio
- - biom-format >=2.1.5,<2.2.0
+ - scikit-bio {{ scikit_bio }}
+ - biom-format {{ biom_format }}
- blast >=2.6.0
- # There are issues with 2.8.2, and no OS X builds exist after 2.7.0
- - vsearch <=2.7.0
+ - vsearch
- qiime2 {{ qiime2_epoch }}.*
- q2-types {{ qiime2_epoch }}.*
- q2-quality-control {{ qiime2_epoch }}.*
=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+q2-feature-classifier (2022.8.0-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream version 2022.8.0
+ * drop vsearch patch
+ * Update dependency on qiime/q2-* >= 2022.8.0
+
+ -- Mohammed Bilal <mdbilal at disroot.org> Wed, 07 Sep 2022 11:22:16 +0000
+
q2-feature-classifier (2022.2.0-1) unstable; urgency=medium
* Team upload.
=====================================
debian/control
=====================================
@@ -14,11 +14,11 @@ Build-Depends: debhelper-compat (= 13),
python3-skbio,
ncbi-blast+,
vsearch,
- qiime (>= 2021.8.0),
- q2-types (>= 2021.8.0),
- q2-quality-control (>= 2021.8.0),
- q2-taxa (>= 2021.8.0),
- q2-feature-table (>= 2021.8.0)
+ qiime (>= 2022.8.0),
+ q2-types (>= 2022.8.0),
+ q2-quality-control (>= 2022.8.0),
+ q2-taxa (>= 2022.8.0),
+ q2-feature-table (>= 2022.8.0)
Standards-Version: 4.6.1
Vcs-Browser: https://salsa.debian.org/med-team/q2-feature-classifier
Vcs-Git: https://salsa.debian.org/med-team/q2-feature-classifier.git
@@ -36,11 +36,11 @@ Depends: ${shlibs:Depends},
python3-biom-format,
ncbi-blast+,
vsearch,
- qiime (>= 2021.8.0),
- q2-types (>= 2021.8.0),
- q2-quality-control (>= 2021.8.0),
- q2-taxa (>= 2021.8.0),
- q2-feature-table (>= 2021.8.0)
+ qiime (>= 2022.8.0),
+ q2-types (>= 2022.8.0),
+ q2-quality-control (>= 2022.8.0),
+ q2-taxa (>= 2022.8.0),
+ q2-feature-table (>= 2022.8.0)
Description: QIIME 2 plugin supporting taxonomic classification
QIIME 2 is a powerful, extensible, and decentralized microbiome analysis
package with a focus on data and analysis transparency. QIIME 2 enables
=====================================
debian/patches/series deleted
=====================================
@@ -1 +0,0 @@
-vsearch.patch
=====================================
debian/patches/vsearch.patch deleted
=====================================
@@ -1,20 +0,0 @@
-Description: fix failure to invoke vsearch --search_exact
-Author: Étienne Mollier <etienne.mollier at mailoo.org>
-Forwarded: https://github.com/qiime2/q2-feature-classifier/pull/170
-Last-Update: 2021-01-27
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- q2-feature-classifier.orig/q2_feature_classifier/_vsearch.py
-+++ q2-feature-classifier/q2_feature_classifier/_vsearch.py
-@@ -50,6 +50,11 @@
- '--threads', str(threads)]
- if search_exact:
- cmd[1] = '--search_exact'
-+ # These options are not compatible with a --search_exact command.
-+ for pop_arg in ['--id', '--maxaccepts', '--maxrejects', '--query_cov']:
-+ pop_idx = cmd.index(pop_arg)
-+ cmd.pop(pop_idx)
-+ cmd.pop(pop_idx)
- if top_hits_only:
- cmd.append('--top_hits_only')
- if output_no_hits:
=====================================
q2_feature_classifier/_blast.py
=====================================
@@ -6,13 +6,16 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
+import subprocess
import pandas as pd
from q2_types.feature_data import (
- FeatureData, Taxonomy, Sequence, DNAFASTAFormat)
-from qiime2.plugin import Int, Str, Float, Choices, Range
+ FeatureData, Taxonomy, Sequence, DNAFASTAFormat, DNAIterator, BLAST6,
+ BLAST6Format)
+from qiime2.plugin import Int, Str, Float, Choices, Range, Bool
from .plugin_setup import plugin, citations
-from ._consensus_assignment import (_consensus_assignments,
- _get_default_unassignable_label)
+from ._consensus_assignment import (
+ min_consensus_param, min_consensus_param_description,
+ DEFAULTUNASSIGNABLELABEL)
# ---------------------------------------------------------------
# Reason for num_thread not being exposed.
@@ -25,68 +28,190 @@ from ._consensus_assignment import (_consensus_assignments,
# ---------------------------------------------------------------
-def classify_consensus_blast(
- query: DNAFASTAFormat, reference_reads: DNAFASTAFormat,
- reference_taxonomy: pd.Series, maxaccepts: int = 10,
- perc_identity: float = 0.8, query_cov: float = 0.8,
- strand: str = 'both', evalue: float = 0.001,
- min_consensus: float = 0.51,
- unassignable_label: str = _get_default_unassignable_label(),
- ) -> pd.DataFrame:
+# Specify default settings for various functions
+DEFAULTMAXACCEPTS = 10
+DEFAULTPERCENTID = 0.8
+DEFAULTQUERYCOV = 0.8
+DEFAULTSTRAND = 'both'
+DEFAULTEVALUE = 0.001
+DEFAULTMINCONSENSUS = 0.51
+DEFAULTOUTPUTNOHITS = True
+
+
+# NOTE FOR THE FUTURE: should this be called blastn? would it be possible to
+# eventually generalize to e.g., blastp or blastx? or will this be too
+# challenging, e.g., to detect the format internally? A `mode` parameter could
+# be added and TypeMapped to the input type, a bit cumbersome but one way to
+# accomplish this without code bloat. But the question is: would we want to
+# expose different parameters etc? My feeling is let's call this `blast` for
+# now and then cross that bridge when we come to it.
+def blast(query: DNAFASTAFormat,
+ reference_reads: DNAFASTAFormat,
+ maxaccepts: int = DEFAULTMAXACCEPTS,
+ perc_identity: float = DEFAULTPERCENTID,
+ query_cov: float = DEFAULTQUERYCOV,
+ strand: str = DEFAULTSTRAND,
+ evalue: float = DEFAULTEVALUE,
+ output_no_hits: bool = DEFAULTOUTPUTNOHITS) -> pd.DataFrame:
perc_identity = perc_identity * 100
query_cov = query_cov * 100
seqs_fp = str(query)
ref_fp = str(reference_reads)
+ # TODO: generalize to support other blast types?
+ output = BLAST6Format()
cmd = ['blastn', '-query', seqs_fp, '-evalue', str(evalue), '-strand',
- strand, '-outfmt', '7', '-subject', ref_fp, '-perc_identity',
+ strand, '-outfmt', '6', '-subject', ref_fp, '-perc_identity',
str(perc_identity), '-qcov_hsp_perc', str(query_cov),
- '-max_target_seqs', str(maxaccepts), '-out']
- consensus = _consensus_assignments(
- cmd, reference_taxonomy, unassignable_label=unassignable_label,
- min_consensus=min_consensus, output_no_hits=True,
- exp_seq_ids=seqs_fp)
+ '-max_target_seqs', str(maxaccepts), '-out', str(output)]
+ _run_command(cmd)
+ # load as dataframe to quickly validate (note: will fail now if empty)
+ result = output.view(pd.DataFrame)
+
+ # blast will not report reads with no hits. We will report this
+ # information here, so that it is explicitly reported to the user.
+ if output_no_hits:
+ ids_with_hit = set(result['qseqid'].unique())
+ query_ids = {seq.metadata['id'] for seq in query.view(DNAIterator)}
+ missing_ids = query_ids - ids_with_hit
+ if len(missing_ids) > 0:
+ # we will mirror vsearch behavior and annotate unassigneds as '*'
+ # and fill other columns with 0 values (np.nan makes format error).
+ missed = pd.DataFrame(columns=result.columns)
+ missed = missed.append(
+ [{'qseqid': i, 'sseqid': '*'} for i in missing_ids]).fillna(0)
+ # Do two separate appends to make sure that fillna does not alter
+ # any other contents from the original search results.
+ result = result.append(missed, ignore_index=True)
+ return result
+
+
+def classify_consensus_blast(ctx,
+ query,
+ reference_reads,
+ reference_taxonomy,
+ maxaccepts=DEFAULTMAXACCEPTS,
+ perc_identity=DEFAULTPERCENTID,
+ query_cov=DEFAULTQUERYCOV,
+ strand=DEFAULTSTRAND,
+ evalue=DEFAULTEVALUE,
+ output_no_hits=DEFAULTOUTPUTNOHITS,
+ min_consensus=DEFAULTMINCONSENSUS,
+ unassignable_label=DEFAULTUNASSIGNABLELABEL):
+ search_db = ctx.get_action('feature_classifier', 'blast')
+ lca = ctx.get_action('feature_classifier', 'find_consensus_annotation')
+ result, = search_db(query=query, reference_reads=reference_reads,
+ maxaccepts=maxaccepts, perc_identity=perc_identity,
+ query_cov=query_cov, strand=strand, evalue=evalue,
+ output_no_hits=output_no_hits)
+ consensus, = lca(search_results=result,
+ reference_taxonomy=reference_taxonomy,
+ min_consensus=min_consensus,
+ unassignable_label=unassignable_label)
+ # New: add BLAST6Format result as an output. This could just as well be a
+ # visualizer generated from these results (using q2-metadata tabulate).
+ # Would that be more useful to the user that the QZA?
+ return consensus, result
+
+
+# Replace this function with QIIME2 API for wrapping commands/binaries,
+# pending https://github.com/qiime2/qiime2/issues/224
+def _run_command(cmd, verbose=True):
+ if verbose:
+ print("Running external command line application. This may print "
+ "messages to stdout and/or stderr.")
+ print("The command being run is below. This command cannot "
+ "be manually re-run as it will depend on temporary files that "
+ "no longer exist.")
+ print("\nCommand:", end=' ')
+ print(" ".join(cmd), end='\n\n')
+ subprocess.run(cmd, check=True)
+
+
+inputs = {'query': FeatureData[Sequence],
+ 'reference_reads': FeatureData[Sequence]}
+
+input_descriptions = {'query': 'Query sequences.',
+ 'reference_reads': 'Reference sequences.'}
- return consensus
+classification_output = ('classification', FeatureData[Taxonomy])
+classification_output_description = {
+ 'classification': 'Taxonomy classifications of query sequences.'}
+parameters = {'evalue': Float,
+ 'maxaccepts': Int % Range(1, None),
+ 'perc_identity': Float % Range(0.0, 1.0, inclusive_end=True),
+ 'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
+ 'strand': Str % Choices(['both', 'plus', 'minus']),
+ 'output_no_hits': Bool,
+ }
+
+parameter_descriptions = {
+ 'evalue': 'BLAST expectation value (E) threshold for saving hits.',
+ 'strand': ('Align against reference sequences in forward ("plus"), '
+ 'reverse ("minus"), or both directions ("both").'),
+ 'maxaccepts': ('Maximum number of hits to keep for each query. BLAST will '
+ 'choose the first N hits in the reference database that '
+ 'exceed perc_identity similarity to query. NOTE: the '
+ 'database is not sorted by similarity to query, so these '
+ 'are the first N hits that pass the threshold, not '
+ 'necessarily the top N hits.'),
+ 'perc_identity': ('Reject match if percent identity to query is lower.'),
+ 'query_cov': 'Reject match if query alignment coverage per high-'
+ 'scoring pair is lower. Note: this uses blastn\'s '
+ 'qcov_hsp_perc parameter, and may not behave identically '
+ 'to the query_cov parameter used by classify-consensus-'
+ 'vsearch.',
+ 'output_no_hits': 'Report both matching and non-matching queries. '
+ 'WARNING: always use the default setting for this '
+ 'option unless if you know what you are doing! If '
+ 'you set this option to False, your sequences and '
+ 'feature table will need to be filtered to exclude '
+ 'unclassified sequences, otherwise you may run into '
+ 'errors downstream from missing feature IDs. Set to '
+ 'FALSE to mirror default BLAST search.',
+}
+
+blast6_output = ('search_results', FeatureData[BLAST6])
+
+blast6_output_description = {'search_results': 'Top hits for each query.'}
+
+
+# Note: name should be changed to blastn if we do NOT generalize this function
plugin.methods.register_function(
+ function=blast,
+ inputs=inputs,
+ parameters=parameters,
+ outputs=[blast6_output],
+ input_descriptions=input_descriptions,
+ parameter_descriptions=parameter_descriptions,
+ output_descriptions=blast6_output_description,
+ name='BLAST+ local alignment search.',
+ description=('Search for top hits in a reference database via local '
+ 'alignment between the query sequences and reference '
+ 'database sequences using BLAST+. Returns a report '
+ 'of the top M hits for each query (where M=maxaccepts).'),
+ citations=[citations['camacho2009blast+']]
+)
+
+
+plugin.pipelines.register_function(
function=classify_consensus_blast,
- inputs={'query': FeatureData[Sequence],
- 'reference_reads': FeatureData[Sequence],
+ inputs={**inputs,
'reference_taxonomy': FeatureData[Taxonomy]},
- parameters={'evalue': Float,
- 'maxaccepts': Int % Range(1, None),
- 'perc_identity': Float % Range(0.0, 1.0, inclusive_end=True),
- 'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
- 'strand': Str % Choices(['both', 'plus', 'minus']),
- 'min_consensus': Float % Range(0.5, 1.0, inclusive_end=True,
- inclusive_start=False),
+ parameters={**parameters,
+ **min_consensus_param,
'unassignable_label': Str},
- outputs=[('classification', FeatureData[Taxonomy])],
- input_descriptions={'query': 'Sequences to classify taxonomically.',
- 'reference_reads': 'reference sequences.',
+ outputs=[classification_output, blast6_output],
+ input_descriptions={**input_descriptions,
'reference_taxonomy': 'reference taxonomy labels.'},
parameter_descriptions={
- 'evalue': 'BLAST expectation value (E) threshold for saving hits.',
- 'strand': ('Align against reference sequences in forward ("plus"), '
- 'reverse ("minus"), or both directions ("both").'),
- 'maxaccepts': ('Maximum number of hits to keep for each query. Must '
- 'be in range [1, infinity]. BLAST will choose the '
- 'first N hits in the reference database that exceed '
- 'perc_identity similarity to query.'),
- 'perc_identity': ('Reject match if percent identity to query is '
- 'lower. Must be in range [0.0, 1.0].'),
- 'query_cov': 'Reject match if query alignment coverage per high-'
- 'scoring pair is lower. Note: this uses blastn\'s '
- 'qcov_hsp_perc parameter, and may not behave identically '
- 'to the query_cov parameter used by classify-consensus-'
- 'vsearch. Must be in range [0.0, 1.0].',
- 'min_consensus': ('Minimum fraction of assignments must match top '
- 'hit to be accepted as consensus assignment. Must '
- 'be in range (0.5, 1.0].')
+ **parameter_descriptions,
+ **min_consensus_param_description,
+ 'unassignable_label': 'Annotation given to sequences without any hits.'
},
- output_descriptions={
- 'classification': 'Taxonomy classifications of query sequences.'},
+ output_descriptions={**classification_output_description,
+ **blast6_output_description},
name='BLAST+ consensus taxonomy classifier',
description=('Assign taxonomy to query sequences using BLAST+. Performs '
'BLAST+ local alignment between query and reference_reads, '
=====================================
q2_feature_classifier/_consensus_assignment.py
=====================================
@@ -6,246 +6,219 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
-import tempfile
-import subprocess
+from collections import Counter
+from math import ceil
+
import pandas as pd
-from os.path import isfile
-from collections import Counter, defaultdict
-import qiime2
-
-
-def _get_default_unassignable_label():
- return "Unassigned"
-
-
-def _consensus_assignments(
- cmd, ref_taxa, min_consensus=0.51, output_no_hits=False,
- exp_seq_ids=None,
- unassignable_label=_get_default_unassignable_label()):
- '''Run command line subprocess and find consensus taxonomy.'''
- with tempfile.NamedTemporaryFile() as output:
- cmd = cmd + [output.name]
- _run_command(cmd)
- obs_taxa = _import_blast_format_assignments(
- output.name, ref_taxa, unassignable_label=unassignable_label)
- consensus = _compute_consensus_annotations(
- obs_taxa, min_consensus=min_consensus,
- unassignable_label=unassignable_label)
- # If output_no_hits=True, find/add query IDs missing from consensus
- if output_no_hits is True:
- consensus = _output_no_hits(
- consensus, exp_seq_ids, unassignable_label)
- # if there are no consensus results (no hits for any query), default
- # to empty result file (must create empty dict or fail on pd error)
- if not consensus:
- consensus = {'': ('', '')}
- result = pd.DataFrame.from_dict(consensus, 'index')
- result.index.name = 'Feature ID'
- result.columns = ['Taxon', 'Consensus']
- return result
-
-
-def _output_no_hits(obs_taxa, exp_seq_ids,
- unassignable_label=_get_default_unassignable_label()):
- '''If a query ID has no hits, report as unassigned.'''
- # extract list of query sequence IDs
- exp = _open_list_or_file(exp_seq_ids)
- exp = [line.strip('>') for line in exp if line.startswith('>')]
-
- # iterate over expected IDs, add as unassigned to obs_taxa if missing
- for _id in exp:
- if _id not in obs_taxa:
- obs_taxa[_id] = (unassignable_label, 0.0)
-
- return obs_taxa
-
-
-# Replace this function with QIIME2 API for wrapping commands/binaries,
-# pending https://github.com/qiime2/qiime2/issues/224
-def _run_command(cmd, verbose=True):
- if verbose:
- print("Running external command line application. This may print "
- "messages to stdout and/or stderr.")
- print("The command being run is below. This command cannot "
- "be manually re-run as it will depend on temporary files that "
- "no longer exist.")
- print("\nCommand:", end=' ')
- print(" ".join(cmd), end='\n\n')
- subprocess.run(cmd, check=True)
-
-
-def _import_blast_format_assignments(
+
+from qiime2.plugin import Str, Float, Range
+from .plugin_setup import plugin
+from q2_types.feature_data import FeatureData, Taxonomy, BLAST6
+
+
+min_consensus_param = {'min_consensus': Float % Range(
+ 0.5, 1.0, inclusive_end=True, inclusive_start=False)}
+
+min_consensus_param_description = {
+ 'min_consensus': 'Minimum fraction of assignments must match top '
+ 'hit to be accepted as consensus assignment.'}
+
+DEFAULTUNASSIGNABLELABEL = "Unassigned"
+
+
+def find_consensus_annotation(search_results: pd.DataFrame,
+ reference_taxonomy: pd.Series,
+ min_consensus: int = 0.51,
+ unassignable_label: str =
+ DEFAULTUNASSIGNABLELABEL
+ ) -> pd.DataFrame:
+ '''Find consensus taxonomy from BLAST6Format alignment summary.
+
+ search_results: pd.dataframe
+ BLAST6Format search results with canonical headers attached.
+ reference_taxonomy: pd.Series
+ Annotations of reference database used for original search.
+ min_consensus : float
+ The minimum fraction of the annotations that a specfic annotation
+ must be present in for that annotation to be accepted. Current
+ lower boundary is 0.51.
+ unassignable_label : str
+ The label to apply if no acceptable annotations are identified.
+ '''
+ # load and convert blast6format results to dict of taxa hits
+ obs_taxa = _blast6format_df_to_series_of_lists(
+ search_results, reference_taxonomy,
+ unassignable_label=unassignable_label)
+ # TODO: is it worth allowing early stopping if maxaccepts==1?
+ # compute consensus annotations
+ result = _compute_consensus_annotations(
+ obs_taxa, min_consensus=min_consensus,
+ unassignable_label=unassignable_label)
+ result.index.name = 'Feature ID'
+ return result
+
+
+plugin.methods.register_function(
+ function=find_consensus_annotation,
+ inputs={'search_results': FeatureData[BLAST6],
+ 'reference_taxonomy': FeatureData[Taxonomy]},
+ parameters={
+ **min_consensus_param,
+ 'unassignable_label': Str},
+ outputs=[('consensus_taxonomy', FeatureData[Taxonomy])],
+ input_descriptions={
+ 'search_results': 'Search results in BLAST6 output format',
+ 'reference_taxonomy': 'reference taxonomy labels.'},
+ parameter_descriptions={
+ **min_consensus_param_description,
+ 'unassignable_label': 'Annotation given when no consensus is found.'
+ },
+ output_descriptions={
+ 'consensus_taxonomy': 'Consensus taxonomy and scores.'},
+ name='Find consensus among multiple annotations.',
+ description=('Find consensus annotation for each query searched against '
+ 'a reference database, by finding the least common ancestor '
+ 'among one or more semicolon-delimited hierarchical '
+ 'annotations. Note that the annotation hierarchy is assumed '
+ 'to have an even number of ranks.'),
+)
+
+
+def _blast6format_df_to_series_of_lists(
assignments, ref_taxa,
- unassignable_label=_get_default_unassignable_label()):
- '''import observed assignments in blast6 or blast7 format to dict of lists.
+ unassignable_label=DEFAULTUNASSIGNABLELABEL):
+ '''import observed assignments in blast6 format to series of lists.
- assignments: path or list
- Taxonomy observation map in blast format 6 or 7. Each line consists of
+ assignments: pd.DataFrame
+ Taxonomy observation map in blast format 6. Each line consists of
taxonomy assignments of a query sequence in tab-delimited format:
- <query_id> <assignment_id> <...other columns are ignored>
+ <query_id> <subject-seq-id> <...other columns are ignored>
- ref_taxa: dict or pd.Series
+ ref_taxa: pd.Series
Reference taxonomies in tab-delimited format:
- <accession ID> kingdom;phylum;class;order;family;genus;species
+ <accession ID> Annotation
+ The accession IDs in this taxonomy should match the subject-seq-ids in
+ the "assignment" input.
'''
- obs_taxa = defaultdict(list)
- lines = _open_list_or_file(assignments)
-
- for line in lines:
- if not line.startswith('#') or line == "":
- fields = line.split('\t')
- # if vsearch fails to find assignment, it reports '*' as the
- # accession ID, which is completely useless and unproductive..
- if fields[1] == '*':
- t = [unassignable_label]
- else:
- id_ = fields[1]
-
- try:
- t = ref_taxa[id_]
- except KeyError:
- raise KeyError((
- 'Identifier {0} was reported in taxonomic search '
- 'results, but was not present in the reference '
- 'taxonomy.').format(str(id_)))
-
- try:
- t = t.split(';')
- except ValueError:
- raise ValueError((
- 'Reference taxonomy {0} (id: {1}) is incorrectly '
- 'formatted.').format(t, str(id_)))
-
- obs_taxa[fields[0]].append(t)
-
- return obs_taxa
-
-
-def _open_list_or_file(infile):
- if isinstance(infile, list):
- lines = infile
- elif isfile(infile):
- with open(infile, "r") as inputfile:
- lines = [line.strip() for line in inputfile]
- return lines
-
-
-# This code has been ported and adapted from QIIME 1.9.1 with permission from
-# @gregcaporaso.
+ taxa_hits = assignments.set_index('qseqid')['sseqid']
+
+ # validate that assignments are present in reference taxonomy
+ # (i.e., that the correct reference taxonomy was used).
+ # Note that we drop unassigned labels from this set.
+ missing_ids = set(taxa_hits.values) - set(ref_taxa.index) - {'*', ''}
+ if len(missing_ids) > 0:
+ raise KeyError('Reference taxonomy and search results do not match. '
+ 'The following identifiers were reported in the search '
+ 'results but are not present in the reference taxonomy:'
+ ' {0}'.format(', '.join(str(i) for i in missing_ids)))
+
+ # if vsearch fails to find assignment, it reports '*' as the
+ # accession ID, so we will add this mapping to the reference taxonomy.
+ ref_taxa['*'] = unassignable_label
+ # map accession IDs to taxonomy
+ taxa_hits.replace(ref_taxa, inplace=True)
+ # convert to dict of {accession_id: [annotations]}
+ taxa_hits = taxa_hits.groupby(taxa_hits.index).apply(list)
+
+ return taxa_hits
+
+
def _compute_consensus_annotations(
query_annotations, min_consensus,
- unassignable_label=_get_default_unassignable_label()):
+ unassignable_label=DEFAULTUNASSIGNABLELABEL):
"""
Parameters
----------
- query_annotations : dict of lists
- Keys are query identifiers, and values are lists of all
+ query_annotations : pd.Series of lists
+ Indices are query identifiers, and values are lists of all
taxonomic annotations associated with that identfier.
Returns
-------
- dict
- Keys are query identifiers, and values are the consensus of the
- input taxonomic annotations.
+ pd.DataFrame
+ Indices are query identifiers, and values are the consensus of the
+ input taxonomic annotations, and the consensus score.
"""
- result = {}
- for query_id, annotations in query_annotations.items():
- consensus_annotation, consensus_fraction = \
- _compute_consensus_annotation(annotations, min_consensus,
- unassignable_label)
- result[query_id] = (
- ';'.join(consensus_annotation), consensus_fraction)
- return result
+ # define function to apply to each list of taxa hits
+ # Note: I am setting this up to open the possibility to define other
+ # functions later (e.g., not simple threshold consensus)
+ def _apply_consensus_function(taxa, min_consensus=min_consensus,
+ unassignable_label=unassignable_label,
+ _consensus_function=_lca_consensus):
+ # if there is no consensus, skip consensus calculation
+ if len(taxa) == 1:
+ taxa, score = taxa.pop(), 1.
+ else:
+ taxa = _taxa_to_cumulative_ranks(taxa)
+ # apply and score consensus
+ taxa, score = _consensus_function(
+ taxa, min_consensus, unassignable_label)
+ # return as a series so that the outer apply returns a dataframe
+ # (i.e., consensus scores get inserted as an additional column)
+ return pd.Series([taxa, score], index=['Taxon', 'Consensus'])
+
+ # If func returns a Series object the result will be a DataFrame.
+ return query_annotations.apply(_apply_consensus_function)
+
+
+# first split semicolon-delimited taxonomies by rank
+# and iteratively join ranks, so that: ['a;b;c', 'a;b;d', 'a;g;g'] -->
+# [['a', 'a;b', 'a;b;c'], ['a', 'a;b', 'a;b;d'], ['a', 'a;g', 'a;g;g']]
+# this is to avoid issues where e.g., the same species name may occur
+# in different taxonomic lineages.
+def _taxa_to_cumulative_ranks(taxa):
+ """
+ Parameters
+ ----------
+ taxa : list or str
+ List of semicolon-delimited taxonomic labels.
+ e.g., ['a;b;c', 'a;b;d']
+ Returns
+ -------
+ list of lists of str
+ Lists of cumulative taxonomic ranks for each input str
+ e.g., [['a', 'a;b', 'a;b;c'], ['a', 'a;b', 'a;b;d']]
+ """
+ return [[';'.join(t.split(';')[:n + 1])
+ for n in range(t.count(';') + 1)]
+ for t in taxa]
-# This code has been ported from QIIME 1.9.1 with permission from @gregcaporaso
-def _compute_consensus_annotation(annotations, min_consensus,
- unassignable_label):
+# Find the LCA by consensus threshold. Return label and the consensus score.
+def _lca_consensus(annotations, min_consensus, unassignable_label):
""" Compute the consensus of a collection of annotations
Parameters
----------
annotations : list of lists
- Taxonomic annotations to compute the consensus of.
+ Taxonomic annotations to form consensus.
min_consensus : float
The minimum fraction of the annotations that a specfic annotation
- must be present in for that annotation to be accepted. This must
- be greater than or equal to 0.51.
+ must be present in for that annotation to be accepted. Current
+ lower boundary is 0.51.
unassignable_label : str
The label to apply if no acceptable annotations are identified.
Result
------
- consensus_annotation
- List containing the consensus assignment
- consensus_fraction
+ consensus_annotation: str
+ The consensus assignment
+ consensus_fraction: float
Fraction of input annotations that agreed at the deepest
level of assignment
"""
- if min_consensus <= 0.5:
- raise ValueError("min_consensus must be greater than 0.5.")
- num_input_annotations = len(annotations)
- consensus_annotation = []
-
- # if the annotations don't all have the same number
- # of levels, the resulting annotation will have a max number
- # of levels equal to the number of levels in the assignment
- # with the fewest number of levels. this is to avoid
- # a case where, for example, there are n assignments, one of
- # which has 7 levels, and the other n-1 assignments have 6 levels.
- # A 7th level in the result would be misleading because it
- # would appear to the user as though it was the consensus
- # across all n assignments.
- num_levels = min([len(a) for a in annotations])
-
- # iterate over the assignment levels
- for level in range(num_levels):
- # count the different taxonomic assignments at the current level..
- # the counts are computed based on the current level and all higher
- # levels to reflect that, for example, 'p__A; c__B; o__C' and
- # 'p__X; c__Y; o__C' represent different taxa at the o__ level (since
- # they are different at the p__ and c__ levels).
- current_level_annotations = \
- Counter([tuple(e[:level + 1]) for e in annotations])
- # identify the most common taxonomic assignment, and compute the
- # fraction of annotations that contained it. it's safe to compute the
- # fraction using num_assignments because the deepest level we'll
- # ever look at here is num_levels (see above comment on how that
- # is decided).
- tax, max_count = current_level_annotations.most_common(1)[0]
- max_consensus_fraction = max_count / num_input_annotations
- # check whether the most common taxonomic assignment is observed
- # in at least min_consensus of the sequences
- if max_consensus_fraction >= min_consensus:
- # if so, append the current level only (e.g., 'o__C' if tax is
- # 'p__A; c__B; o__C', and continue on to the next level
- consensus_annotation.append((tax[-1], max_consensus_fraction))
- else:
- # if not, there is no assignment at this level, and we're
- # done iterating over levels
- break
-
- # construct the results
- # determine the number of levels in the consensus assignment
- consensus_annotation_depth = len(consensus_annotation)
- if consensus_annotation_depth > 0:
- # if it's greater than 0, generate a list of the
- # taxa assignments at each level
- annotation = [a[0] for a in consensus_annotation]
- # and assign the consensus_fraction_result as the
- # consensus fraction at the deepest level
- consensus_fraction_result = \
- consensus_annotation[consensus_annotation_depth - 1][1]
- else:
- # if there are zero assignments, indicate that the taxa is
- # unknown
- annotation = [unassignable_label]
- # and assign the consensus_fraction_result to 1.0 (this is
- # somewhat arbitrary, but could be interpreted as all of the
- # assignments suggest an unknown taxonomy)
- consensus_fraction_result = 1.0
-
- return annotation, consensus_fraction_result
-
-
-def _annotate_method(taxa, method):
- taxa = taxa.view(pd.DataFrame)
- taxa['Method'] = method
- return qiime2.Artifact.import_data('FeatureData[Taxonomy]', taxa)
+ # count total number of labels to get consensus threshold
+ n_annotations = len(annotations)
+ threshold = ceil(n_annotations * min_consensus)
+ # zip together ranks and count frequency of each unique label.
+ # This assumes that a hierarchical taxonomy with even numbers of
+ # ranks was used.
+ taxa_comparison = [Counter(rank) for rank in zip(*annotations)]
+ # interate rank comparisons in reverse
+ # to find rank with consensus count > threshold
+ for rank in taxa_comparison[::-1]:
+ # grab most common label and its count
+ label, count = rank.most_common(1)[0]
+ # TODO: this assumes that min_consensus >= 0.51 (current lower bound)
+ # but could fail to find ties if we allow lower min_consensus scores
+ if count >= threshold:
+ return label, round(count / n_annotations, 3)
+ # if we reach this point, no consensus was ever found at any rank
+ return unassignable_label, 0.0
=====================================
q2_feature_classifier/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2022.2.0)"
- git_full = "265439e4cac3fe8eb63d5193aa6ddb00ff175b98"
- git_date = "2022-02-18 19:15:48 +0000"
+ git_refnames = " (tag: 2022.8.0)"
+ git_full = "6f7c334d5b36ff20ac088e88c1c13746bf883f76"
+ git_date = "2022-08-23 17:28:47 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
q2_feature_classifier/_vsearch.py
=====================================
@@ -7,49 +7,76 @@
# ----------------------------------------------------------------------------
import tempfile
-import pandas as pd
import qiime2
+import pandas as pd
from q2_types.feature_data import (
- FeatureData, Taxonomy, Sequence, DNAFASTAFormat)
+ FeatureData, Taxonomy, Sequence, DNAFASTAFormat, BLAST6, BLAST6Format)
from .plugin_setup import plugin, citations
from qiime2.plugin import Int, Str, Float, Choices, Range, Bool
-from ._consensus_assignment import (_consensus_assignments, _run_command,
- _get_default_unassignable_label,
- _annotate_method)
+from ._blast import (_run_command)
+from ._consensus_assignment import (DEFAULTUNASSIGNABLELABEL,
+ min_consensus_param,
+ min_consensus_param_description)
from ._taxonomic_classifier import TaxonomicClassifier
from .classifier import _classify_parameters, _parameter_descriptions
+# Specify default settings for various functions
+DEFAULTMAXACCEPTS = 10
+DEFAULTPERCENTID = 0.8
+DEFAULTQUERYCOV = 0.8
+DEFAULTSTRAND = 'both'
+DEFAULTSEARCHEXACT = False
+DEFAULTTOPHITS = False
+DEFAULTMAXHITS = 'all'
+DEFAULTMAXREJECTS = 'all'
+DEFAULTOUTPUTNOHITS = True
+DEFAULTWEAKID = 0.
+DEFAULTTHREADS = 1
+DEFAULTMINCONSENSUS = 0.51
+
-def classify_consensus_vsearch(query: DNAFASTAFormat,
- reference_reads: DNAFASTAFormat,
- reference_taxonomy: pd.Series,
- maxaccepts: int = 10,
- perc_identity: float = 0.8,
- query_cov: float = 0.8,
- strand: str = 'both',
- min_consensus: float = 0.51,
- unassignable_label: str =
- _get_default_unassignable_label(),
- search_exact: bool = False,
- top_hits_only: bool = False,
- maxhits: int = 'all',
- maxrejects: int = 'all',
- output_no_hits: bool = True,
- weak_id: float = 0.,
- threads: str = 1) -> pd.DataFrame:
+def vsearch_global(query: DNAFASTAFormat,
+ reference_reads: DNAFASTAFormat,
+ maxaccepts: int = DEFAULTMAXACCEPTS,
+ perc_identity: float = DEFAULTPERCENTID,
+ query_cov: float = DEFAULTQUERYCOV,
+ strand: str = DEFAULTSTRAND,
+ search_exact: bool = DEFAULTSEARCHEXACT,
+ top_hits_only: bool = DEFAULTTOPHITS,
+ maxhits: int = DEFAULTMAXHITS,
+ maxrejects: int = DEFAULTMAXREJECTS,
+ output_no_hits: bool = DEFAULTOUTPUTNOHITS,
+ weak_id: float = DEFAULTWEAKID,
+ threads: str = DEFAULTTHREADS) -> BLAST6Format:
seqs_fp = str(query)
ref_fp = str(reference_reads)
if maxaccepts == 'all':
maxaccepts = 0
if maxrejects == 'all':
maxrejects = 0
- cmd = ['vsearch', '--usearch_global', seqs_fp, '--id', str(perc_identity),
- '--query_cov', str(query_cov), '--strand', strand, '--maxaccepts',
- str(maxaccepts), '--maxrejects', str(maxrejects), '--db', ref_fp,
- '--threads', str(threads)]
+
if search_exact:
- cmd[1] = '--search_exact'
+ cmd = [
+ 'vsearch',
+ '--search_exact', seqs_fp,
+ '--strand', strand,
+ '--db', ref_fp,
+ '--threads', str(threads),
+ ]
+ else:
+ cmd = [
+ 'vsearch',
+ '--usearch_global', seqs_fp,
+ '--id', str(perc_identity),
+ '--query_cov', str(query_cov),
+ '--strand', strand,
+ '--maxaccepts', str(maxaccepts),
+ '--maxrejects', str(maxrejects),
+ '--db', ref_fp,
+ '--threads', str(threads),
+ ]
+
if top_hits_only:
cmd.append('--top_hits_only')
if output_no_hits:
@@ -58,11 +85,52 @@ def classify_consensus_vsearch(query: DNAFASTAFormat,
cmd.extend(['--weak_id', str(weak_id)])
if maxhits != 'all':
cmd.extend(['--maxhits', str(maxhits)])
- cmd.append('--blast6out')
- consensus = _consensus_assignments(
- cmd, reference_taxonomy, min_consensus=min_consensus,
- unassignable_label=unassignable_label)
- return consensus
+ output = BLAST6Format()
+ cmd.extend(['--blast6out', str(output)])
+ _run_command(cmd)
+ return output
+
+
+def classify_consensus_vsearch(ctx,
+ query,
+ reference_reads,
+ reference_taxonomy,
+ maxaccepts=DEFAULTMAXACCEPTS,
+ perc_identity=DEFAULTPERCENTID,
+ query_cov=DEFAULTQUERYCOV,
+ strand=DEFAULTSTRAND,
+ search_exact=DEFAULTSEARCHEXACT,
+ top_hits_only=DEFAULTTOPHITS,
+ maxhits=DEFAULTMAXHITS,
+ maxrejects=DEFAULTMAXREJECTS,
+ output_no_hits=DEFAULTOUTPUTNOHITS,
+ weak_id=DEFAULTWEAKID,
+ threads=DEFAULTTHREADS,
+ min_consensus=DEFAULTMINCONSENSUS,
+ unassignable_label=DEFAULTUNASSIGNABLELABEL):
+ search_db = ctx.get_action('feature_classifier', 'vsearch_global')
+ lca = ctx.get_action('feature_classifier', 'find_consensus_annotation')
+ result, = search_db(query=query, reference_reads=reference_reads,
+ maxaccepts=maxaccepts, perc_identity=perc_identity,
+ query_cov=query_cov, strand=strand,
+ search_exact=search_exact, top_hits_only=top_hits_only,
+ maxhits=maxhits, maxrejects=maxrejects,
+ output_no_hits=output_no_hits, weak_id=weak_id,
+ threads=threads)
+ consensus, = lca(search_results=result,
+ reference_taxonomy=reference_taxonomy,
+ min_consensus=min_consensus,
+ unassignable_label=unassignable_label)
+ # New: add BLAST6Format result as an output. This could just as well be a
+ # visualizer generated from these results (using q2-metadata tabulate).
+ # Would that be more useful to the user that the QZA?
+ return consensus, result
+
+
+def _annotate_method(taxa, method):
+ taxa = taxa.view(pd.DataFrame)
+ taxa['Method'] = method
+ return qiime2.Artifact.import_data('FeatureData[Taxonomy]', taxa)
def classify_hybrid_vsearch_sklearn(ctx,
@@ -70,17 +138,17 @@ def classify_hybrid_vsearch_sklearn(ctx,
reference_reads,
reference_taxonomy,
classifier,
- maxaccepts=10,
+ maxaccepts=DEFAULTMAXACCEPTS,
perc_identity=0.5,
- query_cov=0.8,
- strand='both',
- min_consensus=0.51,
- maxhits='all',
- maxrejects='all',
+ query_cov=DEFAULTQUERYCOV,
+ strand=DEFAULTSTRAND,
+ min_consensus=DEFAULTMINCONSENSUS,
+ maxhits=DEFAULTMAXHITS,
+ maxrejects=DEFAULTMAXREJECTS,
reads_per_batch='auto',
confidence=0.7,
read_orientation='auto',
- threads=1,
+ threads=DEFAULTTHREADS,
prefilter=True,
sample_size=1000,
randseed=0):
@@ -108,11 +176,13 @@ def classify_hybrid_vsearch_sklearn(ctx,
perc_query_aligned=query_cov, threads=threads)
# find exact matches, perform LCA consensus classification
- taxa1, = ccv(query=query, reference_reads=reference_reads,
- reference_taxonomy=reference_taxonomy, maxaccepts=maxaccepts,
- strand=strand, min_consensus=min_consensus,
- search_exact=True, threads=threads, maxhits=maxhits,
- maxrejects=maxrejects, output_no_hits=True)
+ # note: we only keep the taxonomic assignments, not the search report
+ taxa1, _, = ccv(
+ query=query, reference_reads=reference_reads,
+ reference_taxonomy=reference_taxonomy, maxaccepts=maxaccepts,
+ strand=strand, min_consensus=min_consensus, search_exact=True,
+ threads=threads, maxhits=maxhits, maxrejects=maxrejects,
+ output_no_hits=True)
# Annotate taxonomic assignments with classification method
taxa1 = _annotate_method(taxa1, 'VSEARCH')
@@ -121,7 +191,7 @@ def classify_hybrid_vsearch_sklearn(ctx,
# filter out unassigned seqs
try:
query, = filter_seqs(sequences=query, taxonomy=taxa1,
- include=_get_default_unassignable_label())
+ include=DEFAULTUNASSIGNABLELABEL)
except ValueError:
# get ValueError if all sequences are filtered out.
# so if no sequences are unassigned, return exact match results
@@ -140,26 +210,24 @@ def classify_hybrid_vsearch_sklearn(ctx,
return taxa
-output_descriptions = {
- 'classification': 'The resulting taxonomy classifications.'}
-
parameters = {'maxaccepts': Int % Range(1, None) | Str % Choices(['all']),
'perc_identity': Float % Range(0.0, 1.0, inclusive_end=True),
'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
'strand': Str % Choices(['both', 'plus']),
- 'min_consensus': Float % Range(0.5, 1.0, inclusive_end=True,
- inclusive_start=False),
'threads': Int % Range(1, None),
'maxhits': Int % Range(1, None) | Str % Choices(['all']),
'maxrejects': Int % Range(1, None) | Str % Choices(['all'])}
+extra_params = {'search_exact': Bool,
+ 'top_hits_only': Bool,
+ 'output_no_hits': Bool,
+ 'weak_id': Float % Range(0.0, 1.0, inclusive_end=True)}
+
inputs = {'query': FeatureData[Sequence],
- 'reference_reads': FeatureData[Sequence],
- 'reference_taxonomy': FeatureData[Taxonomy]}
+ 'reference_reads': FeatureData[Sequence]}
-input_descriptions = {'query': 'Sequences to classify taxonomically.',
- 'reference_reads': 'reference sequences.',
- 'reference_taxonomy': 'reference taxonomy labels.'}
+input_descriptions = {'query': 'Query Sequences.',
+ 'reference_reads': 'Reference sequences.'}
parameter_descriptions = {
'strand': 'Align against reference sequences in forward ("plus") '
@@ -184,8 +252,6 @@ parameter_descriptions = {
'lower.',
'query_cov': 'Reject match if query alignment coverage per high-'
'scoring pair is lower.',
- 'min_consensus': 'Minimum fraction of assignments must match top '
- 'hit to be accepted as consensus assignment.',
'threads': 'Number of threads to use for job parallelization.',
'maxhits': 'Maximum number of hits to show once the search is terminated.',
'maxrejects': 'Maximum number of non-matching target sequences to '
@@ -193,55 +259,92 @@ parameter_descriptions = {
'pair with maxaccepts (see maxaccepts description for '
'details).'}
-outputs = [('classification', FeatureData[Taxonomy])]
+extra_param_descriptions = {
+ 'search_exact': 'Search for exact full-length matches to the query '
+ 'sequences. Only 100% exact matches are reported and this '
+ 'command is much faster than the default. If True, the '
+ 'perc_identity, query_cov, maxaccepts, and maxrejects '
+ 'settings are ignored. Note: query and reference reads '
+ 'must be trimmed to the exact same DNA locus (e.g., '
+ 'primer site) because only exact matches will be '
+ 'reported.',
+ 'top_hits_only': 'Only the top hits between the query and reference '
+ 'sequence sets are reported. For each query, the top '
+ 'hit is the one presenting the highest percentage of '
+ 'identity. Multiple equally scored top hits will be '
+ 'used for consensus taxonomic assignment if '
+ 'maxaccepts is greater than 1.',
+ 'output_no_hits': 'Report both matching and non-matching queries. '
+ 'WARNING: always use the default setting for this '
+ 'option unless if you know what you are doing! If '
+ 'you set this option to False, your sequences and '
+ 'feature table will need to be filtered to exclude '
+ 'unclassified sequences, otherwise you may run into '
+ 'errors downstream from missing feature IDs.',
+ 'weak_id': 'Show hits with percentage of identity of at least N, '
+ 'without terminating the search. A normal search stops as '
+ 'soon as enough hits are found (as defined by maxaccepts, '
+ 'maxrejects, and perc_identity). As weak_id reports weak '
+ 'hits that are not deduced from maxaccepts, high '
+ 'perc_identity values can be used, hence preserving both '
+ 'speed and sensitivity. Logically, weak_id must be smaller '
+ 'than the value indicated by perc_identity, otherwise this '
+ 'option will be ignored.',
+}
+
+classification_output = ('classification', FeatureData[Taxonomy])
+
+classification_output_description = {
+ 'classification': 'Taxonomy classifications of query sequences.'}
+
+blast6_output = ('search_results', FeatureData[BLAST6])
+
+blast6_output_description = {'search_results': 'Top hits for each query.'}
ignore_prefilter = ' This parameter is ignored if `prefilter` is disabled.'
plugin.methods.register_function(
- function=classify_consensus_vsearch,
+ function=vsearch_global,
inputs=inputs,
parameters={**parameters,
- 'unassignable_label': Str,
- 'search_exact': Bool,
- 'top_hits_only': Bool,
- 'output_no_hits': Bool,
- 'weak_id': Float % Range(0.0, 1.0, inclusive_end=True)},
- outputs=outputs,
+ **extra_params},
+ outputs=[blast6_output],
input_descriptions=input_descriptions,
parameter_descriptions={
**parameter_descriptions,
- 'search_exact': 'Search for exact full-length matches to the query '
- 'sequences. Only 100% exact matches are reported and '
- 'this command is much faster than the default. If '
- 'True, the perc_identity and query_cov settings are '
- 'ignored. Note: query and reference reads must be '
- 'trimmed to the exact same DNA locus (e.g., primer '
- 'site) because only exact matches will be reported.',
- 'top_hits_only': 'Only the top hits between the query and reference '
- 'sequence sets are reported. For each query, the top '
- 'hit is the one presenting the highest percentage of '
- 'identity. Multiple equally scored top hits will be '
- 'used for consensus taxonomic assignment if '
- 'maxaccepts is greater than 1.',
- 'output_no_hits': 'Report both matching and non-matching queries. '
- 'WARNING: always use the default setting for this '
- 'option unless if you know what you are doing! If '
- 'you set this option to False, your sequences and '
- 'feature table will need to be filtered to exclude '
- 'unclassified sequences, otherwise you may run into '
- 'errors downstream from missing feature IDs.',
- 'weak_id': 'Show hits with percentage of identity of at least N, '
- 'without terminating the search. A normal search stops as '
- 'soon as enough hits are found (as defined by maxaccepts, '
- 'maxrejects, and perc_identity). As weak_id reports weak '
- 'hits that are not deduced from maxaccepts, high '
- 'perc_identity values can be used, hence preserving both '
- 'speed and sensitivity. Logically, weak_id must be smaller '
- 'than the value indicated by perc_identity, otherwise this '
- 'option will be ignored.',
+ **extra_param_descriptions,
+ },
+ output_descriptions=blast6_output_description,
+ name='VSEARCH global alignment search',
+ description=('Search for top hits in a reference database via global '
+ 'alignment between the query sequences and reference '
+ 'database sequences using VSEARCH. Returns a report of the '
+ 'top M hits for each query (where M=maxaccepts or maxhits).'),
+ citations=[citations['rognes2016vsearch']]
+)
+
+
+plugin.pipelines.register_function(
+ function=classify_consensus_vsearch,
+ inputs={**inputs,
+ 'reference_taxonomy': FeatureData[Taxonomy]},
+ parameters={**parameters,
+ **extra_params,
+ **min_consensus_param,
+ 'unassignable_label': Str,
+ },
+ outputs=[classification_output, blast6_output],
+ input_descriptions={**input_descriptions,
+ 'reference_taxonomy': 'Reference taxonomy labels.'},
+ parameter_descriptions={
+ **parameter_descriptions,
+ **extra_param_descriptions,
+ **min_consensus_param_description,
+ 'unassignable_label': 'Annotation given to sequences without any hits.'
},
- output_descriptions=output_descriptions,
+ output_descriptions={**classification_output_description,
+ **blast6_output_description},
name='VSEARCH-based consensus taxonomy classifier',
description=('Assign taxonomy to query sequences using VSEARCH. Performs '
'VSEARCH global alignment between query and reference_reads, '
@@ -256,21 +359,26 @@ plugin.methods.register_function(
plugin.pipelines.register_function(
function=classify_hybrid_vsearch_sklearn,
- inputs={**inputs, 'classifier': TaxonomicClassifier},
+ inputs={**inputs,
+ 'reference_taxonomy': FeatureData[Taxonomy],
+ 'classifier': TaxonomicClassifier},
parameters={**parameters,
+ **min_consensus_param,
'reads_per_batch': _classify_parameters['reads_per_batch'],
'confidence': _classify_parameters['confidence'],
'read_orientation': _classify_parameters['read_orientation'],
'prefilter': Bool,
'sample_size': Int % Range(1, None),
'randseed': Int % Range(0, None)},
- outputs=outputs,
+ outputs=[classification_output],
input_descriptions={**input_descriptions,
+ 'reference_taxonomy': 'Reference taxonomy labels.',
'classifier': 'Pre-trained sklearn taxonomic '
'classifier for classifying the reads.'},
parameter_descriptions={
**{k: parameter_descriptions[k] for k in [
- 'strand', 'maxaccepts', 'min_consensus', 'threads']},
+ 'strand', 'maxaccepts', 'threads']},
+ **min_consensus_param_description,
'perc_identity': 'Percent sequence similarity to use for PREFILTER. ' +
parameter_descriptions['perc_identity'] + ' Set to a '
'lower value to perform a rough pre-filter.' +
@@ -301,7 +409,7 @@ plugin.pipelines.register_function(
'the same output, which is useful for replicability. Set '
'to 0 to use a pseudo-random seed.' + ignore_prefilter,
},
- output_descriptions=output_descriptions,
+ output_descriptions=classification_output_description,
name='ALPHA Hybrid classifier: VSEARCH exact match + sklearn classifier',
description=('NOTE: THIS PIPELINE IS AN ALPHA RELEASE. Please report bugs '
'to https://forum.qiime2.org!\n'
=====================================
q2_feature_classifier/tests/__init__.py
=====================================
@@ -14,6 +14,8 @@ from qiime2.plugin.testing import TestPluginBase
class FeatureClassifierTestPluginBase(TestPluginBase):
+ package = 'q2_feature_classifier.tests'
+
def setUp(self):
try:
from q2_feature_classifier.plugin_setup import plugin
=====================================
q2_feature_classifier/tests/data/blast6-format.tsv
=====================================
@@ -0,0 +1,3 @@
+1111561 1111561 100.000 75 0 0 1 75 44 118 7.50e-34 139
+1111561 574274 92.308 78 2 4 1 75 44 120 2.11e-24 108
+835097 835097 100.000 80 0 0 1 80 52 131 1.36e-36 148
=====================================
q2_feature_classifier/tests/data/query-seqs.fasta
=====================================
@@ -0,0 +1,6 @@
+>1111561
+ACACATGCAAGTCGAACGGCAGCGGGGGAAAGCTTGCTTTCCTGCCGGCGAGTGGCGGACGGGTGAGTAATGCGT
+>835097
+AAGTCGAGCGAAAGACCCCGGGCTTGCCCGGGTGATTTAGCGGCGGACGGCTGAGTAACACGTGAGAAACTTGCCCTTAG
+>junk
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
=====================================
q2_feature_classifier/tests/data/se-dna-sequences.fasta
=====================================
@@ -1649,9 +1649,9 @@ GATGAACGCTAGCGACAGGCCTAACACATGCAAGTCGAGGGGTAACAGGGGCTTCGGCCCGCTGACGACCGGCGCACGGG
>1106599
AGCGAACGCTAGCGGCAGACTTAACACATGCAAGTCGAGCGAGAAACCACTTCGGTGGCGGACAGCGGCGAACGGGTGAGTAACACGTAGGTTACGTGCCCTTTGGTGGGGGATAGCCCAGGGAAACCTGGATTAATACCGCATATGTCC
>4294447
-ATTGAACGCTGGCGGAATGCTTTACACATGCAAGTCGAACGGCAGCACGGGAGCTTGCTCCTGGTGGCGAGTGGCGAACGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATGGGGGATAACTAGTCGAAAGATTGGCTAATACCGCATA
+ATTGAACGCTGGCGGAATGCTTTACACATGCAAGTCGAACGGCAGCACGGGAGCTTGCTCCTGGTGGCGAGTGGCGAACGGGGCGTGCTTAATACATGCAAGTCGAACGATAAGGCTCCTTGCAGGACGGATGGGGGATAACTAGTCGAAAGATTGGCTAATACCGCATA
>4295019
-GACGAACGCTGGCGGCGTGCTTAATACATGCAAGTCGAACGATAAGGCTCCTTGCAGGACGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+GACGAACGCTGGCGGCGTGCTTAATACATGCAAGTCGAACGATAAGGCTCCTTGCAGGACGG
>4295363
GATGAACGCTAGCGGCAGGCTTAATACATGCAAGTCGAGGGGCAGCACGGGTAGCAATACCATGAGAGTGGCGCACGGGTGTGAGTAACGCGTATTCAACCTACCTGTAACAGAGGGATAGCCCATAGAAATGTGGATTAATATCTCATA
>4295378
@@ -1659,13 +1659,13 @@ ATTGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGGAGGTATTTATTTCGGTAGATATCTTAGTGGCGAACGGG
>4295827
AACGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAACGCACCATTTCGGTGGTGAGTGGCGGACGGGNNNNNNNNNNNNNNAGCGGCGGACGGGTGAGTAACGCGTGGGTAACCTGCCTTGTACAGGGGGATAACATATTGAAAAGT
>4296286
-GACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGGAGAATTCAATAAGCTTGCTTAAAGAATTTTTAGTGGCGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+GACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGGAGAATTCAATAAGCTTGCTTAAAGAATTTTTAGTGGCGG
>4296291
GATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGCGATTAGACGGACTGGAGACTTCGGTCAAAGGGAAGACTATGAGTAACGCGTGAGCAACCTGCCCTGTACAGGGGGATAGCCACTGGAAACGGTGATTAATACCCCATA
>4296811
-AGCGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGGCCGTAGCAATACGGCAGCGGCAGACGGGAGAGTAACACGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCTAATACCGGATACGCCCTTACGGG
+AGCGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGGCCGTAGCAATACGGCAGCGGCAGACGGGAGAGTAACACGGCTAATACCGGATACGCCCTTACGGG
>4296878
-AATGAACGCTGGCGACGTGCTTAACACATGCAAGTCGCACGAGAAAGCTCGAAGCTTGCTTTGGGCGAGTAAAGTGGCGAACGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+AATGAACGCTGGCGACGTGCTTAACACATGCAAGTCGCACGAGAAAGCTCGAAGCTTGCTTTGGGCGAGTAAAGTGGCGAAC
>4297375
GATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGAGGACTTCGGTCCTAGCGGCGGACGGGTGAGTAACGCGTGGGTAATCTGCCTCATACACATAACGCGTGGGTAATCTGCCTCATACACACGGATAACGTACCGAAAGGT
>4298545
@@ -1673,7 +1673,7 @@ GATGAACGCTAGCGGCAGGCTTAATACATGCAAGTCGAACGGTAACAGGTCCTTCGGGATGCTGACGAGTGGCGGACGGG
>4298715
GATGAACGCTAGCGGGAGGCCTAACACATGCAAGTCGAACGGGATTGGTAGCAATACCATGAGAGTGGCGCACGGGTGAGTAACGCGTGCGCACGGGTGAGTAACGCGTATGCAACCTACCTGTAACAGAGGGATAGCCCATAGAAATGT
>4299233
-GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGAGTTTAGATGGAAGCTTGCGAAATCTAAACTTAGTGGCGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGAGTTTAGATGGAAGCTTGCGAAATCTAAACTTAGTGGCG
>4299277
AGTGAACGCTAGCGGCGCGCCTAACACATGCAAGTCGAGTGGGTTTAGACCCACGGCAAACGGCTGCGTAACACGTTGGAACCTGCCCCGAACTCAGGGATATCCCTCCGAAAGGAGGAGTAATACCGGATGCCCTCGAAAGAGGAAAGA
>4299768
@@ -1697,11 +1697,11 @@ GACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGAAGAATTTATCATGAAGCCTTCGGGCAGATTGATAAACT
>4304270
AACGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGAGAAGCCACCTTCGGGTGGTGGACAGTGGCAGACGGGTGAGTGAGTAACGCGTGGGAACGTGCCTTTCGGTTCGGGACAACCCAGGGAAACTTGGGCTAATACCGGATA
>4305631
-AACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGGGGTTAATGCTGTAGCAATACAGCATTGACTTAGTGGCGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+AACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGGGGTTAATGCTGTAGCAATACAGCATTGACTTAGTGGCGGNNNNNNNNNNNNNNN
>4305902
GACGAACGTTGGCGGCGTGCTTAACACATGCAAGTCGAGCGGGGCTTAGTGAAAAGTTTACTTTTTACTTTGTCTAGCGGCGGGCGTGGATAATCTGCCTGTAAGACAGGGATAACGCCGGGAAACCGGTGCTAATACCCGATAACCTCT
>4307982
-GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTTGAACGGAGCACATTAAGTAGCAATACGAGGTGTGCTTAGTAGCGGACGGGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGGATGCTAATACCGC
+GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTTGAACGGAGCACATTAAGTAGCAATACGAGGTGTGCTTAGTAGCGGACGGGGGNCGGATGCTAATACCGC
>4308184
GACGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAGCGGAAGTTGGAGAGCTNNNNNNNNNNNNNNNNNNNNNNGGGTGAGTAACGCGTGGGCAACCTGCCCTGCAGATCGGGATAACTCCGGGAAACCGGTGCTAATACCGAATAG
>4308610
@@ -1717,7 +1717,7 @@ AATGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGAGAAAGGGATTGCTTGCAATCCTGAGTAGAGTGGCGCAC
>4314510
GACGAACGTTGGCGGCGTGCTTAACACATGCAAGTCGAACGGTTTGGTGGTGGGTGGTTAGTGGTTAGTCTTATCACCAAATAGTGGCGGACGGGTGAGTAACGCGTGGATAATCTGCCTGTAAGACAGGGATAACGCCGGGAAACCGGT
>4314518
-GACGAACGTTGGCGGCGTGCTTAACACATGCAAGTCGAGCGGTTGAGAGGTGGGAGTTGAGAAGTTAGACGTGGGAGAAAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+GACGAACGTTGGCGGCGTGCTTAACACATGCAAGTCGAGCGGTTGAGAGGTGGGAGTTGAGAAGTTAGACGTGGGAGAAAGNNNNNNNNNNNNNN
>4314576
GACGAACGCTGGCGGCGTGGATTAGGCATGCAAGTCGAACGCGGAAGGTATTCCTTCGGGAAAATACTGGTAGAGTGGCGGACGGGTGAGTAACGCGTAGGTAATCAGCCTTTAAGTGCGGGATAGCCCCGGGAAACCGGGGTTAATACC
>4314697
@@ -1755,7 +1755,7 @@ GACGAACGCTGGCGGAATGGATTAGGCATGCAAGTCGTACGAGATTTATACTTTCCGGGTATATTGAGAGTGGCAAACGG
>4327602
GACGAACGCTGGCGGTATGCCTAACACATGCAAGTCGAACGTGATCTTCTGTAGTTTTACTAGCACTGAGCGAGAGAAGTGAAAGTGGCGGACGGGTGAGTAACACGTGGGTAACCTGGCCTTTGGAAGGGGATAACCTCGGGAAACCGG
>4327922
-AATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGTGCGCGAACGGGACTTCGGTCCCTAGTAGAGCGGCGAACGGGTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+AATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGTGCGCGAACGGGACTTCGGTCCCTAGTAGAGCGGCGAACGGGTGNNNNNNNNNN
>4328672
ATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGCGCTAGCTTGCTAGCGGCTGACGAGTGGCGGACGGGTGAGTAACGCGTGGGAATCTACCTGGTAGTGGGGGACAACAGTCGGAAACGACTGCTAATACCGCA
>4330488
@@ -1811,7 +1811,7 @@ AATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGAGAAGGTCCCTTCGGGGATGAGTATAGTGGCGCACGGGT
>4348866
GACGAACGTTGGCGGCGTGCTTAACACATGCAAGTCGAGCGGGACTTAGTGAAAAGTTTACTTTTTACTTTGTCTAGCGGCGGACGGGAACACGTGGATAATCTGCCTGTAAGACAGGGATAACGCCGGGAAACCGGTGCTAATACCGGA
>4349618
-GATGAACGTTGGCTGTGTGCCTAAGGTATGCAAGTCGAACGGTCTTATTCTGAATTCCAAAGACTTCGGTCAATGGTTTTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+GATGAACGTTGGCTGTGTGCCTAAGGTATGCAAGTCGAACGGTCTTATTCTGAATTCCAAAGACTTCGGTCAATGGTTTTCNNNNNNNNNNN
>4351085
AACGAACGCTGGCGGCGCGTTTTACGCATGCAAGTCGAGCGGAAAGCACGGCCTTCGGGCTGTGCCTAGAGCGGCGAACGGGTGAGTAACACGTAGGTGATCTGTCCTGGTGTCGGGGATAGCTTCTGGAAACAGAAGGTAATACCGGAT
>4351087
@@ -1911,7 +1911,7 @@ ATTGAACGCTCGGTTGCACGCTTAACACATGCAAGTCGGACGGCAGCAGCGCTGATTGATTTATTAATTAGTGGCTGGCG
>4388127
AATCAACGCTGGCGGCGTGCCTCAGACATGCAAGTCGAACGATTAAAGCTCCCTTCGGGGAGTGCATAAAGTGGCGCACGGNNNNNTAACACGTAAGTAATCTACCCTCGAGTGGGGAATAACAGTCGGAAACGACTGCTAATACCGCAT
>4388133
-GATGAACGCTAGCGGCAGGCCTAATACATGCAAGTCGAACGAGAGAAGGGAGCTTGCTCCCCGATCTAGTGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+GATGAACGCTAGCGGCAGGCCTAATACATGCAAGTCGAACGAGAGAAGGGAGCTTGCTCCCCGATCTAGTGGNNNNNNNNNNN
>4389194
GACTAACGCTGGCGGCACGCATAAGTCATGCAAGTCGTACGAGAAAGGGGTCTTCGGACCCCGAGTAAAGTGGCGAACGGGTGAGTAACGCGTGAATAATCTACCTTTGAGAGGGGGATAACCCCGGGAAACTGGGGCTAATACCGCATG
>4389611
@@ -1969,7 +1969,7 @@ AGTGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGAGAACGGGTTATAGCTTGCTATAATTGTCAGCTAAGTGG
>4408885
AACTAACGCTGGCGGCGTGCCTAACACATGCAAGTCGCACGAGAAAGTTGGCTTCGGCCGATGAGTACAGTGGCGCACGGGTGAGTAACGCGTGGATAATCTGCCCTTTAGTTGGGAATAACGTACCGAAAGGTGCGCTAATACCGAATA
>4409658
-GATGAACGCTAGCGGCAGGCTTAACACATGCAAGTCGAGGGGCAGCACGTCTTCGGATGGTTGGCGGCGACCGGCGCACGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATAGACCAGAGTAACCTGGATAAATGCCCA
+GATGAACGCTAGCGGCAGGCTTAACACATGCAAGTCGAGGGGCAGCACGTCTTCGGATGGTTGGCGGCGACCGGCGCACGGGNNNATAGACCAGAGTAACCTGGATAAATGCCCA
>4409929
GATGANNGCTGGCTGTGTGCCTAATACATGCATGTCGAGCGGAGTATTAATTTATTAGTGCTTAGCGGCAAATGGGTGAGTAACACGTATTTAACCTACCTCAAAGACTGGGATAACAACAGGAAACTGTTGCTAATACCGGATATGTAT
>4411852
@@ -2003,7 +2003,7 @@ GACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGATGAAAAAGAAGTAGATCTCTTCGGAGTGACGCCTCTTT
>4420490
GACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGAAGCACTCAACTTTAGCTTGCTAAAGTTGTTTGACTGAGCGGCGAACGGGTGAGTAACACGTGGGCAATCTATCCGGAAGACCGGGATAACAGCTCGAAAGGGCTGCTA
>4421302
-AACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGCGAAAGTTTTCTTCGGAAGAGAGTAGAGTGGCGCACGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+AACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGCGAAAGTTTTCTTCGGAAGAGAGTAGAGTGGCGCACGGGNNNNNNNNN
>4421368
ATTGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAGCGGTAACAGGCGTATCAATACGTGCTGACTAGCGGCGAACGGGTGCGTAATACATGGGAATCTGCCCTTTGGTGGGGAATAACCCGGCGAAAGCCGGGCTAATGCCGCATG
>4421369
@@ -2017,7 +2017,7 @@ GATGAATGCTGGCGGCGTGGATAAGGCATGCAAGTCGTGCGGTCCTGCTTTTTGCAGGATAGCGGCAAACGGGTTAGTAA
>4424614
GACGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAGCGGGCCAGAACGAAGGGCAACCGGAGTTCGGGTTTAGCGGCGCACGGGTGAGTAACGCGTAGGTAATCTACTCCGAAGACCGGGATAACACCTCGAAAGGGGTGCTAATAC
>4424871
-ATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGGATGAGGGGAGCTTGCTTCCTGATTCAGCGGCGGACGGGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+ATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGGATGAGGGGAGCTTGCTTCCTGATTCAGCGGCGGACGGGGGNNNNNNNNNN
>4426469
GATAAACGCTAGCGGCAGGCTTAACACATGCAAGTCGAGGGGTAACATTGGGTGTTTACACTCAGATGACGACCGGCGGATGGGTGAGTAACGAGTATGTAACCTACCTGTTACTAAAGGATAGCCCGAAGAAATTTGGATTAATCCTTT
>4426532
@@ -2033,7 +2033,7 @@ GACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGGAGGTAAGCGGATGGTAACTGAAGCTTACTTTAGCGGCG
>4430037
GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAGCGAGAAAGTTACCTTCGGGTAATGAGTAAAGTGGCGGACGGGTGAGTAACGCGTGGATAACCTGCCCCTAGGTCTGGAATAACCTTTCGAAAGGGGGGCTAATTCCGGATG
>4430360
-GATGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAGCGAGATTTTATAAAATTACTTTTCGAAGGAAGTTTTATAGATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+GATGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAGCGAGATTTTATAAAATTACTTTTCGAAGGAAGTTTTATAGATNNNNNNNNNN
>4430730
GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGACGAATGAGAGAAGTTTACTTTTCTTATGAGTTAGTGGCGGACGGGTGAGTAACACGTGGATAATCTACCTGCAACACTGGGATAACGCTGTGAAAACGGTGCTAATA
>4430839
@@ -2049,7 +2049,7 @@ GGTGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAGCGCCCTGCACATTGCGCTTCTTGAATCTTTCGAGACAGGAC
>4433285
AATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGCGAAAGTCATCTTCGGGTGGCGAGTAGAGCGGCGGACGGGTGAGTAACGCGTGGACAACCTACCCTGGGGTACGGGATAACACTTCGAAAGGGGTGCTAATACCGTATG
>4434148
-GATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGGAGATATTTGGAGTACTTGTACAAAGGATATCTTAGCGGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+GATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGGAGATATTTGGAGTACTTGTACAAAGGATATCTTAGCGGCNNNNNNNNNN
>838203
GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCAAGGAGAACATCCCGCTTGCGGGATTATTAAACTGGCGAACGGGTGAGTAACGCGTAGGTAACTTGCCTTAAAGTTTGGGATAACTTTGGGAAACTGGAGCTAATACCGGATAA
>4434769
@@ -2081,7 +2081,7 @@ ATTGAACGCTGGCGGCATGCTTTACACATGCAAGTCGGACGGCAGCGGGGTAGTGCTTGCACTACTGCCGGCGAGTGGCG
>4450215
AACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGCGAAAGGGGGGGCAACTCCCCGAGTAGAGCGGCGCACGGGGAGTAACGCGTGGGTAACCTACCTCCGGGCGGGGGATAACCACTCGAAAGGGTGGCTAATACCGCATGA
>4450538
-GACGAACGTTGGCGGCGTGCTTAACACATGCAAGTCGAGCGGTTTGGTGGTGGGTGGTTAGTGGTTAGTCTTTTGGGGTTTTTCCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+GACGAACGTTGGCGGCGTGCTTAACACATGCAAGTCGAGCGGTTTGGTGGTGGGTGGTTAGTGGTTAGTCTTTTGGGGTTTTTCCTNNNNNNNNNNN
>4450541
GATAAACGCTAGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGCCTTCGGGTGCTGACGAGTGGCGCACGGGTGCGTAACATGTGTAGAACCTACCTATTACTGGGGCATAGCCCGGAGAAATTTGGATTAACACCCCATATTG
>4451743
@@ -2119,7 +2119,7 @@ GATTAACGCTGGCGGTATGTATAATACATGCAAGTCGAACGGCCCTTCGGGGCAGTGGCGAACGGGTGAGGAACACGTTG
>4461437
GACTAACGTTGGCGACGTGCCTAACACATGCAAGTCGAGCGATCTTTATGATAGCGGCGGACGGCTGAGTAACACGTCCGTAACCTACCTCCGAGTGGGGCATAGCTCGTCGAAAGACGGGGTAATACCCCATAATATCCCTTCATAATT
>4462642
-AGTGAACGCTGGCGGCATGCTTTACACATGCAAGTCGAACGGTAACAGGGGCTTCGGCCTGCTGACGAGTGGCGAACGGGGGAGTAATGCGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+AGTGAACGCTGGCGGCATGCTTTACACATGCAAGTCGAACGGTAACAGGGGCTTCGGCCTGCTGACGAGTGGCGAACGGGGGAGTAATGCGNNNNNNNN
>4462692
GATGAACGCTGGCGGCGTGCCTCATACATGCAAGTCGGACGGATTTAATAGTGTAGCAATATATTGTTAGATTAGTGGCGAACGGGTGAGTAACACGTAGCTAACCTACCCTAAAGTCTGGGATAACATACCGAAAGGGATGCTAATACC
>4463320
@@ -2181,13 +2181,13 @@ AGCGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGATCCGGGGGGCAACCCTCGGGTTAGTGGCGAACGGGTG
>4476462
GATGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGATCGCACTACTTCGGTAGTGAGAGGCGAACGGGTGAGTAACACGTAGGTAACATATCTATATGACGAGGATAACGGTTGGAAACGACTGCTAAGACTGGATAGGAGTAGAGGA
>4476738
-AATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGGATGTGCCCGCAAGGGTGCATGGCAGACGGGTGAGTAACGCGTGGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+AATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGGATGTGCCCGCAAGGGTGCATGGCAGACGGGTGAGTAACGCGTGGGG
>4476741
GATGAACGCTGGCGGCGTGCCTAACGCATGCAAGTCGTGCGGGTAGACCTTCGGGTCTGCCAGCGGCGAACGGGTGAGTAACACGTGGGTAACCTGCCGTCTGGTGGGGGATAACCCTCAGAAATGGGGGCTAATACCGCATGATGATCC
>4477309
GATGAACGCTAGCGGCGCGCCTAATACATGCAAGTCGAGCGAGAAAGCTGTGAACCGCAAGGGGATCGGCGAGTAAAGCGGCGAACGGGTGAGTAACACGTAACTAACCTACCCTCAGGACGGGCACAACTCCGGGAAACCGGGGCTAAT
>4477561
-GATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGAGATTTAGTGGAGTGGAGACTTCGGTCAAAGTGAAGCTAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+GATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGAGATTTAGTGGAGTGGAGACTTCGGTCAAAGTGAAGCTAA
>4478993
GATGAACGCTAGCGATAGGCCTAACACATGCAAGTCGAGGGGCATCACATGAGGTAGCAATACCGATGGTGGCGACCGGCGCACGGGTGAGTAACACGTATGCAACCTGCCTTCAACAAGGGGATAACCCGTTGAAAGACGGACTAATAC
>4478995
=====================================
q2_feature_classifier/tests/test_consensus_assignment.py
=====================================
@@ -7,96 +7,197 @@
# ----------------------------------------------------------------------------
import pandas as pd
+import pandas.testing as pdt
from qiime2.sdk import Artifact
-from qiime2.plugins import feature_classifier
from q2_feature_classifier._skl import _specific_fitters
-from q2_feature_classifier._blast import classify_consensus_blast
-from q2_feature_classifier._vsearch import classify_consensus_vsearch
from q2_feature_classifier._consensus_assignment import (
- _compute_consensus_annotation,
+ _lca_consensus,
_compute_consensus_annotations,
- _import_blast_format_assignments,
- _output_no_hits)
+ _blast6format_df_to_series_of_lists,
+ _taxa_to_cumulative_ranks)
from q2_types.feature_data import DNAFASTAFormat
from . import FeatureClassifierTestPluginBase
+from qiime2.plugins import feature_classifier as qfc
-class ConsensusAssignmentsTests(FeatureClassifierTestPluginBase):
- package = 'q2_feature_classifier.tests'
+class SequenceSearchTests(FeatureClassifierTestPluginBase):
def setUp(self):
super().setUp()
- taxonomy = Artifact.import_data(
- 'FeatureData[Taxonomy]', self.get_data_path('taxonomy.tsv'))
- self.taxonomy = taxonomy.view(pd.Series)
- # TODO: use `Artifact.import_data` here once we have a transformer
- # for DNASequencesDirectoryFormat -> DNAFASTAFormat
- self.reads_fp = self.get_data_path('se-dna-sequences.fasta')
- self.reads = DNAFASTAFormat(self.reads_fp, mode='r')
+ self.query = Artifact.import_data(
+ 'FeatureData[Sequence]', self.get_data_path('query-seqs.fasta'))
+ self.ref = Artifact.import_data(
+ 'FeatureData[Sequence]',
+ self.get_data_path('se-dna-sequences.fasta'))
- # Make sure blast and vsearch produce expected outputs
- # but there is no "right" taxonomy assignment.
def test_blast(self):
- result = classify_consensus_blast(self.reads, self.reads,
- self.taxonomy)
- res = result.Taxon.to_dict()
- tax = self.taxonomy.to_dict()
- right = 0.
- for taxon in res:
- right += tax[taxon].startswith(res[taxon])
- self.assertGreater(right/len(res), 0.5)
+ result, = qfc.actions.blast(
+ self.query, self.ref, maxaccepts=3, perc_identity=0.9)
+ exp = pd.DataFrame({
+ 'qseqid': {0: '1111561', 1: '1111561', 2: '1111561', 3: '835097',
+ 4: 'junk'},
+ 'sseqid': {0: '1111561', 1: '574274', 2: '149351', 3: '835097',
+ 4: '*'},
+ 'pident': {0: 100.0, 1: 92.308, 2: 91.781, 3: 100.0, 4: 0.0},
+ 'length': {0: 75.0, 1: 78.0, 2: 73.0, 3: 80.0, 4: 0.0},
+ 'mismatch': {0: 0.0, 1: 2.0, 2: 4.0, 3: 0.0, 4: 0.0},
+ 'gapopen': {0: 0.0, 1: 4.0, 2: 2.0, 3: 0.0, 4: 0.0},
+ 'qstart': {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 0.0},
+ 'qend': {0: 75.0, 1: 75.0, 2: 71.0, 3: 80.0, 4: 0.0},
+ 'sstart': {0: 24.0, 1: 24.0, 2: 24.0, 3: 32.0, 4: 0.0},
+ 'send': {0: 98.0, 1: 100.0, 2: 96.0, 3: 111.0, 4: 0.0},
+ 'evalue': {0: 8.35e-36, 1: 2.36e-26, 2: 3.94e-24,
+ 3: 1.5000000000000002e-38, 4: 0.0},
+ 'bitscore': {0: 139.0, 1: 108.0, 2: 100.0, 3: 148.0, 4: 0.0}})
+ pdt.assert_frame_equal(result.view(pd.DataFrame), exp)
+
+ def test_blast_no_output_no_hits(self):
+ result, = qfc.actions.blast(
+ self.query, self.ref, maxaccepts=3, perc_identity=0.9,
+ output_no_hits=False)
+ exp = pd.DataFrame({
+ 'qseqid': {0: '1111561', 1: '1111561', 2: '1111561', 3: '835097'},
+ 'sseqid': {0: '1111561', 1: '574274', 2: '149351', 3: '835097'},
+ 'pident': {0: 100.0, 1: 92.308, 2: 91.781, 3: 100.0},
+ 'length': {0: 75.0, 1: 78.0, 2: 73.0, 3: 80.0},
+ 'mismatch': {0: 0.0, 1: 2.0, 2: 4.0, 3: 0.0},
+ 'gapopen': {0: 0.0, 1: 4.0, 2: 2.0, 3: 0.0},
+ 'qstart': {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0},
+ 'qend': {0: 75.0, 1: 75.0, 2: 71.0, 3: 80.0},
+ 'sstart': {0: 24.0, 1: 24.0, 2: 24.0, 3: 32.0},
+ 'send': {0: 98.0, 1: 100.0, 2: 96.0, 3: 111.0},
+ 'evalue': {0: 8.35e-36, 1: 2.36e-26, 2: 3.94e-24,
+ 3: 1.5000000000000002e-38},
+ 'bitscore': {0: 139.0, 1: 108.0, 2: 100.0, 3: 148.0}})
+ pdt.assert_frame_equal(result.view(pd.DataFrame), exp)
+
+ def test_vsearch_global(self):
+ result, = qfc.actions.vsearch_global(
+ self.query, self.ref, maxaccepts=3, perc_identity=0.9)
+ exp = pd.DataFrame({
+ 'qseqid': {0: '1111561', 1: '835097', 2: 'junk'},
+ 'sseqid': {0: '1111561', 1: '835097', 2: '*'},
+ 'pident': {0: 100.0, 1: 100.0, 2: 0.0},
+ 'length': {0: 75.0, 1: 80.0, 2: 0.0},
+ 'mismatch': {0: 0.0, 1: 0.0, 2: 0.0},
+ 'gapopen': {0: 0.0, 1: 0.0, 2: 0.0},
+ 'qstart': {0: 1.0, 1: 1.0, 2: 0.0},
+ 'qend': {0: 75.0, 1: 80.0, 2: 0.0},
+ 'sstart': {0: 1.0, 1: 1.0, 2: 0.0},
+ 'send': {0: 150.0, 1: 150.0, 2: 0.0},
+ 'evalue': {0: -1.0, 1: -1.0, 2: -1.0},
+ 'bitscore': {0: 0.0, 1: 0.0, 2: 0.0}})
+ pdt.assert_frame_equal(
+ result.view(pd.DataFrame), exp, check_names=False)
+
+ def test_vsearch_global_no_output_no_hits(self):
+ result, = qfc.actions.vsearch_global(
+ self.query, self.ref, maxaccepts=3, perc_identity=0.9,
+ output_no_hits=False)
+ exp = pd.DataFrame({
+ 'qseqid': {0: '1111561', 1: '835097'},
+ 'sseqid': {0: '1111561', 1: '835097'},
+ 'pident': {0: 100.0, 1: 100.0},
+ 'length': {0: 75.0, 1: 80.0},
+ 'mismatch': {0: 0.0, 1: 0.0},
+ 'gapopen': {0: 0.0, 1: 0.0},
+ 'qstart': {0: 1.0, 1: 1.0},
+ 'qend': {0: 75.0, 1: 80.0},
+ 'sstart': {0: 1.0, 1: 1.0},
+ 'send': {0: 150.0, 1: 150.0},
+ 'evalue': {0: -1.0, 1: -1.0},
+ 'bitscore': {0: 0.0, 1: 0.0}})
+ pdt.assert_frame_equal(
+ result.view(pd.DataFrame), exp, check_names=False)
+
+ def test_vsearch_global_permissive(self):
+ result, = qfc.actions.vsearch_global(
+ self.query, self.ref, maxaccepts=1, perc_identity=0.8,
+ query_cov=0.2)
+ exp = pd.DataFrame({
+ 'qseqid': {0: '1111561', 1: '835097', 2: 'junk'},
+ 'sseqid': {0: '1111561', 1: '835097', 2: '4314518'},
+ 'pident': {0: 100.0, 1: 100.0, 2: 90.0},
+ 'length': {0: 75.0, 1: 80.0, 2: 20.0},
+ 'mismatch': {0: 0.0, 1: 0.0, 2: 2.0},
+ 'gapopen': {0: 0.0, 1: 0.0, 2: 0.0},
+ 'qstart': {0: 1.0, 1: 1.0, 2: 1.0},
+ 'qend': {0: 75.0, 1: 80.0, 2: 100.0},
+ 'sstart': {0: 1.0, 1: 1.0, 2: 1.0},
+ 'send': {0: 150.0, 1: 150.0, 2: 95.0},
+ 'evalue': {0: -1.0, 1: -1.0, 2: -1.0},
+ 'bitscore': {0: 0.0, 1: 0.0, 2: 0.0}})
+ pdt.assert_frame_equal(
+ result.view(pd.DataFrame), exp, check_names=False)
+
+
+# setting up utility test for comparing series below
+def series_is_subset(expected, observed):
+ # join observed and expected results to compare
+ joined = pd.concat([expected, observed], axis=1, join='inner')
+ # check that all observed results are at least a substring of expected
+ # (this should usually be the case, unless if consensus classification
+ # did very badly, e.g., resulting in unclassified)
+ compared = joined.apply(lambda x: x[0].startswith(x[1]), axis=1)
+ # in the original tests we set a threshold of 50% for subsets... most
+ # should be but in some cases misclassification could occur, or dodgy
+ # annotations that screw up the LCA. So just check that we have at least
+ # as many TRUE as FALSE.
+ return len(compared[compared]) >= len(compared[~compared])
- def test_vsearch(self):
- result = classify_consensus_vsearch(self.reads, self.reads,
- self.taxonomy)
- res = result.Taxon.to_dict()
- tax = self.taxonomy.to_dict()
- right = 0.
- for taxon in res:
- right += tax[taxon].startswith(res[taxon])
- self.assertGreater(right/len(res), 0.5)
- def test_vsearch_search_exact(self):
- result = classify_consensus_vsearch(self.reads, self.reads,
- self.taxonomy, search_exact=True)
- res = result.Taxon.to_dict()
- tax = self.taxonomy.to_dict()
- right = 0.
- for taxon in res:
- right += tax[taxon].startswith(res[taxon])
- self.assertGreater(right/len(res), 0.5)
+class ConsensusAssignmentsTests(FeatureClassifierTestPluginBase):
- def test_vsearch_top_hits_only(self):
- result = classify_consensus_vsearch(self.reads, self.reads,
- self.taxonomy, top_hits_only=True)
- res = result.Taxon.to_dict()
- tax = self.taxonomy.to_dict()
- right = 0.
- for taxon in res:
- right += tax[taxon].startswith(res[taxon])
- self.assertGreater(right/len(res), 0.5)
+ def setUp(self):
+ super().setUp()
+ self.taxonomy = Artifact.import_data(
+ 'FeatureData[Taxonomy]', self.get_data_path('taxonomy.tsv'))
+ self.reads = Artifact.import_data(
+ 'FeatureData[Sequence]',
+ self.get_data_path('se-dna-sequences.fasta'))
+ self.exp = self.taxonomy.view(pd.Series)
+
+ # Make sure blast and vsearch produce expected outputs
+ # but there is no "right" taxonomy assignment.
+ # TODO: the results should be deterministic, so we should check expected
+ # search and/or taxonomy classification outputs.
+ def test_classify_consensus_blast(self):
+ result, _, = qfc.actions.classify_consensus_blast(
+ self.reads, self.reads, self.taxonomy)
+ self.assertTrue(series_is_subset(self.exp, result.view(pd.Series)))
+
+ def test_classify_consensus_vsearch(self):
+ result, _, = qfc.actions.classify_consensus_vsearch(
+ self.reads, self.reads, self.taxonomy)
+ self.assertTrue(series_is_subset(self.exp, result.view(pd.Series)))
+
+ # search_exact with all other exposed params to confirm compatibility
+ # in future releases of vsearch
+ def test_classify_consensus_vsearch_search_exact(self):
+ result, _, = qfc.actions.classify_consensus_vsearch(
+ self.reads, self.reads, self.taxonomy, search_exact=True,
+ top_hits_only=True, output_no_hits=True, weak_id=0.9, maxhits=10)
+ self.assertTrue(series_is_subset(self.exp, result.view(pd.Series)))
+
+ def test_classify_consensus_vsearch_top_hits_only(self):
+ result, _, = qfc.actions.classify_consensus_vsearch(
+ self.reads, self.reads, self.taxonomy, top_hits_only=True)
+ self.assertTrue(series_is_subset(self.exp, result.view(pd.Series)))
# make sure weak_id and other parameters do not conflict with each other.
# This test just makes sure the command runs okay with all options.
# We are not in the business of debugging VSEARCH, but want to have this
# test as a canary in the coal mine.
- def test_vsearch_the_works(self):
- result = classify_consensus_vsearch(self.reads, self.reads,
- self.taxonomy, top_hits_only=True,
- maxhits=1, maxrejects=10,
- weak_id=0.8, perc_identity=0.99,
- output_no_hits=False)
- res = result.Taxon.to_dict()
- tax = self.taxonomy.to_dict()
- right = 0.
- for taxon in res:
- right += tax[taxon].startswith(res[taxon])
- self.assertGreater(right/len(res), 0.5)
+ def test_classify_consensus_vsearch_the_works(self):
+ result, _, = qfc.actions.classify_consensus_vsearch(
+ self.reads, self.reads, self.taxonomy, top_hits_only=True,
+ maxhits=1, maxrejects=10, weak_id=0.8, perc_identity=0.99,
+ output_no_hits=False)
+ self.assertTrue(series_is_subset(self.exp, result.view(pd.Series)))
class HybridClassiferTests(FeatureClassifierTestPluginBase):
- package = 'q2_feature_classifier.tests'
def setUp(self):
super().setUp()
@@ -110,7 +211,7 @@ class HybridClassiferTests(FeatureClassifierTestPluginBase):
reads = DNAFASTAFormat(reads_fp, mode='r')
self.reads = Artifact.import_data('FeatureData[Sequence]', reads)
- fitter = getattr(feature_classifier.methods,
+ fitter = getattr(qfc.methods,
'fit_classifier_' + _specific_fitters[0][0])
self.classifier = fitter(self.reads, self.taxartifact).classifier
@@ -124,11 +225,11 @@ class HybridClassiferTests(FeatureClassifierTestPluginBase):
def test_classify_hybrid_vsearch_sklearn_all_exact_match(self):
- result, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
+ result, = qfc.actions.classify_hybrid_vsearch_sklearn(
query=self.reads, reference_reads=self.reads,
reference_taxonomy=self.taxartifact, classifier=self.classifier,
prefilter=False)
- result, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
+ result, = qfc.actions.classify_hybrid_vsearch_sklearn(
query=self.reads, reference_reads=self.reads,
reference_taxonomy=self.taxartifact, classifier=self.classifier)
result = result.view(pd.DataFrame)
@@ -141,7 +242,7 @@ class HybridClassiferTests(FeatureClassifierTestPluginBase):
def test_classify_hybrid_vsearch_sklearn_mixed_query(self):
- result, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
+ result, = qfc.actions.classify_hybrid_vsearch_sklearn(
query=self.query, reference_reads=self.reads,
reference_taxonomy=self.taxartifact, classifier=self.classifier,
prefilter=True, read_orientation='same', randseed=1001)
@@ -159,80 +260,104 @@ class HybridClassiferTests(FeatureClassifierTestPluginBase):
class ImportBlastAssignmentTests(FeatureClassifierTestPluginBase):
- def test_import_blast_format_assignments(self):
- in_ = ['# This is a blast comment line',
- 's1\t111\t100.000\t1428\t0\t0\t1\t1428\t1\t1428\t0.0\t2638',
- 's1\t112\t100.000\t1479\t0\t0\t1\t1479\t1\t1479\t0.0\t2732',
- 's2\t113\t100.000\t1336\t0\t0\t1\t1336\t1\t1336\t0.0\t2468',
- 's2\t114\t100.000\t1402\t0\t0\t1\t1402\t1\t1402\t0.0\t2582']
- ref = {'111': 'Aa;Bb;Cc',
- '112': 'Aa;Bb;Cc',
- '113': 'Aa;Dd;Ee',
- '114': 'Aa;Dd;Ff'}
- ref = pd.Series(ref)
- obs = _import_blast_format_assignments(in_, ref)
- exp = {'s1': [['Aa', 'Bb', 'Cc'], ['Aa', 'Bb', 'Cc']],
- 's2': [['Aa', 'Dd', 'Ee'], ['Aa', 'Dd', 'Ff']]}
- self.assertEqual(obs, exp)
-
-
-# This code has been ported from QIIME 1.9.1 with permission from @gregcaporaso
+ def setUp(self):
+ super().setUp()
+ result = Artifact.import_data(
+ 'FeatureData[BLAST6]', self.get_data_path('blast6-format.tsv'))
+ self.result = result.view(pd.DataFrame)
+ taxonomy = Artifact.import_data(
+ 'FeatureData[Taxonomy]', self.get_data_path('taxonomy.tsv'))
+ self.taxonomy = taxonomy.view(pd.Series)
+
+ def test_blast6format_df_to_series_of_lists(self):
+ # and add in a query without any hits, to check that it is parsed
+ self.result.loc[3] = ['junk', '*'] + [''] * 10
+ obs = _blast6format_df_to_series_of_lists(self.result, self.taxonomy)
+ exp = pd.Series(
+ {'1111561': [
+ 'k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; '
+ 'o__Legionellales; f__; g__; s__',
+ 'k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; '
+ 'o__Legionellales; f__Coxiellaceae; g__; s__'],
+ '835097': [
+ 'k__Bacteria; p__Chloroflexi; c__SAR202; o__; f__; g__; s__'],
+ 'junk': ['Unassigned']},
+ name='sseqid')
+ exp.index.name = 'qseqid'
+ pdt.assert_series_equal(exp, obs)
+
+ # should fail when hit IDs are missing from reference taxonomy
+ # in this case 1128818 is missing
+ def test_blast6format_df_to_series_of_lists_fail_on_missing_ids(self):
+ # add a bad idea
+ self.result.loc[3] = ['junk', 'lost-id'] + [''] * 10
+ with self.assertRaisesRegex(KeyError, "results do not match.*lost-id"):
+ _blast6format_df_to_series_of_lists(self.result, self.taxonomy)
+
+
class ConsensusAnnotationTests(FeatureClassifierTestPluginBase):
+ def test_taxa_to_cumulative_ranks(self):
+ taxa = ['a;b;c', 'a;b;d', 'a;g;g']
+ exp = [['a', 'a;b', 'a;b;c'], ['a', 'a;b', 'a;b;d'],
+ ['a', 'a;g', 'a;g;g']]
+ self.assertEqual(_taxa_to_cumulative_ranks(taxa), exp)
+
+ def test_taxa_to_cumulative_ranks_with_uneven_ranks(self):
+ taxa = ['a;b;c', 'a;b;d', 'a;g;g;somemoregarbage']
+ exp = [['a', 'a;b', 'a;b;c'], ['a', 'a;b', 'a;b;d'],
+ ['a', 'a;g', 'a;g;g', 'a;g;g;somemoregarbage']]
+ self.assertEqual(_taxa_to_cumulative_ranks(taxa), exp)
+
+ def test_taxa_to_cumulative_ranks_with_one_entry(self):
+ taxa = ['a;b;c']
+ exp = [['a', 'a;b', 'a;b;c']]
+ self.assertEqual(_taxa_to_cumulative_ranks(taxa), exp)
+
+ def test_taxa_to_cumulative_ranks_with_empty_list(self):
+ taxa = ['']
+ exp = [['']]
+ self.assertEqual(_taxa_to_cumulative_ranks(taxa), exp)
+
def test_varied_min_fraction(self):
- in_ = [['Ab', 'Bc', 'De'],
- ['Ab', 'Bc', 'Fg', 'Hi'],
- ['Ab', 'Bc', 'Fg', 'Jk']]
+ in_ = [['Ab', 'Ab;Bc', 'Ab;Bc;De'],
+ ['Ab', 'Ab;Bc', 'Ab;Bc;Fg', 'Ab;Bc;Fg;Hi'],
+ ['Ab', 'Ab;Bc', 'Ab;Bc;Fg', 'Ab;Bc;Fg;Jk']]
- actual = _compute_consensus_annotation(in_, 0.51, "Unassigned")
- expected = (['Ab', 'Bc', 'Fg'], 2. / 3.)
+ actual = _lca_consensus(in_, 0.51, "Unassigned")
+ expected = ('Ab;Bc;Fg', 0.667)
self.assertEqual(actual, expected)
# increased min_consensus_fraction yields decreased specificity
- in_ = [['Ab', 'Bc', 'De'],
- ['Ab', 'Bc', 'Fg', 'Hi'],
- ['Ab', 'Bc', 'Fg', 'Jk']]
-
- actual = _compute_consensus_annotation(in_, 0.99, "Unassigned")
- expected = (['Ab', 'Bc'], 1.0)
+ actual = _lca_consensus(in_, 0.99, "Unassigned")
+ expected = ('Ab;Bc', 1.0)
self.assertEqual(actual, expected)
def test_single_annotation(self):
- in_ = [['Ab', 'Bc', 'De']]
+ in_ = [['Ab', 'Ab;Bc', 'Ab;Bc;De']]
- actual = _compute_consensus_annotation(in_, 1.0, "Unassigned")
- expected = (['Ab', 'Bc', 'De'], 1.0)
+ actual = _lca_consensus(in_, 1.0, "Unassigned")
+ expected = ('Ab;Bc;De', 1.0)
self.assertEqual(actual, expected)
- actual = _compute_consensus_annotation(in_, 0.501, "Unassigned")
- expected = (['Ab', 'Bc', 'De'], 1.0)
+ actual = _lca_consensus(in_, 0.501, "Unassigned")
+ expected = ('Ab;Bc;De', 1.0)
self.assertEqual(actual, expected)
def test_no_consensus(self):
- in_ = [['Ab', 'Bc', 'De'],
- ['Cd', 'Bc', 'Fg', 'Hi'],
- ['Ef', 'Bc', 'Fg', 'Jk']]
+ in_ = [['Ab', 'Ab;Bc', 'Ab;Bc;De'],
+ ['Cd', 'Cd;Bc', 'Cd;Bc;Fg', 'Cd;Bc;Fg;Hi'],
+ ['Ef', 'Ef;Bc', 'Ef;Bc;Fg', 'Ef;Bc;Fg;Jk']]
- actual = _compute_consensus_annotation(in_, 0.51, "Unassigned")
- expected = (['Unassigned'], 1.)
+ actual = _lca_consensus(in_, 0.51, "Unassigned")
+ expected = ('Unassigned', 0.)
self.assertEqual(actual, expected)
- actual = _compute_consensus_annotation(
+ actual = _lca_consensus(
in_, 0.51, unassignable_label="Hello world!")
- expected = (['Hello world!'], 1.)
+ expected = ('Hello world!', 0.)
self.assertEqual(actual, expected)
- def test_invalid_min_consensus_fraction(self):
- in_ = [['Ab', 'Bc', 'De'],
- ['Ab', 'Bc', 'Fg', 'Hi'],
- ['Ab', 'Bc', 'Fg', 'Jk']]
- self.assertRaises(ValueError, _compute_consensus_annotation, in_,
- 0.50, "Unassigned")
- self.assertRaises(ValueError, _compute_consensus_annotation, in_,
- 0.00, "Unassigned")
- self.assertRaises(ValueError, _compute_consensus_annotation, in_,
- -0.1, "Unassigned")
-
def test_overlapping_names(self):
# here the 3rd level is different, but the 4th level is the same
# across the three assignments. this can happen in practice if
@@ -240,11 +365,11 @@ class ConsensusAnnotationTests(FeatureClassifierTestPluginBase):
# an unnamed species
# (e.g., f__x;g__A;s__, f__x;g__B;s__, f__x;g__B;s__)
# in this case, the assignment should be f__x.
- in_ = [['Ab', 'Bc', 'De', 'Jk'],
- ['Ab', 'Bc', 'Fg', 'Jk'],
- ['Ab', 'Bc', 'Hi', 'Jk']]
- actual = _compute_consensus_annotation(in_, 0.51, "Unassigned")
- expected = (['Ab', 'Bc'], 1.)
+ in_ = [['Ab', 'Ab;Bc', 'Ab;Bc;De', 'Ab;Bc;De;Jk'],
+ ['Ab', 'Ab;Bc', 'Ab;Bc;Fg', 'Ab;Bc;Fg;Jk'],
+ ['Ab', 'Ab;Bc', 'Ab;Bc;Hi', 'Ab;Bc;Hi;Jk']]
+ actual = _lca_consensus(in_, 0.51, "Unassigned")
+ expected = ('Ab;Bc', 1.)
self.assertEqual(actual, expected)
# here the third level is the same in 4/5 of the
@@ -252,94 +377,83 @@ class ConsensusAnnotationTests(FeatureClassifierTestPluginBase):
# different taxa since the higher levels are different.
# the consensus value should be 3/5, not 4/5, to
# reflect that.
- in_ = [['a', 'b', 'c'],
- ['a', 'd', 'e'],
- ['a', 'b', 'c'],
- ['a', 'b', 'c'],
- ['z', 'y', 'c']]
- actual = _compute_consensus_annotation(in_, 0.51, "Unassigned")
- expected = (['a', 'b', 'c'], 0.6)
+ in_ = [['a', 'a;b', 'a;b;c'],
+ ['a', 'a;d', 'a;d;e'],
+ ['a', 'a;b', 'a;b;c'],
+ ['a', 'a;b', 'a;b;c'],
+ ['z', 'z;y', 'z;y;c']]
+ actual = _lca_consensus(in_, 0.51, "Unassigned")
+ expected = ('a;b;c', 0.6)
self.assertEqual(actual, expected)
def test_adjusts_resolution(self):
# max result depth is that of shallowest assignment
- in_ = [['Ab', 'Bc', 'Fg'],
- ['Ab', 'Bc', 'Fg', 'Hi'],
- ['Ab', 'Bc', 'Fg', 'Hi'],
- ['Ab', 'Bc', 'Fg', 'Hi'],
- ['Ab', 'Bc', 'Fg', 'Hi', 'Jk']]
-
- actual = _compute_consensus_annotation(in_, 0.51, "Unassigned")
- expected = (['Ab', 'Bc', 'Fg'], 1.0)
+ # Reading this test now, I am not entirely sure that this is how
+ # such cases should be handled. Technically such a case should not
+ # arise (as the dbs should have even ranks) so we can leave this for
+ # now, and it is arguable, but in this case I think that majority
+ # should rule. We use `zip` but might want to consider `zip_longest`.
+ in_ = [['Ab', 'Ab;Bc', 'Ab;Bc;Fg'],
+ ['Ab', 'Ab;Bc', 'Ab;Bc;Fg', 'Ab;Bc;Fg;Hi'],
+ ['Ab', 'Ab;Bc', 'Ab;Bc;Fg', 'Ab;Bc;Fg;Hi'],
+ ['Ab', 'Ab;Bc', 'Ab;Bc;Fg', 'Ab;Bc;Fg;Hi'],
+ ['Ab', 'Ab;Bc', 'Ab;Bc;Fg', 'Ab;Bc;Fg;Hi', 'Ab;Bc;Fg;Hi;Jk']]
+
+ actual = _lca_consensus(in_, 0.51, "Unassigned")
+ expected = ('Ab;Bc;Fg', 1.0)
self.assertEqual(actual, expected)
- in_ = [['Ab', 'Bc', 'Fg'],
- ['Ab', 'Bc', 'Fg', 'Hi', 'Jk'],
- ['Ab', 'Bc', 'Fg', 'Hi', 'Jk'],
- ['Ab', 'Bc', 'Fg', 'Hi', 'Jk'],
- ['Ab', 'Bc', 'Fg', 'Hi', 'Jk']]
+ in_ = [['Ab', 'Ab;Bc', 'Ab;Bc;Fg'],
+ ['Ab', 'Ab;Bc', 'Ab;Bc;Fg', 'Ab;Bc;Fg;Hi', 'Ab;Bc;Fg;Hi;Jk'],
+ ['Ab', 'Ab;Bc', 'Ab;Bc;Fg', 'Ab;Bc;Fg;Hi', 'Ab;Bc;Fg;Hi;Jk'],
+ ['Ab', 'Ab;Bc', 'Ab;Bc;Fg', 'Ab;Bc;Fg;Hi', 'Ab;Bc;Fg;Hi;Jk'],
+ ['Ab', 'Ab;Bc', 'Ab;Bc;Fg', 'Ab;Bc;Fg;Hi', 'Ab;Bc;Fg;Hi;Jk']]
- actual = _compute_consensus_annotation(in_, 0.51, "Unassigned")
- expected = (['Ab', 'Bc', 'Fg'], 1.0)
+ actual = _lca_consensus(in_, 0.51, "Unassigned")
+ expected = ('Ab;Bc;Fg', 1.0)
self.assertEqual(actual, expected)
-# This code has been ported from QIIME 1.9.1 with permission from @gregcaporaso
+# More edge cases are tested for the internals above, so the tests here are
+# made slim to just test the overarching functions.
class ConsensusAnnotationsTests(FeatureClassifierTestPluginBase):
def test_varied_fraction(self):
- in_ = {'q1': [['A', 'B', 'C', 'D'],
- ['A', 'B', 'C', 'E']],
- 'q2': [['A', 'H', 'I', 'J'],
- ['A', 'H', 'K', 'L', 'M'],
- ['A', 'H', 'I', 'J']],
- 'q3': [[]],
- 'q4': [[]],
- 'q5': [[]]}
- expected = {'q1': ('A;B;C', 1.0),
- 'q2': ('A;H;I;J', 2. / 3.),
- 'q3': ('Unassigned', 1.0),
- 'q4': ('Unassigned', 1.0),
- 'q5': ('Unassigned', 1.0)}
- actual = _compute_consensus_annotations(in_, 0.51)
- self.assertEqual(actual, expected)
-
- expected = {'q1': ('A;B;C', 1.0),
- 'q2': ('A;H', 1.0),
- 'q3': ('Unassigned', 1.0),
- 'q4': ('Unassigned', 1.0),
- 'q5': ('Unassigned', 1.0)}
- actual = _compute_consensus_annotations(in_, 0.99)
- self.assertEqual(actual, expected)
-
- def test_varied_label(self):
- in_ = {'q1': [['A', 'B', 'C', 'D'],
- ['A', 'B', 'C', 'E']],
- 'q2': [['A', 'H', 'I', 'J'],
- ['A', 'H', 'K', 'L', 'M'],
- ['A', 'H', 'I', 'J']],
- 'q3': [[]],
- 'q4': [[]],
- 'q5': [[]]}
- expected = {'q1': ('A;B;C', 1.0),
- 'q2': ('A;H;I;J', 2. / 3.),
- 'q3': ('x', 1.0),
- 'q4': ('x', 1.0),
- 'q5': ('x', 1.0)}
- actual = _compute_consensus_annotations(in_, 0.51, "x")
- self.assertEqual(actual, expected)
-
-
-class OutputNoHitsTests(FeatureClassifierTestPluginBase):
-
- def test_output_no_hits(self):
- exp = ['>A111', 'ACGTGTGATCGA',
- '>A112', 'ACTGTCATGTGA',
- '>A113', 'ACTGTGTCGTGA']
- obs = {'A111': ('A;B;C;D', 1.0)}
- res = {'A111': ('A;B;C;D', 1.0),
- 'A112': ('Unassigned', 0.0),
- 'A113': ('Unassigned', 0.0)}
- consensus = _output_no_hits(obs, exp)
- self.assertEqual(consensus, res)
+ in_ = pd.Series({'q1': ['A;B;C;D', 'A;B;C;E'],
+ 'q2': ['A;H;I;J', 'A;H;K;L;M', 'A;H;I;J'],
+ 'q3': ['A', 'A', 'B'],
+ 'q4': ['A', 'B'],
+ 'q5': []})
+ expected = pd.DataFrame({
+ 'Taxon': {'q1': 'A;B;C', 'q2': 'A;H;I;J', 'q3': 'A',
+ 'q4': 'Unassigned', 'q5': 'Unassigned'},
+ 'Consensus': {
+ 'q1': 1.0, 'q2': 0.667, 'q3': 0.667, 'q4': 0.0, 'q5': 0.0}})
+ actual = _compute_consensus_annotations(in_, 0.51, 'Unassigned')
+ pdt.assert_frame_equal(actual, expected, check_names=False)
+
+ expected = pd.DataFrame({
+ 'Taxon': {'q1': 'A;B;C', 'q2': 'A;H', 'q3': 'Unassigned',
+ 'q4': 'Unassigned', 'q5': 'Unassigned'},
+ 'Consensus': {
+ 'q1': 1.0, 'q2': 1.0, 'q3': 0.0, 'q4': 0.0, 'q5': 0.0}})
+ actual = _compute_consensus_annotations(in_, 0.99, 'Unassigned')
+ pdt.assert_frame_equal(actual, expected, check_names=False)
+
+ def test_find_consensus_annotation(self):
+
+ result = Artifact.import_data(
+ 'FeatureData[BLAST6]', self.get_data_path('blast6-format.tsv'))
+ taxonomy = Artifact.import_data(
+ 'FeatureData[Taxonomy]', self.get_data_path('taxonomy.tsv'))
+ consensus, = qfc.actions.find_consensus_annotation(result, taxonomy)
+ obs = consensus.view(pd.DataFrame)
+ exp = pd.DataFrame(
+ {'Taxon': {
+ '1111561': 'k__Bacteria; p__Proteobacteria; '
+ 'c__Gammaproteobacteria; o__Legionellales',
+ '835097': 'k__Bacteria; p__Chloroflexi; c__SAR202; o__; f__; '
+ 'g__; s__'},
+ 'Consensus': {'1111561': '1.0', '835097': '1.0'}})
+ pdt.assert_frame_equal(exp, obs, check_names=False)
View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/-/compare/88a0614bc38aa20c23df6812312cf2276a73f5ea...e89696705d45605bd2fcceb5fc2f2ad17fee59a6
--
View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/-/compare/88a0614bc38aa20c23df6812312cf2276a73f5ea...e89696705d45605bd2fcceb5fc2f2ad17fee59a6
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220907/f6be160d/attachment-0001.htm>
More information about the debian-med-commit
mailing list