[med-svn] [Git][med-team/q2-feature-classifier][upstream] New upstream version 2019.7.0
Liubov Chuprikova
gitlab at salsa.debian.org
Sat Sep 7 12:43:11 BST 2019
Liubov Chuprikova pushed to branch upstream at Debian Med / q2-feature-classifier
Commits:
ce3f2d71 by Liubov Chuprikova at 2019-09-07T11:40:49Z
New upstream version 2019.7.0
- - - - -
11 changed files:
- ci/recipe/meta.yaml
- q2_feature_classifier/_consensus_assignment.py
- q2_feature_classifier/_skl.py
- q2_feature_classifier/_taxonomic_classifier.py
- q2_feature_classifier/_version.py
- q2_feature_classifier/_vsearch.py
- q2_feature_classifier/classifier.py
- q2_feature_classifier/tests/data/class_weight.biom
- q2_feature_classifier/tests/test_classifier.py
- q2_feature_classifier/tests/test_consensus_assignment.py
- q2_feature_classifier/tests/test_taxonomic_classifier.py
Changes:
=====================================
ci/recipe/meta.yaml
=====================================
@@ -19,7 +19,8 @@ requirements:
run:
- python {{ python }}
- - scikit-learn 0.20.2
+ - scikit-learn 0.21.2
+ - joblib
- scikit-bio
- biom-format >=2.1.5,<2.2.0
- blast >=2.6.0
@@ -27,6 +28,9 @@ requirements:
- vsearch <=2.7.0
- qiime2 {{ release }}.*
- q2-types {{ release }}.*
+ - q2-quality-control {{ release }}.*
+ - q2-taxa {{ release }}.*
+ - q2-feature-table {{ release }}.*
test:
imports:
=====================================
q2_feature_classifier/_consensus_assignment.py
=====================================
@@ -11,6 +11,7 @@ import subprocess
import pandas as pd
from os.path import isfile
from collections import Counter, defaultdict
+import qiime2
def _get_default_unassignable_label():
@@ -40,7 +41,7 @@ def _consensus_assignments(
consensus = {'': ('', '')}
result = pd.DataFrame.from_dict(consensus, 'index')
result.index.name = 'Feature ID'
- result.columns = ['Taxon', 'Confidence']
+ result.columns = ['Taxon', 'Consensus']
return result
@@ -242,3 +243,9 @@ def _compute_consensus_annotation(annotations, min_consensus,
consensus_fraction_result = 1.0
return annotation, consensus_fraction_result
+
+
+def _annotate_method(taxa, method):
+ taxa = taxa.view(pd.DataFrame)
+ taxa['Method'] = method
+ return qiime2.Artifact.import_data('FeatureData[Taxonomy]', taxa)
=====================================
q2_feature_classifier/_skl.py
=====================================
@@ -9,7 +9,7 @@
from itertools import islice, repeat
from copy import deepcopy
-from sklearn.externals.joblib import Parallel, delayed
+from joblib import Parallel, delayed
_specific_fitters = [
['naive_bayes',
@@ -38,7 +38,7 @@ def _extract_reads(reads):
def predict(reads, pipeline, separator=';', chunk_size=262144, n_jobs=1,
- pre_dispatch='2*n_jobs', confidence=-1.):
+ pre_dispatch='2*n_jobs', confidence='disable'):
return (m for c in Parallel(n_jobs=n_jobs, batch_size=1,
pre_dispatch=pre_dispatch)
(delayed(_predict_chunk)(pipeline, separator, confidence, chunk)
@@ -46,7 +46,7 @@ def predict(reads, pipeline, separator=';', chunk_size=262144, n_jobs=1,
def _predict_chunk(pipeline, separator, confidence, chunk):
- if confidence < 0.:
+ if confidence == 'disable':
return _predict_chunk_without_conf(pipeline, chunk)
else:
return _predict_chunk_with_conf(pipeline, separator, confidence, chunk)
=====================================
q2_feature_classifier/_taxonomic_classifier.py
=====================================
@@ -11,7 +11,7 @@ import tarfile
import os
import sklearn
-from sklearn.externals import joblib
+import joblib
from sklearn.pipeline import Pipeline
import qiime2.plugin
import qiime2.plugin.model as model
=====================================
q2_feature_classifier/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2019.4.0)"
- git_full = "e2ba97ea232a02f4c489b51ff0481583d394a339"
- git_date = "2019-05-03 04:14:49 +0000"
+ git_refnames = " (tag: 2019.7.0)"
+ git_full = "bc7ed3a741779ccb32114a229af65cc8c1510743"
+ git_date = "2019-07-30 18:15:57 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
q2_feature_classifier/_vsearch.py
=====================================
@@ -6,13 +6,19 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
+import tempfile
import pandas as pd
+import qiime2
+
from q2_types.feature_data import (
FeatureData, Taxonomy, Sequence, DNAFASTAFormat)
from .plugin_setup import plugin, citations
-from qiime2.plugin import Int, Str, Float, Choices, Range
-from ._consensus_assignment import (_consensus_assignments,
- _get_default_unassignable_label)
+from qiime2.plugin import Int, Str, Float, Choices, Range, Bool
+from ._consensus_assignment import (_consensus_assignments, _run_command,
+ _get_default_unassignable_label,
+ _annotate_method)
+from ._taxonomic_classifier import TaxonomicClassifier
+from .classifier import _classify_parameters, _parameter_descriptions
def classify_consensus_vsearch(query: DNAFASTAFormat,
@@ -25,53 +31,164 @@ def classify_consensus_vsearch(query: DNAFASTAFormat,
min_consensus: float = 0.51,
unassignable_label: str =
_get_default_unassignable_label(),
+ search_exact: bool = False,
+ top_hits_only: bool = False,
threads: str = 1) -> pd.DataFrame:
seqs_fp = str(query)
ref_fp = str(reference_reads)
+ if maxaccepts == 'all':
+ maxaccepts = 0
cmd = ['vsearch', '--usearch_global', seqs_fp, '--id', str(perc_identity),
'--query_cov', str(query_cov), '--strand', strand, '--maxaccepts',
str(maxaccepts), '--maxrejects', '0', '--output_no_hits', '--db',
- ref_fp, '--threads', str(threads), '--blast6out']
+ ref_fp, '--threads', str(threads)]
+ if search_exact:
+ cmd[1] = '--search_exact'
+ if top_hits_only:
+ cmd.append('--top_hits_only')
+ cmd.append('--blast6out')
consensus = _consensus_assignments(
cmd, reference_taxonomy, min_consensus=min_consensus,
unassignable_label=unassignable_label)
return consensus
+def classify_hybrid_vsearch_sklearn(ctx,
+ query,
+ reference_reads,
+ reference_taxonomy,
+ classifier,
+ maxaccepts=10,
+ perc_identity=0.5,
+ query_cov=0.8,
+ strand='both',
+ min_consensus=0.51,
+ reads_per_batch=0,
+ confidence=0.7,
+ read_orientation='auto',
+ threads=1,
+ prefilter=True,
+ sample_size=1000,
+ randseed=0):
+ exclude = ctx.get_action('quality_control', 'exclude_seqs')
+ ccv = ctx.get_action('feature_classifier', 'classify_consensus_vsearch')
+ cs = ctx.get_action('feature_classifier', 'classify_sklearn')
+ filter_seqs = ctx.get_action('taxa', 'filter_seqs')
+ merge = ctx.get_action('feature_table', 'merge_taxa')
+
+ # randomly subsample reference sequences for rough positive filter
+ if prefilter:
+ ref = str(reference_reads.view(DNAFASTAFormat))
+ with tempfile.NamedTemporaryFile() as output:
+ cmd = ['vsearch', '--fastx_subsample', ref, '--sample_size',
+ str(sample_size), '--randseed', str(randseed),
+ '--fastaout', output.name]
+ _run_command(cmd)
+ sparse_reference = qiime2.Artifact.import_data(
+ 'FeatureData[Sequence]', output.name)
+
+ # perform rough positive filter on query sequences
+ query, misses, = exclude(
+ query_sequences=query, reference_sequences=sparse_reference,
+ method='vsearch', perc_identity=perc_identity,
+ perc_query_aligned=query_cov, threads=threads)
+
+ # find exact matches, perform LCA consensus classification
+ taxa1, = ccv(query=query, reference_reads=reference_reads,
+ reference_taxonomy=reference_taxonomy, maxaccepts=maxaccepts,
+ strand=strand, min_consensus=min_consensus,
+ search_exact=True, threads=threads)
+
+ # Annotate taxonomic assignments with classification method
+ taxa1 = _annotate_method(taxa1, 'VSEARCH')
+
+ # perform second pass classification on unassigned taxa
+ # filter out unassigned seqs
+ try:
+ query, = filter_seqs(sequences=query, taxonomy=taxa1,
+ include=_get_default_unassignable_label())
+ except ValueError:
+ # get ValueError if all sequences are filtered out.
+ # so if no sequences are unassigned, return exact match results
+ return taxa1
+
+ # classify with sklearn classifier
+ taxa2, = cs(reads=query, classifier=classifier,
+ reads_per_batch=reads_per_batch, n_jobs=threads,
+ confidence=confidence, read_orientation=read_orientation)
+
+ # Annotate taxonomic assignments with classification method
+ taxa2 = _annotate_method(taxa2, 'sklearn')
+
+ # merge into one big happy result
+ taxa, = merge(data=[taxa2, taxa1])
+ return taxa
+
+
+output_descriptions = {
+ 'classification': 'The resulting taxonomy classifications.'}
+
+parameters = {'maxaccepts': Int % Range(1, None) | Str % Choices(['all']),
+ 'perc_identity': Float % Range(0.0, 1.0, inclusive_end=True),
+ 'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
+ 'strand': Str % Choices(['both', 'plus']),
+ 'min_consensus': Float % Range(0.5, 1.0, inclusive_end=True,
+ inclusive_start=False),
+ 'threads': Int % Range(1, None)}
+
+inputs = {'query': FeatureData[Sequence],
+ 'reference_reads': FeatureData[Sequence],
+ 'reference_taxonomy': FeatureData[Taxonomy]}
+
+input_descriptions = {'query': 'Sequences to classify taxonomically.',
+ 'reference_reads': 'reference sequences.',
+ 'reference_taxonomy': 'reference taxonomy labels.'}
+
+parameter_descriptions = {
+ 'strand': 'Align against reference sequences in forward ("plus") '
+ 'or both directions ("both").',
+ 'maxaccepts': 'Maximum number of hits to keep for each query. Set to '
+ '"all" to keep all hits > perc_identity similarity.',
+ 'perc_identity': 'Reject match if percent identity to query is '
+ 'lower.',
+ 'query_cov': 'Reject match if query alignment coverage per high-'
+ 'scoring pair is lower.',
+ 'min_consensus': 'Minimum fraction of assignments must match top '
+ 'hit to be accepted as consensus assignment.',
+ 'threads': 'Number of threads to use for job parallelization.'}
+
+outputs = [('classification', FeatureData[Taxonomy])]
+
+ignore_prefilter = ' This parameter is ignored if `prefilter` is disabled.'
+
+
plugin.methods.register_function(
function=classify_consensus_vsearch,
- inputs={'query': FeatureData[Sequence],
- 'reference_reads': FeatureData[Sequence],
- 'reference_taxonomy': FeatureData[Taxonomy]},
- parameters={'maxaccepts': Int % Range(0, None),
- 'perc_identity': Float % Range(0.0, 1.0, inclusive_end=True),
- 'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
- 'strand': Str % Choices(['both', 'plus']),
- 'min_consensus': Float % Range(0.5, 1.0, inclusive_end=True,
- inclusive_start=False),
+ inputs=inputs,
+ parameters={**parameters,
'unassignable_label': Str,
- 'threads': Int},
- outputs=[('classification', FeatureData[Taxonomy])],
- input_descriptions={'query': 'Sequences to classify taxonomically.',
- 'reference_reads': 'reference sequences.',
- 'reference_taxonomy': 'reference taxonomy labels.'},
+ 'search_exact': Bool,
+ 'top_hits_only': Bool},
+ outputs=outputs,
+ input_descriptions=input_descriptions,
parameter_descriptions={
- 'strand': ('Align against reference sequences in forward ("plus") '
- 'or both directions ("both").'),
- 'maxaccepts': ('Maximum number of hits to keep for each query. Set to '
- '0 to keep all hits > perc_identity similarity. Must '
- 'be in range [0, infinity].'),
- 'perc_identity': ('Reject match if percent identity to query is '
- 'lower. Must be in range [0.0, 1.0].'),
- 'query_cov': 'Reject match if query alignment coverage per high-'
- 'scoring pair is lower. Must be in range [0.0, 1.0].',
- 'min_consensus': ('Minimum fraction of assignments must match top '
- 'hit to be accepted as consensus assignment. Must '
- 'be in range (0.5, 1.0].')
+ **parameter_descriptions,
+ 'search_exact': 'Search for exact full-length matches to the query '
+ 'sequences. Only 100% exact matches are reported and '
+ 'this command is much faster than the default. If '
+ 'True, the perc_identity and query_cov settings are '
+ 'ignored. Note: query and reference reads must be '
+ 'trimmed to the exact same DNA locus (e.g., primer '
+ 'site) because only exact matches will be reported.',
+ 'top_hits_only': 'Only the top hits between the query and reference '
+ 'sequence sets are reported. For each query, the top '
+ 'hit is the one presenting the highest percentage of '
+ 'identity. Multiple equally scored top hits will be '
+ 'used for consensus taxonomic assignment if '
+ 'maxaccepts is greater than 1.',
},
- output_descriptions={'classification': 'The resulting taxonomy '
- 'classifications.'},
- name='VSEARCH consensus taxonomy classifier',
+ output_descriptions=output_descriptions,
+ name='VSEARCH-based consensus taxonomy classifier',
description=('Assign taxonomy to query sequences using VSEARCH. Performs '
'VSEARCH global alignment between query and reference_reads, '
'then assigns consensus taxonomy to each query sequence from '
@@ -81,3 +198,68 @@ plugin.methods.register_function(
'choosing the top N hits, not the first N hits.'),
citations=[citations['rognes2016vsearch']]
)
+
+
+plugin.pipelines.register_function(
+ function=classify_hybrid_vsearch_sklearn,
+ inputs={**inputs, 'classifier': TaxonomicClassifier},
+ parameters={**parameters,
+ 'reads_per_batch': _classify_parameters['reads_per_batch'],
+ 'confidence': _classify_parameters['confidence'],
+ 'read_orientation': _classify_parameters['read_orientation'],
+ 'prefilter': Bool,
+ 'sample_size': Int % Range(1, None),
+ 'randseed': Int % Range(0, None)},
+ outputs=outputs,
+ input_descriptions={**input_descriptions,
+ 'classifier': 'Pre-trained sklearn taxonomic '
+ 'classifier for classifying the reads.'},
+ parameter_descriptions={
+ **{k: parameter_descriptions[k] for k in [
+ 'strand', 'maxaccepts', 'min_consensus', 'threads']},
+ 'perc_identity': 'Percent sequence similarity to use for PREFILTER. ' +
+ parameter_descriptions['perc_identity'] + ' Set to a '
+ 'lower value to perform a rough pre-filter.' +
+ ignore_prefilter,
+ 'query_cov': 'Query coverage threshold to use for PREFILTER. ' +
+ parameter_descriptions['query_cov'] + ' Set to a '
+ 'lower value to perform a rough pre-filter.' +
+ ignore_prefilter,
+ 'confidence': _parameter_descriptions['confidence'],
+ 'read_orientation': 'Direction of reads with respect to reference '
+ 'sequences in pre-trained sklearn classifier. '
+ 'same will cause reads to be classified unchanged'
+ '; reverse-complement will cause reads to be '
+ 'reversed and complemented prior to '
+ 'classification. "auto" will autodetect '
+ 'orientation based on the confidence estimates '
+ 'for the first 100 reads.',
+ 'reads_per_batch': 'Number of reads to process in each batch for '
+ 'sklearn classification. If "auto", this parameter '
+ 'is autoscaled to min(number of query sequences / '
+ 'threads, 20000).',
+ 'prefilter': 'Toggle positive filter of query sequences on or off.',
+ 'sample_size': 'Randomly extract the given number of sequences from '
+ 'the reference database to use for prefiltering.' +
+ ignore_prefilter,
+ 'randseed': 'Use integer as a seed for the pseudo-random generator '
+ 'used during prefiltering. A given seed always produces '
+ 'the same output, which is useful for replicability. Set '
+ 'to 0 to use a pseudo-random seed.' + ignore_prefilter,
+ },
+ output_descriptions=output_descriptions,
+ name='ALPHA Hybrid classifier: VSEARCH exact match + sklearn classifier',
+ description=('NOTE: THIS PIPELINE IS AN ALPHA RELEASE. Please report bugs '
+ 'to https://forum.qiime2.org!\n'
+ 'Assign taxonomy to query sequences using hybrid classifier. '
+ 'First performs rough positive filter to remove artifact and '
+ 'low-coverage sequences (use "prefilter" parameter to toggle '
+ 'this step on or off). Second, performs VSEARCH exact match '
+ 'between query and reference_reads to find exact matches, '
+ 'followed by least common ancestor consensus taxonomy '
+ 'assignment from among maxaccepts top hits, min_consensus of '
+ 'which share that taxonomic assignment. Query sequences '
+ 'without an exact match are then classified with a pre-'
+ 'trained sklearn taxonomy classifier to predict the most '
+ 'likely taxonomic lineage.'),
+)
=====================================
q2_feature_classifier/classifier.py
=====================================
@@ -23,6 +23,7 @@ import sklearn
from numpy import median, array, ceil
import biom
import skbio
+import joblib
from ._skl import fit_pipeline, predict, _specific_fitters
from ._taxonomic_classifier import TaxonomicClassifier
@@ -182,7 +183,7 @@ def _autotune_reads_per_batch(reads, n_jobs):
raise ValueError("Value other than zero must be specified as number "
"of jobs to run.")
else:
- n_jobs = sklearn.externals.joblib.effective_n_jobs(n_jobs)
+ n_jobs = joblib.effective_n_jobs(n_jobs)
# we really only want to calculate this if running in parallel
if n_jobs != 1:
@@ -200,7 +201,7 @@ def _autotune_reads_per_batch(reads, n_jobs):
def classify_sklearn(reads: DNAFASTAFormat, classifier: Pipeline,
reads_per_batch: int = 0, n_jobs: int = 1,
pre_dispatch: str = '2*n_jobs', confidence: float = 0.7,
- read_orientation: str = None
+ read_orientation: str = 'auto'
) -> pd.DataFrame:
# autotune reads per batch
if reads_per_batch == 0:
@@ -226,9 +227,35 @@ _classify_parameters = {
'reads_per_batch': Int % Range(0, None),
'n_jobs': Int,
'pre_dispatch': Str,
- 'confidence': Float,
- 'read_orientation': Str % Choices(['same', 'reverse-complement'])}
-
+ 'confidence': Float % Range(
+ 0, 1, inclusive_start=True, inclusive_end=True) | Str % Choices(
+ ['disable']),
+ 'read_orientation': Str % Choices(['same', 'reverse-complement', 'auto'])}
+
+_parameter_descriptions = {
+ 'confidence': 'Confidence threshold for limiting '
+ 'taxonomic depth. Set to "disable" to disable '
+ 'confidence calculation, or 0 to calculate '
+ 'confidence but not apply it to limit the '
+ 'taxonomic depth of the assignments.',
+ 'read_orientation': 'Direction of reads with '
+ 'respect to reference sequences. same will cause '
+ 'reads to be classified unchanged; reverse-'
+ 'complement will cause reads to be reversed '
+ 'and complemented prior to classification. '
+ '"auto" will autodetect orientation based on the '
+ 'confidence estimates for the first 100 reads.',
+ 'reads_per_batch': 'Number of reads to process in each batch. If "auto", '
+ 'this parameter is autoscaled to '
+ 'min( number of query sequences / n_jobs, 20000).',
+ 'n_jobs': 'The maximum number of concurrently worker processes. If -1 '
+ 'all CPUs are used. If 1 is given, no parallel computing '
+ 'code is used at all, which is useful for debugging. For '
+ 'n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for '
+ 'n_jobs = -2, all CPUs but one are used.',
+ 'pre_dispatch': '"all" or expression, as in "3*n_jobs". The number of '
+ 'batches (of tasks) to be pre-dispatched.'
+}
plugin.methods.register_function(
function=classify_sklearn,
@@ -242,30 +269,7 @@ plugin.methods.register_function(
'reads': 'The feature data to be classified.',
'classifier': 'The taxonomic classifier for classifying the reads.'
},
- parameter_descriptions={
- 'confidence': 'Confidence threshold for limiting '
- 'taxonomic depth. Provide -1 to disable '
- 'confidence calculation, or 0 to calculate '
- 'confidence but not apply it to limit the '
- 'taxonomic depth of the assignments.',
- 'read_orientation': 'Direction of reads with '
- 'respect to reference sequences. same will cause '
- 'reads to be classified unchanged; reverse-'
- 'complement will cause reads to be reversed '
- 'and complemented prior to classification. '
- 'Default is to autodetect based on the '
- 'confidence estimates for the first 100 reads.',
- 'reads_per_batch': 'Number of reads to process in each batch. If 0, '
- 'this parameter is autoscaled to '
- 'min( number of query sequences / n_jobs, 20000).',
- 'n_jobs': 'The maximum number of concurrently worker processes. If -1 '
- 'all CPUs are used. If 1 is given, no parallel computing '
- 'code is used at all, which is useful for debugging. For '
- 'n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for '
- 'n_jobs = -2, all CPUs but one are used.',
- 'pre_dispatch': '"all" or expression, as in "3*n_jobs". The number of '
- 'batches (of tasks) to be pre-dispatched.'
- },
+ parameter_descriptions={**_parameter_descriptions},
citations=[citations['pedregosa2011scikit']]
)
=====================================
q2_feature_classifier/tests/data/class_weight.biom
=====================================
Binary files a/q2_feature_classifier/tests/data/class_weight.biom and b/q2_feature_classifier/tests/data/class_weight.biom differ
=====================================
q2_feature_classifier/tests/test_classifier.py
=====================================
@@ -59,8 +59,7 @@ class ClassifierTests(FeatureClassifierTestPluginBase):
# should populate the class weight of a pipeline
weights = Artifact.import_data(
'FeatureTable[RelativeFrequency]',
- self.get_data_path('class_weight.biom'),
- view_type='BIOMV100Format')
+ self.get_data_path('class_weight.biom'))
table = weights.view(biom.Table)
svc_spec = [['feat_ext',
@@ -89,8 +88,7 @@ class ClassifierTests(FeatureClassifierTestPluginBase):
# we should be able to input class_weight to fit_classifier
weights = Artifact.import_data(
'FeatureTable[RelativeFrequency]',
- self.get_data_path('class_weight.biom'),
- view_type='BIOMV100Format')
+ self.get_data_path('class_weight.biom'))
reads = Artifact.import_data(
'FeatureData[Sequence]',
self.get_data_path('se-dna-sequences.fasta'))
=====================================
q2_feature_classifier/tests/test_consensus_assignment.py
=====================================
@@ -9,6 +9,8 @@
import pandas as pd
from qiime2.sdk import Artifact
+from qiime2.plugins import feature_classifier
+from q2_feature_classifier._skl import _specific_fitters
from q2_feature_classifier._blast import classify_consensus_blast
from q2_feature_classifier._vsearch import classify_consensus_vsearch
from q2_feature_classifier._consensus_assignment import (
@@ -55,6 +57,88 @@ class ConsensusAssignmentsTests(FeatureClassifierTestPluginBase):
right += tax[taxon].startswith(res[taxon])
self.assertGreater(right/len(res), 0.5)
+ def test_vsearch_search_exact(self):
+ result = classify_consensus_vsearch(self.reads, self.reads,
+ self.taxonomy, search_exact=True)
+ res = result.Taxon.to_dict()
+ tax = self.taxonomy.to_dict()
+ right = 0.
+ for taxon in res:
+ right += tax[taxon].startswith(res[taxon])
+ self.assertGreater(right/len(res), 0.5)
+
+ def test_vsearch_top_hits_only(self):
+ result = classify_consensus_vsearch(self.reads, self.reads,
+ self.taxonomy, top_hits_only=True)
+ res = result.Taxon.to_dict()
+ tax = self.taxonomy.to_dict()
+ right = 0.
+ for taxon in res:
+ right += tax[taxon].startswith(res[taxon])
+ self.assertGreater(right/len(res), 0.5)
+
+
+class HybridClassiferTests(FeatureClassifierTestPluginBase):
+ package = 'q2_feature_classifier.tests'
+
+ def setUp(self):
+ super().setUp()
+ taxonomy = Artifact.import_data(
+ 'FeatureData[Taxonomy]', self.get_data_path('taxonomy.tsv'))
+ self.taxonomy = taxonomy.view(pd.Series)
+ self.taxartifact = taxonomy
+ # TODO: use `Artifact.import_data` here once we have a transformer
+ # for DNASequencesDirectoryFormat -> DNAFASTAFormat
+ reads_fp = self.get_data_path('se-dna-sequences.fasta')
+ reads = DNAFASTAFormat(reads_fp, mode='r')
+ self.reads = Artifact.import_data('FeatureData[Sequence]', reads)
+
+ fitter = getattr(feature_classifier.methods,
+ 'fit_classifier_' + _specific_fitters[0][0])
+ self.classifier = fitter(self.reads, self.taxartifact).classifier
+
+ self.query = Artifact.import_data('FeatureData[Sequence]', pd.Series(
+ {'A': 'GCCTAACACATGCAAGTCGAACGGCAGCGGGGGAAAGCTTGCTTTCCTGCCGGCGA',
+ 'B': 'TAACACATGCAAGTCAACGATGCTTATGTAGCAATATGTAAGTAGAGTGGCGCACG',
+ 'C': 'ATACATGCAAGTCGTACGGTATTCCGGTTTCGGCCGGGAGAGAGTGGCGGATGGGT',
+ 'D': 'GACGAACGCTGGCGACGTGCTTAACACATGCAAGTCGTGCGAGGACGGGCGGTGCT'
+ 'TGCACTGCTCGAGCCGAGCGGCGGACGGGTGAGTAACACGTGAGCAACCTATCTCC'
+ 'GTGCGGGGGACAACCCGGGGAAACCCGGGCTAATACCG'}))
+
+ def test_classify_hybrid_vsearch_sklearn_all_exact_match(self):
+
+ result, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
+ query=self.reads, reference_reads=self.reads,
+ reference_taxonomy=self.taxartifact, classifier=self.classifier,
+ prefilter=False)
+ result, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
+ query=self.reads, reference_reads=self.reads,
+ reference_taxonomy=self.taxartifact, classifier=self.classifier)
+ result = result.view(pd.DataFrame)
+ res = result.Taxon.to_dict()
+ tax = self.taxonomy.to_dict()
+ right = 0.
+ for taxon in res:
+ right += tax[taxon].startswith(res[taxon])
+ self.assertGreater(right/len(res), 0.5)
+
+ def test_classify_hybrid_vsearch_sklearn_mixed_query(self):
+
+ result, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
+ query=self.query, reference_reads=self.reads,
+ reference_taxonomy=self.taxartifact, classifier=self.classifier,
+ prefilter=True, read_orientation='same', randseed=1001)
+ result = result.view(pd.DataFrame)
+ obs = result.Taxon.to_dict()
+ exp = {'A': 'k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; '
+ 'o__Legionellales; f__; g__; s__',
+ 'B': 'k__Bacteria; p__Chlorobi; c__; o__; f__; g__; s__',
+ 'C': 'k__Bacteria; p__Bacteroidetes; c__Cytophagia; '
+ 'o__Cytophagales; f__Cyclobacteriaceae; g__; s__',
+ 'D': 'k__Bacteria; p__Gemmatimonadetes; c__Gemm-5; o__; f__; '
+ 'g__; s__'}
+ self.assertDictEqual(obs, exp)
+
class ImportBlastAssignmentTests(FeatureClassifierTestPluginBase):
=====================================
q2_feature_classifier/tests/test_taxonomic_classifier.py
=====================================
@@ -14,7 +14,7 @@ import os
import shutil
import sklearn
-from sklearn.externals import joblib
+import joblib
from sklearn.pipeline import Pipeline
from qiime2.sdk import Artifact
from qiime2.plugins.feature_classifier.methods import \
View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/commit/ce3f2d71dc19f5e2778f83d61d5b0be27a8bee02
--
View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/commit/ce3f2d71dc19f5e2778f83d61d5b0be27a8bee02
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20190907/fcaece4c/attachment-0001.html>
More information about the debian-med-commit mailing list