[med-svn] [Git][med-team/q2-feature-classifier][upstream] New upstream version 2019.7.0

Liubov Chuprikova gitlab at salsa.debian.org
Sat Sep 7 12:43:11 BST 2019



Liubov Chuprikova pushed to branch upstream at Debian Med / q2-feature-classifier


Commits:
ce3f2d71 by Liubov Chuprikova at 2019-09-07T11:40:49Z
New upstream version 2019.7.0
- - - - -


11 changed files:

- ci/recipe/meta.yaml
- q2_feature_classifier/_consensus_assignment.py
- q2_feature_classifier/_skl.py
- q2_feature_classifier/_taxonomic_classifier.py
- q2_feature_classifier/_version.py
- q2_feature_classifier/_vsearch.py
- q2_feature_classifier/classifier.py
- q2_feature_classifier/tests/data/class_weight.biom
- q2_feature_classifier/tests/test_classifier.py
- q2_feature_classifier/tests/test_consensus_assignment.py
- q2_feature_classifier/tests/test_taxonomic_classifier.py


Changes:

=====================================
ci/recipe/meta.yaml
=====================================
@@ -19,7 +19,8 @@ requirements:
 
   run:
     - python {{ python }}
-    - scikit-learn 0.20.2
+    - scikit-learn 0.21.2
+    - joblib
     - scikit-bio
     - biom-format >=2.1.5,<2.2.0
     - blast >=2.6.0
@@ -27,6 +28,9 @@ requirements:
     - vsearch <=2.7.0
     - qiime2 {{ release }}.*
     - q2-types {{ release }}.*
+    - q2-quality-control {{ release }}.*
+    - q2-taxa {{ release }}.*
+    - q2-feature-table {{ release }}.*
 
 test:
   imports:
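
The scikit-learn pin moves from 0.20.2 to 0.21.2; scikit-learn 0.21
deprecates the vendored sklearn.externals.joblib in favor of the standalone
package, hence the new explicit joblib run dependency. The three new q2-*
dependencies support the classify_hybrid_vsearch_sklearn pipeline further
down. A minimal environment sanity check, assuming the pinned versions
above:

    import joblib
    import sklearn

    # scikit-learn 0.21 deprecates the vendored sklearn.externals.joblib,
    # so joblib must now be importable on its own.
    assert sklearn.__version__.startswith('0.21'), sklearn.__version__
    print('standalone joblib', joblib.__version__)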


=====================================
q2_feature_classifier/_consensus_assignment.py
=====================================
@@ -11,6 +11,7 @@ import subprocess
 import pandas as pd
 from os.path import isfile
 from collections import Counter, defaultdict
+import qiime2
 
 
 def _get_default_unassignable_label():
@@ -40,7 +41,7 @@ def _consensus_assignments(
             consensus = {'': ('', '')}
         result = pd.DataFrame.from_dict(consensus, 'index')
         result.index.name = 'Feature ID'
-        result.columns = ['Taxon', 'Confidence']
+        result.columns = ['Taxon', 'Consensus']
         return result
 
 
@@ -242,3 +243,9 @@ def _compute_consensus_annotation(annotations, min_consensus,
         consensus_fraction_result = 1.0
 
     return annotation, consensus_fraction_result
+
+
+def _annotate_method(taxa, method):
+    taxa = taxa.view(pd.DataFrame)
+    taxa['Method'] = method
+    return qiime2.Artifact.import_data('FeatureData[Taxonomy]', taxa)
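
A minimal sketch of what the new _annotate_method helper does, using a toy
one-row taxonomy table (the feature ID and lineage below are made up, not
plugin test data):

    import pandas as pd
    import qiime2

    from q2_feature_classifier._consensus_assignment import _annotate_method

    # Mirror the frame shape produced by _consensus_assignments above:
    # a 'Feature ID' index plus 'Taxon' and 'Consensus' columns.
    df = pd.DataFrame(
        {'Taxon': ['k__Bacteria; p__Firmicutes'], 'Consensus': ['1.0']},
        index=pd.Index(['feature1'], name='Feature ID'))
    taxa = qiime2.Artifact.import_data('FeatureData[Taxonomy]', df)

    tagged = _annotate_method(taxa, 'VSEARCH')
    # Every assignment now carries the method that produced it.
    print(tagged.view(pd.DataFrame)['Method'])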


=====================================
q2_feature_classifier/_skl.py
=====================================
@@ -9,7 +9,7 @@
 from itertools import islice, repeat
 from copy import deepcopy
 
-from sklearn.externals.joblib import Parallel, delayed
+from joblib import Parallel, delayed
 
 _specific_fitters = [
         ['naive_bayes',
@@ -38,7 +38,7 @@ def _extract_reads(reads):
 
 
 def predict(reads, pipeline, separator=';', chunk_size=262144, n_jobs=1,
-            pre_dispatch='2*n_jobs', confidence=-1.):
+            pre_dispatch='2*n_jobs', confidence='disable'):
     return (m for c in Parallel(n_jobs=n_jobs, batch_size=1,
                                 pre_dispatch=pre_dispatch)
             (delayed(_predict_chunk)(pipeline, separator, confidence, chunk)
@@ -46,7 +46,7 @@ def predict(reads, pipeline, separator=';', chunk_size=262144, n_jobs=1,
 
 
 def _predict_chunk(pipeline, separator, confidence, chunk):
-    if confidence < 0.:
+    if confidence == 'disable':
         return _predict_chunk_without_conf(pipeline, chunk)
     else:
         return _predict_chunk_with_conf(pipeline, separator, confidence, chunk)
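
Two changes here: the Parallel/delayed imports move to the standalone joblib
package, and the magic confidence value -1. is replaced by the explicit
string sentinel 'disable'. A self-contained sketch of the chunk-dispatch
pattern predict() relies on; the worker below is a stand-in for
_predict_chunk, not the plugin's code:

    from itertools import islice

    from joblib import Parallel, delayed

    def _chunks(iterable, size):
        # Yield successive lists of at most `size` items.
        iterator = iter(iterable)
        while True:
            chunk = list(islice(iterator, size))
            if not chunk:
                return
            yield chunk

    def _work(chunk):  # stand-in for _predict_chunk
        return [x * x for x in chunk]

    # Dispatch chunks to workers, then flatten the per-chunk results back
    # into a single stream, just as predict() does with classified reads.
    results = (r for chunk_result in
               Parallel(n_jobs=2, batch_size=1, pre_dispatch='2*n_jobs')(
                   delayed(_work)(c) for c in _chunks(range(10), 4))
               for r in chunk_result)
    print(list(results))  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]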


=====================================
q2_feature_classifier/_taxonomic_classifier.py
=====================================
@@ -11,7 +11,7 @@ import tarfile
 import os
 
 import sklearn
-from sklearn.externals import joblib
+import joblib
 from sklearn.pipeline import Pipeline
 import qiime2.plugin
 import qiime2.plugin.model as model
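
The same import migration applies where the plugin serializes and restores
fitted models. A sketch of the dump/load pattern with standalone joblib;
the pipeline steps here are illustrative only, not the plugin's storage
format:

    import joblib  # was: from sklearn.externals import joblib
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import Pipeline

    pipeline = Pipeline([
        ('vectorize', HashingVectorizer(alternate_sign=False)),
        ('classify', MultinomialNB()),
    ])
    joblib.dump(pipeline, 'pipeline.joblib')   # serialize
    restored = joblib.load('pipeline.joblib')  # deserialize

joblib-pickled models are only guaranteed to load under the scikit-learn
version that produced them, which is why the artifact format records
sklearn.__version__ alongside the model.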


=====================================
q2_feature_classifier/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
     # setup.py/versioneer.py will grep for the variable names, so they must
     # each be defined on a line of their own. _version.py will just call
     # get_keywords().
-    git_refnames = " (tag: 2019.4.0)"
-    git_full = "e2ba97ea232a02f4c489b51ff0481583d394a339"
-    git_date = "2019-05-03 04:14:49 +0000"
+    git_refnames = " (tag: 2019.7.0)"
+    git_full = "bc7ed3a741779ccb32114a229af65cc8c1510743"
+    git_date = "2019-07-30 18:15:57 +0000"
     keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
     return keywords
 


=====================================
q2_feature_classifier/_vsearch.py
=====================================
@@ -6,13 +6,19 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
+import tempfile
 import pandas as pd
+import qiime2
+
 from q2_types.feature_data import (
     FeatureData, Taxonomy, Sequence, DNAFASTAFormat)
 from .plugin_setup import plugin, citations
-from qiime2.plugin import Int, Str, Float, Choices, Range
-from ._consensus_assignment import (_consensus_assignments,
-                                    _get_default_unassignable_label)
+from qiime2.plugin import Int, Str, Float, Choices, Range, Bool
+from ._consensus_assignment import (_consensus_assignments, _run_command,
+                                    _get_default_unassignable_label,
+                                    _annotate_method)
+from ._taxonomic_classifier import TaxonomicClassifier
+from .classifier import _classify_parameters, _parameter_descriptions
 
 
 def classify_consensus_vsearch(query: DNAFASTAFormat,
@@ -25,53 +31,164 @@ def classify_consensus_vsearch(query: DNAFASTAFormat,
                                min_consensus: float = 0.51,
                                unassignable_label: str =
                                _get_default_unassignable_label(),
+                               search_exact: bool = False,
+                               top_hits_only: bool = False,
                                threads: str = 1) -> pd.DataFrame:
     seqs_fp = str(query)
     ref_fp = str(reference_reads)
+    if maxaccepts == 'all':
+        maxaccepts = 0
     cmd = ['vsearch', '--usearch_global', seqs_fp, '--id', str(perc_identity),
            '--query_cov', str(query_cov), '--strand', strand, '--maxaccepts',
            str(maxaccepts), '--maxrejects', '0', '--output_no_hits', '--db',
-           ref_fp, '--threads', str(threads), '--blast6out']
+           ref_fp, '--threads', str(threads)]
+    if search_exact:
+        cmd[1] = '--search_exact'
+    if top_hits_only:
+        cmd.append('--top_hits_only')
+    cmd.append('--blast6out')
     consensus = _consensus_assignments(
         cmd, reference_taxonomy, min_consensus=min_consensus,
         unassignable_label=unassignable_label)
     return consensus
 
 
+def classify_hybrid_vsearch_sklearn(ctx,
+                                    query,
+                                    reference_reads,
+                                    reference_taxonomy,
+                                    classifier,
+                                    maxaccepts=10,
+                                    perc_identity=0.5,
+                                    query_cov=0.8,
+                                    strand='both',
+                                    min_consensus=0.51,
+                                    reads_per_batch=0,
+                                    confidence=0.7,
+                                    read_orientation='auto',
+                                    threads=1,
+                                    prefilter=True,
+                                    sample_size=1000,
+                                    randseed=0):
+    exclude = ctx.get_action('quality_control', 'exclude_seqs')
+    ccv = ctx.get_action('feature_classifier', 'classify_consensus_vsearch')
+    cs = ctx.get_action('feature_classifier', 'classify_sklearn')
+    filter_seqs = ctx.get_action('taxa', 'filter_seqs')
+    merge = ctx.get_action('feature_table', 'merge_taxa')
+
+    # randomly subsample reference sequences for rough positive filter
+    if prefilter:
+        ref = str(reference_reads.view(DNAFASTAFormat))
+        with tempfile.NamedTemporaryFile() as output:
+            cmd = ['vsearch', '--fastx_subsample', ref, '--sample_size',
+                   str(sample_size), '--randseed', str(randseed),
+                   '--fastaout', output.name]
+            _run_command(cmd)
+            sparse_reference = qiime2.Artifact.import_data(
+                'FeatureData[Sequence]', output.name)
+
+            # perform rough positive filter on query sequences
+            query, misses, = exclude(
+                query_sequences=query, reference_sequences=sparse_reference,
+                method='vsearch', perc_identity=perc_identity,
+                perc_query_aligned=query_cov, threads=threads)
+
+    # find exact matches, perform LCA consensus classification
+    taxa1, = ccv(query=query, reference_reads=reference_reads,
+                 reference_taxonomy=reference_taxonomy, maxaccepts=maxaccepts,
+                 strand=strand, min_consensus=min_consensus,
+                 search_exact=True, threads=threads)
+
+    # Annotate taxonomic assignments with classification method
+    taxa1 = _annotate_method(taxa1, 'VSEARCH')
+
+    # perform second pass classification on unassigned taxa
+    # filter out unassigned seqs
+    try:
+        query, = filter_seqs(sequences=query, taxonomy=taxa1,
+                             include=_get_default_unassignable_label())
+    except ValueError:
+        # get ValueError if all sequences are filtered out.
+        # so if no sequences are unassigned, return exact match results
+        return taxa1
+
+    # classify with sklearn classifier
+    taxa2, = cs(reads=query, classifier=classifier,
+                reads_per_batch=reads_per_batch, n_jobs=threads,
+                confidence=confidence, read_orientation=read_orientation)
+
+    # Annotate taxonomic assignments with classification method
+    taxa2 = _annotate_method(taxa2, 'sklearn')
+
+    # merge into one big happy result
+    taxa, = merge(data=[taxa2, taxa1])
+    return taxa
+
+
+output_descriptions = {
+    'classification': 'The resulting taxonomy classifications.'}
+
+parameters = {'maxaccepts': Int % Range(1, None) | Str % Choices(['all']),
+              'perc_identity': Float % Range(0.0, 1.0, inclusive_end=True),
+              'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
+              'strand': Str % Choices(['both', 'plus']),
+              'min_consensus': Float % Range(0.5, 1.0, inclusive_end=True,
+                                             inclusive_start=False),
+              'threads': Int % Range(1, None)}
+
+inputs = {'query': FeatureData[Sequence],
+          'reference_reads': FeatureData[Sequence],
+          'reference_taxonomy': FeatureData[Taxonomy]}
+
+input_descriptions = {'query': 'Sequences to classify taxonomically.',
+                      'reference_reads': 'Reference sequences.',
+                      'reference_taxonomy': 'Reference taxonomy labels.'}
+
+parameter_descriptions = {
+    'strand': 'Align against reference sequences in forward ("plus") '
+              'or both directions ("both").',
+    'maxaccepts': 'Maximum number of hits to keep for each query. Set to '
+                  '"all" to keep all hits > perc_identity similarity.',
+    'perc_identity': 'Reject match if percent identity to query is '
+                     'lower.',
+    'query_cov': 'Reject match if query alignment coverage per high-'
+                 'scoring pair is lower.',
+    'min_consensus': 'Minimum fraction of assignments that must match the '
+                     'top hit to be accepted as the consensus assignment.',
+    'threads': 'Number of threads to use for job parallelization.'}
+
+outputs = [('classification', FeatureData[Taxonomy])]
+
+ignore_prefilter = ' This parameter is ignored if `prefilter` is disabled.'
+
+
 plugin.methods.register_function(
     function=classify_consensus_vsearch,
-    inputs={'query': FeatureData[Sequence],
-            'reference_reads': FeatureData[Sequence],
-            'reference_taxonomy': FeatureData[Taxonomy]},
-    parameters={'maxaccepts': Int % Range(0, None),
-                'perc_identity': Float % Range(0.0, 1.0, inclusive_end=True),
-                'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
-                'strand': Str % Choices(['both', 'plus']),
-                'min_consensus': Float % Range(0.5, 1.0, inclusive_end=True,
-                                               inclusive_start=False),
+    inputs=inputs,
+    parameters={**parameters,
                 'unassignable_label': Str,
-                'threads': Int},
-    outputs=[('classification', FeatureData[Taxonomy])],
-    input_descriptions={'query': 'Sequences to classify taxonomically.',
-                        'reference_reads': 'reference sequences.',
-                        'reference_taxonomy': 'reference taxonomy labels.'},
+                'search_exact': Bool,
+                'top_hits_only': Bool},
+    outputs=outputs,
+    input_descriptions=input_descriptions,
     parameter_descriptions={
-        'strand': ('Align against reference sequences in forward ("plus") '
-                   'or both directions ("both").'),
-        'maxaccepts': ('Maximum number of hits to keep for each query. Set to '
-                       '0 to keep all hits > perc_identity similarity. Must '
-                       'be in range [0, infinity].'),
-        'perc_identity': ('Reject match if percent identity to query is '
-                          'lower. Must be in range [0.0, 1.0].'),
-        'query_cov': 'Reject match if query alignment coverage per high-'
-                     'scoring pair is lower. Must be in range [0.0, 1.0].',
-        'min_consensus': ('Minimum fraction of assignments must match top '
-                          'hit to be accepted as consensus assignment. Must '
-                          'be in range (0.5, 1.0].')
+        **parameter_descriptions,
+        'search_exact': 'Search for exact full-length matches to the query '
+                        'sequences. Only 100% exact matches are reported and '
+                        'this command is much faster than the default. If '
+                        'True, the perc_identity and query_cov settings are '
+                        'ignored. Note: query and reference reads must be '
+                        'trimmed to the exact same DNA locus (e.g., primer '
+                        'site) because only exact matches will be reported.',
+        'top_hits_only': 'Only the top hits between the query and reference '
+                         'sequence sets are reported. For each query, the top '
+                         'hit is the one presenting the highest percentage of '
+                         'identity. Multiple equally scored top hits will be '
+                         'used for consensus taxonomic assignment if '
+                         'maxaccepts is greater than 1.',
     },
-    output_descriptions={'classification': 'The resulting taxonomy '
-                         'classifications.'},
-    name='VSEARCH consensus taxonomy classifier',
+    output_descriptions=output_descriptions,
+    name='VSEARCH-based consensus taxonomy classifier',
     description=('Assign taxonomy to query sequences using VSEARCH. Performs '
                  'VSEARCH global alignment between query and reference_reads, '
                  'then assigns consensus taxonomy to each query sequence from '
@@ -81,3 +198,68 @@ plugin.methods.register_function(
                  'choosing the top N hits, not the first N hits.'),
     citations=[citations['rognes2016vsearch']]
 )
+
+
+plugin.pipelines.register_function(
+    function=classify_hybrid_vsearch_sklearn,
+    inputs={**inputs, 'classifier': TaxonomicClassifier},
+    parameters={**parameters,
+                'reads_per_batch': _classify_parameters['reads_per_batch'],
+                'confidence': _classify_parameters['confidence'],
+                'read_orientation': _classify_parameters['read_orientation'],
+                'prefilter': Bool,
+                'sample_size': Int % Range(1, None),
+                'randseed': Int % Range(0, None)},
+    outputs=outputs,
+    input_descriptions={**input_descriptions,
+                        'classifier': 'Pre-trained sklearn taxonomic '
+                                      'classifier for classifying the reads.'},
+    parameter_descriptions={
+        **{k: parameter_descriptions[k] for k in [
+            'strand', 'maxaccepts', 'min_consensus', 'threads']},
+        'perc_identity': 'Percent sequence similarity to use for PREFILTER. ' +
+                         parameter_descriptions['perc_identity'] + ' Set to a '
+                         'lower value to perform a rough pre-filter.' +
+                         ignore_prefilter,
+        'query_cov': 'Query coverage threshold to use for PREFILTER. ' +
+                     parameter_descriptions['query_cov'] + ' Set to a '
+                     'lower value to perform a rough pre-filter.' +
+                     ignore_prefilter,
+        'confidence': _parameter_descriptions['confidence'],
+        'read_orientation': 'Direction of reads with respect to reference '
+                            'sequences in pre-trained sklearn classifier. '
+                            'same will cause reads to be classified unchanged'
+                            '; reverse-complement will cause reads to be '
+                            'reversed and complemented prior to '
+                            'classification. "auto" will autodetect '
+                            'orientation based on the confidence estimates '
+                            'for the first 100 reads.',
+        'reads_per_batch': 'Number of reads to process in each batch for '
+                           'sklearn classification. If "auto", this parameter '
+                           'is autoscaled to min(number of query sequences / '
+                           'threads, 20000).',
+        'prefilter': 'Toggle positive filter of query sequences on or off.',
+        'sample_size': 'Randomly extract the given number of sequences from '
+                       'the reference database to use for prefiltering.' +
+                       ignore_prefilter,
+        'randseed': 'Use integer as a seed for the pseudo-random generator '
+                    'used during prefiltering. A given seed always produces '
+                    'the same output, which is useful for replicability. Set '
+                    'to 0 to use a pseudo-random seed.' + ignore_prefilter,
+    },
+    output_descriptions=output_descriptions,
+    name='ALPHA Hybrid classifier: VSEARCH exact match + sklearn classifier',
+    description=('NOTE: THIS PIPELINE IS AN ALPHA RELEASE. Please report bugs '
+                 'to https://forum.qiime2.org!\n'
+                 'Assign taxonomy to query sequences using a hybrid classifier. '
+                 'First performs rough positive filter to remove artifact and '
+                 'low-coverage sequences (use "prefilter" parameter to toggle '
+                 'this step on or off). Second, performs VSEARCH exact match '
+                 'between query and reference_reads to find exact matches, '
+                 'followed by least common ancestor consensus taxonomy '
+                 'assignment from among maxaccepts top hits, min_consensus of '
+                 'which share that taxonomic assignment. Query sequences '
+                 'without an exact match are then classified with a pre-'
+                 'trained sklearn taxonomy classifier to predict the most '
+                 'likely taxonomic lineage.'),
+)
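
A usage sketch for the new pipeline via the QIIME 2 Artifact API, matching
the action name exercised in the tests below; all file names are
placeholders for real artifacts:

    import qiime2
    from qiime2.plugins import feature_classifier

    # Placeholder file names; substitute your own artifacts.
    query = qiime2.Artifact.load('rep-seqs.qza')         # FeatureData[Sequence]
    ref_reads = qiime2.Artifact.load('ref-seqs.qza')     # FeatureData[Sequence]
    ref_taxa = qiime2.Artifact.load('ref-taxonomy.qza')  # FeatureData[Taxonomy]
    classifier = qiime2.Artifact.load('classifier.qza')  # TaxonomicClassifier

    # Fix randseed for a reproducible prefilter subsample.
    taxonomy, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
        query=query, reference_reads=ref_reads, reference_taxonomy=ref_taxa,
        classifier=classifier, prefilter=True, randseed=42, threads=4)
    taxonomy.save('hybrid-taxonomy.qza')

Because both passes run through _annotate_method, the merged table carries
a Method column recording whether VSEARCH exact matching or the sklearn
classifier produced each assignment.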


=====================================
q2_feature_classifier/classifier.py
=====================================
@@ -23,6 +23,7 @@ import sklearn
 from numpy import median, array, ceil
 import biom
 import skbio
+import joblib
 
 from ._skl import fit_pipeline, predict, _specific_fitters
 from ._taxonomic_classifier import TaxonomicClassifier
@@ -182,7 +183,7 @@ def _autotune_reads_per_batch(reads, n_jobs):
         raise ValueError("Value other than zero must be specified as number "
                          "of jobs to run.")
     else:
-        n_jobs = sklearn.externals.joblib.effective_n_jobs(n_jobs)
+        n_jobs = joblib.effective_n_jobs(n_jobs)
 
     # we really only want to calculate this if running in parallel
     if n_jobs != 1:
@@ -200,7 +201,7 @@ def _autotune_reads_per_batch(reads, n_jobs):
 def classify_sklearn(reads: DNAFASTAFormat, classifier: Pipeline,
                      reads_per_batch: int = 0, n_jobs: int = 1,
                      pre_dispatch: str = '2*n_jobs', confidence: float = 0.7,
-                     read_orientation: str = None
+                     read_orientation: str = 'auto'
                      ) -> pd.DataFrame:
     # autotune reads per batch
     if reads_per_batch == 0:
@@ -226,9 +227,35 @@ _classify_parameters = {
     'reads_per_batch': Int % Range(0, None),
     'n_jobs': Int,
     'pre_dispatch': Str,
-    'confidence': Float,
-    'read_orientation': Str % Choices(['same', 'reverse-complement'])}
-
+    'confidence': Float % Range(
+        0, 1, inclusive_start=True, inclusive_end=True) | Str % Choices(
+            ['disable']),
+    'read_orientation': Str % Choices(['same', 'reverse-complement', 'auto'])}
+
+_parameter_descriptions = {
+    'confidence': 'Confidence threshold for limiting '
+                  'taxonomic depth. Set to "disable" to disable '
+                  'confidence calculation, or 0 to calculate '
+                  'confidence but not apply it to limit the '
+                  'taxonomic depth of the assignments.',
+    'read_orientation': 'Direction of reads with '
+                        'respect to reference sequences. same will cause '
+                        'reads to be classified unchanged; reverse-'
+                        'complement will cause reads to be reversed '
+                        'and complemented prior to classification. '
+                        '"auto" will autodetect orientation based on the '
+                        'confidence estimates for the first 100 reads.',
+    'reads_per_batch': 'Number of reads to process in each batch. If "auto", '
+                       'this parameter is autoscaled to '
+                       'min(number of query sequences / n_jobs, 20000).',
+    'n_jobs': 'The maximum number of concurrent worker processes. If -1, '
+              'all CPUs are used. If 1 is given, no parallel computing '
+              'code is used at all, which is useful for debugging. For '
+              'n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for '
+              'n_jobs = -2, all CPUs but one are used.',
+    'pre_dispatch': '"all" or expression, as in "3*n_jobs". The number of '
+                    'batches (of tasks) to be pre-dispatched.'
+}
 
 plugin.methods.register_function(
     function=classify_sklearn,
@@ -242,30 +269,7 @@ plugin.methods.register_function(
         'reads': 'The feature data to be classified.',
         'classifier': 'The taxonomic classifier for classifying the reads.'
     },
-    parameter_descriptions={
-        'confidence': 'Confidence threshold for limiting '
-                      'taxonomic depth. Provide -1 to disable '
-                      'confidence calculation, or 0 to calculate '
-                      'confidence but not apply it to limit the '
-                      'taxonomic depth of the assignments.',
-        'read_orientation': 'Direction of reads with '
-                            'respect to reference sequences. same will cause '
-                            'reads to be classified unchanged; reverse-'
-                            'complement will cause reads to be reversed '
-                            'and complemented prior to classification. '
-                            'Default is to autodetect based on the '
-                            'confidence estimates for the first 100 reads.',
-        'reads_per_batch': 'Number of reads to process in each batch. If 0, '
-                           'this parameter is autoscaled to '
-                           'min( number of query sequences / n_jobs, 20000).',
-        'n_jobs': 'The maximum number of concurrently worker processes. If -1 '
-                  'all CPUs are used. If 1 is given, no parallel computing '
-                  'code is used at all, which is useful for debugging. For '
-                  'n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for '
-                  'n_jobs = -2, all CPUs but one are used.',
-        'pre_dispatch': '"all" or expression, as in "3*n_jobs". The number of '
-                        'batches (of tasks) to be pre-dispatched.'
-    },
+    parameter_descriptions={**_parameter_descriptions},
     citations=[citations['pedregosa2011scikit']]
 )
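
A short sketch of the retyped confidence parameter, again with placeholder
artifact names:

    from qiime2 import Artifact
    from qiime2.plugins import feature_classifier

    reads = Artifact.load('rep-seqs.qza')         # placeholder
    classifier = Artifact.load('classifier.qza')  # placeholder

    # confidence is now Float % Range(0, 1) | Str % Choices(['disable']);
    # 'disable' replaces the old magic value -1, and read_orientation
    # defaults to 'auto' instead of None.
    taxonomy, = feature_classifier.actions.classify_sklearn(
        reads=reads, classifier=classifier,
        confidence='disable', read_orientation='auto')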
 


=====================================
q2_feature_classifier/tests/data/class_weight.biom
=====================================
Binary files a/q2_feature_classifier/tests/data/class_weight.biom and b/q2_feature_classifier/tests/data/class_weight.biom differ


=====================================
q2_feature_classifier/tests/test_classifier.py
=====================================
@@ -59,8 +59,7 @@ class ClassifierTests(FeatureClassifierTestPluginBase):
         # should populate the class weight of a pipeline
         weights = Artifact.import_data(
             'FeatureTable[RelativeFrequency]',
-            self.get_data_path('class_weight.biom'),
-            view_type='BIOMV100Format')
+            self.get_data_path('class_weight.biom'))
         table = weights.view(biom.Table)
 
         svc_spec = [['feat_ext',
@@ -89,8 +88,7 @@ class ClassifierTests(FeatureClassifierTestPluginBase):
         # we should be able to input class_weight to fit_classifier
         weights = Artifact.import_data(
             'FeatureTable[RelativeFrequency]',
-            self.get_data_path('class_weight.biom'),
-            view_type='BIOMV100Format')
+            self.get_data_path('class_weight.biom'))
         reads = Artifact.import_data(
             'FeatureData[Sequence]',
             self.get_data_path('se-dna-sequences.fasta'))


=====================================
q2_feature_classifier/tests/test_consensus_assignment.py
=====================================
@@ -9,6 +9,8 @@
 import pandas as pd
 
 from qiime2.sdk import Artifact
+from qiime2.plugins import feature_classifier
+from q2_feature_classifier._skl import _specific_fitters
 from q2_feature_classifier._blast import classify_consensus_blast
 from q2_feature_classifier._vsearch import classify_consensus_vsearch
 from q2_feature_classifier._consensus_assignment import (
@@ -55,6 +57,88 @@ class ConsensusAssignmentsTests(FeatureClassifierTestPluginBase):
             right += tax[taxon].startswith(res[taxon])
         self.assertGreater(right/len(res), 0.5)
 
+    def test_vsearch_search_exact(self):
+        result = classify_consensus_vsearch(self.reads, self.reads,
+                                            self.taxonomy, search_exact=True)
+        res = result.Taxon.to_dict()
+        tax = self.taxonomy.to_dict()
+        right = 0.
+        for taxon in res:
+            right += tax[taxon].startswith(res[taxon])
+        self.assertGreater(right/len(res), 0.5)
+
+    def test_vsearch_top_hits_only(self):
+        result = classify_consensus_vsearch(self.reads, self.reads,
+                                            self.taxonomy, top_hits_only=True)
+        res = result.Taxon.to_dict()
+        tax = self.taxonomy.to_dict()
+        right = 0.
+        for taxon in res:
+            right += tax[taxon].startswith(res[taxon])
+        self.assertGreater(right/len(res), 0.5)
+
+
+class HybridClassiferTests(FeatureClassifierTestPluginBase):
+    package = 'q2_feature_classifier.tests'
+
+    def setUp(self):
+        super().setUp()
+        taxonomy = Artifact.import_data(
+            'FeatureData[Taxonomy]', self.get_data_path('taxonomy.tsv'))
+        self.taxonomy = taxonomy.view(pd.Series)
+        self.taxartifact = taxonomy
+        # TODO: use `Artifact.import_data` here once we have a transformer
+        # for DNASequencesDirectoryFormat -> DNAFASTAFormat
+        reads_fp = self.get_data_path('se-dna-sequences.fasta')
+        reads = DNAFASTAFormat(reads_fp, mode='r')
+        self.reads = Artifact.import_data('FeatureData[Sequence]', reads)
+
+        fitter = getattr(feature_classifier.methods,
+                         'fit_classifier_' + _specific_fitters[0][0])
+        self.classifier = fitter(self.reads, self.taxartifact).classifier
+
+        self.query = Artifact.import_data('FeatureData[Sequence]', pd.Series(
+            {'A': 'GCCTAACACATGCAAGTCGAACGGCAGCGGGGGAAAGCTTGCTTTCCTGCCGGCGA',
+             'B': 'TAACACATGCAAGTCAACGATGCTTATGTAGCAATATGTAAGTAGAGTGGCGCACG',
+             'C': 'ATACATGCAAGTCGTACGGTATTCCGGTTTCGGCCGGGAGAGAGTGGCGGATGGGT',
+             'D': 'GACGAACGCTGGCGACGTGCTTAACACATGCAAGTCGTGCGAGGACGGGCGGTGCT'
+                  'TGCACTGCTCGAGCCGAGCGGCGGACGGGTGAGTAACACGTGAGCAACCTATCTCC'
+                  'GTGCGGGGGACAACCCGGGGAAACCCGGGCTAATACCG'}))
+
+    def test_classify_hybrid_vsearch_sklearn_all_exact_match(self):
+
+        result, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
+            query=self.reads, reference_reads=self.reads,
+            reference_taxonomy=self.taxartifact, classifier=self.classifier,
+            prefilter=False)
+        result, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
+            query=self.reads, reference_reads=self.reads,
+            reference_taxonomy=self.taxartifact, classifier=self.classifier)
+        result = result.view(pd.DataFrame)
+        res = result.Taxon.to_dict()
+        tax = self.taxonomy.to_dict()
+        right = 0.
+        for taxon in res:
+            right += tax[taxon].startswith(res[taxon])
+        self.assertGreater(right/len(res), 0.5)
+
+    def test_classify_hybrid_vsearch_sklearn_mixed_query(self):
+
+        result, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
+            query=self.query, reference_reads=self.reads,
+            reference_taxonomy=self.taxartifact, classifier=self.classifier,
+            prefilter=True, read_orientation='same', randseed=1001)
+        result = result.view(pd.DataFrame)
+        obs = result.Taxon.to_dict()
+        exp = {'A': 'k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; '
+                    'o__Legionellales; f__; g__; s__',
+               'B': 'k__Bacteria; p__Chlorobi; c__; o__; f__; g__; s__',
+               'C': 'k__Bacteria; p__Bacteroidetes; c__Cytophagia; '
+                    'o__Cytophagales; f__Cyclobacteriaceae; g__; s__',
+               'D': 'k__Bacteria; p__Gemmatimonadetes; c__Gemm-5; o__; f__; '
+                    'g__; s__'}
+        self.assertDictEqual(obs, exp)
+
 
 class ImportBlastAssignmentTests(FeatureClassifierTestPluginBase):
 


=====================================
q2_feature_classifier/tests/test_taxonomic_classifier.py
=====================================
@@ -14,7 +14,7 @@ import os
 import shutil
 
 import sklearn
-from sklearn.externals import joblib
+import joblib
 from sklearn.pipeline import Pipeline
 from qiime2.sdk import Artifact
 from qiime2.plugins.feature_classifier.methods import \



View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/commit/ce3f2d71dc19f5e2778f83d61d5b0be27a8bee02
