[med-svn] [Git][med-team/q2-feature-classifier][upstream] New upstream version 2020.11.0

Andreas Tille gitlab at salsa.debian.org
Thu Dec 3 13:23:29 GMT 2020



Andreas Tille pushed to branch upstream at Debian Med / q2-feature-classifier


Commits:
afac41d7 by Andreas Tille at 2020-12-03T14:20:01+01:00
New upstream version 2020.11.0
- - - - -


24 changed files:

- LICENSE
- ci/recipe/meta.yaml
- q2_feature_classifier/__init__.py
- q2_feature_classifier/_blast.py
- q2_feature_classifier/_consensus_assignment.py
- q2_feature_classifier/_cutter.py
- q2_feature_classifier/_skl.py
- q2_feature_classifier/_taxonomic_classifier.py
- q2_feature_classifier/_version.py
- q2_feature_classifier/_vsearch.py
- q2_feature_classifier/classifier.py
- q2_feature_classifier/custom.py
- q2_feature_classifier/plugin_setup.py
- q2_feature_classifier/tests/__init__.py
- + q2_feature_classifier/tests/data/dna-sequences-degenerate-primers.fasta
- + q2_feature_classifier/tests/data/dna-sequences-mixed.fasta
- + q2_feature_classifier/tests/data/dna-sequences-reverse.fasta
- q2_feature_classifier/tests/data/dna-sequences.fasta
- q2_feature_classifier/tests/test_classifier.py
- q2_feature_classifier/tests/test_consensus_assignment.py
- q2_feature_classifier/tests/test_custom.py
- q2_feature_classifier/tests/test_cutter.py
- q2_feature_classifier/tests/test_taxonomic_classifier.py
- setup.py


Changes:

=====================================
LICENSE
=====================================
@@ -1,6 +1,6 @@
 BSD 3-Clause License
 
-Copyright (c) 2016-2019, QIIME 2 development team.
+Copyright (c) 2016-2020, QIIME 2 development team.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without


=====================================
ci/recipe/meta.yaml
=====================================
@@ -19,7 +19,7 @@ requirements:
 
   run:
     - python {{ python }}
-    - scikit-learn 0.21.2
+    - scikit-learn 0.23.1
     - joblib
     - scikit-bio
     - biom-format >=2.1.5,<2.2.0


=====================================
q2_feature_classifier/__init__.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
q2_feature_classifier/_blast.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
@@ -14,6 +14,16 @@ from .plugin_setup import plugin, citations
 from ._consensus_assignment import (_consensus_assignments,
                                     _get_default_unassignable_label)
 
+# ---------------------------------------------------------------
+# Why num_threads is not exposed:
+# BLAST doesn't allow threading when a subject is provided (as of 2/19/20).
+# num_threads was removed to prevent a warning that stated:
+# "'Num_thread' is currently ignored when 'subject' is specified" (issue #77).
+# See https://github.com/qiime2/q2-feature-classifier/issues/77.
+# A '-subject' input is required in this function,
+# so num_threads cannot be exposed.
+# ---------------------------------------------------------------
+
 
 def classify_consensus_blast(
         query: DNAFASTAFormat, reference_reads: DNAFASTAFormat,


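Note: the constraint described in the comment above can be illustrated with a
minimal sketch of a blastn call that supplies -subject (a sketch assuming
standard BLAST+ flags; file paths are hypothetical):

    import subprocess

    cmd = [
        'blastn',
        '-query', 'query.fasta',        # hypothetical query path
        '-subject', 'reference.fasta',  # search a subject file, not a BLAST db
        '-outfmt', '6',                 # tabular (BLAST6) output
        # '-num_threads', '4',          # would be ignored when -subject is
        #                               # given, triggering the issue #77 warning
    ]
    subprocess.run(cmd, check=True)
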
=====================================
q2_feature_classifier/_consensus_assignment.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
q2_feature_classifier/_cutter.py
=====================================
@@ -1,16 +1,21 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-from itertools import chain
-
-from qiime2.plugin import Int, Str, Float, Range
-from q2_types.feature_data import FeatureData, Sequence, DNAIterator
 import skbio
+import os
+from joblib import Parallel, delayed, effective_n_jobs
+
+from qiime2.plugin import Int, Str, Float, Range, Choices
+from q2_types.feature_data import (FeatureData, Sequence, DNAIterator,
+                                   DNASequencesDirectoryFormat)
+from q2_types.feature_data._format import DNAFASTAFormat
+from q2_feature_classifier._skl import _chunks
+from q2_feature_classifier.classifier import _autotune_reads_per_batch
 
 from .plugin_setup import plugin
 
@@ -98,75 +103,117 @@ def _approx_match(seq, f_primer, r_primer, identity):
     return None
 
 
-def _gen_reads(sequences,  f_primer, r_primer, trunc_len, trim_left, identity,
-               min_length, max_length):
+def _gen_reads(sequence, f_primer, r_primer, trim_right, trunc_len, trim_left,
+               identity, min_length, max_length, read_orientation):
     f_primer = skbio.DNA(f_primer)
     r_primer = skbio.DNA(r_primer)
-    for seq in sequences:
-        amp = _exact_match(seq, f_primer, r_primer)
-        if not amp:
-            amp = _exact_match(seq.reverse_complement(), f_primer, r_primer)
-        if not amp:
-            amp = _approx_match(seq, f_primer, r_primer, identity)
-        if not amp:
-            amp = _approx_match(
-                seq.reverse_complement(), f_primer, r_primer, identity)
-        if not amp:
-            continue
-        # we want to filter by max length before trimming
-        if max_length > 0 and len(amp) > max_length:
-            continue
-        if trunc_len > 0:
-            amp = amp[:trunc_len]
-        if trim_left > 0:
-            amp = amp[trim_left:]
-        if min_length > 0 and len(amp) < min_length:
-            continue
-        if not amp:
-            continue
-        yield amp
-
-
-def extract_reads(sequences: DNAIterator,  f_primer: str, r_primer: str,
+    amp = None
+    if read_orientation in ['forward', 'both']:
+        amp = _exact_match(sequence, f_primer, r_primer)
+    if not amp and read_orientation in ['reverse', 'both']:
+        amp = _exact_match(sequence.reverse_complement(), f_primer, r_primer)
+    if not amp and read_orientation in ['forward', 'both']:
+        amp = _approx_match(sequence, f_primer, r_primer, identity)
+    if not amp and read_orientation in ['reverse', 'both']:
+        amp = _approx_match(
+            sequence.reverse_complement(), f_primer, r_primer, identity)
+    if not amp:
+        return
+    # we want to filter by max length before trimming
+    if max_length > 0 and len(amp) > max_length:
+        return
+    if trim_right > 0:
+        amp = amp[:-trim_right]
+    if trunc_len > 0:
+        amp = amp[:trunc_len]
+    if trim_left > 0:
+        amp = amp[trim_left:]
+    if min_length > 0 and len(amp) < min_length:
+        return
+    if not amp:
+        return
+    return amp
+
+
+def extract_reads(sequences: DNASequencesDirectoryFormat, f_primer: str,
+                  r_primer: str, trim_right: int = 0,
                   trunc_len: int = 0, trim_left: int = 0,
                   identity: float = 0.8, min_length: int = 50,
-                  max_length: int = 0) -> DNAIterator:
+                  max_length: int = 0, n_jobs: int = 1,
+                  batch_size: int = 'auto', read_orientation: str = 'both') \
+                  -> DNAFASTAFormat:
     """Extract the read selected by a primer or primer pair. Only sequences
     which match the primers at greater than the specified identity are returned
 
     Parameters
     ----------
-    sequences : DNAIterator
-        an aligned list of skbio.sequence.DNA query sequences
+    sequences : DNASequencesDirectoryFormat
+        An aligned list of skbio.sequence.DNA query sequences
     f_primer : skbio.sequence.DNA
-        forward primer sequence
+        Forward primer sequence
     r_primer : skbio.sequence.DNA
-        reverse primer sequence
+        Reverse primer sequence
+    trim_right : int, optional
+        `trim_right` nucleotides are removed from the 3' end if trim_right is
+        positive. Applied before trunc_len.
     trunc_len : int, optional
-        read is cut to trunc_len if trunc_len is positive. Applied before
-        trim_left.
+        Read is cut to trunc_len if trunc_len is positive. Applied after
+        trim_right.
     trim_left : int, optional
-        trim_left nucleotides are removed from the 5' end if trim_left is
-        positive. Applied after trunc_len.
+        `trim_left` nucleotides are removed from the 5' end if trim_left is
+        positive. Applied after trim_right and trunc_len.
     identity : float, optional
-        minimum combined primer match identity threshold. Default: 0.8
+        Minimum combined primer match identity threshold. Default: 0.8
     min_length: int, optional
         Minimum amplicon length. Shorter amplicons are discarded. Default: 50
     max_length: int, optional
         Maximum amplicon length. Longer amplicons are discarded.
+    n_jobs: int, optional
+        Number of separate processes to break the task into.
+    batch_size: int, optional
+        Number of sequences to be processed in one batch.
+    read_orientation: str, optional
+        Orientation of primers relative to the sequences: "forward" searches
+        for primer hits in the forward direction, "reverse" searches the
+        reverse-complement, and "both" searches both directions.
     Returns
     -------
-    q2_types.DNAIterator
+    q2_types.DNAFASTAFormat
         containing the reads
     """
-    reads = _gen_reads(
-        sequences, f_primer, r_primer, trunc_len, trim_left, identity,
-        min_length, max_length)
-    try:
-        first_read = next(reads)
-    except StopIteration:
-        raise RuntimeError('No matches found') from None
-    return DNAIterator(chain([first_read], reads))
+    if min_length > trunc_len - (trim_left + trim_right) and trunc_len > 0:
+        raise ValueError('The minimum length setting is greater than the '
+                         'length of the truncated sequences. This will cause '
+                         'all sequences to be removed from the dataset. To '
+                         'proceed, set '
+                         'min_length ≤ trunc_len - (trim_left + '
+                         'trim_right).')
+
+    n_jobs = effective_n_jobs(n_jobs)
+    if batch_size == 'auto':
+        batch_size = _autotune_reads_per_batch(
+            sequences.file.view(DNAFASTAFormat), n_jobs)
+    sequences = sequences.file.view(DNAIterator)
+    ff = DNAFASTAFormat()
+    with open(str(ff), 'a') as fh:
+        with Parallel(n_jobs) as parallel:
+            for chunk in _chunks(sequences, batch_size):
+                amplicons = parallel(delayed(_gen_reads)(sequence, f_primer,
+                                                         r_primer,
+                                                         trim_right,
+                                                         trunc_len,
+                                                         trim_left,
+                                                         identity,
+                                                         min_length,
+                                                         max_length,
+                                                         read_orientation)
+                                     for sequence in chunk)
+                for amplicon in amplicons:
+                    if amplicon is not None:
+                        skbio.write(amplicon, format='fasta', into=fh)
+    if os.stat(str(ff)).st_size == 0:
+        raise RuntimeError("No matches found")
+    return ff
 
 
 plugin.methods.register_function(
@@ -174,34 +221,58 @@ plugin.methods.register_function(
     inputs={'sequences': FeatureData[Sequence]},
     parameters={'trunc_len': Int,
                 'trim_left': Int,
+                'trim_right': Int,
                 'f_primer': Str,
                 'r_primer': Str,
                 'identity': Float,
                 'min_length': Int % Range(0, None),
-                'max_length': Int % Range(0, None)},
+                'max_length': Int % Range(0, None),
+                'n_jobs': Int % Range(1, None),
+                'batch_size': Int % Range(1, None) | Str % Choices(['auto']),
+                'read_orientation': Str % Choices(['both', 'forward',
+                                                   'reverse'])},
     outputs=[('reads', FeatureData[Sequence])],
-    name='Extract reads from reference',
-    description='Extract sequencing-like reads from a reference database.',
-    parameter_descriptions={'f_primer': 'forward primer sequence',
-                            'r_primer': 'reverse primer sequence',
-                            'trunc_len': 'read is cut to trunc_len if '
-                                         'trunc_len is positive. Applied '
-                                         'before trim_left.',
-                            'trim_left': 'trim_left nucleotides are removed '
-                                         'from the 5\' end if trim_left is '
-                                         'positive. Applied after trunc_len.',
-                            'identity': 'minimum combined primer match '
-                                        'identity threshold.',
-                            'min_length': 'Minimum amplicon length. Shorter '
-                                          'amplicons are discarded. Applied '
-                                          'after trimming and truncation, so '
-                                          'be aware that trimming may impact '
-                                          'sequence retention. Set to zero '
-                                          'to disable min length filtering.',
-                            'max_length': 'Maximum amplicon length. Longer '
-                                          'amplicons are discarded. Applied '
-                                          'before trimming and truncation, '
-                                          'so plan accordingly. Set to zero '
-                                          '(default) to disable max length '
-                                          'filtering.'}
+    name='Extract reads from reference sequences.',
+    description='Extract simulated amplicon reads from a reference database. '
+                'Performs in-silico PCR to extract simulated amplicons from '
+                'reference sequences that match the input primer sequences '
+                '(within the mismatch threshold specified by `identity`). '
+                'Both primer sequences must be in the 5\' -> 3\' orientation. '
+                'Sequences that fail to match both primers will be excluded. '
+                'Reads are extracted, trimmed, and filtered in the following '
+                'order: 1. reads are extracted in specified orientation; 2. '
+                'primers are removed; 3. reads longer than `max_length` are '
+                'removed; 4. reads are trimmed with `trim_right`; 5. reads '
+                'are truncated to `trunc_len`; 6. reads are trimmed with '
+                '`trim_left`; 7. reads shorter than `min_length` are removed.',
+    parameter_descriptions={
+        'f_primer': 'forward primer sequence (5\' -> 3\').',
+        'r_primer': 'reverse primer sequence (5\' -> 3\'). Do not use reverse-'
+                    'complemented primer sequence.',
+        'trim_right': 'trim_right nucleotides are removed from the 3\' end if '
+                      'trim_right is positive. Applied before trunc_len and '
+                      'trim_left.',
+        'trunc_len': 'read is cut to trunc_len if trunc_len is positive. '
+                     'Applied after trim_right but before trim_left.',
+        'trim_left': 'trim_left nucleotides are removed from the 5\' end if '
+                     'trim_left is positive. Applied after trim_right and '
+                     'trunc_len.',
+        'identity': 'minimum combined primer match identity threshold.',
+        'min_length': 'Minimum amplicon length. Shorter amplicons are '
+                      'discarded. Applied after trimming and truncation, so '
+                      'be aware that trimming may impact sequence retention. '
+                      'Set to zero to disable min length filtering.',
+        'max_length': 'Maximum amplicon length. Longer amplicons are '
+                      'discarded. Applied before trimming and truncation, '
+                      'so plan accordingly. Set to zero (default) to disable '
+                      'max length filtering.',
+        'n_jobs': 'Number of separate processes to run.',
+        'batch_size': 'Number of sequences to process in a batch. The `auto` '
+                      'option is calculated from the number of sequences and '
+                      'number of jobs specified.',
+        'read_orientation': 'Orientation of primers relative to the '
+                            'sequences: "forward" searches for primer hits in '
+                            'the forward direction, "reverse" searches '
+                            'reverse-complement, and "both" searches both '
+                            'directions.'}
 )
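
Note: a minimal usage sketch of the parallelized extract_reads action, via the
import path exercised in test_cutter.py below (file names are hypothetical;
the primers are illustrative 16S primers):

    from qiime2 import Artifact
    from qiime2.plugins.feature_classifier.actions import extract_reads

    seqs = Artifact.import_data('FeatureData[Sequence]', 'ref-seqs.fasta')
    results = extract_reads(
        seqs,
        f_primer='AGAGTTTGATCMTGGCTCAG',  # example forward primer (5'->3')
        r_primer='GCTGCCTCCCGTAGGAGT',    # example reverse primer (5'->3')
        n_jobs=2,                         # joblib worker processes
        batch_size='auto',                # autotuned from input size and n_jobs
        read_orientation='both')          # search both strands
    results.reads.save('extracted-reads.qza')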


=====================================
q2_feature_classifier/_skl.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
@@ -39,10 +39,12 @@ def _extract_reads(reads):
 
 def predict(reads, pipeline, separator=';', chunk_size=262144, n_jobs=1,
             pre_dispatch='2*n_jobs', confidence='disable'):
-    return (m for c in Parallel(n_jobs=n_jobs, batch_size=1,
-                                pre_dispatch=pre_dispatch)
-            (delayed(_predict_chunk)(pipeline, separator, confidence, chunk)
-             for chunk in _chunks(reads, chunk_size)) for m in c)
+    jobs = (
+        delayed(_predict_chunk)(pipeline, separator, confidence, chunk)
+        for chunk in _chunks(reads, chunk_size))
+    workers = Parallel(n_jobs=n_jobs, batch_size=1, pre_dispatch=pre_dispatch)
+    for calculated in workers(jobs):
+        yield from calculated
 
 
 def _predict_chunk(pipeline, separator, confidence, chunk):

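Note: the rewritten predict above replaces a nested generator expression with
joblib's chunk-and-flatten pattern. A self-contained toy sketch of the same
idea (assumes only joblib; _square_chunk is a hypothetical stand-in for
_predict_chunk):

    from itertools import islice
    from joblib import Parallel, delayed

    def _chunks(iterable, size):
        # yield successive batches of at most `size` items
        it = iter(iterable)
        while True:
            chunk = list(islice(it, size))
            if not chunk:
                return
            yield chunk

    def _square_chunk(chunk):
        # process one batch and return a list of results
        return [x * x for x in chunk]

    def parallel_map(items, chunk_size=4, n_jobs=2):
        jobs = (delayed(_square_chunk)(c) for c in _chunks(items, chunk_size))
        workers = Parallel(n_jobs=n_jobs, batch_size=1)
        for calculated in workers(jobs):
            yield from calculated  # flatten each chunk lazily, as predict() does

    print(list(parallel_map(range(10))))  # [0, 1, 4, ..., 81]
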

=====================================
q2_feature_classifier/_taxonomic_classifier.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
q2_feature_classifier/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
     # setup.py/versioneer.py will grep for the variable names, so they must
     # each be defined on a line of their own. _version.py will just call
     # get_keywords().
-    git_refnames = " (tag: 2019.7.0)"
-    git_full = "bc7ed3a741779ccb32114a229af65cc8c1510743"
-    git_date = "2019-07-30 18:15:57 +0000"
+    git_refnames = " (HEAD -> master, tag: 2020.11.0)"
+    git_full = "fd229681228c2ae67a6daec121fa92fc80c8f33b"
+    git_date = "2020-11-25 17:16:54 +0000"
     keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
     return keywords
 


=====================================
q2_feature_classifier/_vsearch.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
@@ -33,19 +33,31 @@ def classify_consensus_vsearch(query: DNAFASTAFormat,
                                _get_default_unassignable_label(),
                                search_exact: bool = False,
                                top_hits_only: bool = False,
+                               maxhits: int = 'all',
+                               maxrejects: int = 'all',
+                               output_no_hits: bool = True,
+                               weak_id: float = 0.,
                                threads: str = 1) -> pd.DataFrame:
     seqs_fp = str(query)
     ref_fp = str(reference_reads)
     if maxaccepts == 'all':
         maxaccepts = 0
+    if maxrejects == 'all':
+        maxrejects = 0
     cmd = ['vsearch', '--usearch_global', seqs_fp, '--id', str(perc_identity),
            '--query_cov', str(query_cov), '--strand', strand, '--maxaccepts',
-           str(maxaccepts), '--maxrejects', '0', '--output_no_hits', '--db',
-           ref_fp, '--threads', str(threads)]
+           str(maxaccepts), '--maxrejects', str(maxrejects), '--db', ref_fp,
+           '--threads', str(threads)]
     if search_exact:
         cmd[1] = '--search_exact'
     if top_hits_only:
         cmd.append('--top_hits_only')
+    if output_no_hits:
+        cmd.append('--output_no_hits')
+    if weak_id > 0 and weak_id < perc_identity:
+        cmd.extend(['--weak_id', str(weak_id)])
+    if maxhits != 'all':
+        cmd.extend(['--maxhits', str(maxhits)])
     cmd.append('--blast6out')
     consensus = _consensus_assignments(
         cmd, reference_taxonomy, min_consensus=min_consensus,
@@ -63,6 +75,8 @@ def classify_hybrid_vsearch_sklearn(ctx,
                                     query_cov=0.8,
                                     strand='both',
                                     min_consensus=0.51,
+                                    maxhits='all',
+                                    maxrejects='all',
                                     reads_per_batch=0,
                                     confidence=0.7,
                                     read_orientation='auto',
@@ -97,7 +111,8 @@ def classify_hybrid_vsearch_sklearn(ctx,
     taxa1, = ccv(query=query, reference_reads=reference_reads,
                  reference_taxonomy=reference_taxonomy, maxaccepts=maxaccepts,
                  strand=strand, min_consensus=min_consensus,
-                 search_exact=True, threads=threads)
+                 search_exact=True, threads=threads, maxhits=maxhits,
+                 maxrejects=maxrejects, output_no_hits=True)
 
     # Annotate taxonomic assignments with classification method
     taxa1 = _annotate_method(taxa1, 'VSEARCH')
@@ -134,7 +149,9 @@ parameters = {'maxaccepts': Int % Range(1, None) | Str % Choices(['all']),
               'strand': Str % Choices(['both', 'plus']),
               'min_consensus': Float % Range(0.5, 1.0, inclusive_end=True,
                                              inclusive_start=False),
-              'threads': Int % Range(1, None)}
+              'threads': Int % Range(1, None),
+              'maxhits': Int % Range(1, None) | Str % Choices(['all']),
+              'maxrejects': Int % Range(1, None) | Str % Choices(['all'])}
 
 inputs = {'query': FeatureData[Sequence],
           'reference_reads': FeatureData[Sequence],
@@ -148,14 +165,33 @@ parameter_descriptions = {
     'strand': 'Align against reference sequences in forward ("plus") '
               'or both directions ("both").',
     'maxaccepts': 'Maximum number of hits to keep for each query. Set to '
-                  '"all" to keep all hits > perc_identity similarity.',
+                  '"all" to keep all hits > perc_identity similarity. Note '
+                  'that if strand=both, maxaccepts will keep N hits for each '
+                  'direction (if searches in the opposite direction yield '
+                  'results that exceed the minimum perc_identity). In those '
+                  'cases use maxhits to control the total number of hits '
+                  'returned. This option works together with maxrejects. '
+                  'The search process sorts target sequences by decreasing '
+                  'number of k-mers they have in common with the query '
+                  'sequence, using that information as a proxy for sequence '
+                  'similarity. After pairwise alignments, if the first target '
+                  'sequence passes the acceptance criteria, it is accepted '
+                  'as best hit and the search process stops for that query. '
+                  'If maxaccepts is set to a higher value, more hits are '
+                  'accepted. If maxaccepts and maxrejects are both set to '
+                  '"all", the complete database is searched.',
     'perc_identity': 'Reject match if percent identity to query is '
                      'lower.',
     'query_cov': 'Reject match if query alignment coverage per high-'
                  'scoring pair is lower.',
     'min_consensus': 'Minimum fraction of assignments must match top '
                      'hit to be accepted as consensus assignment.',
-    'threads': 'Number of threads to use for job parallelization.'}
+    'threads': 'Number of threads to use for job parallelization.',
+    'maxhits': 'Maximum number of hits to show once the search is terminated.',
+    'maxrejects': 'Maximum number of non-matching target sequences to '
+                  'consider before stopping the search. This option works '
+                  'together with maxaccepts (see maxaccepts description for '
+                  'details).'}
 
 outputs = [('classification', FeatureData[Taxonomy])]
 
@@ -168,7 +204,9 @@ plugin.methods.register_function(
     parameters={**parameters,
                 'unassignable_label': Str,
                 'search_exact': Bool,
-                'top_hits_only': Bool},
+                'top_hits_only': Bool,
+                'output_no_hits': Bool,
+                'weak_id': Float % Range(0.0, 1.0, inclusive_end=True)},
     outputs=outputs,
     input_descriptions=input_descriptions,
     parameter_descriptions={
@@ -186,6 +224,22 @@ plugin.methods.register_function(
                          'identity. Multiple equally scored top hits will be '
                          'used for consensus taxonomic assignment if '
                          'maxaccepts is greater than 1.',
+        'output_no_hits': 'Report both matching and non-matching queries. '
+                          'WARNING: always use the default setting for this '
+                          'option unless you know what you are doing! If '
+                          'you set this option to False, your sequences and '
+                          'feature table will need to be filtered to exclude '
+                          'unclassified sequences, otherwise you may run into '
+                          'errors downstream from missing feature IDs.',
+        'weak_id': 'Show hits with a percentage identity of at least this '
+                   'value without terminating the search. A normal search '
+                   'stops as soon as enough hits are found (as defined by '
+                   'maxaccepts, maxrejects, and perc_identity). Because '
+                   'weak_id reports weak hits that do not count toward '
+                   'maxaccepts, high perc_identity values can be used while '
+                   'preserving both speed and sensitivity. weak_id must be '
+                   'smaller than perc_identity, otherwise this option is '
+                   'ignored.',
     },
     output_descriptions=output_descriptions,
     name='VSEARCH-based consensus taxonomy classifier',

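Note: with the new options enabled, the command assembled by
classify_consensus_vsearch takes roughly the following shape (a sketch; paths
and numeric values are hypothetical, and the real code appends --blast6out and
its destination separately):

    cmd = ['vsearch', '--usearch_global', 'query.fasta',
           '--id', '0.8',               # perc_identity
           '--query_cov', '0.8',
           '--strand', 'both',
           '--maxaccepts', '10',
           '--maxrejects', '32',        # 0 means unlimited ('all')
           '--db', 'ref.fasta',
           '--threads', '1',
           '--output_no_hits',          # keep unmatched queries in the output
           '--weak_id', '0.7',          # report weak hits below --id
           '--maxhits', '5']            # cap hits reported per query
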

=====================================
q2_feature_classifier/classifier.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
@@ -203,24 +203,33 @@ def classify_sklearn(reads: DNAFASTAFormat, classifier: Pipeline,
                      pre_dispatch: str = '2*n_jobs', confidence: float = 0.7,
                      read_orientation: str = 'auto'
                      ) -> pd.DataFrame:
-    # autotune reads per batch
-    if reads_per_batch == 0:
-        reads_per_batch = _autotune_reads_per_batch(reads, n_jobs)
-
-    # transform reads to DNAIterator
-    reads = DNAIterator(
-        skbio.read(str(reads), format='fasta', constructor=skbio.DNA))
-
-    reads = _autodetect_orientation(
-        reads, classifier, read_orientation=read_orientation)
-    predictions = predict(reads, classifier, chunk_size=reads_per_batch,
-                          n_jobs=n_jobs, pre_dispatch=pre_dispatch,
-                          confidence=confidence)
-    seq_ids, taxonomy, confidence = list(zip(*predictions))
-    result = pd.DataFrame({'Taxon': taxonomy, 'Confidence': confidence},
-                          index=seq_ids, columns=['Taxon', 'Confidence'])
-    result.index.name = 'Feature ID'
-    return result
+    try:
+        # autotune reads per batch
+        if reads_per_batch == 0:
+            reads_per_batch = _autotune_reads_per_batch(reads, n_jobs)
+
+        # transform reads to DNAIterator
+        reads = DNAIterator(
+            skbio.read(str(reads), format='fasta', constructor=skbio.DNA))
+
+        reads = _autodetect_orientation(
+            reads, classifier, read_orientation=read_orientation)
+        predictions = predict(reads, classifier, chunk_size=reads_per_batch,
+                              n_jobs=n_jobs, pre_dispatch=pre_dispatch,
+                              confidence=confidence)
+        seq_ids, taxonomy, confidence = list(zip(*predictions))
+
+        result = pd.DataFrame({'Taxon': taxonomy, 'Confidence': confidence},
+                              index=seq_ids, columns=['Taxon', 'Confidence'])
+        result.index.name = 'Feature ID'
+        return result
+    except MemoryError:
+        raise MemoryError("The operation has run out of available memory. "
+                          "To correct this error:\n"
+                          "1. Reduce the reads per batch\n"
+                          "2. Reduce number of n_jobs being performed\n"
+                          "3. Use a more powerful machine or allocate "
+                          "more resources ")
 
 
 _classify_parameters = {

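Note: the remedies listed in the MemoryError message map directly onto the
action's parameters. A hedged usage sketch (artifact file names are
hypothetical):

    from qiime2 import Artifact
    from qiime2.plugins.feature_classifier.actions import classify_sklearn

    reads = Artifact.load('rep-seqs.qza')         # hypothetical query reads
    classifier = Artifact.load('classifier.qza')  # hypothetical trained model
    # smaller batches and fewer jobs trade speed for a lower memory footprint
    result = classify_sklearn(reads, classifier,
                              reads_per_batch=1000, n_jobs=1,
                              confidence=0.7, read_orientation='auto')
    result.classification.save('taxonomy.qza')
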

=====================================
q2_feature_classifier/custom.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
@@ -96,7 +96,7 @@ class _MultioutputClassifier(BaseEstimator, ClassifierMixin):
         self.separator = separator
 
     def fit(self, X, y, **fit_params):
-        y = list(zip(*[l.split(self.separator) for l in y]))
+        y = list(zip(*[label.split(self.separator) for label in y]))
         self.encoders_ = [LabelEncoder() for _ in y]
         y = [e.fit_transform(l) for e, l in zip(self.encoders_, y)]
         self.base_estimator.fit(X, list(zip(*y)), **fit_params)
@@ -106,12 +106,12 @@ class _MultioutputClassifier(BaseEstimator, ClassifierMixin):
     def classes_(self):
         classes = [e.inverse_transform(l) for e, l in
                    zip(self.encoders_, zip(*self.base_estimator.classes_))]
-        return [self.separator.join(l) for l in zip(*classes)]
+        return [self.separator.join(label) for label in zip(*classes)]
 
     def predict(self, X):
         y = self.base_estimator.predict(X).astype(int)
         y = [e.inverse_transform(l) for e, l in zip(self.encoders_, y.T)]
-        return [self.separator.join(l) for l in zip(*y)]
+        return [self.separator.join(label) for label in zip(*y)]
 
     def predict_proba(self, X):
         return self.base_estimator.predict_proba(X)

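Note: the separator-joined multi-output encoding used by
_MultioutputClassifier can be verified in isolation. A toy round-trip using
only scikit-learn:

    from sklearn.preprocessing import LabelEncoder

    labels = ['k__Bacteria;p__Firmicutes', 'k__Bacteria;p__Proteobacteria']
    # split labels into ranks, transpose to one column per taxonomic level
    ranks = list(zip(*[label.split(';') for label in labels]))
    encoders = [LabelEncoder() for _ in ranks]
    encoded = [e.fit_transform(list(r)) for e, r in zip(encoders, ranks)]
    # invert each level and re-join to recover the original label strings
    decoded = [e.inverse_transform(c) for e, c in zip(encoders, encoded)]
    assert [';'.join(label) for label in zip(*decoded)] == labels
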

=====================================
q2_feature_classifier/plugin_setup.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
q2_feature_classifier/tests/__init__.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
q2_feature_classifier/tests/data/dna-sequences-degenerate-primers.fasta
=====================================
@@ -0,0 +1,10 @@
+>Sequence1
+ATATAACGTGCCGG
+>Sequence2
+ATATAAAGTGCCGG
+>Sequence3
+ATATAACCTGCCGG
+>Sequence4
+ATATAACGGGCCGG
+>Sequence5
+ATATAACTTGCCGG


=====================================
q2_feature_classifier/tests/data/dna-sequences-mixed.fasta
=====================================
@@ -0,0 +1,10 @@
+>Sequence1
+AGAGAACGTGCAGG
+>Sequence2
+CCTGCACTTTCTCT
+>Sequence3
+AGAGAACCTGCAGG
+>Sequence4
+AGAGAACGGGCAGG
+>Sequence5
+CCTGCAAGTTCTCT


=====================================
q2_feature_classifier/tests/data/dna-sequences-reverse.fasta
=====================================
@@ -0,0 +1,10 @@
+>Sequence1
+CCTGCACGTTCTCT
+>Sequence2
+CCTGCACTTTCTCT
+>Sequence3
+CCTGCAGGTTCTCT
+>Sequence4
+CCTGCCCGTTCTCT
+>Sequence5
+CCTGCAAGTTCTCT


=====================================
q2_feature_classifier/tests/data/dna-sequences.fasta
=====================================
The diff for this file was not included because it is too large.

=====================================
q2_feature_classifier/tests/test_classifier.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
q2_feature_classifier/tests/test_consensus_assignment.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
@@ -77,6 +77,23 @@ class ConsensusAssignmentsTests(FeatureClassifierTestPluginBase):
             right += tax[taxon].startswith(res[taxon])
         self.assertGreater(right/len(res), 0.5)
 
+    # Make sure weak_id and other parameters do not conflict with each other.
+    # This test just makes sure the command runs okay with all options.
+    # We are not in the business of debugging VSEARCH, but want to have this
+    # test as a canary in the coal mine.
+    def test_vsearch_the_works(self):
+        result = classify_consensus_vsearch(self.reads, self.reads,
+                                            self.taxonomy, top_hits_only=True,
+                                            maxhits=1, maxrejects=10,
+                                            weak_id=0.8, perc_identity=0.99,
+                                            output_no_hits=False)
+        res = result.Taxon.to_dict()
+        tax = self.taxonomy.to_dict()
+        right = 0.
+        for taxon in res:
+            right += tax[taxon].startswith(res[taxon])
+        self.assertGreater(right/len(res), 0.5)
+
 
 class HybridClassiferTests(FeatureClassifierTestPluginBase):
     package = 'q2_feature_classifier.tests'


=====================================
q2_feature_classifier/tests/test_custom.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
q2_feature_classifier/tests/test_cutter.py
=====================================
@@ -1,18 +1,16 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-import os
-from itertools import product, islice
-
 import skbio
-from qiime2.plugins import feature_classifier
+
 from qiime2.sdk import Artifact
-from q2_types.feature_data import FeatureData, Sequence, DNAIterator
+from qiime2.plugins.feature_classifier.actions import extract_reads
+from q2_types.feature_data._format import DNAFASTAFormat
 
 from . import FeatureClassifierTestPluginBase
 
@@ -22,56 +20,134 @@ class CutterTests(FeatureClassifierTestPluginBase):
 
     def setUp(self):
         super().setUp()
-        seqs = skbio.io.read(self.get_data_path('dna-sequences.fasta'),
-                             format='fasta', constructor=skbio.DNA)
-        tmpseqs = os.path.join(self.temp_dir.name, 'temp-seqs.fasta')
-        skbio.io.write((s for s in islice(seqs, 10)), 'fasta', tmpseqs)
-        self.sequences = Artifact.import_data('FeatureData[Sequence]', tmpseqs)
-
-    def test_extract_reads(self):
-        # extract_reads should generate a FeatureData(Sequence) full of reads
-        f_primer = 'AGAGTTTGATCMTGGCTCAG'
-        r_primer = 'GCTGCCTCCCGTAGGAGT'
-        extract_reads = feature_classifier.methods.extract_reads
-        inseqs = list(self.sequences.view(DNAIterator))
-
-        trunc_lens = 0, 75, 5
-        trim_lefts = 0, 5
-        raw_lens = []
-        for trunc_len, trim_left in product(trunc_lens, trim_lefts):
-            if trunc_len == trim_left and trunc_len > 0:
-                with self.assertRaisesRegex(RuntimeError, "No matches found"):
-                    result = extract_reads(
-                        self.sequences, f_primer=f_primer, r_primer=r_primer,
-                        trunc_len=trunc_len, trim_left=trim_left, identity=0.9,
-                        min_length=0, max_length=0)
-                continue
-            result = extract_reads(
-                self.sequences, f_primer=f_primer, r_primer=r_primer,
-                trunc_len=trunc_len, trim_left=trim_left, identity=0.9,
-                min_length=0, max_length=0)
-            self.assertEqual(result.reads.type, FeatureData[Sequence])
-            outseqs = list(result.reads.view(DNAIterator))
-            self.assertGreater(len(outseqs), 0)
-            self.assertLessEqual(len(outseqs), len(inseqs))
-            self.assertTrue(bool(outseqs))
-            if trunc_len != 0:
-                for seq in outseqs:
-                    self.assertEqual(len(seq), trunc_len - trim_left)
-            elif trim_left == 0:
-                for seq in outseqs:
-                    raw_lens.append(len(seq))
-            else:
-                for seq, raw_len in zip(outseqs, raw_lens):
-                    self.assertEqual(len(seq), raw_len - trim_left)
-            # test that length filtering is working
-            with self.assertRaisesRegex(RuntimeError, "No matches found"):
-                result = extract_reads(
-                    self.sequences, f_primer=f_primer, r_primer=r_primer,
-                    trunc_len=trunc_len, trim_left=trim_left, identity=0.9,
-                    min_length=500, max_length=0)
-            with self.assertRaisesRegex(RuntimeError, "No matches found"):
-                result = extract_reads(
-                    self.sequences, f_primer=f_primer, r_primer=r_primer,
-                    trunc_len=trunc_len, trim_left=trim_left, identity=0.9,
-                    min_length=0, max_length=20)
+        self.sequences = Artifact.import_data(
+            'FeatureData[Sequence]',
+            self.get_data_path('dna-sequences.fasta'))
+
+        self.mixed_sequences = Artifact.import_data(
+            'FeatureData[Sequence]',
+            self.get_data_path('dna-sequences-mixed.fasta'))
+
+        self.f_primer = 'AGAGA'
+        self.r_primer = 'GCTGC'
+
+        self.amplicons = ['ACGT', 'AAGT', 'ACCT', 'ACGG', 'ACTT']
+
+    def _test_results(self, results):
+        for i, result in enumerate(
+                skbio.io.read(str(results.reads.view(DNAFASTAFormat)),
+                              format='fasta')):
+            self.assertEqual(str(result), self.amplicons[i])
+
+    def test_extract_reads_expected(self):
+        results = extract_reads(
+            self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+            min_length=4)
+
+        self._test_results(results)
+
+    def test_extract_reads_expected_forward(self):
+        results = extract_reads(
+            self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+            min_length=4, read_orientation='forward')
+
+        self._test_results(results)
+
+    def test_extract_mixed(self):
+        results = extract_reads(
+            self.mixed_sequences, f_primer=self.f_primer,
+            r_primer=self.r_primer, min_length=4)
+
+        self._test_results(results)
+
+    def test_extract_reads_expected_reverse(self):
+        reverse_sequences = Artifact.import_data(
+            'FeatureData[Sequence]',
+            self.get_data_path('dna-sequences-reverse.fasta'))
+
+        results = extract_reads(
+            reverse_sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+            min_length=4, read_orientation='reverse')
+
+        self._test_results(results)
+
+    def test_extract_reads_manual_batch_size(self):
+        results = extract_reads(
+            self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+            min_length=4, batch_size=10)
+
+        self._test_results(results)
+
+    def test_extract_reads_two_jobs(self):
+        results = extract_reads(
+            self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+            min_length=4, n_jobs=2)
+
+        self._test_results(results)
+
+    def test_extract_reads_expected_degenerate_primers(self):
+        degenerate_f_primer = 'WWWWW'
+        degenerate_r_primer = 'SSSSS'
+
+        degenerate_sequences = Artifact.import_data(
+            'FeatureData[Sequence]',
+            self.get_data_path('dna-sequences-degenerate-primers.fasta'))
+
+        results = extract_reads(
+            degenerate_sequences, f_primer=degenerate_f_primer,
+            r_primer=degenerate_r_primer, min_length=4)
+
+        self._test_results(results)
+
+    def test_extract_reads_expected_trim_right(self):
+        """Tests expected behavior of trim_right option"""
+        results = extract_reads(
+            self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+            min_length=3, trim_right=1)
+
+        for i, result in enumerate(
+                skbio.io.read(str(results.reads.view(DNAFASTAFormat)),
+                              format='fasta')):
+            self.assertEqual(str(result), self.amplicons[i][:-1])
+
+    def test_extract_reads_fail_identity(self):
+        with self.assertRaisesRegex(RuntimeError, "No matches found"):
+            extract_reads(
+                self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+                min_length=4, identity=1)
+
+    def test_extract_reads_fail_min_length(self):
+        with self.assertRaisesRegex(RuntimeError, "No matches found"):
+            extract_reads(
+                self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+                min_length=5)
+
+    def test_extract_reads_fail_max_length(self):
+        with self.assertRaisesRegex(RuntimeError, "No matches found"):
+            extract_reads(
+                self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+                max_length=1)
+
+    def test_extract_reads_fail_trim_left_entire_read(self):
+        with self.assertRaisesRegex(RuntimeError, "No matches found"):
+            extract_reads(
+                self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+                trim_left=4)
+
+    def test_extract_reads_fail_trim_right_entire_read(self):
+        with self.assertRaisesRegex(RuntimeError, "No matches found"):
+            extract_reads(
+                self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+                trim_right=4)
+
+    def test_extract_reads_fail_trim_both_entire_read(self):
+        with self.assertRaisesRegex(RuntimeError, "No matches found"):
+            extract_reads(
+                self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+                trim_left=2, trim_right=2)
+
+    def test_extract_reads_fail_min_len_greater_than_trunc_len(self):
+        with self.assertRaisesRegex(ValueError, "minimum length setting"):
+            extract_reads(
+                self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+                trunc_len=1)


=====================================
q2_feature_classifier/tests/test_taxonomic_classifier.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
setup.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #



View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/-/commit/afac41d751207aa3ea29a07de41817c81d6129ad
