[med-svn] [Git][med-team/q2-feature-classifier][upstream] New upstream version 2020.11.0
Andreas Tille
gitlab at salsa.debian.org
Thu Dec 3 13:23:29 GMT 2020
Andreas Tille pushed to branch upstream at Debian Med / q2-feature-classifier
Commits:
afac41d7 by Andreas Tille at 2020-12-03T14:20:01+01:00
New upstream version 2020.11.0
- - - - -
24 changed files:
- LICENSE
- ci/recipe/meta.yaml
- q2_feature_classifier/__init__.py
- q2_feature_classifier/_blast.py
- q2_feature_classifier/_consensus_assignment.py
- q2_feature_classifier/_cutter.py
- q2_feature_classifier/_skl.py
- q2_feature_classifier/_taxonomic_classifier.py
- q2_feature_classifier/_version.py
- q2_feature_classifier/_vsearch.py
- q2_feature_classifier/classifier.py
- q2_feature_classifier/custom.py
- q2_feature_classifier/plugin_setup.py
- q2_feature_classifier/tests/__init__.py
- + q2_feature_classifier/tests/data/dna-sequences-degenerate-primers.fasta
- + q2_feature_classifier/tests/data/dna-sequences-mixed.fasta
- + q2_feature_classifier/tests/data/dna-sequences-reverse.fasta
- q2_feature_classifier/tests/data/dna-sequences.fasta
- q2_feature_classifier/tests/test_classifier.py
- q2_feature_classifier/tests/test_consensus_assignment.py
- q2_feature_classifier/tests/test_custom.py
- q2_feature_classifier/tests/test_cutter.py
- q2_feature_classifier/tests/test_taxonomic_classifier.py
- setup.py
Changes:
=====================================
LICENSE
=====================================
@@ -1,6 +1,6 @@
BSD 3-Clause License
-Copyright (c) 2016-2019, QIIME 2 development team.
+Copyright (c) 2016-2020, QIIME 2 development team.
All rights reserved.
Redistribution and use in source and binary forms, with or without
=====================================
ci/recipe/meta.yaml
=====================================
@@ -19,7 +19,7 @@ requirements:
run:
- python {{ python }}
- - scikit-learn 0.21.2
+ - scikit-learn 0.23.1
- joblib
- scikit-bio
- biom-format >=2.1.5,<2.2.0
=====================================
q2_feature_classifier/__init__.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_feature_classifier/_blast.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -14,6 +14,16 @@ from .plugin_setup import plugin, citations
from ._consensus_assignment import (_consensus_assignments,
_get_default_unassignable_label)
+# ---------------------------------------------------------------
+# Reason for num_thread not being exposed.
+# BLAST doesn't allow threading when a subject is provided(As of 2/19/20).
+# num_thread was removed to prevent warning that stated:
+# "'Num_thread' is currently ignored when 'subject' is specified"(issue #77).
+# Seen here: https://github.com/qiime2/q2-feature-classifier/issues/77.
+# A '-subject' input is required in this function.
+# Therefore num_thread is not exposable.
+# ---------------------------------------------------------------
+
def classify_consensus_blast(
query: DNAFASTAFormat, reference_reads: DNAFASTAFormat,
=====================================
q2_feature_classifier/_consensus_assignment.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_feature_classifier/_cutter.py
=====================================
@@ -1,16 +1,21 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
-from itertools import chain
-
-from qiime2.plugin import Int, Str, Float, Range
-from q2_types.feature_data import FeatureData, Sequence, DNAIterator
import skbio
+import os
+from joblib import Parallel, delayed, effective_n_jobs
+
+from qiime2.plugin import Int, Str, Float, Range, Choices
+from q2_types.feature_data import (FeatureData, Sequence, DNAIterator,
+ DNASequencesDirectoryFormat)
+from q2_types.feature_data._format import DNAFASTAFormat
+from q2_feature_classifier._skl import _chunks
+from q2_feature_classifier.classifier import _autotune_reads_per_batch
from .plugin_setup import plugin
@@ -98,75 +103,117 @@ def _approx_match(seq, f_primer, r_primer, identity):
return None
-def _gen_reads(sequences, f_primer, r_primer, trunc_len, trim_left, identity,
- min_length, max_length):
+def _gen_reads(sequence, f_primer, r_primer, trim_right, trunc_len, trim_left,
+ identity, min_length, max_length, read_orientation):
f_primer = skbio.DNA(f_primer)
r_primer = skbio.DNA(r_primer)
- for seq in sequences:
- amp = _exact_match(seq, f_primer, r_primer)
- if not amp:
- amp = _exact_match(seq.reverse_complement(), f_primer, r_primer)
- if not amp:
- amp = _approx_match(seq, f_primer, r_primer, identity)
- if not amp:
- amp = _approx_match(
- seq.reverse_complement(), f_primer, r_primer, identity)
- if not amp:
- continue
- # we want to filter by max length before trimming
- if max_length > 0 and len(amp) > max_length:
- continue
- if trunc_len > 0:
- amp = amp[:trunc_len]
- if trim_left > 0:
- amp = amp[trim_left:]
- if min_length > 0 and len(amp) < min_length:
- continue
- if not amp:
- continue
- yield amp
-
-
-def extract_reads(sequences: DNAIterator, f_primer: str, r_primer: str,
+ amp = None
+ if read_orientation in ['forward', 'both']:
+ amp = _exact_match(sequence, f_primer, r_primer)
+ if not amp and read_orientation in ['reverse', 'both']:
+ amp = _exact_match(sequence.reverse_complement(), f_primer, r_primer)
+ if not amp and read_orientation in ['forward', 'both']:
+ amp = _approx_match(sequence, f_primer, r_primer, identity)
+ if not amp and read_orientation in ['reverse', 'both']:
+ amp = _approx_match(
+ sequence.reverse_complement(), f_primer, r_primer, identity)
+ if not amp:
+ return
+ # we want to filter by max length before trimming
+ if max_length > 0 and len(amp) > max_length:
+ return
+ if trim_right > 0:
+ amp = amp[:-trim_right]
+ if trunc_len > 0:
+ amp = amp[:trunc_len]
+ if trim_left > 0:
+ amp = amp[trim_left:]
+ if min_length > 0 and len(amp) < min_length:
+ return
+ if not amp:
+ return
+ return amp
+
+
+def extract_reads(sequences: DNASequencesDirectoryFormat, f_primer: str,
+ r_primer: str, trim_right: int = 0,
trunc_len: int = 0, trim_left: int = 0,
identity: float = 0.8, min_length: int = 50,
- max_length: int = 0) -> DNAIterator:
+ max_length: int = 0, n_jobs: int = 1,
+ batch_size: int = 'auto', read_orientation: str = 'both') \
+ -> DNAFASTAFormat:
"""Extract the read selected by a primer or primer pair. Only sequences
which match the primers at greater than the specified identity are returned
Parameters
----------
- sequences : DNAIterator
- an aligned list of skbio.sequence.DNA query sequences
+ sequences : DNASequencesDirectoryFormat
+ An aligned list of skbio.sequence.DNA query sequences
f_primer : skbio.sequence.DNA
- forward primer sequence
+ Forward primer sequence
r_primer : skbio.sequence.DNA
- reverse primer sequence
+ Reverse primer sequence
+ trim_right : int, optional
+ `trim_right` nucleotides are removed from the 3' end if trim_right is
+ positive. Applied before trunc_len.
trunc_len : int, optional
- read is cut to trunc_len if trunc_len is positive. Applied before
- trim_left.
+ Read is cut to trunc_len if trunc_len is positive. Applied after
+ trim_right.
trim_left : int, optional
- trim_left nucleotides are removed from the 5' end if trim_left is
- positive. Applied after trunc_len.
+ `trim_left` nucleotides are removed from the 5' end if trim_left is
+ positive. Applied after trim_right and trunc_len.
identity : float, optional
- minimum combined primer match identity threshold. Default: 0.8
+ Minimum combined primer match identity threshold. Default: 0.8
min_length: int, optional
Minimum amplicon length. Shorter amplicons are discarded. Default: 50
max_length: int, optional
Maximum amplicon length. Longer amplicons are discarded.
+ n_jobs: int, optional
+ Number of seperate processes to break the task into.
+ batch_size: int, optional
+ Number of samples to be processed in one batch.
+ read_orientation: str, optional
+ 'Orientation of primers relative to the sequences: "forward" searches '
+ 'for primer hits in the forward direction, "reverse" searches the '
+ 'reverse-complement, and "both" searches both directions.'
Returns
-------
- q2_types.DNAIterator
+ q2_types.DNAFASTAFormat
containing the reads
"""
- reads = _gen_reads(
- sequences, f_primer, r_primer, trunc_len, trim_left, identity,
- min_length, max_length)
- try:
- first_read = next(reads)
- except StopIteration:
- raise RuntimeError('No matches found') from None
- return DNAIterator(chain([first_read], reads))
+ if min_length > trunc_len - (trim_left + trim_right) and trunc_len > 0:
+ raise ValueError('The minimum length setting is greater than the '
+ 'length of the truncated sequences. This will cause '
+ 'all sequences to be removed from the dataset. To '
+ 'proceed, set '
+ 'min_length ≤ trunc_len - (trim_left + '
+ 'trim_right).')
+
+ n_jobs = effective_n_jobs(n_jobs)
+ if batch_size == 'auto':
+ batch_size = _autotune_reads_per_batch(
+ sequences.file.view(DNAFASTAFormat), n_jobs)
+ sequences = sequences.file.view(DNAIterator)
+ ff = DNAFASTAFormat()
+ with open(str(ff), 'a') as fh:
+ with Parallel(n_jobs) as parallel:
+ for chunk in _chunks(sequences, batch_size):
+ amplicons = parallel(delayed(_gen_reads)(sequence, f_primer,
+ r_primer,
+ trim_right,
+ trunc_len,
+ trim_left,
+ identity,
+ min_length,
+ max_length,
+ read_orientation)
+ for sequence in chunk)
+ for amplicon in amplicons:
+ if amplicon is not None:
+ skbio.write(amplicon, format='fasta', into=fh)
+ if os.stat(str(ff)).st_size == 0:
+ raise RuntimeError("No matches found")
+ return ff
plugin.methods.register_function(
@@ -174,34 +221,58 @@ plugin.methods.register_function(
inputs={'sequences': FeatureData[Sequence]},
parameters={'trunc_len': Int,
'trim_left': Int,
+ 'trim_right': Int,
'f_primer': Str,
'r_primer': Str,
'identity': Float,
'min_length': Int % Range(0, None),
- 'max_length': Int % Range(0, None)},
+ 'max_length': Int % Range(0, None),
+ 'n_jobs': Int % Range(1, None),
+ 'batch_size': Int % Range(1, None) | Str % Choices(['auto']),
+ 'read_orientation': Str % Choices(['both', 'forward',
+ 'reverse'])},
outputs=[('reads', FeatureData[Sequence])],
- name='Extract reads from reference',
- description='Extract sequencing-like reads from a reference database.',
- parameter_descriptions={'f_primer': 'forward primer sequence',
- 'r_primer': 'reverse primer sequence',
- 'trunc_len': 'read is cut to trunc_len if '
- 'trunc_len is positive. Applied '
- 'before trim_left.',
- 'trim_left': 'trim_left nucleotides are removed '
- 'from the 5\' end if trim_left is '
- 'positive. Applied after trunc_len.',
- 'identity': 'minimum combined primer match '
- 'identity threshold.',
- 'min_length': 'Minimum amplicon length. Shorter '
- 'amplicons are discarded. Applied '
- 'after trimming and truncation, so '
- 'be aware that trimming may impact '
- 'sequence retention. Set to zero '
- 'to disable min length filtering.',
- 'max_length': 'Maximum amplicon length. Longer '
- 'amplicons are discarded. Applied '
- 'before trimming and truncation, '
- 'so plan accordingly. Set to zero '
- '(default) to disable max length '
- 'filtering.'}
+ name='Extract reads from reference sequences.',
+ description='Extract simulated amplicon reads from a reference database. '
+ 'Performs in-silico PCR to extract simulated amplicons from '
+ 'reference sequences that match the input primer sequences '
+ '(within the mismatch threshold specified by `identity`). '
+ 'Both primer sequences must be in the 5\' -> 3\' orientation. '
+ 'Sequences that fail to match both primers will be excluded. '
+ 'Reads are extracted, trimmed, and filtered in the following '
+ 'order: 1. reads are extracted in specified orientation; 2. '
+ 'primers are removed; 3. reads longer than `max_length` are '
+ 'removed; 4. reads are trimmed with `trim_right`; 5. reads '
+ 'are truncated to `trunc_len`; 6. reads are trimmed with '
+ '`trim_left`; 7. reads shorter than `min_length` are removed.',
+ parameter_descriptions={
+ 'f_primer': 'forward primer sequence (5\' -> 3\').',
+ 'r_primer': 'reverse primer sequence (5\' -> 3\'). Do not use reverse-'
+ 'complemented primer sequence.',
+ 'trim_right': 'trim_right nucleotides are removed from the 3\' end if '
+ 'trim_right is positive. Applied before trunc_len and '
+ 'trim_left.',
+ 'trunc_len': 'read is cut to trunc_len if trunc_len is positive. '
+ 'Applied after trim_right but before trim_left.',
+ 'trim_left': 'trim_left nucleotides are removed from the 5\' end if '
+ 'trim_left is positive. Applied after trim_right and '
+ 'trunc_len.',
+ 'identity': 'minimum combined primer match identity threshold.',
+ 'min_length': 'Minimum amplicon length. Shorter amplicons are '
+ 'discarded. Applied after trimming and truncation, so '
+ 'be aware that trimming may impact sequence retention. '
+ 'Set to zero to disable min length filtering.',
+ 'max_length': 'Maximum amplicon length. Longer amplicons are '
+ 'discarded. Applied before trimming and truncation, '
+ 'so plan accordingly. Set to zero (default) to disable '
+ 'max length filtering.',
+ 'n_jobs': 'Number of seperate processes to run.',
+ 'batch_size': 'Number of sequences to process in a batch. The `auto` '
+ 'option is calculated from the number of sequences and '
+ 'number of jobs specified.',
+ 'read_orientation': 'Orientation of primers relative to the '
+ 'sequences: "forward" searches for primer hits in '
+ 'the forward direction, "reverse" searches '
+ 'reverse-complement, and "both" searches both '
+ 'directions.'}
)
=====================================
q2_feature_classifier/_skl.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -39,10 +39,12 @@ def _extract_reads(reads):
def predict(reads, pipeline, separator=';', chunk_size=262144, n_jobs=1,
pre_dispatch='2*n_jobs', confidence='disable'):
- return (m for c in Parallel(n_jobs=n_jobs, batch_size=1,
- pre_dispatch=pre_dispatch)
- (delayed(_predict_chunk)(pipeline, separator, confidence, chunk)
- for chunk in _chunks(reads, chunk_size)) for m in c)
+ jobs = (
+ delayed(_predict_chunk)(pipeline, separator, confidence, chunk)
+ for chunk in _chunks(reads, chunk_size))
+ workers = Parallel(n_jobs=n_jobs, batch_size=1, pre_dispatch=pre_dispatch)
+ for calculated in workers(jobs):
+ yield from calculated
def _predict_chunk(pipeline, separator, confidence, chunk):
=====================================
q2_feature_classifier/_taxonomic_classifier.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_feature_classifier/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2019.7.0)"
- git_full = "bc7ed3a741779ccb32114a229af65cc8c1510743"
- git_date = "2019-07-30 18:15:57 +0000"
+ git_refnames = " (HEAD -> master, tag: 2020.11.0)"
+ git_full = "fd229681228c2ae67a6daec121fa92fc80c8f33b"
+ git_date = "2020-11-25 17:16:54 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
q2_feature_classifier/_vsearch.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -33,19 +33,31 @@ def classify_consensus_vsearch(query: DNAFASTAFormat,
_get_default_unassignable_label(),
search_exact: bool = False,
top_hits_only: bool = False,
+ maxhits: int = 'all',
+ maxrejects: int = 'all',
+ output_no_hits: bool = True,
+ weak_id: float = 0.,
threads: str = 1) -> pd.DataFrame:
seqs_fp = str(query)
ref_fp = str(reference_reads)
if maxaccepts == 'all':
maxaccepts = 0
+ if maxrejects == 'all':
+ maxrejects = 0
cmd = ['vsearch', '--usearch_global', seqs_fp, '--id', str(perc_identity),
'--query_cov', str(query_cov), '--strand', strand, '--maxaccepts',
- str(maxaccepts), '--maxrejects', '0', '--output_no_hits', '--db',
- ref_fp, '--threads', str(threads)]
+ str(maxaccepts), '--maxrejects', str(maxrejects), '--db', ref_fp,
+ '--threads', str(threads)]
if search_exact:
cmd[1] = '--search_exact'
if top_hits_only:
cmd.append('--top_hits_only')
+ if output_no_hits:
+ cmd.append('--output_no_hits')
+ if weak_id > 0 and weak_id < perc_identity:
+ cmd.extend(['--weak_id', str(weak_id)])
+ if maxhits != 'all':
+ cmd.extend(['--maxhits', str(maxhits)])
cmd.append('--blast6out')
consensus = _consensus_assignments(
cmd, reference_taxonomy, min_consensus=min_consensus,
@@ -63,6 +75,8 @@ def classify_hybrid_vsearch_sklearn(ctx,
query_cov=0.8,
strand='both',
min_consensus=0.51,
+ maxhits='all',
+ maxrejects='all',
reads_per_batch=0,
confidence=0.7,
read_orientation='auto',
@@ -97,7 +111,8 @@ def classify_hybrid_vsearch_sklearn(ctx,
taxa1, = ccv(query=query, reference_reads=reference_reads,
reference_taxonomy=reference_taxonomy, maxaccepts=maxaccepts,
strand=strand, min_consensus=min_consensus,
- search_exact=True, threads=threads)
+ search_exact=True, threads=threads, maxhits=maxhits,
+ maxrejects=maxrejects, output_no_hits=True)
# Annotate taxonomic assignments with classification method
taxa1 = _annotate_method(taxa1, 'VSEARCH')
@@ -134,7 +149,9 @@ parameters = {'maxaccepts': Int % Range(1, None) | Str % Choices(['all']),
'strand': Str % Choices(['both', 'plus']),
'min_consensus': Float % Range(0.5, 1.0, inclusive_end=True,
inclusive_start=False),
- 'threads': Int % Range(1, None)}
+ 'threads': Int % Range(1, None),
+ 'maxhits': Int % Range(1, None) | Str % Choices(['all']),
+ 'maxrejects': Int % Range(1, None) | Str % Choices(['all'])}
inputs = {'query': FeatureData[Sequence],
'reference_reads': FeatureData[Sequence],
@@ -148,14 +165,33 @@ parameter_descriptions = {
'strand': 'Align against reference sequences in forward ("plus") '
'or both directions ("both").',
'maxaccepts': 'Maximum number of hits to keep for each query. Set to '
- '"all" to keep all hits > perc_identity similarity.',
+ '"all" to keep all hits > perc_identity similarity. Note '
+ 'that if strand=both, maxaccepts will keep N hits for each '
+ 'direction (if searches in the opposite direction yield '
+ 'results that exceed the minimum perc_identity). In those '
+ 'cases use maxhits to control the total number of hits '
+ 'returned. This option works in pair with maxrejects. '
+ 'The search process sorts target sequences by decreasing '
+ 'number of k-mers they have in common with the query '
+ 'sequence, using that information as a proxy for sequence '
+ 'similarity. After pairwise alignments, if the first target '
+ 'sequence passes the acceptation criteria, it is accepted '
+ 'as best hit and the search process stops for that query. '
+ 'If maxaccepts is set to a higher value, more hits are '
+ 'accepted. If maxaccepts and maxrejects are both set to '
+ '"all", the complete database is searched.',
'perc_identity': 'Reject match if percent identity to query is '
'lower.',
'query_cov': 'Reject match if query alignment coverage per high-'
'scoring pair is lower.',
'min_consensus': 'Minimum fraction of assignments must match top '
'hit to be accepted as consensus assignment.',
- 'threads': 'Number of threads to use for job parallelization.'}
+ 'threads': 'Number of threads to use for job parallelization.',
+ 'maxhits': 'Maximum number of hits to show once the search is terminated.',
+ 'maxrejects': 'Maximum number of non-matching target sequences to '
+ 'consider before stopping the search. This option works in '
+ 'pair with maxaccepts (see maxaccepts description for '
+ 'details).'}
outputs = [('classification', FeatureData[Taxonomy])]
@@ -168,7 +204,9 @@ plugin.methods.register_function(
parameters={**parameters,
'unassignable_label': Str,
'search_exact': Bool,
- 'top_hits_only': Bool},
+ 'top_hits_only': Bool,
+ 'output_no_hits': Bool,
+ 'weak_id': Float % Range(0.0, 1.0, inclusive_end=True)},
outputs=outputs,
input_descriptions=input_descriptions,
parameter_descriptions={
@@ -186,6 +224,22 @@ plugin.methods.register_function(
'identity. Multiple equally scored top hits will be '
'used for consensus taxonomic assignment if '
'maxaccepts is greater than 1.',
+ 'output_no_hits': 'Report both matching and non-matching queries. '
+ 'WARNING: always use the default setting for this '
+ 'option unless if you know what you are doing! If '
+ 'you set this option to False, your sequences and '
+ 'feature table will need to be filtered to exclude '
+ 'unclassified sequences, otherwise you may run into '
+ 'errors downstream from missing feature IDs.',
+ 'weak_id': 'Show hits with percentage of identity of at least N, '
+ 'without terminating the search. A normal search stops as '
+ 'soon as enough hits are found (as defined by maxaccepts, '
+ 'maxrejects, and perc_identity). As weak_id reports weak '
+ 'hits that are not deduced from maxaccepts, high '
+ 'perc_identity values can be used, hence preserving both '
+ 'speed and sensitivity. Logically, weak_id must be smaller '
+ 'than the value indicated by perc_identity, otherwise this '
+ 'option will be ignored.',
},
output_descriptions=output_descriptions,
name='VSEARCH-based consensus taxonomy classifier',
=====================================
q2_feature_classifier/classifier.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -203,24 +203,33 @@ def classify_sklearn(reads: DNAFASTAFormat, classifier: Pipeline,
pre_dispatch: str = '2*n_jobs', confidence: float = 0.7,
read_orientation: str = 'auto'
) -> pd.DataFrame:
- # autotune reads per batch
- if reads_per_batch == 0:
- reads_per_batch = _autotune_reads_per_batch(reads, n_jobs)
-
- # transform reads to DNAIterator
- reads = DNAIterator(
- skbio.read(str(reads), format='fasta', constructor=skbio.DNA))
-
- reads = _autodetect_orientation(
- reads, classifier, read_orientation=read_orientation)
- predictions = predict(reads, classifier, chunk_size=reads_per_batch,
- n_jobs=n_jobs, pre_dispatch=pre_dispatch,
- confidence=confidence)
- seq_ids, taxonomy, confidence = list(zip(*predictions))
- result = pd.DataFrame({'Taxon': taxonomy, 'Confidence': confidence},
- index=seq_ids, columns=['Taxon', 'Confidence'])
- result.index.name = 'Feature ID'
- return result
+ try:
+ # autotune reads per batch
+ if reads_per_batch == 0:
+ reads_per_batch = _autotune_reads_per_batch(reads, n_jobs)
+
+ # transform reads to DNAIterator
+ reads = DNAIterator(
+ skbio.read(str(reads), format='fasta', constructor=skbio.DNA))
+
+ reads = _autodetect_orientation(
+ reads, classifier, read_orientation=read_orientation)
+ predictions = predict(reads, classifier, chunk_size=reads_per_batch,
+ n_jobs=n_jobs, pre_dispatch=pre_dispatch,
+ confidence=confidence)
+ seq_ids, taxonomy, confidence = list(zip(*predictions))
+
+ result = pd.DataFrame({'Taxon': taxonomy, 'Confidence': confidence},
+ index=seq_ids, columns=['Taxon', 'Confidence'])
+ result.index.name = 'Feature ID'
+ return result
+ except MemoryError:
+ raise MemoryError("The operation has run out of available memory. "
+ "To correct this error:\n"
+ "1. Reduce the reads per batch\n"
+ "2. Reduce number of n_jobs being performed\n"
+ "3. Use a more powerful machine or allocate "
+ "more resources ")
_classify_parameters = {
=====================================
q2_feature_classifier/custom.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -96,7 +96,7 @@ class _MultioutputClassifier(BaseEstimator, ClassifierMixin):
self.separator = separator
def fit(self, X, y, **fit_params):
- y = list(zip(*[l.split(self.separator) for l in y]))
+ y = list(zip(*[label.split(self.separator) for label in y]))
self.encoders_ = [LabelEncoder() for _ in y]
y = [e.fit_transform(l) for e, l in zip(self.encoders_, y)]
self.base_estimator.fit(X, list(zip(*y)), **fit_params)
@@ -106,12 +106,12 @@ class _MultioutputClassifier(BaseEstimator, ClassifierMixin):
def classes_(self):
classes = [e.inverse_transform(l) for e, l in
zip(self.encoders_, zip(*self.base_estimator.classes_))]
- return [self.separator.join(l) for l in zip(*classes)]
+ return [self.separator.join(label) for label in zip(*classes)]
def predict(self, X):
y = self.base_estimator.predict(X).astype(int)
y = [e.inverse_transform(l) for e, l in zip(self.encoders_, y.T)]
- return [self.separator.join(l) for l in zip(*y)]
+ return [self.separator.join(label) for label in zip(*y)]
def predict_proba(self, X):
return self.base_estimator.predict_proba(X)
=====================================
q2_feature_classifier/plugin_setup.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_feature_classifier/tests/__init__.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_feature_classifier/tests/data/dna-sequences-degenerate-primers.fasta
=====================================
@@ -0,0 +1,10 @@
+>Sequence1
+ATATAACGTGCCGG
+>Sequence2
+ATATAAAGTGCCGG
+>Sequence3
+ATATAACCTGCCGG
+>Sequence4
+ATATAACGGGCCGG
+>Sequence5
+ATATAACTTGCCGG
=====================================
q2_feature_classifier/tests/data/dna-sequences-mixed.fasta
=====================================
@@ -0,0 +1,10 @@
+>Sequence1
+AGAGAACGTGCAGG
+>Sequence2
+CCTGCACTTTCTCT
+>Sequence3
+AGAGAACCTGCAGG
+>Sequence4
+AGAGAACGGGCAGG
+>Sequence5
+CCTGCAAGTTCTCT
=====================================
q2_feature_classifier/tests/data/dna-sequences-reverse.fasta
=====================================
@@ -0,0 +1,10 @@
+>Sequence1
+CCTGCACGTTCTCT
+>Sequence2
+CCTGCACTTTCTCT
+>Sequence3
+CCTGCAGGTTCTCT
+>Sequence4
+CCTGCCCGTTCTCT
+>Sequence5
+CCTGCAAGTTCTCT
=====================================
q2_feature_classifier/tests/data/dna-sequences.fasta
=====================================
The diff for this file was not included because it is too large.
=====================================
q2_feature_classifier/tests/test_classifier.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_feature_classifier/tests/test_consensus_assignment.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -77,6 +77,23 @@ class ConsensusAssignmentsTests(FeatureClassifierTestPluginBase):
right += tax[taxon].startswith(res[taxon])
self.assertGreater(right/len(res), 0.5)
+ # make sure weak_id and other parameters do not conflict with each other.
+ # This test just makes sure the command runs okay with all options.
+ # We are not in the business of debugging VSEARCH, but want to have this
+ # test as a canary in the coal mine.
+ def test_vsearch_the_works(self):
+ result = classify_consensus_vsearch(self.reads, self.reads,
+ self.taxonomy, top_hits_only=True,
+ maxhits=1, maxrejects=10,
+ weak_id=0.8, perc_identity=0.99,
+ output_no_hits=False)
+ res = result.Taxon.to_dict()
+ tax = self.taxonomy.to_dict()
+ right = 0.
+ for taxon in res:
+ right += tax[taxon].startswith(res[taxon])
+ self.assertGreater(right/len(res), 0.5)
+
class HybridClassiferTests(FeatureClassifierTestPluginBase):
package = 'q2_feature_classifier.tests'
=====================================
q2_feature_classifier/tests/test_custom.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_feature_classifier/tests/test_cutter.py
=====================================
@@ -1,18 +1,16 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
-import os
-from itertools import product, islice
-
import skbio
-from qiime2.plugins import feature_classifier
+
from qiime2.sdk import Artifact
-from q2_types.feature_data import FeatureData, Sequence, DNAIterator
+from qiime2.plugins.feature_classifier.actions import extract_reads
+from q2_types.feature_data._format import DNAFASTAFormat
from . import FeatureClassifierTestPluginBase
@@ -22,56 +20,134 @@ class CutterTests(FeatureClassifierTestPluginBase):
def setUp(self):
super().setUp()
- seqs = skbio.io.read(self.get_data_path('dna-sequences.fasta'),
- format='fasta', constructor=skbio.DNA)
- tmpseqs = os.path.join(self.temp_dir.name, 'temp-seqs.fasta')
- skbio.io.write((s for s in islice(seqs, 10)), 'fasta', tmpseqs)
- self.sequences = Artifact.import_data('FeatureData[Sequence]', tmpseqs)
-
- def test_extract_reads(self):
- # extract_reads should generate a FeatureData(Sequence) full of reads
- f_primer = 'AGAGTTTGATCMTGGCTCAG'
- r_primer = 'GCTGCCTCCCGTAGGAGT'
- extract_reads = feature_classifier.methods.extract_reads
- inseqs = list(self.sequences.view(DNAIterator))
-
- trunc_lens = 0, 75, 5
- trim_lefts = 0, 5
- raw_lens = []
- for trunc_len, trim_left in product(trunc_lens, trim_lefts):
- if trunc_len == trim_left and trunc_len > 0:
- with self.assertRaisesRegex(RuntimeError, "No matches found"):
- result = extract_reads(
- self.sequences, f_primer=f_primer, r_primer=r_primer,
- trunc_len=trunc_len, trim_left=trim_left, identity=0.9,
- min_length=0, max_length=0)
- continue
- result = extract_reads(
- self.sequences, f_primer=f_primer, r_primer=r_primer,
- trunc_len=trunc_len, trim_left=trim_left, identity=0.9,
- min_length=0, max_length=0)
- self.assertEqual(result.reads.type, FeatureData[Sequence])
- outseqs = list(result.reads.view(DNAIterator))
- self.assertGreater(len(outseqs), 0)
- self.assertLessEqual(len(outseqs), len(inseqs))
- self.assertTrue(bool(outseqs))
- if trunc_len != 0:
- for seq in outseqs:
- self.assertEqual(len(seq), trunc_len - trim_left)
- elif trim_left == 0:
- for seq in outseqs:
- raw_lens.append(len(seq))
- else:
- for seq, raw_len in zip(outseqs, raw_lens):
- self.assertEqual(len(seq), raw_len - trim_left)
- # test that length filtering is working
- with self.assertRaisesRegex(RuntimeError, "No matches found"):
- result = extract_reads(
- self.sequences, f_primer=f_primer, r_primer=r_primer,
- trunc_len=trunc_len, trim_left=trim_left, identity=0.9,
- min_length=500, max_length=0)
- with self.assertRaisesRegex(RuntimeError, "No matches found"):
- result = extract_reads(
- self.sequences, f_primer=f_primer, r_primer=r_primer,
- trunc_len=trunc_len, trim_left=trim_left, identity=0.9,
- min_length=0, max_length=20)
+ self.sequences = Artifact.import_data(
+ 'FeatureData[Sequence]',
+ self.get_data_path('dna-sequences.fasta'))
+
+ self.mixed_sequences = Artifact.import_data(
+ 'FeatureData[Sequence]',
+ self.get_data_path('dna-sequences-mixed.fasta'))
+
+ self.f_primer = 'AGAGA'
+ self.r_primer = 'GCTGC'
+
+ self.amplicons = ['ACGT', 'AAGT', 'ACCT', 'ACGG', 'ACTT']
+
+ def _test_results(self, results):
+ for i, result in enumerate(
+ skbio.io.read(str(results.reads.view(DNAFASTAFormat)),
+ format='fasta')):
+ self.assertEqual(str(result), self.amplicons[i])
+
+ def test_extract_reads_expected(self):
+ results = extract_reads(
+ self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+ min_length=4)
+
+ self._test_results(results)
+
+ def test_extract_reads_expected_forward(self):
+ results = extract_reads(
+ self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+ min_length=4, read_orientation='forward')
+
+ self._test_results(results)
+
+ def test_extract_mixed(self):
+ results = extract_reads(
+ self.mixed_sequences, f_primer=self.f_primer,
+ r_primer=self.r_primer, min_length=4)
+
+ self._test_results(results)
+
+ def test_extract_reads_expected_reverse(self):
+ reverse_sequences = Artifact.import_data(
+ 'FeatureData[Sequence]',
+ self.get_data_path('dna-sequences-reverse.fasta'))
+
+ results = extract_reads(
+ reverse_sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+ min_length=4, read_orientation='reverse')
+
+ self._test_results(results)
+
+ def test_extract_reads_manual_batch_size(self):
+ results = extract_reads(
+ self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+ min_length=4, batch_size=10)
+
+ self._test_results(results)
+
+ def test_extract_reads_two_jobs(self):
+ results = extract_reads(
+ self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+ min_length=4, n_jobs=2)
+
+ self._test_results(results)
+
+ def test_extract_reads_expected_degenerate_primers(self):
+ degenerate_f_primer = 'WWWWW'
+ degenerate_r_primer = 'SSSSS'
+
+ degenerate_sequences = Artifact.import_data(
+ 'FeatureData[Sequence]',
+ self.get_data_path('dna-sequences-degenerate-primers.fasta'))
+
+ results = extract_reads(
+ degenerate_sequences, f_primer=degenerate_f_primer,
+ r_primer=degenerate_r_primer, min_length=4)
+
+ self._test_results(results)
+
+ def test_extract_reads_expected_trim_right(self):
+ """Tests expected behavior of trim_right option"""
+ results = extract_reads(
+ self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+ min_length=3, trim_right=1)
+
+ for i, result in enumerate(
+ skbio.io.read(str(results.reads.view(DNAFASTAFormat)),
+ format='fasta')):
+ self.assertEqual(str(result), self.amplicons[i][:-1])
+
+ def test_extract_reads_fail_identity(self):
+ with self.assertRaisesRegex(RuntimeError, "No matches found"):
+ extract_reads(
+ self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+ min_length=4, identity=1)
+
+ def test_extract_reads_fail_min_length(self):
+ with self.assertRaisesRegex(RuntimeError, "No matches found"):
+ extract_reads(
+ self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+ min_length=5)
+
+ def test_extract_reads_fail_max_length(self):
+ with self.assertRaisesRegex(RuntimeError, "No matches found"):
+ extract_reads(
+ self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+ max_length=1)
+
+ def test_extract_reads_fail_trim_left_entire_read(self):
+ with self.assertRaisesRegex(RuntimeError, "No matches found"):
+ extract_reads(
+ self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+ trim_left=4)
+
+ def test_extract_reads_fail_trim_right_entire_read(self):
+ with self.assertRaisesRegex(RuntimeError, "No matches found"):
+ extract_reads(
+ self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+ trim_right=4)
+
+ def test_extract_reads_fail_trim_both_entire_read(self):
+ with self.assertRaisesRegex(RuntimeError, "No matches found"):
+ extract_reads(
+ self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+ trim_left=2, trim_right=2)
+
+ def test_extract_reads_fail_min_len_greater_than_trunc_len(self):
+ with self.assertRaisesRegex(ValueError, "minimum length setting"):
+ extract_reads(
+ self.sequences, f_primer=self.f_primer, r_primer=self.r_primer,
+ trunc_len=1)
=====================================
q2_feature_classifier/tests/test_taxonomic_classifier.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
setup.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/-/commit/afac41d751207aa3ea29a07de41817c81d6129ad
--
View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/-/commit/afac41d751207aa3ea29a07de41817c81d6129ad
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201203/6f34b066/attachment-0001.html>
More information about the debian-med-commit
mailing list