[med-svn] [Git][med-team/q2-feature-classifier][upstream] New upstream version 2023.9.0
Andreas Tille (@tille)
gitlab at salsa.debian.org
Tue Jan 30 19:00:43 GMT 2024
Andreas Tille pushed to branch upstream at Debian Med / q2-feature-classifier
Commits:
eb8b39a7 by Andreas Tille at 2024-01-30T19:56:55+01:00
New upstream version 2023.9.0
- - - - -
8 changed files:
- ci/recipe/meta.yaml
- q2_feature_classifier/__init__.py
- q2_feature_classifier/_blast.py
- q2_feature_classifier/_version.py
- q2_feature_classifier/tests/test_consensus_assignment.py
- + q2_feature_classifier/types/__init__.py
- + q2_feature_classifier/types/_format.py
- + q2_feature_classifier/types/_type.py
Changes:
=====================================
ci/recipe/meta.yaml
=====================================
@@ -22,7 +22,7 @@ requirements:
- joblib
- scikit-bio {{ scikit_bio }}
- biom-format {{ biom_format }}
- - blast >=2.6.0
+ - blast >=2.13.0
- vsearch
- qiime2 {{ qiime2_epoch }}.*
- q2-types {{ qiime2_epoch }}.*
=====================================
q2_feature_classifier/__init__.py
=====================================
@@ -12,6 +12,7 @@ from ._version import get_versions
__version__ = get_versions()['version']
del get_versions
+importlib.import_module('q2_feature_classifier.types')
importlib.import_module('q2_feature_classifier.classifier')
importlib.import_module('q2_feature_classifier._cutter')
importlib.import_module('q2_feature_classifier._blast')
=====================================
q2_feature_classifier/_blast.py
=====================================
@@ -6,11 +6,14 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
+import os
+import warnings
import subprocess
import pandas as pd
from q2_types.feature_data import (
FeatureData, Taxonomy, Sequence, DNAFASTAFormat, DNAIterator, BLAST6,
BLAST6Format)
+from .types import BLASTDBDirFmtV5, BLASTDB
from qiime2.plugin import Int, Str, Float, Choices, Range, Bool
from .plugin_setup import plugin, citations
from ._consensus_assignment import (
@@ -36,6 +39,7 @@ DEFAULTSTRAND = 'both'
DEFAULTEVALUE = 0.001
DEFAULTMINCONSENSUS = 0.51
DEFAULTOUTPUTNOHITS = True
+DEFAULTNUMTHREADS = 1
# NOTE FOR THE FUTURE: should this be called blastn? would it be possible to
@@ -46,23 +50,40 @@ DEFAULTOUTPUTNOHITS = True
# expose different parameters etc? My feeling is let's call this `blast` for
# now and then cross that bridge when we come to it.
def blast(query: DNAFASTAFormat,
- reference_reads: DNAFASTAFormat,
+ reference_reads: DNAFASTAFormat = None,
+ blastdb: BLASTDBDirFmtV5 = None,
maxaccepts: int = DEFAULTMAXACCEPTS,
perc_identity: float = DEFAULTPERCENTID,
query_cov: float = DEFAULTQUERYCOV,
strand: str = DEFAULTSTRAND,
evalue: float = DEFAULTEVALUE,
- output_no_hits: bool = DEFAULTOUTPUTNOHITS) -> pd.DataFrame:
+ output_no_hits: bool = DEFAULTOUTPUTNOHITS,
+ num_threads: int = DEFAULTNUMTHREADS) -> pd.DataFrame:
+ if reference_reads and blastdb:
+ raise ValueError('Only one reference_reads or blastdb artifact '
+ 'can be provided as input. Choose one and try '
+ 'again.')
perc_identity = perc_identity * 100
query_cov = query_cov * 100
seqs_fp = str(query)
- ref_fp = str(reference_reads)
# TODO: generalize to support other blast types?
output = BLAST6Format()
cmd = ['blastn', '-query', seqs_fp, '-evalue', str(evalue), '-strand',
- strand, '-outfmt', '6', '-subject', ref_fp, '-perc_identity',
- str(perc_identity), '-qcov_hsp_perc', str(query_cov),
+ strand, '-outfmt', '6', '-perc_identity', str(perc_identity),
+ '-qcov_hsp_perc', str(query_cov), '-num_threads', str(num_threads),
'-max_target_seqs', str(maxaccepts), '-out', str(output)]
+ if reference_reads:
+ cmd.extend(['-subject', str(reference_reads)])
+ if num_threads > 1:
+ warnings.warn('The num_threads parameters is only compatible '
+ 'when using a pre-indexed blastdb. The num_threads '
+ 'is ignored when reference_reads are provided as '
+ 'input.', UserWarning)
+ elif blastdb:
+ cmd.extend(['-db', os.path.join(blastdb.path, blastdb.get_basename())])
+ else:
+ raise ValueError('Either reference_reads or a blastdb must be '
+ 'provided as input.')
_run_command(cmd)
# load as dataframe to quickly validate (note: will fail now if empty)
result = output.view(pd.DataFrame)
@@ -76,19 +97,18 @@ def blast(query: DNAFASTAFormat,
if len(missing_ids) > 0:
# we will mirror vsearch behavior and annotate unassigneds as '*'
# and fill other columns with 0 values (np.nan makes format error).
- missed = pd.DataFrame(columns=result.columns)
- missed = missed.append(
- [{'qseqid': i, 'sseqid': '*'} for i in missing_ids]).fillna(0)
- # Do two separate appends to make sure that fillna does not alter
- # any other contents from the original search results.
- result = result.append(missed, ignore_index=True)
+ missed = pd.DataFrame(
+ [{'qseqid': i, 'sseqid': '*'} for i in missing_ids],
+ columns=result.columns).fillna(0)
+ result = pd.concat([result, missed], axis=0)
return result
def classify_consensus_blast(ctx,
query,
- reference_reads,
reference_taxonomy,
+ blastdb=None,
+ reference_reads=None,
maxaccepts=DEFAULTMAXACCEPTS,
perc_identity=DEFAULTPERCENTID,
query_cov=DEFAULTQUERYCOV,
@@ -96,13 +116,15 @@ def classify_consensus_blast(ctx,
evalue=DEFAULTEVALUE,
output_no_hits=DEFAULTOUTPUTNOHITS,
min_consensus=DEFAULTMINCONSENSUS,
- unassignable_label=DEFAULTUNASSIGNABLELABEL):
+ unassignable_label=DEFAULTUNASSIGNABLELABEL,
+ num_threads=DEFAULTNUMTHREADS):
search_db = ctx.get_action('feature_classifier', 'blast')
lca = ctx.get_action('feature_classifier', 'find_consensus_annotation')
- result, = search_db(query=query, reference_reads=reference_reads,
+ result, = search_db(query=query, blastdb=blastdb,
+ reference_reads=reference_reads,
maxaccepts=maxaccepts, perc_identity=perc_identity,
query_cov=query_cov, strand=strand, evalue=evalue,
- output_no_hits=output_no_hits)
+ output_no_hits=output_no_hits, num_threads=num_threads)
consensus, = lca(search_results=result,
reference_taxonomy=reference_taxonomy,
min_consensus=min_consensus,
@@ -113,6 +135,15 @@ def classify_consensus_blast(ctx,
return consensus, result
+def makeblastdb(sequences: DNAFASTAFormat) -> BLASTDBDirFmtV5:
+ database = BLASTDBDirFmtV5()
+ build_cmd = ['makeblastdb', '-blastdb_version', '5', '-dbtype', 'nucl',
+ '-title', 'blastdb', '-in', str(sequences),
+ '-out', os.path.join(str(database.path), 'blastdb')]
+ _run_command(build_cmd)
+ return database
+
+
# Replace this function with QIIME2 API for wrapping commands/binaries,
# pending https://github.com/qiime2/qiime2/issues/224
def _run_command(cmd, verbose=True):
@@ -128,10 +159,14 @@ def _run_command(cmd, verbose=True):
inputs = {'query': FeatureData[Sequence],
+ 'blastdb': BLASTDB,
'reference_reads': FeatureData[Sequence]}
input_descriptions = {'query': 'Query sequences.',
- 'reference_reads': 'Reference sequences.'}
+ 'blastdb': 'BLAST indexed database. Incompatible with '
+ 'reference_reads.',
+ 'reference_reads': 'Reference sequences. Incompatible '
+ 'with blastdb.'}
classification_output = ('classification', FeatureData[Taxonomy])
@@ -144,6 +179,7 @@ parameters = {'evalue': Float,
'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
'strand': Str % Choices(['both', 'plus', 'minus']),
'output_no_hits': Bool,
+ 'num_threads': Int % Range(1, None),
}
parameter_descriptions = {
@@ -170,6 +206,7 @@ parameter_descriptions = {
'unclassified sequences, otherwise you may run into '
'errors downstream from missing feature IDs. Set to '
'FALSE to mirror default BLAST search.',
+ 'num_threads': 'Number of threads (CPUs) to use in the BLAST search.'
}
blast6_output = ('search_results', FeatureData[BLAST6])
@@ -177,6 +214,20 @@ blast6_output = ('search_results', FeatureData[BLAST6])
blast6_output_description = {'search_results': 'Top hits for each query.'}
+plugin.methods.register_function(
+ function=makeblastdb,
+ inputs={'sequences': FeatureData[Sequence]},
+ parameters={},
+ outputs=[('database', BLASTDB)],
+ input_descriptions={'sequences': 'Input reference sequences.'},
+ parameter_descriptions={},
+ output_descriptions={'database': 'Output BLAST database.'},
+ name='Make BLAST database.',
+ description=('Make BLAST database from custom sequence collection.'),
+ citations=[citations['camacho2009blast+']]
+)
+
+
# Note: name should be changed to blastn if we do NOT generalize this function
plugin.methods.register_function(
function=blast,
=====================================
q2_feature_classifier/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2023.7.0, Release-2023.7)"
- git_full = "ed298ae345a893a9efc01aaa304d60cf00944edf"
- git_date = "2023-08-17 19:44:12 +0000"
+ git_refnames = " (tag: 2023.9.0, Release-2023.9)"
+ git_full = "04ef68622bc5721f234cf220caf508e7e096bc08"
+ git_date = "2023-10-03 21:57:16 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
q2_feature_classifier/tests/test_consensus_assignment.py
=====================================
@@ -31,6 +31,20 @@ class SequenceSearchTests(FeatureClassifierTestPluginBase):
'FeatureData[Sequence]',
self.get_data_path('se-dna-sequences.fasta'))
+ # The blastdb format is not documented in enough detail to validate
+ # so for now we just run together with blastn to validate.
+ def test_makeblastdb_and_blast(self):
+ db, = qfc.actions.makeblastdb(self.ref)
+ print(db)
+ result1, = qfc.actions.blast(self.query, blastdb=db)
+ result2, = qfc.actions.blast(self.query, self.ref)
+ pdt.assert_frame_equal(result1.view(pd.DataFrame),
+ result2.view(pd.DataFrame))
+ with self.assertRaisesRegex(ValueError, "Only one.*can be provided"):
+ qfc.actions.blast(self.query, reference_reads=self.ref, blastdb=db)
+ with self.assertRaisesRegex(ValueError, "Either.*must be provided"):
+ qfc.actions.blast(self.query)
+
def test_blast(self):
result, = qfc.actions.blast(
self.query, self.ref, maxaccepts=3, perc_identity=0.9)
@@ -164,7 +178,8 @@ class ConsensusAssignmentsTests(FeatureClassifierTestPluginBase):
# search and/or taxonomy classification outputs.
def test_classify_consensus_blast(self):
result, _, = qfc.actions.classify_consensus_blast(
- self.reads, self.reads, self.taxonomy)
+ query=self.reads, reference_reads=self.reads,
+ reference_taxonomy=self.taxonomy)
self.assertTrue(series_is_subset(self.exp, result.view(pd.Series)))
def test_classify_consensus_vsearch(self):
=====================================
q2_feature_classifier/types/__init__.py
=====================================
@@ -0,0 +1,13 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from ._format import BLASTDBFileFmtV5, BLASTDBDirFmtV5
+from ._type import BLASTDB
+
+
+__all__ = ['BLASTDBFileFmtV5', 'BLASTDBDirFmtV5', 'BLASTDB']
=====================================
q2_feature_classifier/types/_format.py
=====================================
@@ -0,0 +1,61 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import os
+import itertools
+from qiime2.plugin import model
+from ..plugin_setup import plugin, citations
+
+
+class BLASTDBFileFmtV5(model.BinaryFileFormat):
+ # We do not have a good way to validate the individual blastdb files.
+ # TODO: could wire up `blastdbcheck` to do a deep check when level=max
+ # but this must be done on the directory, not individual files.
+ # For now validation be done at the DirFmt level on file extensions.
+ def _validate_(self, level):
+ pass
+
+
+class BLASTDBDirFmtV5(model.DirectoryFormat):
+ # TODO: is there a more robust way to do this/make some files optional?
+ # Some file extensions were introduced with more recent versions of
+ # blast, but are not actually needed for our purposes. Making these
+ # optional would allow more flexibility in blast versions, avoiding
+ # possible dependency conflicts.
+ # NOTE that the .n?? extensions are also nucleotide database specific.
+ # should we rather call the type/formats BLASTNucDB*?
+ idx1 = model.File(r'.+\.ndb', format=BLASTDBFileFmtV5)
+ idx2 = model.File(r'.+\.nhr', format=BLASTDBFileFmtV5)
+ idx3 = model.File(r'.+\.nin', format=BLASTDBFileFmtV5)
+ idx4 = model.File(r'.+\.not', format=BLASTDBFileFmtV5)
+ idx5 = model.File(r'.+\.nsq', format=BLASTDBFileFmtV5)
+ idx6 = model.File(r'.+\.ntf', format=BLASTDBFileFmtV5)
+ idx7 = model.File(r'.+\.nto', format=BLASTDBFileFmtV5)
+ # introducted in blast 2.13.0
+ # https://ncbiinsights.ncbi.nlm.nih.gov/2022/03/29/blast-2-13-0/
+ idx8 = model.File(r'.+\.njs', format=BLASTDBFileFmtV5)
+
+ # borrowed from q2-types
+ def get_basename(self):
+ paths = [str(x.relative_to(self.path)) for x in self.path.iterdir()]
+ prefix = os.path.splitext(_get_prefix(paths))[0]
+ return prefix
+
+
+# SO: https://stackoverflow.com/a/6718380/579416
+def _get_prefix(strings):
+ def all_same(x):
+ return all(x[0] == y for y in x)
+
+ char_tuples = zip(*strings)
+ prefix_tuples = itertools.takewhile(all_same, char_tuples)
+ return ''.join(x[0] for x in prefix_tuples)
+
+
+plugin.register_views(BLASTDBDirFmtV5,
+ citations=[citations['camacho2009blast+']])
=====================================
q2_feature_classifier/types/_type.py
=====================================
@@ -0,0 +1,17 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from qiime2.plugin import SemanticType
+from . import BLASTDBDirFmtV5
+from ..plugin_setup import plugin
+
+
+BLASTDB = SemanticType('BLASTDB')
+
+plugin.register_semantic_types(BLASTDB)
+plugin.register_artifact_class(BLASTDB, BLASTDBDirFmtV5)
View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/-/commit/eb8b39a7ce58a54cab55ae9193f7fb656a2481af
--
View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/-/commit/eb8b39a7ce58a54cab55ae9193f7fb656a2481af
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240130/ce323a35/attachment-0001.htm>
More information about the debian-med-commit
mailing list