[med-svn] [Git][med-team/q2-feature-classifier][upstream] New upstream version 2023.9.0

Andreas Tille (@tille) gitlab at salsa.debian.org
Tue Jan 30 19:00:43 GMT 2024



Andreas Tille pushed to branch upstream at Debian Med / q2-feature-classifier


Commits:
eb8b39a7 by Andreas Tille at 2024-01-30T19:56:55+01:00
New upstream version 2023.9.0
- - - - -


8 changed files:

- ci/recipe/meta.yaml
- q2_feature_classifier/__init__.py
- q2_feature_classifier/_blast.py
- q2_feature_classifier/_version.py
- q2_feature_classifier/tests/test_consensus_assignment.py
- + q2_feature_classifier/types/__init__.py
- + q2_feature_classifier/types/_format.py
- + q2_feature_classifier/types/_type.py


Changes:

=====================================
ci/recipe/meta.yaml
=====================================
@@ -22,7 +22,7 @@ requirements:
     - joblib
     - scikit-bio {{ scikit_bio }}
     - biom-format {{ biom_format }}
-    - blast >=2.6.0
+    - blast >=2.13.0
     - vsearch
     - qiime2 {{ qiime2_epoch }}.*
     - q2-types {{ qiime2_epoch }}.*


=====================================
q2_feature_classifier/__init__.py
=====================================
@@ -12,6 +12,7 @@ from ._version import get_versions
 __version__ = get_versions()['version']
 del get_versions
 
+importlib.import_module('q2_feature_classifier.types')
 importlib.import_module('q2_feature_classifier.classifier')
 importlib.import_module('q2_feature_classifier._cutter')
 importlib.import_module('q2_feature_classifier._blast')


=====================================
q2_feature_classifier/_blast.py
=====================================
@@ -6,11 +6,14 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
+import os
+import warnings
 import subprocess
 import pandas as pd
 from q2_types.feature_data import (
     FeatureData, Taxonomy, Sequence, DNAFASTAFormat, DNAIterator, BLAST6,
     BLAST6Format)
+from .types import BLASTDBDirFmtV5, BLASTDB
 from qiime2.plugin import Int, Str, Float, Choices, Range, Bool
 from .plugin_setup import plugin, citations
 from ._consensus_assignment import (
@@ -36,6 +39,7 @@ DEFAULTSTRAND = 'both'
 DEFAULTEVALUE = 0.001
 DEFAULTMINCONSENSUS = 0.51
 DEFAULTOUTPUTNOHITS = True
+DEFAULTNUMTHREADS = 1
 
 
 # NOTE FOR THE FUTURE: should this be called blastn? would it be possible to
@@ -46,23 +50,40 @@ DEFAULTOUTPUTNOHITS = True
 # expose different parameters etc? My feeling is let's call this `blast` for
 # now and then cross that bridge when we come to it.
 def blast(query: DNAFASTAFormat,
-          reference_reads: DNAFASTAFormat,
+          reference_reads: DNAFASTAFormat = None,
+          blastdb: BLASTDBDirFmtV5 = None,
           maxaccepts: int = DEFAULTMAXACCEPTS,
           perc_identity: float = DEFAULTPERCENTID,
           query_cov: float = DEFAULTQUERYCOV,
           strand: str = DEFAULTSTRAND,
           evalue: float = DEFAULTEVALUE,
-          output_no_hits: bool = DEFAULTOUTPUTNOHITS) -> pd.DataFrame:
+          output_no_hits: bool = DEFAULTOUTPUTNOHITS,
+          num_threads: int = DEFAULTNUMTHREADS) -> pd.DataFrame:
+    if reference_reads and blastdb:
+        raise ValueError('Only one reference_reads or blastdb artifact '
+                         'can be provided as input. Choose one and try '
+                         'again.')
     perc_identity = perc_identity * 100
     query_cov = query_cov * 100
     seqs_fp = str(query)
-    ref_fp = str(reference_reads)
     # TODO: generalize to support other blast types?
     output = BLAST6Format()
     cmd = ['blastn', '-query', seqs_fp, '-evalue', str(evalue), '-strand',
-           strand, '-outfmt', '6', '-subject', ref_fp, '-perc_identity',
-           str(perc_identity), '-qcov_hsp_perc', str(query_cov),
+           strand, '-outfmt', '6', '-perc_identity', str(perc_identity),
+           '-qcov_hsp_perc', str(query_cov), '-num_threads', str(num_threads),
            '-max_target_seqs', str(maxaccepts), '-out', str(output)]
+    if reference_reads:
+        cmd.extend(['-subject', str(reference_reads)])
+        if num_threads > 1:
+            warnings.warn('The num_threads parameters is only compatible '
+                          'when using a pre-indexed blastdb. The num_threads '
+                          'is ignored when reference_reads are provided as '
+                          'input.', UserWarning)
+    elif blastdb:
+        cmd.extend(['-db', os.path.join(blastdb.path, blastdb.get_basename())])
+    else:
+        raise ValueError('Either reference_reads or a blastdb must be '
+                         'provided as input.')
     _run_command(cmd)
     # load as dataframe to quickly validate (note: will fail now if empty)
     result = output.view(pd.DataFrame)
@@ -76,19 +97,18 @@ def blast(query: DNAFASTAFormat,
         if len(missing_ids) > 0:
             # we will mirror vsearch behavior and annotate unassigneds as '*'
             # and fill other columns with 0 values (np.nan makes format error).
-            missed = pd.DataFrame(columns=result.columns)
-            missed = missed.append(
-                [{'qseqid': i, 'sseqid': '*'} for i in missing_ids]).fillna(0)
-            # Do two separate appends to make sure that fillna does not alter
-            # any other contents from the original search results.
-            result = result.append(missed, ignore_index=True)
+            missed = pd.DataFrame(
+                [{'qseqid': i, 'sseqid': '*'} for i in missing_ids],
+                columns=result.columns).fillna(0)
+            result = pd.concat([result, missed], axis=0)
     return result
 
 
 def classify_consensus_blast(ctx,
                              query,
-                             reference_reads,
                              reference_taxonomy,
+                             blastdb=None,
+                             reference_reads=None,
                              maxaccepts=DEFAULTMAXACCEPTS,
                              perc_identity=DEFAULTPERCENTID,
                              query_cov=DEFAULTQUERYCOV,
@@ -96,13 +116,15 @@ def classify_consensus_blast(ctx,
                              evalue=DEFAULTEVALUE,
                              output_no_hits=DEFAULTOUTPUTNOHITS,
                              min_consensus=DEFAULTMINCONSENSUS,
-                             unassignable_label=DEFAULTUNASSIGNABLELABEL):
+                             unassignable_label=DEFAULTUNASSIGNABLELABEL,
+                             num_threads=DEFAULTNUMTHREADS):
     search_db = ctx.get_action('feature_classifier', 'blast')
     lca = ctx.get_action('feature_classifier', 'find_consensus_annotation')
-    result, = search_db(query=query, reference_reads=reference_reads,
+    result, = search_db(query=query, blastdb=blastdb,
+                        reference_reads=reference_reads,
                         maxaccepts=maxaccepts, perc_identity=perc_identity,
                         query_cov=query_cov, strand=strand, evalue=evalue,
-                        output_no_hits=output_no_hits)
+                        output_no_hits=output_no_hits, num_threads=num_threads)
     consensus, = lca(search_results=result,
                      reference_taxonomy=reference_taxonomy,
                      min_consensus=min_consensus,
@@ -113,6 +135,15 @@ def classify_consensus_blast(ctx,
     return consensus, result
 
 
+def makeblastdb(sequences: DNAFASTAFormat) -> BLASTDBDirFmtV5:
+    database = BLASTDBDirFmtV5()
+    build_cmd = ['makeblastdb', '-blastdb_version', '5', '-dbtype', 'nucl',
+                 '-title', 'blastdb', '-in', str(sequences),
+                 '-out', os.path.join(str(database.path), 'blastdb')]
+    _run_command(build_cmd)
+    return database
+
+
 # Replace this function with QIIME2 API for wrapping commands/binaries,
 # pending https://github.com/qiime2/qiime2/issues/224
 def _run_command(cmd, verbose=True):
@@ -128,10 +159,14 @@ def _run_command(cmd, verbose=True):
 
 
 inputs = {'query': FeatureData[Sequence],
+          'blastdb': BLASTDB,
           'reference_reads': FeatureData[Sequence]}
 
 input_descriptions = {'query': 'Query sequences.',
-                      'reference_reads': 'Reference sequences.'}
+                      'blastdb': 'BLAST indexed database. Incompatible with '
+                      'reference_reads.',
+                      'reference_reads': 'Reference sequences. Incompatible '
+                      'with blastdb.'}
 
 classification_output = ('classification', FeatureData[Taxonomy])
 
@@ -144,6 +179,7 @@ parameters = {'evalue': Float,
               'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
               'strand': Str % Choices(['both', 'plus', 'minus']),
               'output_no_hits': Bool,
+              'num_threads': Int % Range(1, None),
               }
 
 parameter_descriptions = {
@@ -170,6 +206,7 @@ parameter_descriptions = {
                       'unclassified sequences, otherwise you may run into '
                       'errors downstream from missing feature IDs. Set to '
                       'FALSE to mirror default BLAST search.',
+    'num_threads': 'Number of threads (CPUs) to use in the BLAST search.'
 }
 
 blast6_output = ('search_results', FeatureData[BLAST6])
@@ -177,6 +214,20 @@ blast6_output = ('search_results', FeatureData[BLAST6])
 blast6_output_description = {'search_results': 'Top hits for each query.'}
 
 
+plugin.methods.register_function(
+    function=makeblastdb,
+    inputs={'sequences': FeatureData[Sequence]},
+    parameters={},
+    outputs=[('database', BLASTDB)],
+    input_descriptions={'sequences': 'Input reference sequences.'},
+    parameter_descriptions={},
+    output_descriptions={'database': 'Output BLAST database.'},
+    name='Make BLAST database.',
+    description=('Make BLAST database from custom sequence collection.'),
+    citations=[citations['camacho2009blast+']]
+)
+
+
 # Note: name should be changed to blastn if we do NOT generalize this function
 plugin.methods.register_function(
     function=blast,


=====================================
q2_feature_classifier/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
     # setup.py/versioneer.py will grep for the variable names, so they must
     # each be defined on a line of their own. _version.py will just call
     # get_keywords().
-    git_refnames = " (tag: 2023.7.0, Release-2023.7)"
-    git_full = "ed298ae345a893a9efc01aaa304d60cf00944edf"
-    git_date = "2023-08-17 19:44:12 +0000"
+    git_refnames = " (tag: 2023.9.0, Release-2023.9)"
+    git_full = "04ef68622bc5721f234cf220caf508e7e096bc08"
+    git_date = "2023-10-03 21:57:16 +0000"
     keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
     return keywords
 


=====================================
q2_feature_classifier/tests/test_consensus_assignment.py
=====================================
@@ -31,6 +31,20 @@ class SequenceSearchTests(FeatureClassifierTestPluginBase):
             'FeatureData[Sequence]',
             self.get_data_path('se-dna-sequences.fasta'))
 
+    # The blastdb format is not documented in enough detail to validate
+    # so for now we just run together with blastn to validate.
+    def test_makeblastdb_and_blast(self):
+        db, = qfc.actions.makeblastdb(self.ref)
+        print(db)
+        result1, = qfc.actions.blast(self.query, blastdb=db)
+        result2, = qfc.actions.blast(self.query, self.ref)
+        pdt.assert_frame_equal(result1.view(pd.DataFrame),
+                               result2.view(pd.DataFrame))
+        with self.assertRaisesRegex(ValueError, "Only one.*can be provided"):
+            qfc.actions.blast(self.query, reference_reads=self.ref, blastdb=db)
+        with self.assertRaisesRegex(ValueError, "Either.*must be provided"):
+            qfc.actions.blast(self.query)
+
     def test_blast(self):
         result, = qfc.actions.blast(
             self.query, self.ref, maxaccepts=3, perc_identity=0.9)
@@ -164,7 +178,8 @@ class ConsensusAssignmentsTests(FeatureClassifierTestPluginBase):
     # search and/or taxonomy classification outputs.
     def test_classify_consensus_blast(self):
         result, _, = qfc.actions.classify_consensus_blast(
-            self.reads, self.reads, self.taxonomy)
+            query=self.reads, reference_reads=self.reads,
+            reference_taxonomy=self.taxonomy)
         self.assertTrue(series_is_subset(self.exp, result.view(pd.Series)))
 
     def test_classify_consensus_vsearch(self):


=====================================
q2_feature_classifier/types/__init__.py
=====================================
@@ -0,0 +1,13 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from ._format import BLASTDBFileFmtV5, BLASTDBDirFmtV5
+from ._type import BLASTDB
+
+
+__all__ = ['BLASTDBFileFmtV5', 'BLASTDBDirFmtV5', 'BLASTDB']


=====================================
q2_feature_classifier/types/_format.py
=====================================
@@ -0,0 +1,61 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import os
+import itertools
+from qiime2.plugin import model
+from ..plugin_setup import plugin, citations
+
+
+class BLASTDBFileFmtV5(model.BinaryFileFormat):
+    # We do not have a good way to validate the individual blastdb files.
+    # TODO: could wire up `blastdbcheck` to do a deep check when level=max
+    # but this must be done on the directory, not individual files.
+    # For now validation be done at the DirFmt level on file extensions.
+    def _validate_(self, level):
+        pass
+
+
+class BLASTDBDirFmtV5(model.DirectoryFormat):
+    # TODO: is there a more robust way to do this/make some files optional?
+    # Some file extensions were introduced with more recent versions of
+    # blast, but are not actually needed for our purposes. Making these
+    # optional would allow more flexibility in blast versions, avoiding
+    # possible dependency conflicts.
+    # NOTE that the .n?? extensions are also nucleotide database specific.
+    # should we rather call the type/formats BLASTNucDB*?
+    idx1 = model.File(r'.+\.ndb', format=BLASTDBFileFmtV5)
+    idx2 = model.File(r'.+\.nhr', format=BLASTDBFileFmtV5)
+    idx3 = model.File(r'.+\.nin', format=BLASTDBFileFmtV5)
+    idx4 = model.File(r'.+\.not', format=BLASTDBFileFmtV5)
+    idx5 = model.File(r'.+\.nsq', format=BLASTDBFileFmtV5)
+    idx6 = model.File(r'.+\.ntf', format=BLASTDBFileFmtV5)
+    idx7 = model.File(r'.+\.nto', format=BLASTDBFileFmtV5)
+    # introducted in blast 2.13.0
+    # https://ncbiinsights.ncbi.nlm.nih.gov/2022/03/29/blast-2-13-0/
+    idx8 = model.File(r'.+\.njs', format=BLASTDBFileFmtV5)
+
+    # borrowed from q2-types
+    def get_basename(self):
+        paths = [str(x.relative_to(self.path)) for x in self.path.iterdir()]
+        prefix = os.path.splitext(_get_prefix(paths))[0]
+        return prefix
+
+
+# SO: https://stackoverflow.com/a/6718380/579416
+def _get_prefix(strings):
+    def all_same(x):
+        return all(x[0] == y for y in x)
+
+    char_tuples = zip(*strings)
+    prefix_tuples = itertools.takewhile(all_same, char_tuples)
+    return ''.join(x[0] for x in prefix_tuples)
+
+
+plugin.register_views(BLASTDBDirFmtV5,
+                      citations=[citations['camacho2009blast+']])


=====================================
q2_feature_classifier/types/_type.py
=====================================
@@ -0,0 +1,17 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from qiime2.plugin import SemanticType
+from . import BLASTDBDirFmtV5
+from ..plugin_setup import plugin
+
+
+BLASTDB = SemanticType('BLASTDB')
+
+plugin.register_semantic_types(BLASTDB)
+plugin.register_artifact_class(BLASTDB, BLASTDBDirFmtV5)



View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/-/commit/eb8b39a7ce58a54cab55ae9193f7fb656a2481af

-- 
View it on GitLab: https://salsa.debian.org/med-team/q2-feature-classifier/-/commit/eb8b39a7ce58a54cab55ae9193f7fb656a2481af
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240130/ce323a35/attachment-0001.htm>


More information about the debian-med-commit mailing list