[med-svn] [Git][med-team/q2-alignment][upstream] New upstream version 2020.11.1

Sat Jan 23 08:14:44 GMT 2021


Andreas Tille pushed to branch upstream at Debian Med / q2-alignment


Commits:
c8e3e2fc by Andreas Tille at 2021-01-23T08:11:46+01:00
New upstream version 2020.11.1
- - - - -


17 changed files:

- .github/ISSUE_TEMPLATE/6-where-to-go.md
- .github/SUPPORT.md
- LICENSE
- ci/recipe/meta.yaml
- q2_alignment/__init__.py
- q2_alignment/_filter.py
- q2_alignment/_mafft.py
- q2_alignment/_version.py
- q2_alignment/plugin_setup.py
- q2_alignment/tests/__init__.py
- + q2_alignment/tests/data/aligned-dna-sequences-1.fasta
- + q2_alignment/tests/data/aligned-duplicate-ids-1.fasta
- + q2_alignment/tests/data/aligned-duplicate-ids-2.fasta
- + q2_alignment/tests/data/aligned-long-ids.fasta
- q2_alignment/tests/test_filter.py
- q2_alignment/tests/test_mafft.py
- setup.py


Changes:

=====================================
.github/ISSUE_TEMPLATE/6-where-to-go.md
=====================================
@@ -59,6 +59,9 @@ Sorted alphabetically by repo name.
 - The q2-diversity plugin
   https://github.com/qiime2/q2-diversity/issues
 
+- The q2-diversity-lib plugin
+  https://github.com/qiime2/q2-diversity-lib/issues
+
 - The q2-emperor plugin
   https://github.com/qiime2/q2-emperor/issues
 


=====================================
.github/SUPPORT.md
=====================================
@@ -52,6 +52,8 @@ Sorted alphabetically by repo name.
   | The q2-demux plugin
 - [q2-diversity](https://github.com/qiime2/q2-diversity/issues)
   | The q2-diversity plugin
+- [q2-diversity-lib](https://github.com/qiime2/q2-diversity-lib/issues)
+  | The q2-diversity-lib plugin
 - [q2-emperor](https://github.com/qiime2/q2-emperor/issues)
   | The q2-emperor plugin
 - [q2-feature-classifier](https://github.com/qiime2/q2-feature-classifier/issues)


=====================================
LICENSE
=====================================
@@ -1,6 +1,6 @@
 BSD 3-Clause License
 
-Copyright (c) 2016-2019, QIIME 2 development team.
+Copyright (c) 2016-2020, QIIME 2 development team.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without


=====================================
ci/recipe/meta.yaml
=====================================
@@ -22,7 +22,7 @@ requirements:
     - scikit-bio
     - qiime2 {{ release }}.*
     - q2-types {{ release }}.*
-    - mafft >=7.310
+    - mafft >=7.394
     - sina >=1.3.5,<=2
 
 test:


=====================================
q2_alignment/__init__.py
=====================================
@@ -1,12 +1,12 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-from ._mafft import mafft
+from ._mafft import mafft, mafft_add
 from ._filter import mask
 from ._version import get_versions
 
@@ -14,4 +14,4 @@ from ._version import get_versions
 __version__ = get_versions()['version']
 del get_versions
 
-__all__ = ['mafft', 'mask']
+__all__ = ['mafft', 'mask', 'mafft_add']


=====================================
q2_alignment/_filter.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
q2_alignment/_mafft.py
=====================================
@@ -1,12 +1,11 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-import collections
 import subprocess
 
 import skbio
@@ -27,32 +26,43 @@ def run_command(cmd, output_fp, verbose=True):
         subprocess.run(cmd, stdout=output_f, check=True)
 
 
-def mafft(sequences: DNAFASTAFormat,
-          n_threads: int = 1,
-          parttree: bool = False) -> AlignedDNAFASTAFormat:
-    unaligned_fp = str(sequences)
-
+def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments):
     # Save original sequence IDs since long ids (~250 chars) can be truncated
     # by mafft. We'll replace the IDs in the aligned sequences file output by
     # mafft with the originals.
     #
     # https://github.com/qiime2/q2-alignment/issues/37
-    #
-    # Note: using OrderedDict to maintain order of IDs and have quick lookup
-    # for duplicates.
-    ids = collections.OrderedDict()
-    for seq in skbio.io.read(unaligned_fp, format='fasta',
+    aligned_seq_ids = {}
+    unaligned_seq_ids = {}
+
+    if alignment_fp is not None:
+        for seq in skbio.io.read(alignment_fp, format='fasta',
+                                 constructor=skbio.DNA):
+            id_ = seq.metadata['id']
+            if id_ in aligned_seq_ids:
+                raise ValueError(
+                    "A sequence ID is duplicated in the aligned sequences: "
+                    "%r" % id_)
+            else:
+                aligned_seq_ids[id_] = True
+
+    for seq in skbio.io.read(sequences_fp, format='fasta',
                              constructor=skbio.DNA):
-        id = seq.metadata['id']
-        if id in ids:
+        id_ = seq.metadata['id']
+        if id_ in unaligned_seq_ids:
+            raise ValueError(
+                "A sequence ID is duplicated in the unaligned sequences: "
+                "%r" % id_)
+        elif id_ in aligned_seq_ids:
             raise ValueError(
-                "Encountered duplicate sequence ID in unaligned sequences: %r"
-                % id)
+                "A sequence ID is present in both the aligned and unaligned "
+                "sequences: %r" % id_)
         else:
-            ids[id] = True
+            unaligned_seq_ids[id_] = True
 
     result = AlignedDNAFASTAFormat()
-    aligned_fp = str(result)
+    result_fp = str(result)
+    ids = {**aligned_seq_ids, **unaligned_seq_ids}
 
     # mafft will fail if the number of sequences is larger than 1 million.
     # mafft requires using parttree which is an algorithm to build an
@@ -67,9 +77,9 @@ def mafft(sequences: DNAFASTAFormat,
             "1 million, please use the parttree parameter")
 
     # mafft's signal for utilizing all cores is -1. We want our users
-    # to enter 0 for using all cores. This is to prevent any confusion and
+    # to enter auto for using all cores. This is to prevent any confusion and
     # to keep the UX consistent.
-    if n_threads == 0:
+    if n_threads == 'auto':
         n_threads = -1
 
     # `--inputorder` must be turned on because we need the input and output in
@@ -81,12 +91,17 @@ def mafft(sequences: DNAFASTAFormat,
     if parttree:
         cmd += ['--parttree']
 
-    cmd += [unaligned_fp]
-    run_command(cmd, aligned_fp)
+    if alignment_fp is not None:
+        add_flag = '--addfragments' if addfragments else '--add'
+        cmd += [add_flag, sequences_fp, alignment_fp]
+    else:
+        cmd += [sequences_fp]
+
+    run_command(cmd, result_fp)
 
     # Read output alignment into memory, reassign original sequence IDs, and
     # write alignment back to disk.
-    msa = skbio.TabularMSA.read(aligned_fp, format='fasta',
+    msa = skbio.TabularMSA.read(result_fp, format='fasta',
                                 constructor=skbio.DNA)
     # Using `assert` because mafft would have had to add or drop sequences
     # while aligning, which would be a bug on mafft's end. This is just a
@@ -101,6 +116,24 @@ def mafft(sequences: DNAFASTAFormat,
     #
     # http://scikit-bio.org/docs/latest/generated/
     #     skbio.io.format.fasta.html#writer-specific-parameters
-    msa.write(aligned_fp, id_whitespace_replacement=None,
+    msa.write(result_fp, id_whitespace_replacement=None,
               description_newline_replacement=None)
     return result
+
+
+def mafft(sequences: DNAFASTAFormat,
+          n_threads: int = 1,
+          parttree: bool = False) -> AlignedDNAFASTAFormat:
+    sequences_fp = str(sequences)
+    return _mafft(sequences_fp, None, n_threads, parttree, False)
+
+
+def mafft_add(alignment: AlignedDNAFASTAFormat,
+              sequences: DNAFASTAFormat,
+              n_threads: int = 1,
+              parttree: bool = False,
+              addfragments: bool = False) -> AlignedDNAFASTAFormat:
+    alignment_fp = str(alignment)
+    sequences_fp = str(sequences)
+    return _mafft(
+        sequences_fp, alignment_fp, n_threads, parttree, addfragments)


=====================================
q2_alignment/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
     # setup.py/versioneer.py will grep for the variable names, so they must
     # each be defined on a line of their own. _version.py will just call
     # get_keywords().
-    git_refnames = " (tag: 2019.1.0)"
-    git_full = "d4f27346e93d364d79d16ff4f7fb6900a57b73ad"
-    git_date = "2019-01-29 14:00:36 +0000"
+    git_refnames = " (tag: 2020.11.1)"
+    git_full = "5993d9e09b96ba63fd9bc07e063398cfd18f94d5"
+    git_date = "2020-12-05 20:44:49 +0000"
     keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
     return keywords
 


=====================================
q2_alignment/plugin_setup.py
=====================================
@@ -1,12 +1,13 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-from qiime2.plugin import Plugin, Float, Int, Bool, Range, Citations
+from qiime2.plugin import (
+    Plugin, Float, Int, Bool, Range, Citations, Str, Choices)
 from q2_types.feature_data import FeatureData, Sequence, AlignedSequence
 
 import q2_alignment
@@ -25,13 +26,13 @@ plugin = Plugin(
 plugin.methods.register_function(
     function=q2_alignment.mafft,
     inputs={'sequences': FeatureData[Sequence]},
-    parameters={'n_threads': Int % Range(0, None),
+    parameters={'n_threads': Int % Range(1, None) | Str % Choices(['auto']),
                 'parttree': Bool},
     outputs=[('alignment', FeatureData[AlignedSequence])],
     input_descriptions={'sequences': 'The sequences to be aligned.'},
     parameter_descriptions={
-        'n_threads': 'The number of threads. (Use 0 to automatically use all '
-                     'available cores)',
+        'n_threads': 'The number of threads. (Use `auto` to automatically use '
+                     'all available cores)',
         'parttree': 'This flag is required if the number of sequences being '
                     'aligned are larger than 1000000. Disabled by default'},
     output_descriptions={'alignment': 'The aligned sequences.'},
@@ -40,6 +41,34 @@ plugin.methods.register_function(
     citations=[citations['katoh2013mafft']]
 )
 
+plugin.methods.register_function(
+    function=q2_alignment.mafft_add,
+    inputs={'alignment': FeatureData[AlignedSequence],
+            'sequences': FeatureData[Sequence]},
+    parameters={'n_threads': Int % Range(1, None) | Str % Choices(['auto']),
+                'parttree': Bool,
+                'addfragments': Bool},
+    outputs=[('expanded_alignment', FeatureData[AlignedSequence])],
+    input_descriptions={'alignment': 'The alignment to which '
+                                     'sequences should be added.',
+                        'sequences': 'The sequences to be added.'},
+    parameter_descriptions={
+        'n_threads': 'The number of threads. (Use `auto` to automatically use '
+                     'all available cores)',
+        'parttree': 'This flag is required if the number of sequences being '
+                    'aligned are larger than 1000000. Disabled by default',
+        'addfragments': 'Optimize for the addition of short sequence '
+                        'fragments (for example, primer or amplicon '
+                        'sequences). If not set, default sequence addition '
+                        'is used.'},
+    output_descriptions={
+        'expanded_alignment': 'Alignment containing the provided aligned and '
+                              'unaligned sequences.'},
+    name='Add sequences to multiple sequence alignment with MAFFT.',
+    description='Add new sequences to an existing alignment with MAFFT.',
+    citations=[citations['katoh2013mafft']]
+)
+
 plugin.methods.register_function(
     function=q2_alignment.mask,
     inputs={'alignment': FeatureData[AlignedSequence]},


=====================================
q2_alignment/tests/__init__.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
q2_alignment/tests/data/aligned-dna-sequences-1.fasta
=====================================
@@ -0,0 +1,4 @@
+>aln-seq-1
+AGGGGG-
+>aln-seq-2
+AGGGGGG


=====================================
q2_alignment/tests/data/aligned-duplicate-ids-1.fasta
=====================================
@@ -0,0 +1,4 @@
+>id1
+AGGGGG-
+>id1
+AGGGGGG


=====================================
q2_alignment/tests/data/aligned-duplicate-ids-2.fasta
=====================================
@@ -0,0 +1,4 @@
+>aln-seq-1
+AGGGGG-
+>seq1
+AGGGGGG


=====================================
q2_alignment/tests/data/aligned-long-ids.fasta
=====================================
@@ -0,0 +1,4 @@
+>aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+AGGGGG-
+>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+AGGGGGG


=====================================
q2_alignment/tests/test_filter.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
q2_alignment/tests/test_mafft.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
@@ -7,6 +7,7 @@
 # ----------------------------------------------------------------------------
 import os
 import unittest
+from unittest.mock import patch, ANY
 import subprocess
 
 import skbio
@@ -14,12 +15,11 @@ from qiime2.plugin.testing import TestPluginBase
 from q2_types.feature_data import DNAFASTAFormat, AlignedDNAFASTAFormat
 from qiime2.util import redirected_stdio
 
-from q2_alignment import mafft
+from q2_alignment import mafft, mafft_add
 from q2_alignment._mafft import run_command
 
 
 class MafftTests(TestPluginBase):
-
     package = 'q2_alignment.tests'
 
     def _prepare_sequence_data(self):
@@ -45,7 +45,7 @@ class MafftTests(TestPluginBase):
         input_sequences, exp = self._prepare_sequence_data()
 
         with redirected_stdio(stderr=os.devnull):
-            result = mafft(input_sequences, n_threads=0)
+            result = mafft(input_sequences, n_threads='auto')
         obs = skbio.io.read(str(result), into=skbio.TabularMSA,
                             constructor=skbio.DNA)
         self.assertEqual(obs, exp)
@@ -68,7 +68,7 @@ class MafftTests(TestPluginBase):
         input_fp = self.get_data_path('unaligned-duplicate-ids.fasta')
         input_sequences = DNAFASTAFormat(input_fp, mode='r')
 
-        with self.assertRaisesRegex(ValueError, 'duplicate.*id1'):
+        with self.assertRaisesRegex(ValueError, 'the unaligned.*id1'):
             with redirected_stdio(stderr=os.devnull):
                 mafft(input_sequences)
 
@@ -83,8 +83,128 @@ class MafftTests(TestPluginBase):
                 mafft(input_sequences)
 
 
-class RunCommandTests(TestPluginBase):
+class MafftAddTests(TestPluginBase):
+    package = 'q2_alignment.tests'
+
+    def _prepare_sequence_data(self):
+        sequences_fp = self.get_data_path('unaligned-dna-sequences-1.fasta')
+        sequences = DNAFASTAFormat(sequences_fp, mode='r')
+        alignment_fp = self.get_data_path('aligned-dna-sequences-1.fasta')
+        alignment = AlignedDNAFASTAFormat(alignment_fp, mode='r')
+        exp = skbio.TabularMSA(
+            [skbio.DNA('AGGGGG-',
+                       metadata={'id': 'aln-seq-1', 'description': ''}),
+             skbio.DNA('AGGGGGG',
+                       metadata={'id': 'aln-seq-2', 'description': ''}),
+             skbio.DNA('AGGGGGG',
+                       metadata={'id': 'seq1', 'description': ''}),
+             skbio.DNA('-GGGGGG',
+                       metadata={'id': 'seq2', 'description': ''})]
+        )
+
+        return alignment, sequences, exp
+
+    def test_mafft_add(self):
+        alignment, sequences, exp = self._prepare_sequence_data()
+
+        with redirected_stdio(stderr=os.devnull):
+            result = mafft_add(alignment, sequences)
+        obs = skbio.io.read(str(result), into=skbio.TabularMSA,
+                            constructor=skbio.DNA)
+        self.assertEqual(obs, exp)
+
+    def test_mafft_add_fragments(self):
+        alignment, sequences, exp = self._prepare_sequence_data()
+
+        with redirected_stdio(stderr=os.devnull):
+            result = mafft_add(alignment, sequences, addfragments=True)
+        obs = skbio.io.read(str(result), into=skbio.TabularMSA,
+                            constructor=skbio.DNA)
+        self.assertEqual(obs, exp)
+
+    def test_mafft_add_flags(self):
+        alignment, sequences, exp = self._prepare_sequence_data()
+
+        with patch('q2_alignment._mafft.run_command') as patched_run_cmd:
+            with patch('q2_alignment._mafft.skbio.TabularMSA.read',
+                       return_value=exp):
+                _ = mafft_add(alignment, sequences)
+                patched_run_cmd.assert_called_with(
+                    ["mafft", "--preservecase", "--inputorder", "--thread",
+                     "1", "--add", ANY, ANY], ANY)
+
+                _ = mafft_add(alignment, sequences, addfragments=True)
+                patched_run_cmd.assert_called_with(
+                    ["mafft", "--preservecase", "--inputorder", "--thread",
+                     "1", "--addfragments", ANY, ANY], ANY)
+
+    def test_duplicate_input_ids_in_unaligned(self):
+        input_fp = self.get_data_path('unaligned-duplicate-ids.fasta')
+        sequences = DNAFASTAFormat(input_fp, mode='r')
+
+        alignment, _, _ = self._prepare_sequence_data()
+
+        with self.assertRaisesRegex(ValueError, 'the unaligned.*id1'):
+            with redirected_stdio(stderr=os.devnull):
+                mafft_add(alignment, sequences)
+
+    def test_duplicate_input_ids_in_aligned(self):
+        input_fp = self.get_data_path('aligned-duplicate-ids-1.fasta')
+        alignment = DNAFASTAFormat(input_fp, mode='r')
+
+        _, sequences, _ = self._prepare_sequence_data()
 
+        with self.assertRaisesRegex(ValueError, 'the aligned.*id1'):
+            with redirected_stdio(stderr=os.devnull):
+                mafft_add(alignment, sequences)
+
+    def test_duplicate_input_ids_across_aligned_and_unaligned(self):
+        input_fp = self.get_data_path('aligned-duplicate-ids-2.fasta')
+        alignment = DNAFASTAFormat(input_fp, mode='r')
+
+        _, sequences, _ = self._prepare_sequence_data()
+
+        with self.assertRaisesRegex(ValueError, 'aligned and unaligned.*seq1'):
+            with redirected_stdio(stderr=os.devnull):
+                mafft_add(alignment, sequences)
+
+    def test_long_ids_are_not_truncated_unaligned(self):
+        input_fp = self.get_data_path('unaligned-long-ids.fasta')
+        sequences = DNAFASTAFormat(input_fp, mode='r')
+
+        alignment, _, _ = self._prepare_sequence_data()
+
+        with redirected_stdio(stderr=os.devnull):
+            result = mafft_add(alignment, sequences)
+
+        with open(str(result), 'r') as fh:
+            obs = fh.read()
+
+        self.assertIn('a'*250, obs)
+        self.assertIn('b'*250, obs)
+        self.assertIn('c'*250, obs)
+        self.assertIn('aln-seq-1', obs)
+        self.assertIn('aln-seq-2', obs)
+
+    def test_long_ids_are_not_truncated_aligned(self):
+        input_fp = self.get_data_path('aligned-long-ids.fasta')
+        alignment = DNAFASTAFormat(input_fp, mode='r')
+
+        _, sequences, _ = self._prepare_sequence_data()
+
+        with redirected_stdio(stderr=os.devnull):
+            result = mafft_add(alignment, sequences)
+
+        with open(str(result), 'r') as fh:
+            obs = fh.read()
+
+        self.assertIn('a'*250, obs)
+        self.assertIn('b'*250, obs)
+        self.assertIn('seq1', obs)
+        self.assertIn('seq2', obs)
+
+
+class RunCommandTests(TestPluginBase):
     package = 'q2_alignment.tests'
 
     def test_failed_run(self):


=====================================
setup.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #



View it on GitLab: https://salsa.debian.org/med-team/q2-alignment/-/commit/c8e3e2fcf90e3d470d2c9c8242d86038c883cd24

-- 
View it on GitLab: https://salsa.debian.org/med-team/q2-alignment/-/commit/c8e3e2fcf90e3d470d2c9c8242d86038c883cd24
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210123/0f5c417f/attachment-0001.html>