[med-svn] [Git][med-team/q2-alignment][upstream] New upstream version 2020.11.1
Andreas Tille
gitlab at salsa.debian.org
Sat Jan 23 08:14:44 GMT 2021
Andreas Tille pushed to branch upstream at Debian Med / q2-alignment
Commits:
c8e3e2fc by Andreas Tille at 2021-01-23T08:11:46+01:00
New upstream version 2020.11.1
- - - - -
17 changed files:
- .github/ISSUE_TEMPLATE/6-where-to-go.md
- .github/SUPPORT.md
- LICENSE
- ci/recipe/meta.yaml
- q2_alignment/__init__.py
- q2_alignment/_filter.py
- q2_alignment/_mafft.py
- q2_alignment/_version.py
- q2_alignment/plugin_setup.py
- q2_alignment/tests/__init__.py
- + q2_alignment/tests/data/aligned-dna-sequences-1.fasta
- + q2_alignment/tests/data/aligned-duplicate-ids-1.fasta
- + q2_alignment/tests/data/aligned-duplicate-ids-2.fasta
- + q2_alignment/tests/data/aligned-long-ids.fasta
- q2_alignment/tests/test_filter.py
- q2_alignment/tests/test_mafft.py
- setup.py
Changes:
=====================================
.github/ISSUE_TEMPLATE/6-where-to-go.md
=====================================
@@ -59,6 +59,9 @@ Sorted alphabetically by repo name.
- The q2-diversity plugin
https://github.com/qiime2/q2-diversity/issues
+- The q2-diversity-lib plugin
+ https://github.com/qiime2/q2-diversity-lib/issues
+
- The q2-emperor plugin
https://github.com/qiime2/q2-emperor/issues
=====================================
.github/SUPPORT.md
=====================================
@@ -52,6 +52,8 @@ Sorted alphabetically by repo name.
| The q2-demux plugin
- [q2-diversity](https://github.com/qiime2/q2-diversity/issues)
| The q2-diversity plugin
+- [q2-diversity-lib](https://github.com/qiime2/q2-diversity-lib/issues)
+ | The q2-diversity-lib plugin
- [q2-emperor](https://github.com/qiime2/q2-emperor/issues)
| The q2-emperor plugin
- [q2-feature-classifier](https://github.com/qiime2/q2-feature-classifier/issues)
=====================================
LICENSE
=====================================
@@ -1,6 +1,6 @@
BSD 3-Clause License
-Copyright (c) 2016-2019, QIIME 2 development team.
+Copyright (c) 2016-2020, QIIME 2 development team.
All rights reserved.
Redistribution and use in source and binary forms, with or without
=====================================
ci/recipe/meta.yaml
=====================================
@@ -22,7 +22,7 @@ requirements:
- scikit-bio
- qiime2 {{ release }}.*
- q2-types {{ release }}.*
- - mafft >=7.310
+ - mafft >=7.394
- sina >=1.3.5,<=2
test:
=====================================
q2_alignment/__init__.py
=====================================
@@ -1,12 +1,12 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
-from ._mafft import mafft
+from ._mafft import mafft, mafft_add
from ._filter import mask
from ._version import get_versions
@@ -14,4 +14,4 @@ from ._version import get_versions
__version__ = get_versions()['version']
del get_versions
-__all__ = ['mafft', 'mask']
+__all__ = ['mafft', 'mask', 'mafft_add']
=====================================
q2_alignment/_filter.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_alignment/_mafft.py
=====================================
@@ -1,12 +1,11 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
-import collections
import subprocess
import skbio
@@ -27,32 +26,43 @@ def run_command(cmd, output_fp, verbose=True):
subprocess.run(cmd, stdout=output_f, check=True)
-def mafft(sequences: DNAFASTAFormat,
- n_threads: int = 1,
- parttree: bool = False) -> AlignedDNAFASTAFormat:
- unaligned_fp = str(sequences)
-
+def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments):
# Save original sequence IDs since long ids (~250 chars) can be truncated
# by mafft. We'll replace the IDs in the aligned sequences file output by
# mafft with the originals.
#
# https://github.com/qiime2/q2-alignment/issues/37
- #
- # Note: using OrderedDict to maintain order of IDs and have quick lookup
- # for duplicates.
- ids = collections.OrderedDict()
- for seq in skbio.io.read(unaligned_fp, format='fasta',
+ aligned_seq_ids = {}
+ unaligned_seq_ids = {}
+
+ if alignment_fp is not None:
+ for seq in skbio.io.read(alignment_fp, format='fasta',
+ constructor=skbio.DNA):
+ id_ = seq.metadata['id']
+ if id_ in aligned_seq_ids:
+ raise ValueError(
+ "A sequence ID is duplicated in the aligned sequences: "
+ "%r" % id_)
+ else:
+ aligned_seq_ids[id_] = True
+
+ for seq in skbio.io.read(sequences_fp, format='fasta',
constructor=skbio.DNA):
- id = seq.metadata['id']
- if id in ids:
+ id_ = seq.metadata['id']
+ if id_ in unaligned_seq_ids:
+ raise ValueError(
+ "A sequence ID is duplicated in the unaligned sequences: "
+ "%r" % id_)
+ elif id_ in aligned_seq_ids:
raise ValueError(
- "Encountered duplicate sequence ID in unaligned sequences: %r"
- % id)
+ "A sequence ID is present in both the aligned and unaligned "
+ "sequences: %r" % id_)
else:
- ids[id] = True
+ unaligned_seq_ids[id_] = True
result = AlignedDNAFASTAFormat()
- aligned_fp = str(result)
+ result_fp = str(result)
+ ids = {**aligned_seq_ids, **unaligned_seq_ids}
# mafft will fail if the number of sequences is larger than 1 million.
# mafft requires using parttree which is an algorithm to build an
@@ -67,9 +77,9 @@ def mafft(sequences: DNAFASTAFormat,
"1 million, please use the parttree parameter")
# mafft's signal for utilizing all cores is -1. We want our users
- # to enter 0 for using all cores. This is to prevent any confusion and
+ # to enter auto for using all cores. This is to prevent any confusion and
# to keep the UX consistent.
- if n_threads == 0:
+ if n_threads == 'auto':
n_threads = -1
# `--inputorder` must be turned on because we need the input and output in
@@ -81,12 +91,17 @@ def mafft(sequences: DNAFASTAFormat,
if parttree:
cmd += ['--parttree']
- cmd += [unaligned_fp]
- run_command(cmd, aligned_fp)
+ if alignment_fp is not None:
+ add_flag = '--addfragments' if addfragments else '--add'
+ cmd += [add_flag, sequences_fp, alignment_fp]
+ else:
+ cmd += [sequences_fp]
+
+ run_command(cmd, result_fp)
# Read output alignment into memory, reassign original sequence IDs, and
# write alignment back to disk.
- msa = skbio.TabularMSA.read(aligned_fp, format='fasta',
+ msa = skbio.TabularMSA.read(result_fp, format='fasta',
constructor=skbio.DNA)
# Using `assert` because mafft would have had to add or drop sequences
# while aligning, which would be a bug on mafft's end. This is just a
@@ -101,6 +116,24 @@ def mafft(sequences: DNAFASTAFormat,
#
# http://scikit-bio.org/docs/latest/generated/
# skbio.io.format.fasta.html#writer-specific-parameters
- msa.write(aligned_fp, id_whitespace_replacement=None,
+ msa.write(result_fp, id_whitespace_replacement=None,
description_newline_replacement=None)
return result
+
+
+def mafft(sequences: DNAFASTAFormat,
+ n_threads: int = 1,
+ parttree: bool = False) -> AlignedDNAFASTAFormat:
+ sequences_fp = str(sequences)
+ return _mafft(sequences_fp, None, n_threads, parttree, False)
+
+
+def mafft_add(alignment: AlignedDNAFASTAFormat,
+ sequences: DNAFASTAFormat,
+ n_threads: int = 1,
+ parttree: bool = False,
+ addfragments: bool = False) -> AlignedDNAFASTAFormat:
+ alignment_fp = str(alignment)
+ sequences_fp = str(sequences)
+ return _mafft(
+ sequences_fp, alignment_fp, n_threads, parttree, addfragments)
=====================================
q2_alignment/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2019.1.0)"
- git_full = "d4f27346e93d364d79d16ff4f7fb6900a57b73ad"
- git_date = "2019-01-29 14:00:36 +0000"
+ git_refnames = " (tag: 2020.11.1)"
+ git_full = "5993d9e09b96ba63fd9bc07e063398cfd18f94d5"
+ git_date = "2020-12-05 20:44:49 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
q2_alignment/plugin_setup.py
=====================================
@@ -1,12 +1,13 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
-from qiime2.plugin import Plugin, Float, Int, Bool, Range, Citations
+from qiime2.plugin import (
+ Plugin, Float, Int, Bool, Range, Citations, Str, Choices)
from q2_types.feature_data import FeatureData, Sequence, AlignedSequence
import q2_alignment
@@ -25,13 +26,13 @@ plugin = Plugin(
plugin.methods.register_function(
function=q2_alignment.mafft,
inputs={'sequences': FeatureData[Sequence]},
- parameters={'n_threads': Int % Range(0, None),
+ parameters={'n_threads': Int % Range(1, None) | Str % Choices(['auto']),
'parttree': Bool},
outputs=[('alignment', FeatureData[AlignedSequence])],
input_descriptions={'sequences': 'The sequences to be aligned.'},
parameter_descriptions={
- 'n_threads': 'The number of threads. (Use 0 to automatically use all '
- 'available cores)',
+ 'n_threads': 'The number of threads. (Use `auto` to automatically use '
+ 'all available cores)',
'parttree': 'This flag is required if the number of sequences being '
'aligned are larger than 1000000. Disabled by default'},
output_descriptions={'alignment': 'The aligned sequences.'},
@@ -40,6 +41,34 @@ plugin.methods.register_function(
citations=[citations['katoh2013mafft']]
)
+plugin.methods.register_function(
+ function=q2_alignment.mafft_add,
+ inputs={'alignment': FeatureData[AlignedSequence],
+ 'sequences': FeatureData[Sequence]},
+ parameters={'n_threads': Int % Range(1, None) | Str % Choices(['auto']),
+ 'parttree': Bool,
+ 'addfragments': Bool},
+ outputs=[('expanded_alignment', FeatureData[AlignedSequence])],
+ input_descriptions={'alignment': 'The alignment to which '
+ 'sequences should be added.',
+ 'sequences': 'The sequences to be added.'},
+ parameter_descriptions={
+ 'n_threads': 'The number of threads. (Use `auto` to automatically use '
+ 'all available cores)',
+ 'parttree': 'This flag is required if the number of sequences being '
+ 'aligned are larger than 1000000. Disabled by default',
+ 'addfragments': 'Optimize for the addition of short sequence '
+ 'fragments (for example, primer or amplicon '
+ 'sequences). If not set, default sequence addition '
+ 'is used.'},
+ output_descriptions={
+ 'expanded_alignment': 'Alignment containing the provided aligned and '
+ 'unaligned sequences.'},
+ name='Add sequences to multiple sequence alignment with MAFFT.',
+ description='Add new sequences to an existing alignment with MAFFT.',
+ citations=[citations['katoh2013mafft']]
+)
+
plugin.methods.register_function(
function=q2_alignment.mask,
inputs={'alignment': FeatureData[AlignedSequence]},
=====================================
q2_alignment/tests/__init__.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_alignment/tests/data/aligned-dna-sequences-1.fasta
=====================================
@@ -0,0 +1,4 @@
+>aln-seq-1
+AGGGGG-
+>aln-seq-2
+AGGGGGG
=====================================
q2_alignment/tests/data/aligned-duplicate-ids-1.fasta
=====================================
@@ -0,0 +1,4 @@
+>id1
+AGGGGG-
+>id1
+AGGGGGG
=====================================
q2_alignment/tests/data/aligned-duplicate-ids-2.fasta
=====================================
@@ -0,0 +1,4 @@
+>aln-seq-1
+AGGGGG-
+>seq1
+AGGGGGG
=====================================
q2_alignment/tests/data/aligned-long-ids.fasta
=====================================
@@ -0,0 +1,4 @@
+>aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+AGGGGG-
+>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+AGGGGGG
=====================================
q2_alignment/tests/test_filter.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_alignment/tests/test_mafft.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -7,6 +7,7 @@
# ----------------------------------------------------------------------------
import os
import unittest
+from unittest.mock import patch, ANY
import subprocess
import skbio
@@ -14,12 +15,11 @@ from qiime2.plugin.testing import TestPluginBase
from q2_types.feature_data import DNAFASTAFormat, AlignedDNAFASTAFormat
from qiime2.util import redirected_stdio
-from q2_alignment import mafft
+from q2_alignment import mafft, mafft_add
from q2_alignment._mafft import run_command
class MafftTests(TestPluginBase):
-
package = 'q2_alignment.tests'
def _prepare_sequence_data(self):
@@ -45,7 +45,7 @@ class MafftTests(TestPluginBase):
input_sequences, exp = self._prepare_sequence_data()
with redirected_stdio(stderr=os.devnull):
- result = mafft(input_sequences, n_threads=0)
+ result = mafft(input_sequences, n_threads='auto')
obs = skbio.io.read(str(result), into=skbio.TabularMSA,
constructor=skbio.DNA)
self.assertEqual(obs, exp)
@@ -68,7 +68,7 @@ class MafftTests(TestPluginBase):
input_fp = self.get_data_path('unaligned-duplicate-ids.fasta')
input_sequences = DNAFASTAFormat(input_fp, mode='r')
- with self.assertRaisesRegex(ValueError, 'duplicate.*id1'):
+ with self.assertRaisesRegex(ValueError, 'the unaligned.*id1'):
with redirected_stdio(stderr=os.devnull):
mafft(input_sequences)
@@ -83,8 +83,128 @@ class MafftTests(TestPluginBase):
mafft(input_sequences)
-class RunCommandTests(TestPluginBase):
+class MafftAddTests(TestPluginBase):
+ package = 'q2_alignment.tests'
+
+ def _prepare_sequence_data(self):
+ sequences_fp = self.get_data_path('unaligned-dna-sequences-1.fasta')
+ sequences = DNAFASTAFormat(sequences_fp, mode='r')
+ alignment_fp = self.get_data_path('aligned-dna-sequences-1.fasta')
+ alignment = AlignedDNAFASTAFormat(alignment_fp, mode='r')
+ exp = skbio.TabularMSA(
+ [skbio.DNA('AGGGGG-',
+ metadata={'id': 'aln-seq-1', 'description': ''}),
+ skbio.DNA('AGGGGGG',
+ metadata={'id': 'aln-seq-2', 'description': ''}),
+ skbio.DNA('AGGGGGG',
+ metadata={'id': 'seq1', 'description': ''}),
+ skbio.DNA('-GGGGGG',
+ metadata={'id': 'seq2', 'description': ''})]
+ )
+
+ return alignment, sequences, exp
+
+ def test_mafft_add(self):
+ alignment, sequences, exp = self._prepare_sequence_data()
+
+ with redirected_stdio(stderr=os.devnull):
+ result = mafft_add(alignment, sequences)
+ obs = skbio.io.read(str(result), into=skbio.TabularMSA,
+ constructor=skbio.DNA)
+ self.assertEqual(obs, exp)
+
+ def test_mafft_add_fragments(self):
+ alignment, sequences, exp = self._prepare_sequence_data()
+
+ with redirected_stdio(stderr=os.devnull):
+ result = mafft_add(alignment, sequences, addfragments=True)
+ obs = skbio.io.read(str(result), into=skbio.TabularMSA,
+ constructor=skbio.DNA)
+ self.assertEqual(obs, exp)
+
+ def test_mafft_add_flags(self):
+ alignment, sequences, exp = self._prepare_sequence_data()
+
+ with patch('q2_alignment._mafft.run_command') as patched_run_cmd:
+ with patch('q2_alignment._mafft.skbio.TabularMSA.read',
+ return_value=exp):
+ _ = mafft_add(alignment, sequences)
+ patched_run_cmd.assert_called_with(
+ ["mafft", "--preservecase", "--inputorder", "--thread",
+ "1", "--add", ANY, ANY], ANY)
+
+ _ = mafft_add(alignment, sequences, addfragments=True)
+ patched_run_cmd.assert_called_with(
+ ["mafft", "--preservecase", "--inputorder", "--thread",
+ "1", "--addfragments", ANY, ANY], ANY)
+
+ def test_duplicate_input_ids_in_unaligned(self):
+ input_fp = self.get_data_path('unaligned-duplicate-ids.fasta')
+ sequences = DNAFASTAFormat(input_fp, mode='r')
+
+ alignment, _, _ = self._prepare_sequence_data()
+
+ with self.assertRaisesRegex(ValueError, 'the unaligned.*id1'):
+ with redirected_stdio(stderr=os.devnull):
+ mafft_add(alignment, sequences)
+
+ def test_duplicate_input_ids_in_aligned(self):
+ input_fp = self.get_data_path('aligned-duplicate-ids-1.fasta')
+ alignment = DNAFASTAFormat(input_fp, mode='r')
+
+ _, sequences, _ = self._prepare_sequence_data()
+ with self.assertRaisesRegex(ValueError, 'the aligned.*id1'):
+ with redirected_stdio(stderr=os.devnull):
+ mafft_add(alignment, sequences)
+
+ def test_duplicate_input_ids_across_aligned_and_unaligned(self):
+ input_fp = self.get_data_path('aligned-duplicate-ids-2.fasta')
+ alignment = DNAFASTAFormat(input_fp, mode='r')
+
+ _, sequences, _ = self._prepare_sequence_data()
+
+ with self.assertRaisesRegex(ValueError, 'aligned and unaligned.*seq1'):
+ with redirected_stdio(stderr=os.devnull):
+ mafft_add(alignment, sequences)
+
+ def test_long_ids_are_not_truncated_unaligned(self):
+ input_fp = self.get_data_path('unaligned-long-ids.fasta')
+ sequences = DNAFASTAFormat(input_fp, mode='r')
+
+ alignment, _, _ = self._prepare_sequence_data()
+
+ with redirected_stdio(stderr=os.devnull):
+ result = mafft_add(alignment, sequences)
+
+ with open(str(result), 'r') as fh:
+ obs = fh.read()
+
+ self.assertIn('a'*250, obs)
+ self.assertIn('b'*250, obs)
+ self.assertIn('c'*250, obs)
+ self.assertIn('aln-seq-1', obs)
+ self.assertIn('aln-seq-2', obs)
+
+ def test_long_ids_are_not_truncated_aligned(self):
+ input_fp = self.get_data_path('aligned-long-ids.fasta')
+ alignment = DNAFASTAFormat(input_fp, mode='r')
+
+ _, sequences, _ = self._prepare_sequence_data()
+
+ with redirected_stdio(stderr=os.devnull):
+ result = mafft_add(alignment, sequences)
+
+ with open(str(result), 'r') as fh:
+ obs = fh.read()
+
+ self.assertIn('a'*250, obs)
+ self.assertIn('b'*250, obs)
+ self.assertIn('seq1', obs)
+ self.assertIn('seq2', obs)
+
+
+class RunCommandTests(TestPluginBase):
package = 'q2_alignment.tests'
def test_failed_run(self):
=====================================
setup.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2016-2019, QIIME 2 development team.
+# Copyright (c) 2016-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
View it on GitLab: https://salsa.debian.org/med-team/q2-alignment/-/commit/c8e3e2fcf90e3d470d2c9c8242d86038c883cd24
--
View it on GitLab: https://salsa.debian.org/med-team/q2-alignment/-/commit/c8e3e2fcf90e3d470d2c9c8242d86038c883cd24
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210123/0f5c417f/attachment-0001.html>
More information about the debian-med-commit
mailing list