[med-svn] [python-skbio] 02/06: Imported Upstream version 0.4.2
Kevin Murray
daube-guest at moszumanska.debian.org
Thu Feb 18 00:53:55 UTC 2016
This is an automated email from the git hooks/post-receive script.
daube-guest pushed a commit to branch master
in repository python-skbio.
commit 76aae4fc461b07b59fe673eb85501cfe25a7aac3
Author: Kevin Murray <spam at kdmurray.id.au>
Date: Wed Feb 17 16:23:35 2016 -0800
Imported Upstream version 0.4.2
---
CHANGELOG.md | 33 +-
RELEASE.md | 4 +-
ci/pip_requirements.txt | 2 +-
doc/source/_static/style.css | 4 +
doc/source/conf.py | 7 +-
skbio/__init__.py | 2 +-
skbio/alignment/_pairwise.py | 26 +-
skbio/alignment/_tabular_msa.py | 45 +-
skbio/alignment/tests/test_pairwise.py | 18 +-
skbio/alignment/tests/test_tabular_msa.py | 36 +-
skbio/diversity/alpha/_lladser.py | 2 +-
skbio/io/__init__.py | 9 +-
skbio/io/_exception.py | 5 +
skbio/io/format/_base.py | 2 +-
skbio/io/format/clustal.py | 12 +-
skbio/io/format/fasta.py | 9 +-
skbio/io/format/fastq.py | 9 +-
skbio/io/format/genbank.py | 8 +-
skbio/io/format/phylip.py | 8 +-
skbio/io/format/stockholm.py | 710 +++++++++++++++++++++
.../io/format/tests/data/stockholm_all_data_types | 15 +
skbio/io/format/tests/data/stockholm_blank_lines | 6 +
skbio/io/format/tests/data/stockholm_data_only | 5 +
.../tests/data/stockholm_differing_gc_data_length | 4 +
.../tests/data/stockholm_differing_gr_data_length | 4 +
.../tests/data/stockholm_differing_seq_lengths | 4 +
skbio/io/format/tests/data/stockholm_duplicate_gc | 7 +
skbio/io/format/tests/data/stockholm_duplicate_gr | 8 +
.../tests/data/stockholm_duplicate_sequence_names | 6 +
.../format/tests/data/stockholm_duplicate_tree_ids | 6 +
skbio/io/format/tests/data/stockholm_extensive | 15 +
.../io/format/tests/data/stockholm_extensive_mixed | 15 +
.../format/tests/data/stockholm_invalid_data_type | 4 +
.../tests/data/stockholm_invalid_nonexistent_gr | 7 +
.../tests/data/stockholm_invalid_nonexistent_gs | 5 +
.../tests/data/stockholm_malformed_data_line | 3 +
.../format/tests/data/stockholm_malformed_gc_line | 4 +
.../format/tests/data/stockholm_malformed_gf_line | 3 +
.../format/tests/data/stockholm_malformed_gr_line | 4 +
.../format/tests/data/stockholm_malformed_gs_line | 4 +
skbio/io/format/tests/data/stockholm_metadata_only | 4 +
skbio/io/format/tests/data/stockholm_minimal | 3 +
.../io/format/tests/data/stockholm_missing_footer | 4 +
.../io/format/tests/data/stockholm_missing_header | 4 +
skbio/io/format/tests/data/stockholm_multiple_msa | 18 +
.../io/format/tests/data/stockholm_multiple_trees | 8 +
skbio/io/format/tests/data/stockholm_no_data | 2 +
.../format/tests/data/stockholm_nonstring_labels | 7 +
skbio/io/format/tests/data/stockholm_rna | 9 +
skbio/io/format/tests/data/stockholm_runon_gf | 5 +
skbio/io/format/tests/data/stockholm_runon_gs | 5 +
.../tests/data/stockholm_single_tree_with_id | 4 +
.../tests/data/stockholm_single_tree_without_id | 3 +
.../tests/data/stockholm_two_of_each_metadata | 11 +
.../tests/data/stockholm_whitespace_only_lines | 5 +
skbio/io/format/tests/test_base.py | 4 +-
skbio/io/format/tests/test_clustal.py | 15 +-
skbio/io/format/tests/test_fasta.py | 17 +-
skbio/io/format/tests/test_fastq.py | 17 +-
skbio/io/format/tests/test_genbank.py | 15 +-
skbio/io/format/tests/test_stockholm.py | 701 ++++++++++++++++++++
skbio/io/registry.py | 28 +-
skbio/io/tests/test_registry.py | 36 +-
skbio/io/tests/test_util.py | 36 +-
skbio/sequence/__init__.py | 13 +-
skbio/sequence/_dna.py | 33 +-
.../{_iupac_sequence.py => _grammared_sequence.py} | 189 +++++-
skbio/sequence/_nucleotide_mixin.py | 2 +-
skbio/sequence/_protein.py | 39 +-
skbio/sequence/_rna.py | 33 +-
skbio/sequence/_sequence.py | 90 ++-
skbio/sequence/distance.py | 115 ++++
skbio/sequence/tests/test_distance.py | 130 ++++
skbio/sequence/tests/test_dna.py | 6 +
skbio/sequence/tests/test_grammared_sequence.py | 613 ++++++++++++++++++
skbio/sequence/tests/test_iupac_sequence.py | 506 ---------------
skbio/sequence/tests/test_nucleotide_sequences.py | 27 +-
skbio/sequence/tests/test_protein.py | 6 +
skbio/sequence/tests/test_rna.py | 5 +
skbio/sequence/tests/test_sequence.py | 216 +++++--
skbio/stats/composition.py | 28 +-
skbio/stats/distance/_base.py | 4 +-
skbio/stats/distance/tests/test_base.py | 53 +-
.../ordination/tests/test_redundancy_analysis.py | 2 +
skbio/stats/power.py | 14 +-
skbio/stats/tests/test_composition.py | 20 +-
skbio/stats/tests/test_gradient.py | 42 +-
skbio/tree/_tree.py | 67 +-
skbio/tree/tests/test_tree.py | 13 +
skbio/util/__init__.py | 15 +-
skbio/util/_decorator.py | 3 +-
skbio/util/_misc.py | 22 +-
skbio/util/_testing.py | 27 +-
skbio/util/_warning.py | 14 +-
skbio/util/tests/test_decorator.py | 9 +-
skbio/util/tests/test_misc.py | 2 +-
96 files changed, 3529 insertions(+), 870 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7c5a901..a9c6317 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,40 @@
# scikit-bio changelog
+## Version 0.4.2 (2016-02-17)
+
+Minor maintenance release. **This is the last Python 2.7 compatible release. Future scikit-bio releases will only support Python 3.**
+
+### Features
+* Added `skbio.tree.TreeNode.bifurcate` for converting multifurcating trees into bifurcating trees. ([#896](https://github.com/biocore/scikit-bio/issues/896))
+* Added `skbio.io.format.stockholm` for reading Stockholm files into a `TabularMSA` and writing from a `TabularMSA`. ([#967](https://github.com/biocore/scikit-bio/issues/967))
+* scikit-bio `Sequence` objects have better compatibility with numpy. For example, calling `np.asarray(sequence)` now converts the sequence to a numpy array of characters (the same as calling `sequence.values`).
+* Added `skbio.sequence.distance` subpackage for computing distances between scikit-bio `Sequence` objects ([#913](https://github.com/biocore/scikit-bio/issues/913))
+* Added ``skbio.sequence.GrammaredSequence``, which can be inherited from to create grammared sequences with custom alphabets (e.g., for use with TabularMSA) ([#1175](https://github.com/biocore/scikit-bio/issues/1175))
+* Added ``skbio.util.classproperty`` decorator
+
+### Backward-incompatible changes [stable]
+* When sniffing or reading a file (`skbio.io.sniff`, `skbio.io.read`, or the object-oriented `.read()` interface), passing `newline` as a keyword argument to `skbio.io.open` now raises a `TypeError`. This backward-incompatible change to a stable API is necessary because it fixes a bug (more details in bug fix section below).
+* When reading a FASTQ or QSEQ file and passing `variant='solexa'`, `ValueError` is now raised instead of `NotImplementedError`. This backward-incompatible change to a stable API is necessary to avoid creating a spin-locked process due to [a bug in Python](https://bugs.python.org/issue25786). See [#1256](https://github.com/biocore/scikit-bio/issues/1256) for details. This change is temporary and will be reverted to `NotImplementedError` when the bug is fixed in Python.
+
+### Backward-incompatible changes [experimental]
+* `skbio.io.format.genbank`: When reading GenBank files, the date field of the LOCUS line is no longer parsed into a `datetime.datetime` object and is left as a string. When writing GenBank files, the locus date metadata is expected to be a string instead of a `datetime.datetime` object ([#1153](https://github.com/biocore/scikit-bio/issues/1153))
+* `Sequence.distance` now converts the input sequence (`other`) to its type before passing both sequences to `metric`. Previous behavior was to always convert to `Sequence`.
+
+### Bug fixes
+* Fixed bug when using `Sequence.distance` or `DistanceMatrix.from_iterable` to compute distances between `Sequence` objects with differing `metadata`/`positional_metadata` and passing `metric=scipy.spatial.distance.hamming` ([#1254](https://github.com/biocore/scikit-bio/issues/1254))
+* Fixed performance bug when computing Hamming distances between `Sequence` objects in `DistanceMatrix.from_iterable` ([#1250](https://github.com/biocore/scikit-bio/issues/1250))
+* Changed `skbio.stats.composition.multiplicative_replacement` to raise an error whenever a large value of `delta` is chosen ([#1241](https://github.com/biocore/scikit-bio/issues/1241))
+* When sniffing or reading a file (`skbio.io.sniff`, `skbio.io.read`, or the object-oriented `.read()` interface), passing `newline` as a keyword argument to `skbio.io.open` now raises a `TypeError`. The file format's `newline` character will be used when opening the file. Previous behavior allowed overriding the format's `newline` character but this could cause issues with readers that assume newline characters are those defined by the file format (which is an entirely reasonable assump [...]
+* DNA, RNA, and Protein are no longer inheritable because they assume an IUPAC alphabet.
+* `DistanceMatrix` constructor provides more informative error message when data contains NaNs ([#1276](https://github.com/biocore/scikit-bio/issues/1276))
+
+### Miscellaneous
+* Warnings raised by scikit-bio now share a common subclass ``skbio.util.SkbioWarning``.
+
## Version 0.4.1 (2015-12-09)
### Features
-* The ``TabularMSA`` object was added to represent and operate on tabular multiple sequence alignments. This statisfies [RFC 1](https://github.com/biocore/scikit-bio-rfcs/blob/master/active/001-tabular-msa.md). See the ``TabularMSA`` docs for full details.
+* The ``TabularMSA`` object was added to represent and operate on tabular multiple sequence alignments. This satisfies [RFC 1](https://github.com/biocore/scikit-bio-rfcs/blob/master/active/001-tabular-msa.md). See the ``TabularMSA`` docs for full details.
* Added phylogenetic diversity metrics, including weighted UniFrac, unweighted UniFrac, and Faith's Phylogenetic Diversity. These are accessible as ``skbio.diversity.beta.unweighted_unifrac``, ``skbio.diversity.beta.weighted_unifrac``, and ``skbio.diversity.alpha.faith_pd``, respectively.
* Addition of the function ``skbio.diversity.alpha_diversity`` to support applying an alpha diversity metric to multiple samples in one call.
* Addition of the functions ``skbio.diversity.get_alpha_diversity_metrics`` and ``skbio.diversity.get_beta_diversity_metrics`` to support discovery of the alpha and beta diversity metrics implemented in scikit-bio.
diff --git a/RELEASE.md b/RELEASE.md
index e436e02..dc9a9f5 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -110,7 +110,7 @@ Assuming the GitHub release tarball correctly installs and passes its tests, you
6. Next, we'll prepare and post the release to [anaconda.org](http://www.anaconda.org).
- You'll need to have ``conda-build`` and ``anaconda-client`` installed to perform these steps. Both can be conda-installed. First, log into anaconda using the following command. You should log into the ``biocore`` anaconda account (if you don't have login info, get in touch with [@gregcaporaso](https://github.com/gregcaporaso) who is the owner of that account).
+ You'll need to have ``conda-build`` and ``anaconda-client`` installed to perform these steps. Both can be conda-installed. First, log into anaconda with your anaconda username using the following command. You should have access to push to the ``biocore`` anaconda account through your account (if you don't, get in touch with [@gregcaporaso](https://github.com/gregcaporaso) who is the owner of that account).
anaconda login
@@ -126,7 +126,7 @@ Assuming the GitHub release tarball correctly installs and passes its tests, you
If the tests pass, you're ready to upload.
- anaconda upload <package-filepath>
+ anaconda upload -u biocore <package-filepath>
``<package-filepath>`` should be replaced with the path to the package that was was created above. Repeat this for each package you created (here, the Python 2.7 and 3.5 packages).
diff --git a/ci/pip_requirements.txt b/ci/pip_requirements.txt
index 33a7f0b..d14cee3 100644
--- a/ci/pip_requirements.txt
+++ b/ci/pip_requirements.txt
@@ -1,4 +1,4 @@
-HTTPretty
+HTTPretty >= 0.8.14
bz2file
contextlib2
coveralls
diff --git a/doc/source/_static/style.css b/doc/source/_static/style.css
index 06016de..712ac18 100644
--- a/doc/source/_static/style.css
+++ b/doc/source/_static/style.css
@@ -19,6 +19,10 @@ code {
font-size: 100% !important;
}
+code.descclassname, code.descname {
+ padding: 2px 0px;
+}
+
cite, code.docutils.literal:not(.xref) {
padding: 1px 4px !important;
font-size: 100% !important;
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 30a8a04..14e826d 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -7,6 +7,10 @@ import os
import types
import re
+# Force matplotlib to not use any Xwindows backend.
+import matplotlib
+matplotlib.use('Agg')
+
import sphinx
import sphinx.ext.autosummary as autosummary
@@ -128,6 +132,7 @@ extensions = [
# Determine if the matplotlib has a recent enough version of the
# plot_directive.
+
try:
from matplotlib.sphinxext import plot_directive
except ImportError:
@@ -409,7 +414,7 @@ import scipy as sp
np.random.seed(123)
"""
plot_include_source = True
-#plot_formats = [('png', 96), 'pdf']
+plot_formats = [('png', 96), ]
#plot_html_show_formats = False
import math
diff --git a/skbio/__init__.py b/skbio/__init__.py
index 68856a5..ebc5fe2 100644
--- a/skbio/__init__.py
+++ b/skbio/__init__.py
@@ -26,7 +26,7 @@ __all__ = ['Sequence', 'DNA', 'RNA', 'Protein', 'GeneticCode',
'TreeNode', 'nj', 'read', 'write', 'OrdinationResults']
__credits__ = "https://github.com/biocore/scikit-bio/graphs/contributors"
-__version__ = "0.4.1"
+__version__ = "0.4.2"
mottos = [
# 03/15/2014
diff --git a/skbio/alignment/_pairwise.py b/skbio/alignment/_pairwise.py
index 8bd6c8a..409532a 100644
--- a/skbio/alignment/_pairwise.py
+++ b/skbio/alignment/_pairwise.py
@@ -16,7 +16,7 @@ from future.builtins import range, zip
from skbio.alignment import TabularMSA
from skbio.alignment._ssw_wrapper import StripedSmithWaterman
from skbio.sequence import DNA, RNA, Protein
-from skbio.sequence._iupac_sequence import IUPACSequence
+from skbio.sequence import GrammaredSequence
from skbio.util import EfficiencyWarning
from skbio.util._decorator import experimental, deprecated
@@ -273,9 +273,9 @@ def local_pairwise_align(seq1, seq2, gap_open_penalty,
Parameters
----------
- seq1 : IUPACSequence
+ seq1 : GrammaredSequence
The first unaligned sequence.
- seq2 : IUPACSequence
+ seq2 : GrammaredSequence
The second unaligned sequence.
gap_open_penalty : int or float
Penalty for opening a gap (this is substracted from previous best
@@ -323,10 +323,10 @@ def local_pairwise_align(seq1, seq2, gap_open_penalty,
EfficiencyWarning)
for seq in seq1, seq2:
- if not isinstance(seq, IUPACSequence):
+ if not isinstance(seq, GrammaredSequence):
raise TypeError(
- "`seq1` and `seq2` must be IUPACSequence subclasses, not type "
- "%r" % type(seq).__name__)
+ "`seq1` and `seq2` must be %r subclasses, not type %r" %
+ (GrammaredSequence.__name__, type(seq).__name__))
if type(seq1) is not type(seq2):
raise TypeError(
@@ -538,9 +538,9 @@ def global_pairwise_align(seq1, seq2, gap_open_penalty, gap_extend_penalty,
Parameters
----------
- seq1 : IUPACSequence or TabularMSA
+ seq1 : GrammaredSequence or TabularMSA
The first unaligned sequence(s).
- seq2 : IUPACSequence or TabularMSA
+ seq2 : GrammaredSequence or TabularMSA
The second unaligned sequence(s).
gap_open_penalty : int or float
Penalty for opening a gap (this is substracted from previous best
@@ -602,11 +602,11 @@ def global_pairwise_align(seq1, seq2, gap_open_penalty, gap_extend_penalty,
for seq in seq1, seq2:
# We don't need to check the case where `seq` is a `TabularMSA` with a
- # dtype that isn't a subclass of `IUPACSequence`, this is guaranteed by
- # `TabularMSA`.
- if not isinstance(seq, (IUPACSequence, TabularMSA)):
+ # dtype that isn't a subclass of `GrammaredSequence`, this is
+ # guaranteed by `TabularMSA`.
+ if not isinstance(seq, (GrammaredSequence, TabularMSA)):
raise TypeError(
- "`seq1` and `seq2` must be IUPACSequence subclasses or "
+ "`seq1` and `seq2` must be GrammaredSequence subclasses or "
"TabularMSA, not type %r" % type(seq).__name__)
seq1 = _coerce_alignment_input_type(seq1)
@@ -774,7 +774,7 @@ def make_identity_substitution_matrix(match_score, mismatch_score,
def _coerce_alignment_input_type(seq):
- if isinstance(seq, IUPACSequence):
+ if isinstance(seq, GrammaredSequence):
return TabularMSA([seq])
else:
return seq
diff --git a/skbio/alignment/_tabular_msa.py b/skbio/alignment/_tabular_msa.py
index 69003df..0de32b6 100644
--- a/skbio/alignment/_tabular_msa.py
+++ b/skbio/alignment/_tabular_msa.py
@@ -19,7 +19,7 @@ import scipy.stats
from skbio._base import SkbioObject, MetadataMixin, PositionalMetadataMixin
from skbio.sequence import Sequence
-from skbio.sequence._iupac_sequence import IUPACSequence
+from skbio.sequence._grammared_sequence import GrammaredSequence
from skbio.util._decorator import experimental, classonlymethod, overrides
from skbio.util._misc import resolve_key
from skbio.alignment._indexing import TabularMSAILoc, TabularMSALoc
@@ -35,7 +35,7 @@ class TabularMSA(MetadataMixin, PositionalMetadataMixin, SkbioObject):
Parameters
----------
- sequences : iterable of IUPACSequence, TabularMSA
+ sequences : iterable of GrammaredSequence, TabularMSA
Aligned sequences in the MSA. Sequences must all be the same type and
length. For example, `sequences` could be an iterable of ``DNA``,
``RNA``, or ``Protein`` sequences. If `sequences` is a ``TabularMSA``,
@@ -288,12 +288,12 @@ class TabularMSA(MetadataMixin, PositionalMetadataMixin, SkbioObject):
Returns
-------
- TabularMSA, IUPACSequence, Sequence
+ TabularMSA, GrammaredSequence, Sequence
A ``TabularMSA`` is returned when `seq_idx` and `pos_idx` are
- non-scalars. A ``IUPACSequence`` of type ``msa.dtype`` is returned
- when `seq_idx` is a scalar (this object will match the dtype of the
- MSA). A ``Sequence`` is returned when `seq_idx` is non-scalar and
- `pos_idx` is scalar.
+ non-scalars. A ``GrammaredSequence`` of type ``msa.dtype`` is
+ returned when `seq_idx` is a scalar (this object will match the
+ dtype of the MSA). A ``Sequence`` is returned when `seq_idx` is
+ non-scalar and `pos_idx` is scalar.
See Also
--------
@@ -554,12 +554,12 @@ class TabularMSA(MetadataMixin, PositionalMetadataMixin, SkbioObject):
Returns
-------
- TabularMSA, IUPACSequence, Sequence
+ TabularMSA, GrammaredSequence, Sequence
A ``TabularMSA`` is returned when `seq_idx` and `pos_idx` are
- non-scalars. A ``IUPACSequence`` of type ``msa.dtype`` is returned
- when `seq_idx` is a scalar (this object will match the dtype of the
- MSA). A ``Sequence`` is returned when `seq_idx` is non-scalar and
- `pos_idx` is scalar.
+ non-scalars. A ``GrammaredSequence`` of type ``msa.dtype`` is
+ returned when `seq_idx` is a scalar (this object will match the
+ dtype of the MSA). A ``Sequence`` is returned when `seq_idx` is
+ non-scalar and `pos_idx` is scalar.
See Also
--------
@@ -713,8 +713,8 @@ class TabularMSA(MetadataMixin, PositionalMetadataMixin, SkbioObject):
Parameters
----------
dictionary : dict
- Dictionary mapping keys to ``IUPACSequence`` sequence objects. The
- ``TabularMSA`` object will have its index labels set
+ Dictionary mapping keys to ``GrammaredSequence`` sequence objects.
+ The ``TabularMSA`` object will have its index labels set
to the keys in the dictionary.
Returns
@@ -924,7 +924,7 @@ class TabularMSA(MetadataMixin, PositionalMetadataMixin, SkbioObject):
Yields
------
- IUPACSequence
+ GrammaredSequence
Each sequence in the order they are stored in the MSA.
Examples
@@ -945,7 +945,7 @@ class TabularMSA(MetadataMixin, PositionalMetadataMixin, SkbioObject):
Yields
------
- IUPACSequence
+ GrammaredSequence
Each sequence in reverse order from how they are stored in the MSA.
Examples
@@ -1710,7 +1710,7 @@ class TabularMSA(MetadataMixin, PositionalMetadataMixin, SkbioObject):
Parameters
----------
- sequence : IUPACSequence
+ sequence : GrammaredSequence
Sequence to be appended. Must match the dtype of the MSA and the
number of positions in the MSA.
minter : callable or metadata key, optional
@@ -1730,7 +1730,7 @@ class TabularMSA(MetadataMixin, PositionalMetadataMixin, SkbioObject):
If neither `minter` nor `index` are provided and the MSA has a
non-default index.
TypeError
- If the sequence object isn't an ``IUPACSequence``.
+ If the sequence object isn't a ``GrammaredSequence``.
TypeError
If the type of the sequence does not match the dtype of the MSA.
ValueError
@@ -1792,7 +1792,7 @@ class TabularMSA(MetadataMixin, PositionalMetadataMixin, SkbioObject):
Parameters
----------
- sequences : iterable of IUPACSequence
+ sequences : iterable of GrammaredSequence
Sequences to be appended. Must match the dtype of the MSA and the
number of positions in the MSA.
minter : callable or metadata key, optional
@@ -1816,7 +1816,8 @@ class TabularMSA(MetadataMixin, PositionalMetadataMixin, SkbioObject):
ValueError
If `index` is not the same length as `sequences`.
TypeError
- If `sequences` contains an object that isn't an ``IUPACSequence``.
+ If `sequences` contains an object that isn't a
+ ``GrammaredSequence``.
TypeError
If `sequence` contains a type that does not match the dtype of the
MSA.
@@ -1910,10 +1911,10 @@ class TabularMSA(MetadataMixin, PositionalMetadataMixin, SkbioObject):
else:
sequence = sequences[0]
expected_dtype = type(sequence)
- if not issubclass(expected_dtype, IUPACSequence):
+ if not issubclass(expected_dtype, GrammaredSequence):
raise TypeError(
"Each sequence must be of type %r, not type %r"
- % (IUPACSequence.__name__, expected_dtype.__name__))
+ % (GrammaredSequence.__name__, expected_dtype.__name__))
expected_length = len(sequence)
for sequence in sequences:
diff --git a/skbio/alignment/tests/test_pairwise.py b/skbio/alignment/tests/test_pairwise.py
index b3d9fef..d3ceb73 100644
--- a/skbio/alignment/tests/test_pairwise.py
+++ b/skbio/alignment/tests/test_pairwise.py
@@ -24,28 +24,28 @@ from skbio.alignment._pairwise import (
_init_matrices_sw, _init_matrices_nw,
_compute_score_and_traceback_matrices, _traceback, _first_largest,
_compute_substitution_score)
-from skbio.sequence._iupac_sequence import IUPACSequence
+from skbio.sequence import GrammaredSequence
from skbio.util._decorator import classproperty, overrides
-class CustomSequence(IUPACSequence):
+class CustomSequence(GrammaredSequence):
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def gap_chars(cls):
return set('^$')
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def default_gap_char(cls):
return '^'
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def nondegenerate_chars(cls):
return set('WXYZ')
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def degenerate_map(cls):
return {}
@@ -134,7 +134,8 @@ class PairwiseAlignmentTests(TestCase):
def test_global_pairwise_align_invalid_type(self):
with six.assertRaisesRegex(self, TypeError,
- "IUPACSequence.*TabularMSA.*'Sequence'"):
+ "GrammaredSequence.*"
+ "TabularMSA.*'Sequence'"):
global_pairwise_align(DNA('ACGT'), Sequence('ACGT'), 1.0, 1.0, {})
def test_global_pairwise_align_dtype_mismatch(self):
@@ -460,7 +461,8 @@ class PairwiseAlignmentTests(TestCase):
self.assertEqual(start_end_no_sub, start_end_alt_sub)
def test_local_pairwise_align_invalid_type(self):
- with six.assertRaisesRegex(self, TypeError, 'IUPACSequence.*Sequence'):
+ with six.assertRaisesRegex(self, TypeError,
+ 'GrammaredSequence.*Sequence'):
local_pairwise_align(DNA('ACGT'), Sequence('ACGT'), 1.0, 1.0, {})
def test_local_pairwise_align_type_mismatch(self):
diff --git a/skbio/alignment/tests/test_tabular_msa.py b/skbio/alignment/tests/test_tabular_msa.py
index 595994a..7f61e76 100644
--- a/skbio/alignment/tests/test_tabular_msa.py
+++ b/skbio/alignment/tests/test_tabular_msa.py
@@ -21,7 +21,7 @@ import pandas as pd
import scipy.stats
from skbio import Sequence, DNA, RNA, Protein, TabularMSA
-from skbio.sequence._iupac_sequence import IUPACSequence
+from skbio.sequence import GrammaredSequence
from skbio.util._decorator import classproperty, overrides
from skbio.util._testing import (ReallyEqualMixin, MetadataMixinTests,
PositionalMetadataMixinTests,
@@ -75,10 +75,10 @@ class TestTabularMSA(unittest.TestCase, ReallyEqualMixin):
def test_constructor_invalid_dtype(self):
with six.assertRaisesRegex(self, TypeError,
- 'IUPACSequence.*Sequence'):
+ 'GrammaredSequence.*Sequence'):
TabularMSA([Sequence('')])
- with six.assertRaisesRegex(self, TypeError, 'IUPACSequence.*int'):
+ with six.assertRaisesRegex(self, TypeError, 'GrammaredSequence.*int'):
TabularMSA([42, DNA('')])
def test_constructor_not_monomorphic(self):
@@ -1829,7 +1829,7 @@ class TestAppend(unittest.TestCase):
msa = TabularMSA([])
with six.assertRaisesRegex(self, TypeError,
- 'IUPACSequence.*Sequence'):
+ 'GrammaredSequence.*Sequence'):
msa.append(Sequence(''))
self.assertEqual(msa, TabularMSA([]))
@@ -2087,7 +2087,7 @@ class TestExtend(unittest.TestCase):
msa = TabularMSA([])
with six.assertRaisesRegex(self, TypeError,
- 'IUPACSequence.*Sequence'):
+ 'GrammaredSequence.*Sequence'):
msa.extend([Sequence('')])
self.assertEqual(msa, TabularMSA([]))
@@ -3291,19 +3291,24 @@ class TestGapFrequencies(unittest.TestCase):
npt.assert_array_equal(np.array([0.0, 2/3, 1/3, 1.0, 1.0]), freqs)
def test_relative_frequencies_precise(self):
- class CustomSequence(IUPACSequence):
+ class CustomSequence(GrammaredSequence):
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def gap_chars(cls):
return set('0123456789')
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
+ def default_gap_char(cls):
+ return '0'
+
+ @classproperty
+ @overrides(GrammaredSequence)
def nondegenerate_chars(cls):
return set('')
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def degenerate_map(cls):
return {}
@@ -3314,19 +3319,24 @@ class TestGapFrequencies(unittest.TestCase):
npt.assert_array_equal(np.array([1.0]), freqs)
def test_custom_gap_characters(self):
- class CustomSequence(IUPACSequence):
+ class CustomSequence(GrammaredSequence):
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def gap_chars(cls):
return set('#$*')
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
+ def default_gap_char(cls):
+ return '#'
+
+ @classproperty
+ @overrides(GrammaredSequence)
def nondegenerate_chars(cls):
return set('ABC-.')
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def degenerate_map(cls):
return {'D': 'ABC-.'}
diff --git a/skbio/diversity/alpha/_lladser.py b/skbio/diversity/alpha/_lladser.py
index d59d7a3..7c35c9c 100644
--- a/skbio/diversity/alpha/_lladser.py
+++ b/skbio/diversity/alpha/_lladser.py
@@ -213,7 +213,7 @@ def _get_interval_for_r_new_otus(seq, r):
# bail out if not enough unseen
if not count or (unseen < r):
- raise StopIteration
+ return
# make a copy of seen before yielding, as we'll continue to add to the
# set in subsequent iterations
diff --git a/skbio/io/__init__.py b/skbio/io/__init__.py
index 3719814..495a7c5 100644
--- a/skbio/io/__init__.py
+++ b/skbio/io/__init__.py
@@ -25,6 +25,7 @@ see the associated documentation.
ordination
phylip
qseq
+ stockholm
.. currentmodule:: skbio.io.registry
@@ -62,6 +63,7 @@ User exceptions and warnings
PhylipFormatError
QSeqFormatError
QUALFormatError
+ StockholmFormatError
Subpackages
-----------
@@ -201,7 +203,8 @@ from ._exception import (UnrecognizedFormatError, FileFormatError,
FASTAFormatError, GenBankFormatError, IOSourceError,
FASTQFormatError, LSMatFormatError, NewickFormatError,
OrdinationFormatError, PhylipFormatError,
- QSeqFormatError, QUALFormatError)
+ QSeqFormatError, QUALFormatError,
+ StockholmFormatError)
from .registry import write, read, sniff, create_format, io_registry
from .util import open
@@ -221,7 +224,8 @@ __all__ = ['write', 'read', 'sniff', 'open', 'io_registry', 'create_format',
'OrdinationFormatError',
'PhylipFormatError',
'QSeqFormatError',
- 'QUALFormatError']
+ 'QUALFormatError',
+ 'StockholmFormatError']
# Necessary to import each file format module to have them added to the I/O
@@ -238,6 +242,7 @@ import_module('skbio.io.format.ordination')
import_module('skbio.io.format.phylip')
import_module('skbio.io.format.qseq')
import_module('skbio.io.format.genbank')
+import_module('skbio.io.format.stockholm')
# This is meant to be a handy indicator to the user that they have done
# something wrong.
diff --git a/skbio/io/_exception.py b/skbio/io/_exception.py
index 2db661d..4cad028 100644
--- a/skbio/io/_exception.py
+++ b/skbio/io/_exception.py
@@ -84,6 +84,11 @@ class QSeqFormatError(FileFormatError):
pass
+class StockholmFormatError(FileFormatError):
+ """Raised when a ``stockholm`` formatted file cannot be parsed."""
+ pass
+
+
class InvalidRegistrationError(Exception):
"""Raised if function doesn't meet the expected API of its registration."""
pass
diff --git a/skbio/io/format/_base.py b/skbio/io/format/_base.py
index 9f8be9c..9087be8 100644
--- a/skbio/io/format/_base.py
+++ b/skbio/io/format/_base.py
@@ -85,7 +85,7 @@ def _get_phred_offset_and_range(variant, phred_offset, errors):
elif variant == 'solexa':
phred_offset = 64
phred_range = (-5, 62)
- raise NotImplementedError(errors[1])
+ raise ValueError(errors[1])
else:
raise ValueError("Unrecognized variant %r." % variant)
else:
diff --git a/skbio/io/format/clustal.py b/skbio/io/format/clustal.py
index 3a12ddb..aab3e88 100644
--- a/skbio/io/format/clustal.py
+++ b/skbio/io/format/clustal.py
@@ -49,14 +49,16 @@ Format Parameters
-----------------
The only supported format parameter is ``constructor``, which specifies the
type of in-memory sequence object to read each aligned sequence into. This must
-be a subclass of ``IUPACSequence`` (e.g., ``DNA``, ``RNA``, ``Protein``) and is
-a required format parameter. For example, if you know that the clustal file
-you're reading contains DNA sequences, you would pass ``constructor=DNA`` to
-the reader call.
+be a subclass of ``GrammaredSequence`` (e.g., ``DNA``, ``RNA``, ``Protein``)
+and is a required format parameter. For example, if you know that the clustal
+file you're reading contains DNA sequences, you would pass ``constructor=DNA``
+to the reader call.
Examples
--------
-Assume we have a clustal-formatted file of RNA sequences::
+Assume we have a clustal-formatted file of RNA sequences:
+
+.. code-block:: none
CLUSTAL W (1.82) multiple sequence alignment
diff --git a/skbio/io/format/fasta.py b/skbio/io/format/fasta.py
index e35ed02..8a54bcf 100644
--- a/skbio/io/format/fasta.py
+++ b/skbio/io/format/fasta.py
@@ -203,8 +203,8 @@ When reading into a ``Sequence`` generator, ``constructor`` defaults to
``Sequence`` and must be a subclass of ``Sequence`` if supplied.
When reading into a ``TabularMSA``, ``constructor`` is a required format
-parameter and must be a subclass of ``IUPACSequence`` (e.g., ``DNA``, ``RNA``,
-``Protein``).
+parameter and must be a subclass of ``GrammaredSequence`` (e.g., ``DNA``,
+``RNA``, ``Protein``).
.. note:: The FASTA sniffer will not attempt to guess the ``constructor``
parameter.
@@ -839,7 +839,10 @@ def _parse_fasta_raw(fh, data_parser, error_type):
"""
# Skip any blank or whitespace-only lines at beginning of file
- seq_header = next(_line_generator(fh, skip_blanks=True))
+ try:
+ seq_header = next(_line_generator(fh, skip_blanks=True))
+ except StopIteration:
+ return
# header check inlined here and below for performance
if seq_header.startswith('>'):
diff --git a/skbio/io/format/fastq.py b/skbio/io/format/fastq.py
index c2b58ea..79ce877 100644
--- a/skbio/io/format/fastq.py
+++ b/skbio/io/format/fastq.py
@@ -18,7 +18,9 @@ FASTA/QUAL because the quality scores are stored in the same file as the
biological sequence data.
An example FASTQ-formatted file containing two DNA sequences and their quality
-scores::
+scores:
+
+.. code-block:: none
@seq1 description 1
AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
@@ -326,7 +328,10 @@ def _fastq_sniffer(fh):
def _fastq_to_generator(fh, variant=None, phred_offset=None,
constructor=Sequence, **kwargs):
# Skip any blank or whitespace-only lines at beginning of file
- seq_header = next(_line_generator(fh, skip_blanks=True))
+ try:
+ seq_header = next(_line_generator(fh, skip_blanks=True))
+ except StopIteration:
+ return
if not seq_header.startswith('@'):
raise FASTQFormatError(
diff --git a/skbio/io/format/genbank.py b/skbio/io/format/genbank.py
index 7d3ec5d..a6d931c 100644
--- a/skbio/io/format/genbank.py
+++ b/skbio/io/format/genbank.py
@@ -265,7 +265,6 @@ from future.builtins import range, zip
import re
import numpy as np
import pandas as pd
-from datetime import datetime
from functools import partial
from skbio.io import create_format, GenBankFormatError
@@ -277,8 +276,6 @@ from skbio.sequence import Sequence, DNA, RNA, Protein
genbank = create_format('genbank')
-# date format in locus line of genbank record
-_TIME_FORMAT = '%d-%b-%Y'
# This list is ordered
# used to read and write genbank file.
_HEADERS = ['LOCUS',
@@ -510,12 +507,11 @@ def _parse_locus(lines):
"Could not parse the LOCUS line:\n%s" % line)
res['size'] = int(res['size'])
- res['date'] = datetime.strptime(res['date'], _TIME_FORMAT)
return res
def _serialize_locus(header, obj, indent=12):
- '''Serilize LOCUS line.
+ '''Serialize LOCUS line.
Parameters
----------
@@ -523,8 +519,6 @@ def _serialize_locus(header, obj, indent=12):
'''
# use 'or' to convert None to ''
kwargs = {k: v or '' for k, v in obj.items()}
- # convert datetime to str
- kwargs['date'] = kwargs['date'].strftime(_TIME_FORMAT).upper()
return ('{header:<{indent}}{locus_name} {size} {unit}'
' {mol_type} {shape} {division} {date}\n').format(
diff --git a/skbio/io/format/phylip.py b/skbio/io/format/phylip.py
index 3e056c9..6ee7b15 100644
--- a/skbio/io/format/phylip.py
+++ b/skbio/io/format/phylip.py
@@ -126,10 +126,10 @@ Format Parameters
-----------------
The only supported format parameter is ``constructor``, which specifies the
type of in-memory sequence object to read each aligned sequence into. This must
-be a subclass of ``IUPACSequence`` (e.g., ``DNA``, ``RNA``, ``Protein``) and is
-a required format parameter. For example, if you know that the PHYLIP file
-you're reading contains DNA sequences, you would pass ``constructor=DNA`` to
-the reader call.
+be a subclass of ``GrammaredSequence`` (e.g., ``DNA``, ``RNA``, ``Protein``)
+and is a required format parameter. For example, if you know that the PHYLIP
+file you're reading contains DNA sequences, you would pass ``constructor=DNA``
+to the reader call.
Examples
--------
diff --git a/skbio/io/format/stockholm.py b/skbio/io/format/stockholm.py
new file mode 100644
index 0000000..b4f2cb3
--- /dev/null
+++ b/skbio/io/format/stockholm.py
@@ -0,0 +1,710 @@
+"""
+Stockholm format (:mod:`skbio.io.format.stockholm`)
+===================================================
+
+.. currentmodule:: skbio.io.format.stockholm
+
+The Stockholm format is a multiple sequence alignment format (MSA) that
+optionally supports storing arbitrary alignment features (metadata). Features
+can be placed into four different categories: GF, GS, GR, and GC (described in
+more detail below).
+
+An example Stockholm file, taken from [1]_:
+
+.. code-block:: none
+
+ # STOCKHOLM 1.0
+ #=GF ID UPSK
+ #=GF SE Predicted; Infernal
+ #=GF SS Published; PMID 9223489
+ #=GF RN [1]
+ #=GF RM 9223489
+ #=GF RT The role of the pseudoknot at the 3' end of turnip yellow mosaic
+ #=GF RT virus RNA in minus-strand synthesis by the viral RNA-dependent \
+RNA
+ #=GF RT polymerase.
+ #=GF RA Deiman BA, Kortlever RM, Pleij CW;
+ #=GF RL J Virol 1997;71:5990-5996.
+ AF035635.1/619-641 UGAGUUCUCGAUCUCUAAAAUCG
+ M24804.1/82-104 UGAGUUCUCUAUCUCUAAAAUCG
+ J04373.1/6212-6234 UAAGUUCUCGAUCUUUAAAAUCG
+ M24803.1/1-23 UAAGUUCUCGAUCUCUAAAAUCG
+ #=GC SS_cons .AAA....<<<<aaa....>>>>
+ //
+
+Format Support
+--------------
+**Has Sniffer: Yes**
+
+**State: Experimental as of 0.4.2.**
+
++------+------+---------------------------------------------------------------+
+|Reader|Writer| Object Class |
++======+======+===============================================================+
+|Yes |Yes |:mod:`skbio.alignment.TabularMSA` |
++------+------+---------------------------------------------------------------+
+
+Format Specification
+--------------------
+The Stockholm format consists of a header, a multiple sequence alignment,
+associated metadata (features), and a footer.
+
+Header
+^^^^^^
+The first line of a Stockholm file must be the following header:
+
+.. code-block:: none
+
+ # STOCKHOLM 1.0
+
+Multiple Sequence Alignment
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Sequence lines consist of a sequence name, followed by whitespace, followed by
+the aligned sequence. For example::
+
+ seq1 ACG-T-GGT
+ seq2 ACCGTTCG-
+
+Sequence names (``seq1``, ``seq2``) are stored in the ``TabularMSA``
+``index``.
+
+.. note:: scikit-bio currently supports reading Stockholm files where each
+ sequence is contained on a single line. Interleaved/wrap-around Stockholm
+ files are not supported. When writing, each sequence will be placed on its
+ own line.
+
+.. warning:: Sequence names must be unique in the Stockholm file. Likewise,
+ when writing from a ``TabularMSA``, ``index`` must be unique.
+
+Metadata
+^^^^^^^^
+Stockholm files support storing arbitrary metadata (features) about the MSA.
+All metadata described in the following sections are optional and may appear in
+any order. Metadata "mark-up" lines begin with either ``#=GF``, ``#=GS``,
+``#=GR``, or ``#=GC``, and each line describes a single feature of the
+alignment.
+
+.. note:: Stockholm format supports generic features. [1]_ and [2]_ provide a
+ list of common features output by Pfam/Rfam. scikit-bio does not
+ require that these features are present. These features are processed in the
+ same way as any arbitrary feature would be, as a simple key-value pair of
+ strings. When writing, feature names, feature data, and sequence names are
+ converted to type ``str``.
+
+.. note:: When writing a Stockholm file, scikit-bio will place the metadata in
+ the format's recommended order:
+
+ - GF: Above the alignment
+ - GS: Above the alignment (after GF)
+ - GR: Below corresponding sequence
+ - GC: Below the alignment
+
+GF metadata
++++++++++++
+Data relating to the multiple sequence alignment as a whole, such as authors or
+number of sequences in the alignment. Starts with ``#=GF`` followed by a
+feature name and data relating to the feature. Typically comes first in a
+Stockholm file.
+
+For example (taken from [2]_):
+
+.. code-block:: none
+
+ #=GF DE CBS domain
+
+Where ``DE`` is the feature name and ``CBS Domain`` is the feature data.
+
+GF metadata is stored in the ``TabularMSA`` ``metadata`` dictionary.
+
+.. note:: When reading, duplicate GF feature names will have their values
+ concatenated in the order they appear in the file. When writing, each GF
+ feature will be placed on its own line, regardless of length.
+
+.. note:: Trees labelled with ``NH``/``TN`` are handled differently than other
+ GF features. When reading a Stockholm file with these features, the reader
+ follows the rules described in [2]_.
+
+ A single tree without an identifier will be stored as::
+
+ metadata = {
+ 'NH': 'tree in NHX format'
+ }
+
+ A single tree with an identifier will be stored as::
+
+ metadata = {
+ 'NH': {
+ 'tree-id': 'tree in NHX format'
+ }
+ }
+
+ Multiple trees (which must have identifiers) will be stored as::
+
+ metadata = {
+ 'NH': {
+ 'tree-id-1': 'tree in NHX format',
+ 'tree-id-2': 'tree in NHX format'
+ }
+ }
+
+GS metadata
++++++++++++
+Data relating to a specific sequence in the multiple sequence alignment.
+Starts with ``#=GS`` followed by the sequence name followed by a feature name
+and data relating to the feature. Typically comes after GF metadata in a
+Stockholm file.
+
+For example (taken from [2]_):
+
+.. code-block:: none
+
+ #=GS O83071/259-312 AC O83071
+
+Where ``O83071/259-312`` is the sequence name, ``AC`` is the feature name, and
+``083071`` is the feature data.
+
+GS metadata is stored in the sequence-specific ``metadata`` dictionary.
+
+.. note:: When reading, duplicate GS feature names will have their values
+ concatenated in the order they appear in the file. When writing, each GS
+ feature will be placed on its own line, regardless of length.
+
+GR metadata
++++++++++++
+Data relating to the columns of a specific sequence in a multiple sequence
+alignment. Starts with ``#=GR`` followed by the sequence name followed by a
+feature name and data relating to the feature, one character per column.
+Typically comes after the sequence line it relates to.
+
+For example (taken from [2]_):
+
+.. code-block:: none
+
+ #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH
+
+Where ``O31698/18-71`` is the sequence name, ``SS`` is the feature name, and
+``CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH`` is the feature data.
+
+GR metadata is stored in sequence-specific ``positional_metadata``.
+
+.. note:: Duplicate GR feature names attributed to a single sequence are
+ disallowed.
+
+GC metadata
++++++++++++
+Data relating to the columns of the multiple sequence alignment as a whole.
+Starts with ``#=GC`` followed by a feature name and data relating to the
+feature, one character per column. Typically comes at the end of the multiple
+sequence alignment.
+
+For example (taken from [2]_):
+
+.. code-block:: none
+
+ #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH
+
+Where ``SS_cons`` is the feature name and
+``CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH`` is the feature data.
+
+GC metadata is stored in ``TabularMSA`` ``positional_metadata``.
+
+.. note:: Duplicate GC feature names are disallowed.
+
+Footer
+^^^^^^
+The final line of a Stockholm file must be the following footer::
+
+ //
+
+.. note:: scikit-bio currently supports reading a Stockholm file containing a
+ single MSA. If the file contains more than one MSA, only the first MSA will
+ be read into a ``TabularMSA``.
+
+Format Parameters
+-----------------
+The only supported format parameter is ``constructor``, which specifies the
+type of in-memory sequence object to read each aligned sequence into. This must
+be a subclass of ``GrammaredSequence`` (e.g., ``DNA``, ``RNA``, ``Protein``)
+and is a required format parameter. For example, if you know that the Stockholm
+file you're reading contains DNA sequences, you would pass ``constructor=DNA``
+to the reader call.
+
+Examples
+--------
+Suppose we have a Stockholm file containing an MSA of protein sequences
+(modified from [2]_):
+
+>>> import skbio.io
+>>> from io import StringIO
+>>> from skbio import Protein, TabularMSA
+>>> fs = '\\n'.join([
+... '# STOCKHOLM 1.0',
+... '#=GF CC CBS domains are small intracellular modules mostly'
+... ' found ',
+... '#=GF CC in 2 or four copies within a protein.',
+... '#=GS O83071/192-246 AC O83071',
+... '#=GS O31698/88-139 OS Bacillus subtilis',
+... 'O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRV',
+... '#=GR O83071/192-246 SA 999887756453524252..55152525....36463',
+... 'O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVA',
+... 'O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAI',
+... 'O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFV',
+... 'O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFV',
+... '#=GR O31699/88-139 AS ________________*____________________',
+... '#=GR O31699/88-139 IN ____________1______________2_________',
+... '#=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEE',
+... '//'
+... ])
+>>> fh = StringIO(fs)
+>>> msa = TabularMSA.read(fh, constructor=Protein)
+>>> msa # doctest: +NORMALIZE_WHITESPACE
+TabularMSA[Protein]
+----------------------------------------------------------------------
+Metadata:
+ 'CC': 'CBS domains are small intracellular modules mostly found in
+ 2 or four copies within a protein.'
+Positional metadata:
+ 'SS_cons': <dtype: object>
+Stats:
+ sequence count: 5
+ position count: 37
+----------------------------------------------------------------------
+MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRV
+MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVA
+MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAI
+EVMLTDIPRLHINDPIMK..GFGMVINN......GFV
+EVMLTDIPRLHINDPIMK..GFGMVINN......GFV
+
+The sequence names are stored in the ``index``:
+
+>>> msa.index
+Index(['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139',
+ 'O31699/88-139'],
+ dtype='object')
+
+The ``TabularMSA`` has GF metadata stored in its ``metadata`` dictionary:
+
+>>> msa.metadata
+OrderedDict([('CC', 'CBS domains are small intracellular modules mostly found \
+in 2 or four copies within a protein.')])
+
+GC metadata is stored in the ``TabularMSA`` ``positional_metadata``:
+
+>>> msa.positional_metadata # doctest: +ELLIPSIS
+ SS_cons
+0 C
+1 C
+2 C
+3 C
+4 C
+5 H
+6 H
+7 H
+8 H
+9 H
+...
+
+GS metadata is stored in the sequence-specific ``metadata`` dictionary:
+
+>>> msa[0].metadata
+OrderedDict([('AC', 'O83071')])
+
+GR metadata is stored in sequence-specific ``positional_metadata``:
+
+>>> msa[4].positional_metadata # doctest: +ELLIPSIS
+ AS IN
+0 _ _
+1 _ _
+2 _ _
+3 _ _
+4 _ _
+5 _ _
+6 _ _
+7 _ _
+8 _ _
+9 _ _
+...
+
+Let's write this ``TabularMSA`` in Stockholm format:
+
+>>> fh = StringIO()
+>>> _ = msa.write(fh, format='stockholm')
+>>> print(fh.getvalue())
+# STOCKHOLM 1.0
+#=GF CC CBS domains are small intracellular modules mostly found in 2 or four \
+copies within a protein.
+#=GS O83071/192-246 AC O83071
+#=GS O31698/88-139 OS Bacillus subtilis
+O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRV
+#=GR O83071/192-246 SA 999887756453524252..55152525....36463
+O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVA
+O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAI
+O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFV
+O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFV
+#=GR O31699/88-139 AS ________________*____________________
+#=GR O31699/88-139 IN ____________1______________2_________
+#=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEE
+//
+<BLANKLINE>
+>>> fh.close()
+
+References
+==========
+.. [1] https://en.wikipedia.org/wiki/Stockholm_format
+.. [2] http://sonnhammer.sbc.su.se/Stockholm.html
+
+"""
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2013--, scikit-bio development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from __future__ import (absolute_import, division, print_function,
+ unicode_literals)
+from future.utils import viewitems
+from future.builtins import zip
+
+from collections import OrderedDict
+
+from skbio.alignment import TabularMSA
+from skbio.sequence._grammared_sequence import GrammaredSequence
+from skbio.io import create_format, StockholmFormatError
+
+stockholm = create_format('stockholm')
+
+
+ at stockholm.sniffer()
+def _stockholm_sniffer(fh):
+ # Smells a Stockholm file if the following conditions are met:
+ # - File isn't empty
+ # - File contains correct header
+ try:
+ line = next(fh)
+ except StopIteration:
+ return False, {}
+
+ if _is_header(line):
+ return True, {}
+
+ return False, {}
+
+
+ at stockholm.reader(TabularMSA)
+def _stockholm_to_tabular_msa(fh, constructor=None):
+ # Checks that user has passed required constructor parameter
+ if constructor is None:
+ raise ValueError("Must provide `constructor` parameter.")
+ # Checks that contructor parameter is supported
+ elif not issubclass(constructor, GrammaredSequence):
+ raise TypeError("`constructor` must be a subclass of "
+ "`GrammaredSequence`.")
+
+ # Checks that the file isn't empty
+ try:
+ line = next(fh)
+ except StopIteration:
+ raise StockholmFormatError("File is empty.")
+ # Checks that the file follows basic format (includes the required header)
+ if not _is_header(line):
+ raise StockholmFormatError("File missing required Stockholm header "
+ "line.")
+ msa_data = _MSAData()
+ for line in fh:
+ if line.isspace():
+ continue
+
+ line = line.rstrip('\n')
+
+ if _is_sequence_line(line):
+ seq_name, seq_data = _parse_sequence_line(line)
+ msa_data.add_sequence(seq_name, seq_data)
+ elif line.startswith("#=GF"):
+ feature_name, feature_data = _parse_gf_line(line)
+ msa_data.add_gf_metadata(feature_name, feature_data)
+ elif line.startswith("#=GS"):
+ seq_name, feature_name, feature_data = _parse_gs_line(line)
+ msa_data.add_gs_metadata(seq_name, feature_name, feature_data)
+ elif line.startswith("#=GR"):
+ seq_name, feature_name, feature_data = _parse_gr_line(line)
+ msa_data.add_gr_metadata(seq_name, feature_name, feature_data)
+ elif line.startswith('#=GC'):
+ feature_name, feature_data = _parse_gc_line(line)
+ msa_data.add_gc_metadata(feature_name, feature_data)
+ elif _is_footer(line):
+ break
+ else:
+ raise StockholmFormatError("Unrecognized line: %r" % line)
+
+ if not _is_footer(line):
+ raise StockholmFormatError('Final line does not conform to Stockholm '
+ 'format. Must contain only "//".')
+
+ return msa_data.build_tabular_msa(constructor)
+
+
+# For storing intermediate data used to construct a Sequence object.
+class _MSAData(object):
+ def __init__(self):
+ self._seqs = {}
+ self._seq_order = []
+ self._metadata = OrderedDict()
+ self._positional_metadata = OrderedDict()
+
+ def add_sequence(self, seq_name, seq_data):
+ if seq_name not in self._seqs:
+ self._seqs[seq_name] = _SeqData(seq_name)
+ self._seqs[seq_name].seq = seq_data
+ self._seq_order.append(seq_name)
+
+ def add_gf_metadata(self, feature_name, feature_data):
+ # Handles first instance of labelled tree
+ if feature_name == 'TN' and 'NH' not in self._metadata:
+ self._metadata['NH'] = OrderedDict()
+ self._metadata['NH'][feature_data] = ''
+ # Handles second instance of labelled tree
+ elif feature_name == 'TN' and 'NH' in self._metadata:
+ if feature_data in self._metadata['NH']:
+ raise StockholmFormatError("Tree name %r used multiple times "
+ "in file." % feature_data)
+ self._metadata['NH'][feature_data] = ''
+ # Handles extra line(s) of an already created tree
+ elif feature_name == 'NH' and feature_name in self._metadata:
+ trees = self._metadata[feature_name]
+ tree_id = list(trees.keys())[-1]
+ self._metadata[feature_name][tree_id] = (trees[tree_id] +
+ feature_data)
+ elif feature_name in self._metadata:
+ self._metadata[feature_name] = (self._metadata[feature_name] +
+ feature_data)
+ else:
+ self._metadata[feature_name] = feature_data
+
+ def add_gc_metadata(self, feature_name, feature_data):
+ if feature_name in self._positional_metadata:
+ _raise_duplicate_error("Found duplicate GC label %r."
+ % feature_name)
+ self._positional_metadata[feature_name] = feature_data
+
+ def add_gs_metadata(self, seq_name, feature_name, feature_data):
+ if seq_name not in self._seqs:
+ self._seqs[seq_name] = _SeqData(seq_name)
+ self._seqs[seq_name].add_metadata_feature(feature_name, feature_data)
+
+ def add_gr_metadata(self, seq_name, feature_name, feature_data):
+ if seq_name not in self._seqs:
+ self._seqs[seq_name] = _SeqData(seq_name)
+ self._seqs[seq_name].add_positional_metadata_feature(feature_name,
+ feature_data)
+
+ def build_tabular_msa(self, constructor):
+ if len(self._seqs) != len(self._seq_order):
+ invalid_seq_names = set(self._seqs) - set(self._seq_order)
+ raise StockholmFormatError('Found GS or GR metadata for '
+ 'nonexistent sequence(s): %r'
+ % invalid_seq_names)
+
+ seqs = []
+ for seq_name in self._seq_order:
+ seqs.append(self._seqs[seq_name].build_sequence(constructor))
+
+ positional_metadata = self._positional_metadata
+ if not positional_metadata:
+ positional_metadata = None
+
+ metadata = self._metadata
+ if not metadata:
+ metadata = None
+
+ # Constructs TabularMSA
+ return TabularMSA(seqs, metadata=metadata,
+ positional_metadata=positional_metadata,
+ index=self._seq_order)
+
+
+class _SeqData(object):
+ def __init__(self, name):
+ self.name = name
+ self._seq = None
+ self.metadata = None
+ self.positional_metadata = None
+
+ @property
+ def seq(self):
+ return self._seq
+
+ @seq.setter
+ def seq(self, seq):
+ if self._seq is None:
+ self._seq = seq
+ else:
+ _raise_duplicate_error("Found duplicate sequence name: %r"
+ % self.name)
+
+ def add_metadata_feature(self, feature_name, feature_data):
+ if self.metadata is None:
+ self.metadata = OrderedDict()
+ if feature_name in self.metadata:
+ self.metadata[feature_name] += feature_data
+ else:
+ self.metadata[feature_name] = feature_data
+
+ def add_positional_metadata_feature(self, feature_name, feature_data):
+ if self.positional_metadata is None:
+ self.positional_metadata = OrderedDict()
+ if feature_name in self.positional_metadata:
+ _raise_duplicate_error("Found duplicate GR label %r associated "
+ "with sequence name %r"
+ % (feature_name, self.name))
+ else:
+ self.positional_metadata[feature_name] = feature_data
+
+ def build_sequence(self, constructor):
+ return constructor(self.seq, metadata=self.metadata,
+ positional_metadata=(self.positional_metadata))
+
+
+def _parse_gf_line(line):
+ line = line.split(None, 2)
+ _check_for_malformed_line(line, 3)
+ return line[1:]
+
+
+def _parse_gs_line(line):
+ line = line.split(None, 3)
+ _check_for_malformed_line(line, 4)
+ return line[1:]
+
+
+def _parse_gr_line(line):
+ line = line.split(None, 3)
+ _check_for_malformed_line(line, 4)
+ seq_name = line[1]
+ feature_name = line[2]
+ feature_data = list(line[3])
+ return seq_name, feature_name, feature_data
+
+
+def _parse_gc_line(line):
+ line = line.split(None, 2)
+ _check_for_malformed_line(line, 3)
+ feature_name = line[1]
+ feature_data = list(line[2])
+ return feature_name, feature_data
+
+
+def _parse_sequence_line(line):
+ line = line.split(None, 1)
+ _check_for_malformed_line(line, 2)
+ return line
+
+
+def _is_header(line):
+ return line == '# STOCKHOLM 1.0\n'
+
+
+def _is_footer(line):
+ return line.rstrip() == '//'
+
+
+def _is_sequence_line(line):
+ return not (line.startswith("#") or _is_footer(line))
+
+
+def _raise_duplicate_error(message):
+ raise StockholmFormatError(message+' Note: If the file being used is in '
+ 'Stockholm interleaved format, this '
+ 'is not supported by the reader.')
+
+
+def _check_for_malformed_line(line, expected_len):
+ if len(line) != expected_len:
+ raise StockholmFormatError('Line contains %d item(s). It must '
+ 'contain exactly %d item(s).'
+ % (len(line), expected_len))
+
+
+ at stockholm.writer(TabularMSA)
+def _tabular_msa_to_stockholm(obj, fh):
+ if not obj.index.is_unique:
+ raise StockholmFormatError("The TabularMSA's index labels must be"
+ " unique.")
+ # Writes header
+ fh.write("# STOCKHOLM 1.0\n")
+
+ # Writes GF data to file
+ if obj.has_metadata():
+ for gf_feature, gf_feature_data in viewitems(obj.metadata):
+ if gf_feature == 'NH' and isinstance(gf_feature_data, dict):
+ for tree_id, tree in viewitems(obj.metadata[gf_feature]):
+ fh.write("#=GF TN %s\n" % tree_id)
+ fh.write("#=GF NH %s\n" % tree)
+ else:
+ fh.write("#=GF %s %s\n" % (gf_feature, gf_feature_data))
+
+ unpadded_data = []
+ # Writes GS data to file, retrieves GR data, and retrieves sequence data
+ for seq, seq_name in zip(obj, obj.index):
+ seq_name = str(seq_name)
+ if seq.has_metadata():
+ for gs_feature, gs_feature_data in viewitems(seq.metadata):
+ fh.write("#=GS %s %s %s\n" % (seq_name, gs_feature,
+ gs_feature_data))
+ unpadded_data.append((seq_name, str(seq)))
+ if seq.has_positional_metadata():
+ df = _format_positional_metadata(seq.positional_metadata,
+ 'Sequence-specific positional '
+ 'metadata (GR)')
+ for gr_feature in df.columns:
+ gr_feature_data = ''.join(df[gr_feature])
+ gr_string = "#=GR %s %s" % (seq_name, gr_feature)
+ unpadded_data.append((gr_string, gr_feature_data))
+
+ # Retrieves GC data
+ if obj.has_positional_metadata():
+ df = _format_positional_metadata(obj.positional_metadata,
+ 'Multiple sequence alignment '
+ 'positional metadata (GC)')
+ for gc_feature in df.columns:
+ gc_feature_data = ''.join(df[gc_feature])
+ gc_string = "#=GC %s" % gc_feature
+ unpadded_data.append((gc_string, gc_feature_data))
+
+ # Writes GR, GC, and raw data to file with padding
+ _write_padded_data(unpadded_data, fh)
+
+ # Writes footer
+ fh.write("//\n")
+
+
+def _write_padded_data(data, fh):
+ max_data_len = 0
+ for label, _ in data:
+ if len(label) > max_data_len:
+ max_data_len = len(label)
+ fmt = '{0:%d} {1}\n' % max_data_len
+ for label, value in data:
+ fh.write(fmt.format(label, value))
+
+
+def _format_positional_metadata(df, data_type):
+ # Asserts positional metadata feature names are unique
+ if not df.columns.is_unique:
+ num_repeated_columns = len(df.columns) - len(set(df.columns))
+ raise StockholmFormatError('%s feature names must be unique. '
+ 'Found %d duplicate names.'
+ % (data_type, num_repeated_columns))
+
+ str_df = df.astype(str)
+
+ # Asserts positional metadata dataframe items are one character long
+ for column in str_df.columns:
+ if (str_df[column].str.len() != 1).any():
+ raise StockholmFormatError("%s must contain a single character for"
+ " each position's value. Found value(s)"
+ " in column %s of incorrect length."
+ % (data_type, column))
+ return str_df
diff --git a/skbio/io/format/tests/data/stockholm_all_data_types b/skbio/io/format/tests/data/stockholm_all_data_types
new file mode 100644
index 0000000..8a38d78
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_all_data_types
@@ -0,0 +1,15 @@
+# STOCKHOLM 1.0
+#=GF NM Kestrel Gorlick
+#=GF DT February 11, 2016
+#=GF FN Writer test file
+#=GS seq1 DT February 1, 2016
+#=GS seq1 NM Unknown
+#=GS seq3 DT Unknown
+seq1 GAGGCCATGCCCAGGTGAAG
+seq2 ACCTGAGCCACAGTAGAAGT
+seq3 CCCTTCGCTGGAAATGTATG
+#=GR seq3 AS CCGAAAGTCGTTCGAAAATG
+#=GR seq3 SS GGCGAGTCGTTCGAGCTGGC
+#=GC AS_cons CGTTCGTTCTAACAATTCCA
+#=GC SS_cons GGCGCTACGACCTACGACCG
+//
diff --git a/skbio/io/format/tests/data/stockholm_blank_lines b/skbio/io/format/tests/data/stockholm_blank_lines
new file mode 100644
index 0000000..fd3fbb0
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_blank_lines
@@ -0,0 +1,6 @@
+# STOCKHOLM 1.0
+#=GF AL ABCD
+
+#=GF NM 1234
+
+//
diff --git a/skbio/io/format/tests/data/stockholm_data_only b/skbio/io/format/tests/data/stockholm_data_only
new file mode 100644
index 0000000..a9df91f
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_data_only
@@ -0,0 +1,5 @@
+# STOCKHOLM 1.0
+seq1 ACUCCGACAUGCUCC
+seq2 UAGUGCCGAACGCUG
+seq3 GUGUGGGCGUGAUUC
+//
diff --git a/skbio/io/format/tests/data/stockholm_differing_gc_data_length b/skbio/io/format/tests/data/stockholm_differing_gc_data_length
new file mode 100644
index 0000000..f6b603d
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_differing_gc_data_length
@@ -0,0 +1,4 @@
+# STOCKHOLM 1.0
+seq1 ACGCUUGCAA
+#=GC AT_cons UGCUUGGCGCAU
+//
diff --git a/skbio/io/format/tests/data/stockholm_differing_gr_data_length b/skbio/io/format/tests/data/stockholm_differing_gr_data_length
new file mode 100644
index 0000000..018bd7f
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_differing_gr_data_length
@@ -0,0 +1,4 @@
+# STOCKHOLM 1.0
+seq1 AUCGCCUG
+#=GR seq1 AL UCGCUUG
+//
diff --git a/skbio/io/format/tests/data/stockholm_differing_seq_lengths b/skbio/io/format/tests/data/stockholm_differing_seq_lengths
new file mode 100644
index 0000000..8df950e
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_differing_seq_lengths
@@ -0,0 +1,4 @@
+# STOCKHOLM 1.0
+seq1 AUCCGCUUAC
+seq2 UGCCUGGAGCG
+//
diff --git a/skbio/io/format/tests/data/stockholm_duplicate_gc b/skbio/io/format/tests/data/stockholm_duplicate_gc
new file mode 100644
index 0000000..3a5d969
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_duplicate_gc
@@ -0,0 +1,7 @@
+# STOCKHOLM 1.0
+#=GF ID RCR2
+OSF22 TCCTCCCAGTGTCGCCCGGT
+OSF33 TTTTTTGGTCAAATTAAAGG
+#=GC SS_cons CAAGGGAAATACTACGGGAC
+#=GC SS_cons CGGGACTCGTGCGTTGTAGG
+//
diff --git a/skbio/io/format/tests/data/stockholm_duplicate_gr b/skbio/io/format/tests/data/stockholm_duplicate_gr
new file mode 100644
index 0000000..a8238e5
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_duplicate_gr
@@ -0,0 +1,8 @@
+# STOCKHOLM 1.0
+#=GF ID CBS
+#=GF AC PF00571
+LFDR2 GGCCTCAGGACGAAGCACGG
+LFDR3 AATTGTGATCATCTTACAGG
+#=GR LFDR3 OS TTTTACCAATTGCTGACAGA
+#=GR LFDR3 OS CCTGGACATCCCCCGCACGG
+//
diff --git a/skbio/io/format/tests/data/stockholm_duplicate_sequence_names b/skbio/io/format/tests/data/stockholm_duplicate_sequence_names
new file mode 100644
index 0000000..bf9f7bb
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_duplicate_sequence_names
@@ -0,0 +1,6 @@
+# STOCKHOLM 1.0
+ASR132 GGCGUUCCGUUCAGUGCUGG
+RTF112 GGGCGGUGCUAUGAAAAUAC
+ASR132 GGCGUUCCGUUCAGUGCUGG
+TTL879 AGCUAUCUCCGGGCCCUUGG
+//
diff --git a/skbio/io/format/tests/data/stockholm_duplicate_tree_ids b/skbio/io/format/tests/data/stockholm_duplicate_tree_ids
new file mode 100644
index 0000000..f9f596c
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_duplicate_tree_ids
@@ -0,0 +1,6 @@
+# STOCKHOLM 1.0
+#=GF TN tree1
+#=GF NH ABCD
+#=GF TN tree1
+#=GF NH EFGH
+//
diff --git a/skbio/io/format/tests/data/stockholm_extensive b/skbio/io/format/tests/data/stockholm_extensive
new file mode 100644
index 0000000..2129c73
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_extensive
@@ -0,0 +1,15 @@
+# STOCKHOLM 1.0
+#=GF ID CBS
+#=GF AC PF00571
+#=GF AU Bateman A
+#=GF SQ 67
+#=GS O31698/88-139 OS Bacillus subtilis
+O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS
+#=GR O83071/192-246 SA 999887756453524252..55152525....36463774777
+O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
+#=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH
+O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
+#=GR O31699/88-139 AS ________________*__________________________
+#=GR O31699/88-139 IN ____________1______________2__________0____
+#=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH
+//
diff --git a/skbio/io/format/tests/data/stockholm_extensive_mixed b/skbio/io/format/tests/data/stockholm_extensive_mixed
new file mode 100644
index 0000000..7fb60d3
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_extensive_mixed
@@ -0,0 +1,15 @@
+# STOCKHOLM 1.0
+#=GR O31699/88-139 AS ________________*__________________________
+O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS
+#=GF ID CBS
+#=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH
+#=GF AC PF00571
+O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
+#=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH
+#=GR O31699/88-139 IN ____________1______________2__________0____
+#=GS O31698/88-139 OS Bacillus subtilis
+#=GR O83071/192-246 SA 999887756453524252..55152525....36463774777
+O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
+#=GF AU Bateman A
+#=GF SQ 67
+//
diff --git a/skbio/io/format/tests/data/stockholm_invalid_data_type b/skbio/io/format/tests/data/stockholm_invalid_data_type
new file mode 100644
index 0000000..119e7c3
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_invalid_data_type
@@ -0,0 +1,4 @@
+# STOCKHOLM 1.0
+#=GF AL ABCD
+#=GZ NM 1234
+//
diff --git a/skbio/io/format/tests/data/stockholm_invalid_nonexistent_gr b/skbio/io/format/tests/data/stockholm_invalid_nonexistent_gr
new file mode 100644
index 0000000..fa964b4
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_invalid_nonexistent_gr
@@ -0,0 +1,7 @@
+# STOCKHOLM 1.0
+#=GF ID ARD
+RL1255 AAGGGUUAUUUAUAUACUUU
+RL1332 UGCUAAGAGUGGGGAUGAUU
+RL1232 GCCACAACCGAUUAGAUAGA
+#=GR RL1355 AS ACUAUAUACAUAGCUAUAUU
+//
diff --git a/skbio/io/format/tests/data/stockholm_invalid_nonexistent_gs b/skbio/io/format/tests/data/stockholm_invalid_nonexistent_gs
new file mode 100644
index 0000000..39bb07d
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_invalid_nonexistent_gs
@@ -0,0 +1,5 @@
+# STOCKHOLM 1.0
+#=GS AC14 ID ARD2
+AC12 AAGGGUUAUUUAUAUACUUU
+AC13 UGCUAAGAGUGGGGAUGAUU
+//
diff --git a/skbio/io/format/tests/data/stockholm_malformed_data_line b/skbio/io/format/tests/data/stockholm_malformed_data_line
new file mode 100644
index 0000000..6a067ae
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_malformed_data_line
@@ -0,0 +1,3 @@
+# STOCKHOLM 1.0
+seq1
+//
diff --git a/skbio/io/format/tests/data/stockholm_malformed_gc_line b/skbio/io/format/tests/data/stockholm_malformed_gc_line
new file mode 100644
index 0000000..0763020
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_malformed_gc_line
@@ -0,0 +1,4 @@
+# STOCKHOLM 1.0
+seq1 ATCCGCT
+#=GC AT_cons
+//
diff --git a/skbio/io/format/tests/data/stockholm_malformed_gf_line b/skbio/io/format/tests/data/stockholm_malformed_gf_line
new file mode 100644
index 0000000..a28f41e
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_malformed_gf_line
@@ -0,0 +1,3 @@
+# STOCKHOLM 1.0
+#=GF AL
+//
diff --git a/skbio/io/format/tests/data/stockholm_malformed_gr_line b/skbio/io/format/tests/data/stockholm_malformed_gr_line
new file mode 100644
index 0000000..f30873f
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_malformed_gr_line
@@ -0,0 +1,4 @@
+# STOCKHOLM 1.0
+seq1 ACGGTCG
+#=GR seq1
+//
diff --git a/skbio/io/format/tests/data/stockholm_malformed_gs_line b/skbio/io/format/tests/data/stockholm_malformed_gs_line
new file mode 100644
index 0000000..4017bd7
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_malformed_gs_line
@@ -0,0 +1,4 @@
+# STOCKHOLM 1.0
+#=GS seq1 AL
+seq1 ACCTG
+//
diff --git a/skbio/io/format/tests/data/stockholm_metadata_only b/skbio/io/format/tests/data/stockholm_metadata_only
new file mode 100644
index 0000000..6025bc9
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_metadata_only
@@ -0,0 +1,4 @@
+# STOCKHOLM 1.0
+#=GF NM Kestrel Gorlick
+#=GF DT February 5th, 2016
+//
diff --git a/skbio/io/format/tests/data/stockholm_minimal b/skbio/io/format/tests/data/stockholm_minimal
new file mode 100644
index 0000000..ffcb4e4
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_minimal
@@ -0,0 +1,3 @@
+# STOCKHOLM 1.0
+0235244 TGTGTCGCAGTTGTCGTTTG
+//
diff --git a/skbio/io/format/tests/data/stockholm_missing_footer b/skbio/io/format/tests/data/stockholm_missing_footer
new file mode 100644
index 0000000..88ffcff
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_missing_footer
@@ -0,0 +1,4 @@
+# STOCKHOLM 1.0
+0232455 TTATCTTAGCCTCTCTAAGT
+0234323 ATCCCACGGAAACAGATGGC
+0235244 TGTGTCGCAGTTGTCGTTTG
diff --git a/skbio/io/format/tests/data/stockholm_missing_header b/skbio/io/format/tests/data/stockholm_missing_header
new file mode 100644
index 0000000..758c5a4
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_missing_header
@@ -0,0 +1,4 @@
+0232455 TTATCTTAGCCTCTCTAAGT
+0234323 ATCCCACGGAAACAGATGGC
+0235244 TGTGTCGCAGTTGTCGTTTG
+//
diff --git a/skbio/io/format/tests/data/stockholm_multiple_msa b/skbio/io/format/tests/data/stockholm_multiple_msa
new file mode 100644
index 0000000..9695c09
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_multiple_msa
@@ -0,0 +1,18 @@
+# STOCKHOLM 1.0
+#=GF AC G2134T23
+#=GF ID ARD
+RTC2231 AAGGGUUAUUUAUAUACUUU
+RTF2124 UGCUAAGAGUGGGGAUGAUU
+RTH3322 GCCACAACCGAUUAGAUAGA
+RTB1512 UUAGAAACCGAUGGACCGAA
+#=GC AC_cons GGGACUGGACAUCUAUUCAG
+//
+# STOCKHOLM 1.0
+#=GF AC G2134T23
+#=GF ID ARD
+RTC2231 AAGGGUUAUUUAUAUACUUU
+RTF2124 UGCUAAGAGUGGGGAUGAUU
+RTH3322 GCCACAACCGAUUAGAUAGA
+RTB1512 UUAGAAACCGAUGGACCGAA
+#=GC AC_cons GGGACUGGACAUCUAUUCAG
+//
diff --git a/skbio/io/format/tests/data/stockholm_multiple_trees b/skbio/io/format/tests/data/stockholm_multiple_trees
new file mode 100644
index 0000000..232c9ac
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_multiple_trees
@@ -0,0 +1,8 @@
+# STOCKHOLM 1.0
+#=GF TN tree1
+#=GF NH ABCD
+#=GF TN tree2
+#=GF NH EFGH
+#=GF TN tree3
+#=GF NH IJKL
+//
diff --git a/skbio/io/format/tests/data/stockholm_no_data b/skbio/io/format/tests/data/stockholm_no_data
new file mode 100644
index 0000000..8d557fc
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_no_data
@@ -0,0 +1,2 @@
+# STOCKHOLM 1.0
+//
diff --git a/skbio/io/format/tests/data/stockholm_nonstring_labels b/skbio/io/format/tests/data/stockholm_nonstring_labels
new file mode 100644
index 0000000..82134dc
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_nonstring_labels
@@ -0,0 +1,7 @@
+# STOCKHOLM 1.0
+#=GF 1.3 2857
+#=GS 11214 8 123
+11214 ACTG
+#=GR 11214 1.0 1234
+#=GC 25 4321
+//
diff --git a/skbio/io/format/tests/data/stockholm_rna b/skbio/io/format/tests/data/stockholm_rna
new file mode 100644
index 0000000..7b684fd
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_rna
@@ -0,0 +1,9 @@
+# STOCKHOLM 1.0
+#=GF AC G2134T23
+#=GF ID ARD
+RTC2231 AAGGGUUAUUUAUAUACUUU
+RTF2124 UGCUAAGAGUGGGGAUGAUU
+RTH3322 GCCACAACCGAUUAGAUAGA
+RTB1512 UUAGAAACCGAUGGACCGAA
+#=GC AC_cons GGGACUGGACAUCUAUUCAG
+//
diff --git a/skbio/io/format/tests/data/stockholm_runon_gf b/skbio/io/format/tests/data/stockholm_runon_gf
new file mode 100644
index 0000000..a65830f
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_runon_gf
@@ -0,0 +1,5 @@
+# STOCKHOLM 1.0
+#=GF CC CBS domains are small intracellular modules mostly found
+#=GF CC in 2 or four copies within a protein.
+GG1344 ACTGGTTCAATG
+//
diff --git a/skbio/io/format/tests/data/stockholm_runon_gs b/skbio/io/format/tests/data/stockholm_runon_gs
new file mode 100644
index 0000000..755eb5f
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_runon_gs
@@ -0,0 +1,5 @@
+# STOCKHOLM 1.0
+#=GS seq1 AL ABCDEFGHIJKLMNOP
+#=GS seq1 AL QRSTUVWXYZ
+seq1 ATCGTTCAGTG
+//
diff --git a/skbio/io/format/tests/data/stockholm_single_tree_with_id b/skbio/io/format/tests/data/stockholm_single_tree_with_id
new file mode 100644
index 0000000..b05d33e
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_single_tree_with_id
@@ -0,0 +1,4 @@
+# STOCKHOLM 1.0
+#=GF TN tree1
+#=GF NH ABCD
+//
diff --git a/skbio/io/format/tests/data/stockholm_single_tree_without_id b/skbio/io/format/tests/data/stockholm_single_tree_without_id
new file mode 100644
index 0000000..c04df41
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_single_tree_without_id
@@ -0,0 +1,3 @@
+# STOCKHOLM 1.0
+#=GF NH ABCD
+//
diff --git a/skbio/io/format/tests/data/stockholm_two_of_each_metadata b/skbio/io/format/tests/data/stockholm_two_of_each_metadata
new file mode 100644
index 0000000..c73f2a5
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_two_of_each_metadata
@@ -0,0 +1,11 @@
+# STOCKHOLM 1.0
+#=GF NM Kestrel Gorlick
+#=GF DT February 5th, 2016
+#=GS seq1 AL ABCD
+#=GS seq1 NS 1234
+seq1 ACTGACCATGTTCA
+#=GR seq1 SS CACTACTTGTGACG
+#=GR seq1 AS TCACATCGGCGATG
+#=GC SS_cons ______1____2__
+#=GC AS_cons __1___1___3___
+//
diff --git a/skbio/io/format/tests/data/stockholm_whitespace_only_lines b/skbio/io/format/tests/data/stockholm_whitespace_only_lines
new file mode 100644
index 0000000..1bbb205
--- /dev/null
+++ b/skbio/io/format/tests/data/stockholm_whitespace_only_lines
@@ -0,0 +1,5 @@
+# STOCKHOLM 1.0
+#=GF AL ABCD
+
+#=GF NM 1234
+//
diff --git a/skbio/io/format/tests/test_base.py b/skbio/io/format/tests/test_base.py
index 441c039..a19fb25 100644
--- a/skbio/io/format/tests/test_base.py
+++ b/skbio/io/format/tests/test_base.py
@@ -39,7 +39,7 @@ class PhredDecoderTests(unittest.TestCase):
self.assertIn('`phred_offset`', str(cm.exception))
def test_solexa_variant(self):
- with self.assertRaises(NotImplementedError) as cm:
+ with self.assertRaises(ValueError) as cm:
_decode_qual_to_phred('abcd', variant='solexa')
self.assertIn('719', str(cm.exception))
@@ -124,7 +124,7 @@ class PhredEncoderTests(unittest.TestCase):
self.assertIn('`phred_offset`', str(cm.exception))
def test_solexa_variant(self):
- with self.assertRaises(NotImplementedError) as cm:
+ with self.assertRaises(ValueError) as cm:
_encode_phred_to_qual([1, 2, 3], variant='solexa')
self.assertIn('719', str(cm.exception))
diff --git a/skbio/io/format/tests/test_clustal.py b/skbio/io/format/tests/test_clustal.py
index ac91338..733d65b 100644
--- a/skbio/io/format/tests/test_clustal.py
+++ b/skbio/io/format/tests/test_clustal.py
@@ -15,7 +15,7 @@ from unittest import TestCase, main
import six
from skbio import TabularMSA
-from skbio.sequence._iupac_sequence import IUPACSequence
+from skbio.sequence._grammared_sequence import GrammaredSequence
from skbio.util._decorator import classproperty, overrides
from skbio.io.format.clustal import (
_clustal_to_tabular_msa, _tabular_msa_to_clustal, _clustal_sniffer,
@@ -25,19 +25,24 @@ from skbio.io.format.clustal import (
from skbio.io import ClustalFormatError
-class CustomSequence(IUPACSequence):
+class CustomSequence(GrammaredSequence):
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def gap_chars(cls):
return set('-.')
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
+ def default_gap_char(cls):
+ return '-'
+
+ @classproperty
+ @overrides(GrammaredSequence)
def nondegenerate_chars(cls):
return set(string.ascii_letters)
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def degenerate_map(cls):
return {}
diff --git a/skbio/io/format/tests/test_fasta.py b/skbio/io/format/tests/test_fasta.py
index 413053f..7052942 100644
--- a/skbio/io/format/tests/test_fasta.py
+++ b/skbio/io/format/tests/test_fasta.py
@@ -25,24 +25,29 @@ from skbio.io.format.fasta import (
_fasta_to_tabular_msa, _generator_to_fasta,
_sequence_to_fasta, _dna_to_fasta, _rna_to_fasta, _protein_to_fasta,
_tabular_msa_to_fasta)
-from skbio.sequence._iupac_sequence import IUPACSequence
+from skbio.sequence._grammared_sequence import GrammaredSequence
from skbio.util import get_data_path
from skbio.util._decorator import classproperty, overrides
-class CustomSequence(IUPACSequence):
+class CustomSequence(GrammaredSequence):
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def gap_chars(cls):
return set('-.')
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
+ def default_gap_char(cls):
+ return '-'
+
+ @classproperty
+ @overrides(GrammaredSequence)
def nondegenerate_chars(cls):
return set(string.ascii_letters)
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def degenerate_map(cls):
return {}
@@ -709,7 +714,7 @@ class WriterTests(TestCase):
self.msa = TabularMSA(seqs)
def empty_gen():
- raise StopIteration()
+ return
yield
def single_seq_gen():
diff --git a/skbio/io/format/tests/test_fastq.py b/skbio/io/format/tests/test_fastq.py
index cd7bab2..337429e 100644
--- a/skbio/io/format/tests/test_fastq.py
+++ b/skbio/io/format/tests/test_fastq.py
@@ -21,7 +21,7 @@ from skbio.io import FASTQFormatError
from skbio.io.format.fastq import (
_fastq_sniffer, _fastq_to_generator, _fastq_to_tabular_msa,
_generator_to_fastq, _tabular_msa_to_fastq)
-from skbio.sequence._iupac_sequence import IUPACSequence
+from skbio.sequence._grammared_sequence import GrammaredSequence
from skbio.util import get_data_path
from skbio.util._decorator import classproperty, overrides
@@ -346,7 +346,7 @@ class TestReaders(unittest.TestCase):
def test_fastq_to_generator_solexa(self):
# solexa support isn't implemented yet. should raise error even with
# valid solexa file
- with self.assertRaises(NotImplementedError):
+ with six.assertRaisesRegex(self, ValueError, 'Solexa'):
list(_fastq_to_generator(
get_data_path('solexa_full_range_original_solexa.fastq'),
variant='solexa'))
@@ -392,19 +392,24 @@ class TestReaders(unittest.TestCase):
self.assertEqual(observed, expected)
def test_fastq_to_tabular_msa(self):
- class CustomSequence(IUPACSequence):
+ class CustomSequence(GrammaredSequence):
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def gap_chars(cls):
return set('-.')
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
+ def default_gap_char(cls):
+ return '-'
+
+ @classproperty
+ @overrides(GrammaredSequence)
def nondegenerate_chars(cls):
return set(string.ascii_letters)
@classproperty
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def degenerate_map(cls):
return {}
diff --git a/skbio/io/format/tests/test_genbank.py b/skbio/io/format/tests/test_genbank.py
index 81d89bb..75003f3 100644
--- a/skbio/io/format/tests/test_genbank.py
+++ b/skbio/io/format/tests/test_genbank.py
@@ -15,7 +15,6 @@ import pandas as pd
import numpy.testing as npt
import six
from unittest import TestCase, main
-from datetime import datetime
from skbio import Protein, DNA, RNA, Sequence
from skbio.util import get_data_path
@@ -63,17 +62,17 @@ class GenBankIOTests(TestCase):
(['LOCUS NC_005816 9609 bp '
'DNA circular CON 07-FEB-2015'],
{'division': 'CON', 'mol_type': 'DNA', 'shape': 'circular',
- 'locus_name': 'NC_005816', 'date': datetime(2015, 2, 7, 0, 0),
+ 'locus_name': 'NC_005816', 'date': '07-FEB-2015',
'unit': 'bp', 'size': 9609}),
(['LOCUS SCU49845 5028 bp '
'DNA PLN 21-JUN-1999'],
{'division': 'PLN', 'mol_type': 'DNA', 'shape': None,
- 'locus_name': 'SCU49845', 'date': datetime(1999, 6, 21, 0, 0),
+ 'locus_name': 'SCU49845', 'date': '21-JUN-1999',
'unit': 'bp', 'size': 5028}),
(['LOCUS NP_001832 360 aa '
'linear PRI 18-DEC-2001'],
{'division': 'PRI', 'mol_type': None, 'shape': 'linear',
- 'locus_name': 'NP_001832', 'date': datetime(2001, 12, 18, 0, 0),
+ 'locus_name': 'NP_001832', 'date': '18-DEC-2001',
'unit': 'aa', 'size': 360}))
# test single record and read uppercase sequence
@@ -81,7 +80,7 @@ class GenBankIOTests(TestCase):
self.single_lower_fp = get_data_path('genbank_single_record_lower')
self.single = (
'GSREILDFK',
- {'LOCUS': {'date': datetime(1994, 9, 23, 0, 0),
+ {'LOCUS': {'date': '23-SEP-1994',
'division': 'BCT',
'locus_name': 'AAB29917',
'mol_type': None,
@@ -120,7 +119,7 @@ class GenBankIOTests(TestCase):
'translation': '"MKQSTIALAVLPLLFTPVTKA"',
'type_': 'CDS'}],
'KEYWORDS': 'alkaline phosphatase; signal peptide.',
- 'LOCUS': {'date': datetime(1993, 4, 26, 0, 0),
+ 'LOCUS': {'date': '26-APR-1993',
'division': 'BCT',
'locus_name': 'ECOALKP',
'mol_type': 'mRNA',
@@ -163,7 +162,7 @@ class GenBankIOTests(TestCase):
'right_partial_': True,
'type_': 'Protein'}],
'KEYWORDS': '.',
- 'LOCUS': {'date': datetime(1994, 9, 23, 0, 0),
+ 'LOCUS': {'date': '23-SEP-1994',
'division': 'BCT',
'locus_name': 'AAB29917',
'mol_type': None,
@@ -207,7 +206,7 @@ class GenBankIOTests(TestCase):
'right_partial_': True,
'type_': 'rRNA'}],
'KEYWORDS': 'ENV.',
- 'LOCUS': {'date': datetime(2010, 8, 29, 0, 0),
+ 'LOCUS': {'date': '29-AUG-2010',
'division': 'ENV',
'locus_name': 'HQ018078',
'mol_type': 'DNA',
diff --git a/skbio/io/format/tests/test_stockholm.py b/skbio/io/format/tests/test_stockholm.py
new file mode 100644
index 0000000..d3a72d0
--- /dev/null
+++ b/skbio/io/format/tests/test_stockholm.py
@@ -0,0 +1,701 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2013--, scikit-bio development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from __future__ import (absolute_import, division, print_function,
+ unicode_literals)
+import six
+
+import pandas as pd
+
+import io
+import unittest
+from collections import OrderedDict
+
+from skbio import TabularMSA, Protein, DNA, RNA
+from skbio.io import StockholmFormatError
+from skbio.io.format.stockholm import (_stockholm_to_tabular_msa,
+ _tabular_msa_to_stockholm,
+ _stockholm_sniffer)
+from skbio.util import get_data_path
+
+
+class TestStockholmSniffer(unittest.TestCase):
+ def setUp(self):
+ self.positives = [get_data_path(e) for e in [
+ 'stockholm_extensive',
+ 'stockholm_minimal',
+ 'stockholm_rna',
+ 'stockholm_runon_gf',
+ 'stockholm_duplicate_sequence_names',
+ 'stockholm_duplicate_gr',
+ 'stockholm_duplicate_gc',
+ 'stockholm_invalid_nonexistent_gr',
+ 'stockholm_invalid_nonexistent_gs',
+ 'stockholm_no_data',
+ 'stockholm_blank_lines',
+ 'stockholm_differing_gc_data_length',
+ 'stockholm_differing_gr_data_length',
+ 'stockholm_differing_seq_lengths',
+ 'stockholm_duplicate_sequence_names',
+ 'stockholm_duplicate_tree_ids',
+ 'stockholm_extensive_mixed',
+ 'stockholm_invalid_data_type',
+ 'stockholm_malformed_gf_line',
+ 'stockholm_malformed_gs_line',
+ 'stockholm_malformed_gr_line',
+ 'stockholm_malformed_gc_line',
+ 'stockholm_malformed_data_line',
+ 'stockholm_metadata_only',
+ 'stockholm_multiple_msa',
+ 'stockholm_multiple_trees',
+ 'stockholm_runon_gs',
+ 'stockholm_single_tree_with_id',
+ 'stockholm_single_tree_without_id',
+ 'stockholm_whitespace_only_lines',
+ 'stockholm_all_data_types',
+ 'stockholm_two_of_each_metadata',
+ 'stockholm_data_only',
+ 'stockholm_nonstring_labels'
+ ]]
+
+ self.negatives = [get_data_path(e) for e in [
+ 'stockholm_missing_header',
+ 'empty',
+ 'whitespace_only'
+ ]]
+
+ def test_positives(self):
+ for fp in self.positives:
+ self.assertEqual(_stockholm_sniffer(fp), (True, {}))
+
+ def test_negatives(self):
+ for fp in self.negatives:
+ self.assertEqual(_stockholm_sniffer(fp), (False, {}))
+
+
+class TestStockholmReader(unittest.TestCase):
+ def test_stockholm_extensive(self):
+ fp = get_data_path('stockholm_extensive')
+ msa = _stockholm_to_tabular_msa(fp, constructor=Protein)
+ exp = TabularMSA([Protein('MTCRAQLIAVPRASSLAE..AIACAQKM....'
+ 'RVSRVPVYERS',
+ positional_metadata={'SA': list('9998877564'
+ '53524252..'
+ '55152525..'
+ '..36463774'
+ '777')}),
+ Protein('EVMLTDIPRLHINDPIMK..GFGMVINN....'
+ '..GFVCVENDE',
+ metadata={'OS': 'Bacillus subtilis'},
+ positional_metadata={'SS': list('CCCCCCCHHHH'
+ 'HHHHHHH..HE'
+ 'EEEEEE....E'
+ 'EEEEEE'
+ 'EEEH')}),
+ Protein('EVMLTDIPRLHINDPIMK..GFGMVINN...'
+ '...GFVCVENDE',
+ positional_metadata={'AS': list('___________'
+ '_____*_____'
+ '___________'
+ '________'
+ '__'),
+ 'IN': list('___________'
+ '_1_________'
+ '_____2_____'
+ '_____0_'
+ '___')})],
+ metadata={'ID': 'CBS', 'AC': 'PF00571',
+ 'AU': 'Bateman A', 'SQ': '67'},
+ positional_metadata={'SS_cons': list('CCCCCHHHHHHHH'
+ 'HHHHH..EEEEEE'
+ 'EE....EEEEEEE'
+ 'EEEH')},
+ index=['O83071/192-246', 'O31698/88-139',
+ 'O31699/88-139'])
+ self.assertEqual(msa, exp)
+
+ def test_stockholm_extensive_mixed(self):
+ fp = get_data_path('stockholm_extensive_mixed')
+ msa = _stockholm_to_tabular_msa(fp, constructor=Protein)
+ exp = TabularMSA([Protein('MTCRAQLIAVPRASSLAE..AIACAQKM....'
+ 'RVSRVPVYERS',
+ positional_metadata={'SA': list('9998877564'
+ '53524252..'
+ '55152525..'
+ '..36463774'
+ '777')}),
+ Protein('EVMLTDIPRLHINDPIMK..GFGMVINN....'
+ '..GFVCVENDE',
+ metadata={'OS': 'Bacillus subtilis'},
+ positional_metadata={'SS': list('CCCCCCCHHHH'
+ 'HHHHHHH..HE'
+ 'EEEEEE....E'
+ 'EEEEEE'
+ 'EEEH')}),
+ Protein('EVMLTDIPRLHINDPIMK..GFGMVINN...'
+ '...GFVCVENDE',
+ positional_metadata={'AS': list('___________'
+ '_____*_____'
+ '___________'
+ '________'
+ '__'),
+ 'IN': list('___________'
+ '_1_________'
+ '_____2_____'
+ '_____0_'
+ '___')})],
+ metadata={'ID': 'CBS', 'AC': 'PF00571',
+ 'AU': 'Bateman A', 'SQ': '67'},
+ positional_metadata={'SS_cons': list('CCCCCHHHHHHHH'
+ 'HHHHH..EEEEEE'
+ 'EE....EEEEEEE'
+ 'EEEH')},
+ index=['O83071/192-246', 'O31698/88-139',
+ 'O31699/88-139'])
+ self.assertEqual(msa, exp)
+
+ def test_stockholm_minimal(self):
+ fp = get_data_path('stockholm_minimal')
+ msa = _stockholm_to_tabular_msa(fp, constructor=DNA)
+ exp = TabularMSA([DNA('TGTGTCGCAGTTGTCGTTTG')], index=['0235244'])
+ self.assertEqual(msa, exp)
+
+ def test_stockholm_rna(self):
+ fp = get_data_path('stockholm_rna')
+ msa = _stockholm_to_tabular_msa(fp, constructor=RNA)
+ exp = TabularMSA([RNA('AAGGGUUAUUUAUAUACUUU'),
+ RNA('UGCUAAGAGUGGGGAUGAUU'),
+ RNA('GCCACAACCGAUUAGAUAGA'),
+ RNA('UUAGAAACCGAUGGACCGAA')],
+ metadata={'AC': 'G2134T23', 'ID': 'ARD'},
+ positional_metadata=(
+ {'AC_cons': list('GGGACUGGACAUCUAUUCAG')}),
+ index=['RTC2231', 'RTF2124', 'RTH3322', 'RTB1512'])
+ self.assertEqual(msa, exp)
+
+ def test_stockholm_runon_gf(self):
+ fp = get_data_path('stockholm_runon_gf')
+ msa = _stockholm_to_tabular_msa(fp, constructor=DNA)
+ exp = TabularMSA([DNA('ACTGGTTCAATG')],
+ metadata={'CC': 'CBS domains are small intracellular'
+ ' modules mostly found in 2 or four '
+ 'copies within a protein.'},
+ index=['GG1344'])
+ self.assertEqual(msa, exp)
+
+ def test_stockholm_runon_gs(self):
+ fp = get_data_path('stockholm_runon_gs')
+ msa = _stockholm_to_tabular_msa(fp, constructor=DNA)
+ exp = TabularMSA([DNA('ATCGTTCAGTG',
+ metadata={'AL': 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'})],
+ index=['seq1'])
+ self.assertEqual(msa, exp)
+
+ def test_stockholm_metadata_only(self):
+ fp = get_data_path('stockholm_metadata_only')
+ msa = _stockholm_to_tabular_msa(fp, constructor=DNA)
+ exp = TabularMSA([], metadata={'NM': 'Kestrel Gorlick',
+ 'DT': 'February 5th, 2016'})
+ self.assertEqual(msa, exp)
+
+ def test_stockholm_no_data(self):
+ fp = get_data_path('stockholm_no_data')
+ msa = _stockholm_to_tabular_msa(fp, constructor=DNA)
+ exp = TabularMSA([])
+ self.assertEqual(msa, exp)
+
+ def test_stockholm_with_blank_lines(self):
+ fp = get_data_path('stockholm_blank_lines')
+ msa = _stockholm_to_tabular_msa(fp, constructor=DNA)
+ exp = TabularMSA([], metadata={'AL': 'ABCD', 'NM': '1234'})
+ self.assertEqual(msa, exp)
+
+ def test_stockholm_with_whitespace_only_lines(self):
+ fp = get_data_path('stockholm_whitespace_only_lines')
+ msa = _stockholm_to_tabular_msa(fp, constructor=DNA)
+ exp = TabularMSA([], metadata={'AL': 'ABCD', 'NM': '1234'})
+ self.assertEqual(msa, exp)
+
+ def test_stockholm_single_tree_without_id(self):
+ fp = get_data_path('stockholm_single_tree_without_id')
+ msa = _stockholm_to_tabular_msa(fp, constructor=DNA)
+ exp = TabularMSA([], metadata={'NH': 'ABCD'})
+ self.assertEqual(msa, exp)
+
+ def test_stockholm_single_tree_with_id(self):
+ fp = get_data_path('stockholm_single_tree_with_id')
+ msa = _stockholm_to_tabular_msa(fp, constructor=DNA)
+ exp = TabularMSA([], metadata={'NH': {'tree1': 'ABCD'}})
+ self.assertEqual(msa, exp)
+
+ def test_stockholm_multiple_trees(self):
+ fp = get_data_path('stockholm_multiple_trees')
+ msa = _stockholm_to_tabular_msa(fp, constructor=DNA)
+ exp = TabularMSA([], metadata={'NH': {'tree1': 'ABCD',
+ 'tree2': 'EFGH',
+ 'tree3': 'IJKL'}})
+ self.assertEqual(msa, exp)
+
+ def test_multiple_msa_file(self):
+ fp = get_data_path('stockholm_multiple_msa')
+ msa = _stockholm_to_tabular_msa(fp, constructor=RNA)
+ exp = TabularMSA([RNA('AAGGGUUAUUUAUAUACUUU'),
+ RNA('UGCUAAGAGUGGGGAUGAUU'),
+ RNA('GCCACAACCGAUUAGAUAGA'),
+ RNA('UUAGAAACCGAUGGACCGAA')],
+ metadata={'AC': 'G2134T23', 'ID': 'ARD'},
+ positional_metadata=(
+ {'AC_cons': list('GGGACUGGACAUCUAUUCAG')}),
+ index=['RTC2231', 'RTF2124', 'RTH3322', 'RTB1512'])
+ self.assertEqual(msa, exp)
+
+ def test_stockholm_maintains_order(self):
+ fp = get_data_path('stockholm_two_of_each_metadata')
+ msa = _stockholm_to_tabular_msa(fp, constructor=DNA)
+ msa_order = list(msa.metadata.items())
+ exp_order = [('NM', 'Kestrel Gorlick'), ('DT', 'February 5th, 2016')]
+ self.assertEqual(msa_order, exp_order)
+ msa_order = list(msa[0].metadata.items())
+ exp_order = [('AL', 'ABCD'), ('NS', '1234')]
+ self.assertEqual(msa_order, exp_order)
+ msa_order = list(msa.positional_metadata.columns)
+ exp_order = ['SS_cons', 'AS_cons']
+ self.assertEqual(msa_order, exp_order)
+ msa_order = list(msa[0].positional_metadata.columns)
+ exp_order = ['SS', 'AS']
+ self.assertEqual(msa_order, exp_order)
+
+ def test_stockholm_duplicate_tree_id_error(self):
+ fp = get_data_path('stockholm_duplicate_tree_ids')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'Tree.*tree1.*in file.'):
+ _stockholm_to_tabular_msa(fp, constructor=DNA)
+
+ def test_nonexistent_gr_error(self):
+ fp = get_data_path('stockholm_invalid_nonexistent_gr')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'GS or GR.*nonexistent sequence.*RL1355.'):
+ _stockholm_to_tabular_msa(fp, constructor=RNA)
+
+ def test_nonexistent_gs_error(self):
+ fp = get_data_path('stockholm_invalid_nonexistent_gs')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'GS or GR.*nonexistent sequence.*AC14.'):
+ _stockholm_to_tabular_msa(fp, constructor=RNA)
+
+ def test_duplicate_sequence_names_error(self):
+ fp = get_data_path('stockholm_duplicate_sequence_names')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'duplicate sequence name.*ASR132.*supported'
+ ' by the reader.'):
+ _stockholm_to_tabular_msa(fp, constructor=RNA)
+
+ def test_duplicate_gr_error(self):
+ fp = get_data_path('stockholm_duplicate_gr')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'Found duplicate GR.*OS.*LFDR3.*supported '
+ 'by the reader.'):
+ _stockholm_to_tabular_msa(fp, constructor=DNA)
+
+ def test_duplicate_gc_error(self):
+ fp = get_data_path('stockholm_duplicate_gc')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'Found duplicate GC.*SS_cons.*supported '
+ 'by the reader.'):
+ _stockholm_to_tabular_msa(fp, constructor=DNA)
+
+ def test_empty_file_error(self):
+ fp = get_data_path('empty')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'File is empty.'):
+ _stockholm_to_tabular_msa(fp, constructor=RNA)
+
+ def test_missing_header_error(self):
+ fp = get_data_path('stockholm_missing_header')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'File missing.*header'):
+ _stockholm_to_tabular_msa(fp, constructor=DNA)
+
+ def test_missing_footer_error(self):
+ fp = get_data_path('stockholm_missing_footer')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'Final line.*only "//".'):
+ _stockholm_to_tabular_msa(fp, constructor=DNA)
+
+ def test_data_type_error(self):
+ fp = get_data_path('stockholm_invalid_data_type')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ "Unrecognized.*'#=GZ"):
+ _stockholm_to_tabular_msa(fp, constructor=DNA)
+
+ def test_malformed_gf_line_error(self):
+ fp = get_data_path('stockholm_malformed_gf_line')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'Line contains 2.*must contain.*3.'):
+ _stockholm_to_tabular_msa(fp, constructor=DNA)
+
+ def test_malformed_gs_line_error(self):
+ fp = get_data_path('stockholm_malformed_gs_line')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'Line contains 3.*must contain.*4.'):
+ _stockholm_to_tabular_msa(fp, constructor=DNA)
+
+ def test_malformed_gr_line_error(self):
+ fp = get_data_path('stockholm_malformed_gr_line')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'Line contains 2.*must contain.*4.'):
+ _stockholm_to_tabular_msa(fp, constructor=DNA)
+
+ def test_malformed_gc_line_error(self):
+ fp = get_data_path('stockholm_malformed_gc_line')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'Line contains 2.*must contain.*3.'):
+ _stockholm_to_tabular_msa(fp, constructor=DNA)
+
+ def test_malformed_data_line_error(self):
+ fp = get_data_path('stockholm_malformed_data_line')
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'Line contains 1.*must contain.*2.'):
+ _stockholm_to_tabular_msa(fp, constructor=DNA)
+
+ def test_differing_sequence_lengths_error(self):
+ fp = get_data_path('stockholm_differing_seq_lengths')
+ with six.assertRaisesRegex(self, ValueError,
+ 'Each sequence.*11 != 10'):
+ _stockholm_to_tabular_msa(fp, constructor=RNA)
+
+ def test_differing_data_lengths_gr_error(self):
+ fp = get_data_path('stockholm_differing_gr_data_length')
+ with six.assertRaisesRegex(self, ValueError,
+ 'Number.*7.*(8).'):
+ _stockholm_to_tabular_msa(fp, constructor=RNA)
+
+ def test_differing_data_lengths_gc_error(self):
+ fp = get_data_path('stockholm_differing_gc_data_length')
+ with six.assertRaisesRegex(self, ValueError,
+ 'Number.*12.*(10).'):
+ _stockholm_to_tabular_msa(fp, constructor=RNA)
+
+ def test_no_constructor_error(self):
+ fp = get_data_path('empty')
+ with six.assertRaisesRegex(self, ValueError, 'Must.*parameter.'):
+ _stockholm_to_tabular_msa(fp)
+
+ def test_unsupported_constructor_error(self):
+ fp = get_data_path('empty')
+ with six.assertRaisesRegex(self, TypeError,
+ '`constructor`.*`GrammaredSequence`'):
+ _stockholm_to_tabular_msa(fp, constructor=TabularMSA)
+
+ def test_handles_missing_metadata_efficiently(self):
+ msa = _stockholm_to_tabular_msa(get_data_path('stockholm_minimal'),
+ constructor=DNA)
+ self.assertIsNone(msa._metadata)
+ self.assertIsNone(msa._positional_metadata)
+ self.assertIsNone(msa[0]._metadata)
+ self.assertIsNone(msa[0]._positional_metadata)
+
+
+class TestStockholmWriter(unittest.TestCase):
+ def test_msa_to_stockholm_extensive(self):
+ fp = get_data_path('stockholm_all_data_types')
+ msa = TabularMSA([DNA('GAGGCCATGCCCAGGTGAAG',
+ metadata=OrderedDict([('DT', 'February 1, 2016'),
+ ('NM', 'Unknown')])),
+ DNA('ACCTGAGCCACAGTAGAAGT'),
+ DNA('CCCTTCGCTGGAAATGTATG',
+ metadata={'DT': 'Unknown'},
+ positional_metadata=OrderedDict([('AS',
+ list('CCGAAAGT'
+ 'CGTTCGA'
+ 'AAATG')),
+ ('SS',
+ list('GGCGAGTC'
+ 'GTTCGAGC'
+ 'TGG'
+ 'C'))]))],
+ metadata=OrderedDict([('NM', 'Kestrel Gorlick'),
+ ('DT', 'February 11, 2016'),
+ ('FN', 'Writer test file')]),
+ positional_metadata=OrderedDict([('AS_cons',
+ list('CGTTCGTTCTAAC'
+ 'AATTCCA')),
+ ('SS_cons',
+ list('GGCGCTACGACCT'
+ 'ACGACCG'))]),
+ index=['seq1', 'seq2', 'seq3'])
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_msa_to_stockholm_minimal(self):
+ fp = get_data_path('stockholm_minimal')
+ msa = TabularMSA([DNA('TGTGTCGCAGTTGTCGTTTG')], index=['0235244'])
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_msa_to_stockholm_single_tree(self):
+ fp = get_data_path('stockholm_single_tree_without_id')
+ msa = TabularMSA([], metadata=OrderedDict([('NH', 'ABCD')]))
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_msa_to_stockholm_single_tree_as_dict(self):
+ fp = get_data_path('stockholm_single_tree_with_id')
+ msa = TabularMSA([], metadata={'NH': {'tree1': 'ABCD'}})
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_msa_to_stockholm_multiple_trees(self):
+ fp = get_data_path('stockholm_multiple_trees')
+ msa = TabularMSA([], metadata=OrderedDict([('NH',
+ OrderedDict([('tree1',
+ 'ABCD'),
+ ('tree2',
+ 'EFGH'),
+ ('tree3',
+ 'IJKL')]))]))
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_msa_to_stockholm_data_only(self):
+ fp = get_data_path('stockholm_data_only')
+ msa = TabularMSA([RNA('ACUCCGACAUGCUCC'),
+ RNA('UAGUGCCGAACGCUG'),
+ RNA('GUGUGGGCGUGAUUC')],
+ index=['seq1', 'seq2', 'seq3'])
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_msa_to_stockholm_nonstring_values(self):
+ fp = get_data_path('stockholm_nonstring_labels')
+ msa = TabularMSA([DNA('ACTG', metadata=OrderedDict([(8, 123)]),
+ positional_metadata=OrderedDict([(1.0,
+ [1, 2, 3, 4])])
+ )],
+ metadata=OrderedDict([(1.3, 2857)]),
+ positional_metadata=OrderedDict([(25, [4, 3, 2, 1])]),
+ index=[11214])
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_msa_to_stockholm_empty(self):
+ fp = get_data_path('stockholm_no_data')
+ msa = TabularMSA([])
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_round_trip_extensive(self):
+ fp = get_data_path('stockholm_extensive')
+ msa = _stockholm_to_tabular_msa(fp, constructor=Protein)
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_round_trip_minimal(self):
+ fp = get_data_path('stockholm_minimal')
+ msa = _stockholm_to_tabular_msa(fp, constructor=DNA)
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_round_trip_single_tree(self):
+ fp = get_data_path('stockholm_single_tree_without_id')
+ msa = _stockholm_to_tabular_msa(fp, constructor=Protein)
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_round_trip_multiple_trees(self):
+ fp = get_data_path('stockholm_multiple_trees')
+ msa = _stockholm_to_tabular_msa(fp, constructor=Protein)
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_round_trip_data_only(self):
+ fp = get_data_path('stockholm_data_only')
+ msa = _stockholm_to_tabular_msa(fp, constructor=RNA)
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_round_trip_nonstring_index_values(self):
+ fp = get_data_path('stockholm_nonstring_labels')
+ msa = _stockholm_to_tabular_msa(fp, constructor=DNA)
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_round_trip_empty(self):
+ fp = get_data_path('stockholm_no_data')
+ msa = _stockholm_to_tabular_msa(fp, constructor=Protein)
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ obs = fh.getvalue()
+ fh.close()
+ with io.open(fp) as fh:
+ exp = fh.read()
+ self.assertEqual(obs, exp)
+
+ def test_handles_missing_metadata_efficiently(self):
+ msa = TabularMSA([DNA('ACTG'), DNA('GTCA')], index=['seq1', 'seq2'])
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+ self.assertIsNone(msa._metadata)
+ self.assertIsNone(msa._positional_metadata)
+ self.assertIsNone(msa[0]._metadata)
+ self.assertIsNone(msa[0]._positional_metadata)
+
+ def test_unoriginal_index_error(self):
+ msa = TabularMSA([DNA('ATCGCCAGCT'), DNA('TTGTGCTGGC')],
+ index=['seq1', 'seq1'])
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'index labels must be unique.'):
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+
+ def test_unoriginal_gr_feature_names_error(self):
+ pos_metadata_dataframe = pd.DataFrame.from_items([('AC',
+ list('GAGCAAGCCACTA'
+ 'GA')),
+ ('SS',
+ list('TCCTTGAACTACC'
+ 'CG')),
+ ('AS',
+ list('TCAGCTCTGCAGC'
+ 'GT')),
+ ('SS',
+ list('GTCAGGCGCTCGG'
+ 'TG'))])
+ msa = TabularMSA([DNA('CGTCAATCTCGAACT',
+ positional_metadata=pos_metadata_dataframe)],
+ index=['seq1'])
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'Sequence-specific positional metadata.*'
+ 'must be unique. Found 1 duplicate'):
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+
+ def test_unoriginal_gc_feature_names_error(self):
+ pos_metadata_dataframe = pd.DataFrame.from_items([('AC',
+ list('GAGCAAGCCACTA'
+ 'GA')),
+ ('SS',
+ list('TCCTTGAACTACC'
+ 'CG')),
+ ('SS',
+ list('TCAGCTCTGCAGC'
+ 'GT')),
+ ('AC',
+ list('GTCAGGCGCTCGG'
+ 'TG'))])
+ msa = TabularMSA([DNA('CCCCTGCTTTCGTAG')],
+ positional_metadata=pos_metadata_dataframe)
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'Multiple sequence alignment positional '
+ 'metadata.*must be unique. Found 2 '
+ 'duplicate'):
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+
+ def test_gr_wrong_dataframe_item_length_error(self):
+ seq1 = list('GAGCAAGCCACTAGA')
+ seq1.append('GG')
+ pos_metadata_dataframe = pd.DataFrame({'AC': seq1,
+ 'SS': list('TCCTTGAACTACCCGA'),
+ 'AS': list('TCAGCTCTGCAGCGTT')})
+ msa = TabularMSA([DNA('TCCTTGAACTACCCGA',
+ positional_metadata=pos_metadata_dataframe)])
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'Sequence-specific positional metadata.*'
+ 'must contain a single character.*Found '
+ 'value\(s\) in column AC'):
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+
+ def test_gc_wrong_dataframe_item_length_error(self):
+ seq1 = list('GAGCAAGCCACTAGA')
+ seq1.append('GG')
+ pos_metadata_dataframe = pd.DataFrame({'AC': seq1,
+ 'SS': list('TCCTTGAACTACCCGA'),
+ 'AS': list('TCAGCTCTGCAGCGTT')})
+ msa = TabularMSA([DNA('TCCTTGAACTACCCGA')],
+ positional_metadata=pos_metadata_dataframe)
+ with six.assertRaisesRegex(self, StockholmFormatError,
+ 'Multiple sequence alignment positional '
+ 'metadata.*must contain a single character'
+ '.*Found value\(s\) in column AC'):
+ fh = io.StringIO()
+ _tabular_msa_to_stockholm(msa, fh)
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/skbio/io/registry.py b/skbio/io/registry.py
index 1bf7737..4578c38 100644
--- a/skbio/io/registry.py
+++ b/skbio/io/registry.py
@@ -371,7 +371,8 @@ class IORegistry(object):
file : openable (filepath, URL, filehandle, etc.)
The file to sniff. Something that is understood by `skbio.io.open`.
kwargs : dict, optional
- Keyword arguments will be passed to `skbio.io.open`.
+ Keyword arguments will be passed to `skbio.io.open`. `newline`
+ cannot be provided.
Returns
-------
@@ -385,8 +386,14 @@ class IORegistry(object):
This occurs when the format is not 'claimed' by any registered
sniffer or when the format is ambiguous and has been 'claimed' by
more than one sniffer.
+ TypeError
+ If `newline` is provided in `kwargs`.
"""
+ if 'newline' in kwargs:
+ raise TypeError(
+ "Cannot provide `newline` keyword argument when sniffing.")
+
# By resolving the input here, we have the oppurtunity to reuse the
# file (which is potentially ephemeral). Each sniffer will also resolve
# the file, but that call will short-circuit and won't claim
@@ -457,7 +464,8 @@ class IORegistry(object):
When True, will double check the `format` if provided.
kwargs : dict, optional
Keyword arguments will be passed to their respective handlers
- (`skbio.io.open` and the reader for `format`)
+ (`skbio.io.open` and the reader for `format`). `newline` cannot be
+ provided.
Returns
-------
@@ -468,6 +476,8 @@ class IORegistry(object):
------
ValueError
Raised when `format` and `into` are both None.
+ TypeError
+ If `newline` is provided in `kwargs`.
UnrecognizedFormatError
Raised when a reader could not be found for a given `format` or the
format could not be guessed.
@@ -479,6 +489,10 @@ class IORegistry(object):
overriding the suggestion provided by the sniffer of `format`.
"""
+ if 'newline' in kwargs:
+ raise TypeError(
+ "Cannot provide `newline` keyword argument when reading.")
+
# Context managers do not compose well with generators. We have to
# duplicate the logic so that the file will stay open while yielding.
# Otherwise the context exits as soon as the generator is returned
@@ -512,9 +526,8 @@ class IORegistry(object):
with _resolve_file(file, **io_kwargs) as (file, _, _):
reader, kwargs = self._init_reader(file, fmt, into, verify, kwargs,
io_kwargs)
- generator = reader(file, **kwargs)
- while True:
- yield next(generator)
+ for item in reader(file, **kwargs):
+ yield item
def _find_io_kwargs(self, kwargs):
return {k: kwargs[k] for k in _open_kwargs if k in kwargs}
@@ -977,9 +990,8 @@ class Format(object):
file_params, file, encoding, newline, kwargs)
with open_files(files, mode='r', **io_kwargs) as fhs:
kwargs.update(zip(file_keys, fhs[:-1]))
- generator = reader_function(fhs[-1], **kwargs)
- while True:
- yield next(generator)
+ for item in reader_function(fhs[-1], **kwargs):
+ yield item
self._add_reader(cls, wrapped_reader, monkey_patch, override)
return wrapped_reader
diff --git a/skbio/io/tests/test_registry.py b/skbio/io/tests/test_registry.py
index da0f9b3..20291ea 100644
--- a/skbio/io/tests/test_registry.py
+++ b/skbio/io/tests/test_registry.py
@@ -7,6 +7,8 @@
# ----------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
+
+import six
from six.moves import zip_longest
from io import StringIO
@@ -590,18 +592,17 @@ class TestSniff(RegistryTest):
fmt, _ = self.registry.sniff(fp, encoding='big5')
self.assertEqual(fmt, 'formatx')
- def test_that_newline_is_used(self):
+ def test_passing_newline_raises_error(self):
formatx = self.registry.create_format('formatx')
fp = get_data_path('real_file')
@formatx.sniffer()
def sniffer(fh):
- self.assertEqual(fh.readlines(), ['a\nb\nc\nd\ne\n'])
return True, {}
- fmt, _ = self.registry.sniff(fp, newline='\r')
- self.assertEqual(fmt, 'formatx')
+ with six.assertRaisesRegex(self, TypeError, '`newline`'):
+ self.registry.sniff(fp, newline='\r')
def test_non_default_encoding(self):
big5_format = self.registry.create_format('big5_format',
@@ -627,17 +628,12 @@ class TestSniff(RegistryTest):
@formatx.sniffer()
def sniffer(fh):
- self.assertEqual(fh.readlines(), self._expected_lines)
+ self.assertEqual(fh.readlines(), ['a\nb\nc\nd\ne\n'])
return True, {}
- self._expected_lines = ['a\nb\nc\nd\ne\n']
fmt, _ = self.registry.sniff(fp)
self.assertEqual(fmt, 'formatx')
- self._expected_lines = ['a\n', 'b\n', 'c\n', 'd\n', 'e\n']
- fmt, _ = self.registry.sniff(fp, newline=None)
- self.assertEqual(fmt, 'formatx')
-
def test_position_not_mutated_real_file(self):
formatx = self.registry.create_format('formatx')
@@ -1142,7 +1138,7 @@ class TestRead(RegistryTest):
with self.assertRaises(UnicodeDecodeError):
self.registry.read(fp, format='format1', encoding='utf8')
- def test_that_newline_is_used(self):
+ def test_passing_newline_raises_error(self):
formatx = self.registry.create_format('formatx')
fp = get_data_path('real_file')
@@ -1159,12 +1155,11 @@ class TestRead(RegistryTest):
def reader_gen(fh):
yield TestClass(fh.readlines())
- instance = self.registry.read(fp, into=TestClass, newline='\r')
- self.assertEqual(instance, TestClass(['a\nb\nc\nd\ne\n']))
+ with six.assertRaisesRegex(self, TypeError, '`newline`'):
+ self.registry.read(fp, into=TestClass, newline='\r')
- gen = self.registry.read(fp, format='formatx', newline='\r')
- self.assertEqual(next(gen), TestClass(['a\nb\nc\nd\ne\n']))
- gen.close()
+ with six.assertRaisesRegex(self, TypeError, '`newline`'):
+ self.registry.read(fp, format='formatx', newline='\r')
def test_non_default_newline(self):
formatx = self.registry.create_format('formatx', newline='\r')
@@ -1190,15 +1185,6 @@ class TestRead(RegistryTest):
self.assertEqual(next(gen), TestClass(['a\nb\nc\nd\ne\n']))
gen.close()
- instance = self.registry.read(fp, into=TestClass, newline=None)
- self.assertEqual(instance, TestClass(['a\n', 'b\n', 'c\n', 'd\n',
- 'e\n']))
-
- gen = self.registry.read(fp, format='formatx', newline=None)
- self.assertEqual(next(gen), TestClass(['a\n', 'b\n', 'c\n', 'd\n',
- 'e\n']))
- gen.close()
-
def test_file_sentinel_many(self):
format1 = self.registry.create_format('format1')
diff --git a/skbio/io/tests/test_util.py b/skbio/io/tests/test_util.py
index 88b4480..70c2d06 100644
--- a/skbio/io/tests/test_util.py
+++ b/skbio/io/tests/test_util.py
@@ -15,7 +15,11 @@ import shutil
import io
import os.path
-import httpretty
+try:
+ import httpretty
+ has_httpretty = True
+except ImportError:
+ has_httpretty = False
import skbio.io
from skbio.io.registry import open_file
@@ -320,6 +324,13 @@ class WritableBinarySourceTests(object):
self.assertTrue(result.closed)
self.check_closed(file, True)
+ def compare_gzip_file_contents(self, a, b):
+ # The first 10 bytes of a gzip header include a timestamp. The header
+ # can be followed by other "volatile" metadata, so only compare gzip
+ # footers (last 8 bytes) which contain a CRC-32 checksum and the length
+ # of the uncompressed data.
+ self.assertEqual(a[-8:], b[-8:])
+
def test_open_binary(self):
self.check_open_state_contents(self.binary_file, self.binary_contents,
True, encoding='binary',
@@ -332,9 +343,8 @@ class WritableBinarySourceTests(object):
self.check_open_state_contents(self.gzip_file, self.text_contents,
False, compression='gzip')
- # The first 10 bytes of a gzip header include a timestamp, so skip.
- self.assertEqual(self.get_contents(self.gzip_file)[10:],
- self.gzip_contents[10:])
+ self.compare_gzip_file_contents(self.get_contents(self.gzip_file),
+ self.gzip_contents)
def test_open_bz2(self):
self.check_open_state_contents(self.bz2_file, self.text_contents,
@@ -355,9 +365,9 @@ class WritableBinarySourceTests(object):
self.decoded_contents, False,
compression='gzip', encoding='big5')
- # The first 10 bytes of a gzip header include a timestamp, so skip.
- self.assertEqual(self.get_contents(self.gzip_encoded_file)[10:],
- self.gzip_encoded_contents[10:])
+ self.compare_gzip_file_contents(
+ self.get_contents(self.gzip_encoded_file),
+ self.gzip_encoded_contents)
def test_open_bz2_encoding(self):
self.check_open_state_contents(self.bz2_encoded_file,
@@ -434,6 +444,7 @@ class TestWriteFilepath(WritableBinarySourceTests, WritableSourceTest):
return f.read()
+ at unittest.skipIf(not has_httpretty, "HTTPretty not available to mock tests.")
class TestReadURL(ReadableBinarySourceTests, ReadableSourceTest):
expected_close = True
@@ -482,18 +493,17 @@ class TestWriteBytesIO(WritableBinarySourceTests, WritableSourceTest):
self.check_open_state_contents(self.gzip_file, self.text_contents,
False, compression='gzip')
- # The first 10 bytes of a gzip header include a timestamp, so skip.
- self.assertEqual(self.get_contents(self.gzip_file)[10:],
- self.gzip_contents[23:])
+ self.compare_gzip_file_contents(self.get_contents(self.gzip_file),
+ self.gzip_contents)
def test_open_gzip_encoding(self):
self.check_open_state_contents(self.gzip_encoded_file,
self.decoded_contents, False,
compression='gzip', encoding='big5')
- # The first 10 bytes of a gzip header include a timestamp, so skip.
- self.assertEqual(self.get_contents(self.gzip_encoded_file)[10:],
- self.gzip_encoded_contents[20:])
+ self.compare_gzip_file_contents(
+ self.get_contents(self.gzip_encoded_file),
+ self.gzip_encoded_contents)
class TestReadBufferedReader(ReadableBinarySourceTests, ReadableSourceTest):
diff --git a/skbio/sequence/__init__.py b/skbio/sequence/__init__.py
index 08ee93a..85296fa 100644
--- a/skbio/sequence/__init__.py
+++ b/skbio/sequence/__init__.py
@@ -28,11 +28,20 @@ Classes
:toctree: generated/
Sequence
+ GrammaredSequence
DNA
RNA
Protein
GeneticCode
+Subpackages
+-----------
+
+.. autosummary::
+ :toctree: generated/
+
+ distance
+
Examples
--------
New sequences are created with optional metadata and positional metadata.
@@ -304,7 +313,9 @@ from ._protein import Protein
from ._dna import DNA
from ._rna import RNA
from ._genetic_code import GeneticCode
+from ._grammared_sequence import GrammaredSequence
-__all__ = ['Sequence', 'Protein', 'DNA', 'RNA', 'GeneticCode']
+__all__ = ['Sequence', 'Protein', 'DNA', 'RNA', 'GeneticCode',
+ 'GrammaredSequence']
test = TestRunner(__file__).test
diff --git a/skbio/sequence/_dna.py b/skbio/sequence/_dna.py
index cf788f4..686d5d1 100644
--- a/skbio/sequence/_dna.py
+++ b/skbio/sequence/_dna.py
@@ -7,15 +7,17 @@
# ----------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
+from six import add_metaclass
import skbio
from skbio.util._decorator import classproperty, overrides
from skbio.util._decorator import stable
from ._nucleotide_mixin import NucleotideMixin, _motifs as _parent_motifs
-from ._iupac_sequence import IUPACSequence
+from ._grammared_sequence import GrammaredSequence, DisableSubclassingMeta
-class DNA(IUPACSequence, NucleotideMixin):
+ at add_metaclass(DisableSubclassingMeta)
+class DNA(GrammaredSequence, NucleotideMixin):
"""Store DNA sequence data and optional associated metadata.
Only characters in the IUPAC DNA character set [1]_ are supported.
@@ -64,6 +66,14 @@ class DNA(IUPACSequence, NucleotideMixin):
See Also
--------
RNA
+ GrammaredSequence
+
+ Notes
+ -----
+ Subclassing is disabled for DNA, because subclassing makes
+ it possible to change the alphabet, and certain methods rely on the
+ IUPAC alphabet. If a custom sequence alphabet is needed, inherit directly
+ from ``GrammaredSequence``.
References
----------
@@ -104,7 +114,6 @@ class DNA(IUPACSequence, NucleotideMixin):
"""
@classproperty
- @stable(as_of="0.4.0")
@overrides(NucleotideMixin)
def complement_map(cls):
comp_map = {
@@ -117,14 +126,12 @@ class DNA(IUPACSequence, NucleotideMixin):
return comp_map
@classproperty
- @stable(as_of="0.4.0")
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def nondegenerate_chars(cls):
return set("ACGT")
@classproperty
- @stable(as_of="0.4.0")
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def degenerate_map(cls):
return {
"R": set("AG"), "Y": set("CT"), "M": set("AC"), "K": set("TG"),
@@ -132,6 +139,16 @@ class DNA(IUPACSequence, NucleotideMixin):
"H": set("ACT"), "V": set("ACG"), "N": set("ACGT")
}
+ @classproperty
+ @overrides(GrammaredSequence)
+ def default_gap_char(cls):
+ return '-'
+
+ @classproperty
+ @overrides(GrammaredSequence)
+ def gap_chars(cls):
+ return set('-.')
+
@property
def _motifs(self):
return _motifs
@@ -399,7 +416,7 @@ class DNA(IUPACSequence, NucleotideMixin):
"""
return self.transcribe().translate_six_frames(*args, **kwargs)
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def _repr_stats(self):
"""Define custom statistics to display in the sequence's repr."""
stats = super(DNA, self)._repr_stats()
diff --git a/skbio/sequence/_iupac_sequence.py b/skbio/sequence/_grammared_sequence.py
similarity index 71%
rename from skbio/sequence/_iupac_sequence.py
rename to skbio/sequence/_grammared_sequence.py
index 3de92db..8a658f1 100644
--- a/skbio/sequence/_iupac_sequence.py
+++ b/skbio/sequence/_grammared_sequence.py
@@ -7,14 +7,14 @@
# ----------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
-from future.utils import with_metaclass
from abc import ABCMeta, abstractproperty
from itertools import product
+import re
import numpy as np
+from six import add_metaclass
-import re
from skbio.util._decorator import (classproperty, overrides, stable,
experimental)
@@ -22,11 +22,90 @@ from skbio.util._misc import MiniRegistry
from ._sequence import Sequence
-class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
- """Store biological sequence data conforming to the IUPAC character set.
+class GrammaredSequenceMeta(ABCMeta, type):
+ def __new__(mcs, name, bases, dct):
+ cls = super(GrammaredSequenceMeta, mcs).__new__(mcs, name, bases, dct)
+
+ concrete_gap_chars = \
+ type(cls.gap_chars) is not abstractproperty
+ concrete_degenerate_map = \
+ type(cls.degenerate_map) is not abstractproperty
+ concrete_nondegenerate_chars = \
+ type(cls.nondegenerate_chars) is not abstractproperty
+ concrete_default_gap_char = \
+ type(cls.default_gap_char) is not abstractproperty
+ # degenerate_chars is not abstract but it depends on degenerate_map
+ # which is abstract.
+ concrete_degenerate_chars = concrete_degenerate_map
+
+ # Only perform metaclass checks if none of the attributes on the class
+ # are abstract.
+ # TODO: Rather than hard-coding a list of attributes to check, we can
+ # probably check all the attributes on the class and make sure none of
+ # them are abstract.
+ if (concrete_gap_chars and concrete_degenerate_map and
+ concrete_nondegenerate_chars and concrete_default_gap_char and
+ concrete_degenerate_chars):
+
+ if cls.default_gap_char not in cls.gap_chars:
+ raise TypeError(
+ "default_gap_char must be in gap_chars for class %s" %
+ name)
+
+ if len(cls.gap_chars & cls.degenerate_chars) > 0:
+ raise TypeError(
+ "gap_chars and degenerate_chars must not share any "
+ "characters for class %s" % name)
+
+ for key in cls.degenerate_map.keys():
+ for nondegenerate in cls.degenerate_map[key]:
+ if nondegenerate not in cls.nondegenerate_chars:
+ raise TypeError(
+ "degenerate_map must expand only to "
+ "characters included in nondegenerate_chars "
+ "for class %s" % name)
+
+ if len(cls.degenerate_chars & cls.nondegenerate_chars) > 0:
+ raise TypeError(
+ "degenerate_chars and nondegenerate_chars must not "
+ "share any characters for class %s" % name)
+
+ if len(cls.gap_chars & cls.nondegenerate_chars) > 0:
+ raise TypeError(
+ "gap_chars and nondegenerate_chars must not share any "
+ "characters for class %s" % name)
+
+ return cls
+
+
+# Adapted from http://stackoverflow.com/a/16056691/943814
+# Note that inheriting from GrammaredSequenceMeta, rather than something
+# more general, is intentional. Multiple inheritance with metaclasses can be
+# tricky and is not handled automatically in Python. Since this class needs to
+# inherit both from ABCMeta and GrammaredSequenceMeta, the only way we could
+# find to make this work was to have GrammaredSequenceMeta inherit from ABCMeta
+# and then inherit from GrammaredSequenceMeta here.
+class DisableSubclassingMeta(GrammaredSequenceMeta):
+ def __new__(mcs, name, bases, dct):
+ for b in bases:
+ if isinstance(b, DisableSubclassingMeta):
+ raise TypeError("Subclassing disabled for class %s. To create"
+ " a custom sequence class, inherit directly"
+ " from skbio.sequence.%s" %
+ (b.__name__, GrammaredSequence.__name__))
+ return super(DisableSubclassingMeta, mcs).__new__(mcs, name, bases,
+ dict(dct))
+
+
+ at add_metaclass(GrammaredSequenceMeta)
+class GrammaredSequence(Sequence):
+ """Store sequence data conforming to a character set.
This is an abstract base class (ABC) that cannot be instantiated.
+ This class is intended to be inherited from to create grammared sequences
+ with custom alphabets.
+
Attributes
----------
values
@@ -42,7 +121,7 @@ class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
Raises
------
ValueError
- If sequence characters are not in the IUPAC character set [1]_.
+ If sequence characters are not in the character set [1]_.
See Also
--------
@@ -57,6 +136,55 @@ class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
Nucleic Acids Res. May 10, 1985; 13(9): 3021-3030.
A Cornish-Bowden
+ Examples
+ --------
+
+ Note in the example below that properties either need to be static or
+ use skbio's `classproperty` decorator.
+
+ >>> from skbio.sequence import GrammaredSequence
+ >>> from skbio.util import classproperty
+ >>> class CustomSequence(GrammaredSequence):
+ ... @classproperty
+ ... def degenerate_map(cls):
+ ... return {"X": set("AB")}
+ ...
+ ... @classproperty
+ ... def nondegenerate_chars(cls):
+ ... return set("ABC")
+ ...
+ ... @classproperty
+ ... def default_gap_char(cls):
+ ... return '-'
+ ...
+ ... @classproperty
+ ... def gap_chars(cls):
+ ... return set('-.')
+
+ >>> seq = CustomSequence('ABABACAC')
+ >>> seq
+ CustomSequence
+ -----------------------------
+ Stats:
+ length: 8
+ has gaps: False
+ has degenerates: False
+ has non-degenerates: True
+ -----------------------------
+ 0 ABABACAC
+
+ >>> seq = CustomSequence('XXXXXX')
+ >>> seq
+ CustomSequence
+ ------------------------------
+ Stats:
+ length: 6
+ has gaps: False
+ has degenerates: True
+ has non-degenerates: False
+ ------------------------------
+ 0 XXXXXX
+
"""
__validation_mask = None
__degenerate_codes = None
@@ -97,18 +225,19 @@ class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
@classproperty
@stable(as_of='0.4.0')
def alphabet(cls):
- """Return valid IUPAC characters.
+ """Return valid characters.
This includes gap, non-degenerate, and degenerate characters.
Returns
-------
set
- Valid IUPAC characters.
+ Valid characters.
"""
return cls.degenerate_chars | cls.nondegenerate_chars | cls.gap_chars
+ @abstractproperty
@classproperty
@stable(as_of='0.4.0')
def gap_chars(cls):
@@ -120,8 +249,9 @@ class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
Characters defined as gaps.
"""
- return set('-.')
+ pass # pragma: no cover
+ @abstractproperty
@classproperty
@experimental(as_of='0.4.1')
def default_gap_char(cls):
@@ -137,17 +267,17 @@ class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
Default gap character.
"""
- return '-'
+ pass # pragma: no cover
@classproperty
@stable(as_of='0.4.0')
def degenerate_chars(cls):
- """Return degenerate IUPAC characters.
+ """Return degenerate characters.
Returns
-------
set
- Degenerate IUPAC characters.
+ Degenerate characters.
"""
return set(cls.degenerate_map)
@@ -156,15 +286,15 @@ class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
@classproperty
@stable(as_of='0.4.0')
def nondegenerate_chars(cls):
- """Return non-degenerate IUPAC characters.
+ """Return non-degenerate characters.
Returns
-------
set
- Non-degenerate IUPAC characters.
+ Non-degenerate characters.
"""
- return set() # pragma: no cover
+ pass # pragma: no cover
@abstractproperty
@classproperty
@@ -175,11 +305,11 @@ class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
Returns
-------
dict (set)
- Mapping of each degenerate IUPAC character to the set of
- non-degenerate IUPAC characters it represents.
+ Mapping of each degenerate character to the set of
+ non-degenerate characters it represents.
"""
- return set() # pragma: no cover
+ pass # pragma: no cover
@property
def _motifs(self):
@@ -188,7 +318,7 @@ class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
@overrides(Sequence)
def __init__(self, sequence, metadata=None, positional_metadata=None,
lowercase=False, validate=True):
- super(IUPACSequence, self).__init__(
+ super(GrammaredSequence, self).__init__(
sequence, metadata, positional_metadata, lowercase)
if validate:
@@ -210,14 +340,13 @@ class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
invalid_characters > 0)[0].astype(np.uint8).view('|S1'))
raise ValueError(
"Invalid character%s in sequence: %r. \n"
- "Lowercase letters are not used in IUPAC notation. You can "
- "pass `lowercase=True` if your sequence contains lowercase "
- "letters.\n"
- "Valid IUPAC characters: "
- "%r" % ('s' if len(bad) > 1 else '',
- [str(b.tostring().decode("ascii")) for b in bad] if
- len(bad) > 1 else bad[0],
- list(self.alphabet)))
+ "Valid characters: %r\n"
+ "Note: Use `lowercase` if your sequence contains lowercase "
+ "characters not in the sequence's alphabet."
+ % ('s' if len(bad) > 1 else '',
+ [str(b.tostring().decode("ascii")) for b in bad] if
+ len(bad) > 1 else bad[0],
+ list(self.alphabet)))
@stable(as_of='0.4.0')
def gaps(self):
@@ -387,7 +516,7 @@ class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
Returns
-------
- IUPACSequence
+ GrammaredSequence
A new sequence with all gap characters removed.
See Also
@@ -429,7 +558,7 @@ class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
Yields
------
- IUPACSequence
+ GrammaredSequence
Non-degenerate version of the sequence.
See Also
@@ -590,7 +719,7 @@ class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
@overrides(Sequence)
def _repr_stats(self):
"""Define custom statistics to display in the sequence's repr."""
- stats = super(IUPACSequence, self)._repr_stats()
+ stats = super(GrammaredSequence, self)._repr_stats()
stats.append(('has gaps', '%r' % self.has_gaps()))
stats.append(('has degenerates', '%r' % self.has_degenerates()))
stats.append(('has non-degenerates', '%r' % self.has_nondegenerates()))
@@ -600,4 +729,4 @@ class IUPACSequence(with_metaclass(ABCMeta, Sequence)):
_motifs = MiniRegistry()
# Leave this at the bottom
-_motifs.interpolate(IUPACSequence, "find_motifs")
+_motifs.interpolate(GrammaredSequence, "find_motifs")
diff --git a/skbio/sequence/_nucleotide_mixin.py b/skbio/sequence/_nucleotide_mixin.py
index 9b5c75c..771cf2d 100644
--- a/skbio/sequence/_nucleotide_mixin.py
+++ b/skbio/sequence/_nucleotide_mixin.py
@@ -14,7 +14,7 @@ from abc import ABCMeta, abstractproperty
import numpy as np
from skbio.util._decorator import classproperty, stable
-from ._iupac_sequence import _motifs as parent_motifs
+from ._grammared_sequence import _motifs as parent_motifs
class NucleotideMixin(with_metaclass(ABCMeta, object)):
diff --git a/skbio/sequence/_protein.py b/skbio/sequence/_protein.py
index 14b3e4e..4639a60 100644
--- a/skbio/sequence/_protein.py
+++ b/skbio/sequence/_protein.py
@@ -9,13 +9,16 @@
from __future__ import absolute_import, division, print_function
import numpy as np
+from six import add_metaclass
from skbio.util._decorator import classproperty, overrides
from skbio.util._decorator import stable
-from ._iupac_sequence import IUPACSequence, _motifs as parent_motifs
+from ._grammared_sequence import (GrammaredSequence, DisableSubclassingMeta,
+ _motifs as parent_motifs)
-class Protein(IUPACSequence):
+ at add_metaclass(DisableSubclassingMeta)
+class Protein(GrammaredSequence):
"""Store protein sequence data and optional associated metadata.
Only characters in the IUPAC protein character set [1]_ are supported.
@@ -61,6 +64,17 @@ class Protein(IUPACSequence):
degenerate_chars
degenerate_map
+ See Also
+ --------
+ GrammaredSequence
+
+ Notes
+ -----
+ Subclassing is disabled for Protein, because subclassing makes
+ it possible to change the alphabet, and certain methods rely on the
+ IUPAC alphabet. If a custom sequence alphabet is needed, inherit directly
+ from ``GrammaredSequence``.
+
References
----------
.. [1] Nomenclature for incompletely specified bases in nucleic acid
@@ -108,20 +122,17 @@ class Protein(IUPACSequence):
return cls.__stop_codes
@classproperty
- @stable(as_of="0.4.0")
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def alphabet(cls):
return super(Protein, cls).alphabet | cls.stop_chars
@classproperty
- @stable(as_of="0.4.0")
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def nondegenerate_chars(cls):
return set("ACDEFGHIKLMNPQRSTVWY")
@classproperty
- @stable(as_of="0.4.0")
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def degenerate_map(cls):
return {
"B": set("DN"), "Z": set("EQ"),
@@ -141,6 +152,16 @@ class Protein(IUPACSequence):
"""
return set('*')
+ @classproperty
+ @overrides(GrammaredSequence)
+ def gap_chars(cls):
+ return set('-.')
+
+ @classproperty
+ @overrides(GrammaredSequence)
+ def default_gap_char(cls):
+ return '-'
+
@property
def _motifs(self):
return _motifs
@@ -195,7 +216,7 @@ class Protein(IUPACSequence):
"""
return bool(self.stops().any())
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def _repr_stats(self):
"""Define custom statistics to display in the sequence's repr."""
stats = super(Protein, self)._repr_stats()
diff --git a/skbio/sequence/_rna.py b/skbio/sequence/_rna.py
index 7e81da5..3548a93 100644
--- a/skbio/sequence/_rna.py
+++ b/skbio/sequence/_rna.py
@@ -7,15 +7,17 @@
# ----------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
+from six import add_metaclass
import skbio
from skbio.util._decorator import classproperty, overrides
from skbio.util._decorator import stable
from ._nucleotide_mixin import NucleotideMixin, _motifs as _parent_motifs
-from ._iupac_sequence import IUPACSequence
+from ._grammared_sequence import GrammaredSequence, DisableSubclassingMeta
-class RNA(IUPACSequence, NucleotideMixin):
+ at add_metaclass(DisableSubclassingMeta)
+class RNA(GrammaredSequence, NucleotideMixin):
"""Store RNA sequence data and optional associated metadata.
Only characters in the IUPAC RNA character set [1]_ are supported.
@@ -64,6 +66,14 @@ class RNA(IUPACSequence, NucleotideMixin):
See Also
--------
DNA
+ GrammaredSequence
+
+ Notes
+ -----
+ Subclassing is disabled for RNA, because subclassing makes
+ it possible to change the alphabet, and certain methods rely on the
+ IUPAC alphabet. If a custom sequence alphabet is needed, inherit directly
+ from ``GrammaredSequence``.
References
----------
@@ -104,7 +114,6 @@ class RNA(IUPACSequence, NucleotideMixin):
"""
@classproperty
- @stable(as_of="0.4.0")
@overrides(NucleotideMixin)
def complement_map(cls):
comp_map = {
@@ -117,14 +126,12 @@ class RNA(IUPACSequence, NucleotideMixin):
return comp_map
@classproperty
- @stable(as_of="0.4.0")
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def nondegenerate_chars(cls):
return set("ACGU")
@classproperty
- @stable(as_of="0.4.0")
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def degenerate_map(cls):
return {
"R": set("AG"), "Y": set("CU"), "M": set("AC"), "K": set("UG"),
@@ -132,6 +139,16 @@ class RNA(IUPACSequence, NucleotideMixin):
"H": set("ACU"), "V": set("ACG"), "N": set("ACGU")
}
+ @classproperty
+ @overrides(GrammaredSequence)
+ def default_gap_char(cls):
+ return '-'
+
+ @classproperty
+ @overrides(GrammaredSequence)
+ def gap_chars(cls):
+ return set('-.')
+
@property
def _motifs(self):
return _motifs
@@ -404,7 +421,7 @@ class RNA(IUPACSequence, NucleotideMixin):
genetic_code = skbio.GeneticCode.from_ncbi(genetic_code)
return genetic_code.translate_six_frames(self, *args, **kwargs)
- @overrides(IUPACSequence)
+ @overrides(GrammaredSequence)
def _repr_stats(self):
"""Define custom statistics to display in the sequence's repr."""
stats = super(RNA, self)._repr_stats()
diff --git a/skbio/sequence/_sequence.py b/skbio/sequence/_sequence.py
index 34150b6..ed15cbc 100644
--- a/skbio/sequence/_sequence.py
+++ b/skbio/sequence/_sequence.py
@@ -17,10 +17,9 @@ import numbers
from contextlib import contextmanager
import numpy as np
-from scipy.spatial.distance import hamming
-
import pandas as pd
+import skbio.sequence.distance
from skbio._base import SkbioObject, MetadataMixin, PositionalMetadataMixin
from skbio.sequence._repr import _SequenceReprBuilder
from skbio.util._decorator import (stable, experimental, deprecated,
@@ -349,6 +348,29 @@ class Sequence(MetadataMixin, PositionalMetadataMixin, collections.Sequence,
return self._bytes.view('|S1')
@property
+ def __array_interface__(self):
+ """Array interface for compatibility with numpy.
+
+ This property allows a ``Sequence`` object to share its underlying data
+ buffer (``Sequence.values``) with numpy. See [1]_ for more details.
+
+ References
+ ----------
+ .. [1] http://docs.scipy.org/doc/numpy/reference/arrays.interface.html
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from skbio import Sequence
+ >>> seq = Sequence('ABC123')
+ >>> np.asarray(seq) # doctest: +NORMALIZE_WHITESPACE
+ array([b'A', b'B', b'C', b'1', b'2', b'3'],
+ dtype='|S1')
+
+ """
+ return self.values.__array_interface__
+
+ @property
@experimental(as_of="0.4.1")
def observed_chars(self):
"""Set of observed characters in the sequence.
@@ -1434,70 +1456,61 @@ class Sequence(MetadataMixin, PositionalMetadataMixin, collections.Sequence,
Parameters
----------
other : str, Sequence, or 1D np.ndarray (np.uint8 or '\|S1')
- Sequence to compute the distance to.
+ Sequence to compute the distance to. If `other` is a ``Sequence``
+ object, it must be the same type as this sequence. Other input
+ types will be converted into a ``Sequence`` object of the same type
+ as this sequence.
metric : function, optional
Function used to compute the distance between this sequence and
- `other`. If ``None`` (the default),
- ``scipy.spatial.distance.hamming`` will be used. This function
- should take two ``skbio.Sequence`` objects and return a ``float``.
+ `other`. If ``None`` (the default), Hamming distance will be used
+ (:func:`skbio.sequence.distance.hamming`). `metric` should take two
+ ``skbio.Sequence`` objects and return a ``float``. The sequence
+ objects passed to `metric` will be the same type as this sequence.
+ See :mod:`skbio.sequence.distance` for other predefined metrics
+ that can be supplied via `metric`.
Returns
-------
float
- Distance between this sequence and `other`.
+ Distance between this sequence and `other` as defined by `metric`.
Raises
------
- ValueError
- If the sequences are not the same length when `metric` is ``None``
- (i.e., `metric` is ``scipy.spatial.distance.hamming``). This is
- only checked when using this metric, as equal length is not a
- requirement of all sequence distance metrics. In general, the
- metric itself should test and give an informative error message,
- but the message from ``scipy.spatial.distance.hamming`` is somewhat
- cryptic (as of this writing), and it's the default metric, so we
- explicitly do this check here. This metric-specific check will be
- removed from this method when the ``skbio.sequence.stats`` module
- is created (track progress on issue #913).
TypeError
If `other` is a ``Sequence`` object with a different type than this
sequence.
See Also
--------
+ skbio.sequence.distance
fraction_diff
fraction_same
- scipy.spatial.distance.hamming
Examples
--------
>>> from skbio import Sequence
>>> s = Sequence('GGUC')
>>> t = Sequence('AGUC')
+
+ Compute Hamming distance (the default metric):
+
>>> s.distance(t)
0.25
- >>> def custom_dist(s1, s2): return 0.42
- >>> s.distance(t, custom_dist)
+
+ Use a custom metric:
+
+ >>> def custom_metric(s1, s2): return 0.42
+ >>> s.distance(t, custom_metric)
0.42
"""
# TODO refactor this method to accept a name (string) of the distance
# metric to apply and accept **kwargs
- other = self._munge_to_sequence(other, 'distance')
+ other = self._munge_to_self_type(other, 'distance')
if metric is None:
- return self._hamming(other)
+ metric = skbio.sequence.distance.hamming
return float(metric(self, other))
- def _hamming(self, other):
- # Hamming requires equal length sequences. We are checking this
- # here because the error you would get otherwise is cryptic.
- if len(self) != len(other):
- raise ValueError(
- "Sequences do not have equal length. "
- "Hamming distances can only be computed between "
- "sequences of equal length.")
- return float(hamming(self.values, other.values))
-
@stable(as_of="0.4.0")
def matches(self, other):
"""Find positions that match with another sequence.
@@ -2145,6 +2158,17 @@ class Sequence(MetadataMixin, PositionalMetadataMixin, collections.Sequence,
return normalized
+ def _munge_to_self_type(self, other, method):
+ if isinstance(other, Sequence):
+ if type(other) != type(self):
+ raise TypeError("Cannot use %s and %s together with `%s`" %
+ (self.__class__.__name__,
+ other.__class__.__name__, method))
+ else:
+ return other
+
+ return self.__class__(other)
+
def _munge_to_sequence(self, other, method):
if isinstance(other, Sequence):
if type(other) != type(self):
diff --git a/skbio/sequence/distance.py b/skbio/sequence/distance.py
new file mode 100644
index 0000000..568db42
--- /dev/null
+++ b/skbio/sequence/distance.py
@@ -0,0 +1,115 @@
+"""
+Sequence distance metrics (:mod:`skbio.sequence.distance`)
+==========================================================
+
+.. currentmodule:: skbio.sequence.distance
+
+This module contains functions for computing distances between scikit-bio
+``Sequence`` objects. These functions can be used directly or supplied to other
+parts of the scikit-bio API that accept a sequence distance metric as input,
+such as :meth:`skbio.sequence.Sequence.distance` and
+:meth:`skbio.stats.distance.DistanceMatrix.from_iterable`.
+
+Functions
+---------
+
+.. autosummary::
+ :toctree: generated/
+
+ hamming
+
+"""
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2013--, scikit-bio development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from __future__ import absolute_import, division, print_function
+
+import numpy as np
+import scipy.spatial.distance
+
+import skbio
+from skbio.util._decorator import experimental
+
+
+ at experimental(as_of='0.4.2')
+def hamming(seq1, seq2):
+ """Compute Hamming distance between two sequences.
+
+ The Hamming distance between two equal-length sequences is the proportion
+ of differing characters.
+
+ Parameters
+ ----------
+ seq1, seq2 : Sequence
+ Sequences to compute Hamming distance between.
+
+ Returns
+ -------
+ float
+ Hamming distance between `seq1` and `seq2`.
+
+ Raises
+ ------
+ TypeError
+ If `seq1` and `seq2` are not ``Sequence`` instances.
+ TypeError
+ If `seq1` and `seq2` are not the same type.
+ ValueError
+ If `seq1` and `seq2` are not the same length.
+
+ See Also
+ --------
+ scipy.spatial.distance.hamming
+
+ Notes
+ -----
+ ``np.nan`` will be returned if the sequences do not contain any characters.
+
+ This function does not make assumptions about the sequence alphabet in use.
+ Each sequence object's underlying sequence of characters are used to
+ compute Hamming distance. Characters that may be considered equivalent in
+ certain contexts (e.g., `-` and `.` as gap characters) are treated as
+ distinct characters when computing Hamming distance.
+
+ Examples
+ --------
+ >>> from skbio import Sequence
+ >>> from skbio.sequence.distance import hamming
+ >>> seq1 = Sequence('AGGGTA')
+ >>> seq2 = Sequence('CGTTTA')
+ >>> hamming(seq1, seq2)
+ 0.5
+
+ """
+ for seq in seq1, seq2:
+ if not isinstance(seq, skbio.Sequence):
+ raise TypeError(
+ "`seq1` and `seq2` must be Sequence instances, not %r"
+ % type(seq).__name__)
+
+ if type(seq1) is not type(seq2):
+ raise TypeError(
+ "Sequences must have matching type. Type %r does not match type %r"
+ % (type(seq1).__name__, type(seq2).__name__))
+
+ # Hamming requires equal length sequences. We are checking this here
+ # because the error you would get otherwise is cryptic.
+ if len(seq1) != len(seq2):
+ raise ValueError(
+ "Hamming distance can only be computed between sequences of equal "
+ "length (%d != %d)" % (len(seq1), len(seq2)))
+
+ # scipy throws a RuntimeWarning when computing Hamming distance on length 0
+ # input.
+ if not seq1:
+ distance = np.nan
+ else:
+ distance = scipy.spatial.distance.hamming(seq1.values, seq2.values)
+
+ return float(distance)
diff --git a/skbio/sequence/tests/test_distance.py b/skbio/sequence/tests/test_distance.py
new file mode 100644
index 0000000..905c359
--- /dev/null
+++ b/skbio/sequence/tests/test_distance.py
@@ -0,0 +1,130 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2013--, scikit-bio development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from __future__ import absolute_import, division, print_function
+
+import itertools
+import unittest
+
+import six
+import numpy as np
+import numpy.testing as npt
+
+from skbio import Sequence, DNA
+from skbio.sequence.distance import hamming
+
+
+class TestHamming(unittest.TestCase):
+ def test_non_sequence(self):
+ seq1 = Sequence('abc')
+ seq2 = 'abc'
+
+ with six.assertRaisesRegex(self, TypeError,
+ 'seq1.*seq2.*Sequence.*str'):
+ hamming(seq1, seq2)
+
+ with six.assertRaisesRegex(self, TypeError,
+ 'seq1.*seq2.*Sequence.*str'):
+ hamming(seq2, seq1)
+
+ def test_type_mismatch(self):
+ seq1 = Sequence('ABC')
+ seq2 = DNA('ACG')
+
+ with six.assertRaisesRegex(self, TypeError,
+ 'Sequence.*does not match.*DNA'):
+ hamming(seq1, seq2)
+
+ def test_length_mismatch(self):
+ seq1 = Sequence('ABC')
+ seq2 = Sequence('ABCD')
+
+ with six.assertRaisesRegex(self, ValueError, 'equal length.*3 != 4'):
+ hamming(seq1, seq2)
+
+ def test_return_type(self):
+ seq1 = Sequence('ABC')
+ seq2 = Sequence('ABC')
+
+ distance = hamming(seq1, seq2)
+
+ self.assertIsInstance(distance, float)
+ self.assertEqual(distance, 0.0)
+
+ def test_minimum_distance(self):
+ seq1 = Sequence('ABC')
+ seq2 = Sequence('ABC')
+
+ distance = hamming(seq1, seq2)
+
+ self.assertEqual(distance, 0.0)
+
+ def test_mid_range_distance(self):
+ seq1 = Sequence("abcdefgh")
+ seq2 = Sequence("1b23ef45")
+
+ distance = hamming(seq1, seq2)
+
+ self.assertEqual(distance, 5.0/8.0)
+
+ def test_maximum_distance(self):
+ seq1 = Sequence('ABC')
+ seq2 = Sequence('CAB')
+
+ distance = hamming(seq1, seq2)
+
+ self.assertEqual(distance, 1.0)
+
+ def test_empty_sequences(self):
+ seq1 = Sequence('')
+ seq2 = Sequence('')
+
+ distance = hamming(seq1, seq2)
+
+ npt.assert_equal(distance, np.nan)
+
+ def test_single_character_sequences(self):
+ seq1 = Sequence('a')
+ seq2 = Sequence('b')
+
+ self.assertEqual(hamming(seq1, seq1), 0.0)
+ self.assertEqual(hamming(seq1, seq2), 1.0)
+
+ def test_sequence_subclass(self):
+ seq1 = DNA('ACG-T')
+ seq2 = DNA('ACCTT')
+
+ distance = hamming(seq1, seq2)
+
+ self.assertEqual(distance, 2.0/5.0)
+
+ def test_sequences_with_metadata(self):
+ # test for #1254
+ seqs1 = [
+ Sequence("ACGT"),
+ Sequence("ACGT", metadata={'id': 'abc'}),
+ Sequence("ACGT", positional_metadata={'qual': range(4)})
+ ]
+ seqs2 = [
+ Sequence("AAAA"),
+ Sequence("AAAA", metadata={'id': 'def'}),
+ Sequence("AAAA", positional_metadata={'qual': range(4, 8)})
+ ]
+
+ for seqs in seqs1, seqs2:
+ for seq1, seq2 in itertools.product(seqs, repeat=2):
+ distance = hamming(seq1, seq2)
+ self.assertEqual(distance, 0.0)
+
+ for seq1, seq2 in itertools.product(seqs1, seqs2):
+ distance = hamming(seq1, seq2)
+ self.assertEqual(distance, 0.75)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/skbio/sequence/tests/test_dna.py b/skbio/sequence/tests/test_dna.py
index 4c3c3d3..a076059 100644
--- a/skbio/sequence/tests/test_dna.py
+++ b/skbio/sequence/tests/test_dna.py
@@ -7,6 +7,7 @@
# ----------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
+import six
import unittest
@@ -40,6 +41,11 @@ class TestDNA(unittest.TestCase):
self.assertEqual(seq.transcribe(), RNA('AUAU'))
self.assertEqual(seq, DNA('ATAT'))
+ def test_cannot_subclass(self):
+ with six.assertRaisesRegex(self, TypeError, "Subclassing disabled"):
+ class CustomSequence(DNA):
+ pass
+
if __name__ == '__main__':
unittest.main()
diff --git a/skbio/sequence/tests/test_grammared_sequence.py b/skbio/sequence/tests/test_grammared_sequence.py
new file mode 100644
index 0000000..7edcc47
--- /dev/null
+++ b/skbio/sequence/tests/test_grammared_sequence.py
@@ -0,0 +1,613 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2013--, scikit-bio development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from __future__ import absolute_import, division, print_function
+import six
+
+from unittest import TestCase, main
+
+import numpy as np
+import numpy.testing as npt
+
+from skbio.sequence import GrammaredSequence
+from skbio.util import classproperty
+
+
+class ExampleGrammaredSequence(GrammaredSequence):
+ @classproperty
+ def degenerate_map(cls):
+ return {"X": set("AB"), "Y": set("BC"), "Z": set("AC")}
+
+ @classproperty
+ def nondegenerate_chars(cls):
+ return set("ABC")
+
+ @classproperty
+ def default_gap_char(cls):
+ return '-'
+
+ @classproperty
+ def gap_chars(cls):
+ return set('-.')
+
+
+class ExampleMotifsTester(ExampleGrammaredSequence):
+ @property
+ def _motifs(self):
+ # These aren't really motifs, just a way to excercise the code paths
+ return {
+ "name1": lambda x, _, __: str(x),
+ "name2": lambda x, _, __: len(x)
+ }
+
+
+class TestGrammaredSequence(TestCase):
+ def test_default_gap_must_be_in_gap_chars(self):
+ with six.assertRaisesRegex(
+ self, TypeError,
+ "default_gap_char must be in gap_chars for class "
+ "GrammaredSequenceInvalidDefaultGap"):
+
+ class GrammaredSequenceInvalidDefaultGap(ExampleGrammaredSequence):
+ @classproperty
+ def default_gap_char(cls):
+ return '*'
+
+ def test_degenerates_must_expand_to_valid_nondegenerates(self):
+ with six.assertRaisesRegex(
+ self, TypeError,
+ "degenerate_map must expand only to characters included in "
+ "nondegenerate_chars for class "
+ "GrammaredSequenceInvalidDefaultGap"):
+
+ class GrammaredSequenceInvalidDefaultGap(ExampleGrammaredSequence):
+ @classproperty
+ def degenerate_map(cls):
+ return {"X": set("B")}
+
+ @classproperty
+ def nondegenerate_chars(cls):
+ return set("A")
+
+ def test_gap_chars_and_degenerates_share(self):
+ with six.assertRaisesRegex(
+ self, TypeError,
+ "gap_chars and degenerate_chars must not share any characters "
+ "for class GrammaredSequenceGapInDegenerateMap"):
+
+ class GrammaredSequenceGapInDegenerateMap(
+ ExampleGrammaredSequence):
+ @classproperty
+ def degenerate_map(cls):
+ return {"X": set("AB")}
+
+ @classproperty
+ def nondegenerate_chars(cls):
+ return set("ABC")
+
+ @classproperty
+ def gap_chars(cls):
+ return set(".-X")
+
+ def test_gap_chars_and_nondegenerates_share(self):
+ with six.assertRaisesRegex(
+ self, TypeError,
+ ("gap_chars and nondegenerate_chars must not share any characters "
+ "for class GrammaredSequenceGapInNondegenerateMap")):
+
+ class GrammaredSequenceGapInNondegenerateMap(
+ ExampleGrammaredSequence):
+ @classproperty
+ def degenerate_map(cls):
+ return {"X": set("AB")}
+
+ @classproperty
+ def nondegenerate_chars(cls):
+ return set("ABC")
+
+ @classproperty
+ def gap_chars(cls):
+ return set(".-A")
+
+ def test_degenerates_and_nondegenerates_share(self):
+ with six.assertRaisesRegex(
+ self, TypeError,
+ ("degenerate_chars and nondegenerate_chars must not share any "
+ "characters for class GrammaredSequenceInvalid")):
+
+ class GrammaredSequenceInvalid(ExampleGrammaredSequence):
+ @classproperty
+ def degenerate_map(cls):
+ return {"X": set("AB")}
+
+ @classproperty
+ def nondegenerate_chars(cls):
+ return set("ABCX")
+
+ def test_instantiation_with_no_implementation(self):
+ class GrammaredSequenceSubclassNoImplementation(GrammaredSequence):
+ pass
+
+ with self.assertRaises(TypeError) as cm:
+ GrammaredSequenceSubclassNoImplementation()
+
+ self.assertIn("abstract class", str(cm.exception))
+ self.assertIn("nondegenerate_chars", str(cm.exception))
+ self.assertIn("degenerate_map", str(cm.exception))
+
+ def test_init_default_parameters(self):
+ seq = ExampleGrammaredSequence('.-ABCXYZ')
+
+ npt.assert_equal(seq.values, np.array('.-ABCXYZ', dtype='c'))
+ self.assertFalse(seq.has_metadata())
+ self.assertFalse(seq.has_positional_metadata())
+
+ def test_init_nondefault_parameters(self):
+ seq = ExampleGrammaredSequence(
+ '.-ABCXYZ',
+ metadata={'id': 'foo'},
+ positional_metadata={'quality': range(8)})
+
+ npt.assert_equal(seq.values, np.array('.-ABCXYZ', dtype='c'))
+ self.assertTrue(seq.has_metadata())
+ self.assertEqual(seq.metadata['id'], 'foo')
+ self.assertTrue(seq.has_positional_metadata())
+ npt.assert_equal(seq.positional_metadata['quality'], np.array(range(8),
+ dtype='int'))
+
+ def test_init_valid_empty_sequence(self):
+ # just make sure we can instantiate an empty sequence regardless of
+ # `validate` and `lowercase` parameters. more extensive tests
+ # are performed in Sequence base class unit tests
+ for validate in (True, False):
+ for lowercase in (True, False):
+ seq = ExampleGrammaredSequence(
+ '', validate=validate, lowercase=lowercase)
+ self.assertEqual(seq, ExampleGrammaredSequence(''))
+
+ def test_init_valid_single_character_sequence(self):
+ for validate in (True, False):
+ for lowercase in (True, False):
+ seq = ExampleGrammaredSequence(
+ 'C', validate=validate, lowercase=lowercase)
+ self.assertEqual(seq, ExampleGrammaredSequence('C'))
+
+ def test_init_valid_multiple_character_sequence(self):
+ for validate in (True, False):
+ for lowercase in (True, False):
+ seq = ExampleGrammaredSequence(
+ 'BAACB.XYY-AZ', validate=validate, lowercase=lowercase)
+ self.assertEqual(seq, ExampleGrammaredSequence('BAACB.XYY-AZ'))
+
+ def test_init_validate_parameter_single_character(self):
+ seq = 'w'
+
+ with six.assertRaisesRegex(self, ValueError, "character.*'w'"):
+ ExampleGrammaredSequence(seq)
+
+ # test that we can instantiate an invalid sequence. we don't guarantee
+ # anything working beyond instantiation
+ ExampleGrammaredSequence(seq, validate=False)
+
+ def test_init_validate_parameter_multiple_characters(self):
+ # mix of valid and invalid characters with repeats and lowercased
+ # alphabet characters
+ seq = 'CBCBBbawCbbwBXYZ-.x'
+
+ with six.assertRaisesRegex(self, ValueError, "\['a', 'b', 'w', 'x'\]"):
+ ExampleGrammaredSequence(seq)
+
+ ExampleGrammaredSequence(seq, validate=False)
+
+ def test_init_lowercase_all_lowercase(self):
+ s = 'cbcbbbazcbbzbxyz-.x'
+
+ with six.assertRaisesRegex(self, ValueError,
+ "\['a', 'b', 'c', 'x', 'y', 'z'\]"):
+ ExampleGrammaredSequence(s)
+
+ seq = ExampleGrammaredSequence(s, lowercase=True)
+ self.assertEqual(seq, ExampleGrammaredSequence('CBCBBBAZCBBZBXYZ-.X'))
+
+ def test_init_lowercase_mixed_case(self):
+ s = 'CBCBBbazCbbzBXYZ-.x'
+
+ with six.assertRaisesRegex(self, ValueError, "\['a', 'b', 'x', 'z'\]"):
+ ExampleGrammaredSequence(s)
+
+ seq = ExampleGrammaredSequence(s, lowercase=True)
+ self.assertEqual(seq, ExampleGrammaredSequence('CBCBBBAZCBBZBXYZ-.X'))
+
+ def test_init_lowercase_no_validation(self):
+ s = 'car'
+
+ with six.assertRaisesRegex(self, ValueError, "\['a', 'c', 'r'\]"):
+ ExampleGrammaredSequence(s)
+
+ with six.assertRaisesRegex(self, ValueError, "character.*'R'"):
+ ExampleGrammaredSequence(s, lowercase=True)
+
+ ExampleGrammaredSequence(s, lowercase=True, validate=False)
+
+ def test_init_lowercase_byte_ownership(self):
+ bytes = np.array([97, 98, 97], dtype=np.uint8)
+
+ with six.assertRaisesRegex(self, ValueError, "\['a', 'b'\]"):
+ ExampleGrammaredSequence(bytes)
+
+ seq = ExampleGrammaredSequence(bytes, lowercase=True)
+ self.assertEqual(seq, ExampleGrammaredSequence('ABA'))
+
+ # should not share the same memory
+ self.assertIsNot(seq._bytes, bytes)
+
+ # we should have copied `bytes` before modifying in place to convert to
+ # upper. make sure `bytes` hasn't been mutated
+ npt.assert_equal(bytes, np.array([97, 98, 97], dtype=np.uint8))
+
+ def test_init_lowercase_invalid_keys(self):
+ for invalid_key in ((), [], 2):
+ invalid_type = type(invalid_key)
+ with six.assertRaisesRegex(self, TypeError,
+ "lowercase keyword argument expected "
+ "a bool or string, but got %s" %
+ invalid_type):
+ ExampleGrammaredSequence('ACGTacgt', lowercase=invalid_key)
+
+ def test_degenerate_chars(self):
+ expected = set("XYZ")
+ self.assertIs(type(ExampleGrammaredSequence.degenerate_chars), set)
+ self.assertEqual(ExampleGrammaredSequence.degenerate_chars, expected)
+
+ ExampleGrammaredSequence.degenerate_chars.add("W")
+ self.assertEqual(ExampleGrammaredSequence.degenerate_chars, expected)
+
+ self.assertEqual(ExampleGrammaredSequence('').degenerate_chars,
+ expected)
+
+ with self.assertRaises(AttributeError):
+ ExampleGrammaredSequence('').degenerate_chars = set("BAR")
+
+ def test_nondegenerate_chars(self):
+ expected = set("ABC")
+ self.assertEqual(ExampleGrammaredSequence.nondegenerate_chars,
+ expected)
+
+ ExampleGrammaredSequence.degenerate_chars.add("D")
+ self.assertEqual(ExampleGrammaredSequence.nondegenerate_chars,
+ expected)
+
+ self.assertEqual(ExampleGrammaredSequence('').nondegenerate_chars,
+ expected)
+
+ with self.assertRaises(AttributeError):
+ ExampleGrammaredSequence('').nondegenerate_chars = set("BAR")
+
+ def test_gap_chars(self):
+ expected = set(".-")
+ self.assertIs(type(ExampleGrammaredSequence.gap_chars), set)
+ self.assertEqual(ExampleGrammaredSequence.gap_chars, expected)
+
+ ExampleGrammaredSequence.gap_chars.add("_")
+ self.assertEqual(ExampleGrammaredSequence.gap_chars, expected)
+
+ self.assertEqual(ExampleGrammaredSequence('').gap_chars, expected)
+
+ with self.assertRaises(AttributeError):
+ ExampleGrammaredSequence('').gap_chars = set("_ =")
+
+ def test_default_gap_char(self):
+ self.assertIs(type(ExampleGrammaredSequence.default_gap_char), str)
+ self.assertEqual(ExampleGrammaredSequence.default_gap_char, '-')
+ self.assertEqual(ExampleGrammaredSequence('').default_gap_char, '-')
+
+ with self.assertRaises(AttributeError):
+ ExampleGrammaredSequence('').default_gap_char = '.'
+
+ def test_alphabet(self):
+ expected = set("ABC.-XYZ")
+ self.assertIs(type(ExampleGrammaredSequence.alphabet), set)
+ self.assertEqual(ExampleGrammaredSequence.alphabet, expected)
+
+ ExampleGrammaredSequence.alphabet.add("DEF")
+ self.assertEqual(ExampleGrammaredSequence.alphabet, expected)
+
+ self.assertEqual(ExampleGrammaredSequence('').alphabet, expected)
+
+ with self.assertRaises(AttributeError):
+ ExampleGrammaredSequence('').alphabet = set("ABCDEFG.-WXYZ")
+
+ def test_degenerate_map(self):
+ expected = {"X": set("AB"), "Y": set("BC"), "Z": set("AC")}
+ self.assertEqual(ExampleGrammaredSequence.degenerate_map, expected)
+
+ ExampleGrammaredSequence.degenerate_map['W'] = set("ABC")
+ ExampleGrammaredSequence.degenerate_map['X'] = set("CA")
+ self.assertEqual(ExampleGrammaredSequence.degenerate_map, expected)
+
+ self.assertEqual(ExampleGrammaredSequence('').degenerate_map, expected)
+
+ with self.assertRaises(AttributeError):
+ ExampleGrammaredSequence('').degenerate_map = {'W': "ABC"}
+
+ def test_gaps(self):
+ self.assertIs(type(ExampleGrammaredSequence("").gaps()), np.ndarray)
+ self.assertIs(ExampleGrammaredSequence("").gaps().dtype,
+ np.dtype('bool'))
+ npt.assert_equal(ExampleGrammaredSequence("ABCXBZYABC").gaps(),
+ np.zeros(10).astype(bool))
+
+ npt.assert_equal(ExampleGrammaredSequence(".-.-.").gaps(),
+ np.ones(5).astype(bool))
+
+ npt.assert_equal(ExampleGrammaredSequence("A.B-C.X-Y.").gaps(),
+ np.array([0, 1] * 5, dtype=bool))
+
+ npt.assert_equal(ExampleGrammaredSequence("AB.AC.XY-").gaps(),
+ np.array([0, 0, 1] * 3, dtype=bool))
+
+ npt.assert_equal(ExampleGrammaredSequence("A.BC.-").gaps(),
+ np.array([0, 1, 0, 0, 1, 1], dtype=bool))
+
+ def test_has_gaps(self):
+ self.assertIs(type(ExampleGrammaredSequence("").has_gaps()), bool)
+ self.assertIs(type(ExampleGrammaredSequence("-").has_gaps()), bool)
+
+ self.assertFalse(ExampleGrammaredSequence("").has_gaps())
+ self.assertFalse(ExampleGrammaredSequence("ABCXYZ").has_gaps())
+
+ self.assertTrue(ExampleGrammaredSequence("-").has_gaps())
+ self.assertTrue(ExampleGrammaredSequence("ABCXYZ-").has_gaps())
+
+ def test_degenerates(self):
+ self.assertIs(type(ExampleGrammaredSequence("").degenerates()),
+ np.ndarray)
+ self.assertIs(ExampleGrammaredSequence("").degenerates().dtype,
+ np.dtype('bool'))
+
+ npt.assert_equal(ExampleGrammaredSequence("ABCBC-.AB.").degenerates(),
+ np.zeros(10).astype(bool))
+
+ npt.assert_equal(ExampleGrammaredSequence("ZYZYZ").degenerates(),
+ np.ones(5).astype(bool))
+
+ npt.assert_equal(ExampleGrammaredSequence("AX.Y-ZBXCZ").degenerates(),
+ np.array([0, 1] * 5, dtype=bool))
+
+ npt.assert_equal(ExampleGrammaredSequence("ABXACY.-Z").degenerates(),
+ np.array([0, 0, 1] * 3, dtype=bool))
+
+ npt.assert_equal(ExampleGrammaredSequence("AZBCXY").degenerates(),
+ np.array([0, 1, 0, 0, 1, 1], dtype=bool))
+
+ def test_has_degenerates(self):
+ self.assertIs(type(ExampleGrammaredSequence("").has_degenerates()),
+ bool)
+ self.assertIs(type(ExampleGrammaredSequence("X").has_degenerates()),
+ bool)
+
+ self.assertFalse(ExampleGrammaredSequence("").has_degenerates())
+ self.assertFalse(ExampleGrammaredSequence("A-.BC").has_degenerates())
+
+ self.assertTrue(ExampleGrammaredSequence("Z").has_degenerates())
+ self.assertTrue(ExampleGrammaredSequence("ABC.XYZ-").has_degenerates())
+
+ def test_nondegenerates(self):
+ self.assertIs(type(ExampleGrammaredSequence("").nondegenerates()),
+ np.ndarray)
+ self.assertIs(ExampleGrammaredSequence("").nondegenerates().dtype,
+ np.dtype('bool'))
+
+ npt.assert_equal(
+ ExampleGrammaredSequence("XYZYZ-.XY.").nondegenerates(),
+ np.zeros(10).astype(bool))
+
+ npt.assert_equal(ExampleGrammaredSequence("ABABA").nondegenerates(),
+ np.ones(5).astype(bool))
+
+ npt.assert_equal(
+ ExampleGrammaredSequence("XA.B-AZCXA").nondegenerates(),
+ np.array([0, 1] * 5, dtype=bool))
+
+ npt.assert_equal(
+ ExampleGrammaredSequence("XXAZZB.-C").nondegenerates(),
+ np.array([0, 0, 1] * 3, dtype=bool))
+
+ npt.assert_equal(ExampleGrammaredSequence("YB.-AC").nondegenerates(),
+ np.array([0, 1, 0, 0, 1, 1], dtype=bool))
+
+ def test_has_nondegenerates(self):
+ self.assertIs(type(ExampleGrammaredSequence("").has_nondegenerates()),
+ bool)
+ self.assertIs(type(ExampleGrammaredSequence("A").has_nondegenerates()),
+ bool)
+
+ self.assertFalse(ExampleGrammaredSequence("").has_nondegenerates())
+ self.assertFalse(
+ ExampleGrammaredSequence("X-.YZ").has_nondegenerates())
+
+ self.assertTrue(ExampleGrammaredSequence("C").has_nondegenerates())
+ self.assertTrue(
+ ExampleGrammaredSequence(".XYZ-ABC").has_nondegenerates())
+
+ def test_degap(self):
+ kw = {
+ 'metadata': {
+ 'id': 'some_id',
+ 'description': 'some description',
+ },
+ }
+
+ self.assertEqual(
+ ExampleGrammaredSequence(
+ "", positional_metadata={'qual': []}, **kw).degap(),
+ ExampleGrammaredSequence(
+ "", positional_metadata={'qual': []}, **kw))
+
+ self.assertEqual(
+ ExampleGrammaredSequence(
+ "ABCXYZ",
+ positional_metadata={'qual': np.arange(6)},
+ **kw).degap(),
+ ExampleGrammaredSequence(
+ "ABCXYZ",
+ positional_metadata={'qual': np.arange(6)},
+ **kw))
+
+ self.assertEqual(
+ ExampleGrammaredSequence(
+ "ABC-XYZ",
+ positional_metadata={'qual': np.arange(7)},
+ **kw).degap(),
+ ExampleGrammaredSequence(
+ "ABCXYZ",
+ positional_metadata={'qual': [0, 1, 2, 4, 5, 6]},
+ **kw))
+
+ self.assertEqual(
+ ExampleGrammaredSequence(
+ ".-ABC-XYZ.",
+ positional_metadata={'qual': np.arange(10)},
+ **kw).degap(),
+ ExampleGrammaredSequence(
+ "ABCXYZ",
+ positional_metadata={'qual': [2, 3, 4, 6, 7, 8]},
+ **kw))
+
+ self.assertEqual(
+ ExampleGrammaredSequence(
+ "---.-.-.-.-.",
+ positional_metadata={'quality': np.arange(12)},
+ **kw).degap(),
+ ExampleGrammaredSequence(
+ "",
+ positional_metadata={'quality': np.array([], dtype=np.int64)},
+ **kw))
+
+ def test_expand_degenerates_no_degens(self):
+ seq = ExampleGrammaredSequence("ABCABCABC")
+ self.assertEqual(list(seq.expand_degenerates()), [seq])
+
+ def test_expand_degenerates_all_degens(self):
+ exp = [
+ ExampleGrammaredSequence('ABA'), ExampleGrammaredSequence('ABC'),
+ ExampleGrammaredSequence('ACA'), ExampleGrammaredSequence('ACC'),
+ ExampleGrammaredSequence('BBA'), ExampleGrammaredSequence('BBC'),
+ ExampleGrammaredSequence('BCA'), ExampleGrammaredSequence('BCC')
+ ]
+ # Sort based on sequence string, as order is not guaranteed.
+ obs = sorted(ExampleGrammaredSequence('XYZ').expand_degenerates(),
+ key=str)
+ self.assertEqual(obs, exp)
+
+ def test_expand_degenerates_with_metadata(self):
+ kw = {
+ "metadata": {
+ "id": "some_id",
+ "description": "some description"
+ },
+ "positional_metadata": {
+ "quality": np.arange(3),
+ },
+ }
+ exp = [ExampleGrammaredSequence('ABA', **kw),
+ ExampleGrammaredSequence('ABC', **kw),
+ ExampleGrammaredSequence('BBA', **kw),
+ ExampleGrammaredSequence('BBC', **kw)]
+ obs = sorted(
+ ExampleGrammaredSequence('XBZ', **kw).expand_degenerates(),
+ key=str)
+ self.assertEqual(obs, exp)
+
+ def test_to_regex_no_degens(self):
+ seq = ExampleGrammaredSequence('ABC')
+ regex = seq.to_regex()
+ self.assertEqual(regex.pattern, str(seq))
+
+ def test_to_regex_with_degens(self):
+ seq = ExampleGrammaredSequence('AYZ')
+ regex = seq.to_regex()
+ self.assertFalse(any(regex.match(s) is None
+ for s in 'ABA ABC ACA ACC'.split()))
+ self.assertTrue(all(regex.match(s) is None
+ for s in 'CBA BBA ABB AAA'.split()))
+
+ def test_find_motifs_no_motif(self):
+ seq = ExampleMotifsTester("ABCABCABC")
+ with self.assertRaises(ValueError) as cm:
+ seq.find_motifs("doesn't-exist")
+ self.assertIn("doesn't-exist", str(cm.exception))
+
+ seq = ExampleGrammaredSequence("ABCABCABC")
+ with self.assertRaises(ValueError) as cm:
+ seq.find_motifs("doesn't-exist")
+ self.assertIn("doesn't-exist", str(cm.exception))
+
+ def test_find_motifs(self):
+ seq = ExampleMotifsTester("ABC")
+ self.assertEqual(seq.find_motifs("name1"), "ABC")
+ self.assertEqual(seq.find_motifs("name2"), 3)
+
+ def test_repr(self):
+ # basic sanity checks for custom repr stats. more extensive testing is
+ # performed on Sequence.__repr__
+
+ # minimal
+ obs = repr(ExampleGrammaredSequence(''))
+ self.assertEqual(obs.count('\n'), 7)
+ self.assertTrue(obs.startswith('ExampleGrammaredSequence'))
+ self.assertIn('length: 0', obs)
+ self.assertIn('has gaps: False', obs)
+ self.assertIn('has degenerates: False', obs)
+ self.assertIn('has non-degenerates: False', obs)
+ self.assertTrue(obs.endswith('-'))
+
+ # no metadata, mix of gaps, degenerates, and non-degenerates
+ obs = repr(ExampleGrammaredSequence('AY-B'))
+ self.assertEqual(obs.count('\n'), 8)
+ self.assertTrue(obs.startswith('ExampleGrammaredSequence'))
+ self.assertIn('length: 4', obs)
+ self.assertIn('has gaps: True', obs)
+ self.assertIn('has degenerates: True', obs)
+ self.assertIn('has non-degenerates: True', obs)
+ self.assertTrue(obs.endswith('0 AY-B'))
+
+ # metadata and positional metadata of mixed types
+ obs = repr(
+ ExampleGrammaredSequence(
+ 'ABCA',
+ metadata={'foo': 42, u'bar': 33.33, None: True, False: {},
+ (1, 2): 3, 'acb' * 100: "'"},
+ positional_metadata={'foo': range(4),
+ 42: ['a', 'b', [], 'c']}))
+ self.assertEqual(obs.count('\n'), 18)
+ self.assertTrue(obs.startswith('ExampleGrammaredSequence'))
+ self.assertIn('None: True', obs)
+ self.assertIn('\'foo\': 42', obs)
+ self.assertIn('42: <dtype: object>', obs)
+ self.assertIn('\'foo\': <dtype: int64>', obs)
+ self.assertIn('length: 4', obs)
+ self.assertIn('has gaps: False', obs)
+ self.assertIn('has degenerates: False', obs)
+ self.assertIn('has non-degenerates: True', obs)
+ self.assertTrue(obs.endswith('0 ABCA'))
+
+ # sequence spanning > 5 lines
+ obs = repr(ExampleGrammaredSequence('A' * 301))
+ self.assertEqual(obs.count('\n'), 12)
+ self.assertTrue(obs.startswith('ExampleGrammaredSequence'))
+ self.assertIn('length: 301', obs)
+ self.assertIn('has gaps: False', obs)
+ self.assertIn('has degenerates: False', obs)
+ self.assertIn('has non-degenerates: True', obs)
+ self.assertIn('...', obs)
+ self.assertTrue(obs.endswith('300 A'))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/skbio/sequence/tests/test_iupac_sequence.py b/skbio/sequence/tests/test_iupac_sequence.py
deleted file mode 100644
index d878a6d..0000000
--- a/skbio/sequence/tests/test_iupac_sequence.py
+++ /dev/null
@@ -1,506 +0,0 @@
-# ----------------------------------------------------------------------------
-# Copyright (c) 2013--, scikit-bio development team.
-#
-# Distributed under the terms of the Modified BSD License.
-#
-# The full license is in the file COPYING.txt, distributed with this software.
-# ----------------------------------------------------------------------------
-
-from __future__ import absolute_import, division, print_function
-import six
-
-from unittest import TestCase, main
-
-import numpy as np
-import numpy.testing as npt
-
-from skbio.sequence._iupac_sequence import IUPACSequence
-from skbio.util._decorator import classproperty
-
-
-class ExampleIUPACSequence(IUPACSequence):
- @classproperty
- def degenerate_map(cls):
- return {"X": set("AB"), "Y": set("BC"), "Z": set("AC")}
-
- @classproperty
- def nondegenerate_chars(cls):
- return set("ABC")
-
-
-class ExampleMotifsTester(ExampleIUPACSequence):
- @property
- def _motifs(self):
- # These aren't really motifs, just a way to excercise the code paths
- return {
- "name1": lambda x, _, __: str(x),
- "name2": lambda x, _, __: len(x)
- }
-
-
-class TestIUPACSequence(TestCase):
- def test_instantiation_with_no_implementation(self):
- class IUPACSequenceSubclassNoImplementation(IUPACSequence):
- pass
-
- with self.assertRaises(TypeError) as cm:
- IUPACSequenceSubclassNoImplementation()
-
- self.assertIn("abstract class", str(cm.exception))
- self.assertIn("nondegenerate_chars", str(cm.exception))
- self.assertIn("degenerate_map", str(cm.exception))
-
- def test_init_default_parameters(self):
- seq = ExampleIUPACSequence('.-ABCXYZ')
-
- npt.assert_equal(seq.values, np.array('.-ABCXYZ', dtype='c'))
- self.assertFalse(seq.has_metadata())
- self.assertFalse(seq.has_positional_metadata())
-
- def test_init_nondefault_parameters(self):
- seq = ExampleIUPACSequence('.-ABCXYZ',
- metadata={'id': 'foo'},
- positional_metadata={'quality': range(8)})
-
- npt.assert_equal(seq.values, np.array('.-ABCXYZ', dtype='c'))
- self.assertTrue(seq.has_metadata())
- self.assertEqual(seq.metadata['id'], 'foo')
- self.assertTrue(seq.has_positional_metadata())
- npt.assert_equal(seq.positional_metadata['quality'], np.array(range(8),
- dtype='int'))
-
- def test_init_valid_empty_sequence(self):
- # just make sure we can instantiate an empty sequence regardless of
- # `validate` and `lowercase` parameters. more extensive tests
- # are performed in Sequence base class unit tests
- for validate in (True, False):
- for lowercase in (True, False):
- seq = ExampleIUPACSequence('', validate=validate,
- lowercase=lowercase)
- self.assertEqual(seq, ExampleIUPACSequence(''))
-
- def test_init_valid_single_character_sequence(self):
- for validate in (True, False):
- for lowercase in (True, False):
- seq = ExampleIUPACSequence('C', validate=validate,
- lowercase=lowercase)
- self.assertEqual(seq, ExampleIUPACSequence('C'))
-
- def test_init_valid_multiple_character_sequence(self):
- for validate in (True, False):
- for lowercase in (True, False):
- seq = ExampleIUPACSequence('BAACB.XYY-AZ', validate=validate,
- lowercase=lowercase)
- self.assertEqual(seq, ExampleIUPACSequence('BAACB.XYY-AZ'))
-
- def test_init_validate_parameter_single_character(self):
- seq = 'w'
-
- with six.assertRaisesRegex(self, ValueError, "character.*'w'"):
- ExampleIUPACSequence(seq)
-
- # test that we can instantiate an invalid sequence. we don't guarantee
- # anything working beyond instantiation
- ExampleIUPACSequence(seq, validate=False)
-
- def test_init_validate_parameter_multiple_characters(self):
- # mix of valid and invalid characters with repeats and lowercased
- # alphabet characters
- seq = 'CBCBBbawCbbwBXYZ-.x'
-
- with six.assertRaisesRegex(self, ValueError, "\['a', 'b', 'w', 'x'\]"):
- ExampleIUPACSequence(seq)
-
- ExampleIUPACSequence(seq, validate=False)
-
- def test_init_lowercase_all_lowercase(self):
- s = 'cbcbbbazcbbzbxyz-.x'
-
- with six.assertRaisesRegex(self, ValueError,
- "\['a', 'b', 'c', 'x', 'y', 'z'\]"):
- ExampleIUPACSequence(s)
-
- seq = ExampleIUPACSequence(s, lowercase=True)
- self.assertEqual(seq, ExampleIUPACSequence('CBCBBBAZCBBZBXYZ-.X'))
-
- def test_init_lowercase_mixed_case(self):
- s = 'CBCBBbazCbbzBXYZ-.x'
-
- with six.assertRaisesRegex(self, ValueError, "\['a', 'b', 'x', 'z'\]"):
- ExampleIUPACSequence(s)
-
- seq = ExampleIUPACSequence(s, lowercase=True)
- self.assertEqual(seq, ExampleIUPACSequence('CBCBBBAZCBBZBXYZ-.X'))
-
- def test_init_lowercase_no_validation(self):
- s = 'car'
-
- with six.assertRaisesRegex(self, ValueError, "\['a', 'c', 'r'\]"):
- ExampleIUPACSequence(s)
-
- with six.assertRaisesRegex(self, ValueError, "character.*'R'"):
- ExampleIUPACSequence(s, lowercase=True)
-
- ExampleIUPACSequence(s, lowercase=True, validate=False)
-
- def test_init_lowercase_byte_ownership(self):
- bytes = np.array([97, 98, 97], dtype=np.uint8)
-
- with six.assertRaisesRegex(self, ValueError, "\['a', 'b'\]"):
- ExampleIUPACSequence(bytes)
-
- seq = ExampleIUPACSequence(bytes, lowercase=True)
- self.assertEqual(seq, ExampleIUPACSequence('ABA'))
-
- # should not share the same memory
- self.assertIsNot(seq._bytes, bytes)
-
- # we should have copied `bytes` before modifying in place to convert to
- # upper. make sure `bytes` hasn't been mutated
- npt.assert_equal(bytes, np.array([97, 98, 97], dtype=np.uint8))
-
- def test_init_lowercase_invalid_keys(self):
- for invalid_key in ((), [], 2):
- invalid_type = type(invalid_key)
- with six.assertRaisesRegex(self, TypeError,
- "lowercase keyword argument expected "
- "a bool or string, but got %s" %
- invalid_type):
- ExampleIUPACSequence('ACGTacgt', lowercase=invalid_key)
-
- def test_degenerate_chars(self):
- expected = set("XYZ")
- self.assertIs(type(ExampleIUPACSequence.degenerate_chars), set)
- self.assertEqual(ExampleIUPACSequence.degenerate_chars, expected)
-
- ExampleIUPACSequence.degenerate_chars.add("W")
- self.assertEqual(ExampleIUPACSequence.degenerate_chars, expected)
-
- self.assertEqual(ExampleIUPACSequence('').degenerate_chars, expected)
-
- with self.assertRaises(AttributeError):
- ExampleIUPACSequence('').degenerate_chars = set("BAR")
-
- def test_nondegenerate_chars(self):
- expected = set("ABC")
- self.assertEqual(ExampleIUPACSequence.nondegenerate_chars, expected)
-
- ExampleIUPACSequence.degenerate_chars.add("D")
- self.assertEqual(ExampleIUPACSequence.nondegenerate_chars, expected)
-
- self.assertEqual(ExampleIUPACSequence('').nondegenerate_chars,
- expected)
-
- with self.assertRaises(AttributeError):
- ExampleIUPACSequence('').nondegenerate_chars = set("BAR")
-
- def test_gap_chars(self):
- expected = set(".-")
- self.assertIs(type(ExampleIUPACSequence.gap_chars), set)
- self.assertEqual(ExampleIUPACSequence.gap_chars, expected)
-
- ExampleIUPACSequence.gap_chars.add("_")
- self.assertEqual(ExampleIUPACSequence.gap_chars, expected)
-
- self.assertEqual(ExampleIUPACSequence('').gap_chars, expected)
-
- with self.assertRaises(AttributeError):
- ExampleIUPACSequence('').gap_chars = set("_ =")
-
- def test_default_gap_char(self):
- self.assertIs(type(ExampleIUPACSequence.default_gap_char), str)
- self.assertEqual(ExampleIUPACSequence.default_gap_char, '-')
- self.assertEqual(ExampleIUPACSequence('').default_gap_char, '-')
-
- with self.assertRaises(AttributeError):
- ExampleIUPACSequence('').default_gap_char = '.'
-
- def test_alphabet(self):
- expected = set("ABC.-XYZ")
- self.assertIs(type(ExampleIUPACSequence.alphabet), set)
- self.assertEqual(ExampleIUPACSequence.alphabet, expected)
-
- ExampleIUPACSequence.alphabet.add("DEF")
- self.assertEqual(ExampleIUPACSequence.alphabet, expected)
-
- self.assertEqual(ExampleIUPACSequence('').alphabet, expected)
-
- with self.assertRaises(AttributeError):
- ExampleIUPACSequence('').alphabet = set("ABCDEFG.-WXYZ")
-
- def test_degenerate_map(self):
- expected = {"X": set("AB"), "Y": set("BC"), "Z": set("AC")}
- self.assertEqual(ExampleIUPACSequence.degenerate_map, expected)
-
- ExampleIUPACSequence.degenerate_map['W'] = set("ABC")
- ExampleIUPACSequence.degenerate_map['X'] = set("CA")
- self.assertEqual(ExampleIUPACSequence.degenerate_map, expected)
-
- self.assertEqual(ExampleIUPACSequence('').degenerate_map, expected)
-
- with self.assertRaises(AttributeError):
- ExampleIUPACSequence('').degenerate_map = {'W': "ABC"}
-
- def test_gaps(self):
- self.assertIs(type(ExampleIUPACSequence("").gaps()), np.ndarray)
- self.assertIs(ExampleIUPACSequence("").gaps().dtype, np.dtype('bool'))
- npt.assert_equal(ExampleIUPACSequence("ABCXBZYABC").gaps(),
- np.zeros(10).astype(bool))
-
- npt.assert_equal(ExampleIUPACSequence(".-.-.").gaps(),
- np.ones(5).astype(bool))
-
- npt.assert_equal(ExampleIUPACSequence("A.B-C.X-Y.").gaps(),
- np.array([0, 1] * 5, dtype=bool))
-
- npt.assert_equal(ExampleIUPACSequence("AB.AC.XY-").gaps(),
- np.array([0, 0, 1] * 3, dtype=bool))
-
- npt.assert_equal(ExampleIUPACSequence("A.BC.-").gaps(),
- np.array([0, 1, 0, 0, 1, 1], dtype=bool))
-
- def test_has_gaps(self):
- self.assertIs(type(ExampleIUPACSequence("").has_gaps()), bool)
- self.assertIs(type(ExampleIUPACSequence("-").has_gaps()), bool)
-
- self.assertFalse(ExampleIUPACSequence("").has_gaps())
- self.assertFalse(ExampleIUPACSequence("ABCXYZ").has_gaps())
-
- self.assertTrue(ExampleIUPACSequence("-").has_gaps())
- self.assertTrue(ExampleIUPACSequence("ABCXYZ-").has_gaps())
-
- def test_degenerates(self):
- self.assertIs(type(ExampleIUPACSequence("").degenerates()), np.ndarray)
- self.assertIs(ExampleIUPACSequence("").degenerates().dtype,
- np.dtype('bool'))
-
- npt.assert_equal(ExampleIUPACSequence("ABCBC-.AB.").degenerates(),
- np.zeros(10).astype(bool))
-
- npt.assert_equal(ExampleIUPACSequence("ZYZYZ").degenerates(),
- np.ones(5).astype(bool))
-
- npt.assert_equal(ExampleIUPACSequence("AX.Y-ZBXCZ").degenerates(),
- np.array([0, 1] * 5, dtype=bool))
-
- npt.assert_equal(ExampleIUPACSequence("ABXACY.-Z").degenerates(),
- np.array([0, 0, 1] * 3, dtype=bool))
-
- npt.assert_equal(ExampleIUPACSequence("AZBCXY").degenerates(),
- np.array([0, 1, 0, 0, 1, 1], dtype=bool))
-
- def test_has_degenerates(self):
- self.assertIs(type(ExampleIUPACSequence("").has_degenerates()), bool)
- self.assertIs(type(ExampleIUPACSequence("X").has_degenerates()), bool)
-
- self.assertFalse(ExampleIUPACSequence("").has_degenerates())
- self.assertFalse(ExampleIUPACSequence("A-.BC").has_degenerates())
-
- self.assertTrue(ExampleIUPACSequence("Z").has_degenerates())
- self.assertTrue(ExampleIUPACSequence("ABC.XYZ-").has_degenerates())
-
- def test_nondegenerates(self):
- self.assertIs(type(ExampleIUPACSequence("").nondegenerates()),
- np.ndarray)
- self.assertIs(ExampleIUPACSequence("").nondegenerates().dtype,
- np.dtype('bool'))
-
- npt.assert_equal(ExampleIUPACSequence("XYZYZ-.XY.").nondegenerates(),
- np.zeros(10).astype(bool))
-
- npt.assert_equal(ExampleIUPACSequence("ABABA").nondegenerates(),
- np.ones(5).astype(bool))
-
- npt.assert_equal(ExampleIUPACSequence("XA.B-AZCXA").nondegenerates(),
- np.array([0, 1] * 5, dtype=bool))
-
- npt.assert_equal(ExampleIUPACSequence("XXAZZB.-C").nondegenerates(),
- np.array([0, 0, 1] * 3, dtype=bool))
-
- npt.assert_equal(ExampleIUPACSequence("YB.-AC").nondegenerates(),
- np.array([0, 1, 0, 0, 1, 1], dtype=bool))
-
- def test_has_nondegenerates(self):
- self.assertIs(type(ExampleIUPACSequence("").has_nondegenerates()),
- bool)
- self.assertIs(type(ExampleIUPACSequence("A").has_nondegenerates()),
- bool)
-
- self.assertFalse(ExampleIUPACSequence("").has_nondegenerates())
- self.assertFalse(ExampleIUPACSequence("X-.YZ").has_nondegenerates())
-
- self.assertTrue(ExampleIUPACSequence("C").has_nondegenerates())
- self.assertTrue(ExampleIUPACSequence(".XYZ-ABC").has_nondegenerates())
-
- def test_degap(self):
- kw = {
- 'metadata': {
- 'id': 'some_id',
- 'description': 'some description',
- },
- }
-
- self.assertEqual(
- ExampleIUPACSequence("", positional_metadata={'qual': []},
- **kw).degap(),
- ExampleIUPACSequence("", positional_metadata={'qual': []},
- **kw))
-
- self.assertEqual(
- ExampleIUPACSequence(
- "ABCXYZ",
- positional_metadata={'qual': np.arange(6)},
- **kw).degap(),
- ExampleIUPACSequence(
- "ABCXYZ",
- positional_metadata={'qual': np.arange(6)},
- **kw))
-
- self.assertEqual(
- ExampleIUPACSequence(
- "ABC-XYZ",
- positional_metadata={'qual': np.arange(7)},
- **kw).degap(),
- ExampleIUPACSequence(
- "ABCXYZ",
- positional_metadata={'qual': [0, 1, 2, 4, 5, 6]},
- **kw))
-
- self.assertEqual(
- ExampleIUPACSequence(
- ".-ABC-XYZ.",
- positional_metadata={'qual': np.arange(10)},
- **kw).degap(),
- ExampleIUPACSequence(
- "ABCXYZ",
- positional_metadata={'qual': [2, 3, 4, 6, 7, 8]},
- **kw))
-
- self.assertEqual(
- ExampleIUPACSequence(
- "---.-.-.-.-.",
- positional_metadata={'quality': np.arange(12)},
- **kw).degap(),
- ExampleIUPACSequence(
- "",
- positional_metadata={'quality': np.array([], dtype=np.int64)},
- **kw))
-
- def test_expand_degenerates_no_degens(self):
- seq = ExampleIUPACSequence("ABCABCABC")
- self.assertEqual(list(seq.expand_degenerates()), [seq])
-
- def test_expand_degenerates_all_degens(self):
- exp = [ExampleIUPACSequence('ABA'), ExampleIUPACSequence('ABC'),
- ExampleIUPACSequence('ACA'), ExampleIUPACSequence('ACC'),
- ExampleIUPACSequence('BBA'), ExampleIUPACSequence('BBC'),
- ExampleIUPACSequence('BCA'), ExampleIUPACSequence('BCC')]
- # Sort based on sequence string, as order is not guaranteed.
- obs = sorted(ExampleIUPACSequence('XYZ').expand_degenerates(), key=str)
- self.assertEqual(obs, exp)
-
- def test_expand_degenerates_with_metadata(self):
- kw = {
- "metadata": {
- "id": "some_id",
- "description": "some description"
- },
- "positional_metadata": {
- "quality": np.arange(3),
- },
- }
- exp = [ExampleIUPACSequence('ABA', **kw),
- ExampleIUPACSequence('ABC', **kw),
- ExampleIUPACSequence('BBA', **kw),
- ExampleIUPACSequence('BBC', **kw)]
- obs = sorted(ExampleIUPACSequence('XBZ', **kw).expand_degenerates(),
- key=str)
- self.assertEqual(obs, exp)
-
- def test_to_regex_no_degens(self):
- seq = ExampleIUPACSequence('ABC')
- regex = seq.to_regex()
- self.assertEqual(regex.pattern, str(seq))
-
- def test_to_regex_with_degens(self):
- seq = ExampleIUPACSequence('AYZ')
- regex = seq.to_regex()
- self.assertFalse(any(regex.match(s) is None
- for s in 'ABA ABC ACA ACC'.split()))
- self.assertTrue(all(regex.match(s) is None
- for s in 'CBA BBA ABB AAA'.split()))
-
- def test_find_motifs_no_motif(self):
- seq = ExampleMotifsTester("ABCABCABC")
- with self.assertRaises(ValueError) as cm:
- seq.find_motifs("doesn't-exist")
- self.assertIn("doesn't-exist", str(cm.exception))
-
- seq = ExampleIUPACSequence("ABCABCABC")
- with self.assertRaises(ValueError) as cm:
- seq.find_motifs("doesn't-exist")
- self.assertIn("doesn't-exist", str(cm.exception))
-
- def test_find_motifs(self):
- seq = ExampleMotifsTester("ABC")
- self.assertEqual(seq.find_motifs("name1"), "ABC")
- self.assertEqual(seq.find_motifs("name2"), 3)
-
- def test_repr(self):
- # basic sanity checks for custom repr stats. more extensive testing is
- # performed on Sequence.__repr__
-
- # minimal
- obs = repr(ExampleIUPACSequence(''))
- self.assertEqual(obs.count('\n'), 7)
- self.assertTrue(obs.startswith('ExampleIUPACSequence'))
- self.assertIn('length: 0', obs)
- self.assertIn('has gaps: False', obs)
- self.assertIn('has degenerates: False', obs)
- self.assertIn('has non-degenerates: False', obs)
- self.assertTrue(obs.endswith('-'))
-
- # no metadata, mix of gaps, degenerates, and non-degenerates
- obs = repr(ExampleIUPACSequence('AY-B'))
- self.assertEqual(obs.count('\n'), 8)
- self.assertTrue(obs.startswith('ExampleIUPACSequence'))
- self.assertIn('length: 4', obs)
- self.assertIn('has gaps: True', obs)
- self.assertIn('has degenerates: True', obs)
- self.assertIn('has non-degenerates: True', obs)
- self.assertTrue(obs.endswith('0 AY-B'))
-
- # metadata and positional metadata of mixed types
- obs = repr(
- ExampleIUPACSequence(
- 'ABCA',
- metadata={'foo': 42, u'bar': 33.33, None: True, False: {},
- (1, 2): 3, 'acb' * 100: "'"},
- positional_metadata={'foo': range(4),
- 42: ['a', 'b', [], 'c']}))
- self.assertEqual(obs.count('\n'), 18)
- self.assertTrue(obs.startswith('ExampleIUPACSequence'))
- self.assertIn('None: True', obs)
- self.assertIn('\'foo\': 42', obs)
- self.assertIn('42: <dtype: object>', obs)
- self.assertIn('\'foo\': <dtype: int64>', obs)
- self.assertIn('length: 4', obs)
- self.assertIn('has gaps: False', obs)
- self.assertIn('has degenerates: False', obs)
- self.assertIn('has non-degenerates: True', obs)
- self.assertTrue(obs.endswith('0 ABCA'))
-
- # sequence spanning > 5 lines
- obs = repr(ExampleIUPACSequence('A' * 301))
- self.assertEqual(obs.count('\n'), 12)
- self.assertTrue(obs.startswith('ExampleIUPACSequence'))
- self.assertIn('length: 301', obs)
- self.assertIn('has gaps: False', obs)
- self.assertIn('has degenerates: False', obs)
- self.assertIn('has non-degenerates: True', obs)
- self.assertIn('...', obs)
- self.assertTrue(obs.endswith('300 A'))
-
-
-if __name__ == "__main__":
- main()
diff --git a/skbio/sequence/tests/test_nucleotide_sequences.py b/skbio/sequence/tests/test_nucleotide_sequences.py
index ce03b19..367c794 100644
--- a/skbio/sequence/tests/test_nucleotide_sequences.py
+++ b/skbio/sequence/tests/test_nucleotide_sequences.py
@@ -15,6 +15,8 @@ import numpy as np
from skbio import DNA, RNA, Protein, GeneticCode
from skbio.sequence._nucleotide_mixin import NucleotideMixin
+from skbio.sequence import GrammaredSequence
+from skbio.util import classproperty
# This file contains tests for functionality of sequence types which implement
@@ -408,12 +410,29 @@ class TestNucelotideSequence(unittest.TestCase):
def test_is_reverse_complement_type_mismatch(self):
for Class in (DNA, RNA):
- class Subclass(Class):
- pass
+ class DifferentSequenceClass(GrammaredSequence):
+ @classproperty
+ def degenerate_map(cls):
+ return {"X": set("AB")}
+
+ @classproperty
+ def nondegenerate_chars(cls):
+ return set("ABC")
+
+ @classproperty
+ def default_gap_char(cls):
+ return '-'
+
+ @classproperty
+ def gap_chars(cls):
+ return set('-.')
+
seq1 = Class('ABC')
- seq2 = Subclass('ABC')
+ seq2 = DifferentSequenceClass('ABC')
- with self.assertRaises(TypeError):
+ with six.assertRaisesRegex(self, TypeError,
+ "Cannot use.*and "
+ "DifferentSequenceClass together"):
seq1.is_reverse_complement(seq2)
def test_motif_purine_run(self):
diff --git a/skbio/sequence/tests/test_protein.py b/skbio/sequence/tests/test_protein.py
index 87c3159..7af26ef 100644
--- a/skbio/sequence/tests/test_protein.py
+++ b/skbio/sequence/tests/test_protein.py
@@ -7,6 +7,7 @@
# ----------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
+import six
import unittest
@@ -119,6 +120,11 @@ class TestProtein(unittest.TestCase):
self.assertIn('has gaps: False', obs)
self.assertIn('has stops: True', obs)
+ def test_cannot_subclass(self):
+ with six.assertRaisesRegex(self, TypeError, "Subclassing disabled"):
+ class CustomSequence(Protein):
+ pass
+
if __name__ == "__main__":
unittest.main()
diff --git a/skbio/sequence/tests/test_rna.py b/skbio/sequence/tests/test_rna.py
index b9e8240..ebefb55 100644
--- a/skbio/sequence/tests/test_rna.py
+++ b/skbio/sequence/tests/test_rna.py
@@ -7,6 +7,7 @@
# ----------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
+import six
import unittest
@@ -40,6 +41,10 @@ class TestRNA(unittest.TestCase):
self.assertEqual(seq.reverse_transcribe(), DNA('ATAT'))
self.assertEqual(seq, RNA('AUAU'))
+ def test_cannot_subclass(self):
+ with six.assertRaisesRegex(self, TypeError, "Subclassing disabled"):
+ class CustomSequence(RNA):
+ pass
if __name__ == '__main__':
unittest.main()
diff --git a/skbio/sequence/tests/test_sequence.py b/skbio/sequence/tests/test_sequence.py
index f23a991..477e657 100644
--- a/skbio/sequence/tests/test_sequence.py
+++ b/skbio/sequence/tests/test_sequence.py
@@ -12,6 +12,7 @@ from six.moves import zip_longest
import copy
import functools
+import itertools
import re
from types import GeneratorType
from collections import Hashable
@@ -20,8 +21,10 @@ from unittest import TestCase, main
import numpy as np
import numpy.testing as npt
import pandas as pd
+import scipy.spatial.distance
-from skbio import Sequence
+import skbio.sequence.distance
+from skbio import Sequence, DNA
from skbio.util import assert_data_frame_almost_equal
from skbio.sequence._sequence import (_single_index_to_slice, _is_single_index,
_as_slice_if_single_index)
@@ -53,15 +56,21 @@ class TestSequencePositionalMetadata(TestCase, ReallyEqualMixin,
self._positional_metadata_constructor_ = factory
-class TestSequence(TestCase, ReallyEqualMixin):
+class TestSequenceBase(TestCase):
def setUp(self):
- self.lowercase_seq = Sequence('AAAAaaaa', lowercase='key')
self.sequence_kinds = frozenset([
str, Sequence, lambda s: np.fromstring(s, dtype='|S1'),
lambda s: np.fromstring(s, dtype=np.uint8)])
+
+class TestSequence(TestSequenceBase, ReallyEqualMixin):
+ def setUp(self):
+ super(TestSequence, self).setUp()
+
+ self.lowercase_seq = Sequence('AAAAaaaa', lowercase='key')
+
def empty_generator():
- raise StopIteration()
+ return
yield
self.getitem_empty_indices = [
@@ -454,6 +463,19 @@ class TestSequence(TestCase, ReallyEqualMixin):
with self.assertRaises(AttributeError):
seq.values = np.array("GGGG", dtype='c')
+ def test_sequence_numpy_compatibility(self):
+ seq = Sequence('abc123')
+
+ array = np.asarray(seq)
+
+ self.assertIsInstance(array, np.ndarray)
+ self.assertEqual(array.dtype, '|S1')
+ npt.assert_equal(array, np.array('abc123', dtype='c'))
+ npt.assert_equal(array, seq.values)
+
+ with self.assertRaises(ValueError):
+ array[1] = 'B'
+
def test_observed_chars_property(self):
self.assertEqual(Sequence('').observed_chars, set())
self.assertEqual(Sequence('x').observed_chars, {'x'})
@@ -1155,50 +1177,6 @@ class TestSequence(TestCase, ReallyEqualMixin):
self.assertEqual('AaAAaAAA',
self.lowercase_seq.lowercase([1, 4]))
- def test_distance(self):
- tested = 0
- for constructor in self.sequence_kinds:
- tested += 1
- seq1 = Sequence("abcdef")
- seq2 = constructor("12bcef")
-
- self.assertIsInstance(seq1.distance(seq1), float)
- self.assertEqual(seq1.distance(seq2), 2.0/3.0)
-
- self.assertEqual(tested, 4)
-
- def test_distance_arbitrary_function(self):
- def metric(x, y):
- return len(x) ** 2 + len(y) ** 2
-
- seq1 = Sequence("12345678")
- seq2 = Sequence("1234")
- result = seq1.distance(seq2, metric=metric)
- self.assertIsInstance(result, float)
- self.assertEqual(result, 80.0)
-
- def test_distance_default_metric(self):
- seq1 = Sequence("abcdef")
- seq2 = Sequence("12bcef")
- seq_wrong = Sequence("abcdefghijklmnop")
-
- self.assertIsInstance(seq1.distance(seq1), float)
- self.assertEqual(seq1.distance(seq1), 0.0)
- self.assertEqual(seq1.distance(seq2), 2.0/3.0)
-
- with self.assertRaises(ValueError):
- seq1.distance(seq_wrong)
-
- with self.assertRaises(ValueError):
- seq_wrong.distance(seq1)
-
- def test_distance_on_subclass(self):
- seq1 = Sequence("abcdef")
- seq2 = SequenceSubclass("12bcef")
-
- with self.assertRaises(TypeError):
- seq1.distance(seq2)
-
def test_matches(self):
tested = 0
for constructor in self.sequence_kinds:
@@ -2223,6 +2201,148 @@ class TestSequence(TestCase, ReallyEqualMixin):
seq._munge_to_bytestring(input_, 'dummy_method')
+class TestDistance(TestSequenceBase):
+ def test_mungeable_inputs_to_sequence(self):
+ def metric(a, b):
+ self.assertEqual(a, Sequence("abcdef"))
+ self.assertEqual(b, Sequence("12bcef"))
+ return 42.0
+
+ for constructor in self.sequence_kinds:
+ seq1 = Sequence("abcdef")
+ seq2 = constructor("12bcef")
+
+ distance = seq1.distance(seq2, metric=metric)
+
+ self.assertEqual(distance, 42.0)
+
+ def test_mungeable_inputs_to_sequence_subclass(self):
+ def metric(a, b):
+ self.assertEqual(a, SequenceSubclass("abcdef"))
+ self.assertEqual(b, SequenceSubclass("12bcef"))
+ return -42.0
+
+ sequence_kinds = frozenset([
+ str, SequenceSubclass, lambda s: np.fromstring(s, dtype='|S1'),
+ lambda s: np.fromstring(s, dtype=np.uint8)])
+
+ for constructor in sequence_kinds:
+ seq1 = SequenceSubclass("abcdef")
+ seq2 = constructor("12bcef")
+
+ distance = seq1.distance(seq2, metric=metric)
+
+ self.assertEqual(distance, -42.0)
+
+ def test_sequence_type_mismatch(self):
+ seq1 = SequenceSubclass("abcdef")
+ seq2 = Sequence("12bcef")
+
+ with six.assertRaisesRegex(self, TypeError,
+ 'SequenceSubclass.*Sequence.*`distance`'):
+ seq1.distance(seq2)
+
+ with six.assertRaisesRegex(self, TypeError,
+ 'Sequence.*SequenceSubclass.*`distance`'):
+ seq2.distance(seq1)
+
+ def test_munging_invalid_characters_to_self_type(self):
+ with six.assertRaisesRegex(self, ValueError, 'Invalid characters.*X'):
+ DNA("ACGT").distance("WXYZ")
+
+ def test_munging_invalid_type_to_self_type(self):
+ with self.assertRaises(TypeError):
+ Sequence("ACGT").distance(42)
+
+ def test_return_type_coercion(self):
+ def metric(a, b):
+ return 42
+
+ distance = Sequence('abc').distance('cba', metric=metric)
+
+ self.assertIsInstance(distance, float)
+
+ def test_invalid_return_type(self):
+ def metric(a, b):
+ return 'too far'
+
+ with six.assertRaisesRegex(self, ValueError, 'string.*float'):
+ Sequence('abc').distance('cba', metric=metric)
+
+ def test_arbitrary_metric(self):
+ def metric(x, y):
+ return len(x) ** 2 + len(y) ** 2
+
+ seq1 = Sequence("12345678")
+ seq2 = Sequence("1234")
+
+ distance = seq1.distance(seq2, metric=metric)
+
+ self.assertEqual(distance, 80.0)
+
+ def test_scipy_hamming_metric_with_metadata(self):
+ # test for #1254
+ seqs1 = [
+ Sequence("ACGT"),
+ Sequence("ACGT", metadata={'id': 'abc'}),
+ Sequence("ACGT", positional_metadata={'qual': range(4)})
+ ]
+ seqs2 = [
+ Sequence("AAAA"),
+ Sequence("AAAA", metadata={'id': 'def'}),
+ Sequence("AAAA", positional_metadata={'qual': range(4, 8)})
+ ]
+
+ for seqs in seqs1, seqs2:
+ for seq1, seq2 in itertools.product(seqs, repeat=2):
+ distance = seq1.distance(seq2,
+ metric=scipy.spatial.distance.hamming)
+ self.assertEqual(distance, 0.0)
+
+ for seq1, seq2 in itertools.product(seqs1, seqs2):
+ distance = seq1.distance(seq2,
+ metric=scipy.spatial.distance.hamming)
+ self.assertEqual(distance, 0.75)
+
+ def test_default_metric_with_metadata(self):
+ # test for #1254
+ seqs1 = [
+ Sequence("ACGT"),
+ Sequence("ACGT", metadata={'id': 'abc'}),
+ Sequence("ACGT", positional_metadata={'qual': range(4)})
+ ]
+ seqs2 = [
+ Sequence("AAAA"),
+ Sequence("AAAA", metadata={'id': 'def'}),
+ Sequence("AAAA", positional_metadata={'qual': range(4, 8)})
+ ]
+
+ for seqs in seqs1, seqs2:
+ for seq1, seq2 in itertools.product(seqs, repeat=2):
+ distance = seq1.distance(seq2)
+ self.assertEqual(distance, 0.0)
+
+ for seq1, seq2 in itertools.product(seqs1, seqs2):
+ distance = seq1.distance(seq2)
+ self.assertEqual(distance, 0.75)
+
+ def test_default_metric_matches_hamming(self):
+ seq1 = Sequence("abcdef")
+ seq2 = Sequence("12bcef")
+ seq_wrong = Sequence("abcdefghijklmnop")
+
+ distance1 = seq1.distance(seq2)
+ distance2 = skbio.sequence.distance.hamming(seq1, seq2)
+
+ self.assertEqual(distance1, distance2)
+
+ with self.assertRaises(ValueError):
+ seq1.distance(seq_wrong)
+
+ with self.assertRaises(ValueError):
+ seq_wrong.distance(seq1)
+
+
# NOTE: this must be a *separate* class for doctests only (no unit tests). nose
# will not run the unit tests otherwise
#
diff --git a/skbio/stats/composition.py b/skbio/stats/composition.py
index dd35507..ba5bde4 100644
--- a/skbio/stats/composition.py
+++ b/skbio/stats/composition.py
@@ -124,6 +124,15 @@ def closure(mat):
A matrix of proportions where all of the values
are nonzero and each composition (row) adds up to 1
+ Raises
+ ------
+ ValueError
+ Raises an error if any values are negative.
+ ValueError
+ Raises an error if the matrix has more than 2 dimension.
+ ValueError
+ Raises an error if there is a row that has all zeros.
+
Examples
--------
>>> import numpy as np
@@ -139,6 +148,8 @@ def closure(mat):
raise ValueError("Cannot have negative proportions")
if mat.ndim > 2:
raise ValueError("Input matrix can only have two dimensions or less")
+ if np.all(mat == 0, axis=1).sum() > 0:
+ raise ValueError("Input matrix cannot have rows with all zeros")
mat = mat / mat.sum(axis=1, keepdims=True)
return mat.squeeze()
@@ -170,6 +181,16 @@ def multiplicative_replacement(mat, delta=None):
A matrix of proportions where all of the values
are nonzero and each composition (row) adds up to 1
+ Raises
+ ------
+ ValueError
+ Raises an error if negative proportions are created due to a large
+ `delta`.
+
+ Notes
+ -----
+ This method will result in negative proportions if a large delta is chosen.
+
References
----------
.. [1] J. A. Martin-Fernandez. "Dealing With Zeros and Missing Values in
@@ -196,6 +217,9 @@ def multiplicative_replacement(mat, delta=None):
delta = (1. / num_feats)**2
zcnts = 1 - tot * delta
+ if np.any(zcnts) < 0:
+ raise ValueError('The multiplicative replacment created negative '
+ 'proportions. Consider using a smaller `delta`.')
mat = np.where(z_mat, delta, zcnts * mat)
return mat.squeeze()
@@ -381,8 +405,8 @@ def inner(x, y):
>>> from skbio.stats.composition import inner
>>> x = np.array([.1, .3, .4, .2])
>>> y = np.array([.2, .4, .2, .2])
- >>> inner(x, y)
- 0.21078524737545556
+ >>> inner(x, y) # doctest: +ELLIPSIS
+ 0.2107852473...
"""
x = closure(x)
y = closure(y)
diff --git a/skbio/stats/distance/_base.py b/skbio/stats/distance/_base.py
index b1a72c7..b1ecf22 100644
--- a/skbio/stats/distance/_base.py
+++ b/skbio/stats/distance/_base.py
@@ -38,7 +38,6 @@ class DistanceMatrixError(DissimilarityMatrixError):
class MissingIDError(DissimilarityMatrixError):
"""Error for ID lookup that doesn't exist in the dissimilarity matrix."""
- @experimental(as_of="0.4.0")
def __init__(self, missing_id):
super(MissingIDError, self).__init__()
self.args = ("The ID '%s' is not in the dissimilarity matrix." %
@@ -816,7 +815,8 @@ class DistanceMatrix(DissimilarityMatrix):
super(DistanceMatrix, self)._validate(data, ids)
if (data.T != data).any():
- raise DistanceMatrixError("Data must be symmetric.")
+ raise DistanceMatrixError(
+ "Data must be symmetric and cannot contain NaNs.")
@experimental(as_of="0.4.0")
diff --git a/skbio/stats/distance/tests/test_base.py b/skbio/stats/distance/tests/test_base.py
index fabb70e..71e7bde 100644
--- a/skbio/stats/distance/tests/test_base.py
+++ b/skbio/stats/distance/tests/test_base.py
@@ -12,13 +12,16 @@ from six import StringIO, binary_type, text_type
from unittest import TestCase, main
+import six
import matplotlib as mpl
import numpy as np
import numpy.testing as npt
import pandas as pd
+import scipy.spatial.distance
from IPython.core.display import Image, SVG
-from skbio import DistanceMatrix
+import skbio.sequence.distance
+from skbio import DistanceMatrix, Sequence
from skbio.stats.distance import (
DissimilarityMatrixError, DistanceMatrixError, MissingIDError,
DissimilarityMatrix, randdm)
@@ -489,6 +492,10 @@ class DistanceMatrixTests(DissimilarityMatrixTestData):
with self.assertRaises(DissimilarityMatrixError):
DistanceMatrix([[1, 2, 3]], ['a'])
+ def test_init_nans(self):
+ with six.assertRaisesRegex(self, DistanceMatrixError, 'NaNs'):
+ DistanceMatrix([[0.0, np.nan], [np.nan, 0.0]], ['a', 'b'])
+
def test_from_iterable_no_key(self):
iterable = (x for x in range(4))
@@ -536,6 +543,50 @@ class DistanceMatrixTests(DissimilarityMatrixTestData):
DistanceMatrix.from_iterable(iterable, lambda a, b: abs(b - a),
key=str, keys=['1', '2', '3', '4'])
+ def test_from_iterable_scipy_hamming_metric_with_metadata(self):
+ # test for #1254
+ seqs = [
+ Sequence('ACGT'),
+ Sequence('ACGA', metadata={'id': 'seq1'}),
+ Sequence('AAAA', metadata={'id': 'seq2'}),
+ Sequence('AAAA', positional_metadata={'qual': range(4)})
+ ]
+
+ exp = DistanceMatrix([
+ [0, 0.25, 0.75, 0.75],
+ [0.25, 0.0, 0.5, 0.5],
+ [0.75, 0.5, 0.0, 0.0],
+ [0.75, 0.5, 0.0, 0.0]], ['a', 'b', 'c', 'd'])
+
+ dm = DistanceMatrix.from_iterable(
+ seqs,
+ metric=scipy.spatial.distance.hamming,
+ keys=['a', 'b', 'c', 'd'])
+
+ self.assertEqual(dm, exp)
+
+ def test_from_iterable_skbio_hamming_metric_with_metadata(self):
+ # test for #1254
+ seqs = [
+ Sequence('ACGT'),
+ Sequence('ACGA', metadata={'id': 'seq1'}),
+ Sequence('AAAA', metadata={'id': 'seq2'}),
+ Sequence('AAAA', positional_metadata={'qual': range(4)})
+ ]
+
+ exp = DistanceMatrix([
+ [0, 0.25, 0.75, 0.75],
+ [0.25, 0.0, 0.5, 0.5],
+ [0.75, 0.5, 0.0, 0.0],
+ [0.75, 0.5, 0.0, 0.0]], ['a', 'b', 'c', 'd'])
+
+ dm = DistanceMatrix.from_iterable(
+ seqs,
+ metric=skbio.sequence.distance.hamming,
+ keys=['a', 'b', 'c', 'd'])
+
+ self.assertEqual(dm, exp)
+
def test_condensed_form(self):
for dm, condensed in zip(self.dms, self.dm_condensed_forms):
obs = dm.condensed_form()
diff --git a/skbio/stats/ordination/tests/test_redundancy_analysis.py b/skbio/stats/ordination/tests/test_redundancy_analysis.py
index 36a767a..10c0d88 100644
--- a/skbio/stats/ordination/tests/test_redundancy_analysis.py
+++ b/skbio/stats/ordination/tests/test_redundancy_analysis.py
@@ -106,6 +106,7 @@ class TestRDAResults(TestCase):
eigvals=eigvals)
assert_ordination_results_equal(scores, exp,
+ ignore_directionality=True,
ignore_biplot_scores_labels=True,
decimal=6)
@@ -163,6 +164,7 @@ class TestRDAResults(TestCase):
eigvals=eigvals)
assert_ordination_results_equal(scores, exp,
+ ignore_directionality=True,
ignore_biplot_scores_labels=True,
decimal=6)
diff --git a/skbio/stats/power.py b/skbio/stats/power.py
index 3567d66..3fbe84b 100644
--- a/skbio/stats/power.py
+++ b/skbio/stats/power.py
@@ -261,7 +261,7 @@ def subsample_power(test, samples, draw_mode='ind', alpha_pwr=0.05, ratio=None,
`scipy.stats.chisquare` to look for the difference in frequency between
groups.
- >>> from scipy.stats import chisquare, nanmean
+ >>> from scipy.stats import chisquare
>>> test = lambda x: chisquare(np.array([x[i].sum() for i in
... range(len(x))]))[1]
@@ -287,10 +287,10 @@ def subsample_power(test, samples, draw_mode='ind', alpha_pwr=0.05, ratio=None,
... counts_interval=5)
>>> counts
array([ 5, 10, 15, 20, 25, 30, 35, 40, 45])
- >>> nanmean(pwr_est, 0) # doctest: +NORMALIZE_WHITESPACE
+ >>> np.nanmean(pwr_est, axis=0) # doctest: +NORMALIZE_WHITESPACE
array([ 0.056, 0.074, 0.226, 0.46 , 0.61 , 0.806, 0.952, 1. ,
1. ])
- >>> counts[nanmean(pwr_est, 0) > 0.8].min()
+ >>> counts[np.nanmean(pwr_est, axis=0) > 0.8].min()
30
So, we can estimate that we will see a significant difference in the
@@ -338,9 +338,9 @@ def subsample_power(test, samples, draw_mode='ind', alpha_pwr=0.05, ratio=None,
... ratio=[2, 1])
>>> counts2
array([ 5., 10., 15., 20., 25., 30.])
- >>> nanmean(pwr_est2, 0)
+ >>> np.nanmean(pwr_est2, axis=0)
array([ 0.14 , 0.272, 0.426, 0.646, 0.824, 0.996])
- >>> counts2[nanmean(pwr_est2, 0) > 0.8].min()
+ >>> counts2[np.nanmean(pwr_est2, axis=0) > 0.8].min()
25.0
When we consider the number of samples per group needed in the power
@@ -610,7 +610,9 @@ def confidence_bound(vec, alpha=0.05, df=None, axis=None):
df = num_counts - 1
# Calculates the bound
- bound = scipy.stats.nanstd(vec, axis=axis) / np.sqrt(num_counts - 1) * \
+ # In the conversion from scipy.stats.nanstd -> np.nanstd `ddof=1` had to be
+ # added to match the scipy default of `bias=False`.
+ bound = np.nanstd(vec, axis=axis, ddof=1) / np.sqrt(num_counts - 1) * \
scipy.stats.t.ppf(1 - alpha / 2, df)
return bound
diff --git a/skbio/stats/tests/test_composition.py b/skbio/stats/tests/test_composition.py
index 00e5ac1..f8e9f3c 100644
--- a/skbio/stats/tests/test_composition.py
+++ b/skbio/stats/tests/test_composition.py
@@ -12,7 +12,6 @@ from unittest import TestCase, main
import numpy as np
import numpy.testing as npt
import pandas.util.testing as pdt
-
from numpy.random import normal
import pandas as pd
import scipy
@@ -79,6 +78,14 @@ class CompositionTests(TestCase):
closure(self.cdata2)
npt.assert_allclose(self.cdata2, np.array([2, 2, 6]))
+ def test_closure_warning(self):
+ with self.assertRaises(ValueError):
+ closure([0., 0., 0.])
+
+ with self.assertRaises(ValueError):
+ closure([[0., 0., 0.],
+ [0., 5., 5.]])
+
def test_perturb(self):
pmat = perturb(closure(self.cdata1),
closure(np.array([1, 1, 1])))
@@ -214,6 +221,10 @@ class CompositionTests(TestCase):
multiplicative_replacement(self.cdata4)
npt.assert_allclose(self.cdata4, np.array([1, 2, 3, 0, 5]))
+ def multiplicative_replacement_warning(self):
+ with self.assertRaises(ValueError):
+ multiplicative_replacement([0, 1, 2], delta=1)
+
def test_clr(self):
cmat = clr(closure(self.cdata1))
A = np.array([.2, .2, .6])
@@ -324,7 +335,6 @@ class AncomTests(TestCase):
[10, 11, 10, 10, 10, 11],
[10, 13, 10, 10, 10, 12]]).T
self.cats1 = pd.Series([0, 0, 0, 1, 1, 1])
-
# Real valued data with 2 groupings
D, L = 40, 80
np.random.seed(0)
@@ -622,8 +632,8 @@ class AncomTests(TestCase):
result = ancom(self.table1,
self.cats1,
multiple_comparisons_correction=None,
- significance_test=scipy.stats.mannwhitneyu)
- exp = pd.DataFrame({'W': np.array([6, 6, 2, 2, 2, 2, 2]),
+ significance_test=scipy.stats.ttest_ind)
+ exp = pd.DataFrame({'W': np.array([5, 5, 2, 2, 2, 2, 2]),
'reject': np.array([True, True, False, False,
False, False, False],
dtype=bool)})
@@ -633,7 +643,7 @@ class AncomTests(TestCase):
result = ancom(self.table2,
self.cats2,
multiple_comparisons_correction=None,
- significance_test=scipy.stats.mannwhitneyu)
+ significance_test=scipy.stats.ttest_ind)
exp = pd.DataFrame({'W': np.array([8, 8, 3, 3,
8, 3, 3, 3, 3]),
'reject': np.array([True, True, False, False,
diff --git a/skbio/stats/tests/test_gradient.py b/skbio/stats/tests/test_gradient.py
index 96fb233..872d73b 100644
--- a/skbio/stats/tests/test_gradient.py
+++ b/skbio/stats/tests/test_gradient.py
@@ -228,7 +228,7 @@ class GradientTests(BaseTests):
's7': np.array([7]),
's8': np.array([8])},
orient='index')
- trajectory.sort(columns=0, inplace=True)
+ trajectory.sort_values(by=0, inplace=True)
w_vector = pd.Series(np.array([1, 5, 8, 12, 45, 80, 85, 90]),
['s1', 's2', 's3', 's4',
's5', 's6', 's7', 's8']).astype(np.float64)
@@ -242,7 +242,7 @@ class GradientTests(BaseTests):
's8': np.array([20.3428571428])},
orient='index')
obs = _weight_by_vector(trajectory, w_vector)
- assert_data_frame_almost_equal(obs.sort(axis=0), exp.sort(axis=0))
+ assert_data_frame_almost_equal(obs.sort_index(), exp.sort_index())
trajectory = pd.DataFrame.from_dict({'s1': np.array([1]),
's2': np.array([2]),
@@ -253,7 +253,7 @@ class GradientTests(BaseTests):
's7': np.array([7]),
's8': np.array([8])},
orient='index')
- trajectory.sort(columns=0, inplace=True)
+ trajectory.sort_values(by=0, inplace=True)
w_vector = pd.Series(np.array([1, 2, 3, 4, 5, 6, 7, 8]),
['s1', 's2', 's3', 's4',
's5', 's6', 's7', 's8']).astype(np.float64)
@@ -264,7 +264,7 @@ class GradientTests(BaseTests):
},
orient='index')
obs = _weight_by_vector(trajectory, w_vector)
- assert_data_frame_almost_equal(obs.sort(axis=0), exp.sort(axis=0))
+ assert_data_frame_almost_equal(obs.sort_index(), exp.sort_index())
trajectory = pd.DataFrame.from_dict({'s2': np.array([2]),
's3': np.array([3]),
@@ -272,21 +272,21 @@ class GradientTests(BaseTests):
's5': np.array([5]),
's6': np.array([6])},
orient='index')
- trajectory.sort(columns=0, inplace=True)
+ trajectory.sort_values(by=0, inplace=True)
w_vector = pd.Series(np.array([25, 30, 35, 40, 45]),
['s2', 's3', 's4', 's5', 's6']).astype(np.float64)
exp = pd.DataFrame.from_dict({'s2': np.array([2]), 's3': np.array([3]),
's4': np.array([4]), 's5': np.array([5]),
's6': np.array([6])}, orient='index')
obs = _weight_by_vector(trajectory, w_vector)
- assert_data_frame_almost_equal(obs.sort(axis=0), exp.sort(axis=0))
+ assert_data_frame_almost_equal(obs.sort_index(), exp.sort_index())
trajectory = pd.DataFrame.from_dict({'s1': np.array([1, 2, 3]),
's2': np.array([2, 3, 4]),
's3': np.array([5, 6, 7]),
's4': np.array([8, 9, 10])},
orient='index')
- trajectory.sort(columns=0, inplace=True)
+ trajectory.sort_values(by=0, inplace=True)
w_vector = pd.Series(np.array([1, 2, 3, 4]),
['s1', 's2', 's3', 's4']).astype(np.float64)
exp = pd.DataFrame.from_dict({'s1': np.array([1, 2, 3]),
@@ -295,7 +295,7 @@ class GradientTests(BaseTests):
's4': np.array([8, 9, 10])},
orient='index').astype(np.float64)
obs = _weight_by_vector(trajectory, w_vector)
- assert_data_frame_almost_equal(obs.sort(axis=0), exp.sort(axis=0))
+ assert_data_frame_almost_equal(obs.sort_index(), exp.sort_index())
sample_ids = ['PC.356', 'PC.481', 'PC.355', 'PC.593', 'PC.354']
trajectory = pd.DataFrame.from_dict({'PC.356': np.array([5.65948525,
@@ -334,7 +334,7 @@ class GradientTests(BaseTests):
}, orient='index')
obs = _weight_by_vector(trajectory.ix[sample_ids],
w_vector[sample_ids])
- assert_data_frame_almost_equal(obs.sort(axis=0), exp.sort(axis=0))
+ assert_data_frame_almost_equal(obs.sort_index(), exp.sort_index())
def test_weight_by_vector_single_element(self):
trajectory = pd.DataFrame.from_dict({'s1': np.array([42])},
@@ -578,20 +578,20 @@ class GradientANOVATests(BaseTests):
# Takes a subset from metadata_map
bv = GradientANOVA(subset_coords, self.prop_expl, self.metadata_map)
assert_data_frame_almost_equal(
- bv._coords.sort(axis=0),
- subset_coords.sort(axis=0))
+ bv._coords.sort_index(),
+ subset_coords.sort_index())
assert_data_frame_almost_equal(
- bv._metadata_map.sort(axis=0),
- subset_metadata_map.sort(axis=0))
+ bv._metadata_map.sort_index(),
+ subset_metadata_map.sort_index())
# Takes a subset from coords
bv = GradientANOVA(self.coords, self.prop_expl, subset_metadata_map)
assert_data_frame_almost_equal(
- bv._coords.sort(axis=0),
- subset_coords.sort(axis=0))
+ bv._coords.sort_index(),
+ subset_coords.sort_index())
assert_data_frame_almost_equal(
- bv._metadata_map.sort(axis=0),
- subset_metadata_map.sort(axis=0))
+ bv._metadata_map.sort_index(),
+ subset_metadata_map.sort_index())
# Takes a subset from metadata_map and coords at the same time
coord_data = {
@@ -625,8 +625,8 @@ class GradientANOVATests(BaseTests):
-0.0301637746424])},
orient='index')
assert_data_frame_almost_equal(
- bv._coords.sort(axis=0),
- exp_coords.sort(axis=0))
+ bv._coords.sort_index(),
+ exp_coords.sort_index())
exp_metadata_map = pd.DataFrame.from_dict(
{'PC.355': {'Treatment': 'Control',
'DOB': '20061218',
@@ -634,8 +634,8 @@ class GradientANOVATests(BaseTests):
'Description': 'Control_mouse_I.D._355'}},
orient='index')
assert_data_frame_almost_equal(
- bv._metadata_map.sort(axis=0),
- exp_metadata_map.sort(axis=0))
+ bv._metadata_map.sort_index(),
+ exp_metadata_map.sort_index())
def test_normalize_samples_error(self):
"""Raises an error if coords and metadata_map does not have samples in
diff --git a/skbio/tree/_tree.py b/skbio/tree/_tree.py
index 7256c5f..aa58959 100644
--- a/skbio/tree/_tree.py
+++ b/skbio/tree/_tree.py
@@ -1176,7 +1176,7 @@ class TreeNode(SkbioObject):
if not self.children:
if include_self:
yield self
- raise StopIteration
+ return
child_index_stack = [0]
curr = self
curr_children = self.children
@@ -2716,6 +2716,71 @@ class TreeNode(SkbioObject):
return dist_f(self_matrix, other_matrix)
+ @experimental(as_of="0.4.2")
+ def bifurcate(self, insert_length=None):
+ r"""Reorders the tree into a bifurcating tree.
+
+ All nodes that have more than 2 children will
+ have additional intermediate nodes inserted to ensure that
+ every node has only 2 children.
+
+ Parameters
+ ----------
+ insert_length : int, optional
+ The branch length assigned to all inserted nodes.
+
+ See Also
+ --------
+ prune
+
+ Notes
+ -----
+ Any nodes that have a single child can be collapsed using the
+ prune method to create strictly bifurcating trees.
+
+ Examples
+ --------
+ >>> from skbio import TreeNode
+ >>> tree = TreeNode.read(["((a,b,g,h)c,(d,e)f)root;"])
+ >>> print(tree.ascii_art())
+ /-a
+ |
+ |--b
+ /c-------|
+ | |--g
+ | |
+ -root----| \-h
+ |
+ | /-d
+ \f-------|
+ \-e
+ >>> tree.bifurcate()
+ >>> print(tree.ascii_art())
+ /-h
+ /c-------|
+ | | /-g
+ | \--------|
+ | | /-a
+ -root----| \--------|
+ | \-b
+ |
+ | /-d
+ \f-------|
+ \-e
+ """
+ for n in self.traverse(include_self=True):
+ if len(n.children) > 2:
+ stack = n.children
+ while len(stack) > 2:
+ ind = stack.pop()
+ intermediate = TreeNode()
+ intermediate.length = insert_length
+ intermediate.extend(stack)
+ n.append(intermediate)
+ for k in stack:
+ n.remove(k)
+ n.extend([ind, intermediate])
+
@experimental(as_of="0.4.0")
def index_tree(self):
"""Index a tree for rapid lookups within a tree array
diff --git a/skbio/tree/tests/test_tree.py b/skbio/tree/tests/test_tree.py
index 7891ec2..4424ddb 100644
--- a/skbio/tree/tests/test_tree.py
+++ b/skbio/tree/tests/test_tree.py
@@ -817,6 +817,19 @@ class TreeTests(TestCase):
obs = [n.name for n in self.simple_t.levelorder()]
self.assertEqual(obs, exp)
+ def test_bifurcate(self):
+ t1 = TreeNode.read(StringIO(u'(((a,b),c),(d,e));'))
+ t2 = TreeNode.read(StringIO(u'((a,b,c));'))
+ t3 = t2.copy()
+
+ t1.bifurcate()
+ t2.bifurcate()
+ t3.bifurcate(insert_length=0)
+
+ self.assertEqual(str(t1), '(((a,b),c),(d,e));\n')
+ self.assertEqual(str(t2), '((c,(a,b)));\n')
+ self.assertEqual(str(t3), '((c,(a,b):0));\n')
+
def test_index_tree_single_node(self):
"""index_tree handles single node tree"""
t1 = TreeNode.read(StringIO(u'root;'))
diff --git a/skbio/util/__init__.py b/skbio/util/__init__.py
index b36f55d..92b5bdd 100644
--- a/skbio/util/__init__.py
+++ b/skbio/util/__init__.py
@@ -24,7 +24,7 @@ Common functionality to support testing in skbio.
Miscellaneous functionality
---------------------------
-Generally useful functions that don't fit in more specific locations.
+Generally useful functionality that doesn't fit in more specific locations.
.. autosummary::
:toctree: generated/
@@ -35,6 +35,7 @@ Generally useful functions that don't fit in more specific locations.
is_casava_v180_or_later
remove_files
safe_md5
+ classproperty
Warnings
--------
@@ -57,16 +58,18 @@ Warnings
from __future__ import absolute_import, division, print_function
-from ._warning import EfficiencyWarning, RepresentationWarning
+from ._warning import EfficiencyWarning, RepresentationWarning, SkbioWarning
from ._misc import (cardinal_to_ordinal, create_dir, find_duplicates,
is_casava_v180_or_later, remove_files, safe_md5)
from ._testing import (get_data_path, TestRunner,
assert_ordination_results_equal,
assert_data_frame_almost_equal)
+from ._decorator import classproperty
-__all__ = ['EfficiencyWarning', 'RepresentationWarning', 'cardinal_to_ordinal',
- 'create_dir', 'find_duplicates', 'is_casava_v180_or_later',
- 'remove_files', 'safe_md5', 'get_data_path', 'TestRunner',
- 'assert_ordination_results_equal', 'assert_data_frame_almost_equal']
+__all__ = ['SkbioWarning', 'EfficiencyWarning', 'RepresentationWarning',
+ 'cardinal_to_ordinal', 'create_dir', 'find_duplicates',
+ 'is_casava_v180_or_later', 'remove_files', 'safe_md5',
+ 'get_data_path', 'TestRunner', 'assert_ordination_results_equal',
+ 'assert_data_frame_almost_equal', 'classproperty']
test = TestRunner(__file__).test
diff --git a/skbio/util/_decorator.py b/skbio/util/_decorator.py
index 69a3bf2..7208159 100644
--- a/skbio/util/_decorator.py
+++ b/skbio/util/_decorator.py
@@ -13,6 +13,7 @@ import textwrap
import decorator
from ._exception import OverrideError
+from ._warning import DeprecationWarning as SkbioDeprecationWarning
class _state_decorator(object):
@@ -254,7 +255,7 @@ class deprecated(_state_decorator):
warnings.warn('%s is deprecated as of scikit-bio version %s, and '
'will be removed in version %s. %s' %
(func.__name__, self.as_of, self.until, self.reason),
- DeprecationWarning)
+ SkbioDeprecationWarning)
# args[0] is the function being wrapped when this is called
# after wrapping with decorator.decorator, but why???
return func(*args[1:], **kwargs)
diff --git a/skbio/util/_misc.py b/skbio/util/_misc.py
index e4db019..48a87b0 100644
--- a/skbio/util/_misc.py
+++ b/skbio/util/_misc.py
@@ -38,14 +38,20 @@ def make_sentinel(name):
def find_sentinels(function, sentinel):
keys = []
- function_spec = inspect.getargspec(function)
- if function_spec.defaults is not None:
- # Concept from http://stackoverflow.com/a/12627202/579416
- keywords_start = -len(function_spec.defaults)
- for key, default in zip(function_spec.args[keywords_start:],
- function_spec.defaults):
- if default is sentinel:
- keys.append(key)
+ if hasattr(inspect, 'signature'):
+ params = inspect.signature(function).parameters
+ for name, param in params.items():
+ if param.default is sentinel:
+ keys.append(name)
+ else: # Py2
+ function_spec = inspect.getargspec(function)
+ if function_spec.defaults is not None:
+ # Concept from http://stackoverflow.com/a/12627202/579416
+ keywords_start = -len(function_spec.defaults)
+ for key, default in zip(function_spec.args[keywords_start:],
+ function_spec.defaults):
+ if default is sentinel:
+ keys.append(key)
return keys
diff --git a/skbio/util/_testing.py b/skbio/util/_testing.py
index b357595..92c111c 100644
--- a/skbio/util/_testing.py
+++ b/skbio/util/_testing.py
@@ -12,16 +12,17 @@ from future.utils import PY3
import copy
import os
import inspect
+import warnings
import six
import pandas as pd
-from nose import core
-from nose.tools import nottest
+import nose
import numpy as np
import numpy.testing as npt
import pandas.util.testing as pdt
+from skbio.util import SkbioWarning
from ._decorator import experimental
@@ -891,7 +892,20 @@ class PositionalMetadataMixinTests(object):
self.assertTrue(obj.has_positional_metadata())
- at nottest
+ at nose.tools.nottest
+class SuppressSkbioWarnings(nose.plugins.Plugin):
+ def configure(self, options, conf):
+ super(SuppressSkbioWarnings, self).configure(options, conf)
+ self.enabled = True
+
+ def beforeTest(self, test):
+ warnings.simplefilter("ignore", category=SkbioWarning)
+
+ def afterTest(self, test):
+ warnings.resetwarnings()
+
+
+ at nose.tools.nottest
class TestRunner(object):
"""Simple wrapper class around nosetests functionality.
@@ -936,7 +950,8 @@ class TestRunner(object):
argv.extend(['--with-doctest', '--doctest-tests'])
if verbose:
argv.append('-v')
- return core.run(argv=argv, defaultTest=self._test_dir)
+ return nose.core.run(argv=argv, defaultTest=self._test_dir,
+ addplugins=[SuppressSkbioWarnings()])
@experimental(as_of="0.4.0")
@@ -1000,7 +1015,7 @@ def assert_ordination_results_equal(left, right, ignore_method_names=False,
Ignore differences in `biplot_scores` row and column labels.
ignore_directionality : bool, optional
Ignore differences in directionality (i.e., differences in signs) for
- attributes `samples` and `features`.
+ attributes `samples`, `features` and `biplot_scores`.
Raises
------
@@ -1027,10 +1042,12 @@ def assert_ordination_results_equal(left, right, ignore_method_names=False,
_assert_frame_equal(left.biplot_scores, right.biplot_scores,
ignore_biplot_scores_labels,
ignore_biplot_scores_labels,
+ ignore_directionality=ignore_directionality,
decimal=decimal)
_assert_frame_equal(left.sample_constraints, right.sample_constraints,
ignore_columns=ignore_axis_labels,
+ ignore_directionality=ignore_directionality,
decimal=decimal)
_assert_series_equal(left.eigvals, right.eigvals, ignore_axis_labels,
diff --git a/skbio/util/_warning.py b/skbio/util/_warning.py
index 8128415..2de0295 100644
--- a/skbio/util/_warning.py
+++ b/skbio/util/_warning.py
@@ -9,7 +9,12 @@
from __future__ import absolute_import, division, print_function
-class EfficiencyWarning(Warning):
+class SkbioWarning(Warning):
+ """Used to filter our warnings from warnings given by 3rd parties"""
+ pass
+
+
+class EfficiencyWarning(SkbioWarning):
"""Warn about potentially accidental use of inefficient code.
For example, if a user doesn't have an optimized version of a
@@ -22,7 +27,7 @@ class EfficiencyWarning(Warning):
pass
-class RepresentationWarning(Warning):
+class RepresentationWarning(SkbioWarning):
"""Warn about assumptions made for the successful completion of a process.
Warn about substitutions, assumptions, or particular alterations that were
@@ -32,3 +37,8 @@ class RepresentationWarning(Warning):
"""
pass
+
+
+class DeprecationWarning(DeprecationWarning, SkbioWarning):
+ """Used to indicate deprecated functionality in scikit-bio."""
+ pass
diff --git a/skbio/util/tests/test_decorator.py b/skbio/util/tests/test_decorator.py
index 625a947..b968263 100644
--- a/skbio/util/tests/test_decorator.py
+++ b/skbio/util/tests/test_decorator.py
@@ -11,7 +11,8 @@ import unittest
import inspect
import warnings
-from skbio.util._decorator import classproperty, overrides, classonlymethod
+from skbio.util import classproperty
+from skbio.util._decorator import overrides, classonlymethod
from skbio.util._decorator import (stable, experimental, deprecated,
_state_decorator)
from skbio.util._exception import OverrideError
@@ -224,6 +225,8 @@ class TestStable(TestStabilityState):
def test_function_signature(self):
f = self._get_f('0.1.0')
+ # Py2: update this to use inspect.signature when we drop Python 2
+ # inspect.getargspec is deprecated and won't exist in 3.6
expected = inspect.ArgSpec(
args=['x', 'y'], varargs=None, keywords=None, defaults=(42,))
self.assertEqual(inspect.getargspec(f), expected)
@@ -261,6 +264,8 @@ class TestExperimental(TestStabilityState):
def test_function_signature(self):
f = self._get_f('0.1.0')
+ # Py2: update this to use inspect.signature when we drop Python 2
+ # inspect.getargspec is deprecated and won't exist in 3.6
expected = inspect.ArgSpec(
args=['x', 'y'], varargs=None, keywords=None, defaults=(42,))
self.assertEqual(inspect.getargspec(f), expected)
@@ -318,6 +323,8 @@ class TestDeprecated(TestStabilityState):
def test_function_signature(self):
f = self._get_f('0.1.0', until='0.1.4',
reason='You should now use skbio.g().')
+ # Py2: update this to use inspect.signature when we drop Python 2
+ # inspect.getargspec is deprecated and won't exist in 3.6
expected = inspect.ArgSpec(
args=['x', 'y'], varargs=None, keywords=None, defaults=(42,))
self.assertEqual(inspect.getargspec(f), expected)
diff --git a/skbio/util/tests/test_misc.py b/skbio/util/tests/test_misc.py
index 0edd952..cb03bbb 100644
--- a/skbio/util/tests/test_misc.py
+++ b/skbio/util/tests/test_misc.py
@@ -286,7 +286,7 @@ class CardinalToOrdinalTests(unittest.TestCase):
class TestFindDuplicates(unittest.TestCase):
def test_empty_input(self):
def empty_gen():
- raise StopIteration()
+ return
yield
for empty in [], (), '', set(), {}, empty_gen():
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-skbio.git
More information about the debian-med-commit
mailing list