[med-svn] [Git][med-team/q2-types][upstream] New upstream version 2019.10.0
Liubov Chuprikova
gitlab at salsa.debian.org
Sun Dec 15 12:46:04 GMT 2019
Liubov Chuprikova pushed to branch upstream at Debian Med / q2-types
Commits:
043a6cd7 by Liubov Chuprikova at 2019-12-15T12:43:02Z
New upstream version 2019.10.0
- - - - -
19 changed files:
- ci/recipe/meta.yaml
- q2_types/_version.py
- q2_types/feature_data/_format.py
- q2_types/feature_data/_transformer.py
- + q2_types/feature_data/tests/data/dna-sequences-duplicate-id.fasta
- + q2_types/feature_data/tests/data/dna-sequences-id-starts-with-space.fasta
- + q2_types/feature_data/tests/data/dna-sequences-no-id.fasta
- q2_types/feature_data/tests/data/taxonomy/blanks-and-comments → q2_types/feature_data/tests/data/taxonomy/blanks
- + q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv
- + q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv
- q2_types/feature_data/tests/data/taxonomy/header-only.tsv
- + q2_types/feature_data/tests/data/taxonomy/leading_space_taxon.tsv
- + q2_types/feature_data/tests/data/taxonomy/start_end_space_taxon.tsv
- + q2_types/feature_data/tests/data/taxonomy/trailing_space_taxon.tsv
- q2_types/feature_data/tests/data/taxonomy/valid-but-messy.tsv
- q2_types/feature_data/tests/test_format.py
- q2_types/feature_data/tests/test_transformer.py
- q2_types/feature_table/_format.py
- q2_types/sample_data/tests/test_transformer.py
Changes:
=====================================
ci/recipe/meta.yaml
=====================================
@@ -25,6 +25,8 @@ requirements:
- biom-format >=2.1.5,<2.2.0
- ijson
- h5py
+ - matplotlib 3.1.0
+ - matplotlib-base 3.1.0
- qiime2 {{ release }}.*
test:
=====================================
q2_types/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2019.7.0)"
- git_full = "9f31de0c81510fbe6be8b16f95e23b4c974ca002"
- git_date = "2019-07-30 18:15:54 +0000"
+ git_refnames = " (tag: 2019.10.0)"
+ git_full = "b382dd345500fc2172858ff00638e6bca35760ed"
+ git_date = "2019-11-01 01:04:25 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
q2_types/feature_data/_format.py
=====================================
@@ -54,9 +54,6 @@ class TaxonomyFormat(model.TextFileFormat):
elif line.lstrip(' ') == '\n':
# Blank line
continue
- elif line.startswith('#'):
- # Comment line
- continue
else:
cells = line.split('\t')
if len(cells) < 2:
@@ -93,41 +90,53 @@ class TSVTaxonomyFormat(model.TextFileFormat):
Optionally followed by other arbitrary columns.
- This format supports comment lines starting with #, and blank lines. The
- expected header must be the first non-comment, non-blank line. In addition
- to the header, there must be at least one line of data.
+ This format supports blank lines. The expected header must be the first
+ non-blank line. In addition to the header, there must be at least one line
+ of data.
"""
HEADER = ['Feature ID', 'Taxon']
- def sniff(self):
+ def _check_n_records(self, n=None):
with self.open() as fh:
- data_lines = 0
+ data_line_count = 0
header = None
- while data_lines < 10:
- line = fh.readline()
- if line == '':
- # EOF
- break
- elif line.lstrip(' ') == '\n':
+ file_ = enumerate(fh) if n is None else zip(range(n), fh)
+
+ for i, line in file_:
+ # Tracks line number for error reporting
+ i = i + 1
+
+ if line.lstrip(' ') == '\n':
# Blank line
continue
- elif line.startswith('#'):
- # Comment line
- continue
- cells = line.rstrip('\n').split('\t')
+ cells = line.strip('\n').split('\t')
+
if header is None:
if cells[:2] != self.HEADER:
- return False
+ raise ValidationError(
+ '%s must be the first two header values. The '
+ 'first two header values provided are: %s (on '
+ 'line %s).' % (self.HEADER, cells[:2], i))
header = cells
else:
if len(cells) != len(header):
- return False
- data_lines += 1
+ raise ValidationError(
+ 'Number of values on line %s are not the same as '
+ 'number of header values. Found %s values '
+ '(%s), expected %s.' % (i, len(cells), cells,
+ len(self.HEADER)))
+
+ data_line_count += 1
+
+ if data_line_count == 0:
+ raise ValidationError('No taxonomy records found, only blank '
+ 'lines and/or a header row.')
- return header is not None and data_lines > 0
+ def _validate_(self, level):
+ self._check_n_records(n={'min': 10, 'max': None}[level])
TSVTaxonomyDirectoryFormat = model.SingleFileDirectoryFormat(
@@ -138,6 +147,7 @@ class DNAFASTAFormat(model.TextFileFormat):
def _validate_lines(self, max_lines):
FASTADNAValidator = re.compile(r'[ACGTURYKMSWBDHVN]+\r?\n?')
last_line_was_ID = False
+ ids = {}
with open(str(self), 'rb') as fh:
try:
@@ -149,8 +159,8 @@ class DNAFASTAFormat(model.TextFileFormat):
return
if first[0] != ord(b'>'):
raise ValidationError("First line of file is not a valid "
- "FASTA ID. FASTA IDs must start "
- "with '>'")
+ "description. Descriptions must "
+ "start with '>'")
fh.seek(0)
for line_number, line in enumerate(fh, 1):
if line_number >= max_lines:
@@ -158,9 +168,24 @@ class DNAFASTAFormat(model.TextFileFormat):
line = line.decode('utf-8-sig')
if line.startswith('>'):
if last_line_was_ID:
- raise ValidationError('Multiple consecutive IDs '
- 'starting on line '
- f'{line_number-1!r}')
+ raise ValidationError('Multiple consecutive '
+ 'descriptions starting on '
+ f'line {line_number-1!r}')
+ line = line.split()
+ if line[0] == '>':
+ if len(line) == 1:
+ raise ValidationError(
+ f'Description on line {line_number} is '
+ 'missing an ID.')
+ else:
+ raise ValidationError(
+ f'ID on line {line_number} starts with a '
+ 'space. IDs may not start with spaces')
+ if line[0] in ids:
+ raise ValidationError(
+ f'ID on line {line_number} is a duplicate of '
+ f'another ID on line {ids[line[0]]}.')
+ ids[line[0]] = line_number
last_line_was_ID = True
elif re.fullmatch(FASTADNAValidator, line):
last_line_was_ID = False
=====================================
q2_types/feature_data/_transformer.py
=====================================
@@ -47,7 +47,7 @@ def _taxonomy_formats_to_dataframe(filepath, has_header=None):
"""
# Using `dtype=object` and `set_index()` to avoid type casting/inference of
# any columns or the index.
- df = pd.read_csv(filepath, sep='\t', comment='#', skip_blank_lines=True,
+ df = pd.read_csv(filepath, sep='\t', skip_blank_lines=True,
header=None, dtype=object)
if len(df.columns) < 2:
@@ -88,6 +88,7 @@ def _taxonomy_formats_to_dataframe(filepath, has_header=None):
"column names are duplicated: %s" %
', '.join(df.columns.get_duplicates()))
+ df['Taxon'] = df['Taxon'].str.strip()
return df
=====================================
q2_types/feature_data/tests/data/dna-sequences-duplicate-id.fasta
=====================================
@@ -0,0 +1,5 @@
+>SEQUENCE1
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+>SEQUENCE1
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGT
=====================================
q2_types/feature_data/tests/data/dna-sequences-id-starts-with-space.fasta
=====================================
@@ -0,0 +1 @@
+> this_id_starts_with_a_space
\ No newline at end of file
=====================================
q2_types/feature_data/tests/data/dna-sequences-no-id.fasta
=====================================
@@ -0,0 +1 @@
+>
\ No newline at end of file
=====================================
q2_types/feature_data/tests/data/taxonomy/blanks-and-comments → q2_types/feature_data/tests/data/taxonomy/blanks
=====================================
@@ -1,17 +1,17 @@
-# hello
-# world #
+
+
-# hello, peanut
-#
-#
+
+
+
=====================================
q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv
=====================================
@@ -0,0 +1,3 @@
+Feature ID Taxon
+seq1 k__Bacteria; p__Proteobacteria -1.0
+seq2 k__Bacteria 1.0
=====================================
q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv
=====================================
@@ -0,0 +1,3 @@
+Feature ID Taxon Confidence Random
+seq1 k__Foo; p__Bar -1.0
+seq2 k__Foo; p__Baz -42.0
=====================================
q2_types/feature_data/tests/data/taxonomy/header-only.tsv
=====================================
@@ -1,7 +1 @@
-# This file
-
-# only has a
-# header!
-
-
Feature ID Taxon
=====================================
q2_types/feature_data/tests/data/taxonomy/leading_space_taxon.tsv
=====================================
@@ -0,0 +1,2 @@
+Feature ID Taxon Confidence
+seq1 k__Foo; p__Bar -1.0
=====================================
q2_types/feature_data/tests/data/taxonomy/start_end_space_taxon.tsv
=====================================
@@ -0,0 +1,2 @@
+Feature ID Taxon Confidence
+seq1 k__Foo; p__Bar -1.0
=====================================
q2_types/feature_data/tests/data/taxonomy/trailing_space_taxon.tsv
=====================================
@@ -0,0 +1,2 @@
+Feature ID Taxon Confidence
+seq1 k__Foo; p__Bar -1.0
=====================================
q2_types/feature_data/tests/data/taxonomy/valid-but-messy.tsv
=====================================
@@ -1,11 +1,11 @@
-# There's some important whitespace in this file for testing, take care not to
-# remove :)
-# hello
-# world #
+
+
+
+
@@ -15,20 +15,20 @@
-# comment
+
Feature ID Taxon Extra Column
-# hello, peanut
-#
-#
+
+
+
SEQUENCE1 k__Bar; p__Baz foo
-# GWAR
+
seq2 some; taxonomy; for; ya bar baz
-# FOOTER
+
=====================================
q2_types/feature_data/tests/test_format.py
=====================================
@@ -37,7 +37,7 @@ class TestTaxonomyFormats(TestPluginBase):
format.validate()
def test_taxonomy_format_validate_negative(self):
- filenames = ['empty', 'blanks-and-comments', '1-column.tsv']
+ filenames = ['empty', 'blanks', '1-column.tsv']
filepaths = [self.get_data_path(os.path.join('taxonomy', filename))
for filename in filenames]
@@ -78,7 +78,7 @@ class TestTaxonomyFormats(TestPluginBase):
format.validate()
def test_headerless_tsv_taxonomy_format_validate_negative(self):
- filenames = ['empty', 'blanks-and-comments', '1-column.tsv']
+ filenames = ['empty', 'blanks', '1-column.tsv']
filepaths = [self.get_data_path(os.path.join('taxonomy', filename))
for filename in filenames]
@@ -113,7 +113,7 @@ class TestTaxonomyFormats(TestPluginBase):
format.validate()
def test_tsv_taxonomy_format_validate_negative(self):
- filenames = ['empty', 'blanks-and-comments', '1-column.tsv',
+ filenames = ['empty', 'blanks', '1-column.tsv',
'headerless.tsv', 'header-only.tsv', 'jagged.tsv']
filepaths = [self.get_data_path(os.path.join('taxonomy', filename))
for filename in filenames]
@@ -134,6 +134,19 @@ class TestTaxonomyFormats(TestPluginBase):
format.validate()
+ def test_tsv_taxonomy_format_column_header_lengths(self):
+ filenames = ['greater-column-length.tsv', 'greater-header-length.tsv']
+
+ filepaths = [self.get_data_path(os.path.join('taxonomy', filename))
+ for filename in filenames]
+
+ for filepath in filepaths:
+ format = TSVTaxonomyFormat(filepath, mode='r')
+
+ with self.assertRaisesRegex(ValidationError,
+ 'line 2.*3 values.*expected 2'):
+ format.validate()
+
class TestDNAFASTAFormats(TestPluginBase):
package = 'q2_types.feature_data.tests'
@@ -169,7 +182,8 @@ class TestDNAFASTAFormats(TestPluginBase):
filepath = self.get_data_path('dna-sequences-consecutive-ids.fasta')
format = DNAFASTAFormat(filepath, mode='r')
- with self.assertRaisesRegex(ValidationError, 'consecutive IDs.*1'):
+ with self.assertRaisesRegex(
+ ValidationError, 'consecutive descriptions.*1'):
format.validate()
def test_dna_fasta_format_missing_initial_ID(self):
@@ -201,6 +215,28 @@ class TestDNAFASTAFormats(TestPluginBase):
format.validate()
+ def test_dna_fasta_format_duplicate_ids(self):
+ filepath = self.get_data_path('dna-sequences-duplicate-id.fasta')
+ format = DNAFASTAFormat(filepath, mode='r')
+
+ with self.assertRaisesRegex(ValidationError, '3.*duplicate.*1'):
+ format.validate()
+
+ def test_dna_fasta_format_no_id(self):
+ filepath = self.get_data_path('dna-sequences-no-id.fasta')
+ format = DNAFASTAFormat(filepath, mode='r')
+
+ with self.assertRaisesRegex(ValidationError, '1.*missing an ID'):
+ format.validate()
+
+ def test_dna_fasta_format_id_starts_with_space(self):
+ filepath = self.get_data_path(
+ 'dna-sequences-id-starts-with-space.fasta')
+ format = DNAFASTAFormat(filepath, mode='r')
+
+ with self.assertRaisesRegex(ValidationError, '1 starts with a space'):
+ format.validate()
+
def test_paired_dna_sequences_directory_format(self):
filepath = self.get_data_path('dna-sequences.fasta')
temp_dir = self.temp_dir.name
=====================================
q2_types/feature_data/tests/test_transformer.py
=====================================
@@ -264,6 +264,45 @@ class TestTaxonomyFormatTransformers(TestPluginBase):
self.assertEqual(exp, obs)
+ def test_tsv_taxonomy_to_metadata_trailing_whitespace_taxon(self):
+ _, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
+ os.path.join(
+ 'taxonomy',
+ 'trailing_space_taxon.tsv'))
+
+ index = pd.Index(['seq1'], name='Feature ID', dtype=object)
+ exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0']], index=index,
+ columns=['Taxon', 'Confidence'], dtype=object)
+ exp = qiime2.Metadata(exp_df)
+
+ self.assertEqual(exp, obs)
+
+ def test_tsv_taxonomy_to_metadata_leading_whitespace_taxon(self):
+ _, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
+ os.path.join(
+ 'taxonomy',
+ 'leading_space_taxon.tsv'))
+
+ index = pd.Index(['seq1'], name='Feature ID', dtype=object)
+ exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0']], index=index,
+ columns=['Taxon', 'Confidence'], dtype=object)
+ exp = qiime2.Metadata(exp_df)
+
+ self.assertEqual(exp, obs)
+
+ def test_tsv_taxonomy_to_metadata_trailing_leading_whitespace_taxon(self):
+ _, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
+ os.path.join(
+ 'taxonomy',
+ 'start_end_space_taxon.tsv'))
+
+ index = pd.Index(['seq1'], name='Feature ID', dtype=object)
+ exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0']], index=index,
+ columns=['Taxon', 'Confidence'], dtype=object)
+ exp = qiime2.Metadata(exp_df)
+
+ self.assertEqual(exp, obs)
+
# In-depth testing of the `_taxonomy_formats_to_dataframe` helper function,
# which does the heavy lifting for the transformers.
@@ -275,11 +314,11 @@ class TestTaxonomyFormatsToDataFrame(TestPluginBase):
_taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy', '1-column.tsv')))
- def test_blanks_and_comments(self):
+ def test_blanks(self):
with self.assertRaises(pandas.io.common.EmptyDataError):
_taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
- 'blanks-and-comments')))
+ 'blanks')))
def test_empty(self):
with self.assertRaises(pandas.io.common.EmptyDataError):
=====================================
q2_types/feature_table/_format.py
=====================================
@@ -22,7 +22,8 @@ class BIOMV100Format(model.TextFileFormat):
}
def sniff(self):
- with self.open() as fh:
+ # Can't self.open(mode='rb'), so we defer to the backing pathlib object
+ with self.path.open(mode='rb') as fh:
try:
parser = ijson.parse(fh)
for prefix, event, value in parser:
=====================================
q2_types/sample_data/tests/test_transformer.py
=====================================
@@ -26,7 +26,10 @@ class TestTransformers(TestPluginBase):
name='shannon', index=exp_index)
obs = transformer(exp)
- obs = pd.Series.from_csv(str(obs), sep='\t', header=0)
+
+ # Squeeze equals true to return series instead of dataframe
+ obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
+ squeeze=True)
assert_series_equal(exp, obs)
View it on GitLab: https://salsa.debian.org/med-team/q2-types/commit/043a6cd7bc6818c3424d077b00a6949e847c6e17
--
View it on GitLab: https://salsa.debian.org/med-team/q2-types/commit/043a6cd7bc6818c3424d077b00a6949e847c6e17
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20191215/1095df05/attachment-0001.html>
More information about the debian-med-commit mailing list