[med-svn] [Git][med-team/q2-types][upstream] New upstream version 2019.10.0

Sun Dec 15 12:46:04 GMT 2019


Liubov Chuprikova pushed to branch upstream at Debian Med / q2-types


Commits:
043a6cd7 by Liubov Chuprikova at 2019-12-15T12:43:02Z
New upstream version 2019.10.0
- - - - -


19 changed files:

- ci/recipe/meta.yaml
- q2_types/_version.py
- q2_types/feature_data/_format.py
- q2_types/feature_data/_transformer.py
- + q2_types/feature_data/tests/data/dna-sequences-duplicate-id.fasta
- + q2_types/feature_data/tests/data/dna-sequences-id-starts-with-space.fasta
- + q2_types/feature_data/tests/data/dna-sequences-no-id.fasta
- q2_types/feature_data/tests/data/taxonomy/blanks-and-comments → q2_types/feature_data/tests/data/taxonomy/blanks
- + q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv
- + q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv
- q2_types/feature_data/tests/data/taxonomy/header-only.tsv
- + q2_types/feature_data/tests/data/taxonomy/leading_space_taxon.tsv
- + q2_types/feature_data/tests/data/taxonomy/start_end_space_taxon.tsv
- + q2_types/feature_data/tests/data/taxonomy/trailing_space_taxon.tsv
- q2_types/feature_data/tests/data/taxonomy/valid-but-messy.tsv
- q2_types/feature_data/tests/test_format.py
- q2_types/feature_data/tests/test_transformer.py
- q2_types/feature_table/_format.py
- q2_types/sample_data/tests/test_transformer.py


Changes:

=====================================
ci/recipe/meta.yaml
=====================================
@@ -25,6 +25,8 @@ requirements:
     - biom-format >=2.1.5,<2.2.0
     - ijson
     - h5py
+    - matplotlib 3.1.0
+    - matplotlib-base 3.1.0
     - qiime2 {{ release }}.*
 
 test:


=====================================
q2_types/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
     # setup.py/versioneer.py will grep for the variable names, so they must
     # each be defined on a line of their own. _version.py will just call
     # get_keywords().
-    git_refnames = " (tag: 2019.7.0)"
-    git_full = "9f31de0c81510fbe6be8b16f95e23b4c974ca002"
-    git_date = "2019-07-30 18:15:54 +0000"
+    git_refnames = " (tag: 2019.10.0)"
+    git_full = "b382dd345500fc2172858ff00638e6bca35760ed"
+    git_date = "2019-11-01 01:04:25 +0000"
     keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
     return keywords
 


=====================================
q2_types/feature_data/_format.py
=====================================
@@ -54,9 +54,6 @@ class TaxonomyFormat(model.TextFileFormat):
                 elif line.lstrip(' ') == '\n':
                     # Blank line
                     continue
-                elif line.startswith('#'):
-                    # Comment line
-                    continue
                 else:
                     cells = line.split('\t')
                     if len(cells) < 2:
@@ -93,41 +90,53 @@ class TSVTaxonomyFormat(model.TextFileFormat):
 
     Optionally followed by other arbitrary columns.
 
-    This format supports comment lines starting with #, and blank lines. The
-    expected header must be the first non-comment, non-blank line. In addition
-    to the header, there must be at least one line of data.
+    This format supports blank lines. The expected header must be the first
+    non-blank line. In addition to the header, there must be at least one line
+    of data.
 
     """
     HEADER = ['Feature ID', 'Taxon']
 
-    def sniff(self):
+    def _check_n_records(self, n=None):
         with self.open() as fh:
-            data_lines = 0
+            data_line_count = 0
             header = None
-            while data_lines < 10:
-                line = fh.readline()
 
-                if line == '':
-                    # EOF
-                    break
-                elif line.lstrip(' ') == '\n':
+            file_ = enumerate(fh) if n is None else zip(range(n), fh)
+
+            for i, line in file_:
+                # Tracks line number for error reporting
+                i = i + 1
+
+                if line.lstrip(' ') == '\n':
                     # Blank line
                     continue
-                elif line.startswith('#'):
-                    # Comment line
-                    continue
 
-                cells = line.rstrip('\n').split('\t')
+                cells = line.strip('\n').split('\t')
+
                 if header is None:
                     if cells[:2] != self.HEADER:
-                        return False
+                        raise ValidationError(
+                            '%s must be the first two header values. The '
+                            'first two header values provided are: %s (on '
+                            'line %s).' % (self.HEADER, cells[:2], i))
                     header = cells
                 else:
                     if len(cells) != len(header):
-                        return False
-                    data_lines += 1
+                        raise ValidationError(
+                            'Number of values on line %s are not the same as '
+                            'number of header values. Found %s values '
+                            '(%s), expected %s.' % (i, len(cells), cells,
+                                                    len(self.HEADER)))
+
+                    data_line_count += 1
+
+            if data_line_count == 0:
+                raise ValidationError('No taxonomy records found, only blank '
+                                      'lines and/or a header row.')
 
-            return header is not None and data_lines > 0
+    def _validate_(self, level):
+        self._check_n_records(n={'min': 10, 'max': None}[level])
 
 
 TSVTaxonomyDirectoryFormat = model.SingleFileDirectoryFormat(
@@ -138,6 +147,7 @@ class DNAFASTAFormat(model.TextFileFormat):
     def _validate_lines(self, max_lines):
         FASTADNAValidator = re.compile(r'[ACGTURYKMSWBDHVN]+\r?\n?')
         last_line_was_ID = False
+        ids = {}
 
         with open(str(self), 'rb') as fh:
             try:
@@ -149,8 +159,8 @@ class DNAFASTAFormat(model.TextFileFormat):
                     return
                 if first[0] != ord(b'>'):
                     raise ValidationError("First line of file is not a valid "
-                                          "FASTA ID. FASTA IDs must start "
-                                          "with '>'")
+                                          "description. Descriptions must "
+                                          "start with '>'")
                 fh.seek(0)
                 for line_number, line in enumerate(fh, 1):
                     if line_number >= max_lines:
@@ -158,9 +168,24 @@ class DNAFASTAFormat(model.TextFileFormat):
                     line = line.decode('utf-8-sig')
                     if line.startswith('>'):
                         if last_line_was_ID:
-                            raise ValidationError('Multiple consecutive IDs '
-                                                  'starting on line '
-                                                  f'{line_number-1!r}')
+                            raise ValidationError('Multiple consecutive '
+                                                  'descriptions starting on '
+                                                  f'line {line_number-1!r}')
+                        line = line.split()
+                        if line[0] == '>':
+                            if len(line) == 1:
+                                raise ValidationError(
+                                    f'Description on line {line_number} is '
+                                    'missing an ID.')
+                            else:
+                                raise ValidationError(
+                                    f'ID on line {line_number} starts with a '
+                                    'space. IDs may not start with spaces')
+                        if line[0] in ids:
+                            raise ValidationError(
+                                f'ID on line {line_number} is a duplicate of '
+                                f'another ID on line {ids[line[0]]}.')
+                        ids[line[0]] = line_number
                         last_line_was_ID = True
                     elif re.fullmatch(FASTADNAValidator, line):
                         last_line_was_ID = False


=====================================
q2_types/feature_data/_transformer.py
=====================================
@@ -47,7 +47,7 @@ def _taxonomy_formats_to_dataframe(filepath, has_header=None):
     """
     # Using `dtype=object` and `set_index()` to avoid type casting/inference of
     # any columns or the index.
-    df = pd.read_csv(filepath, sep='\t', comment='#', skip_blank_lines=True,
+    df = pd.read_csv(filepath, sep='\t', skip_blank_lines=True,
                      header=None, dtype=object)
 
     if len(df.columns) < 2:
@@ -88,6 +88,7 @@ def _taxonomy_formats_to_dataframe(filepath, has_header=None):
             "column names are duplicated: %s" %
             ', '.join(df.columns.get_duplicates()))
 
+    df['Taxon'] = df['Taxon'].str.strip()
     return df
 
 


=====================================
q2_types/feature_data/tests/data/dna-sequences-duplicate-id.fasta
=====================================
@@ -0,0 +1,5 @@
+>SEQUENCE1
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+>SEQUENCE1
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+ACGTACGTACGTACGTACGTACGT


=====================================
q2_types/feature_data/tests/data/dna-sequences-id-starts-with-space.fasta
=====================================
@@ -0,0 +1 @@
+> this_id_starts_with_a_space
\ No newline at end of file


=====================================
q2_types/feature_data/tests/data/dna-sequences-no-id.fasta
=====================================
@@ -0,0 +1 @@
+>
\ No newline at end of file


=====================================
q2_types/feature_data/tests/data/taxonomy/blanks-and-comments → q2_types/feature_data/tests/data/taxonomy/blanks
=====================================
@@ -1,17 +1,17 @@
 
 
-# hello
 
-# world #
+
+
 
 
 
      
 
  
-# hello, peanut
-#
 
-#   
+
+
+   
   
 


=====================================
q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv
=====================================
@@ -0,0 +1,3 @@
+Feature ID	Taxon
+seq1	k__Bacteria; p__Proteobacteria	-1.0
+seq2	k__Bacteria	1.0


=====================================
q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv
=====================================
@@ -0,0 +1,3 @@
+Feature ID	Taxon	Confidence	Random
+seq1	k__Foo; p__Bar	-1.0
+seq2	k__Foo; p__Baz	-42.0


=====================================
q2_types/feature_data/tests/data/taxonomy/header-only.tsv
=====================================
@@ -1,7 +1 @@
-# This file
-
-# only has a
-# header!
-
-
 Feature ID	Taxon


=====================================
q2_types/feature_data/tests/data/taxonomy/leading_space_taxon.tsv
=====================================
@@ -0,0 +1,2 @@
+Feature ID	Taxon	Confidence
+seq1	 k__Foo; p__Bar	-1.0


=====================================
q2_types/feature_data/tests/data/taxonomy/start_end_space_taxon.tsv
=====================================
@@ -0,0 +1,2 @@
+Feature ID	Taxon	Confidence
+seq1	 k__Foo; p__Bar 	-1.0


=====================================
q2_types/feature_data/tests/data/taxonomy/trailing_space_taxon.tsv
=====================================
@@ -0,0 +1,2 @@
+Feature ID	Taxon	Confidence
+seq1	 k__Foo; p__Bar 	-1.0


=====================================
q2_types/feature_data/tests/data/taxonomy/valid-but-messy.tsv
=====================================
@@ -1,11 +1,11 @@
 
 
-# There's some important whitespace in this file for testing, take care not to
-# remove :)
 
-# hello
 
-# world #
+
+
+
+
 
 
 
@@ -15,20 +15,20 @@
               
 
     
-# comment
+
 
 Feature ID	Taxon	Extra Column
 
-# hello, peanut
-#
 
-#  
+
+
+
 
 
 SEQUENCE1	k__Bar; p__Baz	foo
 
-# GWAR
+
 seq2	some; taxonomy; for; ya	bar baz
 
 
-# FOOTER
+


=====================================
q2_types/feature_data/tests/test_format.py
=====================================
@@ -37,7 +37,7 @@ class TestTaxonomyFormats(TestPluginBase):
             format.validate()
 
     def test_taxonomy_format_validate_negative(self):
-        filenames = ['empty', 'blanks-and-comments', '1-column.tsv']
+        filenames = ['empty', 'blanks', '1-column.tsv']
         filepaths = [self.get_data_path(os.path.join('taxonomy', filename))
                      for filename in filenames]
 
@@ -78,7 +78,7 @@ class TestTaxonomyFormats(TestPluginBase):
             format.validate()
 
     def test_headerless_tsv_taxonomy_format_validate_negative(self):
-        filenames = ['empty', 'blanks-and-comments', '1-column.tsv']
+        filenames = ['empty', 'blanks', '1-column.tsv']
         filepaths = [self.get_data_path(os.path.join('taxonomy', filename))
                      for filename in filenames]
 
@@ -113,7 +113,7 @@ class TestTaxonomyFormats(TestPluginBase):
             format.validate()
 
     def test_tsv_taxonomy_format_validate_negative(self):
-        filenames = ['empty', 'blanks-and-comments', '1-column.tsv',
+        filenames = ['empty', 'blanks', '1-column.tsv',
                      'headerless.tsv', 'header-only.tsv', 'jagged.tsv']
         filepaths = [self.get_data_path(os.path.join('taxonomy', filename))
                      for filename in filenames]
@@ -134,6 +134,19 @@ class TestTaxonomyFormats(TestPluginBase):
 
         format.validate()
 
+    def test_tsv_taxonomy_format_column_header_lengths(self):
+        filenames = ['greater-column-length.tsv', 'greater-header-length.tsv']
+
+        filepaths = [self.get_data_path(os.path.join('taxonomy', filename))
+                     for filename in filenames]
+
+        for filepath in filepaths:
+            format = TSVTaxonomyFormat(filepath, mode='r')
+
+            with self.assertRaisesRegex(ValidationError,
+                                        'line 2.*3 values.*expected 2'):
+                format.validate()
+
 
 class TestDNAFASTAFormats(TestPluginBase):
     package = 'q2_types.feature_data.tests'
@@ -169,7 +182,8 @@ class TestDNAFASTAFormats(TestPluginBase):
         filepath = self.get_data_path('dna-sequences-consecutive-ids.fasta')
         format = DNAFASTAFormat(filepath, mode='r')
 
-        with self.assertRaisesRegex(ValidationError, 'consecutive IDs.*1'):
+        with self.assertRaisesRegex(
+                ValidationError, 'consecutive descriptions.*1'):
             format.validate()
 
     def test_dna_fasta_format_missing_initial_ID(self):
@@ -201,6 +215,28 @@ class TestDNAFASTAFormats(TestPluginBase):
 
         format.validate()
 
+    def test_dna_fasta_format_duplicate_ids(self):
+        filepath = self.get_data_path('dna-sequences-duplicate-id.fasta')
+        format = DNAFASTAFormat(filepath, mode='r')
+
+        with self.assertRaisesRegex(ValidationError, '3.*duplicate.*1'):
+            format.validate()
+
+    def test_dna_fasta_format_no_id(self):
+        filepath = self.get_data_path('dna-sequences-no-id.fasta')
+        format = DNAFASTAFormat(filepath, mode='r')
+
+        with self.assertRaisesRegex(ValidationError, '1.*missing an ID'):
+            format.validate()
+
+    def test_dna_fasta_format_id_starts_with_space(self):
+        filepath = self.get_data_path(
+            'dna-sequences-id-starts-with-space.fasta')
+        format = DNAFASTAFormat(filepath, mode='r')
+
+        with self.assertRaisesRegex(ValidationError, '1 starts with a space'):
+            format.validate()
+
     def test_paired_dna_sequences_directory_format(self):
         filepath = self.get_data_path('dna-sequences.fasta')
         temp_dir = self.temp_dir.name


=====================================
q2_types/feature_data/tests/test_transformer.py
=====================================
@@ -264,6 +264,45 @@ class TestTaxonomyFormatTransformers(TestPluginBase):
 
         self.assertEqual(exp, obs)
 
+    def test_tsv_taxonomy_to_metadata_trailing_whitespace_taxon(self):
+        _, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
+                                       os.path.join(
+                                           'taxonomy',
+                                           'trailing_space_taxon.tsv'))
+
+        index = pd.Index(['seq1'], name='Feature ID', dtype=object)
+        exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0']], index=index,
+                              columns=['Taxon', 'Confidence'], dtype=object)
+        exp = qiime2.Metadata(exp_df)
+
+        self.assertEqual(exp, obs)
+
+    def test_tsv_taxonomy_to_metadata_leading_whitespace_taxon(self):
+        _, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
+                                       os.path.join(
+                                           'taxonomy',
+                                           'leading_space_taxon.tsv'))
+
+        index = pd.Index(['seq1'], name='Feature ID', dtype=object)
+        exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0']], index=index,
+                              columns=['Taxon', 'Confidence'], dtype=object)
+        exp = qiime2.Metadata(exp_df)
+
+        self.assertEqual(exp, obs)
+
+    def test_tsv_taxonomy_to_metadata_trailing_leading_whitespace_taxon(self):
+        _, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
+                                       os.path.join(
+                                           'taxonomy',
+                                           'start_end_space_taxon.tsv'))
+
+        index = pd.Index(['seq1'], name='Feature ID', dtype=object)
+        exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0']], index=index,
+                              columns=['Taxon', 'Confidence'], dtype=object)
+        exp = qiime2.Metadata(exp_df)
+
+        self.assertEqual(exp, obs)
+
 
 # In-depth testing of the `_taxonomy_formats_to_dataframe` helper function,
 # which does the heavy lifting for the transformers.
@@ -275,11 +314,11 @@ class TestTaxonomyFormatsToDataFrame(TestPluginBase):
             _taxonomy_formats_to_dataframe(
                 self.get_data_path(os.path.join('taxonomy', '1-column.tsv')))
 
-    def test_blanks_and_comments(self):
+    def test_blanks(self):
         with self.assertRaises(pandas.io.common.EmptyDataError):
             _taxonomy_formats_to_dataframe(
                 self.get_data_path(os.path.join('taxonomy',
-                                                'blanks-and-comments')))
+                                                'blanks')))
 
     def test_empty(self):
         with self.assertRaises(pandas.io.common.EmptyDataError):


=====================================
q2_types/feature_table/_format.py
=====================================
@@ -22,7 +22,8 @@ class BIOMV100Format(model.TextFileFormat):
     }
 
     def sniff(self):
-        with self.open() as fh:
+        # Can't self.open(mode='rb'), so we defer to the backing pathlib object
+        with self.path.open(mode='rb') as fh:
             try:
                 parser = ijson.parse(fh)
                 for prefix, event, value in parser:


=====================================
q2_types/sample_data/tests/test_transformer.py
=====================================
@@ -26,7 +26,10 @@ class TestTransformers(TestPluginBase):
                         name='shannon', index=exp_index)
 
         obs = transformer(exp)
-        obs = pd.Series.from_csv(str(obs), sep='\t', header=0)
+
+        # Squeeze equals true to return series instead of dataframe
+        obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
+                          squeeze=True)
 
         assert_series_equal(exp, obs)
 



View it on GitLab: https://salsa.debian.org/med-team/q2-types/commit/043a6cd7bc6818c3424d077b00a6949e847c6e17

-- 
View it on GitLab: https://salsa.debian.org/med-team/q2-types/commit/043a6cd7bc6818c3424d077b00a6949e847c6e17
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20191215/1095df05/attachment-0001.html>