[med-svn] [Git][med-team/q2-cutadapt][upstream] 2 commits: New upstream version 2020.2.0

Steffen Möller gitlab at salsa.debian.org
Tue Dec 1 23:38:31 GMT 2020



Steffen Möller pushed to branch upstream at Debian Med / q2-cutadapt


Commits:
e90e9700 by Steffen Moeller at 2020-06-10T16:20:31+02:00
New upstream version 2020.2.0
- - - - -
34f17f14 by Steffen Moeller at 2020-12-02T00:09:58+01:00
New upstream version 2020.11.0
- - - - -


13 changed files:

- LICENSE
- ci/recipe/meta.yaml
- q2_cutadapt/__init__.py
- q2_cutadapt/_demux.py
- q2_cutadapt/_trim.py
- q2_cutadapt/_version.py
- q2_cutadapt/plugin_setup.py
- q2_cutadapt/tests/__init__.py
- + q2_cutadapt/tests/data/mixed-orientation/forward.fastq.gz
- + q2_cutadapt/tests/data/mixed-orientation/reverse.fastq.gz
- q2_cutadapt/tests/test_demux.py
- q2_cutadapt/tests/test_trim.py
- setup.py


Changes:

=====================================
LICENSE
=====================================
@@ -1,6 +1,6 @@
 BSD 3-Clause License
 
-Copyright (c) 2017-2019, QIIME 2 development team.
+Copyright (c) 2017-2020, QIIME 2 development team.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without


=====================================
ci/recipe/meta.yaml
=====================================
@@ -19,7 +19,7 @@ requirements:
 
   run:
     - python {{ python }}
-    - cutadapt
+    - cutadapt >=3
     - pigz
     - pandas
     - numpy


=====================================
q2_cutadapt/__init__.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
q2_cutadapt/_demux.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
@@ -8,6 +8,7 @@
 
 import gzip
 import os
+import shutil
 import subprocess
 import tempfile
 
@@ -83,8 +84,20 @@ def _rename_files(seqs_dir_fmt, per_sample_dir_fmt, barcode_series):
             src = os.path.join(str(per_sample_dir_fmt),
                                '%s.%d.fastq.gz' % (sample_id,
                                                    read_direction))
+
+            # TODO: remove this outer guard when we upgrade to cutadapt 3
             if os.path.isfile(src):
-                os.rename(src, str(out_fp))
+                if out_fp.exists():
+                    _merge_files(src, str(out_fp))
+                    os.remove(src)
+                else:
+                    os.rename(src, str(out_fp))
+
+
+def _merge_files(src, dst):
+    with gzip.open(src, mode='rt', encoding='ascii') as src_fh, \
+            gzip.open(dst, mode='at', encoding='ascii') as dst_fh:
+        shutil.copyfileobj(src_fh, dst_fh)
 
 
 def _write_barcode_fasta(barcode_series, barcode_fasta):
@@ -106,8 +119,8 @@ def _write_empty_fastq_to_mux_barcode_in_seq_fmt(seqs_dir_fmt):
         seqs_dir_fmt.file.write_data(fastq, FastqGzFormat)
 
 
-def _demux(seqs, forward_barcodes, reverse_barcodes, error_tolerance,
-           mux_fmt, batch_size, minimum_length):
+def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
+           error_tolerance, mux_fmt, batch_size, minimum_length):
     fwd_barcode_name = forward_barcodes.name
     forward_barcodes = forward_barcodes.drop_missing_values()
     barcodes = forward_barcodes.to_series().to_frame()
@@ -130,14 +143,16 @@ def _demux(seqs, forward_barcodes, reverse_barcodes, error_tolerance,
 
         if samples_w_missing_barcodes:
             raise ValueError('The following samples do not have both '
-                             'forward and reverse barcodes: %s'
+                             'forward and reverse barcodes (note: if your '
+                             'reads are in single index mixed orientation, '
+                             'try again with all of your barcodes in a single '
+                             'metadata column): %s'
                              % ', '.join(sorted(samples_w_missing_barcodes)))
         if samples_w_dup_barcode_pairs:
             raise ValueError('The following samples have duplicate barcode'
                              ' pairs: %s' %
                              ', '.join(sorted(samples_w_dup_barcode_pairs)))
 
-    per_sample_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
     n_samples = len(barcodes)
     if batch_size > n_samples:
         raise ValueError('The batch_size (%d) cannot be greater than the '
@@ -164,13 +179,13 @@ def _demux(seqs, forward_barcodes, reverse_barcodes, error_tolerance,
         if reverse_barcodes is not None:
             open_fhs['rev'].close()
         previous_untrimmed = current_untrimmed
-        # Only use the forward barcode in the renamed files
-    _rename_files(seqs, per_sample_sequences, barcodes[fwd_barcode_name])
 
+    # Only use the forward barcode in the renamed files
+    _rename_files(seqs, per_sample_sequences, barcodes[fwd_barcode_name])
     muxed = len(list(per_sample_sequences.sequences.iter_views(FastqGzFormat)))
     if muxed == 0:
         raise ValueError('No samples were demultiplexed.')
-    return per_sample_sequences, previous_untrimmed
+    return previous_untrimmed
 
 
 def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
@@ -180,9 +195,14 @@ def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
                  minimum_length: int = 1) -> \
                     (CasavaOneEightSingleLanePerSampleDirFmt,
                      MultiplexedSingleEndBarcodeInSequenceDirFmt):
+    per_sample_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
     mux_fmt = MultiplexedSingleEndBarcodeInSequenceDirFmt
-    return _demux(seqs, barcodes, None, error_rate, mux_fmt, batch_size,
-                  minimum_length)
+
+    untrimmed = _demux(
+        seqs, per_sample_sequences, barcodes, None, error_rate, mux_fmt,
+        batch_size, minimum_length)
+
+    return per_sample_sequences, untrimmed
 
 
 def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
@@ -190,9 +210,33 @@ def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
                  reverse_barcodes: qiime2.CategoricalMetadataColumn = None,
                  error_rate: float = 0.1,
                  batch_size: int = 0,
-                 minimum_length: int = 1) -> \
+                 minimum_length: int = 1,
+                 mixed_orientation: bool = False) -> \
                     (CasavaOneEightSingleLanePerSampleDirFmt,
                      MultiplexedPairedEndBarcodeInSequenceDirFmt):
+    if mixed_orientation and reverse_barcodes is not None:
+        raise ValueError('Dual-indexed barcodes for mixed orientation '
+                         'reads are not supported.')
+
+    per_sample_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
     mux_fmt = MultiplexedPairedEndBarcodeInSequenceDirFmt
-    return _demux(seqs, forward_barcodes, reverse_barcodes, error_rate,
-                  mux_fmt, batch_size, minimum_length)
+
+    untrimmed = _demux(
+        seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
+        error_rate, mux_fmt, batch_size, minimum_length)
+
+    if mixed_orientation:
+        fwd = untrimmed.forward_sequences.view(FastqGzFormat)
+        rev = untrimmed.reverse_sequences.view(FastqGzFormat)
+
+        remaining_seqs = MultiplexedPairedEndBarcodeInSequenceDirFmt()
+        # fwd -> rev && rev -> fwd
+        remaining_seqs.forward_sequences.write_data(rev, FastqGzFormat)
+        remaining_seqs.reverse_sequences.write_data(fwd, FastqGzFormat)
+
+        untrimmed = _demux(
+            remaining_seqs, per_sample_sequences, forward_barcodes,
+            reverse_barcodes, error_rate, mux_fmt, batch_size,
+            minimum_length)
+
+    return per_sample_sequences, untrimmed


=====================================
q2_cutadapt/_trim.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
q2_cutadapt/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
     # setup.py/versioneer.py will grep for the variable names, so they must
     # each be defined on a line of their own. _version.py will just call
     # get_keywords().
-    git_refnames = " (tag: 2019.10.0)"
-    git_full = "b7f4a9c3769dbea95c437997289ffdf6858e40f9"
-    git_date = "2019-11-01 01:04:25 +0000"
+    git_refnames = " (HEAD -> master, tag: 2020.11.0)"
+    git_full = "8a18174db6b284af087f8bc82b0b717a446aab4e"
+    git_date = "2020-11-25 17:13:09 +0000"
     keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
     return keywords
 


=====================================
q2_cutadapt/plugin_setup.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
@@ -282,6 +282,7 @@ plugin.methods.register_function(
                                     inclusive_end=True),
         'batch_size': Int % Range(0, None),
         'minimum_length': Int % Range(1, None),
+        'mixed_orientation': Bool,
     },
     outputs=[
         ('per_sample_sequences', SampleData[PairedEndSequencesWithQuality]),
@@ -306,6 +307,9 @@ plugin.methods.register_function(
                           'the cutadapt default of 0 has been overridden, '
                           'because that value produces empty sequence '
                           'records.',
+        'mixed_orientation': 'Handle demultiplexing of mixed orientation '
+                             'reads (i.e. when forward and reverse reads '
+                             'coexist in the same file).'
     },
     output_descriptions={
         'per_sample_sequences': 'The resulting demultiplexed sequences.',


=====================================
q2_cutadapt/tests/__init__.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
q2_cutadapt/tests/data/mixed-orientation/forward.fastq.gz
=====================================
Binary files /dev/null and b/q2_cutadapt/tests/data/mixed-orientation/forward.fastq.gz differ


=====================================
q2_cutadapt/tests/data/mixed-orientation/reverse.fastq.gz
=====================================
Binary files /dev/null and b/q2_cutadapt/tests/data/mixed-orientation/reverse.fastq.gz differ


=====================================
q2_cutadapt/tests/test_demux.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
@@ -7,6 +7,7 @@
 # ----------------------------------------------------------------------------
 
 import gzip
+import itertools
 import os
 import pathlib
 import shutil
@@ -33,22 +34,28 @@ from qiime2.plugin.testing import TestPluginBase
 class TestDemuxSingle(TestPluginBase):
     package = 'q2_cutadapt.tests'
 
-    def assert_demux_results(self, exp_samples_and_barcodes, obs_demuxed_art):
+    def assert_demux_results(self, exp_samples_and_barcodes, exp_results,
+                             obs_demuxed_art):
         obs_demuxed = obs_demuxed_art.view(
             SingleLanePerSampleSingleEndFastqDirFmt)
         obs_demuxed_seqs = obs_demuxed.sequences.iter_views(FastqGzFormat)
-        zipped = zip(exp_samples_and_barcodes.iteritems(), obs_demuxed_seqs)
-        for (sample_id, barcode), (filename, _) in zipped:
+        zipped = itertools.zip_longest(exp_samples_and_barcodes.iteritems(),
+                                       exp_results, obs_demuxed_seqs)
+        for (sample_id, barcode), exp, (filename, fmt) in zipped:
             filename = str(filename)
             self.assertTrue(sample_id in filename)
             self.assertTrue(barcode in filename)
+            with gzip.open(str(fmt), 'rt') as fh:
+                obs = ''.join(fh.readlines())
+            self.assertEqual(exp, obs)
 
     def assert_untrimmed_results(self, exp, obs_untrimmed_art):
         obs_untrimmed = obs_untrimmed_art.view(
             MultiplexedSingleEndBarcodeInSequenceDirFmt)
         obs_untrimmed = obs_untrimmed.file.view(FastqGzFormat)
-        obs_untrimmed = gzip.decompress(obs_untrimmed.path.read_bytes())
-        self.assertEqual(exp, obs_untrimmed)
+        with gzip.open(str(obs_untrimmed), 'rt') as fh:
+            obs = ''.join(fh.readlines())
+        self.assertEqual(exp, obs)
 
     def setUp(self):
         super().setUp()
@@ -62,13 +69,21 @@ class TestDemuxSingle(TestPluginBase):
         metadata = CategoricalMetadataColumn(
             pd.Series(['AAAA', 'CCCC'], name='Barcode',
                       index=pd.Index(['sample_a', 'sample_b'], name='id')))
+        exp = [
+            # sample a
+            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample b
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n', ]
 
         with redirected_stdio(stderr=os.devnull):
             obs_demuxed_art, obs_untrimmed_art = \
                 self.demux_single_fn(self.muxed_sequences, metadata)
 
-        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
-        self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        self.assert_untrimmed_results('@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                       obs_untrimmed_art)
 
     def test_all_matched(self):
@@ -76,45 +91,83 @@ class TestDemuxSingle(TestPluginBase):
             pd.Series(['AAAA', 'CCCC', 'GGGG'], name='Barcode',
                       index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                      name='id')))
+        exp = [
+            # sample a
+            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample b
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample c
+            '@id6\nACGTACGT\n+\nzzzzzzzz\n', ]
 
         with redirected_stdio(stderr=os.devnull):
             obs_demuxed_art, obs_untrimmed_art = \
                 self.demux_single_fn(self.muxed_sequences, metadata)
 
-        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
-        # obs_untrimmed should be empty, since everything matched
-        self.assert_untrimmed_results(b'', obs_untrimmed_art)
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        self.assert_untrimmed_results('', obs_untrimmed_art)
 
+    # NOTE: this test used to check for an exception because it was
+    # possible to generate a completely empty output dir with no fastq.gz
+    # files in it. As of cutadapt 3 this is no longer possible, because output
+    # files are generated for every sample (and we must specify at least one
+    # sample in order for the barcodes to be valid QIIME 2 Metadata). Rather
+    # than remove the test, we will retool it here.
     def test_none_matched(self):
         metadata = CategoricalMetadataColumn(
             pd.Series(['TTTT'], name='Barcode',
                       index=pd.Index(['sample_d'], name='id')))
 
         with redirected_stdio(stderr=os.devnull):
-            with self.assertRaisesRegex(ValueError, 'demultiplexed'):
+            obs_demuxed_art, obs_untrimmed_art = \
                 self.demux_single_fn(self.muxed_sequences, metadata)
 
+        self.assert_demux_results(metadata.to_series(), [''], obs_demuxed_art)
+        self.assert_untrimmed_results('@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
+                                      '@id2\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+                                      '@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
+                                      '@id4\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+                                      '@id5\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+                                      '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+                                      obs_untrimmed_art)
+
     def test_error_tolerance_filtering(self):
         metadata = CategoricalMetadataColumn(
             pd.Series(['AAAG', 'CCCC'], name='Barcode',
                       index=pd.Index(['sample_a', 'sample_b'], name='id')))
+        exp = [
+            # sample a has no reads (bc we misspelled the barcode)
+            '',
+            # sample b
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n', ]
 
         with redirected_stdio(stderr=os.devnull):
             obs_demuxed_art, obs_untrimmed_art = \
                 self.demux_single_fn(self.muxed_sequences, metadata)
 
-        # sample_a is dropped because of a substitution error (AAAA vs AAAG)
-        exp_samples_and_barcodes = pd.Series(['CCCC'], index=['sample_b'])
-        self.assert_demux_results(exp_samples_and_barcodes, obs_demuxed_art)
-        self.assert_untrimmed_results(b'@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
-                                      b'@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
-                                      b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+        self.assert_demux_results(metadata.to_series(), exp,
+                                  obs_demuxed_art)
+        self.assert_untrimmed_results('@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
+                                      '@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
+                                      '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                       obs_untrimmed_art)
 
     def test_error_tolerance_high_enough_to_prevent_filtering(self):
         metadata = CategoricalMetadataColumn(
             pd.Series(['AAAG', 'CCCC'], name='Barcode',
                       index=pd.Index(['sample_a', 'sample_b'], name='id')))
+        exp = [
+            # sample a
+            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample b
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n', ]
 
         with redirected_stdio(stderr=os.devnull):
             obs_demuxed_art, obs_untrimmed_art = \
@@ -122,8 +175,8 @@ class TestDemuxSingle(TestPluginBase):
                                      error_rate=0.25)
 
         # This test should yield the same results as test_typical, above
-        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
-        self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        self.assert_untrimmed_results('@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                       obs_untrimmed_art)
 
     def test_extra_barcode_in_metadata(self):
@@ -131,19 +184,29 @@ class TestDemuxSingle(TestPluginBase):
             pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'], name='Barcode',
                       index=pd.Index(['sample_a', 'sample_b', 'sample_c',
                                       'sample_d'], name='id')))
+        exp = [
+            # sample a
+            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample b
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample c
+            '@id6\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample d is empty bc no reads matched the barcode TTTT
+            '', ]
 
         with redirected_stdio(stderr=os.devnull):
             obs_demuxed_art, obs_untrimmed_art = \
                 self.demux_single_fn(self.muxed_sequences, metadata)
 
-        # TTTT/sample_d shouldn't be in the demuxed results, because there
-        # were no reads with that barcode present
-        exp_samples_and_barcodes = pd.Series(['AAAA', 'CCCC', 'GGGG'],
+        exp_samples_and_barcodes = pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'],
                                              index=['sample_a', 'sample_b',
-                                                    'sample_c'])
-        self.assert_demux_results(exp_samples_and_barcodes, obs_demuxed_art)
-        # obs_untrimmed should be empty, since everything matched
-        self.assert_untrimmed_results(b'', obs_untrimmed_art)
+                                                    'sample_c', 'sample_d'])
+        self.assert_demux_results(exp_samples_and_barcodes, exp,
+                                  obs_demuxed_art)
+        self.assert_untrimmed_results('', obs_untrimmed_art)
 
     def test_variable_length_barcodes(self):
         metadata = CategoricalMetadataColumn(
@@ -153,28 +216,46 @@ class TestDemuxSingle(TestPluginBase):
         muxed_sequences_fp = self.get_data_path('variable_length.fastq.gz')
         muxed_sequences = Artifact.import_data(
             'MultiplexedSingleEndBarcodeInSequence', muxed_sequences_fp)
+        exp = [
+            # sample a
+            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample b
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample c
+            '@id6\nACGTACGT\n+\nzzzzzzzz\n', ]
 
         with redirected_stdio(stderr=os.devnull):
             obs_demuxed_art, obs_untrimmed_art = \
                 self.demux_single_fn(muxed_sequences, metadata)
 
-        # This test should yield the same results as test_typical, above, just
-        # with variable length barcodes
-        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
-        self.assert_untrimmed_results(b'', obs_untrimmed_art)
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        self.assert_untrimmed_results('', obs_untrimmed_art)
 
     def test_batch_size(self):
         metadata = CategoricalMetadataColumn(
             pd.Series(['AAAA', 'CCCC'], name='Barcode',
                       index=pd.Index(['sample_a', 'sample_b'], name='id')))
+        exp = [
+            # sample a
+            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample b
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n', ]
 
         with redirected_stdio(stderr=os.devnull):
             obs_demuxed_art, obs_untrimmed_art = \
                 self.demux_single_fn(self.muxed_sequences, metadata,
                                      batch_size=1)
 
-        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
-        self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+        # This test should yield the same results as test_typical, above,
+        # the fact that we are batching shouldn't impact the final results
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        self.assert_untrimmed_results('@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                       obs_untrimmed_art)
 
     def test_invalid_batch_size(self):
@@ -190,15 +271,24 @@ class TestDemuxSingle(TestPluginBase):
             pd.Series(['AAAA', 'CCCC', 'GGGG'], name='Barcode',
                       index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                       name='id')))
+        exp = [
+            # sample a
+            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample b
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample c
+            '@id6\nACGTACGT\n+\nzzzzzzzz\n', ]
 
         with redirected_stdio(stderr=os.devnull):
             obs_demuxed_art, obs_untrimmed_art = \
                 self.demux_single_fn(self.muxed_sequences, metadata,
                                      batch_size=2)
 
-        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
-        # obs_untrimmed should be empty, since everything matched
-        self.assert_untrimmed_results(b'', obs_untrimmed_art)
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        self.assert_untrimmed_results('', obs_untrimmed_art)
 
     def test_min_length(self):
         metadata = CategoricalMetadataColumn(
@@ -207,42 +297,59 @@ class TestDemuxSingle(TestPluginBase):
             pd.Series(['AAAA', 'CCCC', 'GGGGACGTACGT'], name='Barcode',
                       index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                       name='id')))
+        exp = [
+            # sample a
+            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample b
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample c is empty because the barcode matched the entire
+            # read, which removed everything.
+            '', ]
 
         with redirected_stdio(stderr=os.devnull):
             obs_demuxed_art, obs_untrimmed_art = \
                 self.demux_single_fn(self.muxed_sequences, metadata)
 
-        obs = obs_demuxed_art.view(SingleLanePerSampleSingleEndFastqDirFmt)
-
-        (obs_f1, _), (obs_f2, _) = obs.sequences.iter_views(FastqGzFormat)
-
-        self.assertEqual('sample_a_AAAA_L001_R1_001.fastq.gz', str(obs_f1))
-        self.assertEqual('sample_b_CCCC_L001_R1_001.fastq.gz', str(obs_f2))
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        self.assert_untrimmed_results('', obs_untrimmed_art)
 
 
 class TestDemuxPaired(TestPluginBase):
     package = 'q2_cutadapt.tests'
 
-    def assert_demux_results(self, exp_samples_and_barcodes, obs_demuxed_art):
+    def assert_demux_results(self, exp_samples_and_barcodes, exp_results,
+                             obs_demuxed_art):
         obs_demuxed = obs_demuxed_art.view(
             SingleLanePerSamplePairedEndFastqDirFmt)
         obs_demuxed_seqs = obs_demuxed.sequences.iter_views(FastqGzFormat)
         # Since we are working with fwd/rev reads, duplicate each list elem
         exp = [x for x in exp_samples_and_barcodes.iteritems() for _ in (0, 1)]
-        zipped = zip(exp, obs_demuxed_seqs)
-        for (sample_id, barcode), (filename, _) in zipped:
+        zipped = itertools.zip_longest(exp, exp_results, obs_demuxed_seqs)
+        for (sample_id, barcode), exp, (filename, fmt) in zipped:
             filename = str(filename)
             self.assertTrue(sample_id in filename)
             self.assertTrue(barcode in filename)
+            with gzip.open(str(fmt), 'rt') as fh:
+                obs = ''.join(fh.readlines())
+            self.assertEqual(exp, obs)
 
     def assert_untrimmed_results(self, exp, obs_untrimmed_art):
         obs_untrimmed = obs_untrimmed_art.view(
             MultiplexedPairedEndBarcodeInSequenceDirFmt)
+
+        # first check the fwd reads
         obs_untrimmed_f = obs_untrimmed.forward_sequences.view(FastqGzFormat)
-        obs_untrimmed_f = gzip.decompress(obs_untrimmed_f.path.read_bytes())
+        with gzip.open(str(obs_untrimmed_f), 'rt') as fh:
+            obs_untrimmed_f = ''.join(fh.readlines())
         self.assertEqual(exp[0], obs_untrimmed_f)
+
+        # next check the rev reads
         obs_untrimmed_r = obs_untrimmed.reverse_sequences.view(FastqGzFormat)
-        obs_untrimmed_r = gzip.decompress(obs_untrimmed_r.path.read_bytes())
+        with gzip.open(str(obs_untrimmed_r), 'rt') as fh:
+            obs_untrimmed_r = ''.join(fh.readlines())
         self.assertEqual(exp[1], obs_untrimmed_r)
 
     def setUp(self):
@@ -264,14 +371,29 @@ class TestDemuxPaired(TestPluginBase):
         metadata = CategoricalMetadataColumn(
             pd.Series(['AAAA', 'CCCC'], name='Barcode',
                       index=pd.Index(['sample_a', 'sample_b'], name='id')))
+        exp = [
+            # sample a, fwd
+            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample a, rev
+            '@id1\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
+            '@id3\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n',
+            # sample b, fwd
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample b, rev
+            '@id2\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+            '@id4\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+            '@id5\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n', ]
+        exp_untrimmed = ['@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+                         '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n']
 
         with redirected_stdio(stderr=os.devnull):
             obs_demuxed_art, obs_untrimmed_art = \
                 self.demux_paired_fn(self.muxed_sequences, metadata)
 
-        self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
-        exp_untrimmed = [b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
-                         b'@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n']
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
         self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
 
     def test_di_typical(self):
@@ -281,6 +403,23 @@ class TestDemuxPaired(TestPluginBase):
         reverse_barcodes = CategoricalMetadataColumn(
             pd.Series(['GGGG', 'TTTT'], name='ReverseBarcode',
                       index=pd.Index(['sample_a', 'sample_b'], name='id')))
+        exp = [
+            # sample a, fwd
+            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample a, rev
+            '@id1\nTGCATGCA\n+\nzzzzzzzz\n'
+            '@id3\nTGCATGCA\n+\nzzzzzzzz\n',
+            # sample b, fwd
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample b, rev
+            '@id2\nTGCATGCA\n+\nzzzzzzzz\n'
+            '@id4\nTGCATGCA\n+\nzzzzzzzz\n'
+            '@id5\nTGCATGCA\n+\nzzzzzzzz\n', ]
+        exp_untrimmed = ['@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+                         '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n']
 
         with redirected_stdio(stderr=os.devnull):
             obs_demuxed_art, obs_untrimmed_art = \
@@ -288,12 +427,66 @@ class TestDemuxPaired(TestPluginBase):
                                      forward_barcodes=forward_barcodes,
                                      reverse_barcodes=reverse_barcodes)
 
-        self.assert_demux_results(forward_barcodes.to_series(),
+        self.assert_demux_results(forward_barcodes.to_series(), exp,
                                   obs_demuxed_art)
-        exp_untrimmed = [b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
-                         b'@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n']
         self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
 
+    def test_mixed_orientation_success(self):
+        # sample_a and sample_b have reads in both fwd and rev directions.
+        # sample_c only has reads in the fwd direction.
+        # sample_d only has reads in the rev direction.
+        forward_barcodes = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'], name='ForwardBarcode',
+                      index=pd.Index(['sample_a', 'sample_b', 'sample_c',
+                                      'sample_d'], name='id')))
+        mixed_orientation_sequences_f_fp = self.get_data_path(
+            'mixed-orientation/forward.fastq.gz')
+        mixed_orientation_sequences_r_fp = self.get_data_path(
+            'mixed-orientation/reverse.fastq.gz')
+        with tempfile.TemporaryDirectory() as temp:
+            shutil.copy(mixed_orientation_sequences_f_fp, temp)
+            shutil.copy(mixed_orientation_sequences_r_fp, temp)
+            mixed_orientation_sequences = Artifact.import_data(
+                'MultiplexedPairedEndBarcodeInSequence', temp)
+
+        with redirected_stdio(stderr=os.devnull):
+            obs_demuxed_art, obs_untrimmed_art = \
+                self.demux_paired_fn(mixed_orientation_sequences,
+                                     forward_barcodes=forward_barcodes,
+                                     mixed_orientation=True)
+        exp = [
+            # sample_a fwd
+            '@id1\nACGTACGT\n+\nyyyyyyyy\n' \
+            '@id3\nACGTACGT\n+\nyyyyyyyy\n',
+            # sample_a rev
+            '@id1\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n' \
+            '@id3\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
+            # sample_b fwd
+            '@id4\nACGTACGT\n+\nyyyyyyyy\n' \
+            '@id2\nACGTACGT\n+\nyyyyyyyy\n',
+            # sample_b rev
+            '@id4\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n' \
+            '@id2\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
+            # sample_c fwd
+            '@id5\nACGTACGT\n+\nyyyyyyyy\n',
+            # sample_c rev
+            '@id5\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
+            # sample_d fwd
+            '@id6\nACGTACGT\n+\nyyyyyyyy\n',
+            # sample_d rev
+            '@id6\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n', ]
+
+        # We want to be sure that the validation is 100%, not just `min`.
+        obs_demuxed_art.validate(level='max')
+        # checkpoint assertion for the above `validate` - nothing should fail
+        self.assertTrue(True)
+
+        self.assert_demux_results(forward_barcodes.to_series(), exp,
+                                  obs_demuxed_art)
+
+        # Everything should match, so untrimmed should be empty
+        self.assert_untrimmed_results(['', ''], obs_untrimmed_art)
+
     def test_di_mismatched_barcodes(self):
         forward_barcodes = CategoricalMetadataColumn(
             pd.Series(['AAAA', 'CCCC', 'ACGT'], name='ForwardBarcode',
@@ -326,6 +519,36 @@ class TestDemuxPaired(TestPluginBase):
                                  forward_barcodes=forward_barcodes,
                                  reverse_barcodes=reverse_barcodes)
 
+    def test_multiple_orientations_dual_indices(self):
+        forward_barcodes = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCC'], name='ForwardBarcode',
+                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
+        reverse_barcodes = CategoricalMetadataColumn(
+            pd.Series(['GGGG', 'TTTT'], name='ReverseBarcode',
+                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
+
+        mixed_orientation_sequences_f_fp = self.get_data_path(
+            'mixed-orientation/forward.fastq.gz')
+        mixed_orientation_sequences_r_fp = self.get_data_path(
+            'mixed-orientation/reverse.fastq.gz')
+
+        # These files have forward and reverse reads mixed together in the same
+        # file
+        with tempfile.TemporaryDirectory() as temp:
+            shutil.copy(mixed_orientation_sequences_f_fp, temp)
+            shutil.copy(mixed_orientation_sequences_r_fp, temp)
+            mixed_orientation_sequences = Artifact.import_data(
+                'MultiplexedPairedEndBarcodeInSequence', temp)
+
+        with self.assertRaisesRegex(ValueError,
+                                    'Dual-indexed barcodes for mixed '
+                                    'orientation reads are not supported.'):
+            obs_demuxed_art, obs_untrimmed_art = \
+                self.demux_paired_fn(mixed_orientation_sequences,
+                                     forward_barcodes=forward_barcodes,
+                                     reverse_barcodes=reverse_barcodes,
+                                     mixed_orientation=True)
+
 
 class TestDemuxUtilsSingleEnd(TestPluginBase):
     package = 'q2_cutadapt.tests'
@@ -360,7 +583,7 @@ class TestDemuxUtilsSingleEnd(TestPluginBase):
                          obs[11])
 
     def test_rename_files_single(self):
-        for fn in ['sample_a.fastq.gz', 'sample_b.fastq.gz']:
+        for fn in ['sample_a.1.fastq.gz', 'sample_b.1.fastq.gz']:
             shutil.copy(self.fastq_fp,
                         str(self.per_sample_dir_fmt.path / pathlib.Path(fn)))
 
@@ -368,16 +591,19 @@ class TestDemuxUtilsSingleEnd(TestPluginBase):
                       self.barcode_series)
 
         seqs = self.per_sample_dir_fmt.sequences.iter_views(FastqGzFormat)
+        counter = 0
         for fn, (sample_id, barcode) in zip(seqs,
                                             self.barcode_series.iteritems()):
             self.assertTrue(sample_id in str(fn))
             self.assertTrue(barcode in str(fn))
+            counter += 1
+        self.assertEqual(counter, 2)
 
     def test_rename_files_extra_samples_in_barcode_map(self):
         barcode_series = pd.Series(['A', 'G', 'C'],
                                    index=['sample_a', 'sample_b', 'sample_c'])
 
-        for fn in ['sample_a.fastq.gz', 'sample_b.fastq.gz']:
+        for fn in ['sample_a.1.fastq.gz', 'sample_b.1.fastq.gz']:
             shutil.copy(self.fastq_fp,
                         str(self.per_sample_dir_fmt.path / pathlib.Path(fn)))
 
@@ -385,9 +611,12 @@ class TestDemuxUtilsSingleEnd(TestPluginBase):
                       barcode_series)
 
         seqs = self.per_sample_dir_fmt.sequences.iter_views(FastqGzFormat)
+        counter = 0
         for fn, (sample_id, barcode) in zip(seqs, barcode_series.iteritems()):
             self.assertTrue(sample_id in str(fn))
             self.assertTrue(barcode in str(fn))
+            counter += 1
+        self.assertEqual(counter, 2)
 
     def test_write_empty_fastq_to_mux_barcode_in_seq_fmt(self):
         _write_empty_fastq_to_mux_barcode_in_seq_fmt(self.untrimmed_dir_fmt)
@@ -471,9 +700,12 @@ class TestDemuxUtilsPairedEnd(TestPluginBase):
         seqs = self.per_sample_dir_fmt.sequences.iter_views(FastqGzFormat)
         exp = [('sample_a', 'A'), ('sample_a', 'A'),
                ('sample_b', 'G'), ('sample_b', 'G')]
+        counter = 0
         for fn, (sample_id, barcode) in zip(seqs, exp):
             self.assertTrue(sample_id in str(fn))
             self.assertTrue(barcode in str(fn))
+            counter += 1
+        self.assertEqual(counter, 4)
 
     def test_rename_files_extra_samples_in_barcode_map(self):
         barcode_series = pd.Series(['A', 'G', 'C'],
@@ -490,9 +722,12 @@ class TestDemuxUtilsPairedEnd(TestPluginBase):
         seqs = self.per_sample_dir_fmt.sequences.iter_views(FastqGzFormat)
         exp = [('sample_a', 'A'), ('sample_a', 'A'),
                ('sample_b', 'G'), ('sample_b', 'G')]
+        counter = 0
         for fn, (sample_id, barcode) in zip(seqs, exp):
             self.assertTrue(sample_id in str(fn))
             self.assertTrue(barcode in str(fn))
+            counter += 1
+        self.assertEqual(counter, 4)
 
     def test_write_empty_fastq_to_mux_barcode_in_seq_fmt(self):
         _write_empty_fastq_to_mux_barcode_in_seq_fmt(self.untrimmed_dir_fmt)


=====================================
q2_cutadapt/tests/test_trim.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #


=====================================
setup.py
=====================================
@@ -1,5 +1,5 @@
 # ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
 #
 # Distributed under the terms of the Modified BSD License.
 #
@@ -31,6 +31,7 @@ setup(
             'data/single-end/*',
             'data/paired-end/*',
             'data/paired-end-unordered/*',
+            'data/mixed-orientation/*',
         ],
     },
     zip_safe=False,



View it on GitLab: https://salsa.debian.org/med-team/q2-cutadapt/-/compare/4a7d2a06004d319eabd45ef2ac451a2f48fa6802...34f17f14522275fdab535c7539c4754017903da4

-- 
View it on GitLab: https://salsa.debian.org/med-team/q2-cutadapt/-/compare/4a7d2a06004d319eabd45ef2ac451a2f48fa6802...34f17f14522275fdab535c7539c4754017903da4
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201201/04ae7699/attachment-0001.html>


More information about the debian-med-commit mailing list