[med-svn] [Git][med-team/q2-cutadapt][upstream] New upstream version 2024.2.0

Andreas Tille (@tille) gitlab at salsa.debian.org
Sun Feb 18 13:42:37 GMT 2024



Andreas Tille pushed to branch upstream at Debian Med / q2-cutadapt


Commits:
5605a03f by Andreas Tille at 2024-02-18T14:38:19+01:00
New upstream version 2024.2.0
- - - - -


6 changed files:

- .github/workflows/ci-dev.yaml
- README.md
- q2_cutadapt/_demux.py
- q2_cutadapt/_version.py
- q2_cutadapt/plugin_setup.py
- q2_cutadapt/tests/test_demux.py


Changes:

=====================================
.github/workflows/ci-dev.yaml
=====================================
@@ -9,4 +9,4 @@ jobs:
   ci:
     uses: qiime2/distributions/.github/workflows/lib-ci-dev.yaml@dev
     with:
-      distro: core
\ No newline at end of file
+      distro: amplicon


=====================================
README.md
=====================================
@@ -1,5 +1,5 @@
 # q2-cutadapt
 
-![](https://github.com/qiime2/q2-cutadapt/workflows/ci/badge.svg)
+![](https://github.com/qiime2/q2-cutadapt/workflows/ci-dev/badge.svg)
 
 This is a QIIME 2 plugin. For details on QIIME 2, see https://qiime2.org.
\ No newline at end of file


=====================================
q2_cutadapt/_demux.py
=====================================
@@ -40,9 +40,12 @@ def run_command(cmd, verbose=True):
 
 def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
                          untrimmed_dir_fmt, error_rate, minimum_length,
+                         forward_cut=0, reverse_cut=0,
+                         anchor_forward=False, anchor_reverse=False,
                          cores=1):
     cmd = ['cutadapt',
-           '--front', 'file:%s' % barcode_fhs['fwd'].name,
+           '-g',
+           f'{"^" if anchor_forward else ""}file:{barcode_fhs["fwd"].name}',
            '--error-rate', str(error_rate),
            '--minimum-length', str(minimum_length),
            # {name} is a cutadapt convention for interpolating the sample id
@@ -57,7 +60,8 @@ def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
             # Dual indices
             cmd += [
                 '--pair-adapters',
-                '-G', 'file:%s' % barcode_fhs['rev'].name,
+                '-G',
+                f'{"^" if anchor_reverse else ""}file:{barcode_fhs["rev"].name}',  # noqa: E501
             ]
         cmd += [
             '-p', os.path.join(str(per_sample_dir_fmt), '{name}.2.fastq.gz'),
@@ -65,12 +69,16 @@ def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
             os.path.join(str(untrimmed_dir_fmt), 'reverse.fastq.gz'),
             str(seqs_dir_fmt.forward_sequences.view(FastqGzFormat)),
             str(seqs_dir_fmt.reverse_sequences.view(FastqGzFormat)),
-            ]
+            '-U', str(reverse_cut),
+        ]
     else:
         # SINGLE-END
         cmd += [str(seqs_dir_fmt.file.view(FastqGzFormat))]
 
-    cmd += ['-j', str(cores)]
+    cmd += [
+        '-u', str(forward_cut),
+        '-j', str(cores)
+    ]
     return cmd
 
 
@@ -191,7 +199,8 @@ def _check_barcodes_uniqueness(
 
 
 def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
-           error_tolerance, mux_fmt, batch_size, minimum_length, cores):
+           error_tolerance, mux_fmt, batch_size, minimum_length, forward_cut,
+           reverse_cut, anchor_forward, anchor_reverse, cores):
     fwd_barcode_name = forward_barcodes.name
     forward_barcodes = forward_barcodes.drop_missing_values()
     barcodes = forward_barcodes.to_series().to_frame()
@@ -217,10 +226,11 @@ def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
             open_fhs['rev'] = tempfile.NamedTemporaryFile()
             _write_barcode_fasta(barcode_batch[rev_barcode_name],
                                  open_fhs['rev'])
-        cmd = _build_demux_command(previous_untrimmed, open_fhs,
-                                   per_sample_sequences,
-                                   current_untrimmed, error_tolerance,
-                                   minimum_length, cores)
+        cmd = _build_demux_command(
+            previous_untrimmed, open_fhs, per_sample_sequences,
+            current_untrimmed, error_tolerance, minimum_length, forward_cut,
+            reverse_cut, anchor_forward, anchor_reverse, cores
+        )
         run_command(cmd)
         open_fhs['fwd'].close()
         if reverse_barcodes is not None:
@@ -237,6 +247,8 @@ def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
 
 def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
                  barcodes: qiime2.CategoricalMetadataColumn,
+                 cut: int = 0,
+                 anchor_barcode: bool = False,
                  error_rate: float = 0.1,
                  batch_size: int = 0,
                  minimum_length: int = 1,
@@ -248,15 +260,53 @@ def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
     mux_fmt = MultiplexedSingleEndBarcodeInSequenceDirFmt
 
     untrimmed = _demux(
-        seqs, per_sample_sequences, barcodes, None, error_rate, mux_fmt,
-        batch_size, minimum_length, cores)
+        seqs, per_sample_sequences, barcodes, None, error_rate,
+        mux_fmt, batch_size, minimum_length, cut, 0, anchor_barcode, False,
+        cores)
 
     return per_sample_sequences, untrimmed
 
 
+def _check_paired_requirements(loc):
+    mixed_orientation = loc.get("mixed_orientation", None)
+    forward_cut = loc.get("forward_cut", 0)
+    reverse_cut = loc.get("reverse_cut", 0)
+    reverse_barcodes = loc.get("reverse_barcodes", None)
+    anchor_forward_barcode = loc.get("anchor_forward_barcode", False)
+    anchor_reverse_barcode = loc.get("anchor_reverse_barcode", False)
+
+    if (
+        not mixed_orientation
+        and anchor_reverse_barcode and (reverse_barcodes is None)
+    ):
+        raise ValueError("A reverse barcode needs to be provided in order to "
+                         "anchor the reverse barcode.")
+
+    if (
+        mixed_orientation
+        and forward_cut != reverse_cut
+    ):
+        raise ValueError("'forward_cut' and 'reverse_cut' need to be set to "
+                         "the same number when using the 'mixed_orientation' "
+                         "mode.")
+
+    if (
+        mixed_orientation
+        and anchor_forward_barcode != anchor_reverse_barcode
+    ):
+        raise ValueError(
+            "'anchor_forward_barcode' and 'anchor_reverse_barcode' need to be "
+            "set to the same value when using the 'mixed_orientation' mode."
+        )
+
+
 def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
                  forward_barcodes: qiime2.CategoricalMetadataColumn,
                  reverse_barcodes: qiime2.CategoricalMetadataColumn = None,
+                 forward_cut: int = 0,
+                 reverse_cut: int = 0,
+                 anchor_forward_barcode: bool = False,
+                 anchor_reverse_barcode: bool = False,
                  error_rate: float = 0.1,
                  batch_size: int = 0,
                  minimum_length: int = 1,
@@ -266,13 +316,15 @@ def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
                      MultiplexedPairedEndBarcodeInSequenceDirFmt):
     _check_barcodes_uniqueness(
         forward_barcodes, reverse_barcodes, mixed_orientation)
+    _check_paired_requirements(locals())
 
     per_sample_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
     mux_fmt = MultiplexedPairedEndBarcodeInSequenceDirFmt
 
     untrimmed = _demux(
         seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
-        error_rate, mux_fmt, batch_size, minimum_length, cores)
+        error_rate, mux_fmt, batch_size, minimum_length, forward_cut,
+        reverse_cut, anchor_forward_barcode, anchor_reverse_barcode, cores)
 
     if mixed_orientation:
         fwd = untrimmed.forward_sequences.view(FastqGzFormat)
@@ -282,10 +334,12 @@ def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
         # fwd -> rev && rev -> fwd
         remaining_seqs.forward_sequences.write_data(rev, FastqGzFormat)
         remaining_seqs.reverse_sequences.write_data(fwd, FastqGzFormat)
-
+        # Cuts have already been performed during the first demux pass, set
+        #  forward and reverse cut to 0
         untrimmed = _demux(
             remaining_seqs, per_sample_sequences, forward_barcodes,
             reverse_barcodes, error_rate, mux_fmt, batch_size,
-            minimum_length, cores)
+            minimum_length, 0, 0, anchor_reverse_barcode,
+            anchor_forward_barcode, cores)
 
     return per_sample_sequences, untrimmed


=====================================
q2_cutadapt/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
     # setup.py/versioneer.py will grep for the variable names, so they must
     # each be defined on a line of their own. _version.py will just call
     # get_keywords().
-    git_refnames = " (tag: 2023.9.0, Release-2023.9)"
-    git_full = "4098881c977e2a54475e580e65b40c15c92c490d"
-    git_date = "2023-10-03 21:51:11 +0000"
+    git_refnames = " (tag: 2024.2.0, Release-2024.2)"
+    git_full = "aee8d31a9f1517824d9bda653dc525a8c8721cfb"
+    git_date = "2024-02-16 21:55:34 +0000"
     keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
     return keywords
 


=====================================
q2_cutadapt/plugin_setup.py
=====================================
@@ -17,6 +17,7 @@ from qiime2.plugin import (
     List,
     Str,
     Bool,
+    Threads,
 )
 from q2_types.multiplexed_sequences import (
     MultiplexedSingleEndBarcodeInSequence,
@@ -53,7 +54,7 @@ plugin.methods.register_function(
         'demultiplexed_sequences': SampleData[SequencesWithQuality],
     },
     parameters={
-        'cores': Int % Range(1, None),
+        'cores': Threads,
         'adapter': List[Str],
         'front': List[Str],
         'anywhere': List[Str],
@@ -144,7 +145,7 @@ plugin.methods.register_function(
         'demultiplexed_sequences': SampleData[PairedEndSequencesWithQuality],
     },
     parameters={
-        'cores': Int % Range(1, None),
+        'cores': Threads,
         'adapter_f': List[Str],
         'front_f': List[Str],
         'anywhere_f': List[Str],
@@ -263,9 +264,11 @@ plugin.methods.register_function(
         'barcodes': MetadataColumn[Categorical],
         'error_rate': Float % Range(0, 1, inclusive_start=True,
                                     inclusive_end=True),
+        'anchor_barcode': Bool,
         'batch_size': Int % Range(0, None),
         'minimum_length': Int % Range(1, None),
-        'cores': Int % Range(1, None),
+        'cut': Int,
+        'cores': Threads,
     },
     outputs=[
         ('per_sample_sequences', SampleData[SequencesWithQuality]),
@@ -282,6 +285,10 @@ plugin.methods.register_function(
                       'allowable error rate. The default value specified by '
                       'cutadapt is 0.1 (=10%), which is greater than '
                       '`demux emp-*`, which is 0.0 (=0%).',
+        'anchor_barcode': 'Anchor the barcode. The barcode is then '
+                          'expected to occur in full length at the beginning '
+                          '(5\' end) of the sequence. Can speed up '
+                          'demultiplexing if used.',
         'batch_size': 'The number of samples cutadapt demultiplexes '
                       'concurrently. Demultiplexing in smaller batches will '
                       'yield the same result with marginal speed loss, and '
@@ -291,6 +298,11 @@ plugin.methods.register_function(
                           'the cutadapt default of 0 has been overridden, '
                           'because that value produces empty sequence '
                           'records.',
+        'cut': 'Remove the specified number of bases from the sequences. '
+               'Bases are removed before demultiplexing. If a positive value '
+               'is provided, bases are removed from the beginning of the '
+               'sequences. If a negative value is provided, bases are removed '
+               'from the end of the sequences.',
     },
     output_descriptions={
         'per_sample_sequences': 'The resulting demultiplexed sequences.',
@@ -314,12 +326,16 @@ plugin.methods.register_function(
     parameters={
         'forward_barcodes': MetadataColumn[Categorical],
         'reverse_barcodes': MetadataColumn[Categorical],
+        'forward_cut': Int,
+        'reverse_cut': Int,
         'error_rate': Float % Range(0, 1, inclusive_start=True,
                                     inclusive_end=True),
+        'anchor_forward_barcode': Bool,
+        'anchor_reverse_barcode': Bool,
         'batch_size': Int % Range(0, None),
         'minimum_length': Int % Range(1, None),
         'mixed_orientation': Bool,
-        'cores': Int % Range(1, None),
+        'cores': Threads,
     },
     outputs=[
         ('per_sample_sequences', SampleData[PairedEndSequencesWithQuality]),
@@ -334,6 +350,32 @@ plugin.methods.register_function(
                             'per-sample barcodes for the forward reads.',
         'reverse_barcodes': 'The sample metadata column listing the '
                             'per-sample barcodes for the reverse reads.',
+        'forward_cut': 'Remove the specified number of bases from the forward '
+                       'sequences. Bases are removed before demultiplexing. '
+                       'If a positive value is provided, bases are removed '
+                       'from the beginning of the sequences. If a negative '
+                       'value is provided, bases are removed from the end of '
+                       'the sequences. If --p-mixed-orientation is set, then '
+                       'both --p-forward-cut and --p-reverse-cut must be '
+                       'set to the same value.',
+        'reverse_cut': 'Remove the specified number of bases from the reverse '
+                       'sequences. Bases are removed before demultiplexing. '
+                       'If a positive value is provided, bases are removed '
+                       'from the beginning of the sequences. If a negative '
+                       'value is provided, bases are removed from the end of '
+                       'the sequences. If --p-mixed-orientation is set, then '
+                       'both --p-forward-cut and --p-reverse-cut must be '
+                       'set to the same value.',
+        'anchor_forward_barcode': 'Anchor the forward barcode. The '
+                                  'barcode is then expected to occur in full '
+                                  'length at the beginning (5\' end) of the '
+                                  'forward sequence. Can speed up '
+                                  'demultiplexing if used.',
+        'anchor_reverse_barcode': 'Anchor the reverse barcode. The '
+                                  'barcode is then expected to occur in full '
+                                  'length at the beginning (5\' end) of the '
+                                  'reverse sequence. Can speed up '
+                                  'demultiplexing if used.',
         'error_rate': 'The level of error tolerance, specified as the maximum '
                       'allowable error rate.',
         'batch_size': 'The number of samples cutadapt demultiplexes '
@@ -347,7 +389,7 @@ plugin.methods.register_function(
                           'records.',
         'mixed_orientation': 'Handle demultiplexing of mixed orientation '
                              'reads (i.e. when forward and reverse reads '
-                             'coexist in the same file).'
+                             'coexist in the same file).',
     },
     output_descriptions={
         'per_sample_sequences': 'The resulting demultiplexed sequences.',


=====================================
q2_cutadapt/tests/test_demux.py
=====================================
@@ -316,6 +316,135 @@ class TestDemuxSingle(TestPluginBase):
         self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
         self.assert_untrimmed_results('', obs_untrimmed_art)
 
+    def test_cut_positive(self):
+        metadata = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCC'], name='Barcode',
+                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
+
+        exp = [
+            # sample a passed. However, as the first 'A' was removed, there is
+            #  a shift in the extracted sequence.
+            '@id1\nCGTACGT\n+\nzzzzzzz\n'  # vs ACGTACGT in `test_typical`
+            '@id3\nCGTACGT\n+\nzzzzzzz\n',
+            # sample b passed because by default cutadapt allows matching
+            #  non-complete barcodes to the targeted sequences.
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n', ]
+        # Expected untrimmed sequence is the same as in `test_typical`, only
+        #  with its first nucleotide removed.
+        exp_untrimmed = \
+            '@id6\nGGGACGTACGT\n+\nzzzzzzzzzzz\n'
+
+        with redirected_stdio(stderr=os.devnull):
+            obs_demuxed_art, obs_untrimmed_art = \
+                self.demux_single_fn(self.muxed_sequences, metadata, cut=1)
+
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
+    def test_cut_positive_overcut(self):
+        metadata = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCC'], name='Barcode',
+                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
+
+        exp = [
+            # sample a passed. However, as the first 'A' was removed, there is
+            #  a shift in the extracted sequence.
+            '@id1\nCGTACGT\n+\nzzzzzzz\n'  # vs ACGTACGT in `test_typical`
+            '@id3\nCGTACGT\n+\nzzzzzzz\n',
+            # Sample b is empty as with the first two nucleotides cut, the
+            #  overlap between barcode and sequence is 2 (under the threshold
+            #  of 3)
+            '', ]
+        exp_untrimmed = (
+            '@id2\nCCACGTACGT\n+\nzzzzzzzzzz\n'
+            '@id4\nCCACGTACGT\n+\nzzzzzzzzzz\n'
+            '@id5\nCCACGTACGT\n+\nzzzzzzzzzz\n'
+            '@id6\nGGACGTACGT\n+\nzzzzzzzzzz\n'
+        )
+
+        with redirected_stdio(stderr=os.devnull):
+            obs_demuxed_art, obs_untrimmed_art = \
+                self.demux_single_fn(self.muxed_sequences, metadata, cut=2)
+
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
+    def test_cut_negative(self):
+        metadata = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCC'], name='Barcode',
+                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
+        # Expected demux and untrimmed sequences are the same as in
+        #  `test_typical`, only with their last two nucleotides removed.
+        exp = [
+            # sample a
+            '@id1\nACGTAC\n+\nzzzzzz\n'
+            '@id3\nACGTAC\n+\nzzzzzz\n',
+            # sample b
+            '@id2\nACGTAC\n+\nzzzzzz\n'
+            '@id4\nACGTAC\n+\nzzzzzz\n'
+            '@id5\nACGTAC\n+\nzzzzzz\n', ]
+
+        exp_untrimmed = \
+            '@id6\nGGGGACGTAC\n+\nzzzzzzzzzz\n'
+
+        with redirected_stdio(stderr=os.devnull):
+            obs_demuxed_art, obs_untrimmed_art = \
+                self.demux_single_fn(self.muxed_sequences, metadata, cut=-2)
+
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
+    def test_anchored(self):
+        metadata = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCA'], name='Barcode',
+                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
+        exp = [
+            # sample a
+            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample b is empty because the first 'C' from the sequence is
+            #  not in the barcode (sequence is 'CCCCACGTACGT')
+            '',
+        ]
+
+        with redirected_stdio(stderr=os.devnull):
+            obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
+                self.muxed_sequences, metadata, anchor_barcode=True)
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        self.assert_untrimmed_results('@id2\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+                                      '@id4\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+                                      '@id5\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+                                      '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+                                      obs_untrimmed_art)
+
+    def test_anchored_cut(self):
+        metadata = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCC'], name='Barcode',
+                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
+        exp = [
+            # sample a passed. However, as the first 'A' was removed, there was
+            #  a shift in the extracted sequence.
+            '@id1\nCGTACGT\n+\nzzzzzzz\n'  # vs ACGTACGT in other tests
+            '@id3\nCGTACGT\n+\nzzzzzzz\n',
+            # sample b is empty because the removal of the first base only left
+            #  'CCC' from the original barcode.
+            '',
+        ]
+
+        with redirected_stdio(stderr=os.devnull):
+            obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
+                self.muxed_sequences, metadata, cut=1, anchor_barcode=True)
+
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        # Rem: the first base was removed from all the sequences
+        self.assert_untrimmed_results('@id2\nCCCACGTACGT\n+\nzzzzzzzzzzz\n'
+                                      '@id4\nCCCACGTACGT\n+\nzzzzzzzzzzz\n'
+                                      '@id5\nCCCACGTACGT\n+\nzzzzzzzzzzz\n'
+                                      '@id6\nGGGACGTACGT\n+\nzzzzzzzzzzz\n',
+                                      obs_untrimmed_art)
+
 
 class TestDemuxPaired(TestPluginBase):
     package = 'q2_cutadapt.tests'
@@ -396,6 +525,81 @@ class TestDemuxPaired(TestPluginBase):
         self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
         self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
 
+    def test_cut(self):
+        metadata = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCC'], name='Barcode',
+                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
+        exp = [
+            # sample a, fwd
+            '@id1\nCGTACGT\n+\nzzzzzzz\n'
+            '@id3\nCGTACGT\n+\nzzzzzzz\n',
+            # sample a, rev
+            '@id1\nGGGGTGCATG\n+\nzzzzzzzzzz\n'
+            '@id3\nGGGGTGCATG\n+\nzzzzzzzzzz\n',
+            # sample b, fwd
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample b, rev
+            '@id2\nTTTTTGCATG\n+\nzzzzzzzzzz\n'
+            '@id4\nTTTTTGCATG\n+\nzzzzzzzzzz\n'
+            '@id5\nTTTTTGCATG\n+\nzzzzzzzzzz\n', ]
+        exp_untrimmed = [
+            '@id6\nGGGACGTACGT\n+\nzzzzzzzzzzz\n',
+            '@id6\nTTTTTGCATG\n+\nzzzzzzzzzz\n'
+        ]
+
+        # Test a positive cut in forward sequences and a negative cut in
+        #  reverse at the same time
+        with redirected_stdio(stderr=os.devnull):
+            obs_demuxed_art, obs_untrimmed_art = \
+                self.demux_paired_fn(self.muxed_sequences,
+                                     forward_barcodes=metadata,
+                                     forward_cut=1,
+                                     reverse_cut=-2)
+
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
+    def test_anchored(self):
+        metadata = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCA'], name='Barcode',
+                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
+        exp = [
+            # sample a, fwd
+            '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample a, rev
+            '@id1\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
+            '@id3\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n',
+            # sample b, fwd is empty because the first 'C' from the sequence is
+            #  not in the barcode (sequence is 'CCCCACGTACGT')
+            '',
+            # sample b, rev is empty for the same reason
+            '', ]
+
+        exp_untrimmed = [
+                '@id2\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+                '@id4\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+                '@id5\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+                '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+                '@id2\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+                '@id4\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+                '@id5\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+                '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+            ]
+
+        # Test anchoring of the forward barcode when no reverse barcode is
+        #  provided
+        with redirected_stdio(stderr=os.devnull):
+            obs_demuxed_art, obs_untrimmed_art = \
+                self.demux_paired_fn(self.muxed_sequences,
+                                     forward_barcodes=metadata,
+                                     anchor_forward_barcode=True)
+
+        self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+        self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
     def test_dual_index_success(self):
         forward_barcodes = CategoricalMetadataColumn(
             pd.Series(['AAAA', 'CCCC'], name='ForwardBarcode',
@@ -414,7 +618,7 @@ class TestDemuxPaired(TestPluginBase):
             '@id2\nACGTACGT\n+\nzzzzzzzz\n'
             '@id4\nACGTACGT\n+\nzzzzzzzz\n'
             '@id5\nACGTACGT\n+\nzzzzzzzz\n',
-            # sample a, rev
+            # sample b, rev
             '@id2\nTGCATGCA\n+\nzzzzzzzz\n'
             '@id4\nTGCATGCA\n+\nzzzzzzzz\n'
             '@id5\nTGCATGCA\n+\nzzzzzzzz\n', ]
@@ -431,6 +635,65 @@ class TestDemuxPaired(TestPluginBase):
                                   obs_demuxed_art)
         self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
 
+    def test_dual_index_anchored(self):
+        forward_barcodes = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCC', 'GGGA'], name='ForwardBarcode',
+                      index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
+                                     name='id')))
+        reverse_barcodes = CategoricalMetadataColumn(
+            pd.Series(['GGGT', 'TTTT', 'TTTT'], name='ReverseBarcode',
+                      index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
+                                     name='id')))
+        exp = [
+            # sample a, fwd is empty because of reverse anchoring
+            '',
+            # sample a, rev is empty because of reverse anchoring
+            '',
+            # sample b, fwd
+            '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+            '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+            # sample b, rev
+            '@id2\nTGCATGCA\n+\nzzzzzzzz\n'
+            '@id4\nTGCATGCA\n+\nzzzzzzzz\n'
+            '@id5\nTGCATGCA\n+\nzzzzzzzz\n',
+            # sample c, fwd is empty because of forward anchoring,
+            '',
+            # sample c, rev is empty because of forward anchoring,
+            '',
+        ]
+        exp_untrimmed = [
+            '@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
+            '@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
+            '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+            '@id1\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
+            '@id3\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
+            '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+        ]
+
+        with redirected_stdio(stderr=os.devnull):
+            obs_demuxed_art, obs_untrimmed_art = \
+                self.demux_paired_fn(self.muxed_sequences,
+                                     forward_barcodes=forward_barcodes,
+                                     reverse_barcodes=reverse_barcodes,
+                                     anchor_forward_barcode=True,
+                                     anchor_reverse_barcode=True)
+
+        self.assert_demux_results(forward_barcodes.to_series(), exp,
+                                  obs_demuxed_art)
+        self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
+    def test_dual_index_anchor_fail_no_reverse(self):
+        metadata = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCA'], name='Barcode',
+                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
+
+        with self.assertRaises(ValueError):
+            self.demux_paired_fn(self.muxed_sequences,
+                                 forward_barcodes=metadata,
+                                 anchor_forward_barcode=True,
+                                 anchor_reverse_barcode=True)
+
     def test_dual_index_mixed_orientation_success(self):
         forward_barcodes = CategoricalMetadataColumn(
             pd.Series(['AAAA', 'CCCC'], name='ForwardBarcode',
@@ -466,7 +729,7 @@ class TestDemuxPaired(TestPluginBase):
             '@id2\nACGTACGT\n+\nzzzzzzzz\n'
             '@id4\nACGTACGT\n+\nzzzzzzzz\n'
             '@id5\nACGTACGT\n+\nzzzzzzzz\n',
-            # sample a, rev
+            # sample b, rev
             '@id2\nTGCATGCA\n+\nzzzzzzzz\n'
             '@id4\nTGCATGCA\n+\nzzzzzzzz\n'
             '@id5\nTGCATGCA\n+\nzzzzzzzz\n', ]
@@ -536,6 +799,131 @@ class TestDemuxPaired(TestPluginBase):
         # Everything should match, so untrimmed should be empty
         self.assert_untrimmed_results(['', ''], obs_untrimmed_art)
 
+    def test_mixed_orientation_cut(self):
+        # sample_a and sample_b have reads in both fwd and rev directions.
+        # sample_c only has reads in the fwd direction.
+        # sample_d only has reads in the rev direction.
+        # If `cut` happens during the first and second demux pass, the
+        # samples a, b and d will be too short and will not demux.
+        forward_barcodes = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'], name='ForwardBarcode',
+                      index=pd.Index(['sample_a', 'sample_b', 'sample_c',
+                                      'sample_d'], name='id')))
+        mixed_orientation_sequences_f_fp = self.get_data_path(
+            'mixed-orientation/forward.fastq.gz')
+        mixed_orientation_sequences_r_fp = self.get_data_path(
+            'mixed-orientation/reverse.fastq.gz')
+        with tempfile.TemporaryDirectory() as temp:
+            shutil.copy(mixed_orientation_sequences_f_fp, temp)
+            shutil.copy(mixed_orientation_sequences_r_fp, temp)
+            mixed_orientation_sequences = Artifact.import_data(
+                'MultiplexedPairedEndBarcodeInSequence', temp)
+
+        with redirected_stdio(stderr=os.devnull):
+            obs_demuxed_art, obs_untrimmed_art = \
+                self.demux_paired_fn(mixed_orientation_sequences,
+                                     forward_barcodes=forward_barcodes,
+                                     forward_cut=1,
+                                     reverse_cut=1,
+                                     mixed_orientation=True)
+        exp = [
+            # sample_a fwd
+            '@id1\nCGTACGT\n+\nyyyyyyy\n'
+            '@id3\nCGTACGT\n+\nyyyyyyy\n',
+            # sample_a rev
+            '@id1\nGCATGCATGCA\n+\nzzzzzzzzzzz\n'
+            '@id3\nGCATGCATGCA\n+\nzzzzzzzzzzz\n',
+            # sample_b fwd
+            '@id4\nACGTACGT\n+\nyyyyyyyy\n'
+            '@id2\nACGTACGT\n+\nyyyyyyyy\n',
+            # sample_b rev
+            '@id4\nGCATGCATGCA\n+\nzzzzzzzzzzz\n'
+            '@id2\nGCATGCATGCA\n+\nzzzzzzzzzzz\n',
+            # sample_c fwd
+            '@id5\nACGTACGT\n+\nyyyyyyyy\n',
+            # sample_c rev
+            '@id5\nGCATGCATGCA\n+\nzzzzzzzzzzz\n',
+            # sample_d fwd
+            '@id6\nACGTACGT\n+\nyyyyyyyy\n',
+            # sample_d rev
+            '@id6\nGCATGCATGCA\n+\nzzzzzzzzzzz\n', ]
+
+        # We want to be sure that the validation is 100%, not just `min`.
+        obs_demuxed_art.validate(level='max')
+        # checkpoint assertion for the above `validate` - nothing should fail
+        self.assertTrue(True)
+
+        self.assert_demux_results(forward_barcodes.to_series(), exp,
+                                  obs_demuxed_art)
+
+        # Everything should match, so untrimmed should be empty
+        self.assert_untrimmed_results(['', ''], obs_untrimmed_art)
+
+    def test_mixed_orientation_anchored(self):
+        # sample_a and sample_b have reads in both fwd and rev directions.
+        # sample_c only has reads in the fwd direction.
+        # sample_d only has reads in the rev direction.
+        forward_barcodes = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCA', 'GGGG', 'TTTA'], name='ForwardBarcode',
+                      index=pd.Index(['sample_a', 'sample_b', 'sample_c',
+                                      'sample_d'], name='id')))
+        mixed_orientation_sequences_f_fp = self.get_data_path(
+            'mixed-orientation/forward.fastq.gz')
+        mixed_orientation_sequences_r_fp = self.get_data_path(
+            'mixed-orientation/reverse.fastq.gz')
+        with tempfile.TemporaryDirectory() as temp:
+            shutil.copy(mixed_orientation_sequences_f_fp, temp)
+            shutil.copy(mixed_orientation_sequences_r_fp, temp)
+            mixed_orientation_sequences = Artifact.import_data(
+                'MultiplexedPairedEndBarcodeInSequence', temp)
+
+        with redirected_stdio(stderr=os.devnull):
+            obs_demuxed_art, obs_untrimmed_art = \
+                self.demux_paired_fn(mixed_orientation_sequences,
+                                     forward_barcodes=forward_barcodes,
+                                     anchor_forward_barcode=True,
+                                     anchor_reverse_barcode=True,
+                                     mixed_orientation=True)
+        exp = [
+            # sample_a fwd
+            '@id1\nACGTACGT\n+\nyyyyyyyy\n'
+            '@id3\nACGTACGT\n+\nyyyyyyyy\n',
+            # sample_a rev
+            '@id1\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
+            '@id3\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
+            # sample_b fwd is empty: anchored barcode 'CCCA' only matches
+            #  after the first 'C' of 'CCCCACGTACGT', not at the read start
+            '',
+            # sample_b rev is empty for the same reason
+            '',
+            # sample_c fwd
+            '@id5\nACGTACGT\n+\nyyyyyyyy\n',
+            # sample_c rev
+            '@id5\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
+            # sample_d fwd is empty cf. sample_b
+            '',
+            # sample_d rev is empty cf. sample_b
+            '',
+        ]
+        exp_untrimmed = [
+            '@id2\nCCCCACGTACGT\n+\nyyyyyyyyyyyy\n'
+            '@id4\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
+            '@id6\nTTTTACGTACGT\n+\nyyyyyyyyyyyy\n',
+            '@id2\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
+            '@id4\nCCCCACGTACGT\n+\nyyyyyyyyyyyy\n'
+            '@id6\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
+        ]
+
+        # We want to be sure that the validation is 100%, not just `min`.
+        obs_demuxed_art.validate(level='max')
+        # checkpoint assertion for the above `validate` - nothing should fail
+        self.assertTrue(True)
+
+        self.assert_demux_results(forward_barcodes.to_series(), exp,
+                                  obs_demuxed_art)
+
+        self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
     def test_dual_index_mismatched_barcodes(self):
         forward_barcodes = CategoricalMetadataColumn(
             pd.Series(['AAAA', 'CCCC', 'ACGT'], name='ForwardBarcode',
@@ -587,6 +975,18 @@ class TestDemuxPaired(TestPluginBase):
                                  reverse_barcodes=reverse_barcodes,
                                  mixed_orientation=True)
 
+    def test_mixed_different_cuts(self):
+        forward_barcodes = CategoricalMetadataColumn(
+            pd.Series(['AAAA', 'CCCC'], name='ForwardBarcode',
+                      index=pd.Index(['sample_a', 'sample_b'], name='id')))
+
+        with self.assertRaises(ValueError):
+            self.demux_paired_fn(self.muxed_sequences,
+                                 forward_barcodes=forward_barcodes,
+                                 forward_cut=4,
+                                 reverse_cut=2,
+                                 mixed_orientation=True)
+
 
 class TestDemuxUtilsSingleEnd(TestPluginBase):
     package = 'q2_cutadapt.tests'
@@ -613,13 +1013,26 @@ class TestDemuxUtilsSingleEnd(TestPluginBase):
                                        0.1,
                                        2)
             self.assertTrue(barcode_fasta.name in obs[2])
+            self.assertTrue('^file' not in obs[2])  # not anchored
             self.assertTrue('0.1' in obs[4])
             self.assertTrue('2' in obs[6])
             self.assertTrue(str(self.per_sample_dir_fmt) in obs[8])
             self.assertTrue(str(self.untrimmed_dir_fmt) in obs[10])
             self.assertEqual(str(self.seqs_dir_fmt.file.view(FastqGzFormat)),
                              obs[11])
-            self.assertTrue('1' in obs[13])
+            self.assertTrue('0' in obs[13])  # fwd cut
+            self.assertTrue('1' in obs[15])  # cores
+
+        # Check that '^' is added before 'file' when adapters are anchored
+        with tempfile.NamedTemporaryFile() as barcode_fasta:
+            obs = _build_demux_command(self.seqs_dir_fmt,
+                                       {'fwd': barcode_fasta, 'rev': None},
+                                       self.per_sample_dir_fmt,
+                                       self.untrimmed_dir_fmt,
+                                       0.1,
+                                       2,
+                                       anchor_forward=True)
+            self.assertTrue('^file' in obs[2])
 
     def test_rename_files_single(self):
         for fn in ['sample_a.1.fastq.gz', 'sample_b.1.fastq.gz']:
@@ -701,7 +1114,7 @@ class TestDemuxUtilsPairedEnd(TestPluginBase):
                                        self.untrimmed_dir_fmt,
                                        0.1,
                                        2)
-            self.assertTrue(barcode_fasta.name in obs[2])
+        self.assertTrue(barcode_fasta.name in obs[2])
         self.assertTrue('0.1' in obs[4])
         self.assertTrue('2' in obs[6])
         self.assertTrue(str(self.per_sample_dir_fmt) in obs[8])  # fwd
@@ -712,6 +1125,9 @@ class TestDemuxUtilsPairedEnd(TestPluginBase):
         self.assertEqual(exp_f, obs[15])
         exp_r = str(self.seqs_dir_fmt.reverse_sequences.view(FastqGzFormat))
         self.assertEqual(exp_r, obs[16])
+        self.assertEqual('0', obs[18])  # rev cut
+        self.assertEqual('0', obs[20])  # fwd cut
+        self.assertEqual('1', obs[22])  # cores
 
     def test_build_dual_index_demux_command(self):
         with tempfile.NamedTemporaryFile() as barcode_fasta_f:



View it on GitLab: https://salsa.debian.org/med-team/q2-cutadapt/-/commit/5605a03f2ad34bebc91ad1f2b276bc5af7fe36dc

-- 
View it on GitLab: https://salsa.debian.org/med-team/q2-cutadapt/-/commit/5605a03f2ad34bebc91ad1f2b276bc5af7fe36dc
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240218/a83f345f/attachment-0001.htm>


More information about the debian-med-commit mailing list