[med-svn] [Git][med-team/q2-demux][upstream] New upstream version 2023.9.1+dfsg

Andreas Tille (@tille) gitlab at salsa.debian.org
Tue Jan 30 15:04:55 GMT 2024



Andreas Tille pushed to branch upstream at Debian Med / q2-demux


Commits:
0e6a4bcb by Andreas Tille at 2024-01-30T15:37:18+01:00
New upstream version 2023.9.1+dfsg
- - - - -


18 changed files:

- q2_demux/__init__.py
- q2_demux/_demux.py
- q2_demux/_summarize/_visualizer.py
- + q2_demux/_tabulate.py
- q2_demux/_version.py
- q2_demux/plugin_setup.py
- + q2_demux/tests/data/tabulate_read_counts_paired_end_1/sample1_1_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_paired_end_1/sample1_1_L001_R2_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_1/sample1_1_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_1/sample2_2_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_1/sample3_3_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_1/sample4_4_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_1/sample5_5_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_2/sample6_1_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_2/sample7_2_L001_R1_001.fastq.gz
- q2_demux/tests/test_demux.py
- + q2_demux/tests/test_tabulate.py
- setup.py


Changes:

=====================================
q2_demux/__init__.py
=====================================
@@ -6,15 +6,18 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-from ._demux import emp_single, emp_paired
+from ._demux import (emp_single, emp_paired, partition_samples_single,
+                     partition_samples_paired)
 from ._subsample import subsample_single, subsample_paired
 from ._summarize import summarize
 from ._filter import filter_samples
 from ._version import get_versions
+from ._tabulate import tabulate_read_counts
 
 
 __version__ = get_versions()['version']
 del get_versions
 
-__all__ = ['emp_single', 'emp_paired', 'summarize',
-           'subsample_single', 'subsample_paired', 'filter_samples']
+__all__ = ['emp_single', 'emp_paired', 'partition_samples_single',
+           'partition_samples_paired', 'summarize', 'subsample_single',
+           'subsample_paired', 'filter_samples', 'tabulate_read_counts']


=====================================
q2_demux/_demux.py
=====================================
@@ -14,9 +14,13 @@ import collections.abc
 import random
 import resource
 import re
+import os
+import warnings
 
 import skbio
 import psutil
+import numpy as np
+import pandas as pd
 
 import qiime2
 from q2_types.per_sample_sequences import (
@@ -25,6 +29,7 @@ from q2_types.per_sample_sequences import (
     FastqManifestFormat, YamlFormat)
 from ._ecc import GolayDecoder
 from ._format import ErrorCorrectionDetailsFmt
+from qiime2.util import duplicate
 
 
 FastqHeader = collections.namedtuple('FastqHeader', ['id', 'description'])
@@ -250,7 +255,7 @@ class BarcodePairedSequenceFastqIterator(collections.abc.Iterable):
 def _make_barcode_map(barcodes, rev_comp_mapping_barcodes):
     barcode_map = {}
     barcode_len = None
-    for sample_id, barcode in barcodes.to_series().iteritems():
+    for sample_id, barcode in barcodes.to_series().items():
         if barcode_len is None:
             barcode_len = len(barcode)
         elif len(barcode) != barcode_len:
@@ -504,3 +509,99 @@ def emp_paired(seqs: BarcodePairedSequenceFastqIterator,
     _write_metadata_yaml(result)
 
     return result, ec_details_fmt
+
+
+def partition_samples_single(demux: SingleLanePerSampleSingleEndFastqDirFmt,
+                             num_partitions: int = None
+                             ) -> SingleLanePerSampleSingleEndFastqDirFmt:
+    return _partition_helper(demux, num_partitions, paired=False)
+
+
+def partition_samples_paired(demux: SingleLanePerSamplePairedEndFastqDirFmt,
+                             num_partitions: int = None
+                             ) -> SingleLanePerSamplePairedEndFastqDirFmt:
+    return _partition_helper(demux, num_partitions, paired=True)
+
+
+def _partition_helper(demux, num_partitions, paired):
+    """ Deal with partitioning logic that is largely the same regardless of
+        single or paired.
+    """
+    # Adjust based on if we are in the single or paired end case
+    result_class = type(demux)
+
+    partitioned_demux = {}
+    df = demux.manifest.view(pd.DataFrame)
+
+    # Make sure we are partitioning on samples if no number of partitions or
+    # too many partitions specified and warn if they specified too many
+    # partitions
+    num_samples = df.shape[0]
+    if num_partitions is None:
+        num_partitions = num_samples
+    elif num_partitions > num_samples:
+        warnings.warn("You have requested a number of partitions"
+                      f" '{num_partitions}' that is greater than your number"
+                      f" of samples '{num_samples}.' Your data will be"
+                      f" partitioned by sample into '{num_samples}'"
+                      " partitions.")
+        num_partitions = num_samples
+
+    partitioned_df = np.array_split(df, num_partitions)
+    for i, _df in enumerate(partitioned_df, 1):
+        result = result_class()
+
+        manifest_string = ''
+        for sample in _df.iterrows():
+            sample_id = sample[0]
+
+            manifest_string += _partition_duplicate(
+                    sample, sample_id, result, 'forward')
+            if paired:
+                manifest_string += _partition_duplicate(
+                    sample, sample_id, result, 'reverse')
+
+        manifest = _partition_write_manifest(manifest_string, paired)
+        result.manifest.write_data(manifest, FastqManifestFormat)
+        _write_metadata_yaml(result)
+
+        # If we have one sample per partition we name the partitions after the
+        # samples. Otherwise we number them
+        if num_partitions == num_samples:
+            partitioned_demux[sample_id] = result
+        else:
+            partitioned_demux[i] = result
+
+    return partitioned_demux
+
+
+def _partition_duplicate(sample, sample_id, result, direction):
+    """ Duplicate the given direction of the sample into the result and return
+        the corresponding line for the manifest.
+    """
+    in_path = sample[1][direction]
+
+    artifact_name = os.path.basename(in_path)
+    out_path = os.path.join(result.path, artifact_name)
+    duplicate(in_path, out_path)
+
+    return '%s,%s,%s\n' % (sample_id, artifact_name, direction)
+
+
+def _partition_write_manifest(manifest_string, paired):
+    """ Add header to manifest then write to file.
+    """
+    manifest = FastqManifestFormat()
+
+    header_string = 'sample-id,filename,direction\n'
+    if not paired:
+        header_string += \
+            ('# direction is not meaningful in this file as these\n'
+             '# data may be derived from forward, reverse, or \n'
+             '# joined reads\n')
+    manifest_string = header_string + manifest_string
+
+    with manifest.open() as manifest_fh:
+        manifest_fh.write(manifest_string)
+
+    return manifest


=====================================
q2_demux/_summarize/_visualizer.py
=====================================
@@ -133,7 +133,7 @@ def summarize(output_dir: str, data: _PlotQualView, n: int = 10000) -> None:
 
             # If we have an empty direction for a sample that will be a nan in
             # the manifest. Skip that nan
-            if type(filename) != str:
+            if type(filename) is not str:
                 if filename is None or np.isnan(filename):
                     continue
 
@@ -188,7 +188,7 @@ def summarize(output_dir: str, data: _PlotQualView, n: int = 10000) -> None:
                             result.max(), sequence_count[direction]]],
                           index=['%s reads' % (direction,)],
                           columns=summary_columns)
-        context['result_data'] = context['result_data'].append(df)
+        context['result_data'] = pd.concat([context['result_data'], df])
 
         html_df = result.to_frame()
         context['result'] = context['result'].join(html_df, how='outer')


=====================================
q2_demux/_tabulate.py
=====================================
@@ -0,0 +1,45 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import os
+
+import pandas as pd
+
+import qiime2
+from q2_types.per_sample_sequences import (
+    SingleLanePerSampleSingleEndFastqDirFmt)
+
+from q2_demux._demux import _read_fastq_seqs
+
+
+def tabulate_read_counts(sequences: SingleLanePerSampleSingleEndFastqDirFmt
+                         ) -> qiime2.Metadata:
+    result = {}
+
+    for e in sequences:
+        manifest = e.manifest.view(pd.DataFrame)
+        for record in manifest.itertuples():
+            sample_id = record[0]
+            fwd_path = record[1]
+            read_count = 0
+            if sample_id in result:
+                raise KeyError("At least one duplicated sample id was "
+                               f"detected ({sample_id}). "
+                               "Sample ids must be unique across inputs.")
+            fwd_name = os.path.basename(fwd_path)
+            fwd_path = str(e.path / fwd_name)
+            for fwd_rec in _read_fastq_seqs(fwd_path):
+                read_count += 1
+            result[sample_id] = read_count
+
+    result = pd.Series(result)
+    result.name = 'Demultiplexed sequence count'
+    result = result.to_frame()
+    result.index.name = 'sample-id'
+
+    return qiime2.Metadata(result)


=====================================
q2_demux/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
     # setup.py/versioneer.py will grep for the variable names, so they must
     # each be defined on a line of their own. _version.py will just call
     # get_keywords().
-    git_refnames = " (tag: 2023.7.0, Release-2023.7)"
-    git_full = "7800f5c0dc2570a32283676962f6ce7456fe1c3f"
-    git_date = "2023-08-17 18:39:45 +0000"
+    git_refnames = " (tag: 2023.9.1, Release-2023.9)"
+    git_full = "09097f551d09f026b295962441e90b80c0e5ce81"
+    git_date = "2023-10-24 18:43:25 +0000"
     keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
     return keywords
 


=====================================
q2_demux/plugin_setup.py
=====================================
@@ -10,10 +10,11 @@ import importlib
 
 from qiime2.plugin import (
     Plugin, Metadata, MetadataColumn, Categorical, Bool, Str, Int, Float,
-    Range, Citations, TypeMatch
+    List, Collection, Range, Citations, TypeMatch
 )
 
 from q2_types.sample_data import SampleData
+from q2_types.metadata import ImmutableMetadata
 from q2_types.per_sample_sequences import (
     SequencesWithQuality, PairedEndSequencesWithQuality,
     JoinedSequencesWithQuality)
@@ -160,6 +161,55 @@ plugin.methods.register_function(
         citations['hamady2009']]
 )
 
+demux_description = 'The demultiplexed sequences to partition.'
+num_partitions_description = 'The number of partitions to split the' \
+                             ' demultiplexed sequences into. Defaults to' \
+                             ' partitioning into individual samples.'
+partitioned_demux_description = 'The partitioned demultiplexed sequences.'
+
+plugin.methods.register_function(
+    function=q2_demux.partition_samples_single,
+    inputs={'demux': SampleData[SequencesWithQuality]},
+    parameters={'num_partitions': Int % Range(1, None)},
+    outputs=[
+        ('partitioned_demux', Collection[SampleData[SequencesWithQuality]]),
+    ],
+    input_descriptions={
+        'demux': demux_description
+    },
+    parameter_descriptions={
+        'num_partitions': num_partitions_description
+    },
+    output_descriptions={
+        'partitioned_demux': partitioned_demux_description
+    },
+    name='Split demultiplexed sequence data into partitions.',
+    description=('Partition demultiplexed single end sequences into '
+                 'individual samples or the number of partitions specified.'),
+)
+
+plugin.methods.register_function(
+    function=q2_demux.partition_samples_paired,
+    inputs={'demux': SampleData[PairedEndSequencesWithQuality]},
+    parameters={'num_partitions': Int % Range(1, None)},
+    outputs=[
+        ('partitioned_demux',
+         Collection[SampleData[PairedEndSequencesWithQuality]]),
+    ],
+    input_descriptions={
+        'demux': demux_description
+    },
+    parameter_descriptions={
+        'num_partitions': num_partitions_description
+    },
+    output_descriptions={
+        'partitioned_demux': partitioned_demux_description
+    },
+    name='Split demultiplexed sequence data into partitions.',
+    description=('Partition demultiplexed paired end sequences into '
+                 'individual samples or the number of partitions specified.'),
+)
+
 plugin.visualizers.register_function(
     function=q2_demux.summarize,
     inputs={'data':
@@ -185,6 +235,25 @@ plugin.visualizers.register_function(
     examples={'demux': ex.summarize}
 )
 
+plugin.methods.register_function(
+    function=q2_demux.tabulate_read_counts,
+    inputs={'sequences':
+            List[SampleData[SequencesWithQuality |
+                            PairedEndSequencesWithQuality |
+                            JoinedSequencesWithQuality]]},
+    parameters={},
+    outputs=[
+        ('counts', ImmutableMetadata)
+    ],
+    input_descriptions={
+        'sequences': 'One or more collections of demultiplexed sequences.'
+    },
+    parameter_descriptions={},
+    name='Tabulate counts per sample',
+    description=('Generate a per-sample count of sequence reads.'),
+    examples={}
+)
+
 plugin.methods.register_function(
     function=q2_demux.subsample_single,
     inputs={'sequences': SampleData[SequencesWithQuality |


=====================================
q2_demux/tests/data/tabulate_read_counts_paired_end_1/sample1_1_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_paired_end_1/sample1_1_L001_R1_001.fastq.gz differ


=====================================
q2_demux/tests/data/tabulate_read_counts_paired_end_1/sample1_1_L001_R2_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_paired_end_1/sample1_1_L001_R2_001.fastq.gz differ


=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_1/sample1_1_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_1/sample1_1_L001_R1_001.fastq.gz differ


=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_1/sample2_2_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_1/sample2_2_L001_R1_001.fastq.gz differ


=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_1/sample3_3_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_1/sample3_3_L001_R1_001.fastq.gz differ


=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_1/sample4_4_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_1/sample4_4_L001_R1_001.fastq.gz differ


=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_1/sample5_5_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_1/sample5_5_L001_R1_001.fastq.gz differ


=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_2/sample6_1_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_2/sample6_1_L001_R1_001.fastq.gz differ


=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_2/sample7_2_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_2/sample7_2_L001_R1_001.fastq.gz differ


=====================================
q2_demux/tests/test_demux.py
=====================================
@@ -22,7 +22,8 @@ import numpy.testing as npt
 from qiime2.plugin.testing import TestPluginBase
 from q2_demux._demux import (BarcodeSequenceFastqIterator,
                              BarcodePairedSequenceFastqIterator)
-from q2_demux import emp_single, emp_paired, summarize
+from q2_demux import (emp_single, emp_paired, partition_samples_single,
+                      partition_samples_paired, summarize)
 from q2_types.per_sample_sequences import (
     FastqGzFormat, FastqManifestFormat,
     SingleLanePerSampleSingleEndFastqDirFmt,
@@ -598,6 +599,105 @@ class EmpSingleTests(unittest.TestCase, EmpTestingUtils):
                 golay_error_correction=False,
             )
 
+    def test_partition(self):
+        demux, _ = emp_single(self.bsi, self.barcode_map,
+                              golay_error_correction=False)
+
+        partition = partition_samples_single(demux)
+
+        exp_samples = ('sample1_1_L001_R1_001.fastq.gz',
+                       'sample2_3_L001_R1_001.fastq.gz',
+                       'sample3_2_L001_R1_001.fastq.gz',
+                       'sample4_5_L001_R1_001.fastq.gz',
+                       'sample5_4_L001_R1_001.fastq.gz')
+        exp_indices = ([0, 5], [2, 4], [1, 3], [7, 10], [6, 8, 9])
+
+        for idx, (id, sample) in enumerate(partition.items()):
+            self.assertEqual(id, f'sample{idx + 1}')
+
+            act_manifest = \
+                list(sample.manifest.view(FastqManifestFormat).open())
+            exp_manifest = ['sample-id,filename,direction\n',
+                            f'sample{idx + 1},{exp_samples[idx]},forward\n']
+            self._compare_manifests(act_manifest, exp_manifest)
+
+            output_fastq = list(sample.sequences.iter_views(FastqGzFormat))
+            self.assertEqual(len(output_fastq), 1)
+
+            self._validate_sample_fastq(
+                output_fastq[0][1].open(), self.sequences, exp_indices[idx])
+
+    def test_partition_num_specified(self):
+        demux, _ = emp_single(self.bsi, self.barcode_map,
+                              golay_error_correction=False)
+        partition = partition_samples_single(demux, 2)
+        exp_indices = ([0, 5], [2, 4], [1, 3], [7, 10], [6, 8, 9])
+
+        sample = partition[1]
+        act_manifest = list(sample.manifest.view(FastqManifestFormat).open())
+
+        exp_manifest = ['sample-id,filename,direction\n',
+                        'sample1,sample1_1_L001_R1_001.fastq.gz,forward\n',
+                        'sample2,sample2_3_L001_R1_001.fastq.gz,forward\n',
+                        'sample3,sample3_2_L001_R1_001.fastq.gz,forward\n']
+        self._compare_manifests(act_manifest, exp_manifest)
+
+        output_fastq = list(sample.sequences.iter_views(FastqGzFormat))
+        self.assertEqual(len(output_fastq), 3)
+
+        self._validate_sample_fastq(
+            output_fastq[0][1].open(), self.sequences, exp_indices[0])
+        self._validate_sample_fastq(
+            output_fastq[1][1].open(), self.sequences, exp_indices[1])
+        self._validate_sample_fastq(
+            output_fastq[2][1].open(), self.sequences, exp_indices[2])
+
+        sample = partition[2]
+        act_manifest = list(sample.manifest.view(FastqManifestFormat).open())
+
+        exp_manifest = ['sample-id,filename,direction\n',
+                        'sample4,sample4_5_L001_R1_001.fastq.gz,forward\n',
+                        'sample5,sample5_4_L001_R1_001.fastq.gz,forward\n']
+        self._compare_manifests(act_manifest, exp_manifest)
+
+        output_fastq = list(sample.sequences.iter_views(FastqGzFormat))
+        self.assertEqual(len(output_fastq), 2)
+
+        self._validate_sample_fastq(
+            output_fastq[0][1].open(), self.sequences, exp_indices[3])
+        self._validate_sample_fastq(
+            output_fastq[1][1].open(), self.sequences, exp_indices[4])
+
+    def test_partition_more_partitions_than_samples(self):
+        demux, _ = emp_single(self.bsi, self.barcode_map,
+                              golay_error_correction=False)
+
+        with self.assertWarnsRegex(
+                UserWarning, "You have requested a number of.*100.*5.*5"):
+            partition = partition_samples_single(demux, 100)
+
+        exp_samples = ('sample1_1_L001_R1_001.fastq.gz',
+                       'sample2_3_L001_R1_001.fastq.gz',
+                       'sample3_2_L001_R1_001.fastq.gz',
+                       'sample4_5_L001_R1_001.fastq.gz',
+                       'sample5_4_L001_R1_001.fastq.gz')
+        exp_indices = ([0, 5], [2, 4], [1, 3], [7, 10], [6, 8, 9])
+
+        for idx, (id, sample) in enumerate(partition.items()):
+            self.assertEqual(id, f'sample{idx + 1}')
+
+            act_manifest = \
+                list(sample.manifest.view(FastqManifestFormat).open())
+            exp_manifest = ['sample-id,filename,direction\n',
+                            f'sample{idx + 1},{exp_samples[idx]},forward\n']
+            self._compare_manifests(act_manifest, exp_manifest)
+
+            output_fastq = list(sample.sequences.iter_views(FastqGzFormat))
+            self.assertEqual(len(output_fastq), 1)
+
+            self._validate_sample_fastq(
+                output_fastq[0][1].open(), self.sequences, exp_indices[idx])
+
 
 class EmpPairedTests(unittest.TestCase, EmpTestingUtils):
     def setUp(self):
@@ -947,6 +1047,176 @@ class EmpPairedTests(unittest.TestCase, EmpTestingUtils):
         self.check_valid(bpsi, self.barcode_map, golay_error_correction=False,
                          rev_comp_barcodes=True)
 
+    def test_partition(self):
+        demux, _ = emp_paired(self.bpsi, self.barcode_map,
+                              golay_error_correction=False)
+
+        partition = partition_samples_paired(demux)
+
+        exp_samples_fwd = ('sample1_1_L001_R1_001.fastq.gz',
+                           'sample2_3_L001_R1_001.fastq.gz',
+                           'sample3_2_L001_R1_001.fastq.gz',
+                           'sample4_5_L001_R1_001.fastq.gz',
+                           'sample5_4_L001_R1_001.fastq.gz')
+        exp_samples_rev = ('sample1_1_L001_R2_001.fastq.gz',
+                           'sample2_3_L001_R2_001.fastq.gz',
+                           'sample3_2_L001_R2_001.fastq.gz',
+                           'sample4_5_L001_R2_001.fastq.gz',
+                           'sample5_4_L001_R2_001.fastq.gz')
+        exp_indices = ([0, 5], [2, 4], [1, 3], [7, 10], [6, 8, 9])
+
+        for idx, (id, sample) in enumerate(partition.items()):
+            self.assertEqual(id, f'sample{idx + 1}')
+
+            act_manifest = \
+                list(sample.manifest.view(FastqManifestFormat).open())
+            exp_manifest = \
+                ['sample-id,filename,direction\n',
+                 f'sample{idx + 1},{exp_samples_fwd[idx]},forward\n',
+                 f'sample{idx + 1},{exp_samples_rev[idx]},reverse\n']
+            self._compare_manifests(act_manifest, exp_manifest)
+
+            forward_fastq = [
+                view for path, view in
+                sample.sequences.iter_views(FastqGzFormat)
+                if 'R1_001.fastq' in path.name]
+            self.assertEqual(len(forward_fastq), 1)
+
+            reverse_fastq = [
+                view for path, view in
+                sample.sequences.iter_views(FastqGzFormat)
+                if 'R2_001.fastq' in path.name]
+            self.assertEqual(len(reverse_fastq), 1)
+
+            self._validate_sample_fastq(
+                forward_fastq[0].open(), self.forward, exp_indices[idx])
+            self._validate_sample_fastq(
+                reverse_fastq[0].open(), self.reverse, exp_indices[idx])
+
+    def test_partition_num_specified(self):
+        demux, _ = emp_paired(self.bpsi, self.barcode_map,
+                              golay_error_correction=False)
+        partition = partition_samples_paired(demux, 2)
+        exp_indices = ([0, 5], [2, 4], [1, 3], [7, 10], [6, 8, 9])
+
+        sample = partition[1]
+        act_manifest = list(sample.manifest.view(FastqManifestFormat).open())
+
+        exp_manifest = ['sample-id,filename,direction\n',
+                        'sample1,sample1_1_L001_R1_001.fastq.gz,forward\n',
+                        'sample1,sample1_1_L001_R2_001.fastq.gz,reverse\n',
+                        'sample2,sample2_3_L001_R1_001.fastq.gz,forward\n',
+                        'sample2,sample2_3_L001_R2_001.fastq.gz,reverse\n',
+                        'sample3,sample3_2_L001_R1_001.fastq.gz,forward\n',
+                        'sample3,sample3_2_L001_R2_001.fastq.gz,reverse\n']
+        self._compare_manifests(act_manifest, exp_manifest)
+
+        forward_fastq = [
+            view for path, view in
+            sample.sequences.iter_views(FastqGzFormat)
+            if 'R1_001.fastq' in path.name]
+        self.assertEqual(len(forward_fastq), 3)
+
+        self._validate_sample_fastq(
+            forward_fastq[0].open(), self.forward, exp_indices[0])
+        self._validate_sample_fastq(
+            forward_fastq[1].open(), self.forward, exp_indices[1])
+        self._validate_sample_fastq(
+            forward_fastq[2].open(), self.forward, exp_indices[2])
+
+        reverse_fastq = [
+            view for path, view in
+            sample.sequences.iter_views(FastqGzFormat)
+            if 'R2_001.fastq' in path.name]
+        self.assertEqual(len(reverse_fastq), 3)
+
+        self._validate_sample_fastq(
+            reverse_fastq[0].open(), self.reverse, exp_indices[0])
+        self._validate_sample_fastq(
+            reverse_fastq[1].open(), self.reverse, exp_indices[1])
+        self._validate_sample_fastq(
+            reverse_fastq[2].open(), self.reverse, exp_indices[2])
+
+        sample = partition[2]
+        act_manifest = list(sample.manifest.view(FastqManifestFormat).open())
+
+        exp_manifest = ['sample-id,filename,direction\n',
+                        'sample4,sample4_5_L001_R1_001.fastq.gz,forward\n',
+                        'sample4,sample4_5_L001_R2_001.fastq.gz,reverse\n',
+                        'sample5,sample5_4_L001_R1_001.fastq.gz,forward\n',
+                        'sample5,sample5_4_L001_R2_001.fastq.gz,reverse\n']
+        self._compare_manifests(act_manifest, exp_manifest)
+
+        forward_fastq = [
+            view for path, view in
+            sample.sequences.iter_views(FastqGzFormat)
+            if 'R1_001.fastq' in path.name]
+        self.assertEqual(len(forward_fastq), 2)
+
+        self._validate_sample_fastq(
+            forward_fastq[0].open(), self.forward, exp_indices[3])
+        self._validate_sample_fastq(
+            forward_fastq[1].open(), self.forward, exp_indices[4])
+
+        reverse_fastq = [
+            view for path, view in
+            sample.sequences.iter_views(FastqGzFormat)
+            if 'R2_001.fastq' in path.name]
+        self.assertEqual(len(reverse_fastq), 2)
+
+        self._validate_sample_fastq(
+            reverse_fastq[0].open(), self.reverse, exp_indices[3])
+        self._validate_sample_fastq(
+            reverse_fastq[1].open(), self.reverse, exp_indices[4])
+
+    def test_partition_more_partitions_than_samples(self):
+        demux, _ = emp_paired(self.bpsi, self.barcode_map,
+                              golay_error_correction=False)
+
+        with self.assertWarnsRegex(
+                UserWarning, "You have requested a number of.*100.*5.*5"):
+            partition = partition_samples_paired(demux, 100)
+
+        exp_samples_fwd = ('sample1_1_L001_R1_001.fastq.gz',
+                           'sample2_3_L001_R1_001.fastq.gz',
+                           'sample3_2_L001_R1_001.fastq.gz',
+                           'sample4_5_L001_R1_001.fastq.gz',
+                           'sample5_4_L001_R1_001.fastq.gz')
+        exp_samples_rev = ('sample1_1_L001_R2_001.fastq.gz',
+                           'sample2_3_L001_R2_001.fastq.gz',
+                           'sample3_2_L001_R2_001.fastq.gz',
+                           'sample4_5_L001_R2_001.fastq.gz',
+                           'sample5_4_L001_R2_001.fastq.gz')
+        exp_indices = ([0, 5], [2, 4], [1, 3], [7, 10], [6, 8, 9])
+
+        for idx, (id, sample) in enumerate(partition.items()):
+            self.assertEqual(id, f'sample{idx + 1}')
+
+            act_manifest = \
+                list(sample.manifest.view(FastqManifestFormat).open())
+            exp_manifest = \
+                ['sample-id,filename,direction\n',
+                 f'sample{idx + 1},{exp_samples_fwd[idx]},forward\n',
+                 f'sample{idx + 1},{exp_samples_rev[idx]},reverse\n']
+            self._compare_manifests(act_manifest, exp_manifest)
+
+            forward_fastq = [
+                view for path, view in
+                sample.sequences.iter_views(FastqGzFormat)
+                if 'R1_001.fastq' in path.name]
+            self.assertEqual(len(forward_fastq), 1)
+
+            reverse_fastq = [
+                view for path, view in
+                sample.sequences.iter_views(FastqGzFormat)
+                if 'R2_001.fastq' in path.name]
+            self.assertEqual(len(reverse_fastq), 1)
+
+            self._validate_sample_fastq(
+                forward_fastq[0].open(), self.forward, exp_indices[idx])
+            self._validate_sample_fastq(
+                reverse_fastq[0].open(), self.reverse, exp_indices[idx])
+
 
 class SummarizeTests(TestPluginBase):
     package = 'q2_demux.tests'


=====================================
q2_demux/tests/test_tabulate.py
=====================================
@@ -0,0 +1,109 @@
+import pandas as pd
+
+import qiime2
+from qiime2.plugin.testing import TestPluginBase
+from qiime2.plugin.util import transform
+from q2_types.per_sample_sequences import (
+    CasavaOneEightSingleLanePerSampleDirFmt,
+    SingleLanePerSampleSingleEndFastqDirFmt,
+    SingleLanePerSamplePairedEndFastqDirFmt)
+from q2_demux import tabulate_read_counts
+
+
+class TabulateTests(TestPluginBase):
+    package = 'q2_demux.tests'
+
+    def setUp(self):
+        super().setUp()
+
+        demuxed_se_1 = CasavaOneEightSingleLanePerSampleDirFmt(
+            self.get_data_path('tabulate_read_counts_single_end_1'), mode='r')
+        self.demux_se_data_1 = transform(
+            demuxed_se_1, to_type=SingleLanePerSampleSingleEndFastqDirFmt)
+
+        demuxed_se_2 = CasavaOneEightSingleLanePerSampleDirFmt(
+            self.get_data_path('tabulate_read_counts_single_end_2'), mode='r')
+        self.demux_se_data_2 = transform(
+            demuxed_se_2, to_type=SingleLanePerSampleSingleEndFastqDirFmt)
+
+        demuxed_pe_1 = CasavaOneEightSingleLanePerSampleDirFmt(
+            self.get_data_path('tabulate_read_counts_paired_end_1'), mode='r')
+        self.demux_pe_data_1 = transform(
+            demuxed_pe_1, to_type=SingleLanePerSamplePairedEndFastqDirFmt)
+
+    def test_tabulate_read_counts_se(self):
+        actual = tabulate_read_counts([self.demux_se_data_1])
+
+        expected = {'sample1': 2,
+                    'sample2': 2,
+                    'sample3': 2,
+                    'sample4': 2,
+                    'sample5': 3}
+        expected = pd.Series(expected)
+        expected.name = 'Demultiplexed sequence count'
+        expected = expected.to_frame()
+        expected.index.name = 'sample-id'
+        expected = qiime2.Metadata(expected)
+
+        self.assertEqual(actual, expected)
+
+        actual = tabulate_read_counts([self.demux_se_data_2])
+
+        expected = {'sample6': 2,
+                    'sample7': 2}
+        expected = pd.Series(expected)
+        expected.name = 'Demultiplexed sequence count'
+        expected = expected.to_frame()
+        expected.index.name = 'sample-id'
+        expected = qiime2.Metadata(expected)
+
+        self.assertEqual(actual, expected)
+
+    def test_tabulate_read_counts_pe(self):
+        actual = tabulate_read_counts([self.demux_pe_data_1])
+
+        expected = {'sample1': 2}
+        expected = pd.Series(expected)
+        expected.name = 'Demultiplexed sequence count'
+        expected = expected.to_frame()
+        expected.index.name = 'sample-id'
+        expected = qiime2.Metadata(expected)
+
+        self.assertEqual(actual, expected)
+
+    def test_tabulate_read_counts_multiple(self):
+        actual = tabulate_read_counts([self.demux_se_data_1,
+                                       self.demux_se_data_2])
+
+        expected = {'sample1': 2,
+                    'sample2': 2,
+                    'sample3': 2,
+                    'sample4': 2,
+                    'sample5': 3,
+                    'sample6': 2,
+                    'sample7': 2}
+        expected = pd.Series(expected)
+        expected.name = 'Demultiplexed sequence count'
+        expected = expected.to_frame()
+        expected.index.name = 'sample-id'
+        expected = qiime2.Metadata(expected)
+
+        self.assertEqual(actual, expected)
+
+        actual = tabulate_read_counts([self.demux_pe_data_1,
+                                       self.demux_se_data_2])
+
+        expected = {'sample1': 2,
+                    'sample6': 2,
+                    'sample7': 2}
+        expected = pd.Series(expected)
+        expected.name = 'Demultiplexed sequence count'
+        expected = expected.to_frame()
+        expected.index.name = 'sample-id'
+        expected = qiime2.Metadata(expected)
+
+        self.assertEqual(actual, expected)
+
+    def test_tabulate_read_counts_error(self):
+        with self.assertRaisesRegex(KeyError, 'duplicated.*sample1'):
+            tabulate_read_counts([self.demux_se_data_1, self.demux_pe_data_1])


=====================================
setup.py
=====================================
@@ -43,6 +43,9 @@ setup(
                            'data/summarize_empty/empty_reverse_in_paired/*',
                            'data/summarize_empty/empty_paired_end/*',
                            'data/reverse_only/*',
+                           'data/tabulate_read_counts_single_end_1/*',
+                           'data/tabulate_read_counts_single_end_2/*',
+                           'data/tabulate_read_counts_paired_end_1/*',
                            ],
         'q2_demux': ['_summarize/assets/*.html',
                      '_summarize/assets/dist/*',



View it on GitLab: https://salsa.debian.org/med-team/q2-demux/-/commit/0e6a4bcbb9450c9583af489f2944ed0ca3c843ee

-- 
View it on GitLab: https://salsa.debian.org/med-team/q2-demux/-/commit/0e6a4bcbb9450c9583af489f2944ed0ca3c843ee
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240130/a0938a1c/attachment-0001.htm>


More information about the debian-med-commit mailing list