[med-svn] [Git][med-team/q2-demux][upstream] New upstream version 2023.9.1+dfsg
Andreas Tille (@tille)
gitlab at salsa.debian.org
Tue Jan 30 15:04:55 GMT 2024
Andreas Tille pushed to branch upstream at Debian Med / q2-demux
Commits:
0e6a4bcb by Andreas Tille at 2024-01-30T15:37:18+01:00
New upstream version 2023.9.1+dfsg
- - - - -
18 changed files:
- q2_demux/__init__.py
- q2_demux/_demux.py
- q2_demux/_summarize/_visualizer.py
- + q2_demux/_tabulate.py
- q2_demux/_version.py
- q2_demux/plugin_setup.py
- + q2_demux/tests/data/tabulate_read_counts_paired_end_1/sample1_1_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_paired_end_1/sample1_1_L001_R2_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_1/sample1_1_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_1/sample2_2_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_1/sample3_3_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_1/sample4_4_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_1/sample5_5_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_2/sample6_1_L001_R1_001.fastq.gz
- + q2_demux/tests/data/tabulate_read_counts_single_end_2/sample7_2_L001_R1_001.fastq.gz
- q2_demux/tests/test_demux.py
- + q2_demux/tests/test_tabulate.py
- setup.py
Changes:
=====================================
q2_demux/__init__.py
=====================================
@@ -6,15 +6,18 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
-from ._demux import emp_single, emp_paired
+from ._demux import (emp_single, emp_paired, partition_samples_single,
+ partition_samples_paired)
from ._subsample import subsample_single, subsample_paired
from ._summarize import summarize
from ._filter import filter_samples
from ._version import get_versions
+from ._tabulate import tabulate_read_counts
__version__ = get_versions()['version']
del get_versions
-__all__ = ['emp_single', 'emp_paired', 'summarize',
- 'subsample_single', 'subsample_paired', 'filter_samples']
+__all__ = ['emp_single', 'emp_paired', 'partition_samples_single',
+ 'partition_samples_paired', 'summarize', 'subsample_single',
+ 'subsample_paired', 'filter_samples', 'tabulate_read_counts']
=====================================
q2_demux/_demux.py
=====================================
@@ -14,9 +14,13 @@ import collections.abc
import random
import resource
import re
+import os
+import warnings
import skbio
import psutil
+import numpy as np
+import pandas as pd
import qiime2
from q2_types.per_sample_sequences import (
@@ -25,6 +29,7 @@ from q2_types.per_sample_sequences import (
FastqManifestFormat, YamlFormat)
from ._ecc import GolayDecoder
from ._format import ErrorCorrectionDetailsFmt
+from qiime2.util import duplicate
FastqHeader = collections.namedtuple('FastqHeader', ['id', 'description'])
@@ -250,7 +255,7 @@ class BarcodePairedSequenceFastqIterator(collections.abc.Iterable):
def _make_barcode_map(barcodes, rev_comp_mapping_barcodes):
barcode_map = {}
barcode_len = None
- for sample_id, barcode in barcodes.to_series().iteritems():
+ for sample_id, barcode in barcodes.to_series().items():
if barcode_len is None:
barcode_len = len(barcode)
elif len(barcode) != barcode_len:
@@ -504,3 +509,99 @@ def emp_paired(seqs: BarcodePairedSequenceFastqIterator,
_write_metadata_yaml(result)
return result, ec_details_fmt
+
+
+def partition_samples_single(demux: SingleLanePerSampleSingleEndFastqDirFmt,
+                             num_partitions: int = None
+                             ) -> SingleLanePerSampleSingleEndFastqDirFmt:
+    """ Partition single-end demultiplexed sequences.
+
+    Forwards ``demux`` and ``num_partitions`` (``None`` included, unchanged)
+    to ``_partition_helper`` with ``paired=False`` and returns whatever that
+    helper returns.
+    """
+    partitions = _partition_helper(demux, num_partitions, paired=False)
+    return partitions
+
+
+def partition_samples_paired(demux: SingleLanePerSamplePairedEndFastqDirFmt,
+                             num_partitions: int = None
+                             ) -> SingleLanePerSamplePairedEndFastqDirFmt:
+    """ Partition paired-end demultiplexed sequences.
+
+    Forwards ``demux`` and ``num_partitions`` (``None`` included, unchanged)
+    to ``_partition_helper`` with ``paired=True`` and returns whatever that
+    helper returns.
+    """
+    partitions = _partition_helper(demux, num_partitions, paired=True)
+    return partitions
+
+
+def _partition_helper(demux, num_partitions, paired):
+    """ Deal with partitioning logic that is largely the same regardless of
+    single or paired.
+
+    Parameters
+    ----------
+    demux
+        Demultiplexed directory format; its ``manifest`` is viewed as a
+        ``pd.DataFrame`` with one row per sample.
+    num_partitions : int or None
+        Requested number of partitions. ``None`` means one partition per
+        sample. A value larger than the number of samples is reduced to the
+        number of samples and a warning is issued.
+    paired : bool
+        When true, the ``reverse`` reads of each sample are duplicated in
+        addition to the ``forward`` reads.
+
+    Returns
+    -------
+    dict
+        Maps a partition key to a new instance of ``type(demux)``. Keys are
+        sample ids when every partition holds exactly one sample, otherwise
+        1-based partition numbers.
+
+    Raises
+    ------
+    ValueError
+        If the manifest contains no samples or ``num_partitions`` is less
+        than 1.
+    """
+    # Adjust based on if we are in the single or paired end case
+    result_class = type(demux)
+
+    df = demux.manifest.view(pd.DataFrame)
+    num_samples = df.shape[0]
+
+    # np.array_split cannot produce zero sections; fail early with a message
+    # that names the actual problem instead of NumPy's generic one.
+    if num_samples == 0:
+        raise ValueError("There are no samples to partition.")
+
+    # Make sure we are partitioning on samples if no number of partitions or
+    # too many partitions specified and warn if they specified too many
+    # partitions
+    if num_partitions is None:
+        num_partitions = num_samples
+    elif num_partitions < 1:
+        raise ValueError("The number of partitions must be at least 1,"
+                         f" got '{num_partitions}'.")
+    elif num_partitions > num_samples:
+        warnings.warn("You have requested a number of partitions"
+                      f" '{num_partitions}' that is greater than your number"
+                      f" of samples '{num_samples}.' Your data will be"
+                      f" partitioned by sample into '{num_samples}'"
+                      " partitions.")
+        num_partitions = num_samples
+
+    # Split row positions rather than the frame itself: np.array_split on a
+    # DataFrame goes through DataFrame.swapaxes, which newer pandas
+    # deprecates. The chunk sizes are the same either way.
+    row_chunks = np.array_split(np.arange(num_samples), num_partitions)
+
+    # If we have one sample per partition we name the partitions after the
+    # samples. Otherwise we number them
+    name_by_sample = num_partitions == num_samples
+
+    partitioned_demux = {}
+    for i, rows in enumerate(row_chunks, 1):
+        _df = df.iloc[rows]
+        result = result_class()
+
+        manifest_lines = []
+        for sample in _df.iterrows():
+            sample_id = sample[0]
+
+            manifest_lines.append(_partition_duplicate(
+                sample, sample_id, result, 'forward'))
+            if paired:
+                manifest_lines.append(_partition_duplicate(
+                    sample, sample_id, result, 'reverse'))
+
+        manifest = _partition_write_manifest(''.join(manifest_lines), paired)
+        result.manifest.write_data(manifest, FastqManifestFormat)
+        _write_metadata_yaml(result)
+
+        if name_by_sample:
+            # Exactly one row was iterated above, so sample_id is that row's.
+            partitioned_demux[sample_id] = result
+        else:
+            partitioned_demux[i] = result
+
+    return partitioned_demux
+
+
+def _partition_duplicate(sample, sample_id, result, direction):
+    """ Place one read direction of a sample into ``result`` and build the
+    matching manifest row.
+
+    ``sample`` is indexed as ``sample[1][direction]`` to obtain the source
+    path; the file is duplicated under ``result.path`` keeping its base name.
+    Returns the ``sample-id,filename,direction`` line, newline included.
+    """
+    row = sample[1]
+    source = row[direction]
+
+    filename = os.path.basename(source)
+    duplicate(source, os.path.join(result.path, filename))
+
+    fields = (sample_id, filename, direction)
+    return '%s,%s,%s\n' % fields
+
+
+def _partition_write_manifest(manifest_string, paired):
+    """ Write ``manifest_string`` to a new manifest, preceded by the header.
+
+    For unpaired data, three comment lines noting that the direction column
+    is not meaningful are inserted between the header and the rows. Returns
+    the ``FastqManifestFormat`` that was written.
+    """
+    pieces = ['sample-id,filename,direction\n']
+    if not paired:
+        pieces.append('# direction is not meaningful in this file as these\n')
+        pieces.append('# data may be derived from forward, reverse, or \n')
+        pieces.append('# joined reads\n')
+    pieces.append(manifest_string)
+
+    manifest = FastqManifestFormat()
+    with manifest.open() as manifest_fh:
+        manifest_fh.write(''.join(pieces))
+
+    return manifest
=====================================
q2_demux/_summarize/_visualizer.py
=====================================
@@ -133,7 +133,7 @@ def summarize(output_dir: str, data: _PlotQualView, n: int = 10000) -> None:
# If we have an empty direction for a sample that will be a nan in
# the manifest. Skip that nan
- if type(filename) != str:
+ if type(filename) is not str:
if filename is None or np.isnan(filename):
continue
@@ -188,7 +188,7 @@ def summarize(output_dir: str, data: _PlotQualView, n: int = 10000) -> None:
result.max(), sequence_count[direction]]],
index=['%s reads' % (direction,)],
columns=summary_columns)
- context['result_data'] = context['result_data'].append(df)
+ context['result_data'] = pd.concat([context['result_data'], df])
html_df = result.to_frame()
context['result'] = context['result'].join(html_df, how='outer')
=====================================
q2_demux/_tabulate.py
=====================================
@@ -0,0 +1,45 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import os
+
+import pandas as pd
+
+import qiime2
+from q2_types.per_sample_sequences import (
+ SingleLanePerSampleSingleEndFastqDirFmt)
+
+from q2_demux._demux import _read_fastq_seqs
+
+
+def tabulate_read_counts(sequences: SingleLanePerSampleSingleEndFastqDirFmt
+                         ) -> qiime2.Metadata:
+    """ Count the reads of every sample across the given inputs.
+
+    ``sequences`` is iterated; for each element the manifest is viewed as a
+    ``pd.DataFrame`` and only the first manifest column (``record[1]``) is
+    counted per sample. The result is a ``qiime2.Metadata`` with index
+    ``sample-id`` and a single ``Demultiplexed sequence count`` column.
+
+    Raises ``KeyError`` when a sample id occurs more than once across the
+    inputs.
+    """
+    counts = {}
+
+    for dir_fmt in sequences:
+        manifest = dir_fmt.manifest.view(pd.DataFrame)
+        for record in manifest.itertuples():
+            sample_id, fwd_path = record[0], record[1]
+            if sample_id in counts:
+                raise KeyError("At least one duplicated sample id was "
+                               f"detected ({sample_id}). "
+                               "Sample ids must be unique across inputs.")
+            # Resolve the manifest entry against this input's own directory.
+            fastq_path = str(dir_fmt.path / os.path.basename(fwd_path))
+            counts[sample_id] = sum(
+                1 for _ in _read_fastq_seqs(fastq_path))
+
+    table = pd.Series(counts, name='Demultiplexed sequence count').to_frame()
+    table.index.name = 'sample-id'
+
+    return qiime2.Metadata(table)
=====================================
q2_demux/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2023.7.0, Release-2023.7)"
- git_full = "7800f5c0dc2570a32283676962f6ce7456fe1c3f"
- git_date = "2023-08-17 18:39:45 +0000"
+ git_refnames = " (tag: 2023.9.1, Release-2023.9)"
+ git_full = "09097f551d09f026b295962441e90b80c0e5ce81"
+ git_date = "2023-10-24 18:43:25 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
q2_demux/plugin_setup.py
=====================================
@@ -10,10 +10,11 @@ import importlib
from qiime2.plugin import (
Plugin, Metadata, MetadataColumn, Categorical, Bool, Str, Int, Float,
- Range, Citations, TypeMatch
+ List, Collection, Range, Citations, TypeMatch
)
from q2_types.sample_data import SampleData
+from q2_types.metadata import ImmutableMetadata
from q2_types.per_sample_sequences import (
SequencesWithQuality, PairedEndSequencesWithQuality,
JoinedSequencesWithQuality)
@@ -160,6 +161,55 @@ plugin.methods.register_function(
citations['hamady2009']]
)
+# Description strings shared by the two partition registrations below.
+demux_description = 'The demultiplexed sequences to partition.'
+num_partitions_description = 'The number of partitions to split the' \
+ ' demultiplexed sequences into. Defaults to' \
+ ' partitioning into individual samples.'
+partitioned_demux_description = 'The partitioned demultiplexed sequences.'
+
+# Register the single-end partition method: one SampleData input, one
+# Collection output. num_partitions is constrained by Range(1, None)
+# (presumably: at least 1, no upper bound -- confirm against qiime2 Range).
+plugin.methods.register_function(
+ function=q2_demux.partition_samples_single,
+ inputs={'demux': SampleData[SequencesWithQuality]},
+ parameters={'num_partitions': Int % Range(1, None)},
+ outputs=[
+ ('partitioned_demux', Collection[SampleData[SequencesWithQuality]]),
+ ],
+ input_descriptions={
+ 'demux': demux_description
+ },
+ parameter_descriptions={
+ 'num_partitions': num_partitions_description
+ },
+ output_descriptions={
+ 'partitioned_demux': partitioned_demux_description
+ },
+ name='Split demultiplexed sequence data into partitions.',
+ description=('Partition demultiplexed single end sequences into '
+ 'individual samples or the number of partitions specified.'),
+)
+
+# Paired-end counterpart of the registration above; only the function, the
+# semantic types and the description text differ.
+plugin.methods.register_function(
+ function=q2_demux.partition_samples_paired,
+ inputs={'demux': SampleData[PairedEndSequencesWithQuality]},
+ parameters={'num_partitions': Int % Range(1, None)},
+ outputs=[
+ ('partitioned_demux',
+ Collection[SampleData[PairedEndSequencesWithQuality]]),
+ ],
+ input_descriptions={
+ 'demux': demux_description
+ },
+ parameter_descriptions={
+ 'num_partitions': num_partitions_description
+ },
+ output_descriptions={
+ 'partitioned_demux': partitioned_demux_description
+ },
+ name='Split demultiplexed sequence data into partitions.',
+ description=('Partition demultiplexed paired end sequences into '
+ 'individual samples or the number of partitions specified.'),
+)
+
plugin.visualizers.register_function(
function=q2_demux.summarize,
inputs={'data':
@@ -185,6 +235,25 @@ plugin.visualizers.register_function(
examples={'demux': ex.summarize}
)
+# Register tabulate_read_counts: takes a List of demultiplexed inputs
+# (single-end, paired-end or joined) and produces one ImmutableMetadata
+# output named 'counts'. No parameters and no usage examples are declared.
+plugin.methods.register_function(
+ function=q2_demux.tabulate_read_counts,
+ inputs={'sequences':
+ List[SampleData[SequencesWithQuality |
+ PairedEndSequencesWithQuality |
+ JoinedSequencesWithQuality]]},
+ parameters={},
+ outputs=[
+ ('counts', ImmutableMetadata)
+ ],
+ input_descriptions={
+ 'sequences': 'One or more collections of demultiplexed sequences.'
+ },
+ parameter_descriptions={},
+ name='Tabulate counts per sample',
+ description=('Generate a per-sample count of sequence reads.'),
+ examples={}
+)
+
plugin.methods.register_function(
function=q2_demux.subsample_single,
inputs={'sequences': SampleData[SequencesWithQuality |
=====================================
q2_demux/tests/data/tabulate_read_counts_paired_end_1/sample1_1_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_paired_end_1/sample1_1_L001_R1_001.fastq.gz differ
=====================================
q2_demux/tests/data/tabulate_read_counts_paired_end_1/sample1_1_L001_R2_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_paired_end_1/sample1_1_L001_R2_001.fastq.gz differ
=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_1/sample1_1_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_1/sample1_1_L001_R1_001.fastq.gz differ
=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_1/sample2_2_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_1/sample2_2_L001_R1_001.fastq.gz differ
=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_1/sample3_3_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_1/sample3_3_L001_R1_001.fastq.gz differ
=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_1/sample4_4_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_1/sample4_4_L001_R1_001.fastq.gz differ
=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_1/sample5_5_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_1/sample5_5_L001_R1_001.fastq.gz differ
=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_2/sample6_1_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_2/sample6_1_L001_R1_001.fastq.gz differ
=====================================
q2_demux/tests/data/tabulate_read_counts_single_end_2/sample7_2_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_demux/tests/data/tabulate_read_counts_single_end_2/sample7_2_L001_R1_001.fastq.gz differ
=====================================
q2_demux/tests/test_demux.py
=====================================
@@ -22,7 +22,8 @@ import numpy.testing as npt
from qiime2.plugin.testing import TestPluginBase
from q2_demux._demux import (BarcodeSequenceFastqIterator,
BarcodePairedSequenceFastqIterator)
-from q2_demux import emp_single, emp_paired, summarize
+from q2_demux import (emp_single, emp_paired, partition_samples_single,
+ partition_samples_paired, summarize)
from q2_types.per_sample_sequences import (
FastqGzFormat, FastqManifestFormat,
SingleLanePerSampleSingleEndFastqDirFmt,
@@ -598,6 +599,105 @@ class EmpSingleTests(unittest.TestCase, EmpTestingUtils):
golay_error_correction=False,
)
+ def test_partition(self):
+ # Without num_partitions, every sample is expected in its own partition,
+ # keyed by sample id and holding exactly one fastq file.
+ demux, _ = emp_single(self.bsi, self.barcode_map,
+ golay_error_correction=False)
+
+ partition = partition_samples_single(demux)
+
+ exp_samples = ('sample1_1_L001_R1_001.fastq.gz',
+ 'sample2_3_L001_R1_001.fastq.gz',
+ 'sample3_2_L001_R1_001.fastq.gz',
+ 'sample4_5_L001_R1_001.fastq.gz',
+ 'sample5_4_L001_R1_001.fastq.gz')
+ # Presumably positions in self.sequences belonging to each sample; they
+ # are handed to _validate_sample_fastq below.
+ exp_indices = ([0, 5], [2, 4], [1, 3], [7, 10], [6, 8, 9])
+
+ # Relies on the partitions being iterated in sample1..sample5 order.
+ for idx, (id, sample) in enumerate(partition.items()):
+ self.assertEqual(id, f'sample{idx + 1}')
+
+ act_manifest = \
+ list(sample.manifest.view(FastqManifestFormat).open())
+ exp_manifest = ['sample-id,filename,direction\n',
+ f'sample{idx + 1},{exp_samples[idx]},forward\n']
+ self._compare_manifests(act_manifest, exp_manifest)
+
+ output_fastq = list(sample.sequences.iter_views(FastqGzFormat))
+ self.assertEqual(len(output_fastq), 1)
+
+ self._validate_sample_fastq(
+ output_fastq[0][1].open(), self.sequences, exp_indices[idx])
+
+ def test_partition_num_specified(self):
+ # Five samples split into two partitions: the partitions are keyed 1 and
+ # 2 and are expected to hold three and two samples respectively.
+ demux, _ = emp_single(self.bsi, self.barcode_map,
+ golay_error_correction=False)
+ partition = partition_samples_single(demux, 2)
+ exp_indices = ([0, 5], [2, 4], [1, 3], [7, 10], [6, 8, 9])
+
+ # First partition: sample1..sample3.
+ sample = partition[1]
+ act_manifest = list(sample.manifest.view(FastqManifestFormat).open())
+
+ exp_manifest = ['sample-id,filename,direction\n',
+ 'sample1,sample1_1_L001_R1_001.fastq.gz,forward\n',
+ 'sample2,sample2_3_L001_R1_001.fastq.gz,forward\n',
+ 'sample3,sample3_2_L001_R1_001.fastq.gz,forward\n']
+ self._compare_manifests(act_manifest, exp_manifest)
+
+ output_fastq = list(sample.sequences.iter_views(FastqGzFormat))
+ self.assertEqual(len(output_fastq), 3)
+
+ self._validate_sample_fastq(
+ output_fastq[0][1].open(), self.sequences, exp_indices[0])
+ self._validate_sample_fastq(
+ output_fastq[1][1].open(), self.sequences, exp_indices[1])
+ self._validate_sample_fastq(
+ output_fastq[2][1].open(), self.sequences, exp_indices[2])
+
+ # Second partition: sample4 and sample5.
+ sample = partition[2]
+ act_manifest = list(sample.manifest.view(FastqManifestFormat).open())
+
+ exp_manifest = ['sample-id,filename,direction\n',
+ 'sample4,sample4_5_L001_R1_001.fastq.gz,forward\n',
+ 'sample5,sample5_4_L001_R1_001.fastq.gz,forward\n']
+ self._compare_manifests(act_manifest, exp_manifest)
+
+ output_fastq = list(sample.sequences.iter_views(FastqGzFormat))
+ self.assertEqual(len(output_fastq), 2)
+
+ self._validate_sample_fastq(
+ output_fastq[0][1].open(), self.sequences, exp_indices[3])
+ self._validate_sample_fastq(
+ output_fastq[1][1].open(), self.sequences, exp_indices[4])
+
+ def test_partition_more_partitions_than_samples(self):
+ # Asking for 100 partitions of 5 samples must warn and then behave like
+ # the per-sample case: one partition per sample, keyed by sample id.
+ demux, _ = emp_single(self.bsi, self.barcode_map,
+ golay_error_correction=False)
+
+ # The regex checks that the warning mentions 100 and then 5 twice.
+ with self.assertWarnsRegex(
+ UserWarning, "You have requested a number of.*100.*5.*5"):
+ partition = partition_samples_single(demux, 100)
+
+ exp_samples = ('sample1_1_L001_R1_001.fastq.gz',
+ 'sample2_3_L001_R1_001.fastq.gz',
+ 'sample3_2_L001_R1_001.fastq.gz',
+ 'sample4_5_L001_R1_001.fastq.gz',
+ 'sample5_4_L001_R1_001.fastq.gz')
+ exp_indices = ([0, 5], [2, 4], [1, 3], [7, 10], [6, 8, 9])
+
+ for idx, (id, sample) in enumerate(partition.items()):
+ self.assertEqual(id, f'sample{idx + 1}')
+
+ act_manifest = \
+ list(sample.manifest.view(FastqManifestFormat).open())
+ exp_manifest = ['sample-id,filename,direction\n',
+ f'sample{idx + 1},{exp_samples[idx]},forward\n']
+ self._compare_manifests(act_manifest, exp_manifest)
+
+ output_fastq = list(sample.sequences.iter_views(FastqGzFormat))
+ self.assertEqual(len(output_fastq), 1)
+
+ self._validate_sample_fastq(
+ output_fastq[0][1].open(), self.sequences, exp_indices[idx])
+
class EmpPairedTests(unittest.TestCase, EmpTestingUtils):
def setUp(self):
@@ -947,6 +1047,176 @@ class EmpPairedTests(unittest.TestCase, EmpTestingUtils):
self.check_valid(bpsi, self.barcode_map, golay_error_correction=False,
rev_comp_barcodes=True)
+ def test_partition(self):
+ # Paired-end, no num_partitions: every sample is expected in its own
+ # partition with one forward (R1) and one reverse (R2) file.
+ demux, _ = emp_paired(self.bpsi, self.barcode_map,
+ golay_error_correction=False)
+
+ partition = partition_samples_paired(demux)
+
+ exp_samples_fwd = ('sample1_1_L001_R1_001.fastq.gz',
+ 'sample2_3_L001_R1_001.fastq.gz',
+ 'sample3_2_L001_R1_001.fastq.gz',
+ 'sample4_5_L001_R1_001.fastq.gz',
+ 'sample5_4_L001_R1_001.fastq.gz')
+ exp_samples_rev = ('sample1_1_L001_R2_001.fastq.gz',
+ 'sample2_3_L001_R2_001.fastq.gz',
+ 'sample3_2_L001_R2_001.fastq.gz',
+ 'sample4_5_L001_R2_001.fastq.gz',
+ 'sample5_4_L001_R2_001.fastq.gz')
+ # The same index lists are used against self.forward and self.reverse.
+ exp_indices = ([0, 5], [2, 4], [1, 3], [7, 10], [6, 8, 9])
+
+ for idx, (id, sample) in enumerate(partition.items()):
+ self.assertEqual(id, f'sample{idx + 1}')
+
+ act_manifest = \
+ list(sample.manifest.view(FastqManifestFormat).open())
+ exp_manifest = \
+ ['sample-id,filename,direction\n',
+ f'sample{idx + 1},{exp_samples_fwd[idx]},forward\n',
+ f'sample{idx + 1},{exp_samples_rev[idx]},reverse\n']
+ self._compare_manifests(act_manifest, exp_manifest)
+
+ # Forward and reverse files are told apart by their file name.
+ forward_fastq = [
+ view for path, view in
+ sample.sequences.iter_views(FastqGzFormat)
+ if 'R1_001.fastq' in path.name]
+ self.assertEqual(len(forward_fastq), 1)
+
+ reverse_fastq = [
+ view for path, view in
+ sample.sequences.iter_views(FastqGzFormat)
+ if 'R2_001.fastq' in path.name]
+ self.assertEqual(len(reverse_fastq), 1)
+
+ self._validate_sample_fastq(
+ forward_fastq[0].open(), self.forward, exp_indices[idx])
+ self._validate_sample_fastq(
+ reverse_fastq[0].open(), self.reverse, exp_indices[idx])
+
+ def test_partition_num_specified(self):
+ # Paired-end, five samples split into two partitions keyed 1 and 2; they
+ # are expected to hold three and two samples, each with R1 and R2 files.
+ demux, _ = emp_paired(self.bpsi, self.barcode_map,
+ golay_error_correction=False)
+ partition = partition_samples_paired(demux, 2)
+ exp_indices = ([0, 5], [2, 4], [1, 3], [7, 10], [6, 8, 9])
+
+ # First partition: sample1..sample3.
+ sample = partition[1]
+ act_manifest = list(sample.manifest.view(FastqManifestFormat).open())
+
+ exp_manifest = ['sample-id,filename,direction\n',
+ 'sample1,sample1_1_L001_R1_001.fastq.gz,forward\n',
+ 'sample1,sample1_1_L001_R2_001.fastq.gz,reverse\n',
+ 'sample2,sample2_3_L001_R1_001.fastq.gz,forward\n',
+ 'sample2,sample2_3_L001_R2_001.fastq.gz,reverse\n',
+ 'sample3,sample3_2_L001_R1_001.fastq.gz,forward\n',
+ 'sample3,sample3_2_L001_R2_001.fastq.gz,reverse\n']
+ self._compare_manifests(act_manifest, exp_manifest)
+
+ forward_fastq = [
+ view for path, view in
+ sample.sequences.iter_views(FastqGzFormat)
+ if 'R1_001.fastq' in path.name]
+ self.assertEqual(len(forward_fastq), 3)
+
+ self._validate_sample_fastq(
+ forward_fastq[0].open(), self.forward, exp_indices[0])
+ self._validate_sample_fastq(
+ forward_fastq[1].open(), self.forward, exp_indices[1])
+ self._validate_sample_fastq(
+ forward_fastq[2].open(), self.forward, exp_indices[2])
+
+ reverse_fastq = [
+ view for path, view in
+ sample.sequences.iter_views(FastqGzFormat)
+ if 'R2_001.fastq' in path.name]
+ self.assertEqual(len(reverse_fastq), 3)
+
+ self._validate_sample_fastq(
+ reverse_fastq[0].open(), self.reverse, exp_indices[0])
+ self._validate_sample_fastq(
+ reverse_fastq[1].open(), self.reverse, exp_indices[1])
+ self._validate_sample_fastq(
+ reverse_fastq[2].open(), self.reverse, exp_indices[2])
+
+ # Second partition: sample4 and sample5.
+ sample = partition[2]
+ act_manifest = list(sample.manifest.view(FastqManifestFormat).open())
+
+ exp_manifest = ['sample-id,filename,direction\n',
+ 'sample4,sample4_5_L001_R1_001.fastq.gz,forward\n',
+ 'sample4,sample4_5_L001_R2_001.fastq.gz,reverse\n',
+ 'sample5,sample5_4_L001_R1_001.fastq.gz,forward\n',
+ 'sample5,sample5_4_L001_R2_001.fastq.gz,reverse\n']
+ self._compare_manifests(act_manifest, exp_manifest)
+
+ forward_fastq = [
+ view for path, view in
+ sample.sequences.iter_views(FastqGzFormat)
+ if 'R1_001.fastq' in path.name]
+ self.assertEqual(len(forward_fastq), 2)
+
+ self._validate_sample_fastq(
+ forward_fastq[0].open(), self.forward, exp_indices[3])
+ self._validate_sample_fastq(
+ forward_fastq[1].open(), self.forward, exp_indices[4])
+
+ reverse_fastq = [
+ view for path, view in
+ sample.sequences.iter_views(FastqGzFormat)
+ if 'R2_001.fastq' in path.name]
+ self.assertEqual(len(reverse_fastq), 2)
+
+ self._validate_sample_fastq(
+ reverse_fastq[0].open(), self.reverse, exp_indices[3])
+ self._validate_sample_fastq(
+ reverse_fastq[1].open(), self.reverse, exp_indices[4])
+
+ def test_partition_more_partitions_than_samples(self):
+ # Paired-end: asking for 100 partitions of 5 samples must warn and then
+ # yield one partition per sample, keyed by sample id.
+ demux, _ = emp_paired(self.bpsi, self.barcode_map,
+ golay_error_correction=False)
+
+ # The regex checks that the warning mentions 100 and then 5 twice.
+ with self.assertWarnsRegex(
+ UserWarning, "You have requested a number of.*100.*5.*5"):
+ partition = partition_samples_paired(demux, 100)
+
+ exp_samples_fwd = ('sample1_1_L001_R1_001.fastq.gz',
+ 'sample2_3_L001_R1_001.fastq.gz',
+ 'sample3_2_L001_R1_001.fastq.gz',
+ 'sample4_5_L001_R1_001.fastq.gz',
+ 'sample5_4_L001_R1_001.fastq.gz')
+ exp_samples_rev = ('sample1_1_L001_R2_001.fastq.gz',
+ 'sample2_3_L001_R2_001.fastq.gz',
+ 'sample3_2_L001_R2_001.fastq.gz',
+ 'sample4_5_L001_R2_001.fastq.gz',
+ 'sample5_4_L001_R2_001.fastq.gz')
+ exp_indices = ([0, 5], [2, 4], [1, 3], [7, 10], [6, 8, 9])
+
+ for idx, (id, sample) in enumerate(partition.items()):
+ self.assertEqual(id, f'sample{idx + 1}')
+
+ act_manifest = \
+ list(sample.manifest.view(FastqManifestFormat).open())
+ exp_manifest = \
+ ['sample-id,filename,direction\n',
+ f'sample{idx + 1},{exp_samples_fwd[idx]},forward\n',
+ f'sample{idx + 1},{exp_samples_rev[idx]},reverse\n']
+ self._compare_manifests(act_manifest, exp_manifest)
+
+ forward_fastq = [
+ view for path, view in
+ sample.sequences.iter_views(FastqGzFormat)
+ if 'R1_001.fastq' in path.name]
+ self.assertEqual(len(forward_fastq), 1)
+
+ reverse_fastq = [
+ view for path, view in
+ sample.sequences.iter_views(FastqGzFormat)
+ if 'R2_001.fastq' in path.name]
+ self.assertEqual(len(reverse_fastq), 1)
+
+ self._validate_sample_fastq(
+ forward_fastq[0].open(), self.forward, exp_indices[idx])
+ self._validate_sample_fastq(
+ reverse_fastq[0].open(), self.reverse, exp_indices[idx])
+
class SummarizeTests(TestPluginBase):
package = 'q2_demux.tests'
=====================================
q2_demux/tests/test_tabulate.py
=====================================
@@ -0,0 +1,109 @@
+import pandas as pd
+
+import qiime2
+from qiime2.plugin.testing import TestPluginBase
+from qiime2.plugin.util import transform
+from q2_types.per_sample_sequences import (
+ CasavaOneEightSingleLanePerSampleDirFmt,
+ SingleLanePerSampleSingleEndFastqDirFmt,
+ SingleLanePerSamplePairedEndFastqDirFmt)
+from q2_demux import tabulate_read_counts
+
+
+# Tests for tabulate_read_counts using three small Casava-style fixtures:
+# two single-end directories and one paired-end directory.
+class TabulateTests(TestPluginBase):
+ package = 'q2_demux.tests'
+
+ def setUp(self):
+ super().setUp()
+
+ # Each fixture directory is loaded read-only as Casava 1.8 and then
+ # transformed into the per-sample fastq directory format under test.
+ demuxed_se_1 = CasavaOneEightSingleLanePerSampleDirFmt(
+ self.get_data_path('tabulate_read_counts_single_end_1'), mode='r')
+ self.demux_se_data_1 = transform(
+ demuxed_se_1, to_type=SingleLanePerSampleSingleEndFastqDirFmt)
+
+ demuxed_se_2 = CasavaOneEightSingleLanePerSampleDirFmt(
+ self.get_data_path('tabulate_read_counts_single_end_2'), mode='r')
+ self.demux_se_data_2 = transform(
+ demuxed_se_2, to_type=SingleLanePerSampleSingleEndFastqDirFmt)
+
+ demuxed_pe_1 = CasavaOneEightSingleLanePerSampleDirFmt(
+ self.get_data_path('tabulate_read_counts_paired_end_1'), mode='r')
+ self.demux_pe_data_1 = transform(
+ demuxed_pe_1, to_type=SingleLanePerSamplePairedEndFastqDirFmt)
+
+ def test_tabulate_read_counts_se(self):
+ # Each single-end fixture is tabulated on its own.
+ actual = tabulate_read_counts([self.demux_se_data_1])
+
+ expected = {'sample1': 2,
+ 'sample2': 2,
+ 'sample3': 2,
+ 'sample4': 2,
+ 'sample5': 3}
+ expected = pd.Series(expected)
+ expected.name = 'Demultiplexed sequence count'
+ expected = expected.to_frame()
+ expected.index.name = 'sample-id'
+ expected = qiime2.Metadata(expected)
+
+ self.assertEqual(actual, expected)
+
+ actual = tabulate_read_counts([self.demux_se_data_2])
+
+ expected = {'sample6': 2,
+ 'sample7': 2}
+ expected = pd.Series(expected)
+ expected.name = 'Demultiplexed sequence count'
+ expected = expected.to_frame()
+ expected.index.name = 'sample-id'
+ expected = qiime2.Metadata(expected)
+
+ self.assertEqual(actual, expected)
+
+ def test_tabulate_read_counts_pe(self):
+ # The paired-end fixture is expected to yield a single count for sample1.
+ actual = tabulate_read_counts([self.demux_pe_data_1])
+
+ expected = {'sample1': 2}
+ expected = pd.Series(expected)
+ expected.name = 'Demultiplexed sequence count'
+ expected = expected.to_frame()
+ expected.index.name = 'sample-id'
+ expected = qiime2.Metadata(expected)
+
+ self.assertEqual(actual, expected)
+
+ def test_tabulate_read_counts_multiple(self):
+ # Several inputs are merged into one table, in input order.
+ actual = tabulate_read_counts([self.demux_se_data_1,
+ self.demux_se_data_2])
+
+ expected = {'sample1': 2,
+ 'sample2': 2,
+ 'sample3': 2,
+ 'sample4': 2,
+ 'sample5': 3,
+ 'sample6': 2,
+ 'sample7': 2}
+ expected = pd.Series(expected)
+ expected.name = 'Demultiplexed sequence count'
+ expected = expected.to_frame()
+ expected.index.name = 'sample-id'
+ expected = qiime2.Metadata(expected)
+
+ self.assertEqual(actual, expected)
+
+ # Mixing a paired-end and a single-end input is also accepted.
+ actual = tabulate_read_counts([self.demux_pe_data_1,
+ self.demux_se_data_2])
+
+ expected = {'sample1': 2,
+ 'sample6': 2,
+ 'sample7': 2}
+ expected = pd.Series(expected)
+ expected.name = 'Demultiplexed sequence count'
+ expected = expected.to_frame()
+ expected.index.name = 'sample-id'
+ expected = qiime2.Metadata(expected)
+
+ self.assertEqual(actual, expected)
+
+ def test_tabulate_read_counts_error(self):
+ # sample1 is present in both inputs, so tabulation must fail.
+ with self.assertRaisesRegex(KeyError, 'duplicated.*sample1'):
+ tabulate_read_counts([self.demux_se_data_1, self.demux_pe_data_1])
=====================================
setup.py
=====================================
@@ -43,6 +43,9 @@ setup(
'data/summarize_empty/empty_reverse_in_paired/*',
'data/summarize_empty/empty_paired_end/*',
'data/reverse_only/*',
+ 'data/tabulate_read_counts_single_end_1/*',
+ 'data/tabulate_read_counts_single_end_2/*',
+ 'data/tabulate_read_counts_paired_end_1/*',
],
'q2_demux': ['_summarize/assets/*.html',
'_summarize/assets/dist/*',
View it on GitLab: https://salsa.debian.org/med-team/q2-demux/-/commit/0e6a4bcbb9450c9583af489f2944ed0ca3c843ee
--
View it on GitLab: https://salsa.debian.org/med-team/q2-demux/-/commit/0e6a4bcbb9450c9583af489f2944ed0ca3c843ee
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240130/a0938a1c/attachment-0001.htm>
More information about the debian-med-commit
mailing list