[med-svn] [Git][med-team/q2-cutadapt][upstream] New upstream version 2024.2.0
Andreas Tille (@tille)
gitlab at salsa.debian.org
Sun Feb 18 13:42:37 GMT 2024
Andreas Tille pushed to branch upstream at Debian Med / q2-cutadapt
Commits:
5605a03f by Andreas Tille at 2024-02-18T14:38:19+01:00
New upstream version 2024.2.0
- - - - -
6 changed files:
- .github/workflows/ci-dev.yaml
- README.md
- q2_cutadapt/_demux.py
- q2_cutadapt/_version.py
- q2_cutadapt/plugin_setup.py
- q2_cutadapt/tests/test_demux.py
Changes:
=====================================
.github/workflows/ci-dev.yaml
=====================================
@@ -9,4 +9,4 @@ jobs:
ci:
uses: qiime2/distributions/.github/workflows/lib-ci-dev.yaml@dev
with:
- distro: core
\ No newline at end of file
+ distro: amplicon
=====================================
README.md
=====================================
@@ -1,5 +1,5 @@
# q2-cutadapt
-![](https://github.com/qiime2/q2-cutadapt/workflows/ci/badge.svg)
+![](https://github.com/qiime2/q2-cutadapt/workflows/ci-dev/badge.svg)
This is a QIIME 2 plugin. For details on QIIME 2, see https://qiime2.org.
\ No newline at end of file
=====================================
q2_cutadapt/_demux.py
=====================================
@@ -40,9 +40,12 @@ def run_command(cmd, verbose=True):
def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
untrimmed_dir_fmt, error_rate, minimum_length,
+ forward_cut=0, reverse_cut=0,
+ anchor_forward=False, anchor_reverse=False,
cores=1):
cmd = ['cutadapt',
- '--front', 'file:%s' % barcode_fhs['fwd'].name,
+ '-g',
+ f'{"^" if anchor_forward else ""}file:{barcode_fhs["fwd"].name}',
'--error-rate', str(error_rate),
'--minimum-length', str(minimum_length),
# {name} is a cutadapt convention for interpolating the sample id
@@ -57,7 +60,8 @@ def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
# Dual indices
cmd += [
'--pair-adapters',
- '-G', 'file:%s' % barcode_fhs['rev'].name,
+ '-G',
+ f'{"^" if anchor_reverse else ""}file:{barcode_fhs["rev"].name}', # noqa: E501
]
cmd += [
'-p', os.path.join(str(per_sample_dir_fmt), '{name}.2.fastq.gz'),
@@ -65,12 +69,16 @@ def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
os.path.join(str(untrimmed_dir_fmt), 'reverse.fastq.gz'),
str(seqs_dir_fmt.forward_sequences.view(FastqGzFormat)),
str(seqs_dir_fmt.reverse_sequences.view(FastqGzFormat)),
- ]
+ '-U', str(reverse_cut),
+ ]
else:
# SINGLE-END
cmd += [str(seqs_dir_fmt.file.view(FastqGzFormat))]
- cmd += ['-j', str(cores)]
+ cmd += [
+ '-u', str(forward_cut),
+ '-j', str(cores)
+ ]
return cmd
@@ -191,7 +199,8 @@ def _check_barcodes_uniqueness(
def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
- error_tolerance, mux_fmt, batch_size, minimum_length, cores):
+ error_tolerance, mux_fmt, batch_size, minimum_length, forward_cut,
+ reverse_cut, anchor_forward, anchor_reverse, cores):
fwd_barcode_name = forward_barcodes.name
forward_barcodes = forward_barcodes.drop_missing_values()
barcodes = forward_barcodes.to_series().to_frame()
@@ -217,10 +226,11 @@ def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
open_fhs['rev'] = tempfile.NamedTemporaryFile()
_write_barcode_fasta(barcode_batch[rev_barcode_name],
open_fhs['rev'])
- cmd = _build_demux_command(previous_untrimmed, open_fhs,
- per_sample_sequences,
- current_untrimmed, error_tolerance,
- minimum_length, cores)
+ cmd = _build_demux_command(
+ previous_untrimmed, open_fhs, per_sample_sequences,
+ current_untrimmed, error_tolerance, minimum_length, forward_cut,
+ reverse_cut, anchor_forward, anchor_reverse, cores
+ )
run_command(cmd)
open_fhs['fwd'].close()
if reverse_barcodes is not None:
@@ -237,6 +247,8 @@ def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
barcodes: qiime2.CategoricalMetadataColumn,
+ cut: int = 0,
+ anchor_barcode: bool = False,
error_rate: float = 0.1,
batch_size: int = 0,
minimum_length: int = 1,
@@ -248,15 +260,53 @@ def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
mux_fmt = MultiplexedSingleEndBarcodeInSequenceDirFmt
untrimmed = _demux(
- seqs, per_sample_sequences, barcodes, None, error_rate, mux_fmt,
- batch_size, minimum_length, cores)
+ seqs, per_sample_sequences, barcodes, None, error_rate,
+ mux_fmt, batch_size, minimum_length, cut, 0, anchor_barcode, False,
+ cores)
return per_sample_sequences, untrimmed
+def _check_paired_requirements(loc):
+ mixed_orientation = loc.get("mixed_orientation", None)
+ forward_cut = loc.get("forward_cut", 0)
+ reverse_cut = loc.get("reverse_cut", 0)
+ reverse_barcodes = loc.get("reverse_barcodes", None)
+ anchor_forward_barcode = loc.get("anchor_forward_barcode", False)
+ anchor_reverse_barcode = loc.get("anchor_reverse_barcode", False)
+
+ if (
+ not mixed_orientation
+ and anchor_reverse_barcode and (reverse_barcodes is None)
+ ):
+ raise ValueError("A reverse barcode needs to be provided in order to "
+ "anchor the reverse barcode.")
+
+ if (
+ mixed_orientation
+ and forward_cut != reverse_cut
+ ):
+ raise ValueError("'forward_cut' and 'reverse_cut' need to be set to "
+ "the same number when using the 'mixed_orientation' "
+ "mode.")
+
+ if (
+ mixed_orientation
+ and anchor_forward_barcode != anchor_reverse_barcode
+ ):
+ raise ValueError(
+ "'anchor_forward_barcode' and 'anchor_reverse_barcode' need to be "
+ "set to the same value when using the 'mixed_orientation' mode."
+ )
+
+
def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
forward_barcodes: qiime2.CategoricalMetadataColumn,
reverse_barcodes: qiime2.CategoricalMetadataColumn = None,
+ forward_cut: int = 0,
+ reverse_cut: int = 0,
+ anchor_forward_barcode: bool = False,
+ anchor_reverse_barcode: bool = False,
error_rate: float = 0.1,
batch_size: int = 0,
minimum_length: int = 1,
@@ -266,13 +316,15 @@ def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
MultiplexedPairedEndBarcodeInSequenceDirFmt):
_check_barcodes_uniqueness(
forward_barcodes, reverse_barcodes, mixed_orientation)
+ _check_paired_requirements(locals())
per_sample_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
mux_fmt = MultiplexedPairedEndBarcodeInSequenceDirFmt
untrimmed = _demux(
seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
- error_rate, mux_fmt, batch_size, minimum_length, cores)
+ error_rate, mux_fmt, batch_size, minimum_length, forward_cut,
+ reverse_cut, anchor_forward_barcode, anchor_reverse_barcode, cores)
if mixed_orientation:
fwd = untrimmed.forward_sequences.view(FastqGzFormat)
@@ -282,10 +334,12 @@ def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
# fwd -> rev && rev -> fwd
remaining_seqs.forward_sequences.write_data(rev, FastqGzFormat)
remaining_seqs.reverse_sequences.write_data(fwd, FastqGzFormat)
-
+ # Cuts have already been performed during the first demux pass, set
+ # forward and reverse cut to 0
untrimmed = _demux(
remaining_seqs, per_sample_sequences, forward_barcodes,
reverse_barcodes, error_rate, mux_fmt, batch_size,
- minimum_length, cores)
+ minimum_length, 0, 0, anchor_reverse_barcode,
+ anchor_forward_barcode, cores)
return per_sample_sequences, untrimmed
=====================================
q2_cutadapt/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2023.9.0, Release-2023.9)"
- git_full = "4098881c977e2a54475e580e65b40c15c92c490d"
- git_date = "2023-10-03 21:51:11 +0000"
+ git_refnames = " (tag: 2024.2.0, Release-2024.2)"
+ git_full = "aee8d31a9f1517824d9bda653dc525a8c8721cfb"
+ git_date = "2024-02-16 21:55:34 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
q2_cutadapt/plugin_setup.py
=====================================
@@ -17,6 +17,7 @@ from qiime2.plugin import (
List,
Str,
Bool,
+ Threads,
)
from q2_types.multiplexed_sequences import (
MultiplexedSingleEndBarcodeInSequence,
@@ -53,7 +54,7 @@ plugin.methods.register_function(
'demultiplexed_sequences': SampleData[SequencesWithQuality],
},
parameters={
- 'cores': Int % Range(1, None),
+ 'cores': Threads,
'adapter': List[Str],
'front': List[Str],
'anywhere': List[Str],
@@ -144,7 +145,7 @@ plugin.methods.register_function(
'demultiplexed_sequences': SampleData[PairedEndSequencesWithQuality],
},
parameters={
- 'cores': Int % Range(1, None),
+ 'cores': Threads,
'adapter_f': List[Str],
'front_f': List[Str],
'anywhere_f': List[Str],
@@ -263,9 +264,11 @@ plugin.methods.register_function(
'barcodes': MetadataColumn[Categorical],
'error_rate': Float % Range(0, 1, inclusive_start=True,
inclusive_end=True),
+ 'anchor_barcode': Bool,
'batch_size': Int % Range(0, None),
'minimum_length': Int % Range(1, None),
- 'cores': Int % Range(1, None),
+ 'cut': Int,
+ 'cores': Threads,
},
outputs=[
('per_sample_sequences', SampleData[SequencesWithQuality]),
@@ -282,6 +285,10 @@ plugin.methods.register_function(
'allowable error rate. The default value specified by '
'cutadapt is 0.1 (=10%), which is greater than '
'`demux emp-*`, which is 0.0 (=0%).',
+ 'anchor_barcode': 'Anchor the barcode. The barcode is then '
+ 'expected to occur in full length at the beginning '
+ '(5\' end) of the sequence. Can speed up '
+ 'demultiplexing if used.',
'batch_size': 'The number of samples cutadapt demultiplexes '
'concurrently. Demultiplexing in smaller batches will '
'yield the same result with marginal speed loss, and '
@@ -291,6 +298,11 @@ plugin.methods.register_function(
'the cutadapt default of 0 has been overridden, '
'because that value produces empty sequence '
'records.',
+ 'cut': 'Remove the specified number of bases from the sequences. '
+ 'Bases are removed before demultiplexing. If a positive value '
+ 'is provided, bases are removed from the beginning of the '
+ 'sequences. If a negative value is provided, bases are removed '
+ 'from the end of the sequences.',
},
output_descriptions={
'per_sample_sequences': 'The resulting demultiplexed sequences.',
@@ -314,12 +326,16 @@ plugin.methods.register_function(
parameters={
'forward_barcodes': MetadataColumn[Categorical],
'reverse_barcodes': MetadataColumn[Categorical],
+ 'forward_cut': Int,
+ 'reverse_cut': Int,
'error_rate': Float % Range(0, 1, inclusive_start=True,
inclusive_end=True),
+ 'anchor_forward_barcode': Bool,
+ 'anchor_reverse_barcode': Bool,
'batch_size': Int % Range(0, None),
'minimum_length': Int % Range(1, None),
'mixed_orientation': Bool,
- 'cores': Int % Range(1, None),
+ 'cores': Threads,
},
outputs=[
('per_sample_sequences', SampleData[PairedEndSequencesWithQuality]),
@@ -334,6 +350,32 @@ plugin.methods.register_function(
'per-sample barcodes for the forward reads.',
'reverse_barcodes': 'The sample metadata column listing the '
'per-sample barcodes for the reverse reads.',
+ 'forward_cut': 'Remove the specified number of bases from the forward '
+ 'sequences. Bases are removed before demultiplexing. '
+ 'If a positive value is provided, bases are removed '
+ 'from the beginning of the sequences. If a negative '
+ 'value is provided, bases are removed from the end of '
+ 'the sequences. If --p-mixed-orientation is set, then '
+ 'both --p-forward-cut and --p-reverse-cut must be '
+ 'set to the same value.',
+ 'reverse_cut': 'Remove the specified number of bases from the reverse '
+ 'sequences. Bases are removed before demultiplexing. '
+ 'If a positive value is provided, bases are removed '
+ 'from the beginning of the sequences. If a negative '
+ 'value is provided, bases are removed from the end of '
+ 'the sequences. If --p-mixed-orientation is set, then '
+ 'both --p-forward-cut and --p-reverse-cut must be '
+ 'set to the same value.',
+ 'anchor_forward_barcode': 'Anchor the forward barcode. The '
+ 'barcode is then expected to occur in full '
+ 'length at the beginning (5\' end) of the '
+ 'forward sequence. Can speed up '
+ 'demultiplexing if used.',
+ 'anchor_reverse_barcode': 'Anchor the reverse barcode. The '
+ 'barcode is then expected to occur in full '
+ 'length at the beginning (5\' end) of the '
+ 'reverse sequence. Can speed up '
+ 'demultiplexing if used.',
'error_rate': 'The level of error tolerance, specified as the maximum '
'allowable error rate.',
'batch_size': 'The number of samples cutadapt demultiplexes '
@@ -347,7 +389,7 @@ plugin.methods.register_function(
'records.',
'mixed_orientation': 'Handle demultiplexing of mixed orientation '
'reads (i.e. when forward and reverse reads '
- 'coexist in the same file).'
+ 'coexist in the same file).',
},
output_descriptions={
'per_sample_sequences': 'The resulting demultiplexed sequences.',
=====================================
q2_cutadapt/tests/test_demux.py
=====================================
@@ -316,6 +316,135 @@ class TestDemuxSingle(TestPluginBase):
self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
self.assert_untrimmed_results('', obs_untrimmed_art)
+ def test_cut_positive(self):
+ metadata = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCC'], name='Barcode',
+ index=pd.Index(['sample_a', 'sample_b'], name='id')))
+
+ exp = [
+ # sample a passed. However, as the first 'A' was removed, there is
+ # a shift in the extracted sequence.
+ '@id1\nCGTACGT\n+\nzzzzzzz\n' # vs ACGTACGT in `test_typical`
+ '@id3\nCGTACGT\n+\nzzzzzzz\n',
+ # sample b passed because by default cutadapt allows matching
+ # non-complete barcodes to the targeted sequences.
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n', ]
+ # Expected untrimmed sequence is the same as in `test_typical`, only
+ # shortened of its first nucleotide.
+ exp_untrimmed = \
+ '@id6\nGGGACGTACGT\n+\nzzzzzzzzzzz\n'
+
+ with redirected_stdio(stderr=os.devnull):
+ obs_demuxed_art, obs_untrimmed_art = \
+ self.demux_single_fn(self.muxed_sequences, metadata, cut=1)
+
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
+ def test_cut_positive_overcut(self):
+ metadata = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCC'], name='Barcode',
+ index=pd.Index(['sample_a', 'sample_b'], name='id')))
+
+ exp = [
+ # sample a passed. However, as the first 'A' was removed, there is
+ # a shift in the extracted sequence.
+ '@id1\nCGTACGT\n+\nzzzzzzz\n' # vs ACGTACGT in `test_typical`
+ '@id3\nCGTACGT\n+\nzzzzzzz\n',
+ # Sample b is empty as with the first two nucleotides cut, the
+ # overlap between barcode and sequence is 2 (under the threshold
+ # of 3)
+ '', ]
+ exp_untrimmed = (
+ '@id2\nCCACGTACGT\n+\nzzzzzzzzzz\n'
+ '@id4\nCCACGTACGT\n+\nzzzzzzzzzz\n'
+ '@id5\nCCACGTACGT\n+\nzzzzzzzzzz\n'
+ '@id6\nGGACGTACGT\n+\nzzzzzzzzzz\n'
+ )
+
+ with redirected_stdio(stderr=os.devnull):
+ obs_demuxed_art, obs_untrimmed_art = \
+ self.demux_single_fn(self.muxed_sequences, metadata, cut=2)
+
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
+ def test_cut_negative(self):
+ metadata = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCC'], name='Barcode',
+ index=pd.Index(['sample_a', 'sample_b'], name='id')))
+ # Expected demux and untrimmed sequences are the same as in
+ # `test_typical`, only shortened of their last two nucleotides.
+ exp = [
+ # sample a
+ '@id1\nACGTAC\n+\nzzzzzz\n'
+ '@id3\nACGTAC\n+\nzzzzzz\n',
+ # sample b
+ '@id2\nACGTAC\n+\nzzzzzz\n'
+ '@id4\nACGTAC\n+\nzzzzzz\n'
+ '@id5\nACGTAC\n+\nzzzzzz\n', ]
+
+ exp_untrimmed = \
+ '@id6\nGGGGACGTAC\n+\nzzzzzzzzzz\n'
+
+ with redirected_stdio(stderr=os.devnull):
+ obs_demuxed_art, obs_untrimmed_art = \
+ self.demux_single_fn(self.muxed_sequences, metadata, cut=-2)
+
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
+ def test_anchored(self):
+ metadata = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCA'], name='Barcode',
+ index=pd.Index(['sample_a', 'sample_b'], name='id')))
+ exp = [
+ # sample a
+ '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample b is empty because the first 'C' from the sequence is
+ # not in the barcode (sequence is 'CCCCACGTACGT')
+ '',
+ ]
+
+ with redirected_stdio(stderr=os.devnull):
+ obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
+ self.muxed_sequences, metadata, anchor_barcode=True)
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ self.assert_untrimmed_results('@id2\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id4\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id5\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+ obs_untrimmed_art)
+
+ def test_anchored_cut(self):
+ metadata = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCC'], name='Barcode',
+ index=pd.Index(['sample_a', 'sample_b'], name='id')))
+ exp = [
+ # sample a passed. However, as the first 'A' was removed, there was
+ # a shift in the extracted sequence.
+ '@id1\nCGTACGT\n+\nzzzzzzz\n' # vs ACGTACGT in other tests
+ '@id3\nCGTACGT\n+\nzzzzzzz\n',
+ # sample b is empty because the removal of the first base only left
+ # 'CCC' from the original barcode.
+ '',
+ ]
+
+ with redirected_stdio(stderr=os.devnull):
+ obs_demuxed_art, obs_untrimmed_art = self.demux_single_fn(
+ self.muxed_sequences, metadata, cut=1, anchor_barcode=True)
+
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ # Rem: the first base was removed from all the sequences
+ self.assert_untrimmed_results('@id2\nCCCACGTACGT\n+\nzzzzzzzzzzz\n'
+ '@id4\nCCCACGTACGT\n+\nzzzzzzzzzzz\n'
+ '@id5\nCCCACGTACGT\n+\nzzzzzzzzzzz\n'
+ '@id6\nGGGACGTACGT\n+\nzzzzzzzzzzz\n',
+ obs_untrimmed_art)
+
class TestDemuxPaired(TestPluginBase):
package = 'q2_cutadapt.tests'
@@ -396,6 +525,81 @@ class TestDemuxPaired(TestPluginBase):
self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+ def test_cut(self):
+ metadata = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCC'], name='Barcode',
+ index=pd.Index(['sample_a', 'sample_b'], name='id')))
+ exp = [
+ # sample a, fwd
+ '@id1\nCGTACGT\n+\nzzzzzzz\n'
+ '@id3\nCGTACGT\n+\nzzzzzzz\n',
+ # sample a, rev
+ '@id1\nGGGGTGCATG\n+\nzzzzzzzzzz\n'
+ '@id3\nGGGGTGCATG\n+\nzzzzzzzzzz\n',
+ # sample b, fwd
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample b, rev
+ '@id2\nTTTTTGCATG\n+\nzzzzzzzzzz\n'
+ '@id4\nTTTTTGCATG\n+\nzzzzzzzzzz\n'
+ '@id5\nTTTTTGCATG\n+\nzzzzzzzzzz\n', ]
+ exp_untrimmed = [
+ '@id6\nGGGACGTACGT\n+\nzzzzzzzzzzz\n',
+ '@id6\nTTTTTGCATG\n+\nzzzzzzzzzz\n'
+ ]
+
+ # Test a positive cut in forward sequences and a negative cut in
+ # reverse at the same time
+ with redirected_stdio(stderr=os.devnull):
+ obs_demuxed_art, obs_untrimmed_art = \
+ self.demux_paired_fn(self.muxed_sequences,
+ forward_barcodes=metadata,
+ forward_cut=1,
+ reverse_cut=-2)
+
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
+ def test_anchored(self):
+ metadata = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCA'], name='Barcode',
+ index=pd.Index(['sample_a', 'sample_b'], name='id')))
+ exp = [
+ # sample a, fwd
+ '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample a, rev
+ '@id1\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
+ '@id3\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n',
+ # sample b, fwd is empty because the first 'C' from the sequence is
+ # not in the barcode (sequence is 'CCCCACGTACGT')
+ '',
+ # sample b, rev is empty for the same reason
+ '', ]
+
+ exp_untrimmed = [
+ '@id2\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id4\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id5\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+ '@id2\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+ '@id4\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+ '@id5\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+ '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+ ]
+
+ # Test a positive cut in forward sequences and a negative cut in
+ # reverse at the same time
+ with redirected_stdio(stderr=os.devnull):
+ obs_demuxed_art, obs_untrimmed_art = \
+ self.demux_paired_fn(self.muxed_sequences,
+ forward_barcodes=metadata,
+ anchor_forward_barcode=True)
+
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
def test_dual_index_success(self):
forward_barcodes = CategoricalMetadataColumn(
pd.Series(['AAAA', 'CCCC'], name='ForwardBarcode',
@@ -414,7 +618,7 @@ class TestDemuxPaired(TestPluginBase):
'@id2\nACGTACGT\n+\nzzzzzzzz\n'
'@id4\nACGTACGT\n+\nzzzzzzzz\n'
'@id5\nACGTACGT\n+\nzzzzzzzz\n',
- # sample a, rev
+ # sample b, rev
'@id2\nTGCATGCA\n+\nzzzzzzzz\n'
'@id4\nTGCATGCA\n+\nzzzzzzzz\n'
'@id5\nTGCATGCA\n+\nzzzzzzzz\n', ]
@@ -431,6 +635,65 @@ class TestDemuxPaired(TestPluginBase):
obs_demuxed_art)
self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+ def test_dual_index_anchored(self):
+ forward_barcodes = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCC', 'GGGA'], name='ForwardBarcode',
+ index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
+ name='id')))
+ reverse_barcodes = CategoricalMetadataColumn(
+ pd.Series(['GGGT', 'TTTT', 'TTTT'], name='ReverseBarcode',
+ index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
+ name='id')))
+ exp = [
+ # sample a, fwd is empty because of reverse anchoring
+ '',
+ # sample a, rev is empty because of reverse anchoring
+ '',
+ # sample b, fwd
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample b, rev
+ '@id2\nTGCATGCA\n+\nzzzzzzzz\n'
+ '@id4\nTGCATGCA\n+\nzzzzzzzz\n'
+ '@id5\nTGCATGCA\n+\nzzzzzzzz\n',
+ # sample c, fwd is empty because of forward anchoring,
+ '',
+ # sample c, rev is empty because of forward anchoring,
+ '',
+ ]
+ exp_untrimmed = [
+ '@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+ '@id1\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
+ '@id3\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
+ '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+ ]
+
+ with redirected_stdio(stderr=os.devnull):
+ obs_demuxed_art, obs_untrimmed_art = \
+ self.demux_paired_fn(self.muxed_sequences,
+ forward_barcodes=forward_barcodes,
+ reverse_barcodes=reverse_barcodes,
+ anchor_forward_barcode=True,
+ anchor_reverse_barcode=True)
+
+ self.assert_demux_results(forward_barcodes.to_series(), exp,
+ obs_demuxed_art)
+ self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
+ def test_dual_index_anchor_fail_no_reverse(self):
+ metadata = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCA'], name='Barcode',
+ index=pd.Index(['sample_a', 'sample_b'], name='id')))
+
+ with self.assertRaises(ValueError):
+ self.demux_paired_fn(self.muxed_sequences,
+ forward_barcodes=metadata,
+ anchor_forward_barcode=True,
+ anchor_reverse_barcode=True)
+
def test_dual_index_mixed_orientation_success(self):
forward_barcodes = CategoricalMetadataColumn(
pd.Series(['AAAA', 'CCCC'], name='ForwardBarcode',
@@ -466,7 +729,7 @@ class TestDemuxPaired(TestPluginBase):
'@id2\nACGTACGT\n+\nzzzzzzzz\n'
'@id4\nACGTACGT\n+\nzzzzzzzz\n'
'@id5\nACGTACGT\n+\nzzzzzzzz\n',
- # sample a, rev
+ # sample b, rev
'@id2\nTGCATGCA\n+\nzzzzzzzz\n'
'@id4\nTGCATGCA\n+\nzzzzzzzz\n'
'@id5\nTGCATGCA\n+\nzzzzzzzz\n', ]
@@ -536,6 +799,131 @@ class TestDemuxPaired(TestPluginBase):
# Everything should match, so untrimmed should be empty
self.assert_untrimmed_results(['', ''], obs_untrimmed_art)
+ def test_mixed_orientation_cut(self):
+ # sample_a and sample_b have reads in both fwd and rev directions.
+ # sample_c only has reads in the fwd direction.
+ # sample_d only has reads in the rev direction.
+ # If `cut` happens during the first and second demux pass, the
+ # samples a, b and d will be too short and will not demux.
+ forward_barcodes = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'], name='ForwardBarcode',
+ index=pd.Index(['sample_a', 'sample_b', 'sample_c',
+ 'sample_d'], name='id')))
+ mixed_orientation_sequences_f_fp = self.get_data_path(
+ 'mixed-orientation/forward.fastq.gz')
+ mixed_orientation_sequences_r_fp = self.get_data_path(
+ 'mixed-orientation/reverse.fastq.gz')
+ with tempfile.TemporaryDirectory() as temp:
+ shutil.copy(mixed_orientation_sequences_f_fp, temp)
+ shutil.copy(mixed_orientation_sequences_r_fp, temp)
+ mixed_orientation_sequences = Artifact.import_data(
+ 'MultiplexedPairedEndBarcodeInSequence', temp)
+
+ with redirected_stdio(stderr=os.devnull):
+ obs_demuxed_art, obs_untrimmed_art = \
+ self.demux_paired_fn(mixed_orientation_sequences,
+ forward_barcodes=forward_barcodes,
+ forward_cut=1,
+ reverse_cut=1,
+ mixed_orientation=True)
+ exp = [
+ # sample_a fwd
+ '@id1\nCGTACGT\n+\nyyyyyyy\n'
+ '@id3\nCGTACGT\n+\nyyyyyyy\n',
+ # sample_a rev
+ '@id1\nGCATGCATGCA\n+\nzzzzzzzzzzz\n'
+ '@id3\nGCATGCATGCA\n+\nzzzzzzzzzzz\n',
+ # sample_b fwd
+ '@id4\nACGTACGT\n+\nyyyyyyyy\n'
+ '@id2\nACGTACGT\n+\nyyyyyyyy\n',
+ # sample_b rev
+ '@id4\nGCATGCATGCA\n+\nzzzzzzzzzzz\n'
+ '@id2\nGCATGCATGCA\n+\nzzzzzzzzzzz\n',
+ # sample_c fwd
+ '@id5\nACGTACGT\n+\nyyyyyyyy\n',
+ # sample_c rev
+ '@id5\nGCATGCATGCA\n+\nzzzzzzzzzzz\n',
+ # sample_d fwd
+ '@id6\nACGTACGT\n+\nyyyyyyyy\n',
+ # sample_d rev
+ '@id6\nGCATGCATGCA\n+\nzzzzzzzzzzz\n', ]
+
+ # We want to be sure that the validation is 100%, not just `min`,
+ obs_demuxed_art.validate(level='max')
+ # checkpoint assertion for the above `validate` - nothing should fail
+ self.assertTrue(True)
+
+ self.assert_demux_results(forward_barcodes.to_series(), exp,
+ obs_demuxed_art)
+
+ # Everything should match, so untrimmed should be empty
+ self.assert_untrimmed_results(['', ''], obs_untrimmed_art)
+
+ def test_mixed_orientation_anchored(self):
+ # sample_a and sample_b have reads in both fwd and rev directions.
+ # sample_c only has reads in the fwd direction.
+ # sample_d only has reads in the rev direction.
+ forward_barcodes = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCA', 'GGGG', 'TTTA'], name='ForwardBarcode',
+ index=pd.Index(['sample_a', 'sample_b', 'sample_c',
+ 'sample_d'], name='id')))
+ mixed_orientation_sequences_f_fp = self.get_data_path(
+ 'mixed-orientation/forward.fastq.gz')
+ mixed_orientation_sequences_r_fp = self.get_data_path(
+ 'mixed-orientation/reverse.fastq.gz')
+ with tempfile.TemporaryDirectory() as temp:
+ shutil.copy(mixed_orientation_sequences_f_fp, temp)
+ shutil.copy(mixed_orientation_sequences_r_fp, temp)
+ mixed_orientation_sequences = Artifact.import_data(
+ 'MultiplexedPairedEndBarcodeInSequence', temp)
+
+ with redirected_stdio(stderr=os.devnull):
+ obs_demuxed_art, obs_untrimmed_art = \
+ self.demux_paired_fn(mixed_orientation_sequences,
+ forward_barcodes=forward_barcodes,
+ anchor_forward_barcode=True,
+ anchor_reverse_barcode=True,
+ mixed_orientation=True)
+ exp = [
+ # sample_a fwd
+ '@id1\nACGTACGT\n+\nyyyyyyyy\n'
+ '@id3\nACGTACGT\n+\nyyyyyyyy\n',
+ # sample_a rev
+ '@id1\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
+ '@id3\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
+ # sample_b fwd is empty because the first 'C' from the sequence is
+ # not in the barcode (sequence is 'CCCCACGTACGT')
+ '',
+ # sample_b rev is empty for the same reason
+ '',
+ # sample_c fwd
+ '@id5\nACGTACGT\n+\nyyyyyyyy\n',
+ # sample_c rev
+ '@id5\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
+ # sample_d fwd is empty cf. sample_b
+ '',
+ # sample_d rev is empty cf. sample_b
+ '',
+ ]
+ exp_untrimmed = [
+ '@id2\nCCCCACGTACGT\n+\nyyyyyyyyyyyy\n'
+ '@id4\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
+ '@id6\nTTTTACGTACGT\n+\nyyyyyyyyyyyy\n',
+ '@id2\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
+ '@id4\nCCCCACGTACGT\n+\nyyyyyyyyyyyy\n'
+ '@id6\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
+ ]
+
+ # We want to be sure that the validation is 100%, not just `min`,
+ obs_demuxed_art.validate(level='max')
+ # checkpoint assertion for the above `validate` - nothing should fail
+ self.assertTrue(True)
+
+ self.assert_demux_results(forward_barcodes.to_series(), exp,
+ obs_demuxed_art)
+
+ self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+
def test_dual_index_mismatched_barcodes(self):
forward_barcodes = CategoricalMetadataColumn(
pd.Series(['AAAA', 'CCCC', 'ACGT'], name='ForwardBarcode',
@@ -587,6 +975,18 @@ class TestDemuxPaired(TestPluginBase):
reverse_barcodes=reverse_barcodes,
mixed_orientation=True)
+ def test_mixed_different_cuts(self):
+ forward_barcodes = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCC'], name='ForwardBarcode',
+ index=pd.Index(['sample_a', 'sample_b'], name='id')))
+
+ with self.assertRaises(ValueError):
+ self.demux_paired_fn(self.muxed_sequences,
+ forward_barcodes=forward_barcodes,
+ forward_cut=4,
+ reverse_cut=2,
+ mixed_orientation=True)
+
class TestDemuxUtilsSingleEnd(TestPluginBase):
package = 'q2_cutadapt.tests'
@@ -613,13 +1013,26 @@ class TestDemuxUtilsSingleEnd(TestPluginBase):
0.1,
2)
self.assertTrue(barcode_fasta.name in obs[2])
+ self.assertTrue('^file' not in obs[2]) # not anchored
self.assertTrue('0.1' in obs[4])
self.assertTrue('2' in obs[6])
self.assertTrue(str(self.per_sample_dir_fmt) in obs[8])
self.assertTrue(str(self.untrimmed_dir_fmt) in obs[10])
self.assertEqual(str(self.seqs_dir_fmt.file.view(FastqGzFormat)),
obs[11])
- self.assertTrue('1' in obs[13])
+ self.assertTrue('0' in obs[13]) # fwd cut
+ self.assertTrue('1' in obs[15]) # cores
+
+ # Check that '^' is added before 'file' when adapters are anchored
+ with tempfile.NamedTemporaryFile() as barcode_fasta:
+ obs = _build_demux_command(self.seqs_dir_fmt,
+ {'fwd': barcode_fasta, 'rev': None},
+ self.per_sample_dir_fmt,
+ self.untrimmed_dir_fmt,
+ 0.1,
+ 2,
+ anchor_forward=True)
+ self.assertTrue('^file' in obs[2])
def test_rename_files_single(self):
for fn in ['sample_a.1.fastq.gz', 'sample_b.1.fastq.gz']:
@@ -701,7 +1114,7 @@ class TestDemuxUtilsPairedEnd(TestPluginBase):
self.untrimmed_dir_fmt,
0.1,
2)
- self.assertTrue(barcode_fasta.name in obs[2])
+ self.assertTrue(barcode_fasta.name in obs[2])
self.assertTrue('0.1' in obs[4])
self.assertTrue('2' in obs[6])
self.assertTrue(str(self.per_sample_dir_fmt) in obs[8]) # fwd
@@ -712,6 +1125,9 @@ class TestDemuxUtilsPairedEnd(TestPluginBase):
self.assertEqual(exp_f, obs[15])
exp_r = str(self.seqs_dir_fmt.reverse_sequences.view(FastqGzFormat))
self.assertEqual(exp_r, obs[16])
+ self.assertEqual('0', obs[18]) # rev cut
+ self.assertEqual('0', obs[20]) # fwd cut
+ self.assertEqual('1', obs[22]) # cores
def test_build_dual_index_demux_command(self):
with tempfile.NamedTemporaryFile() as barcode_fasta_f:
View it on GitLab: https://salsa.debian.org/med-team/q2-cutadapt/-/commit/5605a03f2ad34bebc91ad1f2b276bc5af7fe36dc
--
View it on GitLab: https://salsa.debian.org/med-team/q2-cutadapt/-/commit/5605a03f2ad34bebc91ad1f2b276bc5af7fe36dc
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240218/a83f345f/attachment-0001.htm>
More information about the debian-med-commit
mailing list