[med-svn] [Git][med-team/q2-cutadapt][upstream] 2 commits: New upstream version 2020.2.0
Steffen Möller
gitlab at salsa.debian.org
Tue Dec 1 23:38:31 GMT 2020
Steffen Möller pushed to branch upstream at Debian Med / q2-cutadapt
Commits:
e90e9700 by Steffen Moeller at 2020-06-10T16:20:31+02:00
New upstream version 2020.2.0
- - - - -
34f17f14 by Steffen Moeller at 2020-12-02T00:09:58+01:00
New upstream version 2020.11.0
- - - - -
13 changed files:
- LICENSE
- ci/recipe/meta.yaml
- q2_cutadapt/__init__.py
- q2_cutadapt/_demux.py
- q2_cutadapt/_trim.py
- q2_cutadapt/_version.py
- q2_cutadapt/plugin_setup.py
- q2_cutadapt/tests/__init__.py
- + q2_cutadapt/tests/data/mixed-orientation/forward.fastq.gz
- + q2_cutadapt/tests/data/mixed-orientation/reverse.fastq.gz
- q2_cutadapt/tests/test_demux.py
- q2_cutadapt/tests/test_trim.py
- setup.py
Changes:
=====================================
LICENSE
=====================================
@@ -1,6 +1,6 @@
BSD 3-Clause License
-Copyright (c) 2017-2019, QIIME 2 development team.
+Copyright (c) 2017-2020, QIIME 2 development team.
All rights reserved.
Redistribution and use in source and binary forms, with or without
=====================================
ci/recipe/meta.yaml
=====================================
@@ -19,7 +19,7 @@ requirements:
run:
- python {{ python }}
- - cutadapt
+ - cutadapt >=3
- pigz
- pandas
- numpy
=====================================
q2_cutadapt/__init__.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_cutadapt/_demux.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -8,6 +8,7 @@
import gzip
import os
+import shutil
import subprocess
import tempfile
@@ -83,8 +84,20 @@ def _rename_files(seqs_dir_fmt, per_sample_dir_fmt, barcode_series):
src = os.path.join(str(per_sample_dir_fmt),
'%s.%d.fastq.gz' % (sample_id,
read_direction))
+
+ # TODO: remove this outer guard when we upgrade to cutadapt 3
if os.path.isfile(src):
- os.rename(src, str(out_fp))
+ if out_fp.exists():
+ _merge_files(src, str(out_fp))
+ os.remove(src)
+ else:
+ os.rename(src, str(out_fp))
+
+
+def _merge_files(src, dst):
+ with gzip.open(src, mode='rt', encoding='ascii') as src_fh, \
+ gzip.open(dst, mode='at', encoding='ascii') as dst_fh:
+ shutil.copyfileobj(src_fh, dst_fh)
def _write_barcode_fasta(barcode_series, barcode_fasta):
@@ -106,8 +119,8 @@ def _write_empty_fastq_to_mux_barcode_in_seq_fmt(seqs_dir_fmt):
seqs_dir_fmt.file.write_data(fastq, FastqGzFormat)
-def _demux(seqs, forward_barcodes, reverse_barcodes, error_tolerance,
- mux_fmt, batch_size, minimum_length):
+def _demux(seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
+ error_tolerance, mux_fmt, batch_size, minimum_length):
fwd_barcode_name = forward_barcodes.name
forward_barcodes = forward_barcodes.drop_missing_values()
barcodes = forward_barcodes.to_series().to_frame()
@@ -130,14 +143,16 @@ def _demux(seqs, forward_barcodes, reverse_barcodes, error_tolerance,
if samples_w_missing_barcodes:
raise ValueError('The following samples do not have both '
- 'forward and reverse barcodes: %s'
+ 'forward and reverse barcodes (note: if your '
+ 'reads are in single index mixed orientation, '
+ 'try again with all of your barcodes in a single '
+ 'metadata column): %s'
% ', '.join(sorted(samples_w_missing_barcodes)))
if samples_w_dup_barcode_pairs:
raise ValueError('The following samples have duplicate barcode'
' pairs: %s' %
', '.join(sorted(samples_w_dup_barcode_pairs)))
- per_sample_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
n_samples = len(barcodes)
if batch_size > n_samples:
raise ValueError('The batch_size (%d) cannot be greater than the '
@@ -164,13 +179,13 @@ def _demux(seqs, forward_barcodes, reverse_barcodes, error_tolerance,
if reverse_barcodes is not None:
open_fhs['rev'].close()
previous_untrimmed = current_untrimmed
- # Only use the forward barcode in the renamed files
- _rename_files(seqs, per_sample_sequences, barcodes[fwd_barcode_name])
+ # Only use the forward barcode in the renamed files
+ _rename_files(seqs, per_sample_sequences, barcodes[fwd_barcode_name])
muxed = len(list(per_sample_sequences.sequences.iter_views(FastqGzFormat)))
if muxed == 0:
raise ValueError('No samples were demultiplexed.')
- return per_sample_sequences, previous_untrimmed
+ return previous_untrimmed
def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
@@ -180,9 +195,14 @@ def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
minimum_length: int = 1) -> \
(CasavaOneEightSingleLanePerSampleDirFmt,
MultiplexedSingleEndBarcodeInSequenceDirFmt):
+ per_sample_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
mux_fmt = MultiplexedSingleEndBarcodeInSequenceDirFmt
- return _demux(seqs, barcodes, None, error_rate, mux_fmt, batch_size,
- minimum_length)
+
+ untrimmed = _demux(
+ seqs, per_sample_sequences, barcodes, None, error_rate, mux_fmt,
+ batch_size, minimum_length)
+
+ return per_sample_sequences, untrimmed
def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
@@ -190,9 +210,33 @@ def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
reverse_barcodes: qiime2.CategoricalMetadataColumn = None,
error_rate: float = 0.1,
batch_size: int = 0,
- minimum_length: int = 1) -> \
+ minimum_length: int = 1,
+ mixed_orientation: bool = False) -> \
(CasavaOneEightSingleLanePerSampleDirFmt,
MultiplexedPairedEndBarcodeInSequenceDirFmt):
+ if mixed_orientation and reverse_barcodes is not None:
+ raise ValueError('Dual-indexed barcodes for mixed orientation '
+ 'reads are not supported.')
+
+ per_sample_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
mux_fmt = MultiplexedPairedEndBarcodeInSequenceDirFmt
- return _demux(seqs, forward_barcodes, reverse_barcodes, error_rate,
- mux_fmt, batch_size, minimum_length)
+
+ untrimmed = _demux(
+ seqs, per_sample_sequences, forward_barcodes, reverse_barcodes,
+ error_rate, mux_fmt, batch_size, minimum_length)
+
+ if mixed_orientation:
+ fwd = untrimmed.forward_sequences.view(FastqGzFormat)
+ rev = untrimmed.reverse_sequences.view(FastqGzFormat)
+
+ remaining_seqs = MultiplexedPairedEndBarcodeInSequenceDirFmt()
+ # fwd -> rev && rev -> fwd
+ remaining_seqs.forward_sequences.write_data(rev, FastqGzFormat)
+ remaining_seqs.reverse_sequences.write_data(fwd, FastqGzFormat)
+
+ untrimmed = _demux(
+ remaining_seqs, per_sample_sequences, forward_barcodes,
+ reverse_barcodes, error_rate, mux_fmt, batch_size,
+ minimum_length)
+
+ return per_sample_sequences, untrimmed
=====================================
q2_cutadapt/_trim.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_cutadapt/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2019.10.0)"
- git_full = "b7f4a9c3769dbea95c437997289ffdf6858e40f9"
- git_date = "2019-11-01 01:04:25 +0000"
+ git_refnames = " (HEAD -> master, tag: 2020.11.0)"
+ git_full = "8a18174db6b284af087f8bc82b0b717a446aab4e"
+ git_date = "2020-11-25 17:13:09 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
q2_cutadapt/plugin_setup.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -282,6 +282,7 @@ plugin.methods.register_function(
inclusive_end=True),
'batch_size': Int % Range(0, None),
'minimum_length': Int % Range(1, None),
+ 'mixed_orientation': Bool,
},
outputs=[
('per_sample_sequences', SampleData[PairedEndSequencesWithQuality]),
@@ -306,6 +307,9 @@ plugin.methods.register_function(
'the cutadapt default of 0 has been overridden, '
'because that value produces empty sequence '
'records.',
+ 'mixed_orientation': 'Handle demultiplexing of mixed orientation '
+ 'reads (i.e. when forward and reverse reads '
+ 'coexist in the same file).'
},
output_descriptions={
'per_sample_sequences': 'The resulting demultiplexed sequences.',
=====================================
q2_cutadapt/tests/__init__.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_cutadapt/tests/data/mixed-orientation/forward.fastq.gz
=====================================
Binary files /dev/null and b/q2_cutadapt/tests/data/mixed-orientation/forward.fastq.gz differ
=====================================
q2_cutadapt/tests/data/mixed-orientation/reverse.fastq.gz
=====================================
Binary files /dev/null and b/q2_cutadapt/tests/data/mixed-orientation/reverse.fastq.gz differ
=====================================
q2_cutadapt/tests/test_demux.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -7,6 +7,7 @@
# ----------------------------------------------------------------------------
import gzip
+import itertools
import os
import pathlib
import shutil
@@ -33,22 +34,28 @@ from qiime2.plugin.testing import TestPluginBase
class TestDemuxSingle(TestPluginBase):
package = 'q2_cutadapt.tests'
- def assert_demux_results(self, exp_samples_and_barcodes, obs_demuxed_art):
+ def assert_demux_results(self, exp_samples_and_barcodes, exp_results,
+ obs_demuxed_art):
obs_demuxed = obs_demuxed_art.view(
SingleLanePerSampleSingleEndFastqDirFmt)
obs_demuxed_seqs = obs_demuxed.sequences.iter_views(FastqGzFormat)
- zipped = zip(exp_samples_and_barcodes.iteritems(), obs_demuxed_seqs)
- for (sample_id, barcode), (filename, _) in zipped:
+ zipped = itertools.zip_longest(exp_samples_and_barcodes.iteritems(),
+ exp_results, obs_demuxed_seqs)
+ for (sample_id, barcode), exp, (filename, fmt) in zipped:
filename = str(filename)
self.assertTrue(sample_id in filename)
self.assertTrue(barcode in filename)
+ with gzip.open(str(fmt), 'rt') as fh:
+ obs = ''.join(fh.readlines())
+ self.assertEqual(exp, obs)
def assert_untrimmed_results(self, exp, obs_untrimmed_art):
obs_untrimmed = obs_untrimmed_art.view(
MultiplexedSingleEndBarcodeInSequenceDirFmt)
obs_untrimmed = obs_untrimmed.file.view(FastqGzFormat)
- obs_untrimmed = gzip.decompress(obs_untrimmed.path.read_bytes())
- self.assertEqual(exp, obs_untrimmed)
+ with gzip.open(str(obs_untrimmed), 'rt') as fh:
+ obs = ''.join(fh.readlines())
+ self.assertEqual(exp, obs)
def setUp(self):
super().setUp()
@@ -62,13 +69,21 @@ class TestDemuxSingle(TestPluginBase):
metadata = CategoricalMetadataColumn(
pd.Series(['AAAA', 'CCCC'], name='Barcode',
index=pd.Index(['sample_a', 'sample_b'], name='id')))
+ exp = [
+ # sample a
+ '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample b
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n', ]
with redirected_stdio(stderr=os.devnull):
obs_demuxed_art, obs_untrimmed_art = \
self.demux_single_fn(self.muxed_sequences, metadata)
- self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
- self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ self.assert_untrimmed_results('@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
obs_untrimmed_art)
def test_all_matched(self):
@@ -76,45 +91,83 @@ class TestDemuxSingle(TestPluginBase):
pd.Series(['AAAA', 'CCCC', 'GGGG'], name='Barcode',
index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
name='id')))
+ exp = [
+ # sample a
+ '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample b
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample c
+ '@id6\nACGTACGT\n+\nzzzzzzzz\n', ]
with redirected_stdio(stderr=os.devnull):
obs_demuxed_art, obs_untrimmed_art = \
self.demux_single_fn(self.muxed_sequences, metadata)
- self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
- # obs_untrimmed should be empty, since everything matched
- self.assert_untrimmed_results(b'', obs_untrimmed_art)
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ self.assert_untrimmed_results('', obs_untrimmed_art)
+ # NOTE: this test used to check for an exception because it was
+ # possible to generate a completely empty output dir with no fastq.gz
+ # files in it. As of cutadapt 3 this is no longer possible, because output
+ # files are generated for every sample (and we must specify at least one
+ # sample in order for the barcodes to be valid QIIME 2 Metadata). Rather
+ # than remove the test, we will retool it here.
def test_none_matched(self):
metadata = CategoricalMetadataColumn(
pd.Series(['TTTT'], name='Barcode',
index=pd.Index(['sample_d'], name='id')))
with redirected_stdio(stderr=os.devnull):
- with self.assertRaisesRegex(ValueError, 'demultiplexed'):
+ obs_demuxed_art, obs_untrimmed_art = \
self.demux_single_fn(self.muxed_sequences, metadata)
+ self.assert_demux_results(metadata.to_series(), [''], obs_demuxed_art)
+ self.assert_untrimmed_results('@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id2\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id4\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id5\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+ obs_untrimmed_art)
+
def test_error_tolerance_filtering(self):
metadata = CategoricalMetadataColumn(
pd.Series(['AAAG', 'CCCC'], name='Barcode',
index=pd.Index(['sample_a', 'sample_b'], name='id')))
+ exp = [
+ # sample a has no reads (bc we misspelled the barcode)
+ '',
+ # sample b
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n', ]
with redirected_stdio(stderr=os.devnull):
obs_demuxed_art, obs_untrimmed_art = \
self.demux_single_fn(self.muxed_sequences, metadata)
- # sample_a is dropped because of a substitution error (AAAA vs AAAG)
- exp_samples_and_barcodes = pd.Series(['CCCC'], index=['sample_b'])
- self.assert_demux_results(exp_samples_and_barcodes, obs_demuxed_art)
- self.assert_untrimmed_results(b'@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
- b'@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
- b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+ self.assert_demux_results(metadata.to_series(), exp,
+ obs_demuxed_art)
+ self.assert_untrimmed_results('@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
+ '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
obs_untrimmed_art)
def test_error_tolerance_high_enough_to_prevent_filtering(self):
metadata = CategoricalMetadataColumn(
pd.Series(['AAAG', 'CCCC'], name='Barcode',
index=pd.Index(['sample_a', 'sample_b'], name='id')))
+ exp = [
+ # sample a
+ '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample b
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n', ]
with redirected_stdio(stderr=os.devnull):
obs_demuxed_art, obs_untrimmed_art = \
@@ -122,8 +175,8 @@ class TestDemuxSingle(TestPluginBase):
error_rate=0.25)
# This test should yield the same results as test_typical, above
- self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
- self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ self.assert_untrimmed_results('@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
obs_untrimmed_art)
def test_extra_barcode_in_metadata(self):
@@ -131,19 +184,29 @@ class TestDemuxSingle(TestPluginBase):
pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'], name='Barcode',
index=pd.Index(['sample_a', 'sample_b', 'sample_c',
'sample_d'], name='id')))
+ exp = [
+ # sample a
+ '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample b
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample c
+ '@id6\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample d is empty bc no reads matched the barcode TTTT
+ '', ]
with redirected_stdio(stderr=os.devnull):
obs_demuxed_art, obs_untrimmed_art = \
self.demux_single_fn(self.muxed_sequences, metadata)
- # TTTT/sample_d shouldn't be in the demuxed results, because there
- # were no reads with that barcode present
- exp_samples_and_barcodes = pd.Series(['AAAA', 'CCCC', 'GGGG'],
+ exp_samples_and_barcodes = pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'],
index=['sample_a', 'sample_b',
- 'sample_c'])
- self.assert_demux_results(exp_samples_and_barcodes, obs_demuxed_art)
- # obs_untrimmed should be empty, since everything matched
- self.assert_untrimmed_results(b'', obs_untrimmed_art)
+ 'sample_c', 'sample_d'])
+ self.assert_demux_results(exp_samples_and_barcodes, exp,
+ obs_demuxed_art)
+ self.assert_untrimmed_results('', obs_untrimmed_art)
def test_variable_length_barcodes(self):
metadata = CategoricalMetadataColumn(
@@ -153,28 +216,46 @@ class TestDemuxSingle(TestPluginBase):
muxed_sequences_fp = self.get_data_path('variable_length.fastq.gz')
muxed_sequences = Artifact.import_data(
'MultiplexedSingleEndBarcodeInSequence', muxed_sequences_fp)
+ exp = [
+ # sample a
+ '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample b
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample c
+ '@id6\nACGTACGT\n+\nzzzzzzzz\n', ]
with redirected_stdio(stderr=os.devnull):
obs_demuxed_art, obs_untrimmed_art = \
self.demux_single_fn(muxed_sequences, metadata)
- # This test should yield the same results as test_typical, above, just
- # with variable length barcodes
- self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
- self.assert_untrimmed_results(b'', obs_untrimmed_art)
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ self.assert_untrimmed_results('', obs_untrimmed_art)
def test_batch_size(self):
metadata = CategoricalMetadataColumn(
pd.Series(['AAAA', 'CCCC'], name='Barcode',
index=pd.Index(['sample_a', 'sample_b'], name='id')))
+ exp = [
+ # sample a
+ '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample b
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n', ]
with redirected_stdio(stderr=os.devnull):
obs_demuxed_art, obs_untrimmed_art = \
self.demux_single_fn(self.muxed_sequences, metadata,
batch_size=1)
- self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
- self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+ # This test should yield the same results as test_typical, above;
+ # the fact that we are batching shouldn't impact the final results
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ self.assert_untrimmed_results('@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
obs_untrimmed_art)
def test_invalid_batch_size(self):
@@ -190,15 +271,24 @@ class TestDemuxSingle(TestPluginBase):
pd.Series(['AAAA', 'CCCC', 'GGGG'], name='Barcode',
index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
name='id')))
+ exp = [
+ # sample a
+ '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample b
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample c
+ '@id6\nACGTACGT\n+\nzzzzzzzz\n', ]
with redirected_stdio(stderr=os.devnull):
obs_demuxed_art, obs_untrimmed_art = \
self.demux_single_fn(self.muxed_sequences, metadata,
batch_size=2)
- self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
- # obs_untrimmed should be empty, since everything matched
- self.assert_untrimmed_results(b'', obs_untrimmed_art)
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ self.assert_untrimmed_results('', obs_untrimmed_art)
def test_min_length(self):
metadata = CategoricalMetadataColumn(
@@ -207,42 +297,59 @@ class TestDemuxSingle(TestPluginBase):
pd.Series(['AAAA', 'CCCC', 'GGGGACGTACGT'], name='Barcode',
index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
name='id')))
+ exp = [
+ # sample a
+ '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample b
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample c is empty because the barcode matched the entire
+ # read, which removed everything.
+ '', ]
with redirected_stdio(stderr=os.devnull):
obs_demuxed_art, obs_untrimmed_art = \
self.demux_single_fn(self.muxed_sequences, metadata)
- obs = obs_demuxed_art.view(SingleLanePerSampleSingleEndFastqDirFmt)
-
- (obs_f1, _), (obs_f2, _) = obs.sequences.iter_views(FastqGzFormat)
-
- self.assertEqual('sample_a_AAAA_L001_R1_001.fastq.gz', str(obs_f1))
- self.assertEqual('sample_b_CCCC_L001_R1_001.fastq.gz', str(obs_f2))
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
+ self.assert_untrimmed_results('', obs_untrimmed_art)
class TestDemuxPaired(TestPluginBase):
package = 'q2_cutadapt.tests'
- def assert_demux_results(self, exp_samples_and_barcodes, obs_demuxed_art):
+ def assert_demux_results(self, exp_samples_and_barcodes, exp_results,
+ obs_demuxed_art):
obs_demuxed = obs_demuxed_art.view(
SingleLanePerSamplePairedEndFastqDirFmt)
obs_demuxed_seqs = obs_demuxed.sequences.iter_views(FastqGzFormat)
# Since we are working with fwd/rev reads, duplicate each list elem
exp = [x for x in exp_samples_and_barcodes.iteritems() for _ in (0, 1)]
- zipped = zip(exp, obs_demuxed_seqs)
- for (sample_id, barcode), (filename, _) in zipped:
+ zipped = itertools.zip_longest(exp, exp_results, obs_demuxed_seqs)
+ for (sample_id, barcode), exp, (filename, fmt) in zipped:
filename = str(filename)
self.assertTrue(sample_id in filename)
self.assertTrue(barcode in filename)
+ with gzip.open(str(fmt), 'rt') as fh:
+ obs = ''.join(fh.readlines())
+ self.assertEqual(exp, obs)
def assert_untrimmed_results(self, exp, obs_untrimmed_art):
obs_untrimmed = obs_untrimmed_art.view(
MultiplexedPairedEndBarcodeInSequenceDirFmt)
+
+ # first check the fwd reads
obs_untrimmed_f = obs_untrimmed.forward_sequences.view(FastqGzFormat)
- obs_untrimmed_f = gzip.decompress(obs_untrimmed_f.path.read_bytes())
+ with gzip.open(str(obs_untrimmed_f), 'rt') as fh:
+ obs_untrimmed_f = ''.join(fh.readlines())
self.assertEqual(exp[0], obs_untrimmed_f)
+
+ # next check the rev reads
obs_untrimmed_r = obs_untrimmed.reverse_sequences.view(FastqGzFormat)
- obs_untrimmed_r = gzip.decompress(obs_untrimmed_r.path.read_bytes())
+ with gzip.open(str(obs_untrimmed_r), 'rt') as fh:
+ obs_untrimmed_r = ''.join(fh.readlines())
self.assertEqual(exp[1], obs_untrimmed_r)
def setUp(self):
@@ -264,14 +371,29 @@ class TestDemuxPaired(TestPluginBase):
metadata = CategoricalMetadataColumn(
pd.Series(['AAAA', 'CCCC'], name='Barcode',
index=pd.Index(['sample_a', 'sample_b'], name='id')))
+ exp = [
+ # sample a, fwd
+ '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample a, rev
+ '@id1\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
+ '@id3\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n',
+ # sample b, fwd
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample b, rev
+ '@id2\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+ '@id4\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
+ '@id5\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n', ]
+ exp_untrimmed = ['@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+ '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n']
with redirected_stdio(stderr=os.devnull):
obs_demuxed_art, obs_untrimmed_art = \
self.demux_paired_fn(self.muxed_sequences, metadata)
- self.assert_demux_results(metadata.to_series(), obs_demuxed_art)
- exp_untrimmed = [b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
- b'@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n']
+ self.assert_demux_results(metadata.to_series(), exp, obs_demuxed_art)
self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
def test_di_typical(self):
@@ -281,6 +403,23 @@ class TestDemuxPaired(TestPluginBase):
reverse_barcodes = CategoricalMetadataColumn(
pd.Series(['GGGG', 'TTTT'], name='ReverseBarcode',
index=pd.Index(['sample_a', 'sample_b'], name='id')))
+ exp = [
+ # sample a, fwd
+ '@id1\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id3\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample a, rev
+ '@id1\nTGCATGCA\n+\nzzzzzzzz\n'
+ '@id3\nTGCATGCA\n+\nzzzzzzzz\n',
+ # sample b, fwd
+ '@id2\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id4\nACGTACGT\n+\nzzzzzzzz\n'
+ '@id5\nACGTACGT\n+\nzzzzzzzz\n',
+ # sample b, rev
+ '@id2\nTGCATGCA\n+\nzzzzzzzz\n'
+ '@id4\nTGCATGCA\n+\nzzzzzzzz\n'
+ '@id5\nTGCATGCA\n+\nzzzzzzzz\n', ]
+ exp_untrimmed = ['@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
+ '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n']
with redirected_stdio(stderr=os.devnull):
obs_demuxed_art, obs_untrimmed_art = \
@@ -288,12 +427,66 @@ class TestDemuxPaired(TestPluginBase):
forward_barcodes=forward_barcodes,
reverse_barcodes=reverse_barcodes)
- self.assert_demux_results(forward_barcodes.to_series(),
+ self.assert_demux_results(forward_barcodes.to_series(), exp,
obs_demuxed_art)
- exp_untrimmed = [b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
- b'@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n']
self.assert_untrimmed_results(exp_untrimmed, obs_untrimmed_art)
+ def test_mixed_orientation_success(self):
+ # sample_a and sample_b have reads in both fwd and rev directions.
+ # sample_c only has reads in the fwd direction.
+ # sample_d only has reads in the rev direction.
+ forward_barcodes = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'], name='ForwardBarcode',
+ index=pd.Index(['sample_a', 'sample_b', 'sample_c',
+ 'sample_d'], name='id')))
+ mixed_orientation_sequences_f_fp = self.get_data_path(
+ 'mixed-orientation/forward.fastq.gz')
+ mixed_orientation_sequences_r_fp = self.get_data_path(
+ 'mixed-orientation/reverse.fastq.gz')
+ with tempfile.TemporaryDirectory() as temp:
+ shutil.copy(mixed_orientation_sequences_f_fp, temp)
+ shutil.copy(mixed_orientation_sequences_r_fp, temp)
+ mixed_orientation_sequences = Artifact.import_data(
+ 'MultiplexedPairedEndBarcodeInSequence', temp)
+
+ with redirected_stdio(stderr=os.devnull):
+ obs_demuxed_art, obs_untrimmed_art = \
+ self.demux_paired_fn(mixed_orientation_sequences,
+ forward_barcodes=forward_barcodes,
+ mixed_orientation=True)
+ exp = [
+ # sample_a fwd
+ '@id1\nACGTACGT\n+\nyyyyyyyy\n' \
+ '@id3\nACGTACGT\n+\nyyyyyyyy\n',
+ # sample_a rev
+ '@id1\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n' \
+ '@id3\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
+ # sample_b fwd
+ '@id4\nACGTACGT\n+\nyyyyyyyy\n' \
+ '@id2\nACGTACGT\n+\nyyyyyyyy\n',
+ # sample_b rev
+ '@id4\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n' \
+ '@id2\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
+ # sample_c fwd
+ '@id5\nACGTACGT\n+\nyyyyyyyy\n',
+ # sample_c rev
+ '@id5\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
+ # sample_d fwd
+ '@id6\nACGTACGT\n+\nyyyyyyyy\n',
+ # sample_d rev
+ '@id6\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n', ]
+
+ # We want to be sure that the validation is 100%, not just `min`.
+ obs_demuxed_art.validate(level='max')
+ # checkpoint assertion for the above `validate` - nothing should fail
+ self.assertTrue(True)
+
+ self.assert_demux_results(forward_barcodes.to_series(), exp,
+ obs_demuxed_art)
+
+ # Everything should match, so untrimmed should be empty
+ self.assert_untrimmed_results(['', ''], obs_untrimmed_art)
+
def test_di_mismatched_barcodes(self):
forward_barcodes = CategoricalMetadataColumn(
pd.Series(['AAAA', 'CCCC', 'ACGT'], name='ForwardBarcode',
@@ -326,6 +519,36 @@ class TestDemuxPaired(TestPluginBase):
forward_barcodes=forward_barcodes,
reverse_barcodes=reverse_barcodes)
+ def test_multiple_orientations_dual_indices(self):
+ forward_barcodes = CategoricalMetadataColumn(
+ pd.Series(['AAAA', 'CCCC'], name='ForwardBarcode',
+ index=pd.Index(['sample_a', 'sample_b'], name='id')))
+ reverse_barcodes = CategoricalMetadataColumn(
+ pd.Series(['GGGG', 'TTTT'], name='ReverseBarcode',
+ index=pd.Index(['sample_a', 'sample_b'], name='id')))
+
+ mixed_orientation_sequences_f_fp = self.get_data_path(
+ 'mixed-orientation/forward.fastq.gz')
+ mixed_orientation_sequences_r_fp = self.get_data_path(
+ 'mixed-orientation/reverse.fastq.gz')
+
+ # These files have forward and reverse reads mixed together in the same
+ # file
+ with tempfile.TemporaryDirectory() as temp:
+ shutil.copy(mixed_orientation_sequences_f_fp, temp)
+ shutil.copy(mixed_orientation_sequences_r_fp, temp)
+ mixed_orientation_sequences = Artifact.import_data(
+ 'MultiplexedPairedEndBarcodeInSequence', temp)
+
+ with self.assertRaisesRegex(ValueError,
+ 'Dual-indexed barcodes for mixed '
+ 'orientation reads are not supported.'):
+ obs_demuxed_art, obs_untrimmed_art = \
+ self.demux_paired_fn(mixed_orientation_sequences,
+ forward_barcodes=forward_barcodes,
+ reverse_barcodes=reverse_barcodes,
+ mixed_orientation=True)
+
class TestDemuxUtilsSingleEnd(TestPluginBase):
package = 'q2_cutadapt.tests'
@@ -360,7 +583,7 @@ class TestDemuxUtilsSingleEnd(TestPluginBase):
obs[11])
def test_rename_files_single(self):
- for fn in ['sample_a.fastq.gz', 'sample_b.fastq.gz']:
+ for fn in ['sample_a.1.fastq.gz', 'sample_b.1.fastq.gz']:
shutil.copy(self.fastq_fp,
str(self.per_sample_dir_fmt.path / pathlib.Path(fn)))
@@ -368,16 +591,19 @@ class TestDemuxUtilsSingleEnd(TestPluginBase):
self.barcode_series)
seqs = self.per_sample_dir_fmt.sequences.iter_views(FastqGzFormat)
+ counter = 0
for fn, (sample_id, barcode) in zip(seqs,
self.barcode_series.iteritems()):
self.assertTrue(sample_id in str(fn))
self.assertTrue(barcode in str(fn))
+ counter += 1
+ self.assertEqual(counter, 2)
def test_rename_files_extra_samples_in_barcode_map(self):
barcode_series = pd.Series(['A', 'G', 'C'],
index=['sample_a', 'sample_b', 'sample_c'])
- for fn in ['sample_a.fastq.gz', 'sample_b.fastq.gz']:
+ for fn in ['sample_a.1.fastq.gz', 'sample_b.1.fastq.gz']:
shutil.copy(self.fastq_fp,
str(self.per_sample_dir_fmt.path / pathlib.Path(fn)))
@@ -385,9 +611,12 @@ class TestDemuxUtilsSingleEnd(TestPluginBase):
barcode_series)
seqs = self.per_sample_dir_fmt.sequences.iter_views(FastqGzFormat)
+ counter = 0
for fn, (sample_id, barcode) in zip(seqs, barcode_series.iteritems()):
self.assertTrue(sample_id in str(fn))
self.assertTrue(barcode in str(fn))
+ counter += 1
+ self.assertEqual(counter, 2)
def test_write_empty_fastq_to_mux_barcode_in_seq_fmt(self):
_write_empty_fastq_to_mux_barcode_in_seq_fmt(self.untrimmed_dir_fmt)
@@ -471,9 +700,12 @@ class TestDemuxUtilsPairedEnd(TestPluginBase):
seqs = self.per_sample_dir_fmt.sequences.iter_views(FastqGzFormat)
exp = [('sample_a', 'A'), ('sample_a', 'A'),
('sample_b', 'G'), ('sample_b', 'G')]
+ counter = 0
for fn, (sample_id, barcode) in zip(seqs, exp):
self.assertTrue(sample_id in str(fn))
self.assertTrue(barcode in str(fn))
+ counter += 1
+ self.assertEqual(counter, 4)
def test_rename_files_extra_samples_in_barcode_map(self):
barcode_series = pd.Series(['A', 'G', 'C'],
@@ -490,9 +722,12 @@ class TestDemuxUtilsPairedEnd(TestPluginBase):
seqs = self.per_sample_dir_fmt.sequences.iter_views(FastqGzFormat)
exp = [('sample_a', 'A'), ('sample_a', 'A'),
('sample_b', 'G'), ('sample_b', 'G')]
+ counter = 0
for fn, (sample_id, barcode) in zip(seqs, exp):
self.assertTrue(sample_id in str(fn))
self.assertTrue(barcode in str(fn))
+ counter += 1
+ self.assertEqual(counter, 4)
def test_write_empty_fastq_to_mux_barcode_in_seq_fmt(self):
_write_empty_fastq_to_mux_barcode_in_seq_fmt(self.untrimmed_dir_fmt)
=====================================
q2_cutadapt/tests/test_trim.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
setup.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -31,6 +31,7 @@ setup(
'data/single-end/*',
'data/paired-end/*',
'data/paired-end-unordered/*',
+ 'data/mixed-orientation/*',
],
},
zip_safe=False,
View it on GitLab: https://salsa.debian.org/med-team/q2-cutadapt/-/compare/4a7d2a06004d319eabd45ef2ac451a2f48fa6802...34f17f14522275fdab535c7539c4754017903da4
--
View it on GitLab: https://salsa.debian.org/med-team/q2-cutadapt/-/compare/4a7d2a06004d319eabd45ef2ac451a2f48fa6802...34f17f14522275fdab535c7539c4754017903da4
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201201/04ae7699/attachment-0001.html>
More information about the debian-med-commit
mailing list