[med-svn] [Git][med-team/python-cutadapt][master] 4 commits: New upstream version 2.7

Steffen Möller gitlab at salsa.debian.org
Mon Nov 25 13:20:26 GMT 2019



Steffen Möller pushed to branch master at Debian Med / python-cutadapt


Commits:
31e5476e by Steffen Moeller at 2019-11-25T13:17:52Z
New upstream version 2.7
- - - - -
6fbf28c5 by Steffen Moeller at 2019-11-25T13:17:52Z
New upstream version

- - - - -
e739c20a by Steffen Moeller at 2019-11-25T13:17:53Z
Update upstream source from tag 'upstream/2.7'

Update to upstream version '2.7'
with Debian dir ce75f369dbf1aef9cc00c88e44f0e5bc85210a07
- - - - -
02a1c823 by Steffen Moeller at 2019-11-25T13:18:33Z
Upload to unstable

- - - - -


16 changed files:

- .travis.yml
- CHANGES.rst
- buildwheels.sh
- debian/changelog
- debian/control
- doc/conf.py
- doc/guide.rst
- doc/installation.rst
- setup.py
- src/cutadapt/__main__.py
- src/cutadapt/adapters.py
- src/cutadapt/filters.py
- src/cutadapt/parser.py
- src/cutadapt/pipeline.py
- src/cutadapt/utils.py
- tests/test_commandline.py


Changes:

=====================================
.travis.yml
=====================================
@@ -10,7 +10,7 @@ python:
   - "3.5"
   - "3.6"
   - "3.7"
-  - "3.8-dev"
+  - "3.8"
   - "nightly"
 
 install:


=====================================
CHANGES.rst
=====================================
@@ -2,6 +2,18 @@
 Changes
 =======
 
+v2.7 (2019-11-22)
+-----------------
+
+* :issue:`427`: Multicore is now supported even when using ``--info-file``,
+  ``--rest-file`` or ``--wildcard-file``. The only remaining feature that
+  still does not work with multicore is now demultiplexing.
+* :issue:`290`: When running on a single core, Cutadapt no longer spawns
+  external ``pigz`` processes for writing gzip-compressed files. This is a first
+  step towards ensuring that using ``--cores=n`` uses only at most *n* CPU
+  cores.
+* This release adds support for Python 3.8.
+
 v2.6 (2019-10-26)
 -----------------
 


=====================================
buildwheels.sh
=====================================
@@ -1,28 +1,28 @@
 #!/bin/bash
 #
-# Build manylinux1 wheels for cutadapt. Based on the example at
+# Build manylinux wheels. Based on the example at
 # <https://github.com/pypa/python-manylinux-demo>
 #
 # It is best to run this in a fresh clone of the repository!
 #
 # Run this within the repository root:
-#   docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/buildwheels.sh
+#   ./buildwheels.sh
 #
 # The wheels will be put into the wheelhouse/ subdirectory.
 #
 # For interactive tests:
-#   docker run -it -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /bin/bash
+#   docker run -it -v $(pwd):/io quay.io/pypa/manylinux2010_x86_64 /bin/bash
 
 set -xeuo pipefail
 
-MANYLINUX=quay.io/pypa/manylinux2010_x86_64
+manylinux=quay.io/pypa/manylinux2010_x86_64
 
 # For convenience, if this script is called from outside of a docker container,
 # it starts a container and runs itself inside of it.
 if ! grep -q docker /proc/1/cgroup; then
   # We are not inside a container
-  docker pull ${MANYLINUX}
-  exec docker run --rm -v $(pwd):/io ${MANYLINUX} /io/$0
+  docker pull ${manylinux}
+  exec docker run --rm -v $(pwd):/io ${manylinux} /io/$0
 fi
 
 # Strip binaries (copied from multibuild)
@@ -37,11 +37,12 @@ PYBINS="/opt/python/*/bin"
 HAS_CYTHON=0
 for PYBIN in ${PYBINS}; do
 #    ${PYBIN}/pip install -r /io/requirements.txt
-    ${PYBIN}/pip wheel /io/ -w wheelhouse/
+    ${PYBIN}/pip wheel --no-deps /io/ -w wheelhouse/
 done
+ls wheelhouse/
 
 # Bundle external shared libraries into the wheels
-for whl in wheelhouse/cutadapt-*.whl; do
+for whl in wheelhouse/*.whl; do
     auditwheel repair "$whl" --plat manylinux1_x86_64 -w repaired/
 done
 


=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+python-cutadapt (2.7-1) unstable; urgency=medium
+
+  * Team upload.
+  * New upstream version
+
+ -- Steffen Moeller <moeller at debian.org>  Mon, 25 Nov 2019 14:17:56 +0100
+
 python-cutadapt (2.6-1) unstable; urgency=medium
 
   * Team upload


=====================================
debian/control
=====================================
@@ -14,7 +14,7 @@ Build-Depends: debhelper-compat (= 12),
                python3-setuptools-scm,
                python3-pytest-timeout,
                python3-xopen (>= 0.5.0),
-               python3-dnaio (>= 0.4),
+               python3-dnaio,
                cython3
 Standards-Version: 4.4.1
 Vcs-Browser: https://salsa.debian.org/med-team/python-cutadapt


=====================================
doc/conf.py
=====================================
@@ -46,7 +46,7 @@ source_suffix = '.rst'
 master_doc = 'index'
 
 # General information about the project.
-project = u'cutadapt'
+project = u'Cutadapt'
 copyright = u'2010-2019, Marcel Martin'
 
 # The version info for the project you're documenting, acts as replacement for


=====================================
doc/guide.rst
=====================================
@@ -129,22 +129,9 @@ Make also sure that you have ``pigz`` (parallel gzip) installed if you use
 multiple cores and write to a ``.gz`` output file. Otherwise, compression of
 the output will be done in a single thread and therefore be a bottleneck.
 
-There are some limitations at the moment:
-
-* The following command-line arguments are not compatible with
-  multi-core:
-
-      - ``--info-file``
-      - ``--rest-file``
-      - ``--wildcard-file``
-      - ``--format``
-
-* Multi-core is not available when you use Cutadapt for demultiplexing.
-
-If you try to use multiple cores with an incompatible commandline option, you
-will get an error message.
-
-Some of these limitations will be lifted in the future, as time allows.
+Currently, multi-core support is not available when demultiplexing. You
+will get an error message if you try to use it. This limitations may be
+lifted in the future.
 
 .. versionadded:: 1.15
 
@@ -154,6 +141,9 @@ Some of these limitations will be lifted in the future, as time allows.
 .. versionadded:: 2.5
     Multicore works with ``--untrimmed/too-short/too-long-(paired)-output``
 
+.. versionadded:: 2.7
+    Muticore works with ``--info-file``, ``--rest-file``, ``--wildcard-file``
+
 
 Speed-up tricks
 ---------------
@@ -1037,7 +1027,7 @@ each read. Steps not requested on the command-line are skipped.
 Filtering reads
 ===============
 
-By default, all processed reads, no matter whether they were trimmed are not,
+By default, all processed reads, no matter whether they were trimmed or not,
 are written to the output file specified by the ``-o`` option (or to standard
 output if ``-o`` was not provided). For paired-end reads, the second read in a
 pair is always written to the file specified by the ``-p`` option.
@@ -1047,23 +1037,22 @@ them entirely or by redirecting them to other files. When redirecting reads,
 the basic rule is that *each read is written to at most one file*. You cannot
 write reads to more than one output file.
 
-In the following, the term "processed read" refers to a read to which all
-modifications have been applied (adapter removal, quality trimming etc.). A
-processed read can be identical to the input read if no modifications were done.
-
+Filters are applied to *all* processed reads, no matter whether they have been
+modified by adapter- or quality trimming.
 
 ``--minimum-length LENGTH`` or ``-m LENGTH``
-    Discard processed reads that are shorter than LENGTH. Reads that are too
-    short even before adapter removal are also discarded. Without this option,
-    reads that have a length of zero (empty reads) are kept in the output.
+    Discard processed reads that are shorter than LENGTH.
+
+    If you do not use this option, reads that have a length of zero (empty
+    reads) are kept in the output. Some downstream tools may have problems
+    with zero-length sequences. In that case, specify at least ``-m 1``.
 
 ``--too-short-output FILE``
     Instead of discarding the reads that are too short according to ``-m``,
     write them to *FILE* (in FASTA/FASTQ format).
 
 ``--maximum-length LENGTH`` or ``-M LENGTH``
-    Discard processed reads that are longer than LENGTH. Reads that are too
-    long even before adapter removal are also discarded.
+    Discard processed reads that are longer than LENGTH.
 
 ``--too-long-output FILE``
     Instead of discarding reads that are too long (according to ``-M``),
@@ -1074,11 +1063,11 @@ processed read can be identical to the input read if no modifications were done.
     of writing them to the regular output file.
 
 ``--discard-trimmed``
-   Discard reads in which an adapter was found.
+    Discard reads in which an adapter was found.
 
 ``--discard-untrimmed``
-   Discard reads in which *no* adapter was found. This has the same effect as
-   specifying ``--untrimmed-output /dev/null``.
+    Discard reads in which *no* adapter was found. This has the same effect as
+    specifying ``--untrimmed-output /dev/null``.
 
 The options ``--too-short-output`` and ``--too-long-output`` are applied first.
 This means, for example, that a read that is too long will never end up in the
@@ -1089,11 +1078,11 @@ The options ``--untrimmed-output``, ``--discard-trimmed`` and ``-discard-untrimm
 are mutually exclusive.
 
 The following filtering options do not have a corresponding option for redirecting
-reads. They always discard reads for which the filtering criterion applies.
+reads. They always discard those reads for which the filtering criterion applies.
 
 ``--max-n COUNT_or_FRACTION``
-    Discard reads with more than COUNT ``N`` bases. If ``COUNT_or_FRACTION`` is an
-    number between 0 and 1, it is interpreted as a fraction of the read length
+    Discard reads with more than COUNT ``N`` bases. If ``COUNT_or_FRACTION`` is
+    a number between 0 and 1, it is interpreted as a fraction of the read length
 
 ``--discard-casava``
     Discard reads that did not pass CASAVA filtering. Illumina’s CASAVA pipeline in


=====================================
doc/installation.rst
=====================================
@@ -38,6 +38,24 @@ Then install Cutadapt like this::
 If neither ``pip`` nor ``conda`` installation works, keep reading.
 
 
+Installation on a Debian-based Linux distribution
+-------------------------------------------------
+
+Cutadapt is also included in Debian-based Linux distributions, such as Ubuntu.
+Simply use your favorite package manager to install Cutadapt. On the
+command-line, this should work ::
+
+    sudo apt install cutadapt
+
+or possibly ::
+
+    sudo apt install python3-cutadapt
+
+Please be aware that this will likely give you an old version of Cutadapt. If
+you encounter unexpected behavior, please use one of the other installation
+methods to get an up-to-date version before reporting bugs.
+
+
 .. _dependencies:
 
 Dependencies


=====================================
setup.py
=====================================
@@ -103,11 +103,11 @@ setup(
     packages=find_packages('src'),
     entry_points={'console_scripts': ['cutadapt = cutadapt.__main__:main']},
     install_requires=[
-        'dnaio~=0.4.0',
+        'dnaio~=0.4.1',
         'xopen~=0.8.4',
     ],
     extras_require={
-        'dev': ['Cython', 'pytest', 'pytest-timeout', 'sphinx', 'sphinx_issues'],
+        'dev': ['Cython', 'pytest', 'pytest-timeout', 'pytest-mock', 'sphinx', 'sphinx_issues'],
     },
     python_requires='>=3.5',
     classifiers=[


=====================================
src/cutadapt/__main__.py
=====================================
@@ -60,7 +60,6 @@ import logging
 import platform
 from argparse import ArgumentParser, SUPPRESS, HelpFormatter
 
-from xopen import xopen
 import dnaio
 
 from cutadapt import __version__
@@ -72,7 +71,7 @@ from cutadapt.modifiers import (LengthTagModifier, SuffixRemover, PrefixSuffixAd
 from cutadapt.report import full_report, minimal_report
 from cutadapt.pipeline import (SingleEndPipeline, PairedEndPipeline, InputFiles, OutputFiles,
     SerialPipelineRunner, ParallelPipelineRunner)
-from cutadapt.utils import available_cpu_count, Progress, DummyProgress
+from cutadapt.utils import available_cpu_count, Progress, DummyProgress, FileOpener
 from cutadapt.log import setup_logging, REPORT
 
 logger = logging.getLogger()
@@ -185,7 +184,7 @@ def get_argument_parser():
             "mask: replace with 'N' characters; "
             "lowercase: convert to lowercase; "
             "none: leave unchanged (useful with "
-            "--discard-untrimmed). Default: trim")
+            "--discard-untrimmed). Default: %(default)s")
     group.add_argument("--no-trim", dest='action', action='store_const', const='none',
         help=SUPPRESS)  # Deprecated, use --action=none
     group.add_argument("--mask-adapter", dest='action', action='store_const', const='mask',
@@ -381,25 +380,24 @@ def parse_lengths(s):
     return tuple(values)
 
 
-def open_output_files(args, default_outfile, interleaved):
+def open_output_files(args, default_outfile, interleaved, file_opener):
     """
     Return an OutputFiles instance. If demultiplex is True, the untrimmed, untrimmed2, out and out2
     attributes are not opened files, but paths (out and out2 with the '{name}' template).
     """
-    compression_level = args.compression_level
 
     def open1(path):
         """Return opened file (or None if path is None)"""
         if path is None:
             return None
-        return xopen(path, "w", compresslevel=compression_level)
+        return file_opener.xopen(path, "wb")
 
     def open2(path1, path2):
         file1 = file2 = None
         if path1 is not None:
-            file1 = xopen(path1, 'wb', compresslevel=compression_level)
+            file1 = file_opener.xopen(path1, "wb")
             if path2 is not None:
-                file2 = xopen(path2, 'wb', compresslevel=compression_level)
+                file2 = file_opener.xopen(path2, "wb")
         return file1, file2
 
     rest_file = open1(args.rest_file)
@@ -599,7 +597,7 @@ def check_arguments(args, paired, is_interleaved_output):
         raise CommandLineError("--pair-adapters cannot be used with --times")
 
 
-def pipeline_from_parsed_args(args, paired, is_interleaved_output):
+def pipeline_from_parsed_args(args, paired, is_interleaved_output, file_opener):
     """
     Setup a processing pipeline from parsed command-line arguments.
 
@@ -632,9 +630,9 @@ def pipeline_from_parsed_args(args, paired, is_interleaved_output):
     # Create the processing pipeline
     if paired:
         pair_filter_mode = 'any' if args.pair_filter is None else args.pair_filter
-        pipeline = PairedEndPipeline(pair_filter_mode)
+        pipeline = PairedEndPipeline(pair_filter_mode, file_opener)
     else:
-        pipeline = SingleEndPipeline()
+        pipeline = SingleEndPipeline(file_opener)
 
     # When adapters are being trimmed only in R1 or R2, override the pair filter mode
     # as using the default of 'any' would regard all read pairs as untrimmed.
@@ -780,30 +778,30 @@ def main(cmdlineargs=None, default_outfile=sys.stdout.buffer):
 
     # Print the header now because some of the functions below create logging output
     log_header(cmdlineargs)
+    if args.cores < 0:
+        parser.error('Value for --cores cannot be negative')
+
+    cores = available_cpu_count() if args.cores == 0 else args.cores
+    file_opener = FileOpener(
+        compression_level=args.compression_level, threads=0 if cores == 1 else None)
     try:
         is_interleaved_input, is_interleaved_output = determine_interleaved(args)
         input_filename, input_paired_filename = input_files_from_parsed_args(args.inputs,
             paired, is_interleaved_input)
-        pipeline = pipeline_from_parsed_args(args, paired, is_interleaved_output)
-        outfiles = open_output_files(args, default_outfile, is_interleaved_output)
+        pipeline = pipeline_from_parsed_args(args, paired, is_interleaved_output, file_opener)
+        outfiles = open_output_files(args, default_outfile, is_interleaved_output, file_opener)
     except CommandLineError as e:
         parser.error(str(e))
         return  # avoid IDE warnings below
 
-    if args.cores < 0:
-        parser.error('Value for --cores cannot be negative')
-    cores = available_cpu_count() if args.cores == 0 else args.cores
     if cores > 1:
         if ParallelPipelineRunner.can_output_to(outfiles):
             runner_class = ParallelPipelineRunner
             runner_kwargs = dict(n_workers=cores, buffer_size=args.buffer_size)
         else:
-            parser.error('Running in parallel is currently not supported for '
-                'the given combination of command-line parameters.\nThese '
-                'options are not supported: --info-file, --rest-file, '
-                '--wildcard-file, --format\n'
-                'Also, demultiplexing is not supported.\n'
-                'Omit --cores/-j to continue.')
+            parser.error("Running in parallel is currently not supported "
+                "when using --format or when demultiplexing.\n"
+                "Omit --cores/-j to continue.")
             return  # avoid IDE warnings below
     else:
         runner_class = SerialPipelineRunner


=====================================
src/cutadapt/adapters.py
=====================================
@@ -73,6 +73,15 @@ class EndStatistics:
         self._remove = adapter.remove
         self.adjacent_bases = {'A': 0, 'C': 0, 'G': 0, 'T': 0, '': 0}
 
+    def __repr__(self):
+        errors = {k: dict(v) for k, v in self.errors.items()}
+        return "EndStatistics(where={}, max_error_rate={}, errors={}, adjacent_bases={})".format(
+            self.where,
+            self.max_error_rate,
+            errors,
+            self.adjacent_bases,
+        )
+
     def __iadd__(self, other):
         if not isinstance(other, self.__class__):
             raise ValueError("Cannot compare")
@@ -140,6 +149,14 @@ class AdapterStatistics:
         else:
             self.back = EndStatistics(adapter2)
 
+    def __repr__(self):
+        return "AdapterStatistics(name={}, where={}, front={}, back={})".format(
+            self.name,
+            self.where,
+            self.front,
+            self.back,
+        )
+
     def __iadd__(self, other):
         if self.where != other.where:  # TODO self.name != other.name or
             raise ValueError('incompatible objects')
@@ -297,7 +314,7 @@ class Adapter:
         self.name = _generate_adapter_name() if name is None else name
         self.sequence = sequence.upper().replace('U', 'T')
         if not self.sequence:
-            raise ValueError('Sequence is empty')
+            raise ValueError("Adapter sequence is empty")
         self.where = where
         if remove not in (None, 'prefix', 'suffix', 'auto'):
             raise ValueError('remove parameter must be "prefix", "suffix", "auto" or None')


=====================================
src/cutadapt/filters.py
=====================================
@@ -14,11 +14,6 @@ The read is then assumed to have been "consumed", that is, either written
 somewhere or filtered (should be discarded).
 """
 from abc import ABC, abstractmethod
-import errno
-
-import dnaio
-
-from .utils import raise_open_files_limit
 
 
 # Constants used when returning from a Filter’s __call__ method to improve
@@ -230,29 +225,13 @@ class CasavaFilter(SingleEndFilter):
         return right[1:4] == ':Y:'  # discard if :Y: found
 
 
-def _open_raise_limit(path, qualities):
-    """
-    Open a FASTA/FASTQ file for writing. If it fails because the number of open files
-    would be exceeded, try to raise the soft limit and re-try.
-    """
-    try:
-        f = dnaio.open(path, mode="w", qualities=qualities)
-    except OSError as e:
-        if e.errno == errno.EMFILE:  # Too many open files
-            raise_open_files_limit(8)
-            f = dnaio.open(path, mode="w", qualities=qualities)
-        else:
-            raise
-    return f
-
-
 class Demultiplexer(SingleEndFilter):
     """
     Demultiplex trimmed reads. Reads are written to different output files
     depending on which adapter matches. Files are created when the first read
     is written to them.
     """
-    def __init__(self, path_template, untrimmed_path, qualities):
+    def __init__(self, path_template, untrimmed_path, qualities, file_opener):
         """
         path_template must contain the string '{name}', which will be replaced
         with the name of the adapter to form the final output path.
@@ -267,6 +246,7 @@ class Demultiplexer(SingleEndFilter):
         self.written = 0
         self.written_bp = [0, 0]
         self.qualities = qualities
+        self.file_opener = file_opener
 
     def __call__(self, read, matches):
         """
@@ -275,14 +255,14 @@ class Demultiplexer(SingleEndFilter):
         if matches:
             name = matches[-1].adapter.name
             if name not in self.writers:
-                self.writers[name] = _open_raise_limit(
+                self.writers[name] = self.file_opener.dnaio_open_raise_limit(
                     self.template.replace('{name}', name), self.qualities)
             self.written += 1
             self.written_bp[0] += len(read)
             self.writers[name].write(read)
         else:
             if self.untrimmed_writer is None and self.untrimmed_path is not None:
-                self.untrimmed_writer = _open_raise_limit(
+                self.untrimmed_writer = self.file_opener.dnaio_open_raise_limit(
                     self.untrimmed_path, self.qualities)
             if self.untrimmed_writer is not None:
                 self.written += 1
@@ -303,16 +283,16 @@ class PairedDemultiplexer(PairedEndFilter):
     depending on which adapter (in read 1) matches.
     """
     def __init__(self, path_template, path_paired_template, untrimmed_path, untrimmed_paired_path,
-            qualities):
+            qualities, file_opener):
         """
         The path templates must contain the string '{name}', which will be replaced
         with the name of the adapter to form the final output path.
         Read pairs without an adapter match are written to the files named by
         untrimmed_path.
         """
-        self._demultiplexer1 = Demultiplexer(path_template, untrimmed_path, qualities)
+        self._demultiplexer1 = Demultiplexer(path_template, untrimmed_path, qualities, file_opener)
         self._demultiplexer2 = Demultiplexer(path_paired_template, untrimmed_paired_path,
-            qualities)
+            qualities, file_opener)
 
     @property
     def written(self):
@@ -337,7 +317,7 @@ class CombinatorialDemultiplexer(PairedEndFilter):
     Demultiplex reads depending on which adapter matches, taking into account both matches
     on R1 and R2.
     """
-    def __init__(self, path_template, path_paired_template, untrimmed_name, qualities):
+    def __init__(self, path_template, path_paired_template, untrimmed_name, qualities, file_opener):
         """
         path_template must contain the string '{name1}' and '{name2}', which will be replaced
         with the name of the adapters found on R1 and R2, respectively to form the final output
@@ -355,6 +335,7 @@ class CombinatorialDemultiplexer(PairedEndFilter):
         self.written = 0
         self.written_bp = [0, 0]
         self.qualities = qualities
+        self.file_opener = file_opener
 
     @staticmethod
     def _make_path(template, name1, name2):
@@ -379,8 +360,8 @@ class CombinatorialDemultiplexer(PairedEndFilter):
             path1 = self._make_path(self.template, name1, name2)
             path2 = self._make_path(self.paired_template, name1, name2)
             self.writers[key] = (
-                _open_raise_limit(path1, qualities=self.qualities),
-                _open_raise_limit(path2, qualities=self.qualities),
+                self.file_opener.dnaio_open_raise_limit(path1, qualities=self.qualities),
+                self.file_opener.dnaio_open_raise_limit(path2, qualities=self.qualities),
             )
         writer1, writer2 = self.writers[key]
         self.written += 1


=====================================
src/cutadapt/parser.py
=====================================
@@ -3,6 +3,7 @@ Parse adapter specifications
 """
 import re
 import logging
+from xopen import xopen
 from dnaio.readers import FastaReader
 from .adapters import Where, WHERE_TO_REMOVE_MAP, Adapter, BackOrFrontAdapter, LinkedAdapter
 
@@ -397,7 +398,8 @@ class AdapterParser:
         """
         if spec.startswith('file:'):
             # read adapter sequences from a file
-            with FastaReader(spec[5:]) as fasta:
+            with xopen(spec[5:], mode="rb", threads=0) as f:
+                fasta = FastaReader(f)
                 for record in fasta:
                     name = record.name.split(None, 1)
                     name = name[0] if name else None


=====================================
src/cutadapt/pipeline.py
=====================================
@@ -6,13 +6,14 @@ import logging
 import functools
 from abc import ABC, abstractmethod
 from multiprocessing import Process, Pipe, Queue
+from pathlib import Path
 import multiprocessing.connection
 import traceback
 
 from xopen import xopen
 import dnaio
 
-from .utils import Progress
+from .utils import Progress, FileOpener
 from .modifiers import PairedModifier
 from .report import Statistics
 from .filters import (Redirector, PairedRedirector, NoFilter, PairedNoFilter, InfoFileWriter,
@@ -93,13 +94,14 @@ class Pipeline(ABC):
     """
     n_adapters = 0
 
-    def __init__(self):
+    def __init__(self, file_opener: FileOpener):
         self._close_files = []
         self._reader = None
         self._filters = None
         self._modifiers = []
         self._outfiles = None
         self._demultiplexer = None
+        self._textiowrappers = []
 
         # Filter settings
         self._minimum_length = None
@@ -108,6 +110,7 @@ class Pipeline(ABC):
         self.discard_casava = False
         self.discard_trimmed = False
         self.discard_untrimmed = False
+        self.file_opener = file_opener
 
     def connect_io(self, infiles: InputFiles, outfiles: OutputFiles):
         self._reader = dnaio.open(infiles.file1, file2=infiles.file2,
@@ -119,6 +122,10 @@ class Pipeline(ABC):
         # for all outputs
         if force_fasta:
             kwargs['fileformat'] = 'fasta'
+        # file and file2 must already be file-like objects because we don’t want to
+        # take care of threads and compression levels here.
+        for f in (file, file2):
+            assert not (f is None and isinstance(f, (str, bytes, Path)))
         return dnaio.open(file, file2=file2, mode='w', qualities=self.uses_qualities,
             **kwargs)
 
@@ -133,7 +140,9 @@ class Pipeline(ABC):
             (WildcardFileWriter, outfiles.wildcard),
         ):
             if outfile:
-                self._filters.append(filter_wrapper(None, filter_class(outfile), None))
+                textiowrapper = io.TextIOWrapper(outfile)
+                self._textiowrappers.append(textiowrapper)
+                self._filters.append(filter_wrapper(None, filter_class(textiowrapper), None))
 
         # minimum length and maximum length
         for lengths, file1, file2, filter_class in (
@@ -184,7 +193,16 @@ class Pipeline(ABC):
                     untrimmed_filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter(), DiscardUntrimmedFilter()))
             self._filters.append(self._final_filter(outfiles))
 
+    def flush(self):
+        for f in self._textiowrappers:
+            f.flush()
+        for f in self._outfiles:
+            if f is not None:
+                f.flush()
+
     def close(self):
+        for f in self._textiowrappers:
+            f.close()  # This also closes the underlying files; a second close occurs below
         for f in self._outfiles:
             # TODO do not use hasattr
             if f is not None and f is not sys.stdin and f is not sys.stdout and hasattr(f, 'close'):
@@ -223,8 +241,8 @@ class SingleEndPipeline(Pipeline):
     """
     paired = False
 
-    def __init__(self):
-        super().__init__()
+    def __init__(self, file_opener: FileOpener):
+        super().__init__(file_opener)
         self._modifiers = []
 
     def add(self, modifier):
@@ -261,7 +279,8 @@ class SingleEndPipeline(Pipeline):
         return NoFilter(writer)
 
     def _create_demultiplexer(self, outfiles):
-        return Demultiplexer(outfiles.out, outfiles.untrimmed, qualities=self.uses_qualities)
+        return Demultiplexer(outfiles.out, outfiles.untrimmed, qualities=self.uses_qualities,
+            file_opener=self.file_opener)
 
     @property
     def minimum_length(self):
@@ -288,8 +307,8 @@ class PairedEndPipeline(Pipeline):
     """
     paired = True
 
-    def __init__(self, pair_filter_mode):
-        super().__init__()
+    def __init__(self, pair_filter_mode, file_opener: FileOpener):
+        super().__init__(file_opener)
         self._pair_filter_mode = pair_filter_mode
         self._reader = None
         # Whether to ignore pair_filter mode for discard-untrimmed filter
@@ -359,10 +378,11 @@ class PairedEndPipeline(Pipeline):
     def _create_demultiplexer(self, outfiles):
         if '{name1}' in outfiles.out and '{name2}' in outfiles.out:
             return CombinatorialDemultiplexer(outfiles.out, outfiles.out2,
-                outfiles.untrimmed, qualities=self.uses_qualities)
+                outfiles.untrimmed, qualities=self.uses_qualities, file_opener=self.file_opener)
         else:
             return PairedDemultiplexer(outfiles.out, outfiles.out2,
-            outfiles.untrimmed, outfiles.untrimmed2, qualities=self.uses_qualities)
+                outfiles.untrimmed, outfiles.untrimmed2, qualities=self.uses_qualities,
+                file_opener=self.file_opener)
 
     @property
     def minimum_length(self):
@@ -472,6 +492,7 @@ class WorkerProcess(Process):
                 outfiles = self._make_output_files()
                 self._pipeline.connect_io(infiles, outfiles)
                 (n, bp1, bp2) = self._pipeline.process_reads()
+                self._pipeline.flush()
                 cur_stats = Statistics().collect(n, bp1, bp2, [], self._pipeline._filters)
                 stats += cur_stats
                 self._send_outfiles(outfiles, chunk_index, n)
@@ -502,7 +523,6 @@ class WorkerProcess(Process):
         that has BytesIO instances for each non-None output file
         """
         output_files = copy.copy(self._orig_outfiles)
-        # TODO info, rest, wildcard need to be StringIO()
         for attr in (
             "out", "out2", "untrimmed", "untrimmed2", "too_short", "too_short2", "too_long",
             "too_long2", "info", "rest", "wildcard"
@@ -632,13 +652,7 @@ class ParallelPipelineRunner(PipelineRunner):
 
     @staticmethod
     def can_output_to(outfiles):
-        return (
-            outfiles.out is not None
-            and outfiles.rest is None
-            and outfiles.info is None
-            and outfiles.wildcard is None
-            and not outfiles.demultiplex
-        )
+        return outfiles.out is not None and not outfiles.demultiplex
 
     def _assign_output(self, outfiles):
         if not self.can_output_to(outfiles):
@@ -682,8 +696,7 @@ class ParallelPipelineRunner(PipelineRunner):
                         # this happens only when there is an exception sending
                         # the statistics)
                         e, tb_str = connection.recv()
-                        # TODO traceback should only be printed in development
-                        logger.debug('%s', tb_str)
+                        logger.error('%s', tb_str)
                         raise e
                     if stats is None:
                         stats = cur_stats
@@ -695,17 +708,16 @@ class ParallelPipelineRunner(PipelineRunner):
                     # An exception has occurred in the worker
                     e, tb_str = connection.recv()
 
-                    # TODO traceback should only be printed in development
                     # We should use the worker's actual traceback object
                     # here, but traceback objects are not picklable.
-                    logger.debug('%s', tb_str)
+                    logger.error('%s', tb_str)
                     raise e
 
                 # No. of reads processed in this chunk
                 chunk_n = connection.recv()
                 if chunk_n == -2:
                     e, tb_str = connection.recv()
-                    logger.debug('%s', tb_str)
+                    logger.error('%s', tb_str)
                     raise e
                 n += chunk_n
                 self._progress.update(n)


=====================================
src/cutadapt/utils.py
=====================================
@@ -1,8 +1,10 @@
 import re
 import sys
 import time
+import errno
 import multiprocessing
 
+from xopen import xopen
 import dnaio
 
 
@@ -142,3 +144,31 @@ def reverse_complemented_sequence(sequence: dnaio.Sequence):
     else:
         qualities = sequence.qualities[::-1]
     return dnaio.Sequence(sequence.name, reverse_complement(sequence.sequence), qualities)
+
+
+class FileOpener:
+    def __init__(self, compression_level: int = 6, threads: int = None):
+        self.compression_level = compression_level
+        self.threads = threads
+
+    def xopen(self, path, mode):
+        return xopen(path, mode, compresslevel=self.compression_level, threads=self.threads)
+
+    def dnaio_open(self, *args, **kwargs):
+        kwargs["opener"] = self.xopen
+        return dnaio.open(*args, **kwargs)
+
+    def dnaio_open_raise_limit(self, path, qualities):
+        """
+        Open a FASTA/FASTQ file for writing. If it fails because the number of open files
+        would be exceeded, try to raise the soft limit and re-try.
+        """
+        try:
+            f = self.dnaio_open(path, mode="w", qualities=qualities)
+        except OSError as e:
+            if e.errno == errno.EMFILE:  # Too many open files
+                raise_open_files_limit(8)
+                f = self.dnaio_open(path, mode="w", qualities=qualities)
+            else:
+                raise
+        return f


=====================================
tests/test_commandline.py
=====================================
@@ -46,10 +46,10 @@ def test_lowercase(run):
     run('-a ttagacatatctccgtcg', 'lowercase.fastq', 'small.fastq')
 
 
-def test_rest(run, tmpdir):
+def test_rest(run, tmpdir, cores):
     """-r/--rest-file"""
     rest = str(tmpdir.join("rest.tmp"))
-    run(['-b', 'ADAPTER', '-N', '-r', rest], "rest.fa", "rest.fa")
+    run(['--cores', str(cores), '-b', 'ADAPTER', '-N', '-r', rest], "rest.fa", "rest.fa")
     assert_files_equal(datapath('rest.txt'), rest)
 
 
@@ -228,11 +228,14 @@ def test_read_wildcard(run):
     ("-a", "wildcard_adapter.fa"),
     ("-b", "wildcard_adapter_anywhere.fa"),
 ])
-def test_adapter_wildcard(adapter_type, expected, run, tmpdir):
+def test_adapter_wildcard(adapter_type, expected, run, tmpdir, cores):
     """wildcards in adapter"""
     wildcard_path = str(tmpdir.join("wildcards.txt"))
-    run("--wildcard-file {} {} ACGTNNNACGT".format(wildcard_path, adapter_type),
-        expected, "wildcard_adapter.fa")
+    run([
+            "--cores", str(cores),
+            "--wildcard-file", wildcard_path,
+            adapter_type, "ACGTNNNACGT"
+        ],  expected, "wildcard_adapter.fa")
     with open(wildcard_path) as wct:
         lines = wct.readlines()
     lines = [line.strip() for line in lines]
@@ -309,26 +312,26 @@ def test_strip_suffix(run):
     run("--strip-suffix _sequence -a XXXXXXX", "stripped.fasta", "simple.fasta")
 
 
-def test_info_file(run, tmpdir):
+def test_info_file(run, tmpdir, cores):
     # The true adapter sequence in the illumina.fastq.gz data set is
     # GCCTAACTTCTTAGACTGCCTTAAGGACGT (fourth base is different from the sequence shown here)
     info_path = str(tmpdir.join("info.txt"))
-    run(["--info-file", info_path, "-a", "adapt=GCCGAACTTCTTAGACTGCCTTAAGGACGT"],
+    run(["--cores", str(cores), "--info-file", info_path, "-a", "adapt=GCCGAACTTCTTAGACTGCCTTAAGGACGT"],
         "illumina.fastq", "illumina.fastq.gz")
     assert_files_equal(cutpath("illumina.info.txt"), info_path)
 
 
-def test_info_file_times(run, tmpdir):
+def test_info_file_times(run, tmpdir, cores):
     info_path = str(tmpdir.join("info.txt"))
-    run(["--info-file", info_path, "--times", "2", "-a", "adapt=GCCGAACTTCTTA",
+    run(["--cores", str(cores), "--info-file", info_path, "--times", "2", "-a", "adapt=GCCGAACTTCTTA",
         "-a", "adapt2=GACTGCCTTAAGGACGT"], "illumina5.fastq", "illumina5.fastq")
     assert_files_equal(cutpath('illumina5.info.txt'), info_path)
 
 
-def test_info_file_fasta(run, tmpdir):
+def test_info_file_fasta(run, tmpdir, cores):
     info_path = str(tmpdir.join("info.txt"))
     # Just make sure that it runs
-    run(["--info-file", info_path, "-a", "TTAGACATAT", "-g", "GAGATTGCCA", "--no-indels"],
+    run(["--cores", str(cores), "--info-file", info_path, "-a", "TTAGACATAT", "-g", "GAGATTGCCA", "--no-indels"],
         "no_indels.fasta", "no_indels.fasta")
 
 



View it on GitLab: https://salsa.debian.org/med-team/python-cutadapt/compare/0a23355224830bf21673855945f517581262f7b8...02a1c823fa85aa7d1c49d01e4545933922e84c55

-- 
View it on GitLab: https://salsa.debian.org/med-team/python-cutadapt/compare/0a23355224830bf21673855945f517581262f7b8...02a1c823fa85aa7d1c49d01e4545933922e84c55
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20191125/b3db812c/attachment-0001.html>


More information about the debian-med-commit mailing list