[med-svn] [Git][med-team/python-dnaio][upstream] New upstream version 0.4.1

Sun Nov 17 15:24:19 GMT 2019


Andreas Tille pushed to branch upstream at Debian Med / python-dnaio


Commits:
6f93bd69 by Andreas Tille at 2019-11-17T15:18:27Z
New upstream version 0.4.1
- - - - -


9 changed files:

- .travis.yml
- setup.py
- src/dnaio/__init__.py
- src/dnaio/chunks.py
- src/dnaio/readers.py
- src/dnaio/writers.py
- tests/test_internal.py
- tests/test_open.py
- tox.ini


Changes:

=====================================
.travis.yml
=====================================
@@ -11,6 +11,7 @@ python:
   - "3.5"
   - "3.6"
   - "3.7"
+  - "3.8"
   - "nightly"
 
 install:
@@ -45,5 +46,10 @@ jobs:
           ls -l dist/
           python3 -m twine upload dist/*
 
+    - name: flake8
+      python: "3.6"
+      install: python3 -m pip install flake8
+      script: flake8 src/ tests/
+
   allow_failures:
     - python: "nightly"


=====================================
setup.py
=====================================
@@ -77,7 +77,7 @@ setup(
     install_requires=['xopen>=0.8.2'],
     python_requires='>=3.4',
     classifiers=[
-            "Development Status :: 3 - Alpha",
+            "Development Status :: 4 - Beta",
             "Intended Audience :: Science/Research",
             "License :: OSI Approved :: MIT License",
             "Natural Language :: English",


=====================================
src/dnaio/__init__.py
=====================================
@@ -16,6 +16,9 @@ __all__ = [
     'InterleavedSequenceReader',
     'InterleavedSequenceWriter',
     'PairedSequenceReader',
+    'read_chunks',
+    'read_paired_chunks',
+    '__version__',
 ]
 
 import os
@@ -45,7 +48,9 @@ except ImportError:
         return path
 
 
-def open(file1, *, file2=None, fileformat=None, interleaved=False, mode='r', qualities=None):
+def open(
+    file1, *, file2=None, fileformat=None, interleaved=False, mode="r", qualities=None, opener=xopen
+):
     """
     Open sequence files in FASTA or FASTQ format for reading or writing. This is
     a factory that returns an instance of one of the ...Reader or ...Writer
@@ -71,32 +76,37 @@ def open(file1, *, file2=None, fileformat=None, interleaved=False, mode='r', qua
           appropriately.
         * When False (no qualities available), an exception is raised when the
           auto-detected output format is FASTQ.
+
+    opener -- A function that is used to open file1 and file2 if they are not
+        already open file-like objects. By default, xopen is used, which can
+        also open compressed file formats.
     """
     if mode not in ("r", "w", "a"):
         raise ValueError("Mode must be 'r', 'w' or 'a'")
     if interleaved and file2 is not None:
         raise ValueError("When interleaved is set, file2 must be None")
+
     if file2 is not None:
         if mode in "wa" and file1 == file2:
             raise ValueError("The paired-end output files are identical")
         if mode == "r":
-            return PairedSequenceReader(file1, file2, fileformat)
+            return PairedSequenceReader(file1, file2, fileformat, opener=opener)
         elif mode == "w":
-            return PairedSequenceWriter(file1, file2, fileformat, qualities)
+            return PairedSequenceWriter(file1, file2, fileformat, qualities, opener=opener)
         else:
-            return PairedSequenceAppender(file1, file2, fileformat, qualities)
+            return PairedSequenceAppender(file1, file2, fileformat, qualities, opener=opener)
     if interleaved:
         if mode == "r":
-            return InterleavedSequenceReader(file1, fileformat)
+            return InterleavedSequenceReader(file1, fileformat, opener=opener)
         elif mode == "w":
-            return InterleavedSequenceWriter(file1, fileformat, qualities)
+            return InterleavedSequenceWriter(file1, fileformat, qualities, opener=opener)
         else:
-            return InterleavedSequenceAppender(file1, fileformat, qualities)
+            return InterleavedSequenceAppender(file1, fileformat, qualities, opener=opener)
 
     # The multi-file options have been dealt with, delegate rest to the
     # single-file function.
     return _open_single(
-        file1, fileformat=fileformat, mode=mode, qualities=qualities)
+        file1, opener=opener, fileformat=fileformat, mode=mode, qualities=qualities)
 
 
 def _detect_format_from_name(name):
@@ -118,16 +128,16 @@ def _detect_format_from_name(name):
     return None
 
 
-def _open_single(file, *, fileformat=None, mode='r', qualities=None):
+def _open_single(file, opener, *, fileformat=None, mode="r", qualities=None):
     """
     Open a single sequence file. See description of open() above.
     """
     if mode not in ("r", "w", "a"):
         raise ValueError("Mode must be 'r', 'w' or 'a'")
 
-    if isinstance(file, (str, pathlib.Path)):
+    if isinstance(file, (str, pathlib.Path)):  # TODO Use os.PathLike in Python 3.6+
         path = fspath(file)
-        file = xopen(path, mode + 'b')
+        file = opener(path, mode + "b")
         close_file = True
     else:
         if mode == 'r' and not hasattr(file, 'readinto'):
@@ -164,30 +174,18 @@ def _open_single(file, *, fileformat=None, mode='r', qualities=None):
         fileformat = 'fastq' if qualities else 'fasta'
 
     if mode == 'r' and fileformat is None:
-        # No format detected so far. Try to read from the file.
-        if file.seekable():
-            first_char = file.read(1)
-            file.seek(-1, 1)
-        else:
-            first_char = file.peek(1)[0:1]
-        formats = {
-            b'@': 'fastq',
-            b'>': 'fasta',
-            b'#': 'fasta',  # Some FASTA variants allow comments
-            b'': 'fastq',  # Pretend FASTQ for empty input
-        }
-        try:
-            fileformat = formats[first_char]
-        except KeyError:
+        fileformat = _detect_format_from_content(file)
+        if fileformat is None:
             raise UnknownFileFormat(
                 'Could not determine whether file {!r} is FASTA or FASTQ. The file extension was '
-                'not available or not recognized and the first character in the file ({!r}) is '
-                'unexpected.'.format(file, first_char))
+                'not available or not recognized and the first character in the file is '
+                'unexpected.'.format(file))
 
     if fileformat is None:
         assert mode == 'w'
         extra = " because the output file name is not available" if path is None else ""
-        raise UnknownFileFormat("Auto-detection of the output file format (FASTA/FASTQ) failed" + extra)
+        raise UnknownFileFormat(
+            "Auto-detection of the output file format (FASTA/FASTQ) failed" + extra)
 
     if fileformat == 'fastq' and mode in "wa" and qualities is False:
         raise ValueError(
@@ -196,6 +194,24 @@ def _open_single(file, *, fileformat=None, mode='r', qualities=None):
     return handlers[fileformat](file)
 
 
+def _detect_format_from_content(file):
+    """
+    Return 'fasta', 'fastq' or None
+    """
+    if file.seekable():
+        first_char = file.read(1)
+        file.seek(-1, 1)
+    else:
+        first_char = file.peek(1)[0:1]
+    formats = {
+        b'@': 'fastq',
+        b'>': 'fasta',
+        b'#': 'fasta',  # Some FASTA variants allow comments
+        b'': 'fastq',  # Pretend FASTQ for empty input
+    }
+    return formats.get(first_char, None)
+
+
 def _sequence_names_match(r1, r2):
     """
     Check whether the sequence records r1 and r2 have identical names, ignoring a
@@ -220,10 +236,10 @@ class PairedSequenceReader:
     """
     paired = True
 
-    def __init__(self, file1, file2, fileformat=None):
+    def __init__(self, file1, file2, fileformat=None, opener=xopen):
         with ExitStack() as stack:
-            self.reader1 = stack.enter_context(_open_single(file1, fileformat=fileformat))
-            self.reader2 = stack.enter_context(_open_single(file2, fileformat=fileformat))
+            self.reader1 = stack.enter_context(_open_single(file1, opener=opener, fileformat=fileformat))
+            self.reader2 = stack.enter_context(_open_single(file2, opener=opener, fileformat=fileformat))
             self._close = stack.pop_all().close
         self.delivers_qualities = self.reader1.delivers_qualities
 
@@ -240,7 +256,8 @@ class PairedSequenceReader:
                 # End of file 1. Make sure that file 2 is also at end.
                 try:
                     next(it2)
-                    raise FileFormatError("Reads are improperly paired. There are more reads in "
+                    raise FileFormatError(
+                        "Reads are improperly paired. There are more reads in "
                         "file 2 than in file 1.", line=None) from None
                 except StopIteration:
                     pass
@@ -248,10 +265,12 @@ class PairedSequenceReader:
             try:
                 r2 = next(it2)
             except StopIteration:
-                raise FileFormatError("Reads are improperly paired. There are more reads in "
+                raise FileFormatError(
+                    "Reads are improperly paired. There are more reads in "
                     "file 1 than in file 2.", line=None) from None
             if not _sequence_names_match(r1, r2):
-                raise FileFormatError("Reads are improperly paired. Read name '{}' "
+                raise FileFormatError(
+                    "Reads are improperly paired. Read name '{}' "
                     "in file 1 does not match '{}' in file 2.".format(r1.name, r2.name), line=None) from None
             yield (r1, r2)
 
@@ -271,8 +290,8 @@ class InterleavedSequenceReader:
     """
     paired = True
 
-    def __init__(self, file, fileformat=None):
-        self.reader = _open_single(file, fileformat=fileformat)
+    def __init__(self, file, fileformat=None, opener=xopen):
+        self.reader = _open_single(file, opener=opener, fileformat=fileformat)
         self.delivers_qualities = self.reader.delivers_qualities
 
     def __iter__(self):
@@ -281,10 +300,12 @@ class InterleavedSequenceReader:
             try:
                 r2 = next(it)
             except StopIteration:
-                raise FileFormatError("Interleaved input file incomplete: Last record "
+                raise FileFormatError(
+                    "Interleaved input file incomplete: Last record "
                     "{!r} has no partner.".format(r1.name), line=None) from None
             if not _sequence_names_match(r1, r2):
-                raise FileFormatError("Reads are improperly paired. Name {!r} "
+                raise FileFormatError(
+                    "Reads are improperly paired. Name {!r} "
                     "(first) does not match {!r} (second).".format(r1.name, r2.name), line=None)
             yield (r1, r2)
 
@@ -301,12 +322,14 @@ class InterleavedSequenceReader:
 class PairedSequenceWriter:
     _mode = "w"
 
-    def __init__(self, file1, file2, fileformat='fastq', qualities=None):
+    def __init__(self, file1, file2, fileformat='fastq', qualities=None, opener=xopen):
         with ExitStack() as stack:
-            self._writer1 = stack.enter_context(_open_single(file1, fileformat=fileformat, mode=self._mode,
-                qualities=qualities))
-            self._writer2 = stack.enter_context(_open_single(file2, fileformat=fileformat, mode=self._mode,
-                qualities=qualities))
+            self._writer1 = stack.enter_context(
+                _open_single(
+                    file1, opener=opener, fileformat=fileformat, mode=self._mode, qualities=qualities))
+            self._writer2 = stack.enter_context(
+                _open_single(
+                    file2, opener=opener, fileformat=fileformat, mode=self._mode, qualities=qualities))
             self._close = stack.pop_all().close
 
     def write(self, read1, read2):
@@ -334,10 +357,10 @@ class InterleavedSequenceWriter:
     """
     _mode = "w"
 
-    def __init__(self, file, fileformat='fastq', qualities=None):
+    def __init__(self, file, fileformat='fastq', qualities=None, opener=xopen):
 
         self._writer = _open_single(
-            file, fileformat=fileformat, mode=self._mode, qualities=qualities)
+            file, opener=opener, fileformat=fileformat, mode=self._mode, qualities=qualities)
 
     def write(self, read1, read2):
         self._writer.write(read1)


=====================================
src/dnaio/chunks.py
=====================================
@@ -103,7 +103,8 @@ def read_paired_chunks(f, f2, buffer_size=4*1024**2):
     start1 = f.readinto(memoryview(buf1)[0:1])
     start2 = f2.readinto(memoryview(buf2)[0:1])
     if (start1 == 1 and buf1[0:1] != b'@') or (start2 == 1 and buf2[0:1] != b'@'):
-        raise FileFormatError('Paired-end data must be in FASTQ format when using multiple cores', line=None)
+        raise FileFormatError(
+            "Paired-end data must be in FASTQ format when using multiple cores", line=None)
 
     while True:
         if start1 == len(buf1) or start2 == len(buf2):


=====================================
src/dnaio/readers.py
=====================================
@@ -18,13 +18,13 @@ class BinaryFileReader:
     paired = False
     mode = 'rb'
 
-    def __init__(self, file, _close_file=None):
+    def __init__(self, file, opener=xopen, _close_file=None):
         """
         The file is a path or a file-like object. In both cases, the file may
         be compressed (.gz, .bz2, .xz).
         """
         if isinstance(file, str):
-            file = xopen(file, self.mode)
+            file = opener(file, self.mode)
             self._close_on_exit = True
         elif _close_file:
             self._close_on_exit = True
@@ -49,14 +49,14 @@ class FastaReader(BinaryFileReader):
     Reader for FASTA files.
     """
 
-    def __init__(self, file, keep_linebreaks=False, sequence_class=Sequence, _close_file=None):
+    def __init__(self, file, keep_linebreaks=False, sequence_class=Sequence, opener=xopen, _close_file=None):
         """
         file is a path or a file-like object. In both cases, the file may
         be compressed (.gz, .bz2, .xz).
 
         keep_linebreaks -- whether to keep newline characters in the sequence
         """
-        super().__init__(file, _close_file=_close_file)
+        super().__init__(file, opener=opener, _close_file=_close_file)
         self.sequence_class = sequence_class
         self.delivers_qualities = False
         self._delimiter = '\n' if keep_linebreaks else ''
@@ -83,8 +83,9 @@ class FastaReader(BinaryFileReader):
             elif name is not None:
                 seq.append(line)
             else:
-                raise FastaFormatError("Expected '>' at beginning of "
-                    "record, but got {!r}.".format(_shorten(line)), line=i)
+                raise FastaFormatError(
+                    "Expected '>' at beginning of record, but got {!r}."
+                    .format(_shorten(line)), line=i)
 
         if name is not None:
             yield self.sequence_class(name, self._delimiter.join(seq), None)
@@ -97,12 +98,12 @@ class FastqReader(BinaryFileReader):
     Reader for FASTQ files. Does not support multi-line FASTQ files.
     """
 
-    def __init__(self, file, sequence_class=Sequence, buffer_size=1048576, _close_file=None):
+    def __init__(self, file, sequence_class=Sequence, buffer_size=1048576, opener=xopen, _close_file=None):
         """
         file is a filename or a file-like object.
         If file is a filename, then .gz files are supported.
         """
-        super().__init__(file, _close_file=_close_file)
+        super().__init__(file, opener=opener, _close_file=_close_file)
         self.sequence_class = sequence_class
         self.delivers_qualities = True
         self.buffer_size = buffer_size


=====================================
src/dnaio/writers.py
=====================================
@@ -2,10 +2,10 @@ from xopen import xopen
 
 
 class FileWriter:
-    def __init__(self, file, _close_file=None):
+    def __init__(self, file, opener=xopen, _close_file=None):
         self._file = file
         if isinstance(file, str):
-            self._file = xopen(file, 'wb')
+            self._file = opener(file, "wb")
             self._close_on_exit = True
         else:
             self._close_on_exit = bool(_close_file)
@@ -28,12 +28,12 @@ class FastaWriter(FileWriter):
     Write FASTA-formatted sequences to a file.
     """
 
-    def __init__(self, file, line_length=None, _close_file=None):
+    def __init__(self, file, line_length=None, opener=xopen, _close_file=None):
         """
         If line_length is not None, the lines will
         be wrapped after line_length characters.
         """
-        super().__init__(file, _close_file=_close_file)
+        super().__init__(file, opener=opener, _close_file=_close_file)
         self.line_length = line_length if line_length != 0 else None
 
     def write(self, name_or_record, sequence=None):
@@ -78,8 +78,8 @@ class FastqWriter(FileWriter):
     """
     file_mode = 'wb'
 
-    def __init__(self, file, two_headers=False, _close_file=None):
-        super().__init__(file, _close_file=_close_file)
+    def __init__(self, file, two_headers=False, opener=xopen, _close_file=None):
+        super().__init__(file, opener=opener, _close_file=_close_file)
         self._two_headers = two_headers
 
     def write(self, record):


=====================================
tests/test_internal.py
=====================================
@@ -82,18 +82,18 @@ class TestFastaReader:
         filename = "tests/data/simple.fasta"
         with open(filename, 'rb') as f:
             assert not f.closed
-            reads = list(dnaio.open(f))
+            _ = list(dnaio.open(f))
             assert not f.closed
         assert f.closed
 
         with FastaReader(filename) as sr:
             tmp_sr = sr
             assert not sr._file.closed
-            reads = list(sr)
+            _ = list(sr)
             assert not sr._file.closed
         assert tmp_sr._file is None
         # Open it a second time
-        with FastaReader(filename) as sr:
+        with FastaReader(filename):
             pass
 
 
@@ -112,7 +112,7 @@ class TestFastqReader:
     def test_fastqreader_buffersize_too_small(self):
         with raises(ValueError):
             with FastqReader("tests/data/simple.fastq", buffer_size=0) as f:
-                reads = list(f)  # pragma: no cover
+                _ = list(f)  # pragma: no cover
 
     def test_fastqreader_dos(self):
         # DOS line breaks
@@ -212,7 +212,7 @@ class TestFastqReader:
         with FastqReader(filename) as sr:
             tmp_sr = sr
             assert not sr._file.closed
-            reads = list(sr)
+            _ = list(sr)
             assert not sr._file.closed
         assert tmp_sr._file is None
 
@@ -445,7 +445,12 @@ class TestInterleavedWriter:
         with InterleavedSequenceWriter(bio) as writer:
             for read1, read2 in reads:
                 writer.write(read1, read2)
-        assert bio.getvalue() == b'@A/1 comment\nTTA\n+\n##H\n@A/2 comment\nGCT\n+\nHH#\n@B/1\nCC\n+\nHH\n@B/2\nTG\n+\n#H\n'
+        assert bio.getvalue() == (
+            b'@A/1 comment\nTTA\n+\n##H\n'
+            b'@A/2 comment\nGCT\n+\nHH#\n'
+            b'@B/1\nCC\n+\nHH\n'
+            b'@B/2\nTG\n+\n#H\n'
+        )
 
 
 class TestPairedSequenceReader:


=====================================
tests/test_open.py
=====================================
@@ -62,6 +62,49 @@ def test_read_pathlib_path(fileformat, extension):
     assert records == SIMPLE_RECORDS[fileformat]
 
 
+def test_read_opener(fileformat, extension):
+    def my_opener(path, mode):
+        import io
+        if fileformat == "fasta":
+            data = b">read\nACG\n"
+        else:
+            data = b"@read\nACG\n+\nHHH\n"
+        return io.BytesIO(data)
+
+    with dnaio.open("totally-ignored-filename." + fileformat + extension, opener=my_opener) as f:
+        records = list(f)
+    assert len(records) == 1
+    assert records[0].name == "read"
+    assert records[0].sequence == "ACG"
+
+
+@pytest.mark.parametrize("interleaved", [False, True])
+def test_paired_opener(fileformat, extension, interleaved):
+    def my_opener(_path, _mode):
+        import io
+        if fileformat == "fasta":
+            data = b">read\nACG\n"
+        else:
+            data = b"@read\nACG\n+\nHHH\n"
+        return io.BytesIO(data + data)
+
+    path1 = "ignored-filename." + fileformat + extension
+    path2 = "also-ignored-filename." + fileformat + extension
+    if interleaved:
+        with dnaio.open(path1, file2=path2, opener=my_opener) as f:
+            records = list(f)
+        expected = 2
+    else:
+        with dnaio.open(path1, interleaved=True, opener=my_opener) as f:
+            records = list(f)
+        expected = 1
+    assert len(records) == expected
+    assert records[0][0].name == "read"
+    assert records[0][0].sequence == "ACG"
+    assert records[0][1].name == "read"
+    assert records[0][1].sequence == "ACG"
+
+
 def test_detect_fastq_from_content():
     """FASTQ file that is not named .fastq"""
     with dnaio.open('tests/data/missingextension') as f:
@@ -115,8 +158,8 @@ def test_write_pathlib(tmpdir, fileformat, extension):
 def test_write_paired_same_path(tmpdir):
     path1 = str(tmpdir / "same.fastq")
     path2 = str(tmpdir / "same.fastq")
-    with pytest.raises(ValueError) as e:
-        with dnaio.open(file1=path1, file2=path2, mode="w") as f:
+    with pytest.raises(ValueError):
+        with dnaio.open(file1=path1, file2=path2, mode="w"):
             pass
 
 


=====================================
tox.ini
=====================================
@@ -1,5 +1,5 @@
 [tox]
-envlist = py34,py35,py36,py37
+envlist = flake8,py34,py35,py36,py37,py38
 
 [testenv]
 deps =
@@ -10,6 +10,11 @@ commands =
     coverage combine
     coverage report
 
+[testenv:flake8]
+basepython = python3.6
+deps = flake8
+commands = flake8 src/ tests/
+
 [coverage:run]
 parallel = True
 include =
@@ -20,3 +25,7 @@ include =
 source =
     src/
     */site-packages/
+
+[flake8]
+max-line-length = 110
+max-complexity = 15



View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/commit/6f93bd69eb9b8774209182f268b650e2dd47f219

-- 
View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/commit/6f93bd69eb9b8774209182f268b650e2dd47f219
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20191117/d07e945d/attachment-0001.html>