[med-svn] [Git][med-team/python-dnaio][upstream] New upstream version 0.7.1

Andreas Tille (@tille) gitlab@salsa.debian.org
Sat Feb 19 07:34:37 GMT 2022



Andreas Tille pushed to branch upstream at Debian Med / python-dnaio


Commits:
46dd21e2 by Andreas Tille at 2022-02-19T08:23:13+01:00
New upstream version 0.7.1
- - - - -


13 changed files:

- .github/workflows/ci.yml
- CHANGES.rst
- pyproject.toml
- + setup.cfg
- setup.py
- src/dnaio/_core.pyx
- src/dnaio/chunks.py
- src/dnaio/pairedend.py
- src/dnaio/readers.py
- src/dnaio/singleend.py
- tests/test_chunks.py
- tests/test_internal.py
- tox.ini


Changes:

=====================================
.github/workflows/ci.yml
=====================================
@@ -4,7 +4,7 @@ on: [push, pull_request]
 
 jobs:
   lint:
-    timeout-minutes: 5
+    timeout-minutes: 10
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -16,62 +16,89 @@ jobs:
       uses: actions/setup-python@v2
       with:
         python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
+    - name: Install tox
       run: python -m pip install tox
     - name: Run tox ${{ matrix.toxenv }}
       run: tox -e ${{ matrix.toxenv }}
 
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        fetch-depth: 0  # required for setuptools_scm
+    - name: Build sdist and temporary wheel
+      run: pipx run build
+    - uses: actions/upload-artifact@v2
+      with:
+        name: sdist
+        path: dist/*.tar.gz
+
   test:
-    timeout-minutes: 5
+    timeout-minutes: 10
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9, "3.10-dev"]
         os: [ubuntu-latest]
+        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
         include:
-        - python-version: 3.8
-          os: macos-latest
+        - os: macos-latest
+          python-version: 3.8
+        - os: windows-latest
+          python-version: 3.8
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v2
       with:
         python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
+    - name: Install tox
       run: python -m pip install tox
     - name: Test
       run: tox -e py
     - name: Upload coverage report
       uses: codecov/codecov-action@v1
 
-  deploy:
-    timeout-minutes: 5
-    runs-on: ubuntu-latest
+  wheels:
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
     needs: [lint, test]
-    if: startsWith(github.ref, 'refs/tags')
+    timeout-minutes: 15
+    strategy:
+      matrix:
+        os: [ubuntu-20.04, windows-2019]
+    runs-on: ${{ matrix.os }}
     steps:
     - uses: actions/checkout@v2
       with:
         fetch-depth: 0  # required for setuptools_scm
     - name: Build wheels
-      uses: pypa/cibuildwheel@v2.1.1
-      with:
-        output-dir: dist/
+      uses: pypa/cibuildwheel@v2.1.2
       env:
-        CIBW_BUILD: "cp3*-manylinux_x86_64"
-        CIBW_MANYLINUX_X86_64_IMAGE: manylinux2010
+        CIBW_BUILD: "cp*-manylinux_x86_64 cp3*-win_amd64"
         CIBW_ENVIRONMENT: "CFLAGS=-g0"
-    - name: Set up Python
-      uses: actions/setup-python@v2
+        CIBW_TEST_REQUIRES: "pytest"
+        CIBW_TEST_COMMAND_LINUX: "cd {project} && pytest tests"
+        CIBW_TEST_COMMAND_WINDOWS: "cd /d {project} && pytest tests"
+    - uses: actions/upload-artifact@v2
+      with:
+        name: wheels
+        path: wheelhouse/*.whl
+
+  publish:
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+    needs: [build, wheels]
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/download-artifact@v2
+      with:
+        name: sdist
+        path: dist/
+    - uses: actions/download-artifact@v2
       with:
-        python-version: 3.7
-    - name: Make distributions
-      run: |
-        python -m pip install build
-        python -m build --sdist
-        ls -l dist/
+        name: wheels
+        path: dist/
     - name: Publish to PyPI
-      uses: pypa/gh-action-pypi-publish@v1.4.1
+      uses: pypa/gh-action-pypi-publish@v1.4.2
       with:
         user: __token__
         password: ${{ secrets.pypi_password }}


=====================================
CHANGES.rst
=====================================
@@ -2,6 +2,17 @@
 Changelog
 =========
 
+v0.7.1 (2022-01-26)
+-------------------
+
+* PR #34: Fix parsing of FASTA files that just contain a comment and no reads
+
+v0.7.0 (2022-01-17)
+-------------------
+
+* @rhpvorderman contributed many performance improvements in PRs #15, #17, #18, #20, #21, #22 and #23. Reading and writing FASTQ files and reading of paired-end FASTQ files were sped up significantly. For example, reading uncompressed FASTQ is 50% faster (!) than before.
+* PR #28: Windows support added
+
 
 v0.6.0 (2021-09-28)
 -------------------


=====================================
pyproject.toml
=====================================
@@ -1,5 +1,9 @@
 [build-system]
-requires = ["setuptools", "wheel", "setuptools_scm", "Cython"]
+requires = ["setuptools >= 45", "wheel", "setuptools_scm >= 6.2", "Cython >= 0.29.20"]
+build-backend = "setuptools.build_meta"
 
 [black.tool]
 line-length = 100
+
+[tool.setuptools_scm]
+write_to = "src/dnaio/_version.py"
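
With write_to configured, setuptools_scm generates src/dnaio/_version.py at
build time. A minimal sketch of how such a generated module is typically
consumed (assuming it defines a `version` string, which is what
setuptools_scm writes by default; the import line is illustrative, not
necessarily the actual dnaio code):

    # src/dnaio/__init__.py (sketch)
    from ._version import version as __version__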


=====================================
setup.cfg
=====================================
@@ -0,0 +1,35 @@
+[metadata]
+name = dnaio
+author = Marcel Martin
+author_email = marcel.martin@scilifelab.se
+url = https://github.com/marcelm/dnaio/
+description = Read and write FASTA and FASTQ files efficiently
+long_description = file: README.md
+long_description_content_type = text/markdown
+license = MIT
+classifiers =
+    Development Status :: 5 - Production/Stable
+    Intended Audience :: Science/Research
+    License :: OSI Approved :: MIT License
+    Programming Language :: Cython
+    Programming Language :: Python :: 3
+    Topic :: Scientific/Engineering :: Bio-Informatics
+
+[options]
+python_requires = >=3.6
+package_dir =
+    =src
+packages = find:
+install_requires =
+    xopen >= 1.4.0
+
+[options.packages.find]
+where = src
+
+[options.package_data]
+* = py.typed, *.pyi
+
+[options.extras_require]
+dev =
+    Cython
+    pytest


=====================================
setup.py
=====================================
@@ -1,81 +1,11 @@
-import os.path
-from setuptools import setup, Extension, find_packages
-from distutils.command.sdist import sdist
-from distutils.command.build_ext import build_ext
-
-
-def no_cythonize(extensions, **_ignore):
-    """Change .pyx to .c or .cpp (copied from Cython documentation)"""
-    for extension in extensions:
-        sources = []
-        for sfile in extension.sources:
-            path, ext = os.path.splitext(sfile)
-            if ext in ('.pyx', '.py'):
-                if extension.language == 'c++':
-                    ext = '.cpp'
-                else:
-                    ext = '.c'
-                sfile = path + ext
-            sources.append(sfile)
-        extension.sources[:] = sources
-
-
-class BuildExt(build_ext):
-    def run(self):
-        # If we encounter a PKG-INFO file, then this is likely a .tar.gz/.zip
-        # file retrieved from PyPI that already includes the pre-cythonized
-        # extension modules, and then we do not need to run cythonize().
-        if os.path.exists('PKG-INFO'):
-            no_cythonize(self.extensions)
-        else:
-            # Otherwise, this is a 'developer copy' of the code, and then the
-            # only sensible thing is to require Cython to be installed.
-            from Cython.Build import cythonize
-            self.extensions = cythonize(self.extensions)
-        super().run()
-
-
-class SDist(sdist):
-    def run(self):
-        # Make sure the compiled Cython files in the distribution are up-to-date
-        from Cython.Build import cythonize
-        cythonize(self.distribution.ext_modules)
-        super().run()
-
-
-with open('README.md', encoding='utf-8') as f:
-    long_description = f.read()
+from setuptools import setup, Extension
+from Cython.Build import cythonize
+import setuptools_scm  # noqa  Ensure it’s installed
 
 setup(
-    name='dnaio',
-    setup_requires=['setuptools_scm'],  # Support pip versions that don't know about pyproject.toml
-    use_scm_version={'write_to': 'src/dnaio/_version.py'},
-    author='Marcel Martin',
-    author_email='marcel.martin@scilifelab.se',
-    url='https://github.com/marcelm/dnaio/',
-    description='Read and write FASTA and FASTQ files efficiently',
-    long_description=long_description,
-    long_description_content_type='text/markdown',
-    license='MIT',
-    package_dir={'': 'src'},
-    packages=find_packages('src'),
-    package_data={"dnaio": ["py.typed", "*.pyi"]},
-    extras_require={
-        'dev': ['Cython', 'pytest'],
-    },
-    ext_modules=[
-        Extension('dnaio._core', sources=['src/dnaio/_core.pyx']),
-    ],
-    cmdclass={'build_ext': BuildExt, 'sdist': SDist},
-    install_requires=['xopen>=0.8.2'],
-    python_requires='>=3.6',
-    classifiers=[
-            "Development Status :: 5 - Production/Stable",
-            "Intended Audience :: Science/Research",
-            "License :: OSI Approved :: MIT License",
-            "Natural Language :: English",
-            "Programming Language :: Cython",
-            "Programming Language :: Python :: 3",
-            "Topic :: Scientific/Engineering :: Bio-Informatics"
-    ]
+    ext_modules=cythonize(
+        [
+            Extension("dnaio._core", sources=["src/dnaio/_core.pyx"]),
+        ]
+    ),
 )


=====================================
src/dnaio/_core.pyx
=====================================
@@ -1,9 +1,15 @@
 # cython: language_level=3, emit_code_comments=False
 
-from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING
-from libc.string cimport strncmp, memcmp, memcpy
+from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING, PyBytes_GET_SIZE
+from cpython.unicode cimport PyUnicode_DecodeLatin1
+from libc.string cimport strncmp, memcmp, memcpy, memchr, strcspn
+from cpython.unicode cimport PyUnicode_GET_LENGTH
 cimport cython
 
+cdef extern from *:
+    unsigned char * PyUnicode_1BYTE_DATA(object o)
+    int PyUnicode_KIND(object o)
+    int PyUnicode_1BYTE_KIND
 from .exceptions import FastqFormatError
 from ._util import shorten
 
@@ -34,11 +40,13 @@ cdef class Sequence:
         self.sequence = sequence
         self.qualities = qualities
 
+    def __init__(self, str name, str sequence, str qualities = None):
+        # __cinit__ is called first and sets all the variables.
         if qualities is not None and len(qualities) != len(sequence):
             rname = shorten(name)
             raise ValueError("In read named {!r}: length of quality sequence "
-                "({}) and length of read ({}) do not match".format(
-                    rname, len(qualities), len(sequence)))
+                             "({}) and length of read ({}) do not match".format(
+                rname, len(qualities), len(sequence)))
 
     def __getitem__(self, key):
         """
@@ -88,62 +96,91 @@ cdef class Sequence:
         This is a faster version of qualities.encode('ascii')."""
         return self.qualities.encode('ascii')
 
-    def fastq_bytes(self):
+    def fastq_bytes(self, two_headers = False):
         """Return the entire FASTQ record as bytes which can be written
-        into a file."""
-        # Convert to ASCII bytes sequences first as these have a one-to-one
-        # relation between size and number of bytes
-        # Unlike decoding, ascii is not slower than latin-1. This is because
-        # CPython performs a call to PyUnicodeCheck on both occassions. This
-        # determines the type of the Unicode object. In fact, the ascii encode
-        # is slightly faster because the check for PyASCIIObject is performed
-        # first.
-        cdef bytes name = self.name.encode('ascii')
-        cdef bytes sequence = self.sequence.encode('ascii')
-        cdef bytes qualities = self.qualities.encode('ascii')
-        cdef Py_ssize_t name_length = len(name)
-        cdef Py_ssize_t sequence_length = len(sequence)
-        cdef Py_ssize_t qualities_length = len(qualities)
-
-        # Since Cython will generate code above that is a 100% sure to generate
-        # bytes objects, we can call Python C-API functions that don't perform
-        # checks on the object.
-        cdef char * name_ptr = PyBytes_AS_STRING(name)
-        cdef char * sequence_ptr = PyBytes_AS_STRING(sequence)
-        cdef char * qualities_ptr = PyBytes_AS_STRING(qualities)
+        into a file.
+
+        Optionally the header (after the @) can be repeated on the third line
+        (after the +), when two_headers is enabled."""
+        cdef:
+            char * name
+            char * sequence
+            char * qualities
+            Py_ssize_t name_length
+            Py_ssize_t sequence_length
+            Py_ssize_t qualities_length
+
+        if PyUnicode_KIND(self.name) == PyUnicode_1BYTE_KIND:
+            name = <char *>PyUnicode_1BYTE_DATA(self.name)
+            name_length = <size_t>PyUnicode_GET_LENGTH(self.name)
+        else:
+            # Allow non-ASCII in name
+            name_bytes = self.name.encode('latin-1')
+            name = PyBytes_AS_STRING(name_bytes)
+            name_length = PyBytes_GET_SIZE(name_bytes)
+        if PyUnicode_KIND(self.sequence) == PyUnicode_1BYTE_KIND:
+            sequence = <char *>PyUnicode_1BYTE_DATA(self.sequence)
+            sequence_length = <size_t>PyUnicode_GET_LENGTH(self.sequence)
+        else:
+            # Don't allow non-ASCII in sequence and qualities
+            sequence_bytes = self.sequence.encode('ascii')
+            sequence = PyBytes_AS_STRING(sequence_bytes)
+            sequence_length = PyBytes_GET_SIZE(sequence_bytes)
+        if PyUnicode_KIND(self.qualities) == PyUnicode_1BYTE_KIND:
+            qualities = <char *>PyUnicode_1BYTE_DATA(self.qualities)
+            qualities_length = <size_t>PyUnicode_GET_LENGTH(self.qualities)
+        else:
+            qualities_bytes = self.qualities.encode('ascii')
+            qualities = PyBytes_AS_STRING(qualities_bytes)
+            qualities_length = PyBytes_GET_SIZE(qualities_bytes)
+        return create_fastq_record(name, sequence, qualities,
+                                   name_length, sequence_length, qualities_length,
+                                   two_headers)
 
+    def fastq_bytes_two_headers(self):
+        """
+        Return this record in FASTQ format as a bytes object where the header (after the @) is
+        repeated on the third line.
+        """
+        return self.fastq_bytes(two_headers=True)
+
+
+cdef bytes create_fastq_record(char * name, char * sequence, char * qualities,
+                               Py_ssize_t name_length,
+                               Py_ssize_t sequence_length,
+                               Py_ssize_t qualities_length,
+                               bint two_headers = False):
         # Total size is name + sequence + qualities + 4 newlines + '+' and an
         # '@' to be put in front of the name.
         cdef Py_ssize_t total_size = name_length + sequence_length + qualities_length + 6
 
+        if two_headers:
+            # We need space for the name after the +.
+            total_size += name_length
+
         # This is the canonical way to create an uninitialized bytestring of given size
         cdef bytes retval = PyBytes_FromStringAndSize(NULL, total_size)
         cdef char * retval_ptr = PyBytes_AS_STRING(retval)
 
         # Write the sequences into the bytestring at the correct positions.
-        cdef Py_ssize_t cursor
+        cdef size_t cursor
         retval_ptr[0] = b"@"
-        memcpy(retval_ptr + 1, name_ptr, name_length)
+        memcpy(retval_ptr + 1, name, name_length)
         cursor = name_length + 1
         retval_ptr[cursor] = b"\n"; cursor += 1
-        memcpy(retval_ptr + cursor, sequence_ptr, sequence_length)
+        memcpy(retval_ptr + cursor, sequence, sequence_length)
         cursor += sequence_length
         retval_ptr[cursor] = b"\n"; cursor += 1
         retval_ptr[cursor] = b"+"; cursor += 1
+        if two_headers:
+            memcpy(retval_ptr + cursor, name, name_length)
+            cursor += name_length
         retval_ptr[cursor] = b"\n"; cursor += 1
-        memcpy(retval_ptr + cursor, qualities_ptr, qualities_length)
+        memcpy(retval_ptr + cursor, qualities, qualities_length)
         cursor += qualities_length
         retval_ptr[cursor] = b"\n"
         return retval
 
-    def fastq_bytes_two_headers(self):
-        """
-        Return this record in FASTQ format as a bytes object where the header (after the @) is
-        repeated on the third line.
-        """
-        return f"@{self.name}\n{self.sequence}\n+{self.name}\n{self.qualities}\n".encode("ascii")
-
-
 # It would be nice to be able to have the first parameter be an
 # unsigned char[:] (memory view), but this fails with a BufferError
 # when a bytes object is passed in.
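
For reference, a pure-Python sketch of the byte layout that
create_fastq_record() assembles. The Cython code above preallocates a bytes
object of the exact total size and memcpy()s each field into place; this
sketch only mirrors the resulting output, it is not the actual
implementation:

    def fastq_record(name: str, sequence: str, qualities: str,
                     two_headers: bool = False) -> bytes:
        # '@' + name, the sequence, '+' (optionally repeating the name),
        # and the qualities, each line terminated by '\n'.
        second_header = name if two_headers else ""
        return (f"@{name}\n{sequence}\n"
                f"+{second_header}\n{qualities}\n").encode("latin-1")
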
@@ -209,12 +246,19 @@ def fastq_iter(file, sequence_class, Py_ssize_t buffer_size):
         bytearray buf = bytearray(buffer_size)
         char[:] buf_view = buf
         char* c_buf = buf
-        int endskip
         str name
-        char* name_encoded
-        Py_ssize_t bufstart, bufend, pos, record_start, sequence_start
-        Py_ssize_t second_header_start, sequence_length, qualities_start
-        Py_ssize_t second_header_length, name_length
+        str sequence
+        str qualities
+        Py_ssize_t last_read_position = 0
+        Py_ssize_t record_start = 0
+        Py_ssize_t bufstart, bufend, name_start, name_end, name_length
+        Py_ssize_t sequence_start, sequence_end, sequence_length
+        Py_ssize_t second_header_start, second_header_end, second_header_length
+        Py_ssize_t qualities_start, qualities_end, qualities_length
+        char *name_end_ptr
+        char *sequence_end_ptr
+        char *second_header_end_ptr
+        char *qualities_end_ptr
         bint custom_class = sequence_class is not Sequence
         Py_ssize_t n_records = 0
         bint extra_newline = False
@@ -252,138 +296,182 @@ def fastq_iter(file, sequence_class, Py_ssize_t buffer_size):
                 bufstart += 1
                 bufend += 1
                 extra_newline = True
-            else:
-                break
+            elif last_read_position > record_start:  # Incomplete FASTQ records are present.
+                if extra_newline:
+                    # Do not report the linefeed that was added by dnaio but
+                    # was not present in the original input.
+                    last_read_position -= 1
+                lines = buf[record_start:last_read_position].count(b'\n')
+                raise FastqFormatError(
+                    'Premature end of file encountered. The incomplete final record was: '
+                    '{!r}'.format(
+                        shorten(buf[record_start:last_read_position].decode('latin-1'),
+                                500)),
+                    line=n_records * 4 + lines)
+            else:  # EOF reached. Stop iterating.
+                return
 
         # Parse all complete FASTQ records in this chunk
-        pos = 0
         record_start = 0
         while True:
-            # Parse the name (line 0)
-            if c_buf[pos] != b'@':
-                raise FastqFormatError("Line expected to "
-                    "start with '@', but found {!r}".format(chr(c_buf[pos])),
-                    line=n_records * 4)
-            pos += 1
-            while pos < bufend and c_buf[pos] != b'\n':
-                pos += 1
-            if pos == bufend:
+            ### Check for a complete record (i.e. 4 newlines are present)
+            # Use libc memchr, this optimizes looking for characters by
+            # using 64-bit integers. See:
+            # https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=string/memchr.c;hb=HEAD
+            # void *memchr(const void *str, int c, size_t n)
+            name_end_ptr = <char *>memchr(c_buf + record_start, b'\n', <size_t>(bufend - record_start))
+            if name_end_ptr == NULL:
                 break
-            endskip = 1 if c_buf[pos-1] == b'\r' else 0
-            name_length = pos - endskip - record_start - 1
-            name_encoded = c_buf + record_start + 1
-            # .decode('latin-1') is 50% faster than .decode('ascii')
-            # This is because PyUnicode_DecodeLatin1 is an alias for
-            # _PyUnicode_FromUCS1. Which directly copies the bytes into a
-            # string object. No operations are taking place. With
-            # PyUnicode_DecodeASCII, all characters are checked whether they
-            # exceed 128.
-            name = c_buf[record_start+1:pos-endskip].decode('latin-1')
-
-            pos += 1
-
-            # Parse the sequence (line 1)
-            sequence_start = pos
-            while pos < bufend and c_buf[pos] != b'\n':
-                pos += 1
-            if pos == bufend:
+            # bufend - sequence_start is always nonnegative:
+            # - name_end is at most bufend - 1
+            # - thus sequence_start is at most bufend
+            name_end = name_end_ptr - c_buf
+            sequence_start = name_end + 1
+            sequence_end_ptr = <char *>memchr(c_buf + sequence_start, b'\n', <size_t>(bufend - sequence_start))
+            if sequence_end_ptr == NULL:
+                break
+            sequence_end = sequence_end_ptr - c_buf
+            second_header_start = sequence_end + 1
+            second_header_end_ptr = <char *>memchr(c_buf + second_header_start, b'\n', <size_t>(bufend - second_header_start))
+            if second_header_end_ptr == NULL:
                 break
-            endskip = 1 if c_buf[pos-1] == b'\r' else 0
-            sequence = c_buf[sequence_start:pos-endskip].decode('latin-1')
-            sequence_length = pos - endskip - sequence_start
-            pos += 1
-
-            # Parse second header (line 2)
-            second_header_start = pos
-            if pos == bufend:
+            second_header_end = second_header_end_ptr - c_buf
+            qualities_start = second_header_end + 1
+            qualities_end_ptr = <char *>memchr(c_buf + qualities_start, b'\n', <size_t>(bufend - qualities_start))
+            if qualities_end_ptr == NULL:
                 break
-            if c_buf[pos] != b'+':
+            qualities_end = qualities_end_ptr - c_buf
+
+            if c_buf[record_start] != b'@':
+                raise FastqFormatError("Line expected to "
+                    "start with '@', but found {!r}".format(chr(c_buf[record_start])),
+                    line=n_records * 4)
+            if c_buf[second_header_start] != b'+':
                 raise FastqFormatError("Line expected to "
-                    "start with '+', but found {!r}".format(chr(c_buf[pos])),
+                    "start with '+', but found {!r}".format(chr(c_buf[second_header_start])),
                     line=n_records * 4 + 2)
-            pos += 1  # skip over the '+'
-            while pos < bufend and c_buf[pos] != b'\n':
-                pos += 1
-            if pos == bufend:
-                break
-            endskip = 1 if c_buf[pos-1] == b'\r' else 0
-            second_header_length = pos - endskip - second_header_start - 1
-            if second_header_length == 0:
-                second_header = False
-            else:
+
+            name_start = record_start + 1  # Skip @
+            second_header_start += 1  # Skip +
+            name_length = name_end - name_start
+            sequence_length = sequence_end - sequence_start
+            second_header_length = second_header_end - second_header_start
+            qualities_length = qualities_end - qualities_start
+
+            # Check for \r\n line-endings and compensate
+            if c_buf[name_end - 1] == b'\r':
+                name_length -= 1
+            if c_buf[sequence_end - 1] == b'\r':
+                sequence_length -= 1
+            if c_buf[second_header_end - 1] == b'\r':
+                second_header_length -= 1
+            if c_buf[qualities_end - 1] == b'\r':
+                qualities_length -= 1
+
+            if second_header_length:  # should be 0 when only + is present
                 if (name_length != second_header_length or
-                        strncmp(c_buf+second_header_start+1,
-                            name_encoded, second_header_length) != 0):
+                        strncmp(c_buf+second_header_start,
+                            c_buf + name_start, second_header_length) != 0):
                     raise FastqFormatError(
                         "Sequence descriptions don't match ('{}' != '{}').\n"
                         "The second sequence description must be either "
                         "empty or equal to the first description.".format(
-                            name_encoded[:name_length].decode('latin-1'),
-                            c_buf[second_header_start+1:pos-endskip]
+                            c_buf[name_start:name_end].decode('latin-1'),
+                            c_buf[second_header_start:second_header_end]
                             .decode('latin-1')), line=n_records * 4 + 2)
-                second_header = True
-            pos += 1
-
-            # Parse qualities (line 3)
-            qualities_start = pos
-            while pos < bufend and c_buf[pos] != b'\n':
-                pos += 1
-            if pos == bufend:
-                break
-            endskip = 1 if c_buf[pos-1] == b'\r' else 0
-            qualities = c_buf[qualities_start:pos-endskip].decode('latin-1')
-            if pos - endskip - qualities_start != sequence_length:
-                raise FastqFormatError("Length of sequence and "
-                    "qualities differ", line=n_records * 4 + 3)
-            pos += 1
+
+            if qualities_length != sequence_length:
+                raise FastqFormatError(
+                    "Length of sequence and qualities differ", line=n_records * 4 + 3)
+
+            ### Copy record into python variables
+            # PyUnicode_DecodeLatin1 is 50% faster than PyUnicode_DecodeASCII.
+            # This is because PyUnicode_DecodeLatin1 is an alias for
+            # _PyUnicode_FromUCS1, which directly copies the bytes into a
+            # string object after some checks. With PyUnicode_DecodeASCII,
+            # there is an extra check whether characters exceed 128.
+            name = PyUnicode_DecodeLatin1(c_buf + name_start, name_length, 'strict')
+            sequence = PyUnicode_DecodeLatin1(c_buf + sequence_start, sequence_length, 'strict')
+            qualities = PyUnicode_DecodeLatin1(c_buf + qualities_start, qualities_length, 'strict')
+
             if n_records == 0:
-                yield second_header  # first yielded value is special
+                yield bool(second_header_length)  # first yielded value is special
             if custom_class:
                 yield sequence_class(name, sequence, qualities)
             else:
                 yield Sequence.__new__(Sequence, name, sequence, qualities)
+
+            ### Advance record to next position
             n_records += 1
-            record_start = pos
-            if pos == bufend:
-                break
-        if pos == bufend:
-            if record_start == 0 and bufend == len(buf):
-                # buffer too small, double it
-                buffer_size *= 2
-                prev_buf = buf
-                buf = bytearray(buffer_size)
-                buf[0:bufend] = prev_buf
-                del prev_buf
-                bufstart = bufend
-                buf_view = buf
-                c_buf = buf
-            else:
-                bufstart = bufend - record_start
-                buf[0:bufstart] = buf[record_start:bufend]
-    if pos > record_start:
-        if extra_newline:
-            pos -= 1
-        lines = buf[record_start:pos].count(b'\n')
-        raise FastqFormatError(
-            'Premature end of file encountered. The incomplete final record was: '
-            '{!r}'.format(shorten(buf[record_start:pos].decode('latin-1'), 500)),
-            line=n_records * 4 + lines)
+            record_start = qualities_end + 1
+        # bufend reached
+        last_read_position = bufend
+        if record_start == 0 and bufend == len(buf):
+            # buffer too small, double it
+            buffer_size *= 2
+            prev_buf = buf
+            buf = bytearray(buffer_size)
+            buf[0:bufend] = prev_buf
+            del prev_buf
+            bufstart = bufend
+            buf_view = buf
+            c_buf = buf
+        else:
+            bufstart = bufend - record_start
+            buf[0:bufstart] = buf[record_start:bufend]
 
 
 def record_names_match(header1: str, header2: str):
     """
     Check whether the sequence record ids id1 and id2 are compatible, ignoring a
     suffix of '1', '2' or '3'. This exception allows to check some old
-    paired-end reads that have names ending in '/1' and '/2'. Also, the
+    paired-end reads that have IDs ending in '/1' and '/2'. Also, the
     fastq-dump tool (used for converting SRA files to FASTQ) appends '.1', '.2'
     and sometimes '.3' to paired-end reads if option -I is used.
     """
-    cdef:
-        str name1, name2
-
-    name1 = header1.split()[0]
-    name2 = header2.split()[0]
-    if name1 and name2 and name1[-1] in '123' and name2[-1] in '123':
-        name1 = name1[:-1]
-        name2 = name2[:-1]
-    return name1 == name2
+    if (
+        PyUnicode_KIND(header1) != PyUnicode_1BYTE_KIND or
+        PyUnicode_KIND(header2) != PyUnicode_1BYTE_KIND
+    ):
+        # Fall back to slower code path.
+        name1 = header1.split(maxsplit=1)[0]
+        name2 = header2.split(maxsplit=1)[0]
+        if name1 and name2 and name1[-1] in '123' and name2[-1] in '123':
+            return name1[:-1] == name2[:-1]
+        return name1 == name2
+    # Do not call .encode functions but use the unicode pointer inside the
+    # python object directly, provided it is in 1-byte encoding, so we can
+    # find the spaces and tabs easily.
+    cdef char * header1_chars = <char *>PyUnicode_1BYTE_DATA(header1)
+    cdef char * header2_chars = <char *>PyUnicode_1BYTE_DATA(header2)
+    cdef size_t header1_length = <size_t>PyUnicode_GET_LENGTH(header1)
+    return record_ids_match(header1_chars, header2_chars, header1_length)
+
+
+cdef bint record_ids_match(char *header1, char *header2, size_t header1_length):
+    """
+    Check whether the ASCII-encoded IDs match. Only header1_length is needed.
+    """
+    # Only the read ID is of interest.
+    # Find the first tab or space; if not present, strcspn will return the
+    # position of the terminating NULL byte (i.e. the length).
+    # Header1 is not searched separately: header1's ID should end at the
+    # same position, so header2's end-of-ID position can be reused.
+    cdef size_t id2_length = strcspn(header2, b' \t')
+
+    if header1_length < id2_length:
+        return False
+
+    cdef char end = header1[id2_length]
+    if end != b'\000' and end != b' ' and end != b'\t':
+        return False
+
+    # Check if the IDs end with 1, 2 or 3. This is the read pair number
+    # which should not be included in the comparison.
+    cdef bint id1endswithnumber = b'1' <= header1[id2_length - 1] <= b'3'
+    cdef bint id2endswithnumber = b'1' <= header2[id2_length - 1] <= b'3'
+    if id1endswithnumber and id2endswithnumber:
+        id2_length -= 1
+
+    # Compare the strings up to the ID end position.
+    return memcmp(<void *>header1, <void *>header2, id2_length) == 0
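
The rewritten parser's completeness check boils down to locating four
newlines before anything is sliced or decoded. A pure-Python sketch of that
scan, with bytes.find() standing in for libc memchr() (the function name
and signature here are illustrative, not the actual Cython code):

    def find_record_end(buf: bytes, record_start: int, bufend: int):
        """Return the position one past the record's fourth newline,
        or None if the record is incomplete and more data is needed."""
        pos = record_start
        for _ in range(4):  # name, sequence, second header, qualities
            newline = buf.find(b"\n", pos, bufend)
            if newline == -1:
                return None
            pos = newline + 1
        return pos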

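Likewise, the fast path of record_names_match() reduces to this pure-Python
comparison (a sketch; the Cython version works on the raw 1-byte buffers
with strcspn() and memcmp() instead of splitting strings):

    def ids_match(header1: str, header2: str) -> bool:
        id1 = (header1.split(maxsplit=1) or [""])[0]
        id2 = (header2.split(maxsplit=1) or [""])[0]
        # A trailing read-pair number is dropped only if both IDs carry one.
        if id1 and id2 and id1[-1] in "123" and id2[-1] in "123":
            return id1[:-1] == id2[:-1]
        return id1 == id2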

=====================================
src/dnaio/chunks.py
=====================================
@@ -16,9 +16,15 @@ def _fasta_head(buf: bytes, end: Optional[int] = None) -> int:
     pos = buf.rfind(b'\n>', 0, end)
     if pos != -1:
         return pos + 1
-    if buf[0:1] == b'>':
+    if buf[0:1] == b'>' or buf[0:1] == b'#':
         return 0
-    raise FastaFormatError('File does not start with ">"', line=None)
+    if len(buf) == 0:
+        return 0
+    c = chr(buf[0])
+    raise FastaFormatError(
+        f"FASTA file expected to start with '>', but found {repr(c)}",
+        line=None,
+    )
 
 
 def _fastq_head(buf: bytes, end: Optional[int] = None) -> int:
@@ -52,15 +58,19 @@ def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memory
     # Read one byte to determine file format.
     # If there is a comment char, we assume FASTA!
     start = f.readinto(memoryview(buf)[0:1])
-    if start == 1 and buf[0:1] == b'@':
-        head = _fastq_head
-    elif start == 1 and (buf[0:1] == b'#' or buf[0:1] == b'>'):
-        head = _fasta_head
-    elif start == 0:
+    if start == 0:
         # Empty file
         return
+    assert start == 1
+    if buf[0:1] == b'@':
+        head = _fastq_head
+    elif buf[0:1] == b'#' or buf[0:1] == b'>':
+        head = _fasta_head
     else:
-        raise UnknownFileFormat('Input file format unknown')
+        raise UnknownFileFormat(
+            f"Cannot determine input file format: First character expected to be '>' or '@', "
+            f"but found {repr(chr(buf[0]))}"
+        )
 
     # Layout of buf
     #
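
The effect of the new '#' branch, with boundary values taken from the new
tests in tests/test_chunks.py: a buffer holding only a comment now reports
a chunk boundary of 0 instead of raising FastaFormatError, while the
boundary otherwise still lands at the start of the last '>' record:

    from dnaio.chunks import _fasta_head

    assert _fasta_head(b'# just a comment\n') == 0  # no complete record yet
    assert _fasta_head(b'#\n>3\n5\n>') == 7  # start of the last '>' record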


=====================================
src/dnaio/pairedend.py
=====================================
@@ -1,3 +1,4 @@
+import itertools
 from contextlib import ExitStack
 from os import PathLike
 from typing import Union, BinaryIO, Optional, Iterator, Tuple
@@ -47,26 +48,24 @@ class TwoFilePairedEndReader(PairedEndReader):
         """
         Iterate over the paired reads. Each item is a pair of Sequence objects.
         """
-        # Avoid usage of zip() below since it will consume one item too many.
-        it1, it2 = iter(self.reader1), iter(self.reader2)
-        while True:
-            try:
-                r1 = next(it1)
-            except StopIteration:
-                # End of file 1. Make sure that file 2 is also at end.
-                try:
-                    next(it2)
-                    raise FileFormatError(
-                        "Reads are improperly paired. There are more reads in "
-                        "file 2 than in file 1.",
-                        line=None,
-                    ) from None
-                except StopIteration:
-                    pass
-                break
-            try:
-                r2 = next(it2)
-            except StopIteration:
+        # Avoid usage of zip() below since it will consume one item too many,
+        # when one of the iterators is exhausted. zip() in Python 3.10 has a
+        # 'strict' keyword that can be used to prevent this and throw an error,
+        # but it will take a long time for 3.10 or higher to be available on
+        # everyone's machine.
+        # Instead use zip_longest from itertools. This yields None if one of
+        # the iterators is exhausted. Checking for None identity is fast,
+        # so we can quickly check whether an iterator is still yielding.
+        # This is faster than implementing a while loop with next calls,
+        # which requires expensive function lookups.
+        for r1, r2 in itertools.zip_longest(self.reader1, self.reader2):
+            if r1 is None:
+                raise FileFormatError(
+                    "Reads are improperly paired. There are more reads in "
+                    "file 2 than in file 1.",
+                    line=None,
+                ) from None
+            if r2 is None:
                 raise FileFormatError(
                     "Reads are improperly paired. There are more reads in "
                     "file 1 than in file 2.",

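A small illustration of the pitfall described in the comment above, with
hypothetical read names: plain zip() silently truncates at the shorter
input, while zip_longest() surfaces the unpaired read as None:

    import itertools

    reads1 = ["r1", "r2", "r3"]
    reads2 = ["r1", "r2"]

    assert len(list(zip(reads1, reads2))) == 2  # "r3" silently dropped

    for r1, r2 in itertools.zip_longest(reads1, reads2):
        if r2 is None:
            print(f"unpaired read: {r1}")  # reported instead of swallowed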

=====================================
src/dnaio/readers.py
=====================================
@@ -121,7 +121,7 @@ class FastqReader(BinaryFileReader, SingleEndReader):
         file: Union[str, BinaryIO],
         *,
         sequence_class=Sequence,
-        buffer_size: int = 1048576,
+        buffer_size: int = 128 * 1024,  # Buffer size used by cat, pigz etc.
         opener=xopen,
         _close_file: Optional[bool] = None,
     ):


=====================================
src/dnaio/singleend.py
=====================================
@@ -112,10 +112,11 @@ def _detect_format_from_content(file: BinaryIO) -> Optional[str]:
     Return 'fasta', 'fastq' or None
     """
     if file.seekable():
+        original_position = file.tell()
         first_char = file.read(1)
-        if file.tell() > 0:
-            file.seek(-1, 1)
+        file.seek(original_position)
     else:
+        # We cannot always use peek() because BytesIO objects do not support it
         first_char = file.peek(1)[0:1]  # type: ignore
     formats = {
         b"@": "fastq",

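Sketched on a BytesIO (which, as the new comment notes, does not provide
peek()), the seekable branch now records and restores the original offset
rather than seeking back one byte relative to the current position:

    from io import BytesIO

    f = BytesIO(b"@r1\nACG\n+\nHHH\n")
    original_position = f.tell()
    first_char = f.read(1)       # b'@' -> detected as "fastq"
    f.seek(original_position)    # restore, so the reader sees the whole record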

=====================================
tests/test_chunks.py
=====================================
@@ -2,7 +2,33 @@ from pytest import raises
 from io import BytesIO
 
 from dnaio._core import paired_fastq_heads
-from dnaio.chunks import _fastq_head, read_chunks, read_paired_chunks
+from dnaio.chunks import _fastq_head, _fasta_head, read_chunks, read_paired_chunks
+
+
+def test_fasta_head():
+    assert _fasta_head(b'') == 0
+    assert _fasta_head(b'>1\n') == 0
+    assert _fasta_head(b'>1\n3') == 0
+    assert _fasta_head(b'>1\n3\n') == 0
+    assert _fasta_head(b'>1\n3\n>') == 5
+    assert _fasta_head(b'>1\n3\n>6') == 5
+    assert _fasta_head(b'>1\n3\n>6\n') == 5
+    assert _fasta_head(b'>1\n3\n>6\n8') == 5
+    assert _fasta_head(b'>1\n3\n>6\n8\n') == 5
+    assert _fasta_head(b'>1\n3\n>6\n8\n0') == 5
+    assert _fasta_head(b'>1\n3\n>6\n8\n0\n') == 5
+    assert _fasta_head(b'>1\n3\n>6\n8\n0\n>') == 12
+
+
+def test_fasta_head_with_comment():
+    assert _fasta_head(b'#') == 0
+    assert _fasta_head(b'#\n') == 0
+    assert _fasta_head(b'#\n>') == 2
+    assert _fasta_head(b'#\n>3') == 2
+    assert _fasta_head(b'#\n>3\n') == 2
+    assert _fasta_head(b'#\n>3\n5') == 2
+    assert _fasta_head(b'#\n>3\n5\n') == 2
+    assert _fasta_head(b'#\n>3\n5\n>') == 7
 
 
 def test_paired_fastq_heads():


=====================================
tests/test_internal.py
=====================================
@@ -239,7 +239,7 @@ class TestFastqReader:
             list(fq)
 
     def test_second_header_not_equal(self):
-        fastq = BytesIO(b'@r1\nACG\n+xy\n')
+        fastq = BytesIO(b'@r1\nACG\n+xy\nXXX\n')
         with raises(FastqFormatError) as info:
             with FastqReader(fastq) as fq:
                 list(fq)  # pragma: no cover
@@ -482,6 +482,11 @@ class TestPairedSequenceReader:
         assert match('abc def', 'abc ghi')
         assert match('abc', 'abc ghi')
         assert not match('abc', 'xyz')
+        assert match('abc\tdef', 'abc')
+        assert match('abc\tdef', 'abc\tghi')
+        assert match('abc somecomment\tanothercomment', 'abc andanothercomment\tbla')
+        assert match('abc\tcomments comments', 'abc\tothers others')
+        assert match('abc\tdef', 'abc def')
 
     def test_record_names_match_with_ignored_trailing_12(self):
         match = record_names_match
@@ -521,6 +526,15 @@ class TestPairedSequenceReader:
                 list(psr)
         assert "There are more reads in file 1 than in file 2" in info.value.message
 
+    def test_empty_sequences_do_not_stop_iteration(self):
+        s1 = BytesIO(b'@r1_1\nACG\n+\nHHH\n@r2_1\nACG\n+\nHHH\n@r3_2\nACG\n+\nHHH\n')
+        s2 = BytesIO(b'@r1_1\nACG\n+\nHHH\n@r2_2\n\n+\n\n@r3_2\nACG\n+\nHHH\n')
+        # Second sequence for s2 is empty but valid. Should not lead to a stop of iteration.
+        with TwoFilePairedEndReader(s1, s2) as psr:
+            seqs = list(psr)
+        print(seqs)
+        assert len(seqs) == 3
+
     def test_incorrectly_paired(self):
         s1 = BytesIO(b'@r1/1\nACG\n+\nHHH\n')
         s2 = BytesIO(b'@wrong_name\nTTT\n+\nHHH\n')
@@ -531,18 +545,21 @@ class TestPairedSequenceReader:
 
 
 @mark.parametrize('path', [
-    'tests/data/simple.fastq',
-    'tests/data/dos.fastq',
-    'tests/data/simple.fasta',
-    'tests/data/with_comment.fasta',
+    os.path.join('tests', 'data', 'simple.fastq'),
+    os.path.join('tests', 'data', 'dos.fastq'),
+    os.path.join('tests', 'data', 'simple.fasta'),
+    os.path.join('tests', 'data', 'with_comment.fasta'),
 ])
 def test_read_stdin(path):
     # Get number of records in the input file
     with dnaio.open(path) as f:
         expected = len(list(f))
 
-    # Use 'cat' to simulate that no file name is available for stdin of the subprocess
-    with subprocess.Popen(['cat', path], stdout=subprocess.PIPE) as cat:
+    # Use piping from a separate subprocess to force the input file name to be unavailable
+    cmd = "type" if sys.platform == "win32" else "cat"
+    with subprocess.Popen(
+        [cmd, path], stdout=subprocess.PIPE, shell=sys.platform == "win32"
+    ) as cat:
         with subprocess.Popen(
             [sys.executable, 'tests/read_from_stdin.py'], stdin=cat.stdout, stdout=subprocess.PIPE
         ) as py:


=====================================
tox.ini
=====================================
@@ -1,6 +1,6 @@
 [tox]
-envlist = flake8,mypy,py36,py37,py38,py39
-requires = Cython>=0.29.13
+envlist = flake8,mypy,py36,py37,py38,py39,py310
+isolated_build = True
 
 [testenv]
 deps =



View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/-/commit/46dd21e29ae2b360ca730fc6f67ecd45ea321a51
