[med-svn] [Git][med-team/python-dnaio][master] 4 commits: New upstream version 0.7.1
Andreas Tille (@tille)
gitlab@salsa.debian.org
Sat Feb 19 07:34:27 GMT 2022
Andreas Tille pushed to branch master at Debian Med / python-dnaio
Commits:
46dd21e2 by Andreas Tille at 2022-02-19T08:23:13+01:00
New upstream version 0.7.1
- - - - -
14ce1378 by Andreas Tille at 2022-02-19T08:23:13+01:00
routine-update: New upstream version
- - - - -
d0b0b374 by Andreas Tille at 2022-02-19T08:23:14+01:00
Update upstream source from tag 'upstream/0.7.1'
Update to upstream version '0.7.1'
with Debian dir 28a6471fb462470d93e614097b750dca4739742c
- - - - -
0f3b72fc by Andreas Tille at 2022-02-19T08:24:37+01:00
routine-update: Ready to upload to unstable
- - - - -
14 changed files:
- .github/workflows/ci.yml
- CHANGES.rst
- debian/changelog
- pyproject.toml
- + setup.cfg
- setup.py
- src/dnaio/_core.pyx
- src/dnaio/chunks.py
- src/dnaio/pairedend.py
- src/dnaio/readers.py
- src/dnaio/singleend.py
- tests/test_chunks.py
- tests/test_internal.py
- tox.ini
Changes:
=====================================
.github/workflows/ci.yml
=====================================
@@ -4,7 +4,7 @@ on: [push, pull_request]
jobs:
lint:
- timeout-minutes: 5
+ timeout-minutes: 10
runs-on: ubuntu-latest
strategy:
matrix:
@@ -16,62 +16,89 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- - name: Install dependencies
+ - name: Install tox
run: python -m pip install tox
- name: Run tox ${{ matrix.toxenv }}
run: tox -e ${{ matrix.toxenv }}
+ build:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0 # required for setuptools_scm
+ - name: Build sdist and temporary wheel
+ run: pipx run build
+ - uses: actions/upload-artifact@v2
+ with:
+ name: sdist
+ path: dist/*.tar.gz
+
test:
- timeout-minutes: 5
+ timeout-minutes: 10
runs-on: ${{ matrix.os }}
strategy:
matrix:
- python-version: [3.6, 3.7, 3.8, 3.9, "3.10-dev"]
os: [ubuntu-latest]
+ python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
include:
- - python-version: 3.8
- os: macos-latest
+ - os: macos-latest
+ python-version: 3.8
+ - os: windows-latest
+ python-version: 3.8
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- - name: Install dependencies
+ - name: Install tox
run: python -m pip install tox
- name: Test
run: tox -e py
- name: Upload coverage report
uses: codecov/codecov-action@v1
- deploy:
- timeout-minutes: 5
- runs-on: ubuntu-latest
+ wheels:
+ if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
needs: [lint, test]
- if: startsWith(github.ref, 'refs/tags')
+ timeout-minutes: 15
+ strategy:
+ matrix:
+ os: [ubuntu-20.04, windows-2019]
+ runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0 # required for setuptools_scm
- name: Build wheels
- uses: pypa/cibuildwheel@v2.1.1
- with:
- output-dir: dist/
+ uses: pypa/cibuildwheel@v2.1.2
env:
- CIBW_BUILD: "cp3*-manylinux_x86_64"
- CIBW_MANYLINUX_X86_64_IMAGE: manylinux2010
+ CIBW_BUILD: "cp*-manylinux_x86_64 cp3*-win_amd64"
CIBW_ENVIRONMENT: "CFLAGS=-g0"
- - name: Set up Python
- uses: actions/setup-python@v2
+ CIBW_TEST_REQUIRES: "pytest"
+ CIBW_TEST_COMMAND_LINUX: "cd {project} && pytest tests"
+ CIBW_TEST_COMMAND_WINDOWS: "cd /d {project} && pytest tests"
+ - uses: actions/upload-artifact@v2
+ with:
+ name: wheels
+ path: wheelhouse/*.whl
+
+ publish:
+ if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+ needs: [build, wheels]
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/download-artifact@v2
+ with:
+ name: sdist
+ path: dist/
- uses: actions/download-artifact@v2
with:
- python-version: 3.7
- - name: Make distributions
- run: |
- python -m pip install build
- python -m build --sdist
- ls -l dist/
+ name: wheels
+ path: dist/
- name: Publish to PyPI
- uses: pypa/gh-action-pypi-publish@v1.4.1
+ uses: pypa/gh-action-pypi-publish@v1.4.2
with:
user: __token__
password: ${{ secrets.pypi_password }}
=====================================
CHANGES.rst
=====================================
@@ -2,6 +2,17 @@
Changelog
=========
+v0.7.1 (2022-01-26)
+-------------------
+
+* PR #34: Fix parsing of FASTA files that just contain a comment and no reads
+
+v0.7.0 (2022-01-17)
+-------------------
+
+* @rhpvorderman contributed many performance improvements in PRs #15, #17, #18, #20, #21, #22, #23. Reading and writing FASTQ files and reading of paired-end FASTQ files were sped up significantly. For example, reading uncompressed FASTQ is 50% faster (!) than before.
+* PR #28: Windows support added
+
v0.6.0 (2021-09-28)
-------------------
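The PR #34 fix means a FASTA file consisting of nothing but a comment line now parses cleanly instead of raising an error. A minimal sketch of the fixed behaviour (assuming dnaio 0.7.1; fileformat="fasta" is passed explicitly so the example does not rely on auto-detection):

    import io
    import dnaio

    # A comment-only FASTA file: no reads, just a '#' line.
    data = io.BytesIO(b"# only a comment, no reads\n")
    with dnaio.open(data, fileformat="fasta") as reader:
        assert list(reader) == []  # zero records, no error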
=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+python-dnaio (0.7.1-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream version
+
+ -- Andreas Tille <tille@debian.org> Sat, 19 Feb 2022 08:23:26 +0100
+
python-dnaio (0.6.0-2) unstable; urgency=medium
* Team upload.
=====================================
pyproject.toml
=====================================
@@ -1,5 +1,9 @@
[build-system]
-requires = ["setuptools", "wheel", "setuptools_scm", "Cython"]
+requires = ["setuptools >= 45", "wheel", "setuptools_scm >= 6.2", "Cython >= 0.29.20"]
+build-backend = "setuptools.build_meta"
[black.tool]
line-length = 100
+
+[tool.setuptools_scm]
+write_to = "src/dnaio/_version.py"
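With the [tool.setuptools_scm] table and write_to in place, setuptools_scm derives the package version from the Git tag at build time and writes it into src/dnaio/_version.py. The generated module looks roughly like this (a sketch; the exact contents depend on the setuptools_scm version):

    # file generated by setuptools_scm
    # don't change, don't track in version control
    version = '0.7.1'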
=====================================
setup.cfg
=====================================
@@ -0,0 +1,35 @@
+[metadata]
+name = dnaio
+author = Marcel Martin
+author_email = marcel.martin@scilifelab.se
+url = https://github.com/marcelm/dnaio/
+description = Read and write FASTA and FASTQ files efficiently
+long_description = file: README.md
+long_description_content_type = text/markdown
+license = MIT
+classifiers =
+ Development Status :: 5 - Production/Stable
+ Intended Audience :: Science/Research
+ License :: OSI Approved :: MIT License
+ Programming Language :: Cython
+ Programming Language :: Python :: 3
+ Topic :: Scientific/Engineering :: Bio-Informatics
+
+[options]
+python_requires = >=3.6
+package_dir =
+ =src
+packages = find:
+install_requires =
+ xopen >= 1.4.0
+
+[options.packages.find]
+where = src
+
+[options.package_data]
+* = py.typed, *.pyi
+
+[options.extras_require]
+dev =
+ Cython
+ pytest
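After installation, the declarative metadata above is available through the standard library. A small sketch (importlib.metadata requires Python 3.8 or newer):

    from importlib import metadata

    metadata.version("dnaio")   # e.g. "0.7.1"
    metadata.requires("dnaio")  # e.g. ["xopen >= 1.4.0", ...]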
=====================================
setup.py
=====================================
@@ -1,81 +1,11 @@
-import os.path
-from setuptools import setup, Extension, find_packages
-from distutils.command.sdist import sdist
-from distutils.command.build_ext import build_ext
-
-
-def no_cythonize(extensions, **_ignore):
- """Change .pyx to .c or .cpp (copied from Cython documentation)"""
- for extension in extensions:
- sources = []
- for sfile in extension.sources:
- path, ext = os.path.splitext(sfile)
- if ext in ('.pyx', '.py'):
- if extension.language == 'c++':
- ext = '.cpp'
- else:
- ext = '.c'
- sfile = path + ext
- sources.append(sfile)
- extension.sources[:] = sources
-
-
-class BuildExt(build_ext):
- def run(self):
- # If we encounter a PKG-INFO file, then this is likely a .tar.gz/.zip
- # file retrieved from PyPI that already includes the pre-cythonized
- # extension modules, and then we do not need to run cythonize().
- if os.path.exists('PKG-INFO'):
- no_cythonize(self.extensions)
- else:
- # Otherwise, this is a 'developer copy' of the code, and then the
- # only sensible thing is to require Cython to be installed.
- from Cython.Build import cythonize
- self.extensions = cythonize(self.extensions)
- super().run()
-
-
-class SDist(sdist):
- def run(self):
- # Make sure the compiled Cython files in the distribution are up-to-date
- from Cython.Build import cythonize
- cythonize(self.distribution.ext_modules)
- super().run()
-
-
-with open('README.md', encoding='utf-8') as f:
- long_description = f.read()
+from setuptools import setup, Extension
+from Cython.Build import cythonize
+import setuptools_scm # noqa Ensure it’s installed
setup(
- name='dnaio',
- setup_requires=['setuptools_scm'], # Support pip versions that don't know about pyproject.toml
- use_scm_version={'write_to': 'src/dnaio/_version.py'},
- author='Marcel Martin',
- author_email='marcel.martin@scilifelab.se',
- url='https://github.com/marcelm/dnaio/',
- description='Read and write FASTA and FASTQ files efficiently',
- long_description=long_description,
- long_description_content_type='text/markdown',
- license='MIT',
- package_dir={'': 'src'},
- packages=find_packages('src'),
- package_data={"dnaio": ["py.typed", "*.pyi"]},
- extras_require={
- 'dev': ['Cython', 'pytest'],
- },
- ext_modules=[
- Extension('dnaio._core', sources=['src/dnaio/_core.pyx']),
- ],
- cmdclass={'build_ext': BuildExt, 'sdist': SDist},
- install_requires=['xopen>=0.8.2'],
- python_requires='>=3.6',
- classifiers=[
- "Development Status :: 5 - Production/Stable",
- "Intended Audience :: Science/Research",
- "License :: OSI Approved :: MIT License",
- "Natural Language :: English",
- "Programming Language :: Cython",
- "Programming Language :: Python :: 3",
- "Topic :: Scientific/Engineering :: Bio-Informatics"
- ]
+ ext_modules=cythonize(
+ [
+ Extension("dnaio._core", sources=["src/dnaio/_core.pyx"]),
+ ]
+ ),
)
=====================================
src/dnaio/_core.pyx
=====================================
@@ -1,9 +1,15 @@
# cython: language_level=3, emit_code_comments=False
-from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING
-from libc.string cimport strncmp, memcmp, memcpy
+from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING, PyBytes_GET_SIZE
+from cpython.unicode cimport PyUnicode_DecodeLatin1
+from libc.string cimport strncmp, memcmp, memcpy, memchr, strcspn
+from cpython.unicode cimport PyUnicode_GET_LENGTH
cimport cython
+cdef extern from *:
+ unsigned char * PyUnicode_1BYTE_DATA(object o)
+ int PyUnicode_KIND(object o)
+ int PyUnicode_1BYTE_KIND
from .exceptions import FastqFormatError
from ._util import shorten
@@ -34,11 +40,13 @@ cdef class Sequence:
self.sequence = sequence
self.qualities = qualities
+ def __init__(self, str name, str sequence, str qualities = None):
+ # __cinit__ is called first and sets all the variables.
if qualities is not None and len(qualities) != len(sequence):
rname = shorten(name)
raise ValueError("In read named {!r}: length of quality sequence "
- "({}) and length of read ({}) do not match".format(
- rname, len(qualities), len(sequence)))
+ "({}) and length of read ({}) do not match".format(
+ rname, len(qualities), len(sequence)))
def __getitem__(self, key):
"""
@@ -88,62 +96,91 @@ cdef class Sequence:
This is a faster version of qualities.encode('ascii')."""
return self.qualities.encode('ascii')
- def fastq_bytes(self):
+ def fastq_bytes(self, two_headers = False):
"""Return the entire FASTQ record as bytes which can be written
- into a file."""
- # Convert to ASCII bytes sequences first as these have a one-to-one
- # relation between size and number of bytes
- # Unlike decoding, ascii is not slower than latin-1. This is because
- # CPython performs a call to PyUnicodeCheck on both occassions. This
- # determines the type of the Unicode object. In fact, the ascii encode
- # is slightly faster because the check for PyASCIIObject is performed
- # first.
- cdef bytes name = self.name.encode('ascii')
- cdef bytes sequence = self.sequence.encode('ascii')
- cdef bytes qualities = self.qualities.encode('ascii')
- cdef Py_ssize_t name_length = len(name)
- cdef Py_ssize_t sequence_length = len(sequence)
- cdef Py_ssize_t qualities_length = len(qualities)
-
- # Since Cython will generate code above that is a 100% sure to generate
- # bytes objects, we can call Python C-API functions that don't perform
- # checks on the object.
- cdef char * name_ptr = PyBytes_AS_STRING(name)
- cdef char * sequence_ptr = PyBytes_AS_STRING(sequence)
- cdef char * qualities_ptr = PyBytes_AS_STRING(qualities)
+ into a file.
+
+ Optionally the header (after the @) can be repeated on the third line
+ (after the +), when two_headers is enabled."""
+ cdef:
+ char * name
+ char * sequence
+ char * qualities
+ Py_ssize_t name_length
+ Py_ssize_t sequence_length
+ Py_ssize_t qualities_length
+
+ if PyUnicode_KIND(self.name) == PyUnicode_1BYTE_KIND:
+ name = <char *>PyUnicode_1BYTE_DATA(self.name)
+ name_length = <size_t>PyUnicode_GET_LENGTH(self.name)
+ else:
+ # Allow non-ASCII in name
+ name_bytes = self.name.encode('latin-1')
+ name = PyBytes_AS_STRING(name_bytes)
+ name_length = PyBytes_GET_SIZE(name_bytes)
+ if PyUnicode_KIND(self.sequence) == PyUnicode_1BYTE_KIND:
+ sequence = <char *>PyUnicode_1BYTE_DATA(self.sequence)
+ sequence_length = <size_t>PyUnicode_GET_LENGTH(self.sequence)
+ else:
+ # Don't allow non-ASCII in sequence and qualities
+ sequence_bytes = self.sequence.encode('ascii')
+ sequence = PyBytes_AS_STRING(sequence_bytes)
+ sequence_length = PyBytes_GET_SIZE(sequence_bytes)
+ if PyUnicode_KIND(self.qualities) == PyUnicode_1BYTE_KIND:
+ qualities = <char *>PyUnicode_1BYTE_DATA(self.qualities)
+ qualities_length = <size_t>PyUnicode_GET_LENGTH(self.qualities)
+ else:
+ qualities_bytes = self.qualities.encode('ascii')
+ qualities = PyBytes_AS_STRING(qualities_bytes)
+ qualities_length = PyBytes_GET_SIZE(qualities_bytes)
+ return create_fastq_record(name, sequence, qualities,
+ name_length, sequence_length, qualities_length,
+ two_headers)
+ def fastq_bytes_two_headers(self):
+ """
+ Return this record in FASTQ format as a bytes object where the header (after the @) is
+ repeated on the third line.
+ """
+ return self.fastq_bytes(two_headers=True)
+
+
+cdef bytes create_fastq_record(char * name, char * sequence, char * qualities,
+ Py_ssize_t name_length,
+ Py_ssize_t sequence_length,
+ Py_ssize_t qualities_length,
+ bint two_headers = False):
# Total size is name + sequence + qualities + 4 newlines + '+' and an
# '@' to be put in front of the name.
cdef Py_ssize_t total_size = name_length + sequence_length + qualities_length + 6
+ if two_headers:
+ # We need space for the name after the +.
+ total_size += name_length
+
# This is the canonical way to create an uninitialized bytestring of given size
cdef bytes retval = PyBytes_FromStringAndSize(NULL, total_size)
cdef char * retval_ptr = PyBytes_AS_STRING(retval)
# Write the sequences into the bytestring at the correct positions.
- cdef Py_ssize_t cursor
+ cdef size_t cursor
retval_ptr[0] = b"@"
- memcpy(retval_ptr + 1, name_ptr, name_length)
+ memcpy(retval_ptr + 1, name, name_length)
cursor = name_length + 1
retval_ptr[cursor] = b"\n"; cursor += 1
- memcpy(retval_ptr + cursor, sequence_ptr, sequence_length)
+ memcpy(retval_ptr + cursor, sequence, sequence_length)
cursor += sequence_length
retval_ptr[cursor] = b"\n"; cursor += 1
retval_ptr[cursor] = b"+"; cursor += 1
+ if two_headers:
+ memcpy(retval_ptr + cursor, name, name_length)
+ cursor += name_length
retval_ptr[cursor] = b"\n"; cursor += 1
- memcpy(retval_ptr + cursor, qualities_ptr, qualities_length)
+ memcpy(retval_ptr + cursor, qualities, qualities_length)
cursor += qualities_length
retval_ptr[cursor] = b"\n"
return retval
- def fastq_bytes_two_headers(self):
- """
- Return this record in FASTQ format as a bytes object where the header (after the @) is
- repeated on the third line.
- """
- return f"@{self.name}\n{self.sequence}\n+{self.name}\n{self.qualities}\n".encode("ascii")
-
-
# It would be nice to be able to have the first parameter be an
# unsigned char[:] (memory view), but this fails with a BufferError
# when a bytes object is passed in.
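The byte layout that create_fastq_record() assembles with memcpy corresponds to this pure-Python sketch (fastq_record_bytes is a hypothetical helper, for illustration only):

    def fastq_record_bytes(name: bytes, sequence: bytes, qualities: bytes,
                           two_headers: bool = False) -> bytes:
        # Layout: @name\n sequence\n +[name]\n qualities\n
        second = name if two_headers else b""
        return (b"@" + name + b"\n" + sequence + b"\n+"
                + second + b"\n" + qualities + b"\n")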
@@ -209,12 +246,19 @@ def fastq_iter(file, sequence_class, Py_ssize_t buffer_size):
bytearray buf = bytearray(buffer_size)
char[:] buf_view = buf
char* c_buf = buf
- int endskip
str name
- char* name_encoded
- Py_ssize_t bufstart, bufend, pos, record_start, sequence_start
- Py_ssize_t second_header_start, sequence_length, qualities_start
- Py_ssize_t second_header_length, name_length
+ str sequence
+ str qualities
+ Py_ssize_t last_read_position = 0
+ Py_ssize_t record_start = 0
+ Py_ssize_t bufstart, bufend, name_start, name_end, name_length
+ Py_ssize_t sequence_start, sequence_end, sequence_length
+ Py_ssize_t second_header_start, second_header_end, second_header_length
+ Py_ssize_t qualities_start, qualities_end, qualities_length
+ char *name_end_ptr
+ char *sequence_end_ptr
+ char *second_header_end_ptr
+ char *qualities_end_ptr
bint custom_class = sequence_class is not Sequence
Py_ssize_t n_records = 0
bint extra_newline = False
@@ -252,138 +296,182 @@ def fastq_iter(file, sequence_class, Py_ssize_t buffer_size):
bufstart += 1
bufend += 1
extra_newline = True
- else:
- break
+ elif last_read_position > record_start: # Incomplete FASTQ records are present.
+ if extra_newline:
+ # Do not report the linefeed that was added by dnaio but
+ # was not present in the original input.
+ last_read_position -= 1
+ lines = buf[record_start:last_read_position].count(b'\n')
+ raise FastqFormatError(
+ 'Premature end of file encountered. The incomplete final record was: '
+ '{!r}'.format(
+ shorten(buf[record_start:last_read_position].decode('latin-1'),
+ 500)),
+ line=n_records * 4 + lines)
+ else: # EOF Reached. Stop iterating.
+ return
# Parse all complete FASTQ records in this chunk
- pos = 0
record_start = 0
while True:
- # Parse the name (line 0)
- if c_buf[pos] != b'@':
- raise FastqFormatError("Line expected to "
- "start with '@', but found {!r}".format(chr(c_buf[pos])),
- line=n_records * 4)
- pos += 1
- while pos < bufend and c_buf[pos] != b'\n':
- pos += 1
- if pos == bufend:
+ ### Check for a complete record (i.e. 4 newlines are present)
+ # Use libc memchr, this optimizes looking for characters by
+ # using 64-bit integers. See:
+ # https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=string/memchr.c;hb=HEAD
+ # void *memchr(const void *str, int c, size_t n)
+ name_end_ptr = <char *>memchr(c_buf + record_start, b'\n', <size_t>(bufend - record_start))
+ if name_end_ptr == NULL:
break
- endskip = 1 if c_buf[pos-1] == b'\r' else 0
- name_length = pos - endskip - record_start - 1
- name_encoded = c_buf + record_start + 1
- # .decode('latin-1') is 50% faster than .decode('ascii')
- # This is because PyUnicode_DecodeLatin1 is an alias for
- # _PyUnicode_FromUCS1. Which directly copies the bytes into a
- # string object. No operations are taking place. With
- # PyUnicode_DecodeASCII, all characters are checked whether they
- # exceed 128.
- name = c_buf[record_start+1:pos-endskip].decode('latin-1')
-
- pos += 1
-
- # Parse the sequence (line 1)
- sequence_start = pos
- while pos < bufend and c_buf[pos] != b'\n':
- pos += 1
- if pos == bufend:
+ # bufend - sequence_start is always nonnegative:
+ # - name_end is at most bufend - 1
+ # - thus sequence_start is at most bufend
+ name_end = name_end_ptr - c_buf
+ sequence_start = name_end + 1
+ sequence_end_ptr = <char *>memchr(c_buf + sequence_start, b'\n', <size_t>(bufend - sequence_start))
+ if sequence_end_ptr == NULL:
+ break
+ sequence_end = sequence_end_ptr - c_buf
+ second_header_start = sequence_end + 1
+ second_header_end_ptr = <char *>memchr(c_buf + second_header_start, b'\n', <size_t>(bufend - second_header_start))
+ if second_header_end_ptr == NULL:
break
- endskip = 1 if c_buf[pos-1] == b'\r' else 0
- sequence = c_buf[sequence_start:pos-endskip].decode('latin-1')
- sequence_length = pos - endskip - sequence_start
- pos += 1
-
- # Parse second header (line 2)
- second_header_start = pos
- if pos == bufend:
+ second_header_end = second_header_end_ptr - c_buf
+ qualities_start = second_header_end + 1
+ qualities_end_ptr = <char *>memchr(c_buf + qualities_start, b'\n', <size_t>(bufend - qualities_start))
+ if qualities_end_ptr == NULL:
break
- if c_buf[pos] != b'+':
+ qualities_end = qualities_end_ptr - c_buf
+
+ if c_buf[record_start] != b'@':
+ raise FastqFormatError("Line expected to "
+ "start with '@', but found {!r}".format(chr(c_buf[record_start])),
+ line=n_records * 4)
+ if c_buf[second_header_start] != b'+':
raise FastqFormatError("Line expected to "
- "start with '+', but found {!r}".format(chr(c_buf[pos])),
+ "start with '+', but found {!r}".format(chr(c_buf[second_header_start])),
line=n_records * 4 + 2)
- pos += 1 # skip over the '+'
- while pos < bufend and c_buf[pos] != b'\n':
- pos += 1
- if pos == bufend:
- break
- endskip = 1 if c_buf[pos-1] == b'\r' else 0
- second_header_length = pos - endskip - second_header_start - 1
- if second_header_length == 0:
- second_header = False
- else:
+
+ name_start = record_start + 1 # Skip @
+ second_header_start += 1 # Skip +
+ name_length = name_end - name_start
+ sequence_length = sequence_end - sequence_start
+ second_header_length = second_header_end - second_header_start
+ qualities_length = qualities_end - qualities_start
+
+ # Check for \r\n line-endings and compensate
+ if c_buf[name_end - 1] == b'\r':
+ name_length -= 1
+ if c_buf[sequence_end - 1] == b'\r':
+ sequence_length -= 1
+ if c_buf[second_header_end - 1] == b'\r':
+ second_header_length -= 1
+ if c_buf[qualities_end - 1] == b'\r':
+ qualities_length -= 1
+
+ if second_header_length: # should be 0 when only + is present
if (name_length != second_header_length or
- strncmp(c_buf+second_header_start+1,
- name_encoded, second_header_length) != 0):
+ strncmp(c_buf+second_header_start,
+ c_buf + name_start, second_header_length) != 0):
raise FastqFormatError(
"Sequence descriptions don't match ('{}' != '{}').\n"
"The second sequence description must be either "
"empty or equal to the first description.".format(
- name_encoded[:name_length].decode('latin-1'),
- c_buf[second_header_start+1:pos-endskip]
+ c_buf[name_start:name_end].decode('latin-1'),
+ c_buf[second_header_start:second_header_end]
.decode('latin-1')), line=n_records * 4 + 2)
- second_header = True
- pos += 1
-
- # Parse qualities (line 3)
- qualities_start = pos
- while pos < bufend and c_buf[pos] != b'\n':
- pos += 1
- if pos == bufend:
- break
- endskip = 1 if c_buf[pos-1] == b'\r' else 0
- qualities = c_buf[qualities_start:pos-endskip].decode('latin-1')
- if pos - endskip - qualities_start != sequence_length:
- raise FastqFormatError("Length of sequence and "
- "qualities differ", line=n_records * 4 + 3)
- pos += 1
+
+ if qualities_length != sequence_length:
+ raise FastqFormatError(
+ "Length of sequence and qualities differ", line=n_records * 4 + 3)
+
+ ### Copy record into python variables
+ # PyUnicode_DecodeLatin1 is 50% faster than PyUnicode_DecodeASCII.
+ # This is because PyUnicode_DecodeLatin1 is an alias for
+ # _PyUnicode_FromUCS1. Which directly copies the bytes into a
+ # string object after some checks. With PyUnicode_DecodeASCII,
+ # there is an extra check whether characters exceed 128.
+ name = PyUnicode_DecodeLatin1(c_buf + name_start, name_length, 'strict')
+ sequence = PyUnicode_DecodeLatin1(c_buf + sequence_start, sequence_length, 'strict')
+ qualities = PyUnicode_DecodeLatin1(c_buf + qualities_start, qualities_length, 'strict')
+
if n_records == 0:
- yield second_header # first yielded value is special
+ yield bool(second_header_length) # first yielded value is special
if custom_class:
yield sequence_class(name, sequence, qualities)
else:
yield Sequence.__new__(Sequence, name, sequence, qualities)
+
+ ### Advance record to next position
n_records += 1
- record_start = pos
- if pos == bufend:
- break
- if pos == bufend:
- if record_start == 0 and bufend == len(buf):
- # buffer too small, double it
- buffer_size *= 2
- prev_buf = buf
- buf = bytearray(buffer_size)
- buf[0:bufend] = prev_buf
- del prev_buf
- bufstart = bufend
- buf_view = buf
- c_buf = buf
- else:
- bufstart = bufend - record_start
- buf[0:bufstart] = buf[record_start:bufend]
- if pos > record_start:
- if extra_newline:
- pos -= 1
- lines = buf[record_start:pos].count(b'\n')
- raise FastqFormatError(
- 'Premature end of file encountered. The incomplete final record was: '
- '{!r}'.format(shorten(buf[record_start:pos].decode('latin-1'), 500)),
- line=n_records * 4 + lines)
+ record_start = qualities_end + 1
+ # bufend reached
+ last_read_position = bufend
+ if record_start == 0 and bufend == len(buf):
+ # buffer too small, double it
+ buffer_size *= 2
+ prev_buf = buf
+ buf = bytearray(buffer_size)
+ buf[0:bufend] = prev_buf
+ del prev_buf
+ bufstart = bufend
+ buf_view = buf
+ c_buf = buf
+ else:
+ bufstart = bufend - record_start
+ buf[0:bufstart] = buf[record_start:bufend]
def record_names_match(header1: str, header2: str):
"""
Check whether the sequence record ids id1 and id2 are compatible, ignoring a
suffix of '1', '2' or '3'. This exception allows checking some old
- paired-end reads that have names ending in '/1' and '/2'. Also, the
+ paired-end reads that have IDs ending in '/1' and '/2'. Also, the
fastq-dump tool (used for converting SRA files to FASTQ) appends '.1', '.2'
and sometimes '.3' to paired-end reads if option -I is used.
"""
- cdef:
- str name1, name2
-
- name1 = header1.split()[0]
- name2 = header2.split()[0]
- if name1 and name2 and name1[-1] in '123' and name2[-1] in '123':
- name1 = name1[:-1]
- name2 = name2[:-1]
- return name1 == name2
+ if (
+ PyUnicode_KIND(header1) != PyUnicode_1BYTE_KIND or
+ PyUnicode_KIND(header2) != PyUnicode_1BYTE_KIND
+ ):
+ # Fall back to slower code path.
+ name1 = header1.split(maxsplit=1)[0]
+ name2 = header2.split(maxsplit=1)[0]
+ if name1 and name2 and name1[-1] in '123' and name2[-1] in '123':
+ return name1[:-1] == name2[:-1]
+ return name1 == name2
+ # Do not call .encode functions but use the unicode pointer inside the
+ # python object directly, provided it is in 1-byte encoding, so we can
+ # find the spaces and tabs easily.
+ cdef char * header1_chars = <char *>PyUnicode_1BYTE_DATA(header1)
+ cdef char * header2_chars = <char *>PyUnicode_1BYTE_DATA(header2)
+ cdef size_t header1_length = <size_t>PyUnicode_GET_LENGTH(header1)
+ return record_ids_match(header1_chars, header2_chars, header1_length)
+
+
+cdef bint record_ids_match(char *header1, char *header2, size_t header1_length):
+ """
+ Check whether the ASCII-encoded IDs match. Only header1_length is needed.
+ """
+ # Only the read ID is of interest.
+ # Find the first tab or space, if not present, strcspn will return the
+ # position of the terminating NULL byte. (I.e. the length).
+ # Header1 is not searched because we can reuse the end of ID position of
+ # header2 as header1's ID should end at the same position.
+ cdef size_t id2_length = strcspn(header2, b' \t')
+
+ if header1_length < id2_length:
+ return False
+
+ cdef char end = header1[id2_length]
+ if end != b'\000' and end != b' ' and end != b'\t':
+ return False
+
+ # Check if the IDs end with 1, 2 or 3. This is the read pair number
+ # which should not be included in the comparison.
+ cdef bint id1endswithnumber = b'1' <= header1[id2_length - 1] <= b'3'
+ cdef bint id2endswithnumber = b'1' <= header2[id2_length - 1] <= b'3'
+ if id1endswithnumber and id2endswithnumber:
+ id2_length -= 1
+
+ # Compare the strings up to the ID end position.
+ return memcmp(<void *>header1, <void *>header2, id2_length) == 0
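In short, record_ids_match() compares only the IDs (everything up to the first space or tab, located with strcspn) and ignores a trailing read-pair number when both IDs carry one. A usage sketch of the wrapper defined above:

    from dnaio._core import record_names_match

    assert record_names_match("read1/1 some comment", "read1/2")  # IDs match
    assert not record_names_match("readA", "readB")               # IDs differ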
=====================================
src/dnaio/chunks.py
=====================================
@@ -16,9 +16,15 @@ def _fasta_head(buf: bytes, end: Optional[int] = None) -> int:
pos = buf.rfind(b'\n>', 0, end)
if pos != -1:
return pos + 1
- if buf[0:1] == b'>':
+ if buf[0:1] == b'>' or buf[0:1] == b'#':
return 0
- raise FastaFormatError('File does not start with ">"', line=None)
+ if len(buf) == 0:
+ return 0
+ c = chr(buf[0])
+ raise FastaFormatError(
+ f"FASTA file expected to start with '>', but found {repr(c)}",
+ line=None,
+ )
def _fastq_head(buf: bytes, end: Optional[int] = None) -> int:
@@ -52,15 +58,19 @@ def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memory
# Read one byte to determine file format.
# If there is a comment char, we assume FASTA!
start = f.readinto(memoryview(buf)[0:1])
- if start == 1 and buf[0:1] == b'@':
- head = _fastq_head
- elif start == 1 and (buf[0:1] == b'#' or buf[0:1] == b'>'):
- head = _fasta_head
- elif start == 0:
+ if start == 0:
# Empty file
return
+ assert start == 1
+ if buf[0:1] == b'@':
+ head = _fastq_head
+ elif buf[0:1] == b'#' or buf[0:1] == b'>':
+ head = _fasta_head
else:
- raise UnknownFileFormat('Input file format unknown')
+ raise UnknownFileFormat(
+ f"Cannnot determine input file format: First character expected to be '>' or '@', "
+ f"but found {repr(chr(buf[0]))}"
+ )
# Layout of buf
#
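read_chunks() yields memoryviews that always end on a record boundary, which is what makes the chunks safe to hand to worker processes without splitting a record. A usage sketch (process() is a hypothetical consumer):

    from io import BytesIO
    from dnaio.chunks import read_chunks

    def process(chunk: bytes) -> None:
        ...  # hypothetical: e.g. send the chunk to a worker process

    data = BytesIO(b"@r1\nACGT\n+\nIIII\n@r2\nTTTT\n+\nIIII\n")
    for chunk in read_chunks(data, buffer_size=16 * 1024):
        process(bytes(chunk))  # each chunk ends at a record boundary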
=====================================
src/dnaio/pairedend.py
=====================================
@@ -1,3 +1,4 @@
+import itertools
from contextlib import ExitStack
from os import PathLike
from typing import Union, BinaryIO, Optional, Iterator, Tuple
@@ -47,26 +48,24 @@ class TwoFilePairedEndReader(PairedEndReader):
"""
Iterate over the paired reads. Each item is a pair of Sequence objects.
"""
- # Avoid usage of zip() below since it will consume one item too many.
- it1, it2 = iter(self.reader1), iter(self.reader2)
- while True:
- try:
- r1 = next(it1)
- except StopIteration:
- # End of file 1. Make sure that file 2 is also at end.
- try:
- next(it2)
- raise FileFormatError(
- "Reads are improperly paired. There are more reads in "
- "file 2 than in file 1.",
- line=None,
- ) from None
- except StopIteration:
- pass
- break
- try:
- r2 = next(it2)
- except StopIteration:
+ # Avoid usage of zip() below since it will consume one item too many,
+ # when one of the iterators is exhausted. zip in python 3.10 has a
+ # 'strict' keyword that can be used to prevent this and throw an error,
+ # but it will take a long time for 3.10 or higher to be available on
+ # everyone's machine.
+ # Instead use zip_longest from itertools. This yields None if one of
+ the iterators is exhausted. Checking for None identity is fast.
+ # So we can quickly check if the iterator is still yielding.
+ # This is faster than implementing a while loop with next calls,
+ # which requires expensive function lookups.
+ for r1, r2 in itertools.zip_longest(self.reader1, self.reader2):
+ if r1 is None:
+ raise FileFormatError(
+ "Reads are improperly paired. There are more reads in "
+ "file 2 than in file 1.",
+ line=None,
+ ) from None
+ if r2 is None:
raise FileFormatError(
"Reads are improperly paired. There are more reads in "
"file 1 than in file 2.",
=====================================
src/dnaio/readers.py
=====================================
@@ -121,7 +121,7 @@ class FastqReader(BinaryFileReader, SingleEndReader):
file: Union[str, BinaryIO],
*,
sequence_class=Sequence,
- buffer_size: int = 1048576,
+ buffer_size: int = 128 * 1024, # Buffer size used by cat, pigz etc.
opener=xopen,
_close_file: Optional[bool] = None,
):
=====================================
src/dnaio/singleend.py
=====================================
@@ -112,10 +112,11 @@ def _detect_format_from_content(file: BinaryIO) -> Optional[str]:
Return 'fasta', 'fastq' or None
"""
if file.seekable():
+ original_position = file.tell()
first_char = file.read(1)
- if file.tell() > 0:
- file.seek(-1, 1)
+ file.seek(original_position)
else:
+ # We cannot always use peek() because BytesIO objects do not support it
first_char = file.peek(1)[0:1] # type: ignore
formats = {
b"@": "fastq",
=====================================
tests/test_chunks.py
=====================================
@@ -2,7 +2,33 @@ from pytest import raises
from io import BytesIO
from dnaio._core import paired_fastq_heads
-from dnaio.chunks import _fastq_head, read_chunks, read_paired_chunks
+from dnaio.chunks import _fastq_head, _fasta_head, read_chunks, read_paired_chunks
+
+
+def test_fasta_head():
+ assert _fasta_head(b'') == 0
+ assert _fasta_head(b'>1\n') == 0
+ assert _fasta_head(b'>1\n3') == 0
+ assert _fasta_head(b'>1\n3\n') == 0
+ assert _fasta_head(b'>1\n3\n>') == 5
+ assert _fasta_head(b'>1\n3\n>6') == 5
+ assert _fasta_head(b'>1\n3\n>6\n') == 5
+ assert _fasta_head(b'>1\n3\n>6\n8') == 5
+ assert _fasta_head(b'>1\n3\n>6\n8\n') == 5
+ assert _fasta_head(b'>1\n3\n>6\n8\n0') == 5
+ assert _fasta_head(b'>1\n3\n>6\n8\n0\n') == 5
+ assert _fasta_head(b'>1\n3\n>6\n8\n0\n>') == 12
+
+
+def test_fasta_head_with_comment():
+ assert _fasta_head(b'#') == 0
+ assert _fasta_head(b'#\n') == 0
+ assert _fasta_head(b'#\n>') == 2
+ assert _fasta_head(b'#\n>3') == 2
+ assert _fasta_head(b'#\n>3\n') == 2
+ assert _fasta_head(b'#\n>3\n5') == 2
+ assert _fasta_head(b'#\n>3\n5\n') == 2
+ assert _fasta_head(b'#\n>3\n5\n>') == 7
def test_paired_fastq_heads():
=====================================
tests/test_internal.py
=====================================
@@ -239,7 +239,7 @@ class TestFastqReader:
list(fq)
def test_second_header_not_equal(self):
- fastq = BytesIO(b'@r1\nACG\n+xy\n')
+ fastq = BytesIO(b'@r1\nACG\n+xy\nXXX\n')
with raises(FastqFormatError) as info:
with FastqReader(fastq) as fq:
list(fq) # pragma: no cover
@@ -482,6 +482,11 @@ class TestPairedSequenceReader:
assert match('abc def', 'abc ghi')
assert match('abc', 'abc ghi')
assert not match('abc', 'xyz')
+ assert match('abc\tdef', 'abc')
+ assert match('abc\tdef', 'abc\tghi')
+ assert match('abc somecomment\tanothercomment', 'abc andanothercomment\tbla')
+ assert match('abc\tcomments comments', 'abc\tothers others')
+ assert match('abc\tdef', 'abc def')
def test_record_names_match_with_ignored_trailing_12(self):
match = record_names_match
@@ -521,6 +526,15 @@ class TestPairedSequenceReader:
list(psr)
assert "There are more reads in file 1 than in file 2" in info.value.message
+ def test_empty_sequences_do_not_stop_iteration(self):
+ s1 = BytesIO(b'@r1_1\nACG\n+\nHHH\n@r2_1\nACG\n+\nHHH\n@r3_2\nACG\n+\nHHH\n')
+ s2 = BytesIO(b'@r1_1\nACG\n+\nHHH\n@r2_2\n\n+\n\n@r3_2\nACG\n+\nHHH\n')
+ # Second sequence for s2 is empty but valid. Should not lead to a stop of iteration.
+ with TwoFilePairedEndReader(s1, s2) as psr:
+ seqs = list(psr)
+ print(seqs)
+ assert len(seqs) == 3
+
def test_incorrectly_paired(self):
s1 = BytesIO(b'@r1/1\nACG\n+\nHHH\n')
s2 = BytesIO(b'@wrong_name\nTTT\n+\nHHH\n')
@@ -531,18 +545,21 @@ class TestPairedSequenceReader:
@mark.parametrize('path', [
- 'tests/data/simple.fastq',
- 'tests/data/dos.fastq',
- 'tests/data/simple.fasta',
- 'tests/data/with_comment.fasta',
+ os.path.join('tests', 'data', 'simple.fastq'),
+ os.path.join('tests', 'data', 'dos.fastq'),
+ os.path.join('tests', 'data', 'simple.fasta'),
+ os.path.join('tests', 'data', 'with_comment.fasta'),
])
def test_read_stdin(path):
# Get number of records in the input file
with dnaio.open(path) as f:
expected = len(list(f))
- # Use 'cat' to simulate that no file name is available for stdin of the subprocess
- with subprocess.Popen(['cat', path], stdout=subprocess.PIPE) as cat:
+ # Use piping from a separate subprocess to force the input file name to be unavailable
+ cmd = "type" if sys.platform == "win32" else "cat"
+ with subprocess.Popen(
+ [cmd, path], stdout=subprocess.PIPE, shell=sys.platform == "win32"
+ ) as cat:
with subprocess.Popen(
[sys.executable, 'tests/read_from_stdin.py'], stdin=cat.stdout, stdout=subprocess.PIPE
) as py:
=====================================
tox.ini
=====================================
@@ -1,6 +1,6 @@
[tox]
-envlist = flake8,mypy,py36,py37,py38,py39
-requires = Cython>=0.29.13
+envlist = flake8,mypy,py36,py37,py38,py39,py310
+isolated_build = True
[testenv]
deps =
View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/-/compare/ec0066b8b6e9334440e0d82ed774937d4514f646...0f3b72fc4ed67d190b1f3eea27f380deee9aaec7
--
You're receiving this email because of your account on salsa.debian.org.