[med-svn] [Git][med-team/python-dnaio][upstream] New upstream version 1.2.3

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Sat Nov 30 18:26:49 GMT 2024



Étienne Mollier pushed to branch upstream at Debian Med / python-dnaio


Commits:
94e13724 by Étienne Mollier at 2024-11-30T19:21:28+01:00
New upstream version 1.2.3
- - - - -


16 changed files:

- .github/workflows/ci.yml
- CHANGES.rst
- README.rst
- pyproject.toml
- + src/dnaio/_bam.py
- src/dnaio/_core.pyi
- src/dnaio/_core.pyx
- src/dnaio/chunks.py
- src/dnaio/multipleend.py
- src/dnaio/readers.py
- src/dnaio/singleend.py
- + tests/data/missing_header_no_bgzip_raw_bam_bytes
- tests/test_chunks.py
- tests/test_internal.py
- tests/test_open.py
- tox.ini


Changes:

=====================================
.github/workflows/ci.yml
=====================================
@@ -49,17 +49,13 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest]
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        os: [ubuntu-latest, windows-latest]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
         include:
         - os: macos-13
           python-version: "3.10"
         - os: macos-14
           python-version: "3.10"
-        - os: windows-latest
-          python-version: "3.10"
-        - os: ubuntu-latest
-          python-version: "3.10"
     steps:
    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
@@ -88,11 +84,9 @@ jobs:
       with:
         fetch-depth: 0  # required for setuptools_scm
     - name: Build wheels
-      uses: pypa/cibuildwheel@v2.17.0
+      uses: pypa/cibuildwheel@v2.21.3
       env:
         CIBW_BUILD: "cp*-manylinux_x86_64 cp3*-win_amd64 cp3*-macosx_x86_64 cp3*-macosx_arm64"
-        CIBW_SKIP: "cp37-*"
-        CIBW_TEST_SKIP: "cp38-macosx_*:arm64"
- uses: actions/upload-artifact@v4
       with:
         name: wheels-${{ matrix.os }}


=====================================
CHANGES.rst
=====================================
@@ -2,6 +2,13 @@
 Changelog
 =========
 
+v1.2.3 (2024-11-12)
+-------------------
+
+* :pr:`141`: Allow chunked reading of uBAM.
+* Added support for Python 3.13
+* Dropped support for Python 3.8
+
 v1.2.2 (2024-10-04)
 -------------------
 * :pr:`139`: Fix an error that occurred when decoding BAM records with missing


=====================================
README.rst
=====================================
@@ -16,7 +16,7 @@
 dnaio processes FASTQ, FASTA and uBAM files
 ===========================================
 
-``dnaio`` is a Python 3.8+ library for very efficient parsing and writing of FASTQ and also FASTA files.
+``dnaio`` is a Python 3.9+ library for very efficient parsing and writing of FASTQ and also FASTA files.
 Since ``dnaio`` version 1.1.0, support for efficiently parsing uBAM files has been implemented.
 This allows reading ONT files from the `dorado <https://github.com/nanoporetech/dorado>`_
 basecaller directly.


=====================================
pyproject.toml
=====================================
@@ -19,7 +19,7 @@ classifiers = [
     "Programming Language :: Python :: 3",
     "Topic :: Scientific/Engineering :: Bio-Informatics"
 ]
-requires-python = ">3.7"
+requires-python = ">=3.9"
 dependencies = [
     "xopen >= 1.4.0"
 ]


=====================================
src/dnaio/_bam.py
=====================================
@@ -0,0 +1,43 @@
+import struct
+from io import BufferedIOBase
+
+
+def read_bam_header(fileobj: BufferedIOBase) -> bytes:
+    magic = fileobj.read(4)
+    if not isinstance(magic, bytes):
+        raise TypeError(
+            f"fileobj {fileobj} (type: {type(fileobj)}), was not opened in binary mode."
+        )
+    if len(magic) < 4:
+        raise EOFError("Truncated BAM file")
+    if magic[:4] != b"BAM\1":
+        raise ValueError(
+            f"fileobj: {fileobj}, is not a BAM file. No BAM file signature, instead "
+            f"found {magic!r}"
+        )
+    return read_bam_header_after_magic(fileobj)
+
+
+def read_bam_header_after_magic(fileobj: BufferedIOBase) -> bytes:
+    header_size = fileobj.read(4)
+    if len(header_size) < 4:
+        raise EOFError("Truncated BAM file")
+    (l_text,) = struct.unpack("<I", header_size)
+    header = fileobj.read(l_text)
+    if len(header) < l_text:
+        raise EOFError("Truncated BAM file")
+    n_ref_obj = fileobj.read(4)
+    if len(n_ref_obj) < 4:
+        raise EOFError("Truncated BAM file")
+    (n_ref,) = struct.unpack("<I", n_ref_obj)
+    for i in range(n_ref):
+        l_name_obj = fileobj.read(4)
+        if len(l_name_obj) < 4:
+            raise EOFError("Truncated BAM file")
+        (l_name,) = struct.unpack("<I", l_name_obj)
+        reference_chunk_size = l_name + 4  # Include name and uint32_t of size
+        reference_chunk = fileobj.read(reference_chunk_size)
+        if len(reference_chunk) < reference_chunk_size:
+            raise EOFError("Truncated BAM file")
+    # Fileobj is now skipped ahead and at the start of the BAM records
+    return header


=====================================
src/dnaio/_core.pyi
=====================================
@@ -1,3 +1,4 @@
+import sys
 from typing import (
     Generic,
     Optional,
@@ -36,6 +37,7 @@ class SequenceRecord:
 def paired_fastq_heads(
     buf1: ByteString, buf2: ByteString, end1: int, end2: int
 ) -> Tuple[int, int]: ...
+def bam_head(buf: bytes, end: Optional[int] = None) -> int: ...
 def records_are_mates(
     __first_record: SequenceRecord,
     __second_record: SequenceRecord,
@@ -54,7 +56,7 @@ class FastqIter(Generic[T]):
     def number_of_records(self) -> int: ...
 
 class BamIter:
-    def __init__(self, file: BinaryIO, buffer_size: int): ...
+    def __init__(self, file: BinaryIO, read_in_size: int, with_header: bool = True): ...
     def __iter__(self) -> Iterator[SequenceRecord]: ...
     def __next__(self) -> SequenceRecord: ...
     @property


=====================================
src/dnaio/_core.pyx
=====================================
@@ -5,6 +5,7 @@ from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING, PyBytes
 from cpython.mem cimport PyMem_Free, PyMem_Malloc, PyMem_Realloc
 from cpython.unicode cimport PyUnicode_CheckExact, PyUnicode_GET_LENGTH, PyUnicode_DecodeASCII
 from cpython.object cimport Py_TYPE, PyTypeObject
+from cpython.pyport cimport PY_SSIZE_T_MAX
 from cpython.ref cimport PyObject
 from cpython.tuple cimport PyTuple_GET_ITEM
 from libc.string cimport memcmp, memcpy, memchr, strcspn, strspn, memmove
@@ -12,6 +13,8 @@ from libc.stdint cimport uint8_t, uint16_t, uint32_t, int32_t
 
 cimport cython
 
+from ._bam import read_bam_header
+
 cdef extern from "Python.h":
     void *PyUnicode_DATA(object o)
     bint PyUnicode_IS_COMPACT_ASCII(object o)
@@ -431,6 +434,29 @@ def paired_fastq_heads(buf1, buf2, Py_ssize_t end1, Py_ssize_t end2):
     return record_start1 - data1, record_start2 - data2
 
 
+def bam_head(buf, end = None):
+    """Return the end of the last complete BAM record in the buf."""
+    cdef Py_ssize_t c_end = PY_SSIZE_T_MAX
+    if end is not None:
+        c_end = end
+    cdef Py_buffer buffer
+    PyObject_GetBuffer(buf, &buffer, PyBUF_SIMPLE)
+    cdef:
+        uint8_t *buffer_start = <uint8_t *>buffer.buf
+        uint8_t *record_start = buffer_start
+        uint8_t *buffer_end = buffer_start + min(c_end, buffer.len)
+        uint32_t block_size
+        size_t record_size
+
+    while (record_start + 4) < buffer_end:
+        record_size = (<uint32_t *>record_start)[0] + 4
+        if (record_start + record_size) > buffer_end:
+            break
+        record_start += record_size
+    cdef Py_ssize_t head = <Py_ssize_t>(record_start - buffer_start)
+    PyBuffer_Release(&buffer)
+    return head
+
 cdef class FastqIter:
     """
     Parse a FASTQ file and yield SequenceRecord objects
@@ -688,6 +714,22 @@ cdef struct BamRecordHeader:
     int32_t tlen
 
 cdef class BamIter:
+    """
+    Parse a uBAM file and yield SequenceRecord objects
+
+    Arguments:
+        file: a file-like object, opened in binary mode (it must have a readinto
+            method)
+
+        buffer_size: size of the initial buffer. This is automatically grown
+            if a BAM record is encountered that does not fit.
+
+        with_header: The BAM file has a header that needs parsing. Default is True.
+            False can be used in circumstances where chunks of BAM records are read.
+
+    Yields:
+        SequenceRecord Objects
+    """
     cdef:
         uint8_t *record_start
         uint8_t *buffer_end
@@ -701,42 +743,16 @@ cdef class BamIter:
     def __dealloc__(self):
         PyMem_Free(self.read_in_buffer)
 
-    def __cinit__(self, fileobj, read_in_size = 48 * 1024):
+    def __cinit__(self, fileobj, read_in_size = 48 * 1024, with_header = True):
         if read_in_size < 4:
             raise ValueError(f"read_in_size must be at least 4 got "
                               f"{read_in_size}")
 
-        # Skip ahead and save the BAM header for later inspection
-        magic_and_header_size = fileobj.read(8)
-        if not isinstance(magic_and_header_size, bytes):
-            raise TypeError(f"fileobj {fileobj} is not a binary IO type, "
-                            f"got {type(fileobj)}")
-        if len(magic_and_header_size) < 8:
-            raise EOFError("Truncated BAM file")
-        if magic_and_header_size[:4] != b"BAM\1":
-            raise ValueError(
-                f"fileobj: {fileobj}, is not a BAM file. No BAM magic, instead "
-                f"found {magic_and_header_size[:4]}")
-        l_text = int.from_bytes(magic_and_header_size[4:], "little", signed=False)
-        header =  fileobj.read(l_text)
-        if len(header) < l_text:
-            raise EOFError("Truncated BAM file")
-        n_ref_obj = fileobj.read(4)
-        if len(n_ref_obj) < 4:
-            raise EOFError("Truncated BAM file")
-        n_ref = int.from_bytes(n_ref_obj, "little", signed=False)
-        for i in range(n_ref):
-            l_name_obj = fileobj.read(4)
-            if len(l_name_obj) < 4:
-                raise EOFError("Truncated BAM file")
-            l_name = int.from_bytes(l_name_obj, "little", signed=False)
-            reference_chunk_size = l_name + 4  # Include name and uint32_t of size
-            reference_chunk = fileobj.read(reference_chunk_size)
-            if len(reference_chunk) < reference_chunk_size:
-                raise EOFError("Truncated BAM file")
-        # Fileobj is now skipped ahead and at the start of the BAM records
-
-        self.header = header
+        if with_header:
+            # Skip ahead and save the BAM header for later inspection
+            self.header = read_bam_header(fileobj)
+        else:
+            self.header = b""
         self.read_in_size = read_in_size
         self.file = fileobj
         self.read_in_buffer = NULL
@@ -746,9 +762,9 @@ cdef class BamIter:
 
     def __iter__(self):
         return self
-    
+
     cdef _read_into_buffer(self):
-        cdef size_t read_in_size 
+        cdef size_t read_in_size
         cdef size_t leftover_size = self.buffer_end - self.record_start
         cdef uint32_t block_size
         memmove(self.read_in_buffer, self.record_start, leftover_size)
@@ -769,7 +785,7 @@ cdef class BamIter:
             raise StopIteration()
         elif new_bytes_size == 0:
             raise EOFError("Incomplete record at the end of file")
-        cdef uint8_t *tmp 
+        cdef uint8_t *tmp
         if new_buffer_size > self.read_in_buffer_size:
             tmp = <uint8_t *>PyMem_Realloc(self.read_in_buffer, new_buffer_size)
             if tmp == NULL:
@@ -843,7 +859,7 @@ cdef class BamIter:
 def record_names_match(header1: str, header2: str):
     """
     Check whether the sequence record ids id1 and id2 are compatible, ignoring a
-    suffix of '1', '2' or '3'. This exception allows to check some old
+    suffix of '1', '2' or '3'. This exception allows one to check some old
     paired-end reads that have IDs ending in '/1' and '/2'. Also, the
     fastq-dump tool (used for converting SRA files to FASTQ) appends '.1', '.2'
     and sometimes '.3' to paired-end reads if option -I is used.


=====================================
src/dnaio/chunks.py
=====================================
@@ -6,10 +6,12 @@ without actually parsing the records. The chunks can then be distributed to work
 or subprocess and be parsed and processed there.
 """
 
-from io import RawIOBase
+from io import BufferedIOBase
 from typing import Optional, Iterator, Tuple
 
+from ._bam import read_bam_header_after_magic
 from ._core import paired_fastq_heads as _paired_fastq_heads
+from ._core import bam_head as _bam_head
 from .exceptions import FileFormatError, FastaFormatError, UnknownFileFormat
 
 
@@ -79,7 +81,9 @@ def _fastq_head(buf: bytes, end: Optional[int] = None) -> int:
     return right + 1  # type: ignore
 
 
-def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memoryview]:
+def read_chunks(
+    f: BufferedIOBase, buffer_size: int = 4 * 1024**2
+) -> Iterator[memoryview]:
     """
     Read chunks of complete FASTA or FASTQ records from a file.
     If the format is detected to be FASTQ, all chunks except possibly the last contain
@@ -104,19 +108,23 @@ def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memory
 
     # Read one byte to determine file format.
     # If there is a comment char, we assume FASTA!
-    start = f.readinto(memoryview(buf)[0:1])
+    start = f.readinto(memoryview(buf)[0:4])
     if start == 0:
         # Empty file
         return
-    assert start == 1
+    assert start == 4
     if buf[0:1] == b"@":
         head = _fastq_head
     elif buf[0:1] == b"#" or buf[0:1] == b">":
         head = _fasta_head
+    elif buf[0:4] == b"BAM\x01":
+        head = _bam_head
+        _ = read_bam_header_after_magic(f)
+        start = 0  # Skip header and start at the records.
     else:
         raise UnknownFileFormat(
-            f"Cannnot determine input file format: First character expected to be '>' or '@', "
-            f"but found {repr(chr(buf[0]))}"
+            f"Cannot determine input file format: First characters expected "
+            f"to be '>', '@', or 'BAM\1', but found {repr(buf[0:4])}"
         )
 
     # Layout of buf
@@ -135,7 +143,7 @@ def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memory
     while True:
         if start == len(buf):
             raise OverflowError("FASTA/FASTQ record does not fit into buffer")
-        bufend = f.readinto(memoryview(buf)[start:]) + start  # type: ignore
+        bufend = f.readinto(memoryview(buf)[start:]) + start
         if start == bufend:
             # End of file
             break
@@ -152,8 +160,8 @@ def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memory
 
 
 def read_paired_chunks(
-    f: RawIOBase,
-    f2: RawIOBase,
+    f: BufferedIOBase,
+    f2: BufferedIOBase,
     buffer_size: int = 4 * 1024**2,
 ) -> Iterator[Tuple[memoryview, memoryview]]:
     """
@@ -222,8 +230,8 @@ def read_paired_chunks(
             raise ValueError(
                 f"FASTA/FASTQ records do not fit into buffer of size {buffer_size}"
             )
-        bufend1 = f.readinto(memoryview(buf1)[start1:]) + start1  # type: ignore
-        bufend2 = f2.readinto(memoryview(buf2)[start2:]) + start2  # type: ignore
+        bufend1 = f.readinto(memoryview(buf1)[start1:]) + start1
+        bufend2 = f2.readinto(memoryview(buf2)[start2:]) + start2
         if start1 == bufend1 and start2 == bufend2:
             break
 


=====================================
src/dnaio/multipleend.py
=====================================
@@ -202,7 +202,7 @@ class MultipleFastqWriter(MultipleFileWriter):
         self._stack = contextlib.ExitStack()
         self._writers: List[IO] = [
             self._stack.enter_context(
-                opener(file, mode + "b") if not hasattr(file, "write") else file
+                opener(file, mode + "b") if not hasattr(file, "write") else file  # type: ignore
             )
             for file in self._files
         ]


=====================================
src/dnaio/readers.py
=====================================
@@ -212,13 +212,16 @@ class BamReader(BinaryFileReader, SingleEndReader):
         buffer_size: int = 128 * 1024,  # Buffer size used by cat, pigz etc.
         opener=xopen,
         _close_file: Optional[bool] = None,
+        with_header: bool = True,
     ):
         super().__init__(file, opener=opener, _close_file=_close_file)
         self.sequence_class = sequence_class
         self.delivers_qualities = True
         self.buffer_size = buffer_size
         try:
-            self._iter: Iterator[SequenceRecord] = BamIter(self._file, self.buffer_size)
+            self._iter: Iterator[SequenceRecord] = BamIter(
+                self._file, read_in_size=self.buffer_size, with_header=with_header
+            )
         except Exception:
             self.close()
             raise


=====================================
src/dnaio/singleend.py
=====================================
@@ -6,7 +6,7 @@ from .readers import BamReader, FastaReader, FastqReader
 from .writers import FastaWriter, FastqWriter
 
 
-def _open_single(
+def _open_single(  # noqa: C901
     file_or_path: Union[str, os.PathLike, BinaryIO],
     opener,
     *,
@@ -64,6 +64,11 @@ def _open_single(
             return BamReader(file, _close_file=close_file)
         # This should not be reached
         raise NotImplementedError("Only reading is supported for BAM files")
+    elif fileformat == "bam_no_header":
+        if "r" in mode:
+            return BamReader(file, _close_file=close_file, with_header=False)
+        # This should not be reached
+        raise NotImplementedError("Only reading is supported for headerless BAM files")
     if close_file:
         file.close()
     raise UnknownFileFormat(
@@ -121,13 +126,13 @@ def _detect_format_from_content(file: BinaryIO) -> Optional[str]:
     """
     Return 'fasta', 'fastq' or None
     """
-    if file.seekable():
+    if hasattr(file, "peek"):
+        magic = file.peek(4)[0:4]
+    else:
+        # We cannot always use peek() because BytesIO objects do not support it
         original_position = file.tell()
         magic = file.read(4)
         file.seek(original_position)
-    else:
-        # We cannot always use peek() because BytesIO objects do not suppert it
-        magic = file.peek(4)[0:4]  # type: ignore
     if magic.startswith(b"@") or magic == b"":
         # Pretend FASTQ for empty input
         return "fastq"


=====================================
tests/data/missing_header_no_bgzip_raw_bam_bytes
=====================================
Binary files /dev/null and b/tests/data/missing_header_no_bgzip_raw_bam_bytes differ


=====================================
tests/test_chunks.py
=====================================
@@ -1,3 +1,5 @@
+import gzip
+import struct
 import textwrap
 
 from pytest import raises
@@ -7,6 +9,7 @@ import dnaio
 from dnaio import UnknownFileFormat, FileFormatError
 from dnaio._core import paired_fastq_heads
 from dnaio.chunks import (
+    _bam_head,
     _fastq_head,
     _fasta_head,
     read_chunks,
@@ -41,6 +44,13 @@ def test_fasta_head_with_comment():
     assert _fasta_head(b"#\n>3\n5\n>") == 7
 
 
+def test_bam_head():
+    assert _bam_head(struct.pack("<Ix", 32)) == 0
+    assert _bam_head(struct.pack("<I32x", 32)) == 36
+    assert _bam_head(struct.pack("<I32xI32x", 32, 33)) == 36
+    assert _bam_head(struct.pack("<I32xI33x", 32, 33)) == 73
+
+
 def test_paired_fasta_heads():
     def pheads(buf1, buf2):
         return _paired_fasta_heads(buf1, buf2, len(buf1), len(buf2))
@@ -179,3 +189,15 @@ def test_read_chunks_empty():
 def test_invalid_file_format():
     with raises(UnknownFileFormat):
         list(read_chunks(BytesIO(b"invalid format")))
+
+
+def test_read_chunks_bam():
+    with gzip.open("tests/data/simple.unaligned.bam", "rb") as f:
+        for i, chunk in enumerate(read_chunks(f, 70)):
+            if i == 0:
+                assert b"Myheader" in chunk.tobytes()
+            if i == 1:
+                assert b"AnotherHeader" in chunk.tobytes()
+            if i == 2:
+                assert b"YetAnotherHeader" in chunk.tobytes()
+            assert b"RGZA\x00" in chunk.tobytes()


=====================================
tests/test_internal.py
=====================================
@@ -33,7 +33,6 @@ from dnaio.writers import FileWriter
 from dnaio.readers import BinaryFileReader
 from dnaio._core import bytes_ascii_check
 
-
 TEST_DATA = Path(__file__).parent / "data"
 SIMPLE_FASTQ = str(TEST_DATA / "simple.fastq")
 # files tests/data/simple.fast{q,a}
@@ -786,7 +785,7 @@ class TestBamReader:
         )
         with pytest.raises(TypeError) as error:
             BamReader(file)
-        error.match("binary IO")
+        error.match("binary mode")
 
     @pytest.mark.parametrize("buffersize", [4, 8, 10, 20, 40])
     def test_small_buffersize(self, buffersize):


=====================================
tests/test_open.py
=====================================
@@ -1,3 +1,4 @@
+import io
 import os
 from pathlib import Path
 
@@ -194,6 +195,15 @@ def test_detect_bam_from_filename() -> None:
         assert record.name == "Myheader"
 
 
+def test_read_raw_bam_no_header_from_memory() -> None:
+    with open("tests/data/missing_header_no_bgzip_raw_bam_bytes", "rb") as f:
+        raw_bam = f.read()
+    in_memory_bam = io.BytesIO(raw_bam)
+    with dnaio.open(in_memory_bam, fileformat="bam_no_header") as f:
+        record = next(iter(f))
+        assert record.name == "Myheader"
+
+
 def test_write(tmp_path, extension) -> None:
     out_fastq = tmp_path / ("out.fastq" + extension)
     with dnaio.open(str(out_fastq), mode="w") as f:


=====================================
tox.ini
=====================================
@@ -1,5 +1,5 @@
 [tox]
-envlist = flake8,black,mypy,docs,py38,py39,py310,py311,py312
+envlist = flake8,black,mypy,docs,py39,py310,py311,py312,py313
 isolated_build = True
 
 [testenv]



View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/-/commit/94e13724aa9f69253fbe17d676811b74c7bb419c

-- 
View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/-/commit/94e13724aa9f69253fbe17d676811b74c7bb419c
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20241130/670b5baa/attachment-0001.htm>


More information about the debian-med-commit mailing list