[med-svn] [Git][med-team/python-dnaio][upstream] New upstream version 0.4.2
Steffen Möller
gitlab at salsa.debian.org
Thu Apr 30 21:53:13 BST 2020
Steffen Möller pushed to branch upstream at Debian Med / python-dnaio
Commits:
850ea9ad by Steffen Moeller at 2020-04-30T22:47:56+02:00
New upstream version 0.4.2
- - - - -
4 changed files:
- src/dnaio/__init__.py
- src/dnaio/_core.pyx
- src/dnaio/writers.py
- tests/test_internal.py
Changes:
=====================================
src/dnaio/__init__.py
=====================================
@@ -28,7 +28,7 @@ import pathlib
from xopen import xopen
-from ._core import Sequence
+from ._core import Sequence, record_names_match as _record_names_match
from .readers import FastaReader, FastqReader
from .writers import FastaWriter, FastqWriter
from .exceptions import UnknownFileFormat, FileFormatError, FastaFormatError, FastqFormatError
@@ -212,21 +212,6 @@ def _detect_format_from_content(file):
return formats.get(first_char, None)
-def _sequence_names_match(r1, r2):
- """
- Check whether the sequence records r1 and r2 have identical names, ignoring a
- suffix of '1' or '2'. Some old paired-end reads have names that end in '/1'
- and '/2'. Also, the fastq-dump tool (used for converting SRA files to FASTQ)
- appends a .1 and .2 to paired-end reads if option -I is used.
- """
- name1 = r1.name.split(None, 1)[0]
- name2 = r2.name.split(None, 1)[0]
- if name1[-1:] in '12' and name2[-1:] in '12':
- name1 = name1[:-1]
- name2 = name2[:-1]
- return name1 == name2
-
-
class PairedSequenceReader:
"""
Read paired-end reads from two files.
@@ -268,7 +253,7 @@ class PairedSequenceReader:
raise FileFormatError(
"Reads are improperly paired. There are more reads in "
"file 1 than in file 2.", line=None) from None
- if not _sequence_names_match(r1, r2):
+ if not _record_names_match(r1.name, r2.name):
raise FileFormatError(
"Reads are improperly paired. Read name '{}' "
"in file 1 does not match '{}' in file 2.".format(r1.name, r2.name), line=None) from None
@@ -303,7 +288,7 @@ class InterleavedSequenceReader:
raise FileFormatError(
"Interleaved input file incomplete: Last record "
"{!r} has no partner.".format(r1.name), line=None) from None
- if not _sequence_names_match(r1, r2):
+ if not _record_names_match(r1.name, r2.name):
raise FileFormatError(
"Reads are improperly paired. Name {!r} "
"(first) does not match {!r} (second).".format(r1.name, r2.name), line=None)
=====================================
src/dnaio/_core.pyx
=====================================
@@ -1,6 +1,6 @@
# cython: language_level=3, emit_code_comments=False
-from libc.string cimport strncmp
+from libc.string cimport strncmp, memcmp
cimport cython
from .exceptions import FastqFormatError
@@ -62,6 +62,16 @@ cdef class Sequence:
def __reduce__(self):
return (Sequence, (self.name, self.sequence, self.qualities))
+ def fastq_bytes(self):
+ s = ('@' + self.name + '\n' + self.sequence + '\n+\n'
+ + self.qualities + '\n')
+ return s.encode('ascii')
+
+ def fastq_bytes_two_headers(self):
+ s = ('@' + self.name + '\n' + self.sequence + '\n+'
+ + self.name + '\n' + self.qualities + '\n')
+ return s.encode('ascii')
+
# It would be nice to be able to have the first parameter be an
# unsigned char[:] (memory view), but this fails with a BufferError
@@ -282,3 +292,21 @@ def fastq_iter(file, sequence_class, Py_ssize_t buffer_size):
'Premature end of file encountered. The incomplete final record was: '
'{!r}'.format(shorten(buf[record_start:pos].decode('latin-1'), 500)),
line=n_records * 4 + lines)
+
+
+def record_names_match(header1: str, header2: str):
+ """
+ Check whether the sequence record ids in header1 and header2 are compatible, ignoring a
+ suffix of '1' or '2'. Some old paired-end reads have names that end in '/1'
+ and '/2'. Also, the fastq-dump tool (used for converting SRA files to FASTQ)
+ appends a .1 and .2 to paired-end reads if option -I is used.
+ """
+ # TODO optimize this a bit more
+ cdef:
+ str name1, name2
+ name1 = header1.split()[0]
+ name2 = header2.split()[0]
+ if name1[-1:] in '12' and name2[-1:] in '12':
+ name1 = name1[:-1]
+ name2 = name2[:-1]
+ return name1 == name2
=====================================
src/dnaio/writers.py
=====================================
@@ -81,17 +81,21 @@ class FastqWriter(FileWriter):
def __init__(self, file, two_headers=False, opener=xopen, _close_file=None):
super().__init__(file, opener=opener, _close_file=_close_file)
self._two_headers = two_headers
+ self.write = self._write_two_headers if self._two_headers else self._write
- def write(self, record):
+ def _write(self, record):
"""
Write a Sequence record to the FASTQ file.
- The record object must have attributes .name, .sequence and .qualities.
"""
- name2 = record.name if self._two_headers else ''
- s = ('@' + record.name + '\n' + record.sequence + '\n+'
- + name2 + '\n' + record.qualities + '\n')
- self._file.write(s.encode('ascii'))
+ self._file.write(record.fastq_bytes())
+
+ def _write_two_headers(self, record):
+ """
+ Write a Sequence record to the FASTQ file, repeating the header
+ in the third line after the "+".
+ """
+ self._file.write(record.fastq_bytes_two_headers())
def writeseq(self, name, sequence, qualities):
self._file.write("@{0:s}\n{1:s}\n+\n{2:s}\n".format(
=====================================
tests/test_internal.py
=====================================
@@ -14,7 +14,7 @@ from dnaio import (
FastaReader, FastqReader, InterleavedSequenceReader,
FastaWriter, FastqWriter, InterleavedSequenceWriter,
PairedSequenceReader)
-from dnaio import _sequence_names_match, Sequence
+from dnaio import _record_names_match, Sequence
# files tests/data/simple.fast{q,a}
@@ -462,12 +462,8 @@ class TestPairedSequenceReader:
(Sequence("r1", "ACG", "HHH"), Sequence("r2", "GTT", "858")),
] == list(psr)
- def test_sequence_names_match(self):
- def match(name1, name2):
- seq1 = Sequence(name1, 'ACGT')
- seq2 = Sequence(name2, 'AACC')
- return _sequence_names_match(seq1, seq2)
-
+ def test_record_names_match(self):
+ match = _record_names_match
assert match('abc', 'abc')
assert match('abc/1', 'abc/2')
assert match('abc.1', 'abc.2')
View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/-/commit/850ea9ad1ba0e6e63c8315deec863b4288833f17
--
View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/-/commit/850ea9ad1ba0e6e63c8315deec863b4288833f17
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200430/e3d43335/attachment-0001.html>
More information about the debian-med-commit mailing list