[med-svn] [Git][med-team/python-dnaio][upstream] New upstream version 0.4.1
Andreas Tille
gitlab at salsa.debian.org
Sun Nov 17 15:24:19 GMT 2019
Andreas Tille pushed to branch upstream at Debian Med / python-dnaio
Commits:
6f93bd69 by Andreas Tille at 2019-11-17T15:18:27Z
New upstream version 0.4.1
- - - - -
9 changed files:
- .travis.yml
- setup.py
- src/dnaio/__init__.py
- src/dnaio/chunks.py
- src/dnaio/readers.py
- src/dnaio/writers.py
- tests/test_internal.py
- tests/test_open.py
- tox.ini
Changes:
=====================================
.travis.yml
=====================================
@@ -11,6 +11,7 @@ python:
- "3.5"
- "3.6"
- "3.7"
+ - "3.8"
- "nightly"
install:
@@ -45,5 +46,10 @@ jobs:
ls -l dist/
python3 -m twine upload dist/*
+ - name: flake8
+ python: "3.6"
+ install: python3 -m pip install flake8
+ script: flake8 src/ tests/
+
allow_failures:
- python: "nightly"
=====================================
setup.py
=====================================
@@ -77,7 +77,7 @@ setup(
install_requires=['xopen>=0.8.2'],
python_requires='>=3.4',
classifiers=[
- "Development Status :: 3 - Alpha",
+ "Development Status :: 4 - Beta",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
=====================================
src/dnaio/__init__.py
=====================================
@@ -16,6 +16,9 @@ __all__ = [
'InterleavedSequenceReader',
'InterleavedSequenceWriter',
'PairedSequenceReader',
+ 'read_chunks',
+ 'read_paired_chunks',
+ '__version__',
]
import os
@@ -45,7 +48,9 @@ except ImportError:
return path
-def open(file1, *, file2=None, fileformat=None, interleaved=False, mode='r', qualities=None):
+def open(
+ file1, *, file2=None, fileformat=None, interleaved=False, mode="r", qualities=None, opener=xopen
+):
"""
Open sequence files in FASTA or FASTQ format for reading or writing. This is
a factory that returns an instance of one of the ...Reader or ...Writer
@@ -71,32 +76,37 @@ def open(file1, *, file2=None, fileformat=None, interleaved=False, mode='r', qua
appropriately.
* When False (no qualities available), an exception is raised when the
auto-detected output format is FASTQ.
+
+ opener -- A function that is used to open file1 and file2 if they are not
+ already open file-like objects. By default, xopen is used, which can
+ also open compressed file formats.
"""
if mode not in ("r", "w", "a"):
raise ValueError("Mode must be 'r', 'w' or 'a'")
if interleaved and file2 is not None:
raise ValueError("When interleaved is set, file2 must be None")
+
if file2 is not None:
if mode in "wa" and file1 == file2:
raise ValueError("The paired-end output files are identical")
if mode == "r":
- return PairedSequenceReader(file1, file2, fileformat)
+ return PairedSequenceReader(file1, file2, fileformat, opener=opener)
elif mode == "w":
- return PairedSequenceWriter(file1, file2, fileformat, qualities)
+ return PairedSequenceWriter(file1, file2, fileformat, qualities, opener=opener)
else:
- return PairedSequenceAppender(file1, file2, fileformat, qualities)
+ return PairedSequenceAppender(file1, file2, fileformat, qualities, opener=opener)
if interleaved:
if mode == "r":
- return InterleavedSequenceReader(file1, fileformat)
+ return InterleavedSequenceReader(file1, fileformat, opener=opener)
elif mode == "w":
- return InterleavedSequenceWriter(file1, fileformat, qualities)
+ return InterleavedSequenceWriter(file1, fileformat, qualities, opener=opener)
else:
- return InterleavedSequenceAppender(file1, fileformat, qualities)
+ return InterleavedSequenceAppender(file1, fileformat, qualities, opener=opener)
# The multi-file options have been dealt with, delegate rest to the
# single-file function.
return _open_single(
- file1, fileformat=fileformat, mode=mode, qualities=qualities)
+ file1, opener=opener, fileformat=fileformat, mode=mode, qualities=qualities)
def _detect_format_from_name(name):
@@ -118,16 +128,16 @@ def _detect_format_from_name(name):
return None
-def _open_single(file, *, fileformat=None, mode='r', qualities=None):
+def _open_single(file, opener, *, fileformat=None, mode="r", qualities=None):
"""
Open a single sequence file. See description of open() above.
"""
if mode not in ("r", "w", "a"):
raise ValueError("Mode must be 'r', 'w' or 'a'")
- if isinstance(file, (str, pathlib.Path)):
+ if isinstance(file, (str, pathlib.Path)): # TODO Use os.PathLike in Python 3.6+
path = fspath(file)
- file = xopen(path, mode + 'b')
+ file = opener(path, mode + "b")
close_file = True
else:
if mode == 'r' and not hasattr(file, 'readinto'):
@@ -164,30 +174,18 @@ def _open_single(file, *, fileformat=None, mode='r', qualities=None):
fileformat = 'fastq' if qualities else 'fasta'
if mode == 'r' and fileformat is None:
- # No format detected so far. Try to read from the file.
- if file.seekable():
- first_char = file.read(1)
- file.seek(-1, 1)
- else:
- first_char = file.peek(1)[0:1]
- formats = {
- b'@': 'fastq',
- b'>': 'fasta',
- b'#': 'fasta', # Some FASTA variants allow comments
- b'': 'fastq', # Pretend FASTQ for empty input
- }
- try:
- fileformat = formats[first_char]
- except KeyError:
+ fileformat = _detect_format_from_content(file)
+ if fileformat is None:
raise UnknownFileFormat(
'Could not determine whether file {!r} is FASTA or FASTQ. The file extension was '
- 'not available or not recognized and the first character in the file ({!r}) is '
- 'unexpected.'.format(file, first_char))
+ 'not available or not recognized and the first character in the file is '
+ 'unexpected.'.format(file))
if fileformat is None:
assert mode == 'w'
extra = " because the output file name is not available" if path is None else ""
- raise UnknownFileFormat("Auto-detection of the output file format (FASTA/FASTQ) failed" + extra)
+ raise UnknownFileFormat(
+ "Auto-detection of the output file format (FASTA/FASTQ) failed" + extra)
if fileformat == 'fastq' and mode in "wa" and qualities is False:
raise ValueError(
@@ -196,6 +194,24 @@ def _open_single(file, *, fileformat=None, mode='r', qualities=None):
return handlers[fileformat](file)
+def _detect_format_from_content(file):
+ """
+ Return 'fasta', 'fastq' or None
+ """
+ if file.seekable():
+ first_char = file.read(1)
+ file.seek(-1, 1)
+ else:
+ first_char = file.peek(1)[0:1]
+ formats = {
+ b'@': 'fastq',
+ b'>': 'fasta',
+ b'#': 'fasta', # Some FASTA variants allow comments
+ b'': 'fastq', # Pretend FASTQ for empty input
+ }
+ return formats.get(first_char, None)
+
+
def _sequence_names_match(r1, r2):
"""
Check whether the sequence records r1 and r2 have identical names, ignoring a
@@ -220,10 +236,10 @@ class PairedSequenceReader:
"""
paired = True
- def __init__(self, file1, file2, fileformat=None):
+ def __init__(self, file1, file2, fileformat=None, opener=xopen):
with ExitStack() as stack:
- self.reader1 = stack.enter_context(_open_single(file1, fileformat=fileformat))
- self.reader2 = stack.enter_context(_open_single(file2, fileformat=fileformat))
+ self.reader1 = stack.enter_context(_open_single(file1, opener=opener, fileformat=fileformat))
+ self.reader2 = stack.enter_context(_open_single(file2, opener=opener, fileformat=fileformat))
self._close = stack.pop_all().close
self.delivers_qualities = self.reader1.delivers_qualities
@@ -240,7 +256,8 @@ class PairedSequenceReader:
# End of file 1. Make sure that file 2 is also at end.
try:
next(it2)
- raise FileFormatError("Reads are improperly paired. There are more reads in "
+ raise FileFormatError(
+ "Reads are improperly paired. There are more reads in "
"file 2 than in file 1.", line=None) from None
except StopIteration:
pass
@@ -248,10 +265,12 @@ class PairedSequenceReader:
try:
r2 = next(it2)
except StopIteration:
- raise FileFormatError("Reads are improperly paired. There are more reads in "
+ raise FileFormatError(
+ "Reads are improperly paired. There are more reads in "
"file 1 than in file 2.", line=None) from None
if not _sequence_names_match(r1, r2):
- raise FileFormatError("Reads are improperly paired. Read name '{}' "
+ raise FileFormatError(
+ "Reads are improperly paired. Read name '{}' "
"in file 1 does not match '{}' in file 2.".format(r1.name, r2.name), line=None) from None
yield (r1, r2)
@@ -271,8 +290,8 @@ class InterleavedSequenceReader:
"""
paired = True
- def __init__(self, file, fileformat=None):
- self.reader = _open_single(file, fileformat=fileformat)
+ def __init__(self, file, fileformat=None, opener=xopen):
+ self.reader = _open_single(file, opener=opener, fileformat=fileformat)
self.delivers_qualities = self.reader.delivers_qualities
def __iter__(self):
@@ -281,10 +300,12 @@ class InterleavedSequenceReader:
try:
r2 = next(it)
except StopIteration:
- raise FileFormatError("Interleaved input file incomplete: Last record "
+ raise FileFormatError(
+ "Interleaved input file incomplete: Last record "
"{!r} has no partner.".format(r1.name), line=None) from None
if not _sequence_names_match(r1, r2):
- raise FileFormatError("Reads are improperly paired. Name {!r} "
+ raise FileFormatError(
+ "Reads are improperly paired. Name {!r} "
"(first) does not match {!r} (second).".format(r1.name, r2.name), line=None)
yield (r1, r2)
@@ -301,12 +322,14 @@ class InterleavedSequenceReader:
class PairedSequenceWriter:
_mode = "w"
- def __init__(self, file1, file2, fileformat='fastq', qualities=None):
+ def __init__(self, file1, file2, fileformat='fastq', qualities=None, opener=xopen):
with ExitStack() as stack:
- self._writer1 = stack.enter_context(_open_single(file1, fileformat=fileformat, mode=self._mode,
- qualities=qualities))
- self._writer2 = stack.enter_context(_open_single(file2, fileformat=fileformat, mode=self._mode,
- qualities=qualities))
+ self._writer1 = stack.enter_context(
+ _open_single(
+ file1, opener=opener, fileformat=fileformat, mode=self._mode, qualities=qualities))
+ self._writer2 = stack.enter_context(
+ _open_single(
+ file2, opener=opener, fileformat=fileformat, mode=self._mode, qualities=qualities))
self._close = stack.pop_all().close
def write(self, read1, read2):
@@ -334,10 +357,10 @@ class InterleavedSequenceWriter:
"""
_mode = "w"
- def __init__(self, file, fileformat='fastq', qualities=None):
+ def __init__(self, file, fileformat='fastq', qualities=None, opener=xopen):
self._writer = _open_single(
- file, fileformat=fileformat, mode=self._mode, qualities=qualities)
+ file, opener=opener, fileformat=fileformat, mode=self._mode, qualities=qualities)
def write(self, read1, read2):
self._writer.write(read1)
=====================================
src/dnaio/chunks.py
=====================================
@@ -103,7 +103,8 @@ def read_paired_chunks(f, f2, buffer_size=4*1024**2):
start1 = f.readinto(memoryview(buf1)[0:1])
start2 = f2.readinto(memoryview(buf2)[0:1])
if (start1 == 1 and buf1[0:1] != b'@') or (start2 == 1 and buf2[0:1] != b'@'):
- raise FileFormatError('Paired-end data must be in FASTQ format when using multiple cores', line=None)
+ raise FileFormatError(
+ "Paired-end data must be in FASTQ format when using multiple cores", line=None)
while True:
if start1 == len(buf1) or start2 == len(buf2):
=====================================
src/dnaio/readers.py
=====================================
@@ -18,13 +18,13 @@ class BinaryFileReader:
paired = False
mode = 'rb'
- def __init__(self, file, _close_file=None):
+ def __init__(self, file, opener=xopen, _close_file=None):
"""
The file is a path or a file-like object. In both cases, the file may
be compressed (.gz, .bz2, .xz).
"""
if isinstance(file, str):
- file = xopen(file, self.mode)
+ file = opener(file, self.mode)
self._close_on_exit = True
elif _close_file:
self._close_on_exit = True
@@ -49,14 +49,14 @@ class FastaReader(BinaryFileReader):
Reader for FASTA files.
"""
- def __init__(self, file, keep_linebreaks=False, sequence_class=Sequence, _close_file=None):
+ def __init__(self, file, keep_linebreaks=False, sequence_class=Sequence, opener=xopen, _close_file=None):
"""
file is a path or a file-like object. In both cases, the file may
be compressed (.gz, .bz2, .xz).
keep_linebreaks -- whether to keep newline characters in the sequence
"""
- super().__init__(file, _close_file=_close_file)
+ super().__init__(file, opener=opener, _close_file=_close_file)
self.sequence_class = sequence_class
self.delivers_qualities = False
self._delimiter = '\n' if keep_linebreaks else ''
@@ -83,8 +83,9 @@ class FastaReader(BinaryFileReader):
elif name is not None:
seq.append(line)
else:
- raise FastaFormatError("Expected '>' at beginning of "
- "record, but got {!r}.".format(_shorten(line)), line=i)
+ raise FastaFormatError(
+ "Expected '>' at beginning of record, but got {!r}."
+ .format(_shorten(line)), line=i)
if name is not None:
yield self.sequence_class(name, self._delimiter.join(seq), None)
@@ -97,12 +98,12 @@ class FastqReader(BinaryFileReader):
Reader for FASTQ files. Does not support multi-line FASTQ files.
"""
- def __init__(self, file, sequence_class=Sequence, buffer_size=1048576, _close_file=None):
+ def __init__(self, file, sequence_class=Sequence, buffer_size=1048576, opener=xopen, _close_file=None):
"""
file is a filename or a file-like object.
If file is a filename, then .gz files are supported.
"""
- super().__init__(file, _close_file=_close_file)
+ super().__init__(file, opener=opener, _close_file=_close_file)
self.sequence_class = sequence_class
self.delivers_qualities = True
self.buffer_size = buffer_size
=====================================
src/dnaio/writers.py
=====================================
@@ -2,10 +2,10 @@ from xopen import xopen
class FileWriter:
- def __init__(self, file, _close_file=None):
+ def __init__(self, file, opener=xopen, _close_file=None):
self._file = file
if isinstance(file, str):
- self._file = xopen(file, 'wb')
+ self._file = opener(file, "wb")
self._close_on_exit = True
else:
self._close_on_exit = bool(_close_file)
@@ -28,12 +28,12 @@ class FastaWriter(FileWriter):
Write FASTA-formatted sequences to a file.
"""
- def __init__(self, file, line_length=None, _close_file=None):
+ def __init__(self, file, line_length=None, opener=xopen, _close_file=None):
"""
If line_length is not None, the lines will
be wrapped after line_length characters.
"""
- super().__init__(file, _close_file=_close_file)
+ super().__init__(file, opener=opener, _close_file=_close_file)
self.line_length = line_length if line_length != 0 else None
def write(self, name_or_record, sequence=None):
@@ -78,8 +78,8 @@ class FastqWriter(FileWriter):
"""
file_mode = 'wb'
- def __init__(self, file, two_headers=False, _close_file=None):
- super().__init__(file, _close_file=_close_file)
+ def __init__(self, file, two_headers=False, opener=xopen, _close_file=None):
+ super().__init__(file, opener=opener, _close_file=_close_file)
self._two_headers = two_headers
def write(self, record):
=====================================
tests/test_internal.py
=====================================
@@ -82,18 +82,18 @@ class TestFastaReader:
filename = "tests/data/simple.fasta"
with open(filename, 'rb') as f:
assert not f.closed
- reads = list(dnaio.open(f))
+ _ = list(dnaio.open(f))
assert not f.closed
assert f.closed
with FastaReader(filename) as sr:
tmp_sr = sr
assert not sr._file.closed
- reads = list(sr)
+ _ = list(sr)
assert not sr._file.closed
assert tmp_sr._file is None
# Open it a second time
- with FastaReader(filename) as sr:
+ with FastaReader(filename):
pass
@@ -112,7 +112,7 @@ class TestFastqReader:
def test_fastqreader_buffersize_too_small(self):
with raises(ValueError):
with FastqReader("tests/data/simple.fastq", buffer_size=0) as f:
- reads = list(f) # pragma: no cover
+ _ = list(f) # pragma: no cover
def test_fastqreader_dos(self):
# DOS line breaks
@@ -212,7 +212,7 @@ class TestFastqReader:
with FastqReader(filename) as sr:
tmp_sr = sr
assert not sr._file.closed
- reads = list(sr)
+ _ = list(sr)
assert not sr._file.closed
assert tmp_sr._file is None
@@ -445,7 +445,12 @@ class TestInterleavedWriter:
with InterleavedSequenceWriter(bio) as writer:
for read1, read2 in reads:
writer.write(read1, read2)
- assert bio.getvalue() == b'@A/1 comment\nTTA\n+\n##H\n@A/2 comment\nGCT\n+\nHH#\n@B/1\nCC\n+\nHH\n@B/2\nTG\n+\n#H\n'
+ assert bio.getvalue() == (
+ b'@A/1 comment\nTTA\n+\n##H\n'
+ b'@A/2 comment\nGCT\n+\nHH#\n'
+ b'@B/1\nCC\n+\nHH\n'
+ b'@B/2\nTG\n+\n#H\n'
+ )
class TestPairedSequenceReader:
=====================================
tests/test_open.py
=====================================
@@ -62,6 +62,49 @@ def test_read_pathlib_path(fileformat, extension):
assert records == SIMPLE_RECORDS[fileformat]
+def test_read_opener(fileformat, extension):
+ def my_opener(path, mode):
+ import io
+ if fileformat == "fasta":
+ data = b">read\nACG\n"
+ else:
+ data = b"@read\nACG\n+\nHHH\n"
+ return io.BytesIO(data)
+
+ with dnaio.open("totally-ignored-filename." + fileformat + extension, opener=my_opener) as f:
+ records = list(f)
+ assert len(records) == 1
+ assert records[0].name == "read"
+ assert records[0].sequence == "ACG"
+
+
+@pytest.mark.parametrize("interleaved", [False, True])
+def test_paired_opener(fileformat, extension, interleaved):
+ def my_opener(_path, _mode):
+ import io
+ if fileformat == "fasta":
+ data = b">read\nACG\n"
+ else:
+ data = b"@read\nACG\n+\nHHH\n"
+ return io.BytesIO(data + data)
+
+ path1 = "ignored-filename." + fileformat + extension
+ path2 = "also-ignored-filename." + fileformat + extension
+ if interleaved:
+ with dnaio.open(path1, file2=path2, opener=my_opener) as f:
+ records = list(f)
+ expected = 2
+ else:
+ with dnaio.open(path1, interleaved=True, opener=my_opener) as f:
+ records = list(f)
+ expected = 1
+ assert len(records) == expected
+ assert records[0][0].name == "read"
+ assert records[0][0].sequence == "ACG"
+ assert records[0][1].name == "read"
+ assert records[0][1].sequence == "ACG"
+
+
def test_detect_fastq_from_content():
"""FASTQ file that is not named .fastq"""
with dnaio.open('tests/data/missingextension') as f:
@@ -115,8 +158,8 @@ def test_write_pathlib(tmpdir, fileformat, extension):
def test_write_paired_same_path(tmpdir):
path1 = str(tmpdir / "same.fastq")
path2 = str(tmpdir / "same.fastq")
- with pytest.raises(ValueError) as e:
- with dnaio.open(file1=path1, file2=path2, mode="w") as f:
+ with pytest.raises(ValueError):
+ with dnaio.open(file1=path1, file2=path2, mode="w"):
pass
=====================================
tox.ini
=====================================
@@ -1,5 +1,5 @@
[tox]
-envlist = py34,py35,py36,py37
+envlist = flake8,py34,py35,py36,py37,py38
[testenv]
deps =
@@ -10,6 +10,11 @@ commands =
coverage combine
coverage report
+[testenv:flake8]
+basepython = python3.6
+deps = flake8
+commands = flake8 src/ tests/
+
[coverage:run]
parallel = True
include =
@@ -20,3 +25,7 @@ include =
source =
src/
*/site-packages/
+
+[flake8]
+max-line-length = 110
+max-complexity = 15
View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/commit/6f93bd69eb9b8774209182f268b650e2dd47f219
--
View it on GitLab: https://salsa.debian.org/med-team/python-dnaio/commit/6f93bd69eb9b8774209182f268b650e2dd47f219
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20191117/d07e945d/attachment-0001.html>
More information about the debian-med-commit
mailing list