[med-svn] [Git][med-team/python-pyfaidx][upstream] New upstream version 0.5.7
Steffen Möller
gitlab at salsa.debian.org
Sat Jan 25 17:58:26 GMT 2020
Steffen Möller pushed to branch upstream at Debian Med / python-pyfaidx
Commits:
739f7379 by Steffen Moeller at 2020-01-01T19:58:11+01:00
New upstream version 0.5.7
- - - - -
7 changed files:
- .travis.yml
- README.rst
- pyfaidx/__init__.py
- pyfaidx/cli.py
- setup.py
- tests/test_FastaRecord.py
- tests/test_feature_bounds_check.py
Changes:
=====================================
.travis.yml
=====================================
@@ -7,9 +7,7 @@ python:
- '3.6'
- '3.5'
- '3.4'
- - '3.3'
- '2.7'
- - '2.6'
- 'pypy'
- 'pypy3'
install:
=====================================
README.rst
=====================================
@@ -1,4 +1,4 @@
-|Travis| |PyPI| |Landscape| |Coverage| |Depsy| |Appveyor|
+|Travis| |PyPI| |Coverage| |Depsy|
Description
-----------
@@ -54,7 +54,8 @@ Acts like a dictionary.
.. code:: python
- >>> genes.keys() ('AB821309.1', 'KF435150.1', 'KF435149.1', 'NR_104216.1', 'NR_104215.1', 'NR_104212.1', 'NM_001282545.1', 'NM_001282543.1', 'NM_000465.3', 'NM_001282549.1', 'NM_001282548.1', 'XM_005249645.1', 'XM_005249644.1', 'XM_005249643.1', 'XM_005249642.1', 'XM_005265508.1', 'XM_005265507.1', 'XR_241081.1', 'XR_241080.1', 'XR_241079.1')
+ >>> genes.keys()
+ ('AB821309.1', 'KF435150.1', 'KF435149.1', 'NR_104216.1', 'NR_104215.1', 'NR_104212.1', 'NM_001282545.1', 'NM_001282543.1', 'NM_000465.3', 'NM_001282549.1', 'NM_001282548.1', 'XM_005249645.1', 'XM_005249644.1', 'XM_005249643.1', 'XM_005249642.1', 'XM_005265508.1', 'XM_005265507.1', 'XR_241081.1', 'XR_241080.1', 'XR_241079.1')
>>> genes['NM_001282543.1'][200:230]
>NM_001282543.1:201-230
@@ -544,7 +545,7 @@ Support for compressed FASTA
----------------------------
``pyfaidx`` can create and read ``.fai`` indices for FASTA files that have
-been compressed using the `bgzip <http://www.htslib.org/doc/tabix.html>`_
+been compressed using the `bgzip <https://www.htslib.org/doc/bgzip.html>`_
tool from `samtools <http://www.htslib.org/>`_. ``bgzip`` writes compressed
data in a ``BGZF`` format. ``BGZF`` is ``gzip`` compatible, consisting of
multiple concatenated ``gzip`` blocks, each with an additional ``gzip``
=====================================
pyfaidx/__init__.py
=====================================
@@ -25,7 +25,7 @@ if sys.version_info > (3, ):
dna_bases = re.compile(r'([ACTGNactgnYRWSKMDVHBXyrwskmdvhbx]+)')
-__version__ = '0.5.5.2'
+__version__ = '0.5.7'
class KeyFunctionError(ValueError):
@@ -123,7 +123,7 @@ class Sequence(object):
>chr1
AC
"""
- if self.start is None or self.end is None:
+ if self.start is None or self.end is None or len(self.seq) == 0:
correction_factor = 0
elif len(
self.seq
@@ -461,7 +461,7 @@ class Faidx(object):
rname, rlen, offset, lenc, lenb = line.split('\t')
rlen, offset, lenc, lenb = map(int,
(rlen, offset, lenc, lenb))
- newlines = int(ceil(rlen / lenc) * (lenb - lenc))
+ newlines = int(ceil(rlen / lenc) * (lenb - lenc)) if lenc else 0
bend = offset + newlines + rlen
rec = IndexRecord(rlen, offset, lenc, lenb, bend,
prev_bend)
@@ -508,8 +508,8 @@ class Faidx(object):
rname = None # reference sequence name
offset = 0 # binary offset of end of current line
rlen = 0 # reference character length
- blen = None # binary line length (includes newline)
- clen = None # character line length
+ blen = 0 # binary line length (includes newline)
+ clen = 0 # character line length
bad_lines = [] # lines > || < than blen
thisoffset = offset
valid_entry = False
@@ -535,9 +535,9 @@ class Faidx(object):
"Inconsistent line found in >{0} at "
"line {1:n}.".format(
rname, bad_lines[0][0] + 1))
- blen = None
+ blen = 0
rlen = 0
- clen = None
+ clen = 0
bad_lines = []
try: # must catch empty deflines (actually these might be okay: https://github.com/samtools/htslib/pull/258)
rname = line.rstrip('\n\r')[1:].split()[
@@ -648,8 +648,8 @@ class Faidx(object):
# Calculate offset (https://github.com/samtools/htslib/blob/20238f354894775ed22156cdd077bc0d544fa933/faidx.c#L398)
newlines_before = int(
- (start0 - 1) / i.lenc * (i.lenb - i.lenc)) if start0 > 0 else 0
- newlines_to_end = int(end / i.lenc * (i.lenb - i.lenc))
+ (start0 - 1) / i.lenc * (i.lenb - i.lenc)) if start0 > 0 and i.lenc else 0
+ newlines_to_end = int(end / i.lenc * (i.lenb - i.lenc)) if i.lenc else 0
newlines_inside = newlines_to_end - newlines_before
seq_blen = newlines_inside + seq_len
bstart = i.offset + newlines_before + start0
@@ -669,12 +669,15 @@ class Faidx(object):
else:
self.file.seek(bstart)
+ # If the requested sequence exceeds len(FastaRecord), return as much as possible
if bstart + seq_blen > i.bend and not self.strict_bounds:
seq_blen = i.bend - bstart
-
+ # Otherwise it should be safe to read the sequence
if seq_blen > 0:
seq = self.file.read(seq_blen).decode()
- elif seq_blen <= 0 and not self.strict_bounds:
+ # If the requested sequence is negative, we will pad the empty string with default_seq.
+ # This was changed to support #155 with strict_bounds=True.
+ elif seq_blen <= 0:
seq = ''
if not internals:
@@ -994,13 +997,9 @@ class Fasta(object):
sequence_always_upper=sequence_always_upper,
rebuild=rebuild,
build_index=build_index)
- self.keys = self.faidx.index.keys
- if not self.mutable:
- self.records = dict(
- [(rname, FastaRecord(rname, self)) for rname in self.keys()])
- elif self.mutable:
- self.records = dict([(rname, MutableFastaRecord(rname, self))
- for rname in self.keys()])
+
+ _record_constructor = MutableFastaRecord if self.mutable else FastaRecord
+ self.records = OrderedDict([(rname, _record_constructor(rname, self)) for rname in self.faidx.index.keys()])
def __contains__(self, rname):
"""Return True if genome contains record."""
@@ -1057,6 +1056,15 @@ class Fasta(object):
# len(Sequence.seq) != end - start
return Sequence(name=name, seq=seq, start=None, end=None)
+ def keys(self):
+ return self.records.keys()
+
+ def values(self):
+ return self.records.values()
+
+ def items(self):
+ return self.records.items()
+
def close(self):
self.__exit__()
@@ -1258,6 +1266,21 @@ def check_bad_lines(rname, bad_lines, i):
raise RuntimeError("Unhandled exception during fasta indexing at entry " + rname + \
"Please report this issue at https://github.com/mdshw5/pyfaidx/issues " + \
str(bad_lines))
+
+def get_valid_filename(s):
+ """
+ From https://github.com/django/django/blob/efc3e32d6d7fb9bb41be73b80c8607b653c1fbd6/django/utils/text.py#L222-L232
+ Return the given string converted to a string that can be used for a clean
+ filename. Remove leading and trailing spaces; convert other spaces to
+ underscores; and remove anything that is not an alphanumeric, dash,
+ underscore, or dot.
+ >>> get_valid_filename("HPV16_144-1.fa")
+ 'HPV16_144-1.fa'
+ >>> get_valid_filename("chromosome 6.fa")
+ 'chromosome_6.fa'
+ """
+ s = str(s).strip().replace(' ', '_')
+ return re.sub(r'(?u)[^-\w.]', '', s)
if __name__ == "__main__":
=====================================
pyfaidx/cli.py
=====================================
@@ -3,11 +3,9 @@ import argparse
import sys
import os.path
import re
-from pyfaidx import Fasta, wrap_sequence, FetchError, ucsc_split, bed_split
+from pyfaidx import Fasta, wrap_sequence, FetchError, ucsc_split, bed_split, get_valid_filename
from collections import defaultdict
-keepcharacters = (' ', '.', '_')
-
def write_sequence(args):
_, ext = os.path.splitext(args.fasta)
if ext:
@@ -36,7 +34,7 @@ def write_sequence(args):
continue
if args.split_files: # open output file based on sequence name
filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
- filename = ''.join(c for c in filename if c.isalnum() or c in keepcharacters)
+ filename = get_valid_filename(filename)
outfile = open(filename, 'w')
elif args.out:
outfile = args.out
=====================================
setup.py
=====================================
@@ -36,12 +36,11 @@ setup(
"Intended Audience :: Science/Research",
"Natural Language :: English",
"Operating System :: Unix",
+ "Programming Language :: Python :: 3.7",
+ "Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.4",
- "Programming Language :: Python :: 3.3",
- "Programming Language :: Python :: 3.2",
"Programming Language :: Python :: 2.7",
- "Programming Language :: Python :: 2.6",
"Programming Language :: Python :: Implementation :: PyPy",
"Topic :: Scientific/Engineering :: Bio-Informatics"
]
=====================================
tests/test_FastaRecord.py
=====================================
@@ -46,6 +46,7 @@ class TestFastaRecord(TestCase):
long_names = []
for record in fasta:
long_names.append(record.long_name)
+ print(tuple(zip(deflines, long_names)))
assert deflines == long_names
def test_issue_62(self):
=====================================
tests/test_feature_bounds_check.py
=====================================
@@ -6,6 +6,53 @@ from unittest import TestCase
path = os.path.dirname(__file__)
os.chdir(path)
+class TestFeatureZeroLength:
+ """Tests for handling zero-length entries, added in #155"""
+ def setUp(self):
+ with open('data/zero_length.fasta', 'w') as fasta:
+ fasta.write(""">A
+ATCG
+>B
+>C
+
+>D
+GTA
+GC""")
+
+ def tearDown(self):
+ os.remove('data/zero_length.fasta')
+ os.remove('data/zero_length.fasta.fai')
+
+ def test_index_zero_length(self):
+ fasta = Fasta('data/zero_length.fasta')
+
+ def test_fetch_zero_length(self):
+ fasta = Fasta('data/zero_length.fasta')
+ b = fasta["B"]
+ assert str(b) == ''
+
+class TestZeroLengthSequenceSubRange(TestCase):
+ def setUp(self):
+ pass
+
+ def tearDown(self):
+ try:
+ os.remove('data/genes.fasta.fai')
+ except EnvironmentError:
+ pass # some tests may delete this file
+
+ def test_as_raw_zero_length_subsequence(self):
+ fasta = Fasta('data/genes.fasta', as_raw=True, strict_bounds=True)
+ expect = ''
+ result = fasta['gi|557361099|gb|KF435150.1|'][100:100]
+ assert result == expect
+
+ def test_zero_length_subsequence(self):
+ fasta = Fasta('data/genes.fasta', strict_bounds=True)
+ expect = ''
+ result = fasta['gi|557361099|gb|KF435150.1|'][100:100]
+ assert result.seq == expect
+
class TestFeatureBoundsCheck:
def setUp(self):
pass
View it on GitLab: https://salsa.debian.org/med-team/python-pyfaidx/commit/739f737969497e06c16c17c0fc67c416e3524fdb
--
View it on GitLab: https://salsa.debian.org/med-team/python-pyfaidx/commit/739f737969497e06c16c17c0fc67c416e3524fdb
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200125/b2359fec/attachment-0001.html>
More information about the debian-med-commit mailing list