[med-svn] [python-pyfaidx] 01/05: Imported Upstream version 0.4.7.1
Andreas Tille
tille at debian.org
Sat May 14 22:13:52 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository python-pyfaidx.
commit 12ea052331199fe6e00596b0b451d130e79d1ebf
Author: Andreas Tille <tille at debian.org>
Date: Sat May 14 23:48:01 2016 +0200
Imported Upstream version 0.4.7.1
---
README.rst | 5 ++
pyfaidx/__init__.py | 178 ++++++++++++++++++++++++++-----------
tests/data/issue_83.fasta | 6 ++
tests/test_feature_bounds_check.py | 27 ++++++
tests/test_feature_indexing.py | 10 +++
tests/test_sequence_class.py | 12 ++-
6 files changed, 187 insertions(+), 51 deletions(-)
diff --git a/README.rst b/README.rst
index edd05c6..4b8d96c 100644
--- a/README.rst
+++ b/README.rst
@@ -468,6 +468,11 @@ Changelog
Please see the `releases <https://github.com/mdshw5/pyfaidx/releases>`_ for a
comprehensive list of version changes.
+Known issues
+------------
+
+I try to fix as many bugs as possible, but most of this work is supported by a single developer. Please check the `known issues <https://github.com/mdshw5/pyfaidx/issues?utf8=✓&q=is%3Aissue+is%3Aopen+label%3Aknown>`_ for bugs relevant to your work. Pull requests are welcome.
+
Contributing
------------
diff --git a/pyfaidx/__init__.py b/pyfaidx/__init__.py
index ceaa532..d21fe47 100644
--- a/pyfaidx/__init__.py
+++ b/pyfaidx/__init__.py
@@ -21,7 +21,7 @@ from math import ceil
dna_bases = re.compile(r'([ACTGNactgnYRWSKMDVHBXyrwskmdvhbx]+)')
-__version__ = '0.4.5.2'
+__version__ = '0.4.7.1'
class FastaIndexingError(Exception):
@@ -65,7 +65,7 @@ class Sequence(object):
assert isinstance(seq, string_types)
def __getitem__(self, n):
- """ Returns the reverse compliment of sequence
+ """ Returns a sliced version of Sequence
>>> x = Sequence(name='chr1', seq='ATCGTA', start=1, end=6)
>>> x
>chr1:1-6
@@ -76,28 +76,67 @@ class Sequence(object):
>>> x[3:]
>chr1:4-6
GTA
+ >>> x[1:-1]
+ >chr1:2-5
+ TCGT
>>> x[::-1]
>chr1:6-1
ATGCTA
+ >>> x[::-3]
+ >chr1
+ AC
+ >>> x = Sequence(name='chr1', seq='ATCGTA', start=0, end=6)
+ >>> x
+ >chr1:0-6
+ ATCGTA
+ >>> x[:3]
+ >chr1:0-3
+ ATC
+ >>> x[3:]
+ >chr1:3-6
+ GTA
+ >>> x[1:-1]
+ >chr1:1-5
+ TCGT
+ >>> x[::-1]
+ >chr1:6-0
+ ATGCTA
+ >>> x[::-3]
+ >chr1
+ AC
"""
+ if self.start is None or self.end is None:
+ correction_factor = 0
+ elif len(self.seq) == abs(self.end - self.start) + 1: # determine coordinate system
+ one_based = True
+ correction_factor = -1
+ elif len(self.seq) == abs(self.end - self.start):
+ one_based = False
+ correction_factor = 0
+ elif len(self.seq) != abs(self.end - self.start):
+ raise ValueError("Coordinates start=%s and end=%s imply a diffent length than sequence (length %s)." % (self.start, self.end, len(self.seq)))
+
if isinstance(n, slice):
slice_start, slice_stop, slice_step = n.indices(len(self))
- if slice_step < 0: # flip the coordinates when we reverse
-
- self_start, self_end = (self.end, self.start)
- slice_stop, slice_start = (slice_start, slice_stop)
- if self.start is not None and self.end is not None:
- start = self_start + slice_start + 1
- end = self_end - (len(self) - slice_stop - 1)
- else:
- if self.start is not None and self.end is not None:
- start = self.start + slice_start
- end = self.end - (self.end - slice_stop)
if self.start is None or self.end is None: # there should never be self.start != self.end == None
start = None
end = None
- return self.__class__(self.name, self.seq[n.start:n.stop:n.step],
- start, end, self.comp)
+ return self.__class__(self.name, self.seq[n],
+ start, end, self.comp)
+ self_end, self_start = (self.end, self.start)
+ if abs(slice_step) > 1:
+ start = None
+ end = None
+ elif slice_step == -1: # flip the coordinates when we reverse
+ if slice_stop == -1:
+ slice_stop = 0
+ start = self_end - slice_stop
+ end = self_start + slice_stop
+ #print(locals())
+ else:
+ start = self_start + slice_start
+ end = self_start + slice_stop + correction_factor
+ return self.__class__(self.name, self.seq[n], start, end, self.comp)
elif isinstance(n, int):
if n < 0:
n = len(self) + n
@@ -144,7 +183,7 @@ class Sequence(object):
'chr1:1-6 (complement)'
"""
name = self.name
- if self.start and self.end:
+ if self.start is not None and self.end is not None:
name = ':'.join([name, '-'.join([str(self.start), str(self.end)])])
if self.comp:
name += ' (complement)'
@@ -310,22 +349,6 @@ class Faidx(object):
self.index.pop(dup, None)
def build_index(self):
-
- def check_bad_lines(rname, bad_lines, i):
- if len(bad_lines) > 1:
- raise FastaIndexingError("Line length of fasta"
- " file is not "
- "consistent! "
- "Inconsistent line found in >{0} at "
- "line {1:n}.".format(rname, bad_lines[0] + 1))
- elif len(bad_lines) == 1: # check that the line is previous line
- if bad_lines[0] + 1 != i:
- raise FastaIndexingError("Line length of fasta"
- " file is not "
- "consistent! "
- "Inconsistent line found in >{0} at "
- "line {1:n}.".format(rname, bad_lines[0] + 1))
-
with open(self.filename, 'r') as fastafile:
with open(self.indexname, 'w') as indexfile:
rname = None # reference sequence name
@@ -340,9 +363,15 @@ class Faidx(object):
line_clen = len(line.rstrip('\n\r'))
# write an index line
if line[0] == '>':
- check_bad_lines(rname, bad_lines, i) # raises errors
- if i > 0:
+ valid_entry = check_bad_lines(rname, bad_lines, i - 1)
+ if valid_entry and i > 0:
indexfile.write("{0}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(rname, rlen, thisoffset, clen, blen))
+ elif not valid_entry:
+ raise FastaIndexingError("Line length of fasta"
+ " file is not "
+ "consistent! "
+ "Inconsistent line found in >{0} at "
+ "line {1:n}.".format(rname, bad_lines[0][0] + 1))
blen = None
rlen = 0
clen = None
@@ -362,12 +391,18 @@ class Faidx(object):
# before we hit the next header, and it
# should be the last line in the entry
if line_blen != blen or line_blen == 1:
- bad_lines.append(i)
+ bad_lines.append((i, line_blen))
offset += line_blen
rlen += line_clen
# write the final index line
- check_bad_lines(rname, bad_lines, i + 1) # advance index since we're at the end of the file
+ valid_entry = check_bad_lines(rname, bad_lines, i) # advance index since we're at the end of the file
+ if not valid_entry:
+ raise FastaIndexingError("Line length of fasta"
+ " file is not "
+ "consistent! "
+ "Inconsistent line found in >{0} at "
+ "line {1:n}.".format(rname, bad_lines[0][0] + 1))
indexfile.write("{0:s}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(rname, rlen, thisoffset, clen, blen))
def write_fai(self):
@@ -692,7 +727,7 @@ class FastaVariant(Fasta):
if os.path.exists(vcf_file):
self.vcf = vcf.Reader(filename=vcf_file)
else:
- raise IOError("File {s:0} does not exist.".format(vcf_file))
+ raise IOError("File {0} does not exist.".format(vcf_file))
if sample is not None:
self.sample = sample
else:
@@ -745,25 +780,35 @@ def wrap_sequence(n, sequence, fillvalue=''):
for line in zip_longest(fillvalue=fillvalue, *args):
yield ''.join(line + ("\n",))
+# To take a complement, we map each character in the first string in this pair
+# to the corresponding character in the second string.
+complement_map = ('ACTGNactgnYRWSKMDVHBXyrwskmdvhbx', 'TGACNtgacnRYWSMKHBDVXrywsmkhbdvx')
+invalid_characters_set = set(
+ chr(x) for x in range(256) if chr(x) not in complement_map[0])
+invalid_characters_string = ''.join(invalid_characters_set)
+
+if PY3:
+ complement_table = str.maketrans(
+ complement_map[0], complement_map[1], invalid_characters_string)
+ translate_arguments = (complement_table,)
+elif PY2:
+ complement_table = string.maketrans(complement_map[0], complement_map[1])
+ translate_arguments = (complement_table, invalid_characters_string)
def complement(seq):
- """ Returns the compliment of seq.
+ """ Returns the complement of seq.
>>> seq = 'ATCGTA'
>>> complement(seq)
'TAGCAT'
"""
- if PY3:
- table = str.maketrans('ACTGNactgnYRWSKMDVHBXyrwskmdvhbx', 'TGACNtgacnRYWSMKHBDVXrywsmkhbdvx')
- elif PY2:
- table = string.maketrans('ACTGNactgnYRWSKMDVHBXyrwskmdvhbx', 'TGACNtgacnRYWSMKHBDVXrywsmkhbdvx')
- if len(re.findall(dna_bases, seq)) == 1: # finds invalid characters if > 1
- return str(seq).translate(table)
- else:
- matches = re.findall(dna_bases, seq)
- position = len(matches[0])
- raise ValueError("Sequence contains non-DNA character '{0}' at position {1:n}\n".format(seq[position], position + 1))
-
-
+ seq = str(seq)
+ result = seq.translate(*translate_arguments)
+ if len(result) != len(seq):
+ first_invalid_position = next(
+ i for i in range(len(seq)) if seq[i] in invalid_characters_set)
+ raise ValueError("Sequence contains non-DNA character '{0}' at position {1:n}\n".format(
+ seq[first_invalid_position], first_invalid_position + 1))
+ return result
def translate_chr_name(from_name, to_name):
chr_name_map = dict(zip(from_name, to_name))
@@ -796,6 +841,39 @@ def ucsc_split(region):
start, end = (None, None)
return (rname, start, end)
+def check_bad_lines(rname, bad_lines, i):
+ """ Find inconsistent line lengths in the middle of an
+ entry. Allow blank lines between entries, and short lines
+ occurring at the last line of an entry. Returns boolean
+ validating the entry.
+ >>> check_bad_lines('chr0', [(10, 79)], 10)
+ True
+ >>> check_bad_lines('chr0', [(9, 79)], 10)
+ False
+ >>> check_bad_lines('chr0', [(9, 79), (10, 1)], 10)
+ True
+ """
+ if len(bad_lines) == 0:
+ return True
+ elif len(bad_lines) == 1:
+ if bad_lines[0][0] == i: # must be last line
+ return True
+ else:
+ return False
+ elif len(bad_lines) == 2:
+ if bad_lines[0][0] == i: # must not be last line
+ return False
+ elif bad_lines[1][0] == i and bad_lines[1][1] == 1: # blank last line
+ if bad_lines[0][0] + 1 == i and bad_lines[0][1] > 1: # non-blank line
+ return True
+ else:
+ return False
+ if len(bad_lines) > 2:
+ return False
+ raise RuntimeError("Unhandled exception during fasta indexing at entry " + rname + \
+ "Please report this issue at https://github.com/mdshw5/pyfaidx/issues " + \
+ str(bad_lines))
+
if __name__ == "__main__":
import doctest
diff --git a/tests/data/issue_83.fasta b/tests/data/issue_83.fasta
new file mode 100644
index 0000000..a6a5771
--- /dev/null
+++ b/tests/data/issue_83.fasta
@@ -0,0 +1,6 @@
+>MT
+CTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTC
+ATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATG
+
+>GL000207.1 dna:supercontig supercontig::GL000207.1:1:4262:1
+CATACATTTAATATACCCTCACCATACAGAATGTTCTTTCCCTATTACATAAGGAGTATA
diff --git a/tests/test_feature_bounds_check.py b/tests/test_feature_bounds_check.py
index 5f583a5..2cce1f0 100644
--- a/tests/test_feature_bounds_check.py
+++ b/tests/test_feature_bounds_check.py
@@ -126,3 +126,30 @@ class TestFeatureBoundsCheck:
end1 = f1['gi|557361099|gb|KF435150.1|'][1:90].end
print((end0, end1))
assert end0 == end1
+
+ def test_issue_79_fix(self):
+ f = Fasta('data/genes.fasta')
+ s = f['gi|557361099|gb|KF435150.1|'][100:105]
+ print((s.start, s.end))
+ assert (101, 105) == (s.start, s.end)
+
+ def test_issue_79_fix_negate(self):
+ f = Fasta('data/genes.fasta')
+ s = f['gi|557361099|gb|KF435150.1|'][100:105]
+ s = -s
+ print((s.start, s.end))
+ assert (105, 101) == (s.start, s.end)
+
+ def test_issue_79_fix_one_based_false(self):
+ f = Fasta('data/genes.fasta', one_based_attributes=False)
+ s = f['gi|557361099|gb|KF435150.1|'][100:105]
+ print((s.start, s.end))
+ assert (100, 105) == (s.start, s.end)
+
+ def test_issue_79_fix_one_based_false_negate(self):
+ f = Fasta('data/genes.fasta', one_based_attributes=False)
+ s = f['gi|557361099|gb|KF435150.1|'][100:105]
+ print(s.__dict__)
+ s = -s
+ print(s.__dict__)
+ assert (105, 100) == (s.start, s.end)
diff --git a/tests/test_feature_indexing.py b/tests/test_feature_indexing.py
index 26ebd5b..968b787 100644
--- a/tests/test_feature_indexing.py
+++ b/tests/test_feature_indexing.py
@@ -161,3 +161,13 @@ class TestIndexing(TestCase):
time.sleep(2)
faidx = Faidx('data/genes.fasta')
assert getmtime(faidx.indexname) > index_mtime
+
+ def test_build_issue_83(self):
+ """ Ensure that blank lines between entries are treated in the
+ same way as samtools 1.2. See mdshw5/pyfaidx#83.
+ """
+ expect_index = ("MT 119 4 70 71\nGL000207.1 60 187 60 61\n")
+ index_file = Faidx('data/issue_83.fasta').indexname
+ result_index = open(index_file).read()
+ os.remove('data/issue_83.fasta.fai')
+ assert result_index == expect_index
diff --git a/tests/test_sequence_class.py b/tests/test_sequence_class.py
index 8379a23..507710c 100644
--- a/tests/test_sequence_class.py
+++ b/tests/test_sequence_class.py
@@ -32,5 +32,15 @@ def test_slice_index():
def test_comp_invalid():
complement(comp_invalid)
+ at raises(ValueError)
+def test_check_coordinates():
+ x = Sequence(name='gi|557361099|gb|KF435150.1|', seq='TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA',
+ start=100, end=110)
+ x[:]
+
def test_comp_valid():
- complement(comp_valid)
+ assert complement(comp_valid).startswith("AACTTCTAAAnCG")
+ assert complement(complement(comp_valid)) == comp_valid
+
+def test_comp_empty():
+ assert complement('') == ''
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pyfaidx.git
More information about the debian-med-commit
mailing list