[med-svn] [python-pyfaidx] 01/06: New upstream version 0.4.8.1
Andreas Tille
tille at debian.org
Sun Dec 11 17:10:34 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository python-pyfaidx.
commit c24135f0475e1cbf6907a5d4a39de3ae3c4b7e30
Author: Andreas Tille <tille at debian.org>
Date: Sun Dec 11 17:36:16 2016 +0100
New upstream version 0.4.8.1
---
.coveragerc | 8 +-
.gitignore | 7 +-
.travis.yml | 13 +-
README.rst | 21 +++-
appveyor.yml | 31 +++++
ci/appveyor/vcvars64.bat | 1 +
dev-requirements.txt | 11 +-
pyfaidx/__init__.py | 233 +++++++++++++++++++-----------------
pyfaidx/cli.py | 5 +-
setup.py | 7 +-
tests/test_FastaRecord.py | 13 +-
tests/test_FastaVariant.py | 1 +
tests/test_Fasta_synchronization.py | 218 +++++++++++++++++++++++++++++++++
tests/test_feature_indexing.py | 87 +++++++++++++-
tox.ini | 27 +++++
15 files changed, 545 insertions(+), 138 deletions(-)
diff --git a/.coveragerc b/.coveragerc
index dba015f..bd72b98 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,9 +1,15 @@
[run]
+parallel = True
omit =
- */python?.?/*
*/lib-python/?.?/*.py
*/lib_pypy/_*.py
*/site-packages/ordereddict.py
*/site-packages/nose/*
*/site-packages/six/*
*/unittest2/*
+
+[paths]
+source =
+ pyfaidx
+ .tox/*/lib/python*/site-packages/pyfaidx
+ .tox/*/site-packages/pyfaidx
diff --git a/.gitignore b/.gitignore
index 22e1f11..cc3cdad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,9 @@ __pycache__
*.pyc
.project
.pydevproject
-
+.idea
+*.egg-info
+.tox
+.coverage
+.coverage.*
+tests/data/chr22*
diff --git a/.travis.yml b/.travis.yml
index 252e2f3..eaf605b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,14 +5,13 @@ python:
- '3.5'
- '3.4'
- '3.3'
- - '3.2'
- '2.7'
- '2.6'
- - pypy
- - pypy3
+ - 'pypy'
+ - 'pypy3'
install:
- - pip wheel -f wheelhouse coveralls biopython cython pysam pyvcf || true
- - pip install -f wheelhouse biopython cython pysam pyfasta coveralls pyvcf || true
+ - pip wheel -f wheelhouse coverage biopython cython pysam pyvcf || true
+ - pip install -f wheelhouse biopython cython pysam pyfasta coverage pyvcf || true
- python setup.py install
- if [ ! -f samtools-1.2 ]; then wget -q -O - https://github.com/samtools/samtools/releases/download/1.2/samtools-1.2.tar.bz2 | tar -xjv; fi
- cd samtools-1.2
@@ -36,11 +35,13 @@ deploy:
matrix:
allow_failures:
- python: 'nightly'
+ - python: 'pypy3'
+ fast_finish: true
cache:
directories:
- tests/data
- samtools-1.2
- wheelhouse
after_success:
- - coveralls
+ - bash <(curl -s https://codecov.io/bash)
- if [ $TRAVIS_PYTHON_VERSION == '3.4' ] && [ $TRAVIS_TAG ]; then python scripts/benchmark.py 1000; fi
diff --git a/README.rst b/README.rst
index 4b8d96c..1d39ff8 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,4 @@
-|Travis| |PyPI| |Landscape| |Coveralls| |Depsy|
+|Travis| |PyPI| |Landscape| |Coverage| |Depsy| |Appveyor| |Flattr|
Description
-----------
@@ -265,7 +265,7 @@ The FastaVariant class provides a way to integrate single nucleotide variant cal
>22:16042791-16042800
TCATAGGACA
- >>> consensus = FastaVariant('tests/data/chr22.fasta', 'tests/data/chr22.vcf.gz', het=True, hom=True, call_filter='GT == "0/1"')
+ >>> consensus = FastaVariant('tests/data/chr22.fasta', 'tests/data/chr22.vcf.gz', sample='NA06984', het=True, hom=True, call_filter='GT == "0/1"')
>>> consensus['22'].variant_sites
(16042793, 29187373, 29187448, 29194610, 29821332)
@@ -310,6 +310,7 @@ cli script: faidx
-m, --mask-with-default-seq
mask the FASTA file using --default-seq default: False
-M, --mask-by-case mask the FASTA file by changing to lowercase. default: False
+ --no-rebuild do not rebuild the .fai index even if it is out of date. default: False
--version print pyfaidx version number
Examples:
@@ -423,7 +424,7 @@ Examples:
AGCTTCCCTGTGGTTTCCCGAGGCTTCCTTGCTTCCCGCTCTGCGAGGAGCCTTTCATCCGAAGGCGGGA
.......
-
+
$ faidx --size-range 5500,6000 -i chromsizes tests/data/genes.fasta
NM_000465.3 5523
@@ -510,8 +511,16 @@ Comprehensive Cancer Center in the Department of Oncology.
:target: https://landscape.io/github/mdshw5/pyfaidx/master
:alt: Code Health
-.. |Coveralls| image:: https://coveralls.io/repos/mdshw5/pyfaidx/badge.svg?branch=master
- :target: https://coveralls.io/r/mdshw5/pyfaidx?branch=master
-
+.. |Coverage| image:: https://codecov.io/gh/mdshw5/pyfaidx/branch/master/graph/badge.svg
+ :target: https://codecov.io/gh/mdshw5/pyfaidx
+
.. |Depsy| image:: http://depsy.org/api/package/pypi/pyfaidx/badge.svg
:target: http://depsy.org/package/python/pyfaidx
+
+.. |Appveyor| image:: https://ci.appveyor.com/api/projects/status/80ihlw30a003596w?svg=true
+ :target: https://ci.appveyor.com/project/mdshw5/pyfaidx
+
+.. |Flattr| image:: http://button.flattr.com/flattr-badge-large.png
+ :target: https://flattr.com/submit/auto?fid=po00kq&url=https%3A%2F%2Fgithub.com%2Fmdshw5%2Fpyfaidx
+
+
diff --git a/appveyor.yml b/appveyor.yml
new file mode 100644
index 0000000..d0147cc
--- /dev/null
+++ b/appveyor.yml
@@ -0,0 +1,31 @@
+environment:
+
+ matrix:
+
+ # For Python versions available on Appveyor, see
+ # http://www.appveyor.com/docs/installed-software#python
+
+ - PYTHON: "C:\\Python27"
+ - PYTHON: "C:\\Python33"
+ - PYTHON: "C:\\Python34"
+ - PYTHON: "C:\\Python35"
+ - PYTHON: "C:\\Python27-x64"
+ - PYTHON: "C:\\Python33-x64"
+ DISTUTILS_USE_SDK: "1"
+ - PYTHON: "C:\\Python34-x64"
+ DISTUTILS_USE_SDK: "1"
+ - PYTHON: "C:\\Python35-x64"
+
+install:
+ # Fix for problem building extensions for x64 under Python 3.3 and 3.4
+ # See: http://help.appveyor.com/discussions/problems/4278-cant-build-some-c-extensions-with-python-34-x64
+ # Used same solution as Matplotlib: https://github.com/matplotlib/matplotlib/blob/master/appveyor.yml
+ - cmd: copy ci\appveyor\vcvars64.bat "C:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\amd64"
+
+ # We need wheel installed to build wheels
+ - "%PYTHON%\\python.exe -m pip install -r dev-requirements.txt"
+
+build: off
+
+test_script:
+ - "%PYTHON%\\python.exe setup.py nosetests"
diff --git a/ci/appveyor/vcvars64.bat b/ci/appveyor/vcvars64.bat
new file mode 100644
index 0000000..ef77b9d
--- /dev/null
+++ b/ci/appveyor/vcvars64.bat
@@ -0,0 +1 @@
+CALL "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64
diff --git a/dev-requirements.txt b/dev-requirements.txt
index cd933d6..76a6d1b 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,6 +1,5 @@
-Pygments>=1
-collective.checkdocs>=0.2
-docutils>=0.12
-six>=1.7.3
-nose==1.3.7
-biopython==1.65
+six
+nose
+biopython
+setuptools >= 0.7
+mock; python_version < '3.3'
\ No newline at end of file
diff --git a/pyfaidx/__init__.py b/pyfaidx/__init__.py
index d21fe47..9e34bd2 100644
--- a/pyfaidx/__init__.py
+++ b/pyfaidx/__init__.py
@@ -7,7 +7,7 @@ Fasta file -> Faidx -> Fasta -> FastaRecord -> Sequence
from __future__ import division
import os
from os.path import getmtime
-from six import PY2, PY3, string_types
+from six import PY2, PY3, string_types, integer_types
from six.moves import zip_longest
try:
from collections import OrderedDict
@@ -18,10 +18,11 @@ import re
import string
import warnings
from math import ceil
+from threading import Lock
dna_bases = re.compile(r'([ACTGNactgnYRWSKMDVHBXyrwskmdvhbx]+)')
-__version__ = '0.4.7.1'
+__version__ = '0.4.8.1'
class FastaIndexingError(Exception):
@@ -137,7 +138,7 @@ class Sequence(object):
start = self_start + slice_start
end = self_start + slice_stop + correction_factor
return self.__class__(self.name, self.seq[n], start, end, self.comp)
- elif isinstance(n, int):
+ elif isinstance(n, integer_types):
if n < 0:
n = len(self) + n
if self.start:
@@ -268,7 +269,7 @@ class Faidx(object):
as_raw=False, strict_bounds=False, read_ahead=None,
mutable=False, split_char=None, filt_function=None,
one_based_attributes=True,
- sequence_always_upper=False):
+ sequence_always_upper=False, rebuild=True):
"""
filename: name of fasta file
key_function: optional callback function which should return a unique
@@ -291,23 +292,32 @@ class Faidx(object):
self.one_based_attributes = one_based_attributes
self.sequence_always_upper = sequence_always_upper
self.index = OrderedDict()
+ self.lock = Lock()
self.buffer = dict((('seq', None), ('name', None), ('start', None), ('end', None)))
- if not read_ahead or isinstance(read_ahead, int):
+ if not read_ahead or isinstance(read_ahead, integer_types):
self.read_ahead = read_ahead
- elif not isinstance(read_ahead, int):
+ elif not isinstance(read_ahead, integer_types):
raise ValueError("read_ahead value must be int, not {0}".format(type(read_ahead)))
self.mutable = mutable
-
- if os.path.exists(self.indexname) and getmtime(self.indexname) >= getmtime(self.filename):
- self.read_fai(split_char)
- else:
+ with self.lock: # lock around index generation so only one thread calls method
try:
- self.build_index()
+ if os.path.exists(self.indexname) and getmtime(self.indexname) >= getmtime(self.filename):
+ self.read_fai(split_char)
+ elif os.path.exists(self.indexname) and getmtime(self.indexname) < getmtime(self.filename) and not rebuild:
+ self.read_fai(split_char)
+ warnings.warn("Index file {0} is older than FASTA file {1}.".format(self.indexname, self.filename), RuntimeWarning)
+ else:
+ self.build_index()
+ self.read_fai(split_char)
except FastaIndexingError as e:
os.remove(self.indexname)
+ self.file.close()
raise FastaIndexingError(e)
- self.read_fai(split_char)
+ except Exception:
+ # Handle potential exceptions other than 'FastaIndexingError'
+ self.file.close()
+ raise
def __contains__(self, region):
if not self.buffer['name']:
@@ -349,66 +359,70 @@ class Faidx(object):
self.index.pop(dup, None)
def build_index(self):
- with open(self.filename, 'r') as fastafile:
- with open(self.indexname, 'w') as indexfile:
- rname = None # reference sequence name
- offset = 0 # binary offset of end of current line
- rlen = 0 # reference character length
- blen = None # binary line length (includes newline)
- clen = None # character line length
- bad_lines = [] # lines > || < than blen
- thisoffset = offset
- for i, line in enumerate(fastafile):
- line_blen = len(line)
- line_clen = len(line.rstrip('\n\r'))
- # write an index line
- if line[0] == '>':
- valid_entry = check_bad_lines(rname, bad_lines, i - 1)
- if valid_entry and i > 0:
- indexfile.write("{0}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(rname, rlen, thisoffset, clen, blen))
- elif not valid_entry:
- raise FastaIndexingError("Line length of fasta"
- " file is not "
- "consistent! "
- "Inconsistent line found in >{0} at "
- "line {1:n}.".format(rname, bad_lines[0][0] + 1))
- blen = None
- rlen = 0
- clen = None
- bad_lines = []
- try: # must catch empty deflines
- rname = line.rstrip('\n\r')[1:].split()[0] # remove comments
- except IndexError:
- raise FastaIndexingError("Bad sequence name %s at line %s." % (line.rstrip('\n\r'), str(i)))
- offset += line_blen
- thisoffset = offset
- else: # check line and advance offset
- if not blen:
- blen = line_blen
- if not clen:
- clen = line_clen
- # only one short line should be allowed
- # before we hit the next header, and it
- # should be the last line in the entry
- if line_blen != blen or line_blen == 1:
- bad_lines.append((i, line_blen))
- offset += line_blen
- rlen += line_clen
-
- # write the final index line
- valid_entry = check_bad_lines(rname, bad_lines, i) # advance index since we're at the end of the file
- if not valid_entry:
- raise FastaIndexingError("Line length of fasta"
- " file is not "
- "consistent! "
- "Inconsistent line found in >{0} at "
- "line {1:n}.".format(rname, bad_lines[0][0] + 1))
- indexfile.write("{0:s}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(rname, rlen, thisoffset, clen, blen))
+ try:
+ with open(self.filename, 'r') as fastafile:
+ with open(self.indexname, 'w') as indexfile:
+ rname = None # reference sequence name
+ offset = 0 # binary offset of end of current line
+ rlen = 0 # reference character length
+ blen = None # binary line length (includes newline)
+ clen = None # character line length
+ bad_lines = [] # lines > || < than blen
+ thisoffset = offset
+ for i, line in enumerate(fastafile):
+ line_blen = len(line)
+ line_clen = len(line.rstrip('\n\r'))
+ # write an index line
+ if line[0] == '>':
+ valid_entry = check_bad_lines(rname, bad_lines, i - 1)
+ if valid_entry and i > 0:
+ indexfile.write("{0}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(rname, rlen, thisoffset, clen, blen))
+ elif not valid_entry:
+ raise FastaIndexingError("Line length of fasta"
+ " file is not "
+ "consistent! "
+ "Inconsistent line found in >{0} at "
+ "line {1:n}.".format(rname, bad_lines[0][0] + 1))
+ blen = None
+ rlen = 0
+ clen = None
+ bad_lines = []
+ try: # must catch empty deflines
+ rname = line.rstrip('\n\r')[1:].split()[0] # remove comments
+ except IndexError:
+ raise FastaIndexingError("Bad sequence name %s at line %s." % (line.rstrip('\n\r'), str(i)))
+ offset += line_blen
+ thisoffset = offset
+ else: # check line and advance offset
+ if not blen:
+ blen = line_blen
+ if not clen:
+ clen = line_clen
+ # only one short line should be allowed
+ # before we hit the next header, and it
+ # should be the last line in the entry
+ if line_blen != blen or line_blen == 1:
+ bad_lines.append((i, line_blen))
+ offset += line_blen
+ rlen += line_clen
+
+ # write the final index line
+ valid_entry = check_bad_lines(rname, bad_lines, i) # advance index since we're at the end of the file
+ if not valid_entry:
+ raise FastaIndexingError("Line length of fasta"
+ " file is not "
+ "consistent! "
+ "Inconsistent line found in >{0} at "
+ "line {1:n}.".format(rname, bad_lines[0][0] + 1))
+ indexfile.write("{0:s}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(rname, rlen, thisoffset, clen, blen))
+ except IOError:
+ raise IOError("%s may not be writable. Please use Fasta(rebuild=False), Faidx(rebuild=False) or faidx --no-rebuild." % self.indexname)
def write_fai(self):
- with open(self.indexname, 'w') as outfile:
- for k, v in self.index.items():
- outfile.write('\t'.join([k, str(v)]))
+ with self.lock:
+ with open(self.indexname, 'w') as outfile:
+ for k, v in self.index.items():
+ outfile.write('\t'.join([k, str(v)]))
def from_buffer(self, start, end):
i_start = start - self.buffer['start'] # want [0, 1) coordinates from [1, 1] coordinates
@@ -444,8 +458,8 @@ class Faidx(object):
4. Seek to start position, taking newlines into account
5. Read to end position, return sequence
"""
- assert isinstance(start, int)
- assert isinstance(end, int)
+ assert isinstance(start, integer_types)
+ assert isinstance(end, integer_types)
try:
i = self.index[rname]
except KeyError:
@@ -463,20 +477,23 @@ class Faidx(object):
newlines_inside = newlines_to_end - newlines_before
seq_blen = newlines_inside + seq_len
bstart = i.offset + newlines_before + start0
- self.file.seek(bstart)
-
- if bstart + seq_blen > i.bend and not self.strict_bounds:
- seq_blen = i.bend - bstart
- elif bstart + seq_blen > i.bend and self.strict_bounds:
- raise FetchError("Requested end coordinate {0:n} outside of {1}. "
- "\n".format(end, rname))
- if seq_blen > 0:
- seq = self.file.read(seq_blen).decode()
- elif seq_blen <= 0 and not self.strict_bounds:
- seq = ''
- elif seq_blen <= 0 and self.strict_bounds:
- raise FetchError("Requested coordinates start={0:n} end={1:n} are "
- "invalid.\n".format(start, end))
+
+ with self.lock:
+ self.file.seek(bstart)
+
+ if bstart + seq_blen > i.bend and not self.strict_bounds:
+ seq_blen = i.bend - bstart
+ elif bstart + seq_blen > i.bend and self.strict_bounds:
+ raise FetchError("Requested end coordinate {0:n} outside of {1}. "
+ "\n".format(end, rname))
+ if seq_blen > 0:
+ seq = self.file.read(seq_blen).decode()
+ elif seq_blen <= 0 and not self.strict_bounds:
+ seq = ''
+ elif seq_blen <= 0 and self.strict_bounds:
+ raise FetchError("Requested coordinates start={0:n} end={1:n} are "
+ "invalid.\n".format(start, end))
+
if not internals:
return seq.replace('\n', '')
else:
@@ -508,21 +525,23 @@ class Faidx(object):
if not self.mutable:
raise IOError("Write attempted for immutable Faidx instance. Set mutable=True to modify original FASTA.")
file_seq, internals = self.from_file(rname, start, end, internals=True)
- if len(seq) != len(file_seq) - internals['newlines_inside']:
- raise IOError("Specified replacement sequence needs to have the same length as original.")
- elif len(seq) == len(file_seq) - internals['newlines_inside']:
- line_len = internals['i'].lenc
- self.file.seek(internals['bstart'])
- if internals['newlines_inside'] == 0:
- self.file.write(seq.encode())
- elif internals['newlines_inside'] > 0:
- n = 0
- m = file_seq.index('\n')
- while m < len(seq):
- self.file.write(''.join([seq[n:m], '\n']).encode())
- n = m
- m += line_len
- self.file.write(seq[n:].encode())
+
+ with self.lock:
+ if len(seq) != len(file_seq) - internals['newlines_inside']:
+ raise IOError("Specified replacement sequence needs to have the same length as original.")
+ elif len(seq) == len(file_seq) - internals['newlines_inside']:
+ line_len = internals['i'].lenc
+ self.file.seek(internals['bstart'])
+ if internals['newlines_inside'] == 0:
+ self.file.write(seq.encode())
+ elif internals['newlines_inside'] > 0:
+ n = 0
+ m = file_seq.index('\n')
+ while m < len(seq):
+ self.file.write(''.join([seq[n:m], '\n']).encode())
+ n = m
+ m += line_len
+ self.file.write(seq[n:].encode())
def close(self):
self.__exit__()
@@ -556,7 +575,7 @@ class FastaRecord(object):
start = len(self) + start
return self._fa.get_seq(self.name, start + 1, stop)[::step]
- elif isinstance(n, int):
+ elif isinstance(n, integer_types):
if n < 0:
n = len(self) + n
return self._fa.get_seq(self.name, n + 1, n + 1)
@@ -632,7 +651,7 @@ class MutableFastaRecord(FastaRecord):
start = len(self) + start
self._fa.faidx.to_file(self.name, start + 1, stop, value)
- elif isinstance(n, int):
+ elif isinstance(n, integer_types):
if n < 0:
n = len(self) + n
return self._fa.faidx.to_file(self.name, n + 1, n + 1, value)
@@ -644,7 +663,7 @@ class Fasta(object):
def __init__(self, filename, default_seq=None, key_function=None, as_raw=False,
strict_bounds=False, read_ahead=None, mutable=False, split_char=None,
filt_function=None, one_based_attributes=True,
- sequence_always_upper=False):
+ sequence_always_upper=False, rebuild=True):
"""
An object that provides a pygr compatible interface.
filename: name of fasta file
@@ -655,7 +674,7 @@ class Fasta(object):
default_seq=default_seq, strict_bounds=strict_bounds,
read_ahead=read_ahead, mutable=mutable, split_char=split_char,
filt_function=filt_function, one_based_attributes=one_based_attributes,
- sequence_always_upper=sequence_always_upper)
+ sequence_always_upper=sequence_always_upper, rebuild=rebuild)
self.keys = self.faidx.index.keys
if not self.mutable:
self.records = dict([(rname, FastaRecord(rname, self)) for rname in self.keys()])
@@ -668,7 +687,7 @@ class Fasta(object):
def __getitem__(self, rname):
"""Return a chromosome by its name, or its numerical index."""
- if isinstance(rname, int):
+ if isinstance(rname, integer_types):
rname = tuple(self.keys())[rname]
try:
return self.records[rname]
@@ -759,7 +778,7 @@ class FastaVariant(Fasta):
else:
seq_mut = list(seq.seq)
del seq.seq
- var = self.vcf.fetch(name, start, end)
+ var = self.vcf.fetch(name, start - 1, end)
for record in var:
if record.is_snp: # skip indels
sample = record.genotype(self.sample)
diff --git a/pyfaidx/cli.py b/pyfaidx/cli.py
index 9e55d0e..87c1c32 100644
--- a/pyfaidx/cli.py
+++ b/pyfaidx/cli.py
@@ -13,14 +13,14 @@ def write_sequence(args):
if ext:
ext = ext[1:] # remove the dot from extension
filt_function = re.compile(args.regex).search
- fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter, filt_function=filt_function)
+ fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter, filt_function=filt_function, rebuild=not args.no_rebuild)
regions_to_fetch, split_function = split_regions(args)
if not regions_to_fetch:
regions_to_fetch = fasta.keys()
if args.invert_match:
sequences_to_exclude = set([split_function(region)[0] for region in regions_to_fetch])
- fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter)
+ fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter, rebuild=not args.no_rebuild)
regions_to_fetch = (key for key in fasta.keys() if key not in sequences_to_exclude)
split_function = ucsc_split
@@ -152,6 +152,7 @@ def main(ext_args=None):
masking = parser.add_mutually_exclusive_group()
masking.add_argument('-m', '--mask-with-default-seq', action="store_true", default=False, help="mask the FASTA file using --default-seq default: %(default)s")
masking.add_argument('-M', '--mask-by-case', action="store_true", default=False, help="mask the FASTA file by changing to lowercase. default: %(default)s")
+ parser.add_argument('--no-rebuild', action="store_true", default=False, help="do not rebuild the .fai index even if it is out of date. default: %(default)s")
parser.add_argument('--version', action="version", version=__version__, help="print pyfaidx version number")
# print help usage if no arguments are supplied
if len(sys.argv)==1 and not ext_args:
diff --git a/setup.py b/setup.py
index 915f2da..7926f06 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,8 @@
from setuptools import setup
+from io import open
import sys
-install_requires = ['six']
+install_requires = ['six', 'setuptools >= 0.7']
if sys.version_info[0] == 2 and sys.version_info[1] == 6:
install_requires.extend(['ordereddict', 'argparse'])
@@ -17,13 +18,13 @@ def get_version(string):
setup(
name='pyfaidx',
provides='pyfaidx',
- version=get_version(open('pyfaidx/__init__.py').read()),
+ version=get_version(open('pyfaidx/__init__.py', encoding='utf-8').read()),
author='Matthew Shirley',
author_email='mdshw5 at gmail.com',
url='http://mattshirley.com',
description='pyfaidx: efficient pythonic random '
'access to fasta subsequences',
- long_description=open('README.rst').read(),
+ long_description=open('README.rst', encoding='utf-8').read(),
license='BSD',
packages=['pyfaidx'],
install_requires=install_requires,
diff --git a/tests/test_FastaRecord.py b/tests/test_FastaRecord.py
index f080f03..2316e7a 100644
--- a/tests/test_FastaRecord.py
+++ b/tests/test_FastaRecord.py
@@ -1,8 +1,10 @@
import os
+import sys
from pyfaidx import Fasta
from tempfile import NamedTemporaryFile
from unittest import TestCase
from nose.tools import raises
+from difflib import Differ
path = os.path.dirname(__file__)
os.chdir(path)
@@ -48,17 +50,17 @@ class TestFastaRecord(TestCase):
""" Check for pathogenic FastaRecord.long_name behavior in mdshw5/pyfaidx#62 """
deflines = []
line_len = None
- with open('data/genes.fasta') as fasta_file:
- with open('data/issue_62.fa', 'w') as fasta_uniform_len:
+ with open('data/genes.fasta', 'rb') as fasta_file:
+ with open('data/issue_62.fa', 'wb') as fasta_uniform_len:
for line in fasta_file:
- if line[0] == '>':
- deflines.append(line[1:-1])
+ if line.startswith(b'>'):
+ deflines.append(line[1:-1].decode('ascii'))
fasta_uniform_len.write(line)
elif line_len is None:
line_len = len(line)
fasta_uniform_len.write(line)
elif line_len > len(line):
- fasta_uniform_len.write(line.rstrip() + 'N' * (line_len - len(line)) + '\n')
+ fasta_uniform_len.write(line.rstrip() + b'N' * (line_len - len(line)) + b'\n')
else:
fasta_uniform_len.write(line)
fasta = Fasta('data/issue_62.fa', as_raw=True)
@@ -70,6 +72,7 @@ class TestFastaRecord(TestCase):
os.remove('data/issue_62.fa.fai')
except EnvironmentError:
pass
+ sys.stdout.writelines(tuple(Differ().compare(deflines, long_names)))
assert deflines == long_names
class TestMutableFastaRecord(TestCase):
diff --git a/tests/test_FastaVariant.py b/tests/test_FastaVariant.py
index 981e5b3..d945ee7 100644
--- a/tests/test_FastaVariant.py
+++ b/tests/test_FastaVariant.py
@@ -54,6 +54,7 @@ class TestFastaVariant(TestCase):
try:
fasta = FastaVariant('data/chr22.fasta', 'data/chr22.vcf.gz', hom=True, het=True, as_raw=True)
ref = Fasta('data/chr22.fasta', as_raw=True)
+ print([(ref['22'][pos-1], fasta['22'][pos-1]) for pos in fasta['22'].variant_sites])
assert all(ref['22'][pos-1] != fasta['22'][pos-1] for pos in fasta['22'].variant_sites)
except (ImportError, IOError):
raise SkipTest
diff --git a/tests/test_Fasta_synchronization.py b/tests/test_Fasta_synchronization.py
new file mode 100644
index 0000000..a173418
--- /dev/null
+++ b/tests/test_Fasta_synchronization.py
@@ -0,0 +1,218 @@
+import os
+try:
+ from collections import OrderedDict
+except ImportError: #python 2.6
+ from ordereddict import OrderedDict
+import threading
+from pyfaidx import Fasta
+import random
+import tempfile
+import time
+import shutil
+from unittest import TestCase
+
+path = os.path.dirname(__file__)
+os.chdir(path)
+
+
+class _ThreadReadSequence(threading.Thread):
+ def __init__(self, rand, result_map, result_lock, name, seq):
+ super(_ThreadReadSequence, self).__init__()
+
+ seq_len = len(seq)
+ sub_seq_slices = list(slice(i, min(i + 20, seq_len)) for i in range(0, seq_len, 20))
+ random.shuffle(sub_seq_slices, rand.random)
+
+ self.result_map = result_map
+ self.result_lock = result_lock
+ self.name = name
+ self.seq = seq
+ self.sub_seq_slices = sub_seq_slices
+
+ def run(self):
+ name = self.name
+ seq = self.seq
+
+ sub_seqs = [''] * len(self.sub_seq_slices)
+ for sub_seq_slice in self.sub_seq_slices:
+ sub_seqs[sub_seq_slice.start//20] = seq[sub_seq_slice]
+ time.sleep(0)
+
+ # Put sub-sequences in correct order
+ seq_str = ''.join(sub_seqs)
+
+ with self.result_lock:
+ self.result_map[name] = seq_str
+
+
+class _ThreadWriteSequence(threading.Thread):
+ def __init__(self, rand, name, seq):
+ super(_ThreadWriteSequence, self).__init__()
+
+ seq_len = len(seq)
+ sub_seq_slices = list(slice(i, min(i + 20, seq_len)) for i in range(0, seq_len, 20))
+ random.shuffle(sub_seq_slices, rand.random)
+
+ self.name = name
+ self.seq = seq
+ self.sub_seq_slices = sub_seq_slices
+
+ def run(self):
+ seq = self.seq
+ seq_len = len(seq)
+ seq_str = seq[:].lower()
+
+ for sub_seq_slice in self.sub_seq_slices:
+ try:
+ seq[sub_seq_slice] = seq_str[sub_seq_slice]
+ time.sleep(0)
+ except Exception:
+ # Conflicting simultaneous writes are likely to cause exceptions
+ # We test for the expected string at the end, so ignore interim
+ # failures.
+ pass
+
+
+class TestFastaIntIndex(TestCase):
+ def setUp(self):
+ self.longMessage = True
+ self.maxDiff = None
+ self.tmp_dir = tempfile.mkdtemp()
+ # Use a seeded random so orders are randomish within a test, but the same across test runs
+ self.rand = random.Random(8903423147)
+
+ def tearDown(self):
+ tmp_dir = getattr(self, 'tmp_dir', None)
+ if tmp_dir:
+ shutil.rmtree(tmp_dir)
+
+ try:
+ os.remove('data/genes.fasta.fai')
+ except EnvironmentError:
+ pass # some tests may delete this file
+
+ def test_simultaneous_reads(self):
+ """
+ Test that each read of a sequence range is atomic.
+ To do this, spawn several threads to simultaneously read the sequences
+ in a Fasta file in pieces. If the reads are not atomic, then it is
+ reasonably likely (with sufficient concurrency) that a read from one
+ thread will affect that in another, so the sequences will not be
+ read properly.
+ """
+ # Read in original file data
+ ref_seq_map = OrderedDict()
+ with Fasta('data/genes.fasta', as_raw=True, strict_bounds=True) as fasta:
+ for name, seq in fasta.records.items():
+ ref_seq_map[name] = seq[:]
+
+ # Initialize map with fasta sequence names to enforce same ordering as 'ref_seq_map'
+ thread_result_lock = threading.Lock()
+ thread_read_seq_map = OrderedDict((name, None) for name in ref_seq_map)
+
+ # Read file again, using many threads and simultaneously reading each sequence in pieces
+ with Fasta('data/genes.fasta', as_raw=True, strict_bounds=True) as fasta:
+ threads = []
+ for name, seq in fasta.records.items():
+ threads.append(_ThreadReadSequence(self.rand, thread_read_seq_map, thread_result_lock, name, seq))
+
+ for thread in threads:
+ thread.start()
+
+ for thread in threads:
+ thread.join()
+
+ self.assertEqual(thread_read_seq_map, ref_seq_map)
+
+ def test_simultaneous_writes(self):
+ """
+ Test that each write of a sequence range is atomic.
+ To do this, spawn several threads to simultaneously write sequences
+ to a Fasta file in pieces. If the writes are not atomic, then it is
+ reasonably likely (with sufficient concurrency) that a write from one
+ thread will affect that in another, so the sequences will not be
+ written properly. To make sure all sequences are mutated, the writes
+ will transform the sequence to lower-case.
+ """
+
+ tmp_dir = self.tmp_dir
+
+ tmp_fasta = os.path.join(tmp_dir, 'genes_write.fasta')
+ shutil.copyfile('data/genes.fasta', tmp_fasta)
+
+ # Read in original file data
+ ref_seq_map = OrderedDict()
+ with Fasta('data/genes.fasta', as_raw=True, strict_bounds=True) as fasta:
+ for name, seq in fasta.records.items():
+ ref_seq_map[name] = seq[:].lower()
+
+ # Now write file, using many threads and simultaneously reading each sequence in pieces
+ with Fasta(tmp_fasta, as_raw=True, strict_bounds=True, mutable=True) as fasta:
+ threads = []
+ for name, seq in fasta.records.items():
+ threads.append(_ThreadWriteSequence(self.rand, name, seq))
+
+ for thread in threads:
+ thread.start()
+
+ for thread in threads:
+ thread.join()
+
+ fasta.faidx.file.flush()
+
+ # Now read written Fasta file, and compare it to the original
+ thread_write_seq_map = OrderedDict()
+ with Fasta(tmp_fasta, as_raw=True, strict_bounds=True) as fasta:
+ for name, seq in fasta.records.items():
+ thread_write_seq_map[name] = seq[:]
+
+ self.assertEqual(thread_write_seq_map, ref_seq_map)
+
+ def test_simultaneous_reads_and_writes(self):
+ """
+ Combine the above two tests to check that interleaved reads and writes don't conflict.
+ """
+
+ tmp_dir = self.tmp_dir
+
+ tmp_fasta = os.path.join(tmp_dir, 'genes_write.fasta')
+ shutil.copyfile('data/genes.fasta', tmp_fasta)
+
+ # Read in original file data
+ ref_seq_map = OrderedDict()
+ with Fasta('data/genes.fasta', as_raw=True, strict_bounds=True) as fasta:
+ for name, seq in fasta.records.items():
+ ref_seq_map[name] = seq[:].lower()
+
+ # Initialize map with fasta sequence names to enforce same ordering as 'ref_seq_map'
+ thread_result_lock = threading.Lock()
+ thread_read_seq_map = OrderedDict((name, None) for name in ref_seq_map)
+
+ # Now write file, using many threads and simultaneously reading each sequence in pieces
+ with Fasta(tmp_fasta, as_raw=True, strict_bounds=True, mutable=True) as fasta:
+ threads = []
+ for name, seq in fasta.records.items():
+ threads.append(_ThreadWriteSequence(self.rand, name, seq))
+ threads.append(_ThreadReadSequence(self.rand, thread_read_seq_map, thread_result_lock, name, seq))
+
+ for thread in threads:
+ thread.start()
+
+ for thread in threads:
+ thread.join()
+
+ fasta.faidx.file.flush()
+
+ # Now read written Fasta file, and compare it to the original
+ thread_write_seq_map = OrderedDict()
+ with Fasta(tmp_fasta, as_raw=True, strict_bounds=True) as fasta:
+ for name, seq in fasta.records.items():
+ thread_write_seq_map[name] = seq[:]
+
+ # Change read strings to lower-case (may be a mixture of lower and upper)
+ for name in ref_seq_map.keys():
+ thread_read_seq_map[name] = thread_read_seq_map[name].lower()
+
+ self.assertEqual(thread_write_seq_map, ref_seq_map)
+ self.assertEqual(thread_read_seq_map, ref_seq_map)
+
diff --git a/tests/test_feature_indexing.py b/tests/test_feature_indexing.py
index 968b787..ddcc322 100644
--- a/tests/test_feature_indexing.py
+++ b/tests/test_feature_indexing.py
@@ -2,9 +2,19 @@ import os
from os.path import getmtime
from pyfaidx import Faidx, FastaIndexingError
from nose.tools import raises
+from nose.plugins.skip import Skip, SkipTest
from unittest import TestCase
-from tempfile import NamedTemporaryFile
+from tempfile import NamedTemporaryFile, mkdtemp
import time
+import platform
+import shutil
+
+try:
+ from unittest import mock
+except ImportError:
+ import mock
+
+import six.moves.builtins as builtins
path = os.path.dirname(__file__)
os.chdir(path)
@@ -73,6 +83,9 @@ class TestIndexing(TestCase):
""" Makes all full-length lines short and checks that error is raised
in all appropriate circumstances.
"""
+ # http://stackoverflow.com/a/23212515/717419
+ if platform.system() == 'Windows':
+ raise SkipTest
indexed = []
with open('data/genes.fasta') as genes:
fasta = genes.readlines()
@@ -100,6 +113,9 @@ class TestIndexing(TestCase):
""" Makes all full-length lines long and checks that error is raised
in all appropriate circumstances.
"""
+ # http://stackoverflow.com/a/23212515/717419
+ if platform.system() == 'Windows':
+ raise SkipTest
indexed = []
with open('data/genes.fasta') as genes:
fasta = genes.readlines()
@@ -127,6 +143,9 @@ class TestIndexing(TestCase):
""" Makes all full-length lines blank and checks that error is raised
in all appropriate circumstances.
"""
+ # http://stackoverflow.com/a/23212515/717419
+ if platform.system() == 'Windows':
+ raise SkipTest
indexed = []
with open('data/genes.fasta') as genes:
fasta = genes.readlines()
@@ -171,3 +190,69 @@ class TestIndexing(TestCase):
result_index = open(index_file).read()
os.remove('data/issue_83.fasta.fai')
assert result_index == expect_index
+
+ def test_build_issue_96_fail_build_faidx(self):
+ """ Ensure that the fasta file is closed if construction of the 'Faidx' file
+ when attempting to build an index.
+ See mdshw5/pyfaidx#96
+ """
+ tmp_dir = mkdtemp()
+ try:
+ fasta_path = os.path.join(tmp_dir, 'issue_96.fasta')
+ # Write simple fasta file with inconsistent sequence line lengths,
+ # so building an index raises a 'FastaIndexingError'
+ with open(fasta_path, 'w') as fasta_out:
+ fasta_out.write(">seq1\nCTCCGGGCCCAT\nAACACTTGGGGGTAGCTAAAGTGAA\nATAAAGCCTAAA\n")
+
+ builtins_open = builtins.open
+
+ opened_files=[]
+ def test_open(*args, **kwargs):
+ f = builtins_open(*args, **kwargs)
+ opened_files.append(f)
+ return f
+
+ with mock.patch('six.moves.builtins.open', side_effect=test_open):
+ try:
+ Faidx(fasta_path)
+ self.fail("Faidx construction should fail with 'FastaIndexingError'.")
+ except FastaIndexingError:
+ pass
+ self.assertTrue(all(f.closed for f in opened_files))
+ finally:
+ shutil.rmtree(tmp_dir)
+
+ def test_build_issue_96_fail_read_malformed_index_duplicate_key(self):
+ """ Ensure that the fasta file is closed if construction of the 'Faidx' file
+ fails when attempting to read a pre-existing index. The index is malformed because
+ it contains mulitple occurrences of the same index.
+ See mdshw5/pyfaidx#96
+ """
+ tmp_dir = mkdtemp()
+ try:
+ fasta_path = os.path.join(tmp_dir, 'issue_96.fasta')
+ faidx_path = os.path.join(tmp_dir, 'issue_96.fasta.fai')
+ # Write simple fasta file
+ with open(fasta_path, 'w') as fasta_out:
+ fasta_out.write(">seq1\nCTCCGGGCCCAT\nATAAAGCCTAAA\n")
+ with open(faidx_path, 'w') as faidx_out:
+ faidx_out.write("seq1\t24\t6\t12\t13\nseq1\t24\t6\t12\t13\n")
+
+ builtins_open = builtins.open
+
+ opened_files=[]
+ def test_open(*args, **kwargs):
+ f = builtins_open(*args, **kwargs)
+ opened_files.append(f)
+ return f
+
+ with mock.patch('six.moves.builtins.open', side_effect=test_open):
+ try:
+ Faidx(fasta_path)
+ self.fail("Faidx construction should fail with 'ValueError'.")
+ except ValueError:
+ pass
+ self.assertTrue(all(f.closed for f in opened_files))
+ finally:
+ shutil.rmtree(tmp_dir)
+
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..a08d4ce
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,27 @@
+[tox]
+envlist = init, py26, py27, py33, py34, py35, pypy, pypy3, final
+
+[testenv]
+deps = nose
+ mock; python_version < '3.3'
+ coverage
+ nose-cov
+ biopython
+ pysam; python_version > '2.6' and platform_python_implementation != 'PyPy'
+ pyvcf
+commands = nosetests --with-cov --cov-report term-missing --cov {envsitepackagesdir}/pyfaidx -P tests
+ coverage combine
+ bash -c 'mv {toxinidir}/.coverage {toxworkdir}/.coverage.{envname}'
+whitelist_externals = bash
+
+[testenv:init]
+basepython = python2.7
+commands = coverage erase
+ bash -c 'rm -rf {toxworkdir}/.coverage.*'
+ python tests/data/download_gene_fasta.py
+
+[testenv:final]
+basepython = python2.7
+commands = coverage combine {toxworkdir}
+ coverage report -m
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pyfaidx.git
More information about the debian-med-commit
mailing list