[med-svn] [python-pyfaidx] 01/06: New upstream version 0.4.8.1

Andreas Tille tille at debian.org
Sun Dec 11 17:10:34 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository python-pyfaidx.

commit c24135f0475e1cbf6907a5d4a39de3ae3c4b7e30
Author: Andreas Tille <tille at debian.org>
Date:   Sun Dec 11 17:36:16 2016 +0100

    New upstream version 0.4.8.1
---
 .coveragerc                         |   8 +-
 .gitignore                          |   7 +-
 .travis.yml                         |  13 +-
 README.rst                          |  21 +++-
 appveyor.yml                        |  31 +++++
 ci/appveyor/vcvars64.bat            |   1 +
 dev-requirements.txt                |  11 +-
 pyfaidx/__init__.py                 | 233 +++++++++++++++++++-----------------
 pyfaidx/cli.py                      |   5 +-
 setup.py                            |   7 +-
 tests/test_FastaRecord.py           |  13 +-
 tests/test_FastaVariant.py          |   1 +
 tests/test_Fasta_synchronization.py | 218 +++++++++++++++++++++++++++++++++
 tests/test_feature_indexing.py      |  87 +++++++++++++-
 tox.ini                             |  27 +++++
 15 files changed, 545 insertions(+), 138 deletions(-)

diff --git a/.coveragerc b/.coveragerc
index dba015f..bd72b98 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,9 +1,15 @@
 [run]
+parallel = True
 omit =
-    */python?.?/*
     */lib-python/?.?/*.py
     */lib_pypy/_*.py
     */site-packages/ordereddict.py
     */site-packages/nose/*
     */site-packages/six/*
     */unittest2/*
+
+[paths]
+source =
+    pyfaidx
+    .tox/*/lib/python*/site-packages/pyfaidx
+    .tox/*/site-packages/pyfaidx
diff --git a/.gitignore b/.gitignore
index 22e1f11..cc3cdad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,9 @@ __pycache__
 *.pyc
 .project
 .pydevproject
-
+.idea
+*.egg-info
+.tox
+.coverage
+.coverage.*
+tests/data/chr22*
diff --git a/.travis.yml b/.travis.yml
index 252e2f3..eaf605b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,14 +5,13 @@ python:
     - '3.5'
     - '3.4'
     - '3.3'
-    - '3.2'
     - '2.7'
     - '2.6'
-    - pypy
-    - pypy3
+    - 'pypy'
+    - 'pypy3'
 install:
-    - pip wheel -f wheelhouse coveralls biopython cython pysam pyvcf || true
-    - pip install -f wheelhouse biopython cython pysam pyfasta coveralls pyvcf || true
+    - pip wheel -f wheelhouse coverage biopython cython pysam pyvcf || true
+    - pip install -f wheelhouse biopython cython pysam pyfasta coverage pyvcf || true
     - python setup.py install
     - if [ ! -f samtools-1.2 ]; then wget -q -O - https://github.com/samtools/samtools/releases/download/1.2/samtools-1.2.tar.bz2 | tar -xjv; fi
     - cd samtools-1.2
@@ -36,11 +35,13 @@ deploy:
 matrix:
   allow_failures:
     - python: 'nightly'
+    - python: 'pypy3'
+  fast_finish: true
 cache:
     directories:
         - tests/data
         - samtools-1.2
         - wheelhouse
 after_success:
-    - coveralls
+    - bash <(curl -s https://codecov.io/bash)
     - if [ $TRAVIS_PYTHON_VERSION == '3.4' ] && [ $TRAVIS_TAG ]; then python scripts/benchmark.py 1000; fi
diff --git a/README.rst b/README.rst
index 4b8d96c..1d39ff8 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,4 @@
-|Travis| |PyPI| |Landscape| |Coveralls| |Depsy|
+|Travis| |PyPI| |Landscape| |Coverage| |Depsy| |Appveyor| |Flattr|
 
 Description
 -----------
@@ -265,7 +265,7 @@ The FastaVariant class provides a way to integrate single nucleotide variant cal
     >22:16042791-16042800
     TCATAGGACA
 
-    >>> consensus = FastaVariant('tests/data/chr22.fasta', 'tests/data/chr22.vcf.gz', het=True, hom=True, call_filter='GT == "0/1"')
+    >>> consensus = FastaVariant('tests/data/chr22.fasta', 'tests/data/chr22.vcf.gz', sample='NA06984', het=True, hom=True, call_filter='GT == "0/1"')
     >>> consensus['22'].variant_sites
     (16042793, 29187373, 29187448, 29194610, 29821332)
 
@@ -310,6 +310,7 @@ cli script: faidx
       -m, --mask-with-default-seq
                             mask the FASTA file using --default-seq default: False
       -M, --mask-by-case    mask the FASTA file by changing to lowercase. default: False
+      --no-rebuild          do not rebuild the .fai index even if it is out of date. default: False
       --version             print pyfaidx version number
 
 Examples:
@@ -423,7 +424,7 @@ Examples:
     AGCTTCCCTGTGGTTTCCCGAGGCTTCCTTGCTTCCCGCTCTGCGAGGAGCCTTTCATCCGAAGGCGGGA
     .......
 
-    
+
 
     $ faidx --size-range 5500,6000 -i chromsizes tests/data/genes.fasta
     NM_000465.3	5523
@@ -510,8 +511,16 @@ Comprehensive Cancer Center in the Department of Oncology.
    :target: https://landscape.io/github/mdshw5/pyfaidx/master
    :alt: Code Health
 
-.. |Coveralls| image:: https://coveralls.io/repos/mdshw5/pyfaidx/badge.svg?branch=master
-   :target: https://coveralls.io/r/mdshw5/pyfaidx?branch=master
-   
+.. |Coverage| image:: https://codecov.io/gh/mdshw5/pyfaidx/branch/master/graph/badge.svg
+   :target: https://codecov.io/gh/mdshw5/pyfaidx
+
 .. |Depsy| image:: http://depsy.org/api/package/pypi/pyfaidx/badge.svg
    :target: http://depsy.org/package/python/pyfaidx
+
+.. |Appveyor| image:: https://ci.appveyor.com/api/projects/status/80ihlw30a003596w?svg=true
+   :target: https://ci.appveyor.com/project/mdshw5/pyfaidx
+   
+.. |Flattr| image:: http://button.flattr.com/flattr-badge-large.png
+   :target: https://flattr.com/submit/auto?fid=po00kq&url=https%3A%2F%2Fgithub.com%2Fmdshw5%2Fpyfaidx
+   
+
diff --git a/appveyor.yml b/appveyor.yml
new file mode 100644
index 0000000..d0147cc
--- /dev/null
+++ b/appveyor.yml
@@ -0,0 +1,31 @@
+environment:
+
+  matrix:
+
+    # For Python versions available on Appveyor, see
+    # http://www.appveyor.com/docs/installed-software#python
+
+    - PYTHON: "C:\\Python27"
+    - PYTHON: "C:\\Python33"
+    - PYTHON: "C:\\Python34"
+    - PYTHON: "C:\\Python35"
+    - PYTHON: "C:\\Python27-x64"
+    - PYTHON: "C:\\Python33-x64"
+      DISTUTILS_USE_SDK: "1"
+    - PYTHON: "C:\\Python34-x64"
+      DISTUTILS_USE_SDK: "1"
+    - PYTHON: "C:\\Python35-x64"
+
+install:
+  # Fix for problem building extensions for x64 under Python 3.3 and 3.4
+  # See: http://help.appveyor.com/discussions/problems/4278-cant-build-some-c-extensions-with-python-34-x64
+  # Used same solution as Matplotlib: https://github.com/matplotlib/matplotlib/blob/master/appveyor.yml
+  - cmd: copy ci\appveyor\vcvars64.bat "C:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\amd64"
+
+  # We need wheel installed to build wheels
+  - "%PYTHON%\\python.exe -m pip install -r dev-requirements.txt"
+
+build: off
+
+test_script:
+  - "%PYTHON%\\python.exe setup.py nosetests"
diff --git a/ci/appveyor/vcvars64.bat b/ci/appveyor/vcvars64.bat
new file mode 100644
index 0000000..ef77b9d
--- /dev/null
+++ b/ci/appveyor/vcvars64.bat
@@ -0,0 +1 @@
+CALL "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64
diff --git a/dev-requirements.txt b/dev-requirements.txt
index cd933d6..76a6d1b 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,6 +1,5 @@
-Pygments>=1
-collective.checkdocs>=0.2
-docutils>=0.12
-six>=1.7.3
-nose==1.3.7
-biopython==1.65
+six
+nose
+biopython
+setuptools >= 0.7
+mock; python_version < '3.3'
\ No newline at end of file
diff --git a/pyfaidx/__init__.py b/pyfaidx/__init__.py
index d21fe47..9e34bd2 100644
--- a/pyfaidx/__init__.py
+++ b/pyfaidx/__init__.py
@@ -7,7 +7,7 @@ Fasta file -> Faidx -> Fasta -> FastaRecord -> Sequence
 from __future__ import division
 import os
 from os.path import getmtime
-from six import PY2, PY3, string_types
+from six import PY2, PY3, string_types, integer_types
 from six.moves import zip_longest
 try:
     from collections import OrderedDict
@@ -18,10 +18,11 @@ import re
 import string
 import warnings
 from math import ceil
+from threading import Lock
 
 dna_bases = re.compile(r'([ACTGNactgnYRWSKMDVHBXyrwskmdvhbx]+)')
 
-__version__ = '0.4.7.1'
+__version__ = '0.4.8.1'
 
 
 class FastaIndexingError(Exception):
@@ -137,7 +138,7 @@ class Sequence(object):
                 start = self_start + slice_start
                 end = self_start + slice_stop + correction_factor
             return self.__class__(self.name, self.seq[n], start, end, self.comp)
-        elif isinstance(n, int):
+        elif isinstance(n, integer_types):
             if n < 0:
                 n = len(self) + n
             if self.start:
@@ -268,7 +269,7 @@ class Faidx(object):
                  as_raw=False, strict_bounds=False, read_ahead=None,
                  mutable=False, split_char=None, filt_function=None,
                  one_based_attributes=True,
-                 sequence_always_upper=False):
+                 sequence_always_upper=False, rebuild=True):
         """
         filename: name of fasta file
         key_function: optional callback function which should return a unique
@@ -291,23 +292,32 @@ class Faidx(object):
         self.one_based_attributes = one_based_attributes
         self.sequence_always_upper = sequence_always_upper
         self.index = OrderedDict()
+        self.lock = Lock()
         self.buffer = dict((('seq', None), ('name', None), ('start', None), ('end', None)))
-        if not read_ahead or isinstance(read_ahead, int):
+        if not read_ahead or isinstance(read_ahead, integer_types):
             self.read_ahead = read_ahead
-        elif not isinstance(read_ahead, int):
+        elif not isinstance(read_ahead, integer_types):
             raise ValueError("read_ahead value must be int, not {0}".format(type(read_ahead)))
 
         self.mutable = mutable
-
-        if os.path.exists(self.indexname) and getmtime(self.indexname) >= getmtime(self.filename):
-            self.read_fai(split_char)
-        else:
+        with self.lock:  # lock around index generation so only one thread calls method
             try:
-                self.build_index()
+                if os.path.exists(self.indexname) and getmtime(self.indexname) >= getmtime(self.filename):
+                    self.read_fai(split_char)
+                elif os.path.exists(self.indexname) and getmtime(self.indexname) < getmtime(self.filename) and not rebuild:
+                    self.read_fai(split_char)
+                    warnings.warn("Index file {0} is older than FASTA file {1}.".format(self.indexname, self.filename), RuntimeWarning)
+                else:
+                    self.build_index()
+                    self.read_fai(split_char)
             except FastaIndexingError as e:
                 os.remove(self.indexname)
+                self.file.close()
                 raise FastaIndexingError(e)
-            self.read_fai(split_char)
+            except Exception:
+                # Handle potential exceptions other than 'FastaIndexingError'
+                self.file.close()
+                raise
 
     def __contains__(self, region):
         if not self.buffer['name']:
@@ -349,66 +359,70 @@ class Faidx(object):
                 self.index.pop(dup, None)
 
     def build_index(self):
-        with open(self.filename, 'r') as fastafile:
-            with open(self.indexname, 'w') as indexfile:
-                rname = None  # reference sequence name
-                offset = 0  # binary offset of end of current line
-                rlen = 0  # reference character length
-                blen = None  # binary line length (includes newline)
-                clen = None  # character line length
-                bad_lines = []  # lines > || < than blen
-                thisoffset = offset
-                for i, line in enumerate(fastafile):
-                    line_blen = len(line)
-                    line_clen = len(line.rstrip('\n\r'))
-                    # write an index line
-                    if line[0] == '>':
-                        valid_entry = check_bad_lines(rname, bad_lines, i - 1)
-                        if valid_entry and i > 0:
-                            indexfile.write("{0}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(rname, rlen, thisoffset, clen, blen))
-                        elif not valid_entry:
-                            raise FastaIndexingError("Line length of fasta"
-                                                     " file is not "
-                                                     "consistent! "
-                                                     "Inconsistent line found in >{0} at "
-                                                     "line {1:n}.".format(rname, bad_lines[0][0] + 1))
-                        blen = None
-                        rlen = 0
-                        clen = None
-                        bad_lines = []
-                        try:  # must catch empty deflines
-                            rname = line.rstrip('\n\r')[1:].split()[0]  # remove comments
-                        except IndexError:
-                            raise FastaIndexingError("Bad sequence name %s at line %s." % (line.rstrip('\n\r'), str(i)))
-                        offset += line_blen
-                        thisoffset = offset
-                    else:  # check line and advance offset
-                        if not blen:
-                            blen = line_blen
-                        if not clen:
-                            clen = line_clen
-                        # only one short line should be allowed
-                        # before we hit the next header, and it
-                        # should be the last line in the entry
-                        if line_blen != blen or line_blen == 1:
-                            bad_lines.append((i, line_blen))
-                        offset += line_blen
-                        rlen += line_clen
-
-                # write the final index line
-                valid_entry = check_bad_lines(rname, bad_lines, i)  # advance index since we're at the end of the file
-                if not valid_entry:
-                    raise FastaIndexingError("Line length of fasta"
-                                             " file is not "
-                                             "consistent! "
-                                             "Inconsistent line found in >{0} at "
-                                             "line {1:n}.".format(rname, bad_lines[0][0] + 1))
-                indexfile.write("{0:s}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(rname, rlen, thisoffset, clen, blen))
+        try:
+            with open(self.filename, 'r') as fastafile:
+                with open(self.indexname, 'w') as indexfile:
+                    rname = None  # reference sequence name
+                    offset = 0  # binary offset of end of current line
+                    rlen = 0  # reference character length
+                    blen = None  # binary line length (includes newline)
+                    clen = None  # character line length
+                    bad_lines = []  # lines > || < than blen
+                    thisoffset = offset
+                    for i, line in enumerate(fastafile):
+                        line_blen = len(line)
+                        line_clen = len(line.rstrip('\n\r'))
+                        # write an index line
+                        if line[0] == '>':
+                            valid_entry = check_bad_lines(rname, bad_lines, i - 1)
+                            if valid_entry and i > 0:
+                                indexfile.write("{0}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(rname, rlen, thisoffset, clen, blen))
+                            elif not valid_entry:
+                                raise FastaIndexingError("Line length of fasta"
+                                                         " file is not "
+                                                         "consistent! "
+                                                         "Inconsistent line found in >{0} at "
+                                                         "line {1:n}.".format(rname, bad_lines[0][0] + 1))
+                            blen = None
+                            rlen = 0
+                            clen = None
+                            bad_lines = []
+                            try:  # must catch empty deflines
+                                rname = line.rstrip('\n\r')[1:].split()[0]  # remove comments
+                            except IndexError:
+                                raise FastaIndexingError("Bad sequence name %s at line %s." % (line.rstrip('\n\r'), str(i)))
+                            offset += line_blen
+                            thisoffset = offset
+                        else:  # check line and advance offset
+                            if not blen:
+                                blen = line_blen
+                            if not clen:
+                                clen = line_clen
+                            # only one short line should be allowed
+                            # before we hit the next header, and it
+                            # should be the last line in the entry
+                            if line_blen != blen or line_blen == 1:
+                                bad_lines.append((i, line_blen))
+                            offset += line_blen
+                            rlen += line_clen
+
+                    # write the final index line
+                    valid_entry = check_bad_lines(rname, bad_lines, i)  # advance index since we're at the end of the file
+                    if not valid_entry:
+                        raise FastaIndexingError("Line length of fasta"
+                                                 " file is not "
+                                                 "consistent! "
+                                                 "Inconsistent line found in >{0} at "
+                                                 "line {1:n}.".format(rname, bad_lines[0][0] + 1))
+                    indexfile.write("{0:s}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(rname, rlen, thisoffset, clen, blen))
+        except IOError:
+            raise IOError("%s may not be writable. Please use Fasta(rebuild=False), Faidx(rebuild=False) or faidx --no-rebuild." % self.indexname)
 
     def write_fai(self):
-        with open(self.indexname, 'w') as outfile:
-            for k, v in self.index.items():
-                outfile.write('\t'.join([k, str(v)]))
+        with self.lock:
+            with open(self.indexname, 'w') as outfile:
+                for k, v in self.index.items():
+                    outfile.write('\t'.join([k, str(v)]))
 
     def from_buffer(self, start, end):
         i_start = start - self.buffer['start']  # want [0, 1) coordinates from [1, 1] coordinates
@@ -444,8 +458,8 @@ class Faidx(object):
         4. Seek to start position, taking newlines into account
         5. Read to end position, return sequence
         """
-        assert isinstance(start, int)
-        assert isinstance(end, int)
+        assert isinstance(start, integer_types)
+        assert isinstance(end, integer_types)
         try:
             i = self.index[rname]
         except KeyError:
@@ -463,20 +477,23 @@ class Faidx(object):
         newlines_inside = newlines_to_end - newlines_before
         seq_blen = newlines_inside + seq_len
         bstart = i.offset + newlines_before + start0
-        self.file.seek(bstart)
-
-        if bstart + seq_blen > i.bend and not self.strict_bounds:
-            seq_blen = i.bend - bstart
-        elif bstart + seq_blen > i.bend and self.strict_bounds:
-            raise FetchError("Requested end coordinate {0:n} outside of {1}. "
-                             "\n".format(end, rname))
-        if seq_blen > 0:
-            seq = self.file.read(seq_blen).decode()
-        elif seq_blen <= 0 and not self.strict_bounds:
-            seq = ''
-        elif seq_blen <= 0 and self.strict_bounds:
-            raise FetchError("Requested coordinates start={0:n} end={1:n} are "
-                             "invalid.\n".format(start, end))
+
+        with self.lock:
+            self.file.seek(bstart)
+
+            if bstart + seq_blen > i.bend and not self.strict_bounds:
+                seq_blen = i.bend - bstart
+            elif bstart + seq_blen > i.bend and self.strict_bounds:
+                raise FetchError("Requested end coordinate {0:n} outside of {1}. "
+                                 "\n".format(end, rname))
+            if seq_blen > 0:
+                seq = self.file.read(seq_blen).decode()
+            elif seq_blen <= 0 and not self.strict_bounds:
+                seq = ''
+            elif seq_blen <= 0 and self.strict_bounds:
+                raise FetchError("Requested coordinates start={0:n} end={1:n} are "
+                                 "invalid.\n".format(start, end))
+
         if not internals:
             return seq.replace('\n', '')
         else:
@@ -508,21 +525,23 @@ class Faidx(object):
         if not self.mutable:
             raise IOError("Write attempted for immutable Faidx instance. Set mutable=True to modify original FASTA.")
         file_seq, internals = self.from_file(rname, start, end, internals=True)
-        if len(seq) != len(file_seq) - internals['newlines_inside']:
-            raise IOError("Specified replacement sequence needs to have the same length as original.")
-        elif len(seq) == len(file_seq) - internals['newlines_inside']:
-            line_len = internals['i'].lenc
-            self.file.seek(internals['bstart'])
-            if internals['newlines_inside'] == 0:
-                self.file.write(seq.encode())
-            elif internals['newlines_inside'] > 0:
-                n = 0
-                m = file_seq.index('\n')
-                while m < len(seq):
-                    self.file.write(''.join([seq[n:m], '\n']).encode())
-                    n = m
-                    m += line_len
-                self.file.write(seq[n:].encode())
+
+        with self.lock:
+            if len(seq) != len(file_seq) - internals['newlines_inside']:
+                raise IOError("Specified replacement sequence needs to have the same length as original.")
+            elif len(seq) == len(file_seq) - internals['newlines_inside']:
+                line_len = internals['i'].lenc
+                self.file.seek(internals['bstart'])
+                if internals['newlines_inside'] == 0:
+                    self.file.write(seq.encode())
+                elif internals['newlines_inside'] > 0:
+                    n = 0
+                    m = file_seq.index('\n')
+                    while m < len(seq):
+                        self.file.write(''.join([seq[n:m], '\n']).encode())
+                        n = m
+                        m += line_len
+                    self.file.write(seq[n:].encode())
 
     def close(self):
         self.__exit__()
@@ -556,7 +575,7 @@ class FastaRecord(object):
                     start = len(self) + start
                 return self._fa.get_seq(self.name, start + 1, stop)[::step]
 
-            elif isinstance(n, int):
+            elif isinstance(n, integer_types):
                 if n < 0:
                     n = len(self) + n
                 return self._fa.get_seq(self.name, n + 1, n + 1)
@@ -632,7 +651,7 @@ class MutableFastaRecord(FastaRecord):
                     start = len(self) + start
                 self._fa.faidx.to_file(self.name, start + 1, stop, value)
 
-            elif isinstance(n, int):
+            elif isinstance(n, integer_types):
                 if n < 0:
                     n = len(self) + n
                 return self._fa.faidx.to_file(self.name, n + 1, n + 1, value)
@@ -644,7 +663,7 @@ class Fasta(object):
     def __init__(self, filename, default_seq=None, key_function=None, as_raw=False,
                  strict_bounds=False, read_ahead=None, mutable=False, split_char=None,
                  filt_function=None, one_based_attributes=True,
-                 sequence_always_upper=False):
+                 sequence_always_upper=False, rebuild=True):
         """
         An object that provides a pygr compatible interface.
         filename: name of fasta file
@@ -655,7 +674,7 @@ class Fasta(object):
                            default_seq=default_seq, strict_bounds=strict_bounds,
                            read_ahead=read_ahead, mutable=mutable, split_char=split_char,
                            filt_function=filt_function, one_based_attributes=one_based_attributes,
-                           sequence_always_upper=sequence_always_upper)
+                           sequence_always_upper=sequence_always_upper, rebuild=rebuild)
         self.keys = self.faidx.index.keys
         if not self.mutable:
             self.records = dict([(rname, FastaRecord(rname, self)) for rname in self.keys()])
@@ -668,7 +687,7 @@ class Fasta(object):
 
     def __getitem__(self, rname):
         """Return a chromosome by its name, or its numerical index."""
-        if isinstance(rname, int):
+        if isinstance(rname, integer_types):
             rname = tuple(self.keys())[rname]
         try:
             return self.records[rname]
@@ -759,7 +778,7 @@ class FastaVariant(Fasta):
         else:
             seq_mut = list(seq.seq)
             del seq.seq
-        var = self.vcf.fetch(name, start, end)
+        var = self.vcf.fetch(name, start - 1, end)
         for record in var:
             if record.is_snp:  # skip indels
                 sample = record.genotype(self.sample)
diff --git a/pyfaidx/cli.py b/pyfaidx/cli.py
index 9e55d0e..87c1c32 100644
--- a/pyfaidx/cli.py
+++ b/pyfaidx/cli.py
@@ -13,14 +13,14 @@ def write_sequence(args):
     if ext:
         ext = ext[1:]  # remove the dot from extension
     filt_function = re.compile(args.regex).search
-    fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter, filt_function=filt_function)
+    fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter, filt_function=filt_function, rebuild=not args.no_rebuild)
 
     regions_to_fetch, split_function = split_regions(args)
     if not regions_to_fetch:
         regions_to_fetch = fasta.keys()
     if args.invert_match:
         sequences_to_exclude = set([split_function(region)[0] for region in regions_to_fetch])
-        fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter)
+        fasta = Fasta(args.fasta, default_seq=args.default_seq, strict_bounds=not args.lazy, split_char=args.delimiter, rebuild=not args.no_rebuild)
         regions_to_fetch = (key for key in fasta.keys() if key not in sequences_to_exclude)
         split_function = ucsc_split
 
@@ -152,6 +152,7 @@ def main(ext_args=None):
     masking = parser.add_mutually_exclusive_group()
     masking.add_argument('-m', '--mask-with-default-seq', action="store_true", default=False, help="mask the FASTA file using --default-seq default: %(default)s")
     masking.add_argument('-M', '--mask-by-case', action="store_true", default=False, help="mask the FASTA file by changing to lowercase. default: %(default)s")
+    parser.add_argument('--no-rebuild', action="store_true", default=False, help="do not rebuild the .fai index even if it is out of date. default: %(default)s")
     parser.add_argument('--version', action="version", version=__version__, help="print pyfaidx version number")
     # print help usage if no arguments are supplied
     if len(sys.argv)==1 and not ext_args:
diff --git a/setup.py b/setup.py
index 915f2da..7926f06 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,8 @@
 from setuptools import setup
+from io import open
 import sys
 
-install_requires = ['six']
+install_requires = ['six', 'setuptools >= 0.7']
 if sys.version_info[0] == 2 and sys.version_info[1] == 6:
     install_requires.extend(['ordereddict', 'argparse'])
 
@@ -17,13 +18,13 @@ def get_version(string):
 setup(
     name='pyfaidx',
     provides='pyfaidx',
-    version=get_version(open('pyfaidx/__init__.py').read()),
+    version=get_version(open('pyfaidx/__init__.py', encoding='utf-8').read()),
     author='Matthew Shirley',
     author_email='mdshw5 at gmail.com',
     url='http://mattshirley.com',
     description='pyfaidx: efficient pythonic random '
                 'access to fasta subsequences',
-    long_description=open('README.rst').read(),
+    long_description=open('README.rst', encoding='utf-8').read(),
     license='BSD',
     packages=['pyfaidx'],
     install_requires=install_requires,
diff --git a/tests/test_FastaRecord.py b/tests/test_FastaRecord.py
index f080f03..2316e7a 100644
--- a/tests/test_FastaRecord.py
+++ b/tests/test_FastaRecord.py
@@ -1,8 +1,10 @@
 import os
+import sys
 from pyfaidx import Fasta
 from tempfile import NamedTemporaryFile
 from unittest import TestCase
 from nose.tools import raises
+from difflib import Differ
 
 path = os.path.dirname(__file__)
 os.chdir(path)
@@ -48,17 +50,17 @@ class TestFastaRecord(TestCase):
         """ Check for pathogenic FastaRecord.long_name behavior in mdshw5/pyfaidx#62 """
         deflines = []
         line_len = None
-        with open('data/genes.fasta') as fasta_file:
-            with open('data/issue_62.fa', 'w') as fasta_uniform_len:
+        with open('data/genes.fasta', 'rb') as fasta_file:
+            with open('data/issue_62.fa', 'wb') as fasta_uniform_len:
                 for line in fasta_file:
-                    if line[0] == '>':
-                        deflines.append(line[1:-1])
+                    if line.startswith(b'>'):
+                        deflines.append(line[1:-1].decode('ascii'))
                         fasta_uniform_len.write(line)
                     elif line_len is None:
                         line_len = len(line)
                         fasta_uniform_len.write(line)
                     elif line_len > len(line):
-                        fasta_uniform_len.write(line.rstrip() + 'N' * (line_len - len(line)) + '\n')
+                        fasta_uniform_len.write(line.rstrip() + b'N' * (line_len - len(line)) + b'\n')
                     else:
                         fasta_uniform_len.write(line)
         fasta = Fasta('data/issue_62.fa', as_raw=True)
@@ -70,6 +72,7 @@ class TestFastaRecord(TestCase):
             os.remove('data/issue_62.fa.fai')
         except EnvironmentError:
             pass
+        sys.stdout.writelines(tuple(Differ().compare(deflines, long_names)))
         assert deflines == long_names
 
 class TestMutableFastaRecord(TestCase):
diff --git a/tests/test_FastaVariant.py b/tests/test_FastaVariant.py
index 981e5b3..d945ee7 100644
--- a/tests/test_FastaVariant.py
+++ b/tests/test_FastaVariant.py
@@ -54,6 +54,7 @@ class TestFastaVariant(TestCase):
         try:
             fasta = FastaVariant('data/chr22.fasta', 'data/chr22.vcf.gz', hom=True, het=True, as_raw=True)
             ref = Fasta('data/chr22.fasta', as_raw=True)
+            print([(ref['22'][pos-1], fasta['22'][pos-1]) for pos in fasta['22'].variant_sites])
             assert all(ref['22'][pos-1] != fasta['22'][pos-1] for pos in fasta['22'].variant_sites)
         except (ImportError, IOError):
             raise SkipTest
diff --git a/tests/test_Fasta_synchronization.py b/tests/test_Fasta_synchronization.py
new file mode 100644
index 0000000..a173418
--- /dev/null
+++ b/tests/test_Fasta_synchronization.py
@@ -0,0 +1,218 @@
+import os
+try:
+    from collections import OrderedDict
+except ImportError: #python 2.6
+    from ordereddict import OrderedDict
+import threading
+from pyfaidx import Fasta
+import random
+import tempfile
+import time
+import shutil
+from unittest import TestCase
+
+path = os.path.dirname(__file__)
+os.chdir(path)
+
+
+class _ThreadReadSequence(threading.Thread):
+    def __init__(self, rand, result_map, result_lock, name, seq):
+        super(_ThreadReadSequence, self).__init__()
+
+        seq_len = len(seq)
+        sub_seq_slices = list(slice(i, min(i + 20, seq_len)) for i in range(0, seq_len, 20))
+        random.shuffle(sub_seq_slices, rand.random)
+
+        self.result_map = result_map
+        self.result_lock = result_lock
+        self.name = name
+        self.seq = seq
+        self.sub_seq_slices = sub_seq_slices
+
+    def run(self):
+        name = self.name
+        seq = self.seq
+
+        sub_seqs = [''] * len(self.sub_seq_slices)
+        for sub_seq_slice in self.sub_seq_slices:
+            sub_seqs[sub_seq_slice.start//20] = seq[sub_seq_slice]
+            time.sleep(0)
+
+        # Put sub-sequences in correct order
+        seq_str = ''.join(sub_seqs)
+
+        with self.result_lock:
+            self.result_map[name] = seq_str
+
+
+class _ThreadWriteSequence(threading.Thread):
+    def __init__(self, rand, name, seq):
+        super(_ThreadWriteSequence, self).__init__()
+
+        seq_len = len(seq)
+        sub_seq_slices = list(slice(i, min(i + 20, seq_len)) for i in range(0, seq_len, 20))
+        random.shuffle(sub_seq_slices, rand.random)
+
+        self.name = name
+        self.seq = seq
+        self.sub_seq_slices = sub_seq_slices
+
+    def run(self):
+        seq = self.seq
+        seq_len = len(seq)
+        seq_str = seq[:].lower()
+
+        for sub_seq_slice in self.sub_seq_slices:
+            try:
+                seq[sub_seq_slice] = seq_str[sub_seq_slice]
+                time.sleep(0)
+            except Exception:
+                # Conflicting simultaneous writes are likely to cause exceptions
+                # We test for the expected string at the end, so ignore interim
+                # failures.
+                pass
+
+
+class TestFastaIntIndex(TestCase):
+    def setUp(self):
+        self.longMessage = True
+        self.maxDiff = None
+        self.tmp_dir = tempfile.mkdtemp()
+        # Use a seeded random so orders are randomish within a test, but the same across test runs
+        self.rand = random.Random(8903423147)
+
+    def tearDown(self):
+        tmp_dir = getattr(self, 'tmp_dir', None)
+        if tmp_dir:
+            shutil.rmtree(tmp_dir)
+
+        try:
+            os.remove('data/genes.fasta.fai')
+        except EnvironmentError:
+            pass  # some tests may delete this file
+
+    def test_simultaneous_reads(self):
+        """
+        Test that each read of a sequence range is atomic.
+        To do this, spawn several threads to simultaneously read the sequences
+        in a Fasta file in pieces. If the reads are not atomic, then it is
+        reasonably likely (with sufficient concurrency) that a read from one
+        thread will affect that in another, so the sequences will not be
+        read properly.
+        """
+        # Read in original file data
+        ref_seq_map = OrderedDict()
+        with Fasta('data/genes.fasta', as_raw=True, strict_bounds=True) as fasta:
+            for name, seq in fasta.records.items():
+                ref_seq_map[name] = seq[:]
+
+        # Initialize map with fasta sequence names to enforce same ordering as 'ref_seq_map'
+        thread_result_lock = threading.Lock()
+        thread_read_seq_map = OrderedDict((name, None) for name in ref_seq_map)
+
+        # Read file again, using many threads and simultaneously reading each sequence in pieces
+        with Fasta('data/genes.fasta', as_raw=True, strict_bounds=True) as fasta:
+            threads = []
+            for name, seq in fasta.records.items():
+                threads.append(_ThreadReadSequence(self.rand, thread_read_seq_map, thread_result_lock, name, seq))
+
+            for thread in threads:
+                thread.start()
+
+            for thread in threads:
+                thread.join()
+
+        self.assertEqual(thread_read_seq_map, ref_seq_map)
+
+    def test_simultaneous_writes(self):
+        """
+        Test that each write of a sequence range is atomic.
+        To do this, spawn several threads to simultaneously write sequences
+        to a Fasta file in pieces. If the writes are not atomic, then it is
+        reasonably likely (with sufficient concurrency) that a write from one
+        thread will affect that in another, so the sequences will not be
+        written properly. To make sure all sequences are mutated, the writes
+        will transform the sequence to lower-case.
+        """
+
+        tmp_dir = self.tmp_dir
+
+        tmp_fasta = os.path.join(tmp_dir, 'genes_write.fasta')
+        shutil.copyfile('data/genes.fasta', tmp_fasta)
+
+        # Read in original file data
+        ref_seq_map = OrderedDict()
+        with Fasta('data/genes.fasta', as_raw=True, strict_bounds=True) as fasta:
+            for name, seq in fasta.records.items():
+                ref_seq_map[name] = seq[:].lower()
+
+        # Now write file, using many threads and simultaneously writing each sequence in pieces
+        with Fasta(tmp_fasta, as_raw=True, strict_bounds=True, mutable=True) as fasta:
+            threads = []
+            for name, seq in fasta.records.items():
+                threads.append(_ThreadWriteSequence(self.rand, name, seq))
+
+            for thread in threads:
+                thread.start()
+
+            for thread in threads:
+                thread.join()
+
+            fasta.faidx.file.flush()
+
+        # Now read written Fasta file, and compare it to the original
+        thread_write_seq_map = OrderedDict()
+        with Fasta(tmp_fasta, as_raw=True, strict_bounds=True) as fasta:
+            for name, seq in fasta.records.items():
+                thread_write_seq_map[name] = seq[:]
+
+        self.assertEqual(thread_write_seq_map, ref_seq_map)
+
+    def test_simultaneous_reads_and_writes(self):
+        """
+        Combine the above two tests to check that interleaved reads and writes don't conflict.
+        """
+
+        tmp_dir = self.tmp_dir
+
+        tmp_fasta = os.path.join(tmp_dir, 'genes_write.fasta')
+        shutil.copyfile('data/genes.fasta', tmp_fasta)
+
+        # Read in original file data
+        ref_seq_map = OrderedDict()
+        with Fasta('data/genes.fasta', as_raw=True, strict_bounds=True) as fasta:
+            for name, seq in fasta.records.items():
+                ref_seq_map[name] = seq[:].lower()
+
+        # Initialize map with fasta sequence names to enforce same ordering as 'ref_seq_map'
+        thread_result_lock = threading.Lock()
+        thread_read_seq_map = OrderedDict((name, None) for name in ref_seq_map)
+
+        # Now write file, using many threads and simultaneously reading and writing each sequence in pieces
+        with Fasta(tmp_fasta, as_raw=True, strict_bounds=True, mutable=True) as fasta:
+            threads = []
+            for name, seq in fasta.records.items():
+                threads.append(_ThreadWriteSequence(self.rand, name, seq))
+                threads.append(_ThreadReadSequence(self.rand, thread_read_seq_map, thread_result_lock, name, seq))
+
+            for thread in threads:
+                thread.start()
+
+            for thread in threads:
+                thread.join()
+
+            fasta.faidx.file.flush()
+
+        # Now read written Fasta file, and compare it to the original
+        thread_write_seq_map = OrderedDict()
+        with Fasta(tmp_fasta, as_raw=True, strict_bounds=True) as fasta:
+            for name, seq in fasta.records.items():
+                thread_write_seq_map[name] = seq[:]
+
+        # Change read strings to lower-case (may be a mixture of lower and upper)
+        for name in ref_seq_map.keys():
+            thread_read_seq_map[name] = thread_read_seq_map[name].lower()
+
+        self.assertEqual(thread_write_seq_map, ref_seq_map)
+        self.assertEqual(thread_read_seq_map, ref_seq_map)
+
diff --git a/tests/test_feature_indexing.py b/tests/test_feature_indexing.py
index 968b787..ddcc322 100644
--- a/tests/test_feature_indexing.py
+++ b/tests/test_feature_indexing.py
@@ -2,9 +2,19 @@ import os
 from os.path import getmtime
 from pyfaidx import Faidx, FastaIndexingError
 from nose.tools import raises
+from nose.plugins.skip import Skip, SkipTest
 from unittest import TestCase
-from tempfile import NamedTemporaryFile
+from tempfile import NamedTemporaryFile, mkdtemp
 import time
+import platform
+import shutil
+
+try:
+    from unittest import mock
+except ImportError:
+    import mock
+
+import six.moves.builtins as builtins
 
 path = os.path.dirname(__file__)
 os.chdir(path)
@@ -73,6 +83,9 @@ class TestIndexing(TestCase):
         """ Makes all full-length lines short and checks that error is raised
         in all appropriate circumstances.
         """
+        # http://stackoverflow.com/a/23212515/717419
+        if platform.system() == 'Windows':
+            raise SkipTest
         indexed = []
         with open('data/genes.fasta') as genes:
             fasta = genes.readlines()
@@ -100,6 +113,9 @@ class TestIndexing(TestCase):
         """ Makes all full-length lines long and checks that error is raised
         in all appropriate circumstances.
         """
+        # http://stackoverflow.com/a/23212515/717419
+        if platform.system() == 'Windows':
+            raise SkipTest
         indexed = []
         with open('data/genes.fasta') as genes:
             fasta = genes.readlines()
@@ -127,6 +143,9 @@ class TestIndexing(TestCase):
         """ Makes all full-length lines blank and checks that error is raised
         in all appropriate circumstances.
         """
+        # http://stackoverflow.com/a/23212515/717419
+        if platform.system() == 'Windows':
+            raise SkipTest
         indexed = []
         with open('data/genes.fasta') as genes:
             fasta = genes.readlines()
@@ -171,3 +190,69 @@ class TestIndexing(TestCase):
         result_index = open(index_file).read()
         os.remove('data/issue_83.fasta.fai')
         assert result_index == expect_index
+
+    def test_build_issue_96_fail_build_faidx(self):
+        """ Ensure that the fasta file is closed if construction of the 'Faidx' object
+        fails when attempting to build an index.
+        See mdshw5/pyfaidx#96
+        """
+        tmp_dir = mkdtemp()
+        try:
+            fasta_path = os.path.join(tmp_dir, 'issue_96.fasta')
+            # Write simple fasta file with inconsistent sequence line lengths,
+            # so building an index raises a 'FastaIndexingError'
+            with open(fasta_path, 'w') as fasta_out:
+                fasta_out.write(">seq1\nCTCCGGGCCCAT\nAACACTTGGGGGTAGCTAAAGTGAA\nATAAAGCCTAAA\n")
+
+            builtins_open = builtins.open
+
+            opened_files=[]
+            def test_open(*args, **kwargs):
+                f = builtins_open(*args, **kwargs)
+                opened_files.append(f)
+                return f
+
+            with mock.patch('six.moves.builtins.open', side_effect=test_open):
+                try:
+                    Faidx(fasta_path)
+                    self.fail("Faidx construction should fail with 'FastaIndexingError'.")
+                except FastaIndexingError:
+                    pass
+            self.assertTrue(all(f.closed for f in opened_files))
+        finally:
+            shutil.rmtree(tmp_dir)
+
+    def test_build_issue_96_fail_read_malformed_index_duplicate_key(self):
+        """ Ensure that the fasta file is closed if construction of the 'Faidx' object
+        fails when attempting to read a pre-existing index. The index is malformed because
+        it contains multiple occurrences of the same key.
+        See mdshw5/pyfaidx#96
+        """
+        tmp_dir = mkdtemp()
+        try:
+            fasta_path = os.path.join(tmp_dir, 'issue_96.fasta')
+            faidx_path = os.path.join(tmp_dir, 'issue_96.fasta.fai')
+            # Write simple fasta file
+            with open(fasta_path, 'w') as fasta_out:
+                fasta_out.write(">seq1\nCTCCGGGCCCAT\nATAAAGCCTAAA\n")
+            with open(faidx_path, 'w') as faidx_out:
+                faidx_out.write("seq1\t24\t6\t12\t13\nseq1\t24\t6\t12\t13\n")
+
+            builtins_open = builtins.open
+
+            opened_files=[]
+            def test_open(*args, **kwargs):
+                f = builtins_open(*args, **kwargs)
+                opened_files.append(f)
+                return f
+
+            with mock.patch('six.moves.builtins.open', side_effect=test_open):
+                try:
+                    Faidx(fasta_path)
+                    self.fail("Faidx construction should fail with 'ValueError'.")
+                except ValueError:
+                    pass
+            self.assertTrue(all(f.closed for f in opened_files))
+        finally:
+            shutil.rmtree(tmp_dir)
+
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..a08d4ce
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,27 @@
+[tox]
+envlist = init, py26, py27, py33, py34, py35, pypy, pypy3, final
+
+[testenv]
+deps = nose
+       mock; python_version < '3.3'
+       coverage
+       nose-cov
+       biopython
+       pysam; python_version > '2.6' and platform_python_implementation != 'PyPy'
+       pyvcf
+commands = nosetests --with-cov --cov-report term-missing --cov {envsitepackagesdir}/pyfaidx -P tests
+           coverage combine
+           bash -c 'mv {toxinidir}/.coverage {toxworkdir}/.coverage.{envname}'
+whitelist_externals = bash
+
+[testenv:init]
+basepython = python2.7
+commands = coverage erase
+           bash -c 'rm -rf {toxworkdir}/.coverage.*'
+           python tests/data/download_gene_fasta.py
+
+[testenv:final]
+basepython = python2.7
+commands = coverage combine {toxworkdir}
+           coverage report -m
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pyfaidx.git



More information about the debian-med-commit mailing list