[med-svn] [Git][med-team/python-pyfaidx][upstream] New upstream version 0.5.7

Steffen Möller gitlab at salsa.debian.org
Sat Jan 25 17:58:26 GMT 2020



Steffen Möller pushed to branch upstream at Debian Med / python-pyfaidx


Commits:
739f7379 by Steffen Moeller at 2020-01-01T19:58:11+01:00
New upstream version 0.5.7
- - - - -


7 changed files:

- .travis.yml
- README.rst
- pyfaidx/__init__.py
- pyfaidx/cli.py
- setup.py
- tests/test_FastaRecord.py
- tests/test_feature_bounds_check.py


Changes:

=====================================
.travis.yml
=====================================
@@ -7,9 +7,7 @@ python:
     - '3.6'
     - '3.5'
     - '3.4'
-    - '3.3'
     - '2.7'
-    - '2.6'
     - 'pypy'
     - 'pypy3'
 install:


=====================================
README.rst
=====================================
@@ -1,4 +1,4 @@
-|Travis| |PyPI| |Landscape| |Coverage| |Depsy| |Appveyor|
+|Travis| |PyPI| |Coverage| |Depsy|
 
 Description
 -----------
@@ -54,7 +54,8 @@ Acts like a dictionary.
 
 .. code:: python
 
-    >>> genes.keys() ('AB821309.1', 'KF435150.1', 'KF435149.1', 'NR_104216.1', 'NR_104215.1', 'NR_104212.1', 'NM_001282545.1', 'NM_001282543.1', 'NM_000465.3', 'NM_001282549.1', 'NM_001282548.1', 'XM_005249645.1', 'XM_005249644.1', 'XM_005249643.1', 'XM_005249642.1', 'XM_005265508.1', 'XM_005265507.1', 'XR_241081.1', 'XR_241080.1', 'XR_241079.1')
+    >>> genes.keys()
+    ('AB821309.1', 'KF435150.1', 'KF435149.1', 'NR_104216.1', 'NR_104215.1', 'NR_104212.1', 'NM_001282545.1', 'NM_001282543.1', 'NM_000465.3', 'NM_001282549.1', 'NM_001282548.1', 'XM_005249645.1', 'XM_005249644.1', 'XM_005249643.1', 'XM_005249642.1', 'XM_005265508.1', 'XM_005265507.1', 'XR_241081.1', 'XR_241080.1', 'XR_241079.1')
 
     >>> genes['NM_001282543.1'][200:230]
     >NM_001282543.1:201-230
@@ -544,7 +545,7 @@ Support for compressed FASTA
 ----------------------------
 
 ``pyfaidx`` can create and read ``.fai`` indices for FASTA files that have
-been compressed using the `bgzip <http://www.htslib.org/doc/tabix.html>`_
+been compressed using the `bgzip <https://www.htslib.org/doc/bgzip.html>`_
 tool from `samtools <http://www.htslib.org/>`_. ``bgzip`` writes compressed
 data in a ``BGZF`` format. ``BGZF`` is ``gzip`` compatible, consisting of
 multiple concatenated ``gzip`` blocks, each with an additional ``gzip``


=====================================
pyfaidx/__init__.py
=====================================
@@ -25,7 +25,7 @@ if sys.version_info > (3, ):
 
 dna_bases = re.compile(r'([ACTGNactgnYRWSKMDVHBXyrwskmdvhbx]+)')
 
-__version__ = '0.5.5.2'
+__version__ = '0.5.7'
 
 
 class KeyFunctionError(ValueError):
@@ -123,7 +123,7 @@ class Sequence(object):
         >chr1
         AC
         """
-        if self.start is None or self.end is None:
+        if self.start is None or self.end is None or len(self.seq) == 0:
             correction_factor = 0
         elif len(
                 self.seq
@@ -461,7 +461,7 @@ class Faidx(object):
                     rname, rlen, offset, lenc, lenb = line.split('\t')
                     rlen, offset, lenc, lenb = map(int,
                                                    (rlen, offset, lenc, lenb))
-                    newlines = int(ceil(rlen / lenc) * (lenb - lenc))
+                    newlines = int(ceil(rlen / lenc) * (lenb - lenc)) if lenc else 0
                     bend = offset + newlines + rlen
                     rec = IndexRecord(rlen, offset, lenc, lenb, bend,
                                       prev_bend)
@@ -508,8 +508,8 @@ class Faidx(object):
                     rname = None  # reference sequence name
                     offset = 0  # binary offset of end of current line
                     rlen = 0  # reference character length
-                    blen = None  # binary line length (includes newline)
-                    clen = None  # character line length
+                    blen = 0  # binary line length (includes newline)
+                    clen = 0  # character line length
                     bad_lines = []  # lines > || < than blen
                     thisoffset = offset
                     valid_entry = False
@@ -535,9 +535,9 @@ class Faidx(object):
                                     "Inconsistent line found in >{0} at "
                                     "line {1:n}.".format(
                                         rname, bad_lines[0][0] + 1))
-                            blen = None
+                            blen = 0
                             rlen = 0
-                            clen = None
+                            clen = 0
                             bad_lines = []
                             try:  # must catch empty deflines (actually these might be okay: https://github.com/samtools/htslib/pull/258)
                                 rname = line.rstrip('\n\r')[1:].split()[
@@ -648,8 +648,8 @@ class Faidx(object):
 
         # Calculate offset (https://github.com/samtools/htslib/blob/20238f354894775ed22156cdd077bc0d544fa933/faidx.c#L398)
         newlines_before = int(
-            (start0 - 1) / i.lenc * (i.lenb - i.lenc)) if start0 > 0 else 0
-        newlines_to_end = int(end / i.lenc * (i.lenb - i.lenc))
+            (start0 - 1) / i.lenc * (i.lenb - i.lenc)) if start0 > 0 and i.lenc else 0
+        newlines_to_end = int(end / i.lenc * (i.lenb - i.lenc)) if i.lenc else 0
         newlines_inside = newlines_to_end - newlines_before
         seq_blen = newlines_inside + seq_len
         bstart = i.offset + newlines_before + start0
@@ -669,12 +669,15 @@ class Faidx(object):
             else:
                 self.file.seek(bstart)
 
+                # If the requested sequence exceeds len(FastaRecord), return as much as possible
                 if bstart + seq_blen > i.bend and not self.strict_bounds:
                     seq_blen = i.bend - bstart
-
+                # Otherwise it should be safe to read the sequence
                 if seq_blen > 0:
                     seq = self.file.read(seq_blen).decode()
-                elif seq_blen <= 0 and not self.strict_bounds:
+                # If the requested sequence is negative, we will pad the empty string with default_seq.
+                # This was changed to support #155 with strict_bounds=True.
+                elif seq_blen <= 0:
                     seq = ''
 
         if not internals:
@@ -994,13 +997,9 @@ class Fasta(object):
             sequence_always_upper=sequence_always_upper,
             rebuild=rebuild,
             build_index=build_index)
-        self.keys = self.faidx.index.keys
-        if not self.mutable:
-            self.records = dict(
-                [(rname, FastaRecord(rname, self)) for rname in self.keys()])
-        elif self.mutable:
-            self.records = dict([(rname, MutableFastaRecord(rname, self))
-                                 for rname in self.keys()])
+        
+        _record_constructor = MutableFastaRecord if self.mutable else FastaRecord
+        self.records = OrderedDict([(rname, _record_constructor(rname, self)) for rname in self.faidx.index.keys()])
 
     def __contains__(self, rname):
         """Return True if genome contains record."""
@@ -1057,6 +1056,15 @@ class Fasta(object):
         # len(Sequence.seq) != end - start
         return Sequence(name=name, seq=seq, start=None, end=None)
 
+    def keys(self):
+        return self.records.keys()
+
+    def values(self):
+        return self.records.values()
+
+    def items(self):
+        return self.records.items()
+
     def close(self):
         self.__exit__()
 
@@ -1258,6 +1266,21 @@ def check_bad_lines(rname, bad_lines, i):
     raise RuntimeError("Unhandled exception during fasta indexing at entry " + rname + \
                        "Please report this issue at https://github.com/mdshw5/pyfaidx/issues " + \
                        str(bad_lines))
+    
+def get_valid_filename(s):
+    """
+    From https://github.com/django/django/blob/efc3e32d6d7fb9bb41be73b80c8607b653c1fbd6/django/utils/text.py#L222-L232
+    Return the given string converted to a string that can be used for a clean
+    filename. Remove leading and trailing spaces; convert other spaces to
+    underscores; and remove anything that is not an alphanumeric, dash,
+    underscore, or dot.
+    >>> get_valid_filename("HPV16_144-1.fa")
+    'HPV16_144-1.fa'
+    >>> get_valid_filename("chromosome 6.fa")
+    'chromosome_6.fa'
+    """
+    s = str(s).strip().replace(' ', '_')
+    return re.sub(r'(?u)[^-\w.]', '', s)
 
 
 if __name__ == "__main__":


=====================================
pyfaidx/cli.py
=====================================
@@ -3,11 +3,9 @@ import argparse
 import sys
 import os.path
 import re
-from pyfaidx import Fasta, wrap_sequence, FetchError, ucsc_split, bed_split
+from pyfaidx import Fasta, wrap_sequence, FetchError, ucsc_split, bed_split, get_valid_filename
 from collections import defaultdict
 
-keepcharacters = (' ', '.', '_')
-
 def write_sequence(args):
     _, ext = os.path.splitext(args.fasta)
     if ext:
@@ -36,7 +34,7 @@ def write_sequence(args):
                 continue
         if args.split_files:  # open output file based on sequence name
             filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
-            filename = ''.join(c for c in filename if c.isalnum() or c in keepcharacters)
+            filename = get_valid_filename(filename)
             outfile = open(filename, 'w')
         elif args.out:
             outfile = args.out


=====================================
setup.py
=====================================
@@ -36,12 +36,11 @@ setup(
             "Intended Audience :: Science/Research",
             "Natural Language :: English",
             "Operating System :: Unix",
+            "Programming Language :: Python :: 3.7",
+            "Programming Language :: Python :: 3.6",
             "Programming Language :: Python :: 3.5",
             "Programming Language :: Python :: 3.4",
-            "Programming Language :: Python :: 3.3",
-            "Programming Language :: Python :: 3.2",
             "Programming Language :: Python :: 2.7",
-            "Programming Language :: Python :: 2.6",
             "Programming Language :: Python :: Implementation :: PyPy",
             "Topic :: Scientific/Engineering :: Bio-Informatics"
     ]


=====================================
tests/test_FastaRecord.py
=====================================
@@ -46,6 +46,7 @@ class TestFastaRecord(TestCase):
         long_names = []
         for record in fasta:
             long_names.append(record.long_name)
+        print(tuple(zip(deflines, long_names)))
         assert deflines == long_names
 
     def test_issue_62(self):


=====================================
tests/test_feature_bounds_check.py
=====================================
@@ -6,6 +6,53 @@ from unittest import TestCase
 path = os.path.dirname(__file__)
 os.chdir(path)
 
+class TestFeatureZeroLength:
+    """Tests for handling zero-length entries, added in #155"""
+    def setUp(self):
+        with open('data/zero_length.fasta', 'w') as fasta:
+            fasta.write(""">A
+ATCG
+>B
+>C
+
+>D
+GTA
+GC""")
+
+    def tearDown(self):
+        os.remove('data/zero_length.fasta')
+        os.remove('data/zero_length.fasta.fai')
+              
+    def test_index_zero_length(self):
+        fasta = Fasta('data/zero_length.fasta')
+        
+    def test_fetch_zero_length(self):
+        fasta = Fasta('data/zero_length.fasta')
+        b = fasta["B"]
+        assert str(b) == ''
+        
+class TestZeroLengthSequenceSubRange(TestCase):
+    def setUp(self):
+        pass
+
+    def tearDown(self):
+        try:
+            os.remove('data/genes.fasta.fai')
+        except EnvironmentError:
+            pass  # some tests may delete this file
+        
+    def test_as_raw_zero_length_subsequence(self):
+        fasta = Fasta('data/genes.fasta', as_raw=True, strict_bounds=True)
+        expect = ''
+        result = fasta['gi|557361099|gb|KF435150.1|'][100:100]
+        assert result == expect
+
+    def test_zero_length_subsequence(self):
+        fasta = Fasta('data/genes.fasta', strict_bounds=True)
+        expect = ''
+        result = fasta['gi|557361099|gb|KF435150.1|'][100:100]
+        assert result.seq == expect
+
 class TestFeatureBoundsCheck:
     def setUp(self):
         pass



View it on GitLab: https://salsa.debian.org/med-team/python-pyfaidx/commit/739f737969497e06c16c17c0fc67c416e3524fdb

-- 
View it on GitLab: https://salsa.debian.org/med-team/python-pyfaidx/commit/739f737969497e06c16c17c0fc67c416e3524fdb
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200125/b2359fec/attachment-0001.html>


More information about the debian-med-commit mailing list