[med-svn] [Git][med-team/python-bx][upstream] New upstream version 0.8.2

Steffen Möller gitlab at salsa.debian.org
Wed Oct 17 11:33:59 BST 2018


Steffen Möller pushed to branch upstream at Debian Med / python-bx


Commits:
9974c2d8 by Steffen Moeller at 2018-10-17T10:16:22Z
New upstream version 0.8.2
- - - - -


5 changed files:

- .travis.yml
- README.md
- lib/bx/intervals/io.py
- scripts/bnMapper.py
- setup.py


Changes:

=====================================
.travis.yml
=====================================
@@ -20,6 +20,7 @@ before_install:
 install: 
     - conda create --yes -n testenv python=$TRAVIS_PYTHON_VERSION Cython numpy nose
     - source activate testenv
+    - pip install python-lzo
     - python setup.py build_ext --inplace
     - python setup.py install
 script: 


=====================================
README.md
=====================================
@@ -1,5 +1,7 @@
 [![Build Status](https://travis-ci.org/bxlab/bx-python.svg?branch=master)](https://travis-ci.org/bxlab/bx-python)
 
+[![Read the Docs](https://img.shields.io/readthedocs/bx-python.svg)](https://bx-python.readthedocs.io/)
+
 # bx-python
 
 The bx-python project is a python library and associated set of scripts to allow for rapid implementation of genome scale analyses. The library contains a variety of useful modules, but the particular strengths are:
@@ -10,6 +12,10 @@ The bx-python project is a python library and associated set of scripts to allow
     * "Binned bitsets" which act just like chromosome sized bit arrays, but lazily allocate regions and allow large blocks of all set or all unset bits to be stored compactly
     * "Intersecter" for performing fast intersection tests that preserve both query and target intervals and associated annotation
 
+## Requirements
+
+Build currently requires liblzo (e.g. `sudo apt-get install liblzo2-dev` on Debian/Ubuntu).
+
 ## Installing
 
 The package can be installed with pip:


=====================================
lib/bx/intervals/io.py
=====================================
@@ -99,7 +99,9 @@ class GenomicIntervalReader( TableReader ):
     ...               "chr2\\tbar\\t20\\t300\\txxx",
     ...               "#I am a comment",
     ...               "chr2\\tbar\\t20\\t300\\txxx" ], start_col=2, end_col=3 )
+    >>> header = next(r)
     >>> elements = list( r )
+    >>> elements.insert(0, header)
     >>> assert type( elements[0] ) is Header
     >>> str( elements[0] )
     '#chrom\\tname\\tstart\\tend\\textra'
@@ -174,6 +176,15 @@ class GenomicIntervalReader( TableReader ):
         return bitsets
 
 class NiceReaderWrapper( GenomicIntervalReader ):
+    """
+    >>> r = NiceReaderWrapper( [ "#chrom\\tname\\tstart\\tend\\textra",
+    ...                          "chr1\\tfoo\\t1\\t100\\txxx",
+    ...                          "chr2\\tbar\\t20\\t300\\txxx",
+    ...                          "#I am a comment",
+    ...                          "chr2\\tbar\\t20\\t300\\txxx" ], start_col=2, end_col=3 )
+    >>> assert type(next(r)) == Header
+    """
+
     def __init__( self, reader, **kwargs ):
         GenomicIntervalReader.__init__( self, reader, **kwargs )
         self.outstream = kwargs.get("outstream", None)
@@ -187,7 +198,7 @@ class NiceReaderWrapper( GenomicIntervalReader ):
     def __next__( self ):
         while 1:
             try:
-                nextitem = GenomicIntervalReader.next( self )
+                nextitem = super(NiceReaderWrapper, self ).__next__()
                 return nextitem
             except ParseError as e:
                 if self.outstream:


=====================================
scripts/bnMapper.py
=====================================
@@ -21,6 +21,9 @@ from bx.cookbook import argparse
 from bx.intervals.intersection import IntervalTree, Interval
 
 elem_t = np.dtype([('chrom', np.str_, 30), ('start', np.int64), ('end', np.int64), ('id', np.str_, 100)])
+narrowPeak_t = np.dtype([('chrom', np.str_, 30), ('start', np.int64), ('end', np.int64), ('id', np.str_, 100),
+                         ('score', np.int64), ('strand', np.str_, 1), ('signalValue', np.float),
+                         ('pValue', np.float), ('qValue', np.float), ('peak', np.int64)])
 LOG_LEVELS = {"info" : logging.INFO, "debug" : logging.DEBUG, "silent" : logging.ERROR}
 
 logging.basicConfig()
@@ -116,18 +119,43 @@ def union_elements(elements):
 def transform_by_chrom(all_epo, from_elem_list, tree, chrom, opt, out_fd):
     BED4_FRM = "%s\t%d\t%d\t%s\n"
     BED12_FRM = "%s\t%d\t%d\t%s\t1000\t+\t%d\t%d\t0,0,0\t%d\t%s\t%s\n"
+    NPEAK_FRM = "%s\t%d\t%d\t%s\t%d\t%s\t%f\t%f\t%f\t%d\n"
     assert len( set(from_elem_list['chrom']) ) <= 1
 
     mapped_elem_count = 0
+    mapped_summit_count = 0
     for from_elem in from_elem_list:
         matching_block_ids = [attrgetter("value")(_) for _ in tree.find(chrom, from_elem['start'], from_elem['end'])]
 
         # do the actual mapping
         to_elem_slices = [_ for _ in (transform(from_elem, all_epo[i], opt.gap) for i in matching_block_ids) if _]
+        """ # Original version: silently discard split alignments
         if len(to_elem_slices) > 1 or len(to_elem_slices) == 0:
             log.debug("%s no match or in different chain/chromosomes" % (str(from_elem)))
             continue
         to_elem_slices = to_elem_slices[0]
+        """
+        """ Modified version below allows liftOver-like behavior of
+        keeping the longest alignment when alignments are split across
+        multiple chains. Added by Adam Diehl (adadiehl at umich.edu)
+        """
+        max_elem_idx = 0
+        if len(to_elem_slices) == 0:
+            log.debug("%s: no match in target: discarding." % (str(from_elem)))
+            continue
+        elif len(to_elem_slices) > 1 and opt.keep_split:
+            log.debug("%s spans multiple chains/chromosomes. Using longest alignment." % (str(from_elem)))
+            max_elem_len = 0
+            for i in xrange(len(to_elem_slices)):
+                elem_len = to_elem_slices[i][-1][2] - to_elem_slices[i][0][2]
+                if elem_len > max_elem_len:
+                    max_elem_len = elem_len
+                    max_elem_idx = i
+        elif len(to_elem_slices) > 1:
+            log.debug("%s spans multiple chains/chromosomes: discarding." % (str(from_elem)))
+            continue
+        to_elem_slices = to_elem_slices[max_elem_idx]
+        """ End AGD modifications """
 
         # apply threshold
         if (from_elem[2] - from_elem[1]) * opt.threshold > reduce(lambda b,a: a[2]-a[1] + b, to_elem_slices, 0):
@@ -139,19 +167,45 @@ def transform_by_chrom(all_epo, from_elem_list, tree, chrom, opt, out_fd):
         if to_elem_list:
             mapped_elem_count += 1
             log.debug("\tjoined to %d elements" % (len(to_elem_list)))
+            start = to_elem_list[0][1]
+            end = to_elem_list[-1][2]
             if opt.format == "BED4":
                 for tel in to_elem_list:
                     out_fd.write(BED4_FRM % tel)
-            else:
-                start = to_elem_list[0][1]
-                end = to_elem_list[-1][2]
+            elif opt.format == "BED12":
                 out_fd.write(BED12_FRM % (to_elem_list[0][0], start, end, from_elem['id'],
                         start, end, len(to_elem_list),
                         ",".join( "%d" % (e[2]-e[1]) for e in to_elem_list ),
                         ",".join( "%d" % (e[1]-start) for e in to_elem_list ) )
                         )
-    log.info("%s %d of %d elements mapped" % (chrom, mapped_elem_count, from_elem_list.shape[0]))
-
+            else:
+                # narrowPeak convention is to report the peak location relative to start
+                peak = int((start + end)/2) - start
+                if opt.in_format == "narrowPeak":
+                    # Map the peak location
+                    #sys.stderr.write("{}\n".format(from_elem))
+                    matching_block_ids = [attrgetter("value")(_) for _ in tree.find(chrom, from_elem['peak'], from_elem['peak'])]
+                    p_elem_slices = [_ for _ in (transform( np.array((chrom, from_elem['peak'], from_elem['peak'], '.'), dtype=elem_t), all_epo[i], opt.gap) for i in matching_block_ids) if _]
+                    if len(p_elem_slices) >= 1:
+                        mapped_summit_count += 1
+                        sys.stderr.write("{}\n".format(p_elem_slices))
+                        # Make sure the peak is between the start and end positions
+                        if p_elem_slices[0][0][1] >= start and p_elem_slices[0][0][1] <= end:
+                            peak = p_elem_slices[0][0][1] - start
+                        else:
+                            mapped_summit_count -= 1
+                            log.debug("Warning: elem {} summit mapped location falls outside the mapped element start and end. Using the mapped elem midpoint instead.".format(from_elem))
+                        
+                    else:
+                        log.debug("Warning: elem {} summit maps to a gap region in the target alignment. Using the mapped elem midpoint instead.".format(from_elem))
+                out_fd.write(NPEAK_FRM % (to_elem_list[0][0], start, end, from_elem['id'],
+                                          from_elem['score'], from_elem['strand'], from_elem['signalValue'],
+                                          from_elem['pValue'], from_elem['qValue'], peak))
+    log.info("%s: %d of %d elements mapped" % (chrom, mapped_elem_count, from_elem_list.shape[0]))
+    if opt.format == "narrowPeak" and opt.in_format == "narrowPeak":
+        log.info("%s: %d peak summits from %d mapped elements mapped" % (chrom, mapped_summit_count, mapped_elem_count))
+
+    
 def transform_file(ELEMS, ofname, EPO, TREE, opt):
     "transform/map the elements of this file and dump the output on 'ofname'"
 
@@ -193,16 +247,29 @@ def loadChains(path):
     assert all( t[0].tStrand == '+' for t in EPO ), "all target strands should be +"
     return EPO
 
-def loadFeatures(path):
-    "load BED4 features (all other columns are ignored)"
+def loadFeatures(path, opt):
+    """
+    Load features. For BED, only BED4 columns are loaded.
+    For narrowPeak, all columns are loaded.
+    """
 
     log.info("loading from %s ..." % path)
     data = []
-    with open(path) as fd:
-        for line in fd:
-            cols = line.split()
-            data.append( (cols[0], int(cols[1]), int(cols[2]), cols[3]) )
-    return np.array(data, dtype=elem_t)
+    if opt.in_format == "BED":        
+        with open(path) as fd:
+            for line in fd:
+                cols = line.split()
+                data.append( (cols[0], int(cols[1]), int(cols[2]), cols[3]) )
+        data = np.array(data, dtype=elem_t)
+    else:
+        with open(path) as fd:
+            for line in fd:
+                cols = line.split()
+                data.append( (cols[0], int(cols[1]), int(cols[2]), cols[3], int(cols[4]),
+                              cols[5], float(cols[6]), float(cols[7]), float(cols[8]),
+                              int(cols[-1])+int(cols[1])) )
+        data = np.array(data, dtype=narrowPeak_t)
+    return data
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__, epilog="Olgert Denas (Taylor Lab)",
@@ -212,8 +279,8 @@ if __name__ == "__main__":
             help="Input to process. If more than a file is specified, all files will be mapped and placed on --output, which should be a directory.")
     parser.add_argument("alignment", help="Alignment file (.chain or .pkl)")
 
-    parser.add_argument("-f", '--format', choices=("BED4", "BED12"), default="BED4",
-            help="Output format.")
+    parser.add_argument("-f", '--format', choices=("BED4", "BED12", "narrowPeak"), default="BED4",
+                        help="Output format. BED4 output reports all aligned blocks as separate BED records. BED12 reports a single BED record for each mapped element, with individual blocks given in the BED12 fields. NarrowPeak reports a single narrowPeak record for each mapped element, in which the chromosome, start, end, and peak positions are mapped to the target species and all other columns are passed through unchanged.")
     parser.add_argument("-o", '--output', metavar="FILE", default='stdout',
             type=lambda s: ((s in ('stdout', '-') and "/dev/stdout") or s),
             help="Output file. Mandatory if more than on file in input.")
@@ -225,7 +292,10 @@ if __name__ == "__main__":
             help="Ignore elements with an insertion/deletion of this or bigger size.")
     parser.add_argument('-v', '--verbose', type=str, choices=list(LOG_LEVELS.keys()), default='info',
             help='Verbosity level')
-
+    parser.add_argument("-k", '--keep_split', default=False, action='store_true',
+            help="If elements span multiple chains, report the segment with the longest overlap instead of silently dropping them. (This is the default behavior for liftOver.)")
+    parser.add_argument("-i", "--in_format", choices=["BED", "narrowPeak"], default="BED",
+            help="Input file format.")
 
     opt = parser.parse_args()
     log.setLevel(LOG_LEVELS[opt.verbose])
@@ -257,4 +327,4 @@ if __name__ == "__main__":
                 log.warning("overwriting %s ..." % outpath)
             transform_file(loadFeatures(inpath), outpath, EPO, TREE, opt)
     else:
-        transform_file(loadFeatures( opt.input[0] ), opt.output, EPO, TREE, opt)
+        transform_file(loadFeatures( opt.input[0], opt ), opt.output, EPO, TREE, opt)


=====================================
setup.py
=====================================
@@ -17,22 +17,21 @@ from glob import glob
 
 def main():
 
-    build_requires = [ 'python-lzo', 'numpy' ]
-
     metadata = \
       dict( name = "bx-python",
-            version = "0.8.1",
-            install_requires=build_requires + ['six'],
+            version = "0.8.2",
+            setup_requires=['numpy', 'cython'],
+            install_requires=['numpy', 'six'],
             py_modules = [ 'psyco_full' ],
             package_dir = { '': 'lib' },
             package_data = { '': ['*.ps'] },
             scripts = glob( "scripts/*.py" ),
             test_suite = 'nose.collector',
-            tests_require = ['nose'],
+            tests_require = ['nose','python-lzo'],
             author = "James Taylor, Bob Harris, David King, Brent Pedersen, Kanwei Li, and others",
             author_email = "james at jamestaylor.org",
             description = "Tools for manipulating biological data, particularly multiple sequence alignments",
-            url = "http://bitbucket.org/james_taylor/bx-python/wiki/Home",
+            url = "https://github.com/bxlab/bx-python",
             license = "MIT",
             classifiers = [
                 "Development Status :: 5 - Production/Stable",
@@ -57,6 +56,8 @@ def main():
     numpy = None
     try:
         import numpy
+        # Suppress numpy tests
+        numpy.test = None
     except:
         pass
     
@@ -81,10 +82,8 @@ def main():
 
 from distutils.core import Command
 
-# Use build_ext from Cython
-command_classes = {}
-
 # Use build_ext from Cython if found
+command_classes = {}
 try:
     import Cython.Distutils
     command_classes['build_ext'] = Cython.Distutils.build_ext
@@ -220,47 +219,5 @@ def get_extension_modules( numpy_include=None ):
                                       include_dirs=[ "src/bunzip" ] ) )   
     return extensions     
      
-# ---- Monkey patches -------------------------------------------------------
-
-def monkey_patch_doctest():
-    #
-    # Doctest and coverage don't get along, so we need to create
-    # a monkeypatch that will replace the part of doctest that
-    # interferes with coverage reports.
-    #
-    # The monkeypatch is based on this zope patch:
-    # http://svn.zope.org/Zope3/trunk/src/zope/testing/doctest.py?rev=28679&r1=28703&r2=28705
-    #
-    try:
-        import doctest
-        _orp = doctest._OutputRedirectingPdb
-        class NoseOutputRedirectingPdb(_orp):
-            def __init__(self, out):
-                self.__debugger_used = False
-                _orp.__init__(self, out)
-
-            def set_trace(self):
-                self.__debugger_used = True
-                _orp.set_trace(self)
-
-            def set_continue(self):
-                # Calling set_continue unconditionally would break unit test coverage
-                # reporting, as Bdb.set_continue calls sys.settrace(None).
-                if self.__debugger_used:
-                    _orp.set_continue(self)
-        doctest._OutputRedirectingPdb = NoseOutputRedirectingPdb
-    except:
-        pass
-
-def monkey_patch_numpy():
-    # Numpy pushes its tests into every importers namespace, yeccch.
-    try:
-        import numpy
-        numpy.test = None
-    except:
-        pass
-        
 if __name__ == "__main__":
-    monkey_patch_doctest()
-    monkey_patch_numpy()
     main()



View it on GitLab: https://salsa.debian.org/med-team/python-bx/commit/9974c2d898e4358167e9fa956f30799e8ec7d16c

-- 
View it on GitLab: https://salsa.debian.org/med-team/python-bx/commit/9974c2d898e4358167e9fa956f30799e8ec7d16c
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20181017/f67c4ff7/attachment-0001.html>


More information about the debian-med-commit mailing list