[med-svn] [Git][med-team/pyensembl][upstream] New upstream version 2.2.4+ds
Andreas Tille (@tille)
gitlab@salsa.debian.org
Sun Dec 25 21:53:18 GMT 2022
Andreas Tille pushed to branch upstream at Debian Med / pyensembl
Commits:
9a3eea57 by Andreas Tille at 2022-12-25T22:46:11+01:00
New upstream version 2.2.4+ds
- - - - -
27 changed files:
- PKG-INFO
- pyensembl.egg-info/PKG-INFO
- pyensembl/__init__.py
- pyensembl/common.py
- pyensembl/database.py
- pyensembl/ensembl_url_templates.py
- pyensembl/genome.py
- − pyensembl/memory_cache.py
- pyensembl/sequence_data.py
- pyensembl/species.py
- pyensembl/version.py
- + requirements.txt
- setup.py
- test/test_download_cache.py
- test/test_gene_objects.py
- test/test_id_length.py
- test/test_locus.py
- − test/test_memory_cache.py
- test/test_missing_genome_sources.py
- test/test_mouse.py
- test/test_release_versions.py
- test/test_search.py
- test/test_serialization.py
- + test/test_shell.py
- test/test_timings.py
- test/test_transcript_objects.py
- test/test_ucsc_gtf.py
Changes:
=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: pyensembl
-Version: 2.1.0
+Version: 2.2.4
Summary: Python interface to ensembl reference genome metadata
Home-page: https://github.com/openvax/pyensembl
Author: Alex Rubinsteyn
=====================================
pyensembl.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: pyensembl
-Version: 2.1.0
+Version: 2.2.4
Summary: Python interface to ensembl reference genome metadata
Home-page: https://github.com/openvax/pyensembl
Author: Alex Rubinsteyn
=====================================
pyensembl/__init__.py
=====================================
@@ -10,7 +10,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from .memory_cache import MemoryCache
from .database import Database
from .download_cache import DownloadCache
from .ensembl_release import EnsemblRelease, cached_release
@@ -41,7 +40,6 @@ from .version import __version__
__all__ = [
"__version__",
- "MemoryCache",
"DownloadCache",
"Database",
"EnsemblRelease",
=====================================
pyensembl/common.py
=====================================
@@ -14,7 +14,6 @@ import pickle
from functools import wraps
-
def dump_pickle(obj, filepath):
with open(filepath, "wb") as f:
# use lower protocol for compatibility between Python 2 and Python 3
=====================================
pyensembl/database.py
=====================================
@@ -16,7 +16,6 @@ import sqlite3
import datacache
from typechecks import require_integer, require_string
-
from gtfparse import read_gtf, create_missing_features
from .common import memoize
=====================================
pyensembl/ensembl_url_templates.py
=====================================
@@ -16,14 +16,14 @@ on the Ensembl ftp server.
For example, the human chromosomal DNA sequences for release 78 are in:
- ftp://ftp.ensembl.org/pub/release-78/fasta/homo_sapiens/dna/
+ https://ftp.ensembl.org/pub/release-78/fasta/homo_sapiens/dna/
"""
from .species import Species, find_species_by_name
from .ensembl_release_versions import check_release_number
-ENSEMBL_FTP_SERVER = "ftp://ftp.ensembl.org"
+ENSEMBL_FTP_SERVER = "https://ftp.ensembl.org"
# Example directories
# FASTA files: /pub/release-78/fasta/homo_sapiens/
=====================================
pyensembl/genome.py
=====================================
@@ -21,7 +21,6 @@ from os.path import exists, getsize
from serializable import Serializable
-from .memory_cache import MemoryCache
from .download_cache import DownloadCache
from .database import Database
from .exon import Exon
@@ -108,7 +107,6 @@ class Genome(Serializable):
copy_local_files_to_cache=self.copy_local_files_to_cache,
install_string_function=self.install_string,
cache_directory_path=cache_directory_path)
- self.memory_cache = MemoryCache()
self._init_lazy_fields()
@property
@@ -435,8 +433,7 @@ class Genome(Serializable):
def clear_cache(self):
"""
- Clear any in-memory cached values and short-lived on-disk
- materializations from MemoryCache
+ Clear any in-memory cached values
"""
for maybe_fn in self.__dict__.values():
# clear cache associated with all memoization decorators,
=====================================
pyensembl/memory_cache.py deleted
=====================================
@@ -1,136 +0,0 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Cache and serializing the results of expensive computations. Used in pyensembl
-primarily to cache the heavy-weight parsing of GTF files and various
-filtering operations on Ensembl entries.
-
-A piece of data is returned from one of three sources:
-1) Cache cold. Run the user-supplied compute_fn.
-2) Cache warm on disk. Parse or unpickle the serialized result into memory.
-3) Cache warm in memory. Return cached object.
-"""
-
-import logging
-from os import remove, stat
-from os.path import exists
-
-import pandas as pd
-
-from .common import load_pickle, dump_pickle
-
-
-logger = logging.getLogger(__name__)
-
-
-class MemoryCache(object):
- """
- In-memory and on-disk caching of long-running queries and computations.
- """
- def __init__(self):
- self._memory_cache = {}
-
- def is_empty(self, filename):
- return stat(filename).st_size == 0
-
- def delete_file(self, path):
- if exists(path):
- logger.info("Deleting cached file %s", path)
- remove(path)
-
- def remove_from_cache(self, key):
- if key in self._memory_cache:
- del self._memory_cache[key]
- self.delete_file(key)
-
- def clear_cached_objects(self):
- for key in self._memory_cache.keys():
- self.delete_file(key)
- self._memory_cache.clear()
-
- def _read_csv(self, csv_path):
- logger.info("Reading Dataframe from %s", csv_path)
- df = pd.read_csv(csv_path)
- if 'seqname' in df:
- # by default, Pandas will infer the type as int,
- # then switch to str when it hits non-numerical
- # chromosomes. Make sure whole column has the same type
- df['seqname'] = df['seqname'].map(str)
- return df
-
- def _write_csv(self, df, csv_path, chunksize=10**5):
- """
- Parameters
- ----------
- df : pandas.DataFrame
-
- csv_path : str
-
- chunksize : int
- Number of rows to write at a time. Helps to limit memory
- consumption while writing a CSV.
- """
- logger.info("Saving DataFrame to %s", csv_path)
- df.to_csv(csv_path, index=False, chunksize=chunksize)
-
- def cached_dataframe(self, csv_path, compute_fn):
- """
- If a CSV path is in the _memory_cache, then return that cached value.
-
- If we've already saved the DataFrame as a CSV then load it.
-
- Otherwise run the provided `compute_fn`, and store its result
- in memory and and save it as a CSV.
- """
- if not csv_path.endswith(".csv"):
- raise ValueError("Invalid path '%s', must be a CSV file" % csv_path)
-
- if csv_path in self._memory_cache:
- return self._memory_cache[csv_path]
-
- if exists(csv_path) and not self.is_empty(csv_path):
- df = self._read_csv(csv_path)
- else:
- df = compute_fn()
- if not isinstance(df, pd.DataFrame):
- raise TypeError(
- "Expected compute_fn to return DataFrame, got %s : %s" % (
- df, type(df)))
- self._write_csv(df, csv_path)
- self._memory_cache[csv_path] = df
- return df
-
- def cached_object(self, path, compute_fn):
- """
- If `cached_object` has already been called for a value of `path` in this
- running Python instance, then it should have a cached value in the
- _memory_cache; return that value.
-
- If this function was never called before with a particular value of
- `path`, then call compute_fn, and pickle it to `path`.
-
- If `path` already exists, unpickle it and store that value in
- _memory_cache.
- """
- if path in self._memory_cache:
- return self._memory_cache[path]
-
- if exists(path) and not self.is_empty(path):
- obj = load_pickle(path)
- else:
- obj = compute_fn()
- dump_pickle(obj, path)
- self._memory_cache[path] = obj
- return obj
=====================================
pyensembl/sequence_data.py
=====================================
@@ -14,10 +14,7 @@ from os import remove
from os.path import exists, abspath, split, join
import logging
from collections import Counter
-
-from six.moves import cPickle as pickle
-from six import string_types
-
+import pickle
from .common import (load_pickle, dump_pickle)
from .fasta import parse_fasta_dictionary
@@ -34,7 +31,7 @@ class SequenceData(object):
fasta_paths,
cache_directory_path=None):
- if isinstance(fasta_paths, string_types):
+ if type(fasta_paths) is str:
fasta_paths = [fasta_paths]
self.fasta_paths = [abspath(path) for path in fasta_paths]
=====================================
pyensembl/species.py
=====================================
@@ -187,7 +187,12 @@ dog = Species.register(
cat = Species.register(
latin_name="felis_catus",
synonyms=["cat"],
- reference_assemblies={"Felis_catus_6.2": (75, MAX_ENSEMBL_RELEASE)})
+ reference_assemblies={
+ "Felis_catus_6.2": (75, 90),
+ "Felis_catus_8.0": (91, 92),
+ "Felis_catus_9.0": (93, MAX_ENSEMBL_RELEASE),
+
+ })
chicken = Species.register(
latin_name="gallus_gallus",
=====================================
pyensembl/version.py
=====================================
@@ -1 +1 @@
-__version__ = '2.1.0'
+__version__ = '2.2.4'
=====================================
requirements.txt
=====================================
@@ -0,0 +1,8 @@
+typechecks>=0.0.2
+datacache>=1.1.4
+memoized-property>=1.0.2
+tinytimer
+gtfparse>=1.3.0,<2.0.0
+serializable
+nose>=1.3.3
+pylint>=1.4.4
=====================================
setup.py
=====================================
@@ -41,6 +41,9 @@ if not version:
raise RuntimeError('Cannot find version information')
if __name__ == '__main__':
+ with open("requirements.txt") as f:
+ requirements = [l.strip() for l in f]
+
setup(
name=package_name,
version=version,
@@ -63,17 +66,11 @@ if __name__ == '__main__':
'Programming Language :: Python',
'Topic :: Scientific/Engineering :: Bio-Informatics',
],
- install_requires=[
- "typechecks>=0.0.2",
- "pandas>=0.15",
- "datacache>=1.1.4",
- "memoized-property>=1.0.2",
- "gtfparse>=1.1.0",
- "serializable",
- "tinytimer",
- ],
+ install_requires=requirements,
long_description=readme_markdown,
long_description_content_type='text/markdown',
packages=[package_name],
- package_data={package_name: ['logging.conf']},
+ package_data={
+ package_name: ['logging.conf', '../requirements.txt'],
+ },
)
=====================================
test/test_download_cache.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from nose.tools import assert_raises, ok_
from pyensembl.download_cache import (
DownloadCache,
=====================================
test/test_gene_objects.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from nose.tools import eq_
from .common import test_ensembl_releases
=====================================
test/test_id_length.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from .common import major_releases
from nose.tools import nottest
=====================================
test/test_locus.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from pyensembl.locus import Locus
from pyensembl.normalization import normalize_chromosome
=====================================
test/test_memory_cache.py deleted
=====================================
@@ -1,94 +0,0 @@
-from __future__ import absolute_import
-
-import tempfile
-
-from pyensembl import MemoryCache
-
-import pandas as pd
-from nose.tools import raises
-
-memory_cache = MemoryCache()
-
-class Counter(object):
- """
- Use this class to count how many times a function gets called by
- cached_object and cached_dataframe.
- """
- def __init__(self):
- self.count = 0
-
- def increment(self):
- self.count += 1
- return self.count
-
- def increment_dataframe(self):
- value = self.increment()
- return pd.DataFrame({'x': [value]})
-
-def test_cached_object_with_tempfile():
- """
- test_cached_object_with_tempfile : A temporary file exists before
- calling into compute_cache.cached_object but is empty, should be treated
- as if result has never been computed before (rather than trying to load
- the empty file).
- """
- counter = Counter()
- with tempfile.NamedTemporaryFile() as f:
- # call repeatedly to test the hot and cold cache logic
- result = memory_cache.cached_object(
- f.name, compute_fn=counter.increment)
- assert result == 1, "Expected result=1, got %s" % (result,)
- assert counter.count == 1, \
- "Expected compute_fn to be called once, got %s" % (counter.count,)
-
-
-def test_cached_dataframe_with_tempfile():
- """
- test_cached_dataframe_with_tempfile : A temporary file exists before
- calling into compute_cache.cached_dataframe but is empty,
- should be treated as if result has never been computed before
- (rather than trying to load the empty file).
- """
- counter = Counter()
- with tempfile.NamedTemporaryFile(suffix='.csv') as f:
- # call repeatedly to test hot and cold cache logic
- for _ in range(2):
- df = memory_cache.cached_dataframe(
- f.name, compute_fn=counter.increment_dataframe)
- # get counter value from inside of dataframe
- result = df['x'].iloc[0]
- assert result == 1, \
- "Expected result=1, got %s" % (result,)
- assert counter.count == 1, \
- "Expected compute_fn to be called once, got %s" % (
- counter.count,)
-
-def test_cached_dataframe_returns_correct_type():
- def make_a_dataframe():
- return pd.DataFrame({'x': [0, 1, 2]})
- with tempfile.NamedTemporaryFile(suffix='.csv') as f:
- # call repeatedly to test the cold and hot cache logic
- for _ in range(2):
- df = memory_cache.cached_dataframe(
- f.name, compute_fn=make_a_dataframe)
- assert isinstance(df, pd.DataFrame), \
- "Expected DataFrame, got %s : %s" % (df, type(df))
-
-def test_cached_object_with_list_returns_correct_type():
- def make_a_list():
- return [1, 2, 3]
- with tempfile.NamedTemporaryFile() as f:
- # call repeatedly to test the cold and hot cache logic
- for _ in range(2):
- df = memory_cache.cached_object(
- f.name, compute_fn=make_a_list)
- assert isinstance(df, list), \
- "Expected list, got %s : %s" % (df, type(df))
-
-@raises(Exception)
-def test_dataframe_path_must_be_csv():
- # compute_cache should raise an exception when filename doesn't
- # end with .csv extension
- memory_cache.cached_dataframe(
- csv_path="tempfile_not_csv",
- compute_fn=lambda _: pd.DataFrame({'x': []}))
=====================================
test/test_missing_genome_sources.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from pyensembl import Genome
from nose.tools import eq_, ok_, assert_raises
=====================================
test/test_mouse.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from nose.tools import eq_, with_setup
from .data import (
=====================================
test/test_release_versions.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE
from nose.tools import raises
=====================================
test/test_search.py
=====================================
@@ -1,7 +1,6 @@
-from __future__ import absolute_import
+from nose.tools import eq_
from pyensembl import find_nearest_locus
-from nose.tools import eq_
from .common import test_ensembl_releases
@test_ensembl_releases()
=====================================
test/test_serialization.py
=====================================
@@ -1,5 +1,3 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -12,7 +10,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from __future__ import absolute_import
import pickle
from nose.tools import eq_, with_setup
from pyensembl import Genome, Transcript, Gene, Exon
=====================================
test/test_shell.py
=====================================
@@ -0,0 +1,12 @@
+from nose.tools import eq_, with_setup
+from pyensembl.shell import parser, all_combinations_of_ensembl_genomes
+from pyensembl import ensembl_grch38
+
+
+def test_genome_selection_grch38():
+ args = parser.parse_args(["install", "--release", "100", "--species", "human"])
+ genomes = all_combinations_of_ensembl_genomes(args)
+ assert len(genomes) == 1
+ genome = genomes[0]
+ eq_(genome.species.latin_name, "homo_sapiens")
+ eq_(genome.release, 100)
=====================================
test/test_timings.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import print_function, absolute_import
-
from pyensembl import genome_for_reference_name
from tinytimer import benchmark
=====================================
test/test_transcript_objects.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from pyensembl import Locus, cached_release
from nose.tools import eq_, assert_not_equal, assert_greater
=====================================
test/test_ucsc_gtf.py
=====================================
@@ -1,7 +1,6 @@
-from __future__ import absolute_import
+from nose.tools import eq_
from pyensembl import Genome, Database
-from nose.tools import eq_
from .common import TemporaryDirectory
from .data import data_path
View it on GitLab: https://salsa.debian.org/med-team/pyensembl/-/commit/9a3eea579f30bf96e272a0b94d5da656870be858
--
View it on GitLab: https://salsa.debian.org/med-team/pyensembl/-/commit/9a3eea579f30bf96e272a0b94d5da656870be858
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20221225/31133301/attachment-0001.htm>
More information about the debian-med-commit
mailing list