[med-svn] [Git][med-team/pyensembl][upstream] New upstream version 2.2.4+ds
Andreas Tille (@tille)
gitlab@salsa.debian.org
Sun Dec 25 21:53:18 GMT 2022
Andreas Tille pushed to branch upstream at Debian Med / pyensembl
Commits:
9a3eea57 by Andreas Tille at 2022-12-25T22:46:11+01:00
New upstream version 2.2.4+ds
- - - - -
27 changed files:
- PKG-INFO
- pyensembl.egg-info/PKG-INFO
- pyensembl/__init__.py
- pyensembl/common.py
- pyensembl/database.py
- pyensembl/ensembl_url_templates.py
- pyensembl/genome.py
- − pyensembl/memory_cache.py
- pyensembl/sequence_data.py
- pyensembl/species.py
- pyensembl/version.py
- + requirements.txt
- setup.py
- test/test_download_cache.py
- test/test_gene_objects.py
- test/test_id_length.py
- test/test_locus.py
- − test/test_memory_cache.py
- test/test_missing_genome_sources.py
- test/test_mouse.py
- test/test_release_versions.py
- test/test_search.py
- test/test_serialization.py
- + test/test_shell.py
- test/test_timings.py
- test/test_transcript_objects.py
- test/test_ucsc_gtf.py
Changes:
=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: pyensembl
-Version: 2.1.0
+Version: 2.2.4
Summary: Python interface to ensembl reference genome metadata
Home-page: https://github.com/openvax/pyensembl
Author: Alex Rubinsteyn
=====================================
pyensembl.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: pyensembl
-Version: 2.1.0
+Version: 2.2.4
Summary: Python interface to ensembl reference genome metadata
Home-page: https://github.com/openvax/pyensembl
Author: Alex Rubinsteyn
=====================================
pyensembl/__init__.py
=====================================
@@ -10,7 +10,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from .memory_cache import MemoryCache
from .database import Database
from .download_cache import DownloadCache
from .ensembl_release import EnsemblRelease, cached_release
@@ -41,7 +40,6 @@ from .version import __version__
__all__ = [
"__version__",
- "MemoryCache",
"DownloadCache",
"Database",
"EnsemblRelease",
=====================================
pyensembl/common.py
=====================================
@@ -14,7 +14,6 @@ import pickle
from functools import wraps
-
def dump_pickle(obj, filepath):
with open(filepath, "wb") as f:
# use lower protocol for compatibility between Python 2 and Python 3
=====================================
pyensembl/database.py
=====================================
@@ -16,7 +16,6 @@ import sqlite3
import datacache
from typechecks import require_integer, require_string
-
from gtfparse import read_gtf, create_missing_features
from .common import memoize
=====================================
pyensembl/ensembl_url_templates.py
=====================================
@@ -16,14 +16,14 @@ on the Ensembl ftp server.
For example, the human chromosomal DNA sequences for release 78 are in:
- ftp://ftp.ensembl.org/pub/release-78/fasta/homo_sapiens/dna/
+ https://ftp.ensembl.org/pub/release-78/fasta/homo_sapiens/dna/
"""
from .species import Species, find_species_by_name
from .ensembl_release_versions import check_release_number
-ENSEMBL_FTP_SERVER = "ftp://ftp.ensembl.org"
+ENSEMBL_FTP_SERVER = "https://ftp.ensembl.org"
# Example directories
# FASTA files: /pub/release-78/fasta/homo_sapiens/
=====================================
pyensembl/genome.py
=====================================
@@ -21,7 +21,6 @@ from os.path import exists, getsize
from serializable import Serializable
-from .memory_cache import MemoryCache
from .download_cache import DownloadCache
from .database import Database
from .exon import Exon
@@ -108,7 +107,6 @@ class Genome(Serializable):
copy_local_files_to_cache=self.copy_local_files_to_cache,
install_string_function=self.install_string,
cache_directory_path=cache_directory_path)
- self.memory_cache = MemoryCache()
self._init_lazy_fields()
@property
@@ -435,8 +433,7 @@ class Genome(Serializable):
def clear_cache(self):
"""
- Clear any in-memory cached values and short-lived on-disk
- materializations from MemoryCache
+ Clear any in-memory cached values
"""
for maybe_fn in self.__dict__.values():
# clear cache associated with all memoization decorators,
=====================================
pyensembl/memory_cache.py deleted
=====================================
@@ -1,136 +0,0 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Cache and serializing the results of expensive computations. Used in pyensembl
-primarily to cache the heavy-weight parsing of GTF files and various
-filtering operations on Ensembl entries.
-
-A piece of data is returned from one of three sources:
-1) Cache cold. Run the user-supplied compute_fn.
-2) Cache warm on disk. Parse or unpickle the serialized result into memory.
-3) Cache warm in memory. Return cached object.
-"""
-
-import logging
-from os import remove, stat
-from os.path import exists
-
-import pandas as pd
-
-from .common import load_pickle, dump_pickle
-
-
-logger = logging.getLogger(__name__)
-
-
-class MemoryCache(object):
- """
- In-memory and on-disk caching of long-running queries and computations.
- """
- def __init__(self):
- self._memory_cache = {}
-
- def is_empty(self, filename):
- return stat(filename).st_size == 0
-
- def delete_file(self, path):
- if exists(path):
- logger.info("Deleting cached file %s", path)
- remove(path)
-
- def remove_from_cache(self, key):
- if key in self._memory_cache:
- del self._memory_cache[key]
- self.delete_file(key)
-
- def clear_cached_objects(self):
- for key in self._memory_cache.keys():
- self.delete_file(key)
- self._memory_cache.clear()
-
- def _read_csv(self, csv_path):
- logger.info("Reading Dataframe from %s", csv_path)
- df = pd.read_csv(csv_path)
- if 'seqname' in df:
- # by default, Pandas will infer the type as int,
- # then switch to str when it hits non-numerical
- # chromosomes. Make sure whole column has the same type
- df['seqname'] = df['seqname'].map(str)
- return df
-
- def _write_csv(self, df, csv_path, chunksize=10**5):
- """
- Parameters
- ----------
- df : pandas.DataFrame
-
- csv_path : str
-
- chunksize : int
- Number of rows to write at a time. Helps to limit memory
- consumption while writing a CSV.
- """
- logger.info("Saving DataFrame to %s", csv_path)
- df.to_csv(csv_path, index=False, chunksize=chunksize)
-
- def cached_dataframe(self, csv_path, compute_fn):
- """
- If a CSV path is in the _memory_cache, then return that cached value.
-
- If we've already saved the DataFrame as a CSV then load it.
-
- Otherwise run the provided `compute_fn`, and store its result
- in memory and and save it as a CSV.
- """
- if not csv_path.endswith(".csv"):
- raise ValueError("Invalid path '%s', must be a CSV file" % csv_path)
-
- if csv_path in self._memory_cache:
- return self._memory_cache[csv_path]
-
- if exists(csv_path) and not self.is_empty(csv_path):
- df = self._read_csv(csv_path)
- else:
- df = compute_fn()
- if not isinstance(df, pd.DataFrame):
- raise TypeError(
- "Expected compute_fn to return DataFrame, got %s : %s" % (
- df, type(df)))
- self._write_csv(df, csv_path)
- self._memory_cache[csv_path] = df
- return df
-
- def cached_object(self, path, compute_fn):
- """
- If `cached_object` has already been called for a value of `path` in this
- running Python instance, then it should have a cached value in the
- _memory_cache; return that value.
-
- If this function was never called before with a particular value of
- `path`, then call compute_fn, and pickle it to `path`.
-
- If `path` already exists, unpickle it and store that value in
- _memory_cache.
- """
- if path in self._memory_cache:
- return self._memory_cache[path]
-
- if exists(path) and not self.is_empty(path):
- obj = load_pickle(path)
- else:
- obj = compute_fn()
- dump_pickle(obj, path)
- self._memory_cache[path] = obj
- return obj
=====================================
pyensembl/sequence_data.py
=====================================
@@ -14,10 +14,7 @@ from os import remove
from os.path import exists, abspath, split, join
import logging
from collections import Counter
-
-from six.moves import cPickle as pickle
-from six import string_types
-
+import pickle
from .common import (load_pickle, dump_pickle)
from .fasta import parse_fasta_dictionary
@@ -34,7 +31,7 @@ class SequenceData(object):
fasta_paths,
cache_directory_path=None):
- if isinstance(fasta_paths, string_types):
+ if type(fasta_paths) is str:
fasta_paths = [fasta_paths]
self.fasta_paths = [abspath(path) for path in fasta_paths]
=====================================
pyensembl/species.py
=====================================
@@ -187,7 +187,12 @@ dog = Species.register(
cat = Species.register(
latin_name="felis_catus",
synonyms=["cat"],
- reference_assemblies={"Felis_catus_6.2": (75, MAX_ENSEMBL_RELEASE)})
+ reference_assemblies={
+ "Felis_catus_6.2": (75, 90),
+ "Felis_catus_8.0": (91, 92),
+ "Felis_catus_9.0": (93, MAX_ENSEMBL_RELEASE),
+
+ })
chicken = Species.register(
latin_name="gallus_gallus",
=====================================
pyensembl/version.py
=====================================
@@ -1 +1 @@
-__version__ = '2.1.0'
+__version__ = '2.2.4'
=====================================
requirements.txt
=====================================
@@ -0,0 +1,8 @@
+typechecks>=0.0.2
+datacache>=1.1.4
+memoized-property>=1.0.2
+tinytimer
+gtfparse>=1.3.0,<2.0.0
+serializable
+nose>=1.3.3
+pylint>=1.4.4
=====================================
setup.py
=====================================
@@ -41,6 +41,9 @@ if not version:
raise RuntimeError('Cannot find version information')
if __name__ == '__main__':
+ with open("requirements.txt") as f:
+ requirements = [l.strip() for l in f]
+
setup(
name=package_name,
version=version,
@@ -63,17 +66,11 @@ if __name__ == '__main__':
'Programming Language :: Python',
'Topic :: Scientific/Engineering :: Bio-Informatics',
],
- install_requires=[
- "typechecks>=0.0.2",
- "pandas>=0.15",
- "datacache>=1.1.4",
- "memoized-property>=1.0.2",
- "gtfparse>=1.1.0",
- "serializable",
- "tinytimer",
- ],
+ install_requires=requirements,
long_description=readme_markdown,
long_description_content_type='text/markdown',
packages=[package_name],
- package_data={package_name: ['logging.conf']},
+ package_data={
+ package_name: ['logging.conf', '../requirements.txt'],
+ },
)
=====================================
test/test_download_cache.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from nose.tools import assert_raises, ok_
from pyensembl.download_cache import (
DownloadCache,
=====================================
test/test_gene_objects.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from nose.tools import eq_
from .common import test_ensembl_releases
=====================================
test/test_id_length.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from .common import major_releases
from nose.tools import nottest
=====================================
test/test_locus.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from pyensembl.locus import Locus
from pyensembl.normalization import normalize_chromosome
=====================================
test/test_memory_cache.py deleted
=====================================
@@ -1,94 +0,0 @@
-from __future__ import absolute_import
-
-import tempfile
-
-from pyensembl import MemoryCache
-
-import pandas as pd
-from nose.tools import raises
-
-memory_cache = MemoryCache()
-
-class Counter(object):
- """
- Use this class to count how many times a function gets called by
- cached_object and cached_dataframe.
- """
- def __init__(self):
- self.count = 0
-
- def increment(self):
- self.count += 1
- return self.count
-
- def increment_dataframe(self):
- value = self.increment()
- return pd.DataFrame({'x': [value]})
-
-def test_cached_object_with_tempfile():
- """
- test_cached_object_with_tempfile : A temporary file exists before
- calling into compute_cache.cached_object but is empty, should be treated
- as if result has never been computed before (rather than trying to load
- the empty file).
- """
- counter = Counter()
- with tempfile.NamedTemporaryFile() as f:
- # call repeatedly to test the hot and cold cache logic
- result = memory_cache.cached_object(
- f.name, compute_fn=counter.increment)
- assert result == 1, "Expected result=1, got %s" % (result,)
- assert counter.count == 1, \
- "Expected compute_fn to be called once, got %s" % (counter.count,)
-
-
-def test_cached_dataframe_with_tempfile():
- """
- test_cached_dataframe_with_tempfile : A temporary file exists before
- calling into compute_cache.cached_dataframe but is empty,
- should be treated as if result has never been computed before
- (rather than trying to load the empty file).
- """
- counter = Counter()
- with tempfile.NamedTemporaryFile(suffix='.csv') as f:
- # call repeatedly to test hot and cold cache logic
- for _ in range(2):
- df = memory_cache.cached_dataframe(
- f.name, compute_fn=counter.increment_dataframe)
- # get counter value from inside of dataframe
- result = df['x'].iloc[0]
- assert result == 1, \
- "Expected result=1, got %s" % (result,)
- assert counter.count == 1, \
- "Expected compute_fn to be called once, got %s" % (
- counter.count,)
-
-def test_cached_dataframe_returns_correct_type():
- def make_a_dataframe():
- return pd.DataFrame({'x': [0, 1, 2]})
- with tempfile.NamedTemporaryFile(suffix='.csv') as f:
- # call repeatedly to test the cold and hot cache logic
- for _ in range(2):
- df = memory_cache.cached_dataframe(
- f.name, compute_fn=make_a_dataframe)
- assert isinstance(df, pd.DataFrame), \
- "Expected DataFrame, got %s : %s" % (df, type(df))
-
-def test_cached_object_with_list_returns_correct_type():
- def make_a_list():
- return [1, 2, 3]
- with tempfile.NamedTemporaryFile() as f:
- # call repeatedly to test the cold and hot cache logic
- for _ in range(2):
- df = memory_cache.cached_object(
- f.name, compute_fn=make_a_list)
- assert isinstance(df, list), \
- "Expected list, got %s : %s" % (df, type(df))
-
-@raises(Exception)
-def test_dataframe_path_must_be_csv():
- # compute_cache should raise an exception when filename doesn't
- # end with .csv extension
- memory_cache.cached_dataframe(
- csv_path="tempfile_not_csv",
- compute_fn=lambda _: pd.DataFrame({'x': []}))
=====================================
test/test_missing_genome_sources.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from pyensembl import Genome
from nose.tools import eq_, ok_, assert_raises
=====================================
test/test_mouse.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from nose.tools import eq_, with_setup
from .data import (
=====================================
test/test_release_versions.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE
from nose.tools import raises
=====================================
test/test_search.py
=====================================
@@ -1,7 +1,6 @@
-from __future__ import absolute_import
+from nose.tools import eq_
from pyensembl import find_nearest_locus
-from nose.tools import eq_
from .common import test_ensembl_releases
@test_ensembl_releases()
=====================================
test/test_serialization.py
=====================================
@@ -1,5 +1,3 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -12,7 +10,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from __future__ import absolute_import
import pickle
from nose.tools import eq_, with_setup
from pyensembl import Genome, Transcript, Gene, Exon
=====================================
test/test_shell.py
=====================================
@@ -0,0 +1,12 @@
+from nose.tools import eq_, with_setup
+from pyensembl.shell import parser, all_combinations_of_ensembl_genomes
+from pyensembl import ensembl_grch38
+
+
+def test_genome_selection_grch38():
+ args = parser.parse_args(["install", "--release", "100", "--species", "human"])
+ genomes = all_combinations_of_ensembl_genomes(args)
+ assert len(genomes) == 1
+ genome = genomes[0]
+ eq_(genome.species.latin_name, "homo_sapiens")
+ eq_(genome.release, 100)
=====================================
test/test_timings.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import print_function, absolute_import
-
from pyensembl import genome_for_reference_name
from tinytimer import benchmark
=====================================
test/test_transcript_objects.py
=====================================
@@ -1,5 +1,3 @@
-from __future__ import absolute_import
-
from pyensembl import Locus, cached_release
from nose.tools import eq_, assert_not_equal, assert_greater
=====================================
test/test_ucsc_gtf.py
=====================================
@@ -1,7 +1,6 @@
-from __future__ import absolute_import
+from nose.tools import eq_
from pyensembl import Genome, Database
-from nose.tools import eq_
from .common import TemporaryDirectory
from .data import data_path
View it on GitLab: https://salsa.debian.org/med-team/pyensembl/-/commit/9a3eea579f30bf96e272a0b94d5da656870be858
--
View it on GitLab: https://salsa.debian.org/med-team/pyensembl/-/commit/9a3eea579f30bf96e272a0b94d5da656870be858
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20221225/31133301/attachment-0001.htm>
More information about the debian-med-commit
mailing list