[med-svn] [Git][med-team/pyensembl][upstream] New upstream version 2.2.9+ds
Lance Lin (@linqigang)
gitlab at salsa.debian.org
Wed Jan 3 16:03:06 GMT 2024
Lance Lin pushed to branch upstream at Debian Med / pyensembl
Commits:
df67ba56 by Lance Lin at 2024-01-03T21:20:26+07:00
New upstream version 2.2.9+ds
- - - - -
23 changed files:
- PKG-INFO
- pyensembl.egg-info/PKG-INFO
- pyensembl/__init__.py
- pyensembl/common.py
- pyensembl/database.py
- pyensembl/download_cache.py
- pyensembl/ensembl_release.py
- pyensembl/ensembl_release_versions.py
- pyensembl/ensembl_url_templates.py
- pyensembl/exon.py
- pyensembl/fasta.py
- pyensembl/gene.py
- pyensembl/genome.py
- pyensembl/locus.py
- pyensembl/locus_with_genome.py
- pyensembl/normalization.py
- pyensembl/reference_name.py
- pyensembl/search.py
- pyensembl/sequence_data.py
- pyensembl/shell.py
- pyensembl/species.py
- pyensembl/version.py
- setup.py
Changes:
=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: pyensembl
-Version: 2.2.8
+Version: 2.2.9
Summary: Python interface to ensembl reference genome metadata
Home-page: https://github.com/openvax/pyensembl
Author: Alex Rubinsteyn
=====================================
pyensembl.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: pyensembl
-Version: 2.2.8
+Version: 2.2.9
Summary: Python interface to ensembl reference genome metadata
Home-page: https://github.com/openvax/pyensembl
Author: Alex Rubinsteyn
=====================================
pyensembl/__init__.py
=====================================
@@ -30,11 +30,7 @@ from .reference_name import (
from .search import find_nearest_locus
from .sequence_data import SequenceData
-from .species import (
- find_species_by_name,
- check_species_object,
- normalize_species_name
-)
+from .species import find_species_by_name, check_species_object, normalize_species_name
from .transcript import Transcript
from .version import __version__
=====================================
pyensembl/common.py
=====================================
@@ -14,16 +14,19 @@ import pickle
from functools import wraps
+
def dump_pickle(obj, filepath):
with open(filepath, "wb") as f:
# use lower protocol for compatibility between Python 2 and Python 3
pickle.dump(obj, file=f, protocol=2)
+
def load_pickle(filepath):
with open(filepath, "rb") as f:
obj = pickle.load(f)
return obj
+
def _memoize_cache_key(args, kwargs):
"""Turn args tuple and kwargs dictionary into a hashable key.
@@ -39,13 +42,14 @@ def _memoize_cache_key(args, kwargs):
cache_key_list.append(tuple(arg))
else:
cache_key_list.append(arg)
- for (k, v) in sorted(kwargs.items()):
+ for k, v in sorted(kwargs.items()):
if type(v) is list:
cache_key_list.append((k, tuple(v)))
else:
cache_key_list.append((k, v))
return tuple(cache_key_list)
+
def memoize(fn):
"""Simple reset-able memoization decorator for functions and methods,
assumes that all arguments to the function can be hashed and
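For readers skimming this hunk: _memoize_cache_key exists so that memoize can cache on arbitrary call signatures, converting unhashable lists into tuples. A minimal self-contained sketch of that pattern (not pyensembl's exact implementation, which also supports clearing the caches):

    from functools import wraps

    def memoize_sketch(fn):
        """Cache results keyed on hashable (args, sorted kwargs) tuples."""
        cache = {}

        @wraps(fn)
        def wrapped(*args, **kwargs):
            key_parts = [tuple(a) if isinstance(a, list) else a for a in args]
            for k, v in sorted(kwargs.items()):
                key_parts.append((k, tuple(v) if isinstance(v, list) else v))
            key = tuple(key_parts)
            if key not in cache:
                cache[key] = fn(*args, **kwargs)
            return cache[key]

        return wrapped

    @memoize_sketch
    def total(values, scale=1):
        return sum(values) * scale

    total([1, 2, 3])   # computed: 6
    total([1, 2, 3])   # served from the cache: 6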
=====================================
pyensembl/database.py
=====================================
@@ -37,12 +37,13 @@ class Database(object):
"""
def __init__(
- self,
- gtf_path,
- install_string=None,
- cache_directory_path=None,
- restrict_gtf_columns=None,
- restrict_gtf_features=None):
+ self,
+ gtf_path,
+ install_string=None,
+ cache_directory_path=None,
+ restrict_gtf_columns=None,
+ restrict_gtf_features=None,
+ ):
"""
Parameters
----------
@@ -85,9 +86,7 @@ class Database(object):
self._query_cache = {}
def __eq__(self, other):
- return (
- other.__class__ is Database and
- self.gtf_path == other.gtf_path)
+ return other.__class__ is Database and self.gtf_path == other.gtf_path
def __str__(self):
return "Database(gtf_path=%s)" % (self.gtf_path,)
@@ -115,14 +114,14 @@ class Database(object):
missing values for that feature or are the same as the table's primary key.
"""
candidate_column_groups = [
- ['seqname', 'start', 'end'],
- ['gene_name'],
- ['gene_id'],
- ['transcript_id'],
- ['transcript_name'],
- ['exon_id'],
- ['protein_id'],
- ['ccds_id'],
+ ["seqname", "start", "end"],
+ ["gene_name"],
+ ["gene_id"],
+ ["transcript_id"],
+ ["transcript_name"],
+ ["exon_id"],
+ ["protein_id"],
+ ["ccds_id"],
]
indices = []
column_set = set(column_names)
@@ -137,8 +136,8 @@ class Database(object):
# other GTFs)
if column_name not in column_set:
logger.info(
- "Skipping database index for {%s}",
- ", ".join(column_group))
+ "Skipping database index for {%s}", ", ".join(column_group)
+ )
skip = True
if skip:
continue
@@ -147,10 +146,7 @@ class Database(object):
# mapping from database tables to their primary keys
# sadly exon IDs *are* not unique, so can't be in this dict
- PRIMARY_KEY_COLUMNS = {
- 'gene': 'gene_id',
- 'transcript': 'transcript_id'
- }
+ PRIMARY_KEY_COLUMNS = {"gene": "gene_id", "transcript": "transcript_id"}
def _get_primary_key(self, feature_name, feature_df):
"""Name of primary key for a feature table (e.g. "gene" -> "gene_id")
@@ -167,13 +163,13 @@ class Database(object):
if primary_key_values.isnull().any():
raise ValueError(
"Column '%s' can't be primary key of table '%s'"
- " because it contains nulls values" % (
- primary_key, feature_name))
+ " because it contains nulls values" % (primary_key, feature_name)
+ )
elif len(primary_key_values.unique()) < len(primary_key_values):
raise ValueError(
"Column '%s' can't be primary key of table '%s'"
- " because it contains repeated values" % (
- primary_key, feature_name))
+ " because it contains repeated values" % (primary_key, feature_name)
+ )
else:
return primary_key
@@ -196,9 +192,7 @@ class Database(object):
result.append(index_group)
return result
- def create(
- self,
- overwrite=False):
+ def create(self, overwrite=False):
"""
Create the local database (including indexing) if it's not
already set up. If `overwrite` is True, always re-create
@@ -210,8 +204,8 @@ class Database(object):
datacache.ensure_dir(self.cache_directory_path)
df = self._load_gtf_as_dataframe(
- usecols=self.restrict_gtf_columns,
- features=self.restrict_gtf_features)
+ usecols=self.restrict_gtf_columns, features=self.restrict_gtf_features
+ )
all_index_groups = self._all_possible_indices(df.columns)
if self.restrict_gtf_features:
@@ -219,7 +213,7 @@ class Database(object):
else:
# split single DataFrame into dictionary mapping each unique
# feature name onto that subset of the data
- feature_names = df['feature'].unique()
+ feature_names = df["feature"].unique()
dataframes = {}
# every table gets the same set of indices
indices_dict = {}
@@ -237,9 +231,8 @@ class Database(object):
primary_keys[feature] = primary_key
indices_dict[feature] = self._feature_indices(
- all_index_groups,
- primary_key,
- df_subset)
+ all_index_groups, primary_key, df_subset
+ )
self._connection = datacache.db_from_dataframes_with_absolute_path(
db_path=self.local_db_path,
@@ -247,7 +240,8 @@ class Database(object):
table_names_to_primary_keys=primary_keys,
table_names_to_indices=indices_dict,
overwrite=overwrite,
- version=DATABASE_SCHEMA_VERSION)
+ version=DATABASE_SCHEMA_VERSION,
+ )
return self._connection
def _get_connection(self):
@@ -260,8 +254,8 @@ class Database(object):
# TODO: expose this more explicitly in datacache
#
self._connection = datacache.connect_if_correct_version(
- self.local_db_path,
- DATABASE_SCHEMA_VERSION)
+ self.local_db_path, DATABASE_SCHEMA_VERSION
+ )
return self._connection
@property
@@ -301,15 +295,16 @@ class Database(object):
return column_name in self.columns(table_name)
def column_values_at_locus(
- self,
- column_name,
- feature,
- contig,
- position,
- end=None,
- strand=None,
- distinct=False,
- sorted=False):
+ self,
+ column_name,
+ feature,
+ contig,
+ position,
+ end=None,
+ strand=None,
+ distinct=False,
+ sorted=False,
+ ):
"""
Get the non-null values of a column from the database
at a particular range of loci
@@ -329,8 +324,13 @@ class Database(object):
require_integer(end, "end")
if not self.column_exists(feature, column_name):
- raise ValueError("Table %s doesn't have column %s" % (
- feature, column_name,))
+ raise ValueError(
+ "Table %s doesn't have column %s"
+ % (
+ feature,
+ column_name,
+ )
+ )
if distinct:
distinct_string = "DISTINCT "
@@ -344,7 +344,11 @@ class Database(object):
AND start <= ?
AND end >= ?
- """ % (distinct_string, column_name, feature)
+ """ % (
+ distinct_string,
+ column_name,
+ feature,
+ )
query_params = [contig, end, position]
@@ -362,13 +366,8 @@ class Database(object):
return results
def distinct_column_values_at_locus(
- self,
- column,
- feature,
- contig,
- position,
- end=None,
- strand=None):
+ self, column, feature, contig, position, end=None, strand=None
+ ):
"""
Gather all the distinct values for a property/column at some specified
locus.
@@ -404,7 +403,8 @@ class Database(object):
end=end,
strand=strand,
distinct=True,
- sorted=True)
+ sorted=True,
+ )
def run_sql_query(self, sql, required=False, query_params=[]):
"""
@@ -426,30 +426,33 @@ class Database(object):
try:
cursor = self.connection.execute(sql, query_params)
except sqlite3.OperationalError as e:
- error_message = e.message if hasattr(e, 'message') else str(e)
+ error_message = e.message if hasattr(e, "message") else str(e)
logger.warn(
- "Encountered error \"%s\" from query \"%s\" with parameters %s",
+ 'Encountered error "%s" from query "%s" with parameters %s',
error_message,
sql,
- query_params)
+ query_params,
+ )
raise
results = cursor.fetchall()
if required and not results:
raise ValueError(
- "No results found for query:\n%s\nwith parameters: %s" % (
- sql, query_params))
+ "No results found for query:\n%s\nwith parameters: %s"
+ % (sql, query_params)
+ )
return results
@memoize
def query(
- self,
- select_column_names,
- filter_column,
- filter_value,
- feature,
- distinct=False,
- required=False):
+ self,
+ select_column_names,
+ filter_column,
+ filter_value,
+ feature,
+ distinct=False,
+ required=False,
+ ):
"""
Construct a SQL query and run against the sqlite3 database,
filtered both by the feature type and a user-provided column/value.
@@ -458,50 +461,49 @@ class Database(object):
SELECT %s%s
FROM %s
WHERE %s = ?
- """ % ("distinct " if distinct else "",
- ", ".join(select_column_names),
- feature,
- filter_column)
+ """ % (
+ "distinct " if distinct else "",
+ ", ".join(select_column_names),
+ feature,
+ filter_column,
+ )
query_params = [filter_value]
- return self.run_sql_query(
- sql, required=required, query_params=query_params)
+ return self.run_sql_query(sql, required=required, query_params=query_params)
def query_one(
- self,
- select_column_names,
- filter_column,
- filter_value,
- feature,
- distinct=False,
- required=False):
+ self,
+ select_column_names,
+ filter_column,
+ filter_value,
+ feature,
+ distinct=False,
+ required=False,
+ ):
results = self.query(
select_column_names,
filter_column,
filter_value,
feature,
distinct=distinct,
- required=required)
+ required=required,
+ )
if len(results) == 0:
if required:
- raise ValueError("%s not found: %s" % (
- filter_column, filter_value))
+ raise ValueError("%s not found: %s" % (filter_column, filter_value))
else:
return None
elif len(results) > 1:
raise ValueError(
- "Found multiple entries with %s=%s (%s)" % (
- filter_column, filter_value, results))
+ "Found multiple entries with %s=%s (%s)"
+ % (filter_column, filter_value, results)
+ )
return results[0]
@memoize
def query_feature_values(
- self,
- column,
- feature,
- distinct=True,
- contig=None,
- strand=None):
+ self, column, feature, distinct=True, contig=None, strand=None
+ ):
"""
Run a SQL query against the sqlite3 database, filtered
only on the feature type.
@@ -510,7 +512,11 @@ class Database(object):
SELECT %s%s
FROM %s
WHERE 1=1
- """ % ("DISTINCT " if distinct else "", column, feature)
+ """ % (
+ "DISTINCT " if distinct else "",
+ column,
+ feature,
+ )
query_params = []
if contig:
@@ -528,10 +534,8 @@ class Database(object):
def query_distinct_on_contig(self, column_name, feature, contig):
return self.query_feature_values(
- column=column_name,
- feature=feature,
- contig=contig,
- distinct=True)
+ column=column_name, feature=feature, contig=contig, distinct=True
+ )
def query_loci(self, filter_column, filter_value, feature):
"""
@@ -558,11 +562,11 @@ class Database(object):
filter_value=filter_value,
feature=feature,
distinct=True,
- required=True)
+ required=True,
+ )
return [
Locus(contig, start, end, strand)
- for (contig, start, end, strand)
- in result_tuples
+ for (contig, start, end, strand) in result_tuples
]
def query_locus(self, filter_column, filter_value, feature):
@@ -584,16 +588,19 @@ class Database(object):
Returns single Locus object.
"""
loci = self.query_loci(
- filter_column=filter_column,
- filter_value=filter_value,
- feature=feature)
+ filter_column=filter_column, filter_value=filter_value, feature=feature
+ )
if len(loci) == 0:
- raise ValueError("Couldn't find locus for %s with %s = %s" % (
- feature, filter_column, filter_value))
+ raise ValueError(
+ "Couldn't find locus for %s with %s = %s"
+ % (feature, filter_column, filter_value)
+ )
elif len(loci) > 1:
- raise ValueError("Too many loci for %s with %s = %s: %s" % (
- feature, filter_column, filter_value, loci))
+ raise ValueError(
+ "Too many loci for %s with %s = %s: %s"
+ % (feature, filter_column, filter_value, loci)
+ )
return loci[0]
def _load_gtf_as_dataframe(self, usecols=None, features=None):
@@ -609,7 +616,8 @@ class Database(object):
},
infer_biotype_column=True,
usecols=usecols,
- features=features)
+ features=features,
+ )
column_names = set(df.keys())
expect_gene_feature = features is None or "gene" in features
@@ -627,12 +635,10 @@ class Database(object):
dataframe=df,
unique_keys={"gene": "gene_id"},
extra_columns={
- "gene": {
- "gene_name",
- "gene_biotype"
- }.intersection(column_names),
+ "gene": {"gene_name", "gene_biotype"}.intersection(column_names),
},
- missing_value="")
+ missing_value="",
+ )
logger.info("Done.")
if expect_transcript_feature and "transcript" not in observed_features:
@@ -650,7 +656,8 @@ class Database(object):
"protein_id",
}.intersection(column_names)
},
- missing_value="")
+ missing_value="",
+ )
logger.info("Done.")
return df
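A short usage sketch for the reformatted Database API (the GTF path and gene ID are illustrative; the method names and keyword arguments are the ones visible in the hunks above):

    from pyensembl.database import Database

    db = Database(gtf_path="/data/Homo_sapiens.GRCh38.110.gtf.gz")

    # Single-row lookup filtered by feature type and one column/value pair;
    # the sqlite3 database is created lazily on first use.
    row = db.query_one(
        select_column_names=["gene_name"],
        filter_column="gene_id",
        filter_value="ENSG00000141510",
        feature="gene",
    )

    # Distinct gene IDs overlapping a locus (inclusive coordinates).
    gene_ids = db.distinct_column_values_at_locus(
        column="gene_id", feature="gene", contig="17", position=7675000
    )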
=====================================
pyensembl/download_cache.py
=====================================
@@ -26,9 +26,8 @@ CACHE_DIR_ENV_KEY = "PYENSEMBL_CACHE_DIR"
def cache_subdirectory(
- reference_name=None,
- annotation_name=None,
- annotation_version=None):
+ reference_name=None, annotation_name=None, annotation_version=None
+):
"""
Which cache subdirectory to use for a given annotation database
over a particular reference. All arguments can be omitted to just get
@@ -55,7 +54,7 @@ class MissingLocalFile(Exception):
self.path = path
def __str__(self):
- return("MissingFile(%s)" % self.path)
+ return "MissingFile(%s)" % self.path
class DownloadCache(object):
@@ -63,15 +62,17 @@ class DownloadCache(object):
Downloads remote files to cache, optionally copies local files into cache,
raises custom message if data is missing.
"""
+
def __init__(
- self,
- reference_name,
- annotation_name,
- annotation_version=None,
- decompress_on_download=False,
- copy_local_files_to_cache=False,
- install_string_function=None,
- cache_directory_path=None):
+ self,
+ reference_name,
+ annotation_name,
+ annotation_version=None,
+ decompress_on_download=False,
+ copy_local_files_to_cache=False,
+ install_string_function=None,
+ cache_directory_path=None,
+ ):
"""
Parameters
----------
@@ -116,12 +117,13 @@ class DownloadCache(object):
self.cache_subdirectory = cache_subdirectory(
reference_name=reference_name,
annotation_name=annotation_name,
- annotation_version=annotation_version)
+ annotation_version=annotation_version,
+ )
# If `CACHE_DIR_ENV_KEY` is set, the cache will be saved there
self._cache_directory_path = datacache.get_data_dir(
- subdir=self.cache_subdirectory,
- envkey=CACHE_DIR_ENV_KEY)
+ subdir=self.cache_subdirectory, envkey=CACHE_DIR_ENV_KEY
+ )
self.decompress_on_download = decompress_on_download
self.copy_local_files_to_cache = copy_local_files_to_cache
@@ -136,19 +138,19 @@ class DownloadCache(object):
Fields used for hashing, string representation, equality comparison
"""
return (
- ('reference_name', self.reference_name,),
- ('annotation_name', self.annotation_name),
- ('annotation_version', self.annotation_version),
- ('cache_directory_path', self.cache_directory_path),
- ('decompress_on_download', self.decompress_on_download),
- ('copy_local_files_to_cache', self.copy_local_files_to_cache)
+ (
+ "reference_name",
+ self.reference_name,
+ ),
+ ("annotation_name", self.annotation_name),
+ ("annotation_version", self.annotation_version),
+ ("cache_directory_path", self.cache_directory_path),
+ ("decompress_on_download", self.decompress_on_download),
+ ("copy_local_files_to_cache", self.copy_local_files_to_cache),
)
def __eq__(self, other):
- return (
- other.__class__ is DownloadCache and
- self._fields() == other._fields()
- )
+ return other.__class__ is DownloadCache and self._fields() == other._fields()
def __hash__(self):
return hash(self._fields())
@@ -184,7 +186,7 @@ class DownloadCache(object):
"""
for ext in [".gz", ".gzip", ".zip"]:
if filename.endswith(ext):
- return filename[:-len(ext)]
+ return filename[: -len(ext)]
return filename
def cached_path(self, path_or_url):
@@ -200,21 +202,18 @@ class DownloadCache(object):
# for stripping decompression extensions for both local
# and remote files
local_filename = datacache.build_local_filename(
- download_url=path_or_url,
- filename=remote_filename,
- decompress=False)
+ download_url=path_or_url, filename=remote_filename, decompress=False
+ )
else:
local_filename = remote_filename
# if we expect the download function to decompress this file then
# we should use its name without the compression extension
if self.decompress_on_download:
- local_filename = self._remove_compression_suffix_if_present(
- local_filename)
+ local_filename = self._remove_compression_suffix_if_present(local_filename)
if len(local_filename) == 0:
- raise ValueError("Can't determine local filename for %s" % (
- path_or_url,))
+ raise ValueError("Can't determine local filename for %s" % (path_or_url,))
return join(self.cache_directory_path, local_filename)
@@ -229,9 +228,8 @@ class DownloadCache(object):
logger.info("Fetching %s from URL %s", cached_path, url)
datacache.ensure_dir(self.cache_directory_path)
datacache.download._download_and_decompress_if_necessary(
- full_path=cached_path,
- download_url=url,
- timeout=3600)
+ full_path=cached_path, download_url=url, timeout=3600
+ )
elif missing:
raise MissingRemoteFile(url)
return cached_path
@@ -254,10 +252,8 @@ class DownloadCache(object):
return cached_path
def download_or_copy_if_necessary(
- self,
- path_or_url,
- download_if_missing=False,
- overwrite=False):
+ self, path_or_url, download_if_missing=False, overwrite=False
+ ):
"""
Download a remote file or copy
Get the local path to a possibly remote file.
@@ -283,9 +279,8 @@ class DownloadCache(object):
raise ValueError("Expected non-empty string for path_or_url")
if self.is_url_format(path_or_url):
return self._download_if_necessary(
- path_or_url,
- download_if_missing,
- overwrite)
+ path_or_url, download_if_missing, overwrite
+ )
else:
return self._copy_if_necessary(path_or_url, overwrite)
@@ -293,23 +288,22 @@ class DownloadCache(object):
missing_urls = list(missing_urls_dict.values())
n_missing = len(missing_urls)
error_message = "Missing genome data file%s from %s." % (
- ("s", missing_urls) if n_missing > 1 else ("", missing_urls[0]))
+ ("s", missing_urls) if n_missing > 1 else ("", missing_urls[0])
+ )
if self.install_string_function:
install_string = self.install_string_function()
error_message += " Run %s" % install_string
raise ValueError(error_message)
def local_path_or_install_error(
- self,
- field_name,
- path_or_url,
- download_if_missing=False,
- overwrite=False):
+ self, field_name, path_or_url, download_if_missing=False, overwrite=False
+ ):
try:
return self.download_or_copy_if_necessary(
path_or_url,
download_if_missing=download_if_missing,
- overwrite=overwrite)
+ overwrite=overwrite,
+ )
except MissingRemoteFile:
self._raise_missing_file_error({field_name: path_or_url})
@@ -319,9 +313,9 @@ class DownloadCache(object):
"""
if isdir(self.cache_directory_path):
for filename in listdir(self.cache_directory_path):
- delete = (
- any([filename.endswith(ext) for ext in suffixes]) or
- any([filename.startswith(pre) for pre in prefixes]))
+ delete = any([filename.endswith(ext) for ext in suffixes]) or any(
+ [filename.startswith(pre) for pre in prefixes]
+ )
if delete:
path = join(self.cache_directory_path, filename)
logger.info("Deleting %s", path)
=====================================
pyensembl/ensembl_release.py
=====================================
@@ -20,11 +20,7 @@ from .genome import Genome
from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE
from .species import check_species_object, human
-from .ensembl_url_templates import (
- ENSEMBL_FTP_SERVER,
- make_gtf_url,
- make_fasta_url
-)
+from .ensembl_url_templates import ENSEMBL_FTP_SERVER, make_gtf_url, make_fasta_url
class EnsemblRelease(Genome):
@@ -32,6 +28,7 @@ class EnsemblRelease(Genome):
Bundles together the genomic annotation and sequence data associated with
a particular release of the Ensembl database.
"""
+
@classmethod
def normalize_init_values(cls, release, species, server):
"""
@@ -50,10 +47,8 @@ class EnsemblRelease(Genome):
@classmethod
def cached(
- cls,
- release=MAX_ENSEMBL_RELEASE,
- species=human,
- server=ENSEMBL_FTP_SERVER):
+ cls, release=MAX_ENSEMBL_RELEASE, species=human, server=ENSEMBL_FTP_SERVER
+ ):
"""
Construct EnsemblRelease if it's never been made before, otherwise
return an old instance.
@@ -66,29 +61,29 @@ class EnsemblRelease(Genome):
return genome
def __init__(
- self,
- release=MAX_ENSEMBL_RELEASE,
- species=human,
- server=ENSEMBL_FTP_SERVER):
+ self, release=MAX_ENSEMBL_RELEASE, species=human, server=ENSEMBL_FTP_SERVER
+ ):
self.release, self.species, self.server = self.normalize_init_values(
- release=release, species=species, server=server)
+ release=release, species=species, server=server
+ )
self.gtf_url = make_gtf_url(
- ensembl_release=self.release,
- species=self.species,
- server=self.server)
+ ensembl_release=self.release, species=self.species, server=self.server
+ )
self.transcript_fasta_urls = [
make_fasta_url(
ensembl_release=self.release,
species=self.species.latin_name,
sequence_type="cdna",
- server=server),
+ server=server,
+ ),
make_fasta_url(
ensembl_release=self.release,
species=self.species.latin_name,
sequence_type="ncrna",
- server=server)
+ server=server,
+ ),
]
self.protein_fasta_urls = [
@@ -96,7 +91,9 @@ class EnsemblRelease(Genome):
ensembl_release=self.release,
species=self.species.latin_name,
sequence_type="pep",
- server=self.server)]
+ server=self.server,
+ )
+ ]
self.reference_name = self.species.which_reference(self.release)
@@ -107,33 +104,33 @@ class EnsemblRelease(Genome):
annotation_version=self.release,
gtf_path_or_url=self.gtf_url,
transcript_fasta_paths_or_urls=self.transcript_fasta_urls,
- protein_fasta_paths_or_urls=self.protein_fasta_urls)
+ protein_fasta_paths_or_urls=self.protein_fasta_urls,
+ )
def install_string(self):
return "pyensembl install --release %d --species %s" % (
self.release,
- self.species.latin_name)
+ self.species.latin_name,
+ )
def __str__(self):
return "EnsemblRelease(release=%d, species='%s')" % (
self.release,
- self.species.latin_name)
+ self.species.latin_name,
+ )
def __eq__(self, other):
return (
- other.__class__ is EnsemblRelease and
- self.release == other.release and
- self.species == other.species)
+ other.__class__ is EnsemblRelease
+ and self.release == other.release
+ and self.species == other.species
+ )
def __hash__(self):
return hash((self.release, self.species))
def to_dict(self):
- return {
- "release": self.release,
- "species": self.species,
- "server": self.server
- }
+ return {"release": self.release, "species": self.species, "server": self.server}
@classmethod
def from_dict(cls, state_dict):
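A usage sketch tying this together (release 110 is the new MAX_ENSEMBL_RELEASE bumped in this upstream version; cached() reuses one instance per release/species/server combination):

    from pyensembl import EnsemblRelease

    genome = EnsemblRelease.cached(release=110, species="human")
    print(genome)                   # EnsemblRelease(release=110, species='homo_sapiens')
    print(genome.install_string())  # pyensembl install --release 110 --species homo_sapiens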
=====================================
pyensembl/ensembl_release_versions.py
=====================================
@@ -11,7 +11,8 @@
# limitations under the License.
MIN_ENSEMBL_RELEASE = 54
-MAX_ENSEMBL_RELEASE = 109
+MAX_ENSEMBL_RELEASE = 110
+
def check_release_number(release):
"""
@@ -25,6 +26,7 @@ def check_release_number(release):
if release < MIN_ENSEMBL_RELEASE:
raise ValueError(
- "Invalid Ensembl releases %d, must be greater than %d" % (
- release, MIN_ENSEMBL_RELEASE))
+ "Invalid Ensembl releases %d, must be greater than %d"
+ % (release, MIN_ENSEMBL_RELEASE)
+ )
return release
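In effect, after this change (a quick sketch of the behavior):

    from pyensembl.ensembl_release_versions import (
        MAX_ENSEMBL_RELEASE,
        check_release_number,
    )

    assert MAX_ENSEMBL_RELEASE == 110   # bumped from 109 in this release
    assert check_release_number(75) == 75
    check_release_number(53)            # raises ValueError: below release 54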
=====================================
pyensembl/ensembl_url_templates.py
=====================================
@@ -47,67 +47,62 @@ def normalize_release_properties(ensembl_release, species):
# GTF annotation file example: Homo_sapiens.GRCh38.gtf.gz
GTF_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(release)d.gtf.gz"
+
def make_gtf_filename(ensembl_release, species):
"""
Return GTF filename expect on Ensembl FTP server for a specific
species/release combination
"""
ensembl_release, species, reference_name = normalize_release_properties(
- ensembl_release, species)
+ ensembl_release, species
+ )
return GTF_FILENAME_TEMPLATE % {
"Species": species.capitalize(),
"reference": reference_name,
"release": ensembl_release,
}
-def make_gtf_url(ensembl_release,
- species,
- server=ENSEMBL_FTP_SERVER):
+
+def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER):
"""
Returns a URL and a filename, which can be joined together.
"""
- ensembl_release, species, _ = \
- normalize_release_properties(ensembl_release, species)
- subdir = GTF_SUBDIR_TEMPLATE % {
- "release": ensembl_release,
- "species": species
- }
- filename = make_gtf_filename(
- ensembl_release=ensembl_release,
- species=species)
+ ensembl_release, species, _ = normalize_release_properties(ensembl_release, species)
+ subdir = GTF_SUBDIR_TEMPLATE % {"release": ensembl_release, "species": species}
+ filename = make_gtf_filename(ensembl_release=ensembl_release, species=species)
return server + subdir + filename
# cDNA & protein FASTA file for releases before (and including) Ensembl 75
# example: Homo_sapiens.NCBI36.54.cdna.all.fa.gz
-OLD_FASTA_FILENAME_TEMPLATE = \
+OLD_FASTA_FILENAME_TEMPLATE = (
"%(Species)s.%(reference)s.%(release)d.%(sequence_type)s.all.fa.gz"
+)
# ncRNA FASTA file for releases before (and including) Ensembl 75
# example: Homo_sapiens.NCBI36.54.ncrna.fa.gz
-OLD_FASTA_FILENAME_TEMPLATE_NCRNA = \
- "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz"
+OLD_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz"
# cDNA & protein FASTA file for releases after Ensembl 75
# example: Homo_sapiens.GRCh37.cdna.all.fa.gz
-NEW_FASTA_FILENAME_TEMPLATE = \
- "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz"
+NEW_FASTA_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz"
# ncRNA FASTA file for releases after Ensembl 75
# example: Homo_sapiens.GRCh37.ncrna.fa.gz
-NEW_FASTA_FILENAME_TEMPLATE_NCRNA = \
- "%(Species)s.%(reference)s.ncrna.fa.gz"
+NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz"
+
def make_fasta_filename(ensembl_release, species, sequence_type):
- ensembl_release, species, reference_name = \
- normalize_release_properties(ensembl_release, species)
+ ensembl_release, species, reference_name = normalize_release_properties(
+ ensembl_release, species
+ )
if ensembl_release <= 75:
- if sequence_type == 'ncrna':
+ if sequence_type == "ncrna":
return OLD_FASTA_FILENAME_TEMPLATE_NCRNA % {
"Species": species.capitalize(),
"reference": reference_name,
- "release": ensembl_release
+ "release": ensembl_release,
}
else:
return OLD_FASTA_FILENAME_TEMPLATE % {
@@ -117,10 +112,10 @@ def make_fasta_filename(ensembl_release, species, sequence_type):
"sequence_type": sequence_type,
}
else:
- if sequence_type == 'ncrna':
+ if sequence_type == "ncrna":
return NEW_FASTA_FILENAME_TEMPLATE_NCRNA % {
"Species": species.capitalize(),
- "reference": reference_name
+ "reference": reference_name,
}
else:
return NEW_FASTA_FILENAME_TEMPLATE % {
@@ -129,11 +124,8 @@ def make_fasta_filename(ensembl_release, species, sequence_type):
"sequence_type": sequence_type,
}
-def make_fasta_url(
- ensembl_release,
- species,
- sequence_type,
- server=ENSEMBL_FTP_SERVER):
+
+def make_fasta_url(ensembl_release, species, sequence_type, server=ENSEMBL_FTP_SERVER):
"""Construct URL to FASTA file with cDNA transcript or protein sequences
Parameter examples:
@@ -142,14 +134,14 @@ def make_fasta_url(
sequence_type = "cdna" (other option: "pep")
"""
ensembl_release, species, reference_name = normalize_release_properties(
- ensembl_release, species)
+ ensembl_release, species
+ )
subdir = FASTA_SUBDIR_TEMPLATE % {
"release": ensembl_release,
"species": species,
- "type": sequence_type
+ "type": sequence_type,
}
filename = make_fasta_filename(
- ensembl_release=ensembl_release,
- species=species,
- sequence_type=sequence_type)
+ ensembl_release=ensembl_release, species=species, sequence_type=sequence_type
+ )
return server + subdir + filename
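The net effect of these templates, as a sketch (species is a latin-name string; release 110 matches the new maximum):

    from pyensembl.ensembl_url_templates import make_fasta_url, make_gtf_url

    # e.g. .../release-110/gtf/homo_sapiens/Homo_sapiens.GRCh38.110.gtf.gz
    gtf_url = make_gtf_url(ensembl_release=110, species="homo_sapiens")

    # Post-75 releases drop the release number from FASTA filenames,
    # hence the OLD_/NEW_ template pairs above.
    cdna_url = make_fasta_url(
        ensembl_release=110, species="homo_sapiens", sequence_type="cdna"
    )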
=====================================
pyensembl/exon.py
=====================================
@@ -15,15 +15,7 @@ from .locus import Locus
class Exon(Locus):
- def __init__(
- self,
- exon_id,
- contig,
- start,
- end,
- strand,
- gene_name,
- gene_id):
+ def __init__(self, exon_id, contig, start, end, strand, gene_name, gene_id):
Locus.__init__(self, contig, start, end, strand)
self.exon_id = exon_id
self.gene_name = gene_name
@@ -44,26 +36,30 @@ class Exon(Locus):
" contig='%s',"
" start=%d,"
" end=%s,"
- " strand='%s')") % (
- self.exon_id,
- self.gene_id,
- self.gene_name,
- self.contig,
- self.start,
- self.end,
- self.strand)
+ " strand='%s')"
+ ) % (
+ self.exon_id,
+ self.gene_id,
+ self.gene_name,
+ self.contig,
+ self.start,
+ self.end,
+ self.strand,
+ )
def __eq__(self, other):
if not isinstance(other, Exon):
- raise TypeError("Cannot compare %s and %s" % (
- self.__class__.__name__,
- other.__class__.__name__))
+ raise TypeError(
+ "Cannot compare %s and %s"
+ % (self.__class__.__name__, other.__class__.__name__)
+ )
return (
- self.contig == other.contig and
- self.start == other.start and
- self.end == other.end and
- self.strand == other.strand and
- self.id == other.id)
+ self.contig == other.contig
+ and self.start == other.start
+ and self.end == other.end
+ and self.strand == other.strand
+ and self.id == other.id
+ )
def __hash__(self):
return hash(self.id)
=====================================
pyensembl/fasta.py
=====================================
@@ -32,8 +32,9 @@ def _parse_header_id(line):
which starts with '>'
"""
if type(line) is not bytes:
- raise TypeError("Expected header line to be of type %s but got %s" % (
- bytes, type(line)))
+ raise TypeError(
+ "Expected header line to be of type %s but got %s" % (bytes, type(line))
+ )
if len(line) <= 1:
raise ValueError("No identifier on FASTA line")
@@ -61,11 +62,13 @@ def _parse_header_id(line):
return identifier.decode("ascii")
+
class FastaParser(object):
"""
FastaParser object consumes lines of a FASTA file incrementally
while building up a dictionary mapping sequence identifiers to sequences.
"""
+
def __init__(self):
self.current_id = None
self.current_lines = []
@@ -75,7 +78,7 @@ class FastaParser(object):
Read the contents of a FASTA file into a dictionary
"""
fasta_dictionary = {}
- for (identifier, sequence) in self.iterate_over_file(fasta_path):
+ for identifier, sequence in self.iterate_over_file(fasta_path):
fasta_dictionary[identifier] = sequence
return fasta_dictionary
@@ -114,9 +117,9 @@ class FastaParser(object):
Open either a text file or compressed gzip file as a stream of bytes.
"""
if fasta_path.endswith("gz") or fasta_path.endswith("gzip"):
- return GzipFile(fasta_path, 'rb')
+ return GzipFile(fasta_path, "rb")
else:
- return open(fasta_path, 'rb')
+ return open(fasta_path, "rb")
def _current_entry(self):
# when we hit a new entry, if this isn't the first
@@ -140,6 +143,7 @@ class FastaParser(object):
self.current_lines = []
return previous_entry
+
def parse_fasta_dictionary(fasta_path):
"""
Given a path to a FASTA (or compressed FASTA) file, returns a dictionary
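Typical entry point for this module, as a sketch (the path is illustrative; plain and gzip-compressed FASTA both work, per the _open logic above):

    from pyensembl.fasta import parse_fasta_dictionary

    sequences = parse_fasta_dictionary("/data/Homo_sapiens.GRCh38.cdna.all.fa.gz")
    for identifier, sequence in sequences.items():
        print(identifier, len(sequence))
        break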
=====================================
pyensembl/gene.py
=====================================
@@ -17,17 +17,7 @@ from .locus_with_genome import LocusWithGenome
class Gene(LocusWithGenome):
-
- def __init__(
- self,
- gene_id,
- gene_name,
- contig,
- start,
- end,
- strand,
- biotype,
- genome):
+ def __init__(self, gene_id, gene_name, contig, start, end, strand, biotype, genome):
LocusWithGenome.__init__(
self,
contig=contig,
@@ -35,7 +25,8 @@ class Gene(LocusWithGenome):
end=end,
strand=strand,
biotype=biotype,
- genome=genome)
+ genome=genome,
+ )
self.gene_id = gene_id
self.gene_name = gene_name
@@ -60,7 +51,8 @@ class Gene(LocusWithGenome):
" biotype='%s',"
" contig='%s',"
" start=%d,"
- " end=%d, strand='%s', genome='%s')") % (
+ " end=%d, strand='%s', genome='%s')"
+ ) % (
self.gene_id,
self.gene_name,
self.biotype,
@@ -68,13 +60,15 @@ class Gene(LocusWithGenome):
self.start,
self.end,
self.strand,
- self.genome.reference_name)
+ self.genome.reference_name,
+ )
def __eq__(self, other):
return (
- other.__class__ is Gene and
- self.id == other.id and
- self.genome == other.genome)
+ other.__class__ is Gene
+ and self.id == other.id
+ and self.genome == other.genome
+ )
def __hash__(self):
return hash(self.id)
@@ -92,19 +86,19 @@ class Gene(LocusWithGenome):
transcript IDs associated with this gene.
"""
transcript_id_results = self.db.query(
- select_column_names=['transcript_id'],
- filter_column='gene_id',
+ select_column_names=["transcript_id"],
+ filter_column="gene_id",
filter_value=self.id,
- feature='transcript',
+ feature="transcript",
distinct=False,
- required=False)
+ required=False,
+ )
# We're doing a SQL query for each transcript ID to fetch
# its particular information, might be more efficient if we
# just get all the columns here, but how do we keep that modular?
return [
- self.genome.transcript_by_id(result[0])
- for result in transcript_id_results
+ self.genome.transcript_by_id(result[0]) for result in transcript_id_results
]
@memoized_property
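As the comment in the hunk notes, Gene.transcripts issues one lookup per transcript ID. A usage sketch (assumes release 110 data has been installed; TP53 is just an example gene):

    from pyensembl import EnsemblRelease

    genome = EnsemblRelease.cached(release=110, species="human")
    gene = genome.genes_by_name("TP53")[0]
    for transcript in gene.transcripts:   # one SQL query per transcript ID
        print(transcript.id, transcript.biotype)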
=====================================
pyensembl/genome.py
=====================================
@@ -35,17 +35,19 @@ class Genome(Serializable):
a particular genomic database source (e.g. a single Ensembl release) and
provides a wide variety of helper methods for accessing this data.
"""
+
def __init__(
- self,
- reference_name,
- annotation_name,
- annotation_version=None,
- gtf_path_or_url=None,
- transcript_fasta_paths_or_urls=None,
- protein_fasta_paths_or_urls=None,
- decompress_on_download=False,
- copy_local_files_to_cache=False,
- cache_directory_path=None):
+ self,
+ reference_name,
+ annotation_name,
+ annotation_version=None,
+ gtf_path_or_url=None,
+ transcript_fasta_paths_or_urls=None,
+ protein_fasta_paths_or_urls=None,
+ decompress_on_download=False,
+ copy_local_files_to_cache=False,
+ cache_directory_path=None,
+ ):
"""
Parameters
----------
@@ -106,7 +108,8 @@ class Genome(Serializable):
decompress_on_download=self.decompress_on_download,
copy_local_files_to_cache=self.copy_local_files_to_cache,
install_string_function=self.install_string,
- cache_directory_path=cache_directory_path)
+ cache_directory_path=cache_directory_path,
+ )
self._init_lazy_fields()
@property
@@ -116,15 +119,15 @@ class Genome(Serializable):
@property
def requires_transcript_fasta(self):
return (
- self._transcript_fasta_paths_or_urls is not None and
- len(self._transcript_fasta_paths_or_urls) > 0
+ self._transcript_fasta_paths_or_urls is not None
+ and len(self._transcript_fasta_paths_or_urls) > 0
)
@property
def requires_protein_fasta(self):
return (
- self._protein_fasta_paths_or_urls is not None and
- len(self._protein_fasta_paths_or_urls) > 0
+ self._protein_fasta_paths_or_urls is not None
+ and len(self._protein_fasta_paths_or_urls) > 0
)
def to_dict(self):
@@ -140,7 +143,8 @@ class Genome(Serializable):
protein_fasta_paths_or_urls=self._protein_fasta_paths_or_urls,
decompress_on_download=self.decompress_on_download,
copy_local_files_to_cache=self.copy_local_files_to_cache,
- cache_directory_path=self.cache_directory_path)
+ cache_directory_path=self.cache_directory_path,
+ )
def _init_lazy_fields(self):
"""
@@ -159,11 +163,8 @@ class Genome(Serializable):
self._exons = {}
def _get_cached_path(
- self,
- field_name,
- path_or_url,
- download_if_missing=False,
- overwrite=False):
+ self, field_name, path_or_url, download_if_missing=False, overwrite=False
+ ):
"""
Get the local path for a possibly remote file, invoking either
a download or install error message if it's missing.
@@ -176,19 +177,18 @@ class Genome(Serializable):
field_name=field_name,
path_or_url=path_or_url,
download_if_missing=download_if_missing,
- overwrite=overwrite)
+ overwrite=overwrite,
+ )
def _get_gtf_path(self, download_if_missing=False, overwrite=False):
return self._get_cached_path(
field_name="gtf",
path_or_url=self._gtf_path_or_url,
download_if_missing=download_if_missing,
- overwrite=overwrite)
+ overwrite=overwrite,
+ )
- def _get_transcript_fasta_paths(
- self,
- download_if_missing=False,
- overwrite=False):
+ def _get_transcript_fasta_paths(self, download_if_missing=False, overwrite=False):
if not self.requires_transcript_fasta:
raise ValueError("No transcript FASTA source for %s" % self)
return [
@@ -196,13 +196,12 @@ class Genome(Serializable):
field_name="transcript-fasta",
path_or_url=path,
download_if_missing=download_if_missing,
- overwrite=overwrite)
- for path in self._transcript_fasta_paths_or_urls]
+ overwrite=overwrite,
+ )
+ for path in self._transcript_fasta_paths_or_urls
+ ]
- def _get_protein_fasta_paths(
- self,
- download_if_missing=False,
- overwrite=False):
+ def _get_protein_fasta_paths(self, download_if_missing=False, overwrite=False):
# get the path for peptide FASTA files containing
# this genome's protein sequences
if not self.requires_protein_fasta:
@@ -212,35 +211,43 @@ class Genome(Serializable):
field_name="protein-fasta",
path_or_url=path,
download_if_missing=download_if_missing,
- overwrite=overwrite)
- for path in self._protein_fasta_paths_or_urls]
+ overwrite=overwrite,
+ )
+ for path in self._protein_fasta_paths_or_urls
+ ]
def _set_local_paths(self, download_if_missing=False, overwrite=False):
if self.requires_gtf:
self.gtf_path = self._get_gtf_path(
- download_if_missing=download_if_missing,
- overwrite=overwrite)
+ download_if_missing=download_if_missing, overwrite=overwrite
+ )
if self.requires_transcript_fasta:
self.transcript_fasta_paths = self._get_transcript_fasta_paths(
- download_if_missing=download_if_missing,
- overwrite=overwrite)
+ download_if_missing=download_if_missing, overwrite=overwrite
+ )
if self.requires_protein_fasta:
self.protein_fasta_paths = self._get_protein_fasta_paths(
- download_if_missing=download_if_missing,
- overwrite=overwrite)
+ download_if_missing=download_if_missing, overwrite=overwrite
+ )
def required_local_files(self):
paths = []
if self._gtf_path_or_url:
paths.append(self.download_cache.cached_path(self._gtf_path_or_url))
if self._transcript_fasta_paths_or_urls:
- paths.extend([
- self.download_cache.cached_path(path_or_url)
- for path_or_url in self._transcript_fasta_paths_or_urls])
+ paths.extend(
+ [
+ self.download_cache.cached_path(path_or_url)
+ for path_or_url in self._transcript_fasta_paths_or_urls
+ ]
+ )
if self._protein_fasta_paths_or_urls:
- paths.extend([
- self.download_cache.cached_path(path_or_url)
- for path_or_url in self._protein_fasta_paths_or_urls])
+ paths.extend(
+ [
+ self.download_cache.cached_path(path_or_url)
+ for path_or_url in self._protein_fasta_paths_or_urls
+ ]
+ )
return paths
def required_local_files_exist(self, empty_files_ok=False):
@@ -285,8 +292,7 @@ class Genome(Serializable):
# and populate self.gtf_path
self._set_local_paths(download_if_missing=False, overwrite=False)
if self.gtf_path is None:
- raise ValueError(
- "Property 'gtf_path' of %s cannot be None" % self)
+ raise ValueError("Property 'gtf_path' of %s cannot be None" % self)
# Database object turns the GTF dataframes into sqlite3 tables
# and wraps them with methods like `query_one`
@@ -315,7 +321,8 @@ class Genome(Serializable):
"exon_version",
"ccds_id",
"protein_id",
- "protein_version"},
+ "protein_version",
+ },
# excluding 'UTR' and 'Selenocysteine'
restrict_gtf_features={
"gene",
@@ -324,43 +331,44 @@ class Genome(Serializable):
"CDS",
"start_codon",
"stop_codon",
-
- })
+ },
+ )
return self._db
@property
def protein_sequences(self):
if self._protein_sequences is None:
if not self.requires_protein_fasta:
- raise ValueError(
- "Missing protein FASTA source for %s" % self)
+ raise ValueError("Missing protein FASTA source for %s" % self)
# make sure protein FASTA file exists locally
# and populate self.protein_fasta_paths
self._set_local_paths(download_if_missing=False, overwrite=False)
if self.protein_fasta_paths is None:
raise ValueError(
- "Property 'protein_fasta_paths' of %s cannot be None" % self)
+ "Property 'protein_fasta_paths' of %s cannot be None" % self
+ )
self._protein_sequences = SequenceData(
fasta_paths=self.protein_fasta_paths,
- cache_directory_path=self.cache_directory_path)
+ cache_directory_path=self.cache_directory_path,
+ )
return self._protein_sequences
@property
def transcript_sequences(self):
if self._transcript_sequences is None:
if not self.requires_transcript_fasta:
- raise ValueError(
- "Missing transcript FASTA source for %s" % self)
+ raise ValueError("Missing transcript FASTA source for %s" % self)
# make sure transcript FASTA file exists locally
# and populate self.transcript_fasta_paths
self._set_local_paths(download_if_missing=False, overwrite=False)
if self.transcript_fasta_paths is None:
raise ValueError(
- "Property 'transcript_fasta_paths' of %s cannot be None" % (
- self,))
+ "Property 'transcript_fasta_paths' of %s cannot be None" % (self,)
+ )
self._transcript_sequences = SequenceData(
fasta_paths=self.transcript_fasta_paths,
- cache_directory_path=self.cache_directory_path)
+ cache_directory_path=self.cache_directory_path,
+ )
return self._transcript_sequences
def install_string(self):
@@ -369,45 +377,55 @@ class Genome(Serializable):
in an error message.
"""
args = [
- "--reference-name", self.reference_name,
- "--annotation-name", self.annotation_name]
+ "--reference-name",
+ self.reference_name,
+ "--annotation-name",
+ self.annotation_name,
+ ]
if self.annotation_version:
args.extend(["--annotation-version", str(self.annotation_version)])
if self.requires_gtf:
args.append("--gtf")
- args.append("\"%s\"" % self._gtf_path_or_url)
+ args.append('"%s"' % self._gtf_path_or_url)
if self.requires_protein_fasta:
args += [
- "--protein-fasta \"%s\"" %
- path for path in self._protein_fasta_paths_or_urls]
+ '--protein-fasta "%s"' % path
+ for path in self._protein_fasta_paths_or_urls
+ ]
if self.requires_transcript_fasta:
args += [
- "--transcript-fasta \"%s\"" %
- path for path in self._transcript_fasta_paths_or_urls]
+ '--transcript-fasta "%s"' % path
+ for path in self._transcript_fasta_paths_or_urls
+ ]
return "pyensembl install %s" % " ".join(args)
def __str__(self):
transcript_fasta_paths_or_urls = (
- ','.join(self._transcript_fasta_paths_or_urls)
+ ",".join(self._transcript_fasta_paths_or_urls)
if self._transcript_fasta_paths_or_urls is not None
else None
)
protein_fasta_paths_or_urls = (
- ','.join(self._protein_fasta_paths_or_urls)
- if self._protein_fasta_paths_or_urls is not None else None
+ ",".join(self._protein_fasta_paths_or_urls)
+ if self._protein_fasta_paths_or_urls is not None
+ else None
+ )
+ return (
+ "Genome(reference_name=%s, "
+ "annotation_name=%s, "
+ "annotation_version=%s, "
+ "gtf_path_or_url=%s, "
+ "transcript_fasta_paths_or_urls=%s, "
+ "protein_fasta_paths_or_urls=%s)"
+ % (
+ self.reference_name,
+ self.annotation_name,
+ self.annotation_version,
+ self._gtf_path_or_url,
+ transcript_fasta_paths_or_urls,
+ protein_fasta_paths_or_urls,
+ )
)
- return ("Genome(reference_name=%s, "
- "annotation_name=%s, "
- "annotation_version=%s, "
- "gtf_path_or_url=%s, "
- "transcript_fasta_paths_or_urls=%s, "
- "protein_fasta_paths_or_urls=%s)" % (
- self.reference_name,
- self.annotation_name,
- self.annotation_version,
- self._gtf_path_or_url,
- transcript_fasta_paths_or_urls,
- protein_fasta_paths_or_urls))
def __repr__(self):
return str(self)
@@ -423,10 +441,7 @@ class Genome(Serializable):
)
def __eq__(self, other):
- return (
- other.__class__ is Genome and
- self._fields() == other._fields()
- )
+ return other.__class__ is Genome and self._fields() == other._fields()
def __hash__(self):
return hash(self._fields())
@@ -451,12 +466,8 @@ class Genome(Serializable):
remove(db_path)
def _all_feature_values(
- self,
- column,
- feature,
- distinct=True,
- contig=None,
- strand=None):
+ self, column, feature, distinct=True, contig=None, strand=None
+ ):
"""
Cached lookup of all values for a particular feature property from
the database, caches repeated queries in memory and
@@ -487,15 +498,15 @@ class Genome(Serializable):
feature=feature,
distinct=distinct,
contig=contig,
- strand=strand)
+ strand=strand,
+ )
def transcript_sequence(self, transcript_id):
"""Return cDNA nucleotide sequence of transcript, or None if
transcript doesn't have cDNA sequence.
"""
if self.transcript_sequences is None:
- raise ValueError(
- "No transcript FASTA supplied to this Genome: %s" % self)
+ raise ValueError("No transcript FASTA supplied to this Genome: %s" % self)
return self.transcript_sequences.get(transcript_id)
def protein_sequence(self, protein_id):
@@ -503,30 +514,24 @@ class Genome(Serializable):
transcript doesn't have cDNA sequence.
"""
if self.protein_sequences is None:
- raise ValueError(
- "No protein FASTA supplied to this Genome: %s" % self)
+ raise ValueError("No protein FASTA supplied to this Genome: %s" % self)
return self.protein_sequences.get(protein_id)
def genes_at_locus(self, contig, position, end=None, strand=None):
- gene_ids = self.gene_ids_at_locus(
- contig, position, end=end, strand=strand)
+ gene_ids = self.gene_ids_at_locus(contig, position, end=end, strand=strand)
return [self.gene_by_id(gene_id) for gene_id in gene_ids]
def transcripts_at_locus(self, contig, position, end=None, strand=None):
transcript_ids = self.transcript_ids_at_locus(
- contig, position, end=end, strand=strand)
+ contig, position, end=end, strand=strand
+ )
return [
- self.transcript_by_id(transcript_id)
- for transcript_id in transcript_ids
+ self.transcript_by_id(transcript_id) for transcript_id in transcript_ids
]
def exons_at_locus(self, contig, position, end=None, strand=None):
- exon_ids = self.exon_ids_at_locus(
- contig, position, end=end, strand=strand)
- return [
- self.exon_by_id(exon_id)
- for exon_id in exon_ids
- ]
+ exon_ids = self.exon_ids_at_locus(contig, position, end=end, strand=strand)
+ return [self.exon_by_id(exon_id) for exon_id in exon_ids]
def gene_ids_at_locus(self, contig, position, end=None, strand=None):
return self.db.distinct_column_values_at_locus(
@@ -535,7 +540,8 @@ class Genome(Serializable):
contig=contig,
position=position,
end=end,
- strand=strand)
+ strand=strand,
+ )
def gene_names_at_locus(self, contig, position, end=None, strand=None):
return self.db.distinct_column_values_at_locus(
@@ -544,7 +550,8 @@ class Genome(Serializable):
contig=contig,
position=position,
end=end,
- strand=strand)
+ strand=strand,
+ )
def exon_ids_at_locus(self, contig, position, end=None, strand=None):
return self.db.distinct_column_values_at_locus(
@@ -553,7 +560,8 @@ class Genome(Serializable):
contig=contig,
position=position,
end=end,
- strand=strand)
+ strand=strand,
+ )
def transcript_ids_at_locus(self, contig, position, end=None, strand=None):
return self.db.distinct_column_values_at_locus(
@@ -562,17 +570,18 @@ class Genome(Serializable):
contig=contig,
position=position,
end=end,
- strand=strand)
+ strand=strand,
+ )
- def transcript_names_at_locus(
- self, contig, position, end=None, strand=None):
+ def transcript_names_at_locus(self, contig, position, end=None, strand=None):
return self.db.distinct_column_values_at_locus(
column="transcript_name",
feature="transcript",
contig=contig,
position=position,
end=end,
- strand=strand)
+ strand=strand,
+ )
def protein_ids_at_locus(self, contig, position, end=None, strand=None):
return self.db.distinct_column_values_at_locus(
@@ -581,7 +590,8 @@ class Genome(Serializable):
contig=contig,
position=position,
end=end,
- strand=strand)
+ strand=strand,
+ )
###################################################
#
@@ -596,9 +606,8 @@ class Genome(Serializable):
Given a gene ID returns Locus with: chromosome, start, stop, strand
"""
return self.db.query_locus(
- filter_column="gene_id",
- filter_value=gene_id,
- feature="gene")
+ filter_column="gene_id", filter_value=gene_id, feature="gene"
+ )
def loci_of_gene_names(self, gene_name):
"""
@@ -613,7 +622,8 @@ class Genome(Serializable):
return self.db.query_locus(
filter_column="transcript_id",
filter_value=transcript_id,
- feature="transcript")
+ feature="transcript",
+ )
def locus_of_exon_id(self, exon_id):
"""
@@ -673,22 +683,25 @@ class Genome(Serializable):
]
# Do not look for gene_name and gene_biotype if they are
# not in the database.
- field_names.extend([
- name for name in optional_field_names
- if self.db.column_exists("gene", name)
- ])
+ field_names.extend(
+ [
+ name
+ for name in optional_field_names
+ if self.db.column_exists("gene", name)
+ ]
+ )
result = self.db.query_one(
field_names,
filter_column="gene_id",
filter_value=gene_id,
- feature="gene")
+ feature="gene",
+ )
if not result:
raise ValueError("Gene not found: %s" % (gene_id,))
gene_name, gene_biotype = None, None
if len(result) < 4 or len(result) > 6:
- raise ValueError(
- "Result is not the expected length: %d" % len(result))
+ raise ValueError("Result is not the expected length: %d" % len(result))
contig, start, end, strand = result[:4]
if len(result) == 5:
if "gene_name" in field_names:
@@ -706,7 +719,8 @@ class Genome(Serializable):
end=end,
strand=strand,
biotype=gene_biotype,
- genome=self)
+ genome=self,
+ )
return self._genes[gene_id]
@@ -740,7 +754,8 @@ class Genome(Serializable):
filter_value=property_value,
feature=feature_type,
distinct=True,
- required=True)
+ required=True,
+ )
return str(results[0][0])
def gene_names(self, contig=None, strand=None):
@@ -749,21 +764,17 @@ class Genome(Serializable):
optionally restrict to a chromosome and/or strand.
"""
return self._all_feature_values(
- column="gene_name",
- feature="gene",
- contig=contig,
- strand=strand)
+ column="gene_name", feature="gene", contig=contig, strand=strand
+ )
def gene_name_of_gene_id(self, gene_id):
return self._query_gene_name("gene_id", gene_id, "gene")
def gene_name_of_transcript_id(self, transcript_id):
- return self._query_gene_name(
- "transcript_id", transcript_id, "transcript")
+ return self._query_gene_name("transcript_id", transcript_id, "transcript")
def gene_name_of_transcript_name(self, transcript_name):
- return self._query_gene_name(
- "transcript_name", transcript_name, "transcript")
+ return self._query_gene_name("transcript_name", transcript_name, "transcript")
def gene_name_of_exon_id(self, exon_id):
return self._query_gene_name("exon_id", exon_id, "exon")
@@ -781,7 +792,8 @@ class Genome(Serializable):
filter_value=value,
feature=feature,
distinct=True,
- required=True)
+ required=True,
+ )
return [str(result[0]) for result in results if result[0]]
def gene_ids(self, contig=None, strand=None):
@@ -790,10 +802,8 @@ class Genome(Serializable):
(optionally restrict to a given chromosome/contig and/or strand)
"""
return self._all_feature_values(
- column="gene_id",
- feature="gene",
- contig=contig,
- strand=strand)
+ column="gene_id", feature="gene", contig=contig, strand=strand
+ )
def gene_ids_of_gene_name(self, gene_name):
"""
@@ -809,16 +819,17 @@ class Genome(Serializable):
"""
What is the gene ID associated with a given protein ID?
"""
- results = self._query_gene_ids(
- "protein_id",
- protein_id,
- feature="CDS")
+ results = self._query_gene_ids("protein_id", protein_id, feature="CDS")
if len(results) == 0:
raise ValueError("Protein ID not found: %s" % protein_id)
elif len(results) > 1:
raise ValueError(
- ("Should have only one gene ID for a given protein ID, "
- "but found %d: %s") % (len(results), results))
+ (
+ "Should have only one gene ID for a given protein ID, "
+ "but found %d: %s"
+ )
+ % (len(results), results)
+ )
return results[0]
###################################################
@@ -835,8 +846,7 @@ class Genome(Serializable):
"""
transcript_ids = self.transcript_ids(contig=contig, strand=strand)
return [
- self.transcript_by_id(transcript_id)
- for transcript_id in transcript_ids
+ self.transcript_by_id(transcript_id) for transcript_id in transcript_ids
]
def transcript_by_id(self, transcript_id):
@@ -855,31 +865,36 @@ class Genome(Serializable):
"gene_id",
]
# Do not look for the optional fields if they are not in the database.
- field_names.extend([
- name for name in optional_field_names
- if self.db.column_exists("transcript", name)
- ])
+ field_names.extend(
+ [
+ name
+ for name in optional_field_names
+ if self.db.column_exists("transcript", name)
+ ]
+ )
result = self.db.query_one(
select_column_names=field_names,
filter_column="transcript_id",
filter_value=transcript_id,
feature="transcript",
- distinct=True)
+ distinct=True,
+ )
if not result:
raise ValueError("Transcript not found: %s" % (transcript_id,))
transcript_name, transcript_biotype, tsl = None, None, None
if len(result) < 5 or len(result) > (5 + len(optional_field_names)):
- raise ValueError(
- "Result is not the expected length: %d" % len(result))
+ raise ValueError("Result is not the expected length: %d" % len(result))
contig, start, end, strand, gene_id = result[:5]
if len(result) > 5:
- extra_field_names = [f for f in optional_field_names if f in field_names]
+ extra_field_names = [
+ f for f in optional_field_names if f in field_names
+ ]
extra_data = dict(zip(extra_field_names, result[5:]))
transcript_name = extra_data.get("transcript_name")
transcript_biotype = extra_data.get("transcript_biotype")
tsl = extra_data.get("transcript_support_level")
- if not tsl or tsl == 'NA':
+ if not tsl or tsl == "NA":
tsl = None
else:
tsl = int(tsl)
@@ -894,15 +909,15 @@ class Genome(Serializable):
biotype=transcript_biotype,
gene_id=gene_id,
genome=self,
- support_level=tsl)
+ support_level=tsl,
+ )
return self._transcripts[transcript_id]
def transcripts_by_name(self, transcript_name):
transcript_ids = self.transcript_ids_of_transcript_name(transcript_name)
return [
- self.transcript_by_id(transcript_id)
- for transcript_id in transcript_ids
+ self.transcript_by_id(transcript_id) for transcript_id in transcript_ids
]
def transcript_by_protein_id(self, protein_id):
@@ -922,7 +937,8 @@ class Genome(Serializable):
filter_value=value,
feature="transcript",
distinct=True,
- required=True)
+ required=True,
+ )
return [result[0] for result in results]
def transcript_names(self, contig=None, strand=None):
@@ -931,24 +947,22 @@ class Genome(Serializable):
(optionally, restrict to a given chromosome and/or strand)
"""
return self._all_feature_values(
- column="transcript_name",
- feature="transcript",
- contig=contig,
- strand=strand)
+ column="transcript_name", feature="transcript", contig=contig, strand=strand
+ )
def transcript_names_of_gene_name(self, gene_name):
return self._query_transcript_names("gene_name", gene_name)
def transcript_name_of_transcript_id(self, transcript_id):
- transcript_names = self._query_transcript_names(
- "transcript_id", transcript_id)
+ transcript_names = self._query_transcript_names("transcript_id", transcript_id)
if len(transcript_names) == 0:
raise ValueError(
- "No transcript names for transcript ID = %s" % transcript_id)
+ "No transcript names for transcript ID = %s" % transcript_id
+ )
elif len(transcript_names) > 1:
raise ValueError(
- "Multiple transcript names for transcript ID = %s" % (
- transcript_id,))
+ "Multiple transcript names for transcript ID = %s" % (transcript_id,)
+ )
return transcript_names[0]
###################################################
@@ -957,26 +971,21 @@ class Genome(Serializable):
#
###################################################
- def _query_transcript_ids(
- self,
- property_name,
- value,
- feature="transcript"):
+ def _query_transcript_ids(self, property_name, value, feature="transcript"):
results = self.db.query(
select_column_names=["transcript_id"],
filter_column=property_name,
filter_value=value,
feature=feature,
distinct=True,
- required=True)
+ required=True,
+ )
return [result[0] for result in results]
def transcript_ids(self, contig=None, strand=None):
return self._all_feature_values(
- column="transcript_id",
- feature="transcript",
- contig=contig,
- strand=strand)
+ column="transcript_id", feature="transcript", contig=contig, strand=strand
+ )
def transcript_ids_of_gene_id(self, gene_id):
return self._query_transcript_ids("gene_id", gene_id)
@@ -994,16 +1003,17 @@ class Genome(Serializable):
"""
What is the transcript ID associated with a given protein ID?
"""
- results = self._query_transcript_ids(
- "protein_id",
- protein_id,
- feature="CDS")
+ results = self._query_transcript_ids("protein_id", protein_id, feature="CDS")
if len(results) == 0:
raise ValueError("Protein ID not found: %s" % protein_id)
elif len(results) > 1:
raise ValueError(
- ("Should have only one transcript ID for a given protein ID, "
- "but found %d: %s") % (len(results), results))
+ (
+ "Should have only one transcript ID for a given protein ID, "
+ "but found %d: %s"
+ )
+ % (len(results), results)
+ )
return results[0]
###################################################
@@ -1019,10 +1029,7 @@ class Genome(Serializable):
"""
# DataFrame with single column called "exon_id"
exon_ids = self.exon_ids(contig=contig, strand=strand)
- return [
- self.exon_by_id(exon_id)
- for exon_id in exon_ids
- ]
+ return [self.exon_by_id(exon_id) for exon_id in exon_ids]
def exon_by_id(self, exon_id):
"""Construct an Exon object from its ID by looking up the exon"s
@@ -1043,7 +1050,8 @@ class Genome(Serializable):
filter_column="exon_id",
filter_value=exon_id,
feature="exon",
- distinct=True)
+ distinct=True,
+ )
self._exons[exon_id] = Exon(
exon_id=exon_id,
@@ -1052,7 +1060,8 @@ class Genome(Serializable):
end=end,
strand=strand,
gene_name=gene_name,
- gene_id=gene_id)
+ gene_id=gene_id,
+ )
return self._exons[exon_id]
@@ -1069,15 +1078,14 @@ class Genome(Serializable):
filter_value=value,
feature="exon",
distinct=True,
- required=True)
+ required=True,
+ )
return [result[0] for result in results]
def exon_ids(self, contig=None, strand=None):
return self._all_feature_values(
- column="exon_id",
- feature="exon",
- contig=contig,
- strand=strand)
+ column="exon_id", feature="exon", contig=contig, strand=strand
+ )
def exon_ids_of_gene_id(self, gene_id):
return self._query_exon_ids("gene_id", gene_id)
@@ -1107,6 +1115,7 @@ class Genome(Serializable):
feature="CDS",
contig=contig,
strand=strand,
- distinct=True)
+ distinct=True,
+ )
# drop None values
return [protein_id for protein_id in protein_ids if protein_id]
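The reflowed Genome query helpers above keep their behaviour; a minimal usage
sketch against an installed release (the release number and Ensembl IDs below
are illustrative placeholders, not values from this diff):

    from pyensembl import EnsemblRelease

    # assumes `pyensembl install --release 93 --species human` was run first
    genome = EnsemblRelease(93)
    transcript = genome.transcript_by_id("ENST00000288602")   # placeholder ID
    exon = genome.exon_by_id("ENSE00001154485")               # placeholder ID
    # protein IDs map to exactly one transcript, else ValueError is raised
    tx_id = genome.transcript_id_of_protein_id("ENSP00000288602")  # placeholder ID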
=====================================
pyensembl/locus.py
=====================================
@@ -21,12 +21,7 @@ class Locus(Serializable):
on a particular strand of a chromosome/contig.
"""
- def __init__(
- self,
- contig,
- start,
- end,
- strand):
+ def __init__(self, contig, start, end, strand):
"""
contig : str
Chromosome or other sequence name in the reference assembly
@@ -54,28 +49,33 @@ class Locus(Serializable):
if end < start:
raise ValueError(
- "Expected start <= end, got start = %d, end = %d" % (
- start, end))
+ "Expected start <= end, got start = %d, end = %d" % (start, end)
+ )
self.start = start
self.end = end
def __str__(self):
return "Locus(contig='%s', start=%d, end=%d, strand='%s')" % (
- self.contig, self.start, self.end, self.strand)
+ self.contig,
+ self.start,
+ self.end,
+ self.strand,
+ )
def __len__(self):
return self.end - self.start + 1
def __eq__(self, other):
if not isinstance(other, Locus):
- raise TypeError("Cannot compare %s and %s" % (
- self.__class__.__name__,
- other.__class.__name__))
+ raise TypeError(
+ "Cannot compare %s and %s"
+                % (self.__class__.__name__, other.__class__.__name__)
+ )
return (
- self.contig == other.contig and
- self.start == other.start and
- self.end == other.end and
- self.strand == other.strand
+ self.contig == other.contig
+ and self.start == other.start
+ and self.end == other.end
+ and self.strand == other.strand
)
def to_tuple(self):
@@ -83,9 +83,10 @@ class Locus(Serializable):
def __lt__(self, other):
if not isinstance(other, Locus):
- raise TypeError("Cannot compare %s and %s" % (
- self.__class__.__name__,
- other.__class.__name__))
+ raise TypeError(
+ "Cannot compare %s and %s"
+                % (self.__class__.__name__, other.__class__.__name__)
+ )
return self.to_tuple() < other.to_tuple()
def __le__(self, other):
@@ -93,9 +94,10 @@ class Locus(Serializable):
def __gt__(self, other):
if not isinstance(other, Locus):
- raise TypeError("Cannot compare %s and %s" % (
- self.__class__.__name__,
- other.__class.__name__))
+ raise TypeError(
+ "Cannot compare %s and %s"
+                % (self.__class__.__name__, other.__class__.__name__)
+ )
return self.to_tuple() > other.to_tuple()
def __ge__(self, other):
@@ -106,7 +108,7 @@ class Locus(Serializable):
"contig": self.contig,
"start": self.start,
"end": self.end,
- "strand": self.strand
+ "strand": self.strand,
}
@property
@@ -122,8 +124,9 @@ class Locus(Serializable):
"""
if position > self.end or position < self.start:
raise ValueError(
- "Position %d outside valid range %d..%d of %s" % (
- position, self.start, self.end, self))
+ "Position %d outside valid range %d..%d of %s"
+ % (position, self.start, self.end, self)
+ )
elif self.on_forward_strand:
return position - self.start
else:
@@ -141,12 +144,12 @@ class Locus(Serializable):
"""
if start > end:
raise ValueError(
- "Locus should always have start <= end, got start=%d, end=%d" % (
- start, end))
+ "Locus should always have start <= end, got start=%d, end=%d"
+ % (start, end)
+ )
if start < self.start or end > self.end:
- raise ValueError("Range (%d, %d) falls outside %s" % (
- start, end, self))
+ raise ValueError("Range (%d, %d) falls outside %s" % (start, end, self))
if self.on_forward_strand:
return (start - self.start, end - self.start)
@@ -180,8 +183,7 @@ class Locus(Serializable):
"""
Is this locus on the same contig and (optionally) on the same strand?
"""
- return (self.on_contig(contig) and
- (strand is None or self.on_strand(strand)))
+ return self.on_contig(contig) and (strand is None or self.on_strand(strand))
def distance_to_interval(self, start, end):
"""
@@ -212,25 +214,21 @@ class Locus(Serializable):
that e.g. chr1:10-10 overlaps with chr1:10-10
"""
return (
- self.can_overlap(contig, strand) and
- self.distance_to_interval(start, end) == 0)
+ self.can_overlap(contig, strand)
+ and self.distance_to_interval(start, end) == 0
+ )
def overlaps_locus(self, other_locus):
return self.overlaps(
- other_locus.contig,
- other_locus.start,
- other_locus.end,
- other_locus.strand)
+ other_locus.contig, other_locus.start, other_locus.end, other_locus.strand
+ )
def contains(self, contig, start, end, strand=None):
return (
- self.can_overlap(contig, strand) and
- start >= self.start and
- end <= self.end)
+ self.can_overlap(contig, strand) and start >= self.start and end <= self.end
+ )
def contains_locus(self, other_locus):
return self.contains(
- other_locus.contig,
- other_locus.start,
- other_locus.end,
- other_locus.strand)
+ other_locus.contig, other_locus.start, other_locus.end, other_locus.strand
+ )
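Locus is a plain value object, so the restyled comparison, overlap and
containment logic above can be exercised directly; a small sketch with
invented coordinates:

    from pyensembl.locus import Locus

    locus = Locus(contig="1", start=1000, end=2000, strand="+")
    print(len(locus))                        # 1001: coordinates are inclusive
    print(locus.offset(1500))                # 500: offset from start on the forward strand
    print(locus.overlaps("1", 1900, 2100))   # True: distance_to_interval == 0
    print(locus.contains("1", 1200, 1300))   # True: interval falls inside the locus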
=====================================
pyensembl/locus_with_genome.py
=====================================
@@ -19,6 +19,7 @@ class LocusWithGenome(Locus):
Common base class for Gene and Transcript to avoid copying
their shared logic.
"""
+
def __init__(self, contig, start, end, strand, biotype, genome):
Locus.__init__(self, contig, start, end, strand)
self.genome = genome
@@ -32,7 +33,8 @@ class LocusWithGenome(Locus):
end=self.end,
strand=self.strand,
biotype=self.biotype,
- genome=self.genome)
+ genome=self.genome,
+ )
@property
def is_protein_coding(self):
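LocusWithGenome only exists so Gene and Transcript share this logic; the
is_protein_coding property, for instance, is available on both (the gene ID
below is an illustrative placeholder):

    from pyensembl import EnsemblRelease

    genome = EnsemblRelease(93)
    gene = genome.gene_by_id("ENSG00000157764")  # placeholder ID
    print(gene.is_protein_coding)  # True iff biotype == "protein_coding"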
=====================================
pyensembl/normalization.py
=====================================
@@ -53,7 +53,7 @@ def normalize_chromosome(c):
def normalize_strand(strand):
- if strand == "+" or strand == 1 or strand == "+1" or strand == "1" :
+ if strand == "+" or strand == 1 or strand == "+1" or strand == "1":
return "+"
elif strand == "-" or strand == -1 or strand == "-1":
return "-"
=====================================
pyensembl/reference_name.py
=====================================
@@ -42,9 +42,7 @@ def max_ensembl_release(reference_name):
return max_release
-def genome_for_reference_name(
- reference_name,
- allow_older_downloaded_release=True):
+def genome_for_reference_name(reference_name, allow_older_downloaded_release=True):
"""
Given a genome reference name, such as "GRCh38", returns the
corresponding Ensembl Release object.
@@ -57,8 +55,9 @@ def genome_for_reference_name(
"""
reference_name = normalize_reference_name(reference_name)
species = find_species_by_reference(reference_name)
- (min_ensembl_release, max_ensembl_release) = \
- species.reference_assemblies[reference_name]
+ (min_ensembl_release, max_ensembl_release) = species.reference_assemblies[
+ reference_name
+ ]
if allow_older_downloaded_release:
# go through candidate releases in descending order
for release in reversed(range(min_ensembl_release, max_ensembl_release + 1)):
@@ -70,6 +69,7 @@ def genome_for_reference_name(
# available
return EnsemblRelease.cached(release=max_ensembl_release, species=species)
+
ensembl_grch36 = genome_for_reference_name("ncbi36")
ensembl_grch37 = genome_for_reference_name("grch37")
ensembl_grch38 = genome_for_reference_name("grch38")
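As the loop above shows, genome_for_reference_name prefers an
already-downloaded release for the assembly before falling back to the newest
supported one; a usage sketch:

    from pyensembl.reference_name import genome_for_reference_name

    genome = genome_for_reference_name("GRCh38")  # names are normalized, case-insensitive
    # always take the newest release for the assembly instead:
    genome = genome_for_reference_name("GRCh38", allow_older_downloaded_release=False)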
=====================================
pyensembl/search.py
=====================================
@@ -14,6 +14,7 @@
Helper functions for searching over collections of PyEnsembl objects
"""
+
def find_nearest_locus(start, end, loci):
"""
Finds nearest locus (object with method `distance_to_interval`) to the
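A minimal sketch of calling find_nearest_locus over plain Locus objects
(assuming, per the current implementation, that it returns a
(distance, locus) pair):

    from pyensembl.locus import Locus
    from pyensembl.search import find_nearest_locus

    loci = [Locus("1", 100, 200, "+"), Locus("1", 500, 600, "+")]
    distance, nearest = find_nearest_locus(start=220, end=230, loci=loci)
    print(distance)  # 20, the gap to the first locus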
=====================================
pyensembl/sequence_data.py
=====================================
@@ -14,8 +14,8 @@ from os import remove
from os.path import exists, abspath, split, join
import logging
from collections import Counter
-import pickle
-from .common import (load_pickle, dump_pickle)
+import pickle
+from .common import load_pickle, dump_pickle
from .fasta import parse_fasta_dictionary
@@ -26,11 +26,8 @@ class SequenceData(object):
"""
    Container for reference nucleotide and amino acid sequences.
"""
- def __init__(
- self,
- fasta_paths,
- cache_directory_path=None):
+ def __init__(self, fasta_paths, cache_directory_path=None):
if type(fasta_paths) is str:
fasta_paths = [fasta_paths]
@@ -45,10 +42,14 @@ class SequenceData(object):
if not exists(path):
raise ValueError("Couldn't find FASTA file %s" % (path,))
self.fasta_dictionary_filenames = [
- filename + ".pickle" for filename in self.fasta_filenames]
+ filename + ".pickle" for filename in self.fasta_filenames
+ ]
self.fasta_dictionary_pickle_paths = [
- join(cache_path, filename) for cache_path, filename in
- zip(self.cache_directory_paths, self.fasta_dictionary_filenames)]
+ join(cache_path, filename)
+ for cache_path, filename in zip(
+ self.cache_directory_paths, self.fasta_dictionary_filenames
+ )
+ ]
self._init_lazy_fields()
def _init_lazy_fields(self):
@@ -75,9 +76,9 @@ class SequenceData(object):
def __eq__(self, other):
# test to see if self.fasta_paths and other.fasta_paths contain
# the same list of paths, regardless of order
- return (
- (other.__class__ is SequenceData) and
- Counter(self.fasta_paths) == Counter(other.fasta_paths))
+ return (other.__class__ is SequenceData) and Counter(
+ self.fasta_paths
+ ) == Counter(other.fasta_paths)
def __hash__(self):
return hash(self.fasta_paths)
@@ -86,22 +87,24 @@ class SequenceData(object):
for identifier, sequence in fasta_dictionary_tmp.items():
if identifier in self._fasta_dictionary:
logger.warn(
- "Sequence identifier %s is duplicated in your FASTA files!" % identifier)
+ "Sequence identifier %s is duplicated in your FASTA files!"
+ % identifier
+ )
continue
self._fasta_dictionary[identifier] = sequence
def _load_or_create_fasta_dictionary_pickle(self):
self._fasta_dictionary = dict()
- for fasta_path, pickle_path in zip(self.fasta_paths, self.fasta_dictionary_pickle_paths):
+ for fasta_path, pickle_path in zip(
+ self.fasta_paths, self.fasta_dictionary_pickle_paths
+ ):
if exists(pickle_path):
# try loading the cached file
# but we'll fall back on recreating it if loading fails
try:
- fasta_dictionary_tmp = load_pickle(
- pickle_path)
+ fasta_dictionary_tmp = load_pickle(pickle_path)
self._add_to_fasta_dictionary(fasta_dictionary_tmp)
- logger.info(
- "Loaded sequence dictionary from %s", pickle_path)
+ logger.info("Loaded sequence dictionary from %s", pickle_path)
continue
except (pickle.UnpicklingError, AttributeError):
# catch either an UnpicklingError or an AttributeError
@@ -109,7 +112,8 @@ class SequenceData(object):
# that no longer exists
logger.warn(
"Failed to load %s, attempting to read FASTA directly",
- pickle_path)
+ pickle_path,
+ )
logger.info("Parsing sequences from FASTA file at %s", fasta_path)
fasta_dictionary_tmp = parse_fasta_dictionary(fasta_path)
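SequenceData parses each FASTA once into a pickled dictionary alongside the
cache directory and reloads that pickle on later runs, falling back to
re-parsing when unpickling fails; a sketch of direct use (paths and the
identifier are placeholders):

    from pyensembl.sequence_data import SequenceData

    seqs = SequenceData(
        fasta_paths=["/tmp/example.fa"],               # placeholder path
        cache_directory_path="/tmp/pyensembl_cache",   # placeholder path
    )
    print(seqs.get("some_sequence_id"))  # placeholder identifier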
=====================================
pyensembl/shell.py
=====================================
@@ -56,7 +56,8 @@ parser.add_argument(
"--overwrite",
default=False,
action="store_true",
- help="Force download and indexing even if files already exist locally")
+ help="Force download and indexing even if files already exist locally",
+)
root_group = parser.add_mutually_exclusive_group()
@@ -67,18 +68,21 @@ release_group.add_argument(
type=int,
nargs="+",
default=[],
- help="Ensembl release version(s) (default=%d)" % MAX_ENSEMBL_RELEASE)
+ help="Ensembl release version(s) (default=%d)" % MAX_ENSEMBL_RELEASE,
+)
release_group.add_argument(
"--species",
default=[],
nargs="+",
- help="Which species to download Ensembl data for (default=human)")
+ help="Which species to download Ensembl data for (default=human)",
+)
release_group.add_argument(
"--custom-mirror",
default=None,
- help="URL and directory to use instead of the default Ensembl FTP server")
+ help="URL and directory to use instead of the default Ensembl FTP server",
+)
path_group = root_group.add_argument_group()
@@ -86,44 +90,47 @@ path_group.add_argument(
"--reference-name",
type=str,
default=None,
- help="Name of the reference, e.g. GRCh38")
+ help="Name of the reference, e.g. GRCh38",
+)
path_group.add_argument(
- "--annotation-name",
- default=None,
- help="Name of annotation source (e.g. refseq)")
+ "--annotation-name", default=None, help="Name of annotation source (e.g. refseq)"
+)
path_group.add_argument(
- "--annotation-version",
- default=None,
- help="Version of annotation database")
+ "--annotation-version", default=None, help="Version of annotation database"
+)
path_group.add_argument(
"--gtf",
type=str,
default=None,
- help="URL or local path to a GTF file containing annotations.")
+ help="URL or local path to a GTF file containing annotations.",
+)
path_group.add_argument(
"--transcript-fasta",
type=str,
- action='append',
+ action="append",
default=[],
help="URL or local path to a FASTA files containing the transcript "
"data. This option can be specified multiple times for multiple "
- "FASTA files.")
+ "FASTA files.",
+)
path_group.add_argument(
"--protein-fasta",
type=str,
default=[],
action="append",
- help="URL or local path to a FASTA file containing protein data.")
+ help="URL or local path to a FASTA file containing protein data.",
+)
path_group.add_argument(
"--shared-prefix",
default="",
- help="Add this prefix to URLs or paths specified by --gtf, --transcript-fasta, --protein-fasta")
+ help="Add this prefix to URLs or paths specified by --gtf, --transcript-fasta, --protein-fasta",
+)
parser.add_argument(
"action",
@@ -135,16 +142,18 @@ parser.add_argument(
"list",
),
help=(
- "\"install\" will download and index any data that is not "
- "currently downloaded or indexed. \"delete-all-files\" will delete all data "
- "associated with a genome annotation. \"delete-index-files\" deletes "
+ '"install" will download and index any data that is not '
+ 'currently downloaded or indexed. "delete-all-files" will delete all data '
+ 'associated with a genome annotation. "delete-index-files" deletes '
"all files other than the original GTF and FASTA files for a genome. "
- "\"list\" will show you all installed Ensembl genomes."))
+ '"list" will show you all installed Ensembl genomes.'
+ ),
+)
def collect_all_installed_ensembl_releases():
genomes = []
- for (species, release) in Species.all_species_release_pairs():
+ for species, release in Species.all_species_release_pairs():
genome = EnsemblRelease(release, species=species)
if genome.required_local_files_exist():
genomes.append(genome)
@@ -173,20 +182,19 @@ def all_combinations_of_ensembl_genomes(args):
# URL to be a directory with all the same filenames as
# would be provided by Ensembl
gtf_url = os.path.join(
- args.custom_mirror,
- os.path.basename(ensembl_release.gtf_url))
+ args.custom_mirror, os.path.basename(ensembl_release.gtf_url)
+ )
transcript_fasta_urls = [
os.path.join(
- args.custom_mirror,
- os.path.basename(transcript_fasta_url))
+ args.custom_mirror, os.path.basename(transcript_fasta_url)
+ )
for transcript_fasta_url in ensembl_release.transcript_fasta_urls
]
protein_fasta_urls = [
os.path.join(
- args.custom_mirror,
- os.path.basename(protein_fasta_url))
- for protein_fasta_url in
- ensembl_release.protein_fasta_urls
+ args.custom_mirror, os.path.basename(protein_fasta_url)
+ )
+ for protein_fasta_url in ensembl_release.protein_fasta_urls
]
reference_name = ensembl_release.reference_name
genome = Genome(
@@ -195,33 +203,41 @@ def all_combinations_of_ensembl_genomes(args):
annotation_version=version,
gtf_path_or_url=gtf_url,
transcript_fasta_paths_or_urls=transcript_fasta_urls,
- protein_fasta_paths_or_urls=protein_fasta_urls)
+ protein_fasta_paths_or_urls=protein_fasta_urls,
+ )
genomes.append(genome)
return genomes
+
def collect_selected_genomes(args):
# If specific genome source URLs are provided, use those
if args.gtf or args.transcript_fasta or args.protein_fasta:
if args.release:
raise ValueError(
"An Ensembl release cannot be specified if "
- "specific paths are also given")
+ "specific paths are also given"
+ )
if not args.reference_name:
raise ValueError("Must specify a reference name")
if not args.annotation_name:
raise ValueError("Must specify the name of the annotation source")
- return [Genome(
- reference_name=args.reference_name,
- annotation_name=args.annotation_name,
- annotation_version=args.annotation_version,
- gtf_path_or_url=os.path.join(args.shared_prefix, args.gtf),
- transcript_fasta_paths_or_urls=[
- os.path.join(args.shared_prefix, transcript_fasta)
- for transcript_fasta in args.transcript_fasta],
- protein_fasta_paths_or_urls=[
- os.path.join(args.shared_prefix, protein_fasta)
- for protein_fasta in args.protein_fasta])]
+ return [
+ Genome(
+ reference_name=args.reference_name,
+ annotation_name=args.annotation_name,
+ annotation_version=args.annotation_version,
+ gtf_path_or_url=os.path.join(args.shared_prefix, args.gtf),
+ transcript_fasta_paths_or_urls=[
+ os.path.join(args.shared_prefix, transcript_fasta)
+ for transcript_fasta in args.transcript_fasta
+ ],
+ protein_fasta_paths_or_urls=[
+ os.path.join(args.shared_prefix, protein_fasta)
+ for protein_fasta in args.protein_fasta
+ ],
+ )
+ ]
else:
return all_combinations_of_ensembl_genomes(args)
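The argument groups above correspond to the two usual ways of invoking the
pyensembl command (release number and paths are illustrative):

    # download and index an Ensembl release
    pyensembl install --release 93 --species human
    # show installed genomes
    pyensembl list
    # or install from explicit sources instead of an Ensembl release
    pyensembl install --reference-name GRCh38 --annotation-name refseq \
        --gtf /path/to/annotations.gtf --transcript-fasta /path/to/transcripts.fa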
=====================================
pyensembl/species.py
=====================================
@@ -14,6 +14,8 @@ from serializable import Serializable
from .ensembl_release_versions import MAX_ENSEMBL_RELEASE
+# TODO: replace Serializable with data class
+
class Species(Serializable):
"""
@@ -36,21 +38,26 @@ class Species(Serializable):
species = Species(
latin_name=latin_name,
synonyms=synonyms,
- reference_assemblies=reference_assemblies)
+ reference_assemblies=reference_assemblies,
+ )
cls._latin_names_to_species[species.latin_name] = species
for synonym in synonyms:
if synonym in cls._common_names_to_species:
- raise ValueError("Can't use synonym '%s' for both %s and %s" % (
- synonym,
- species,
- cls._common_names_to_species[synonym]))
+ raise ValueError(
+ "Can't use synonym '%s' for both %s and %s"
+ % (synonym, species, cls._common_names_to_species[synonym])
+ )
cls._common_names_to_species[synonym] = species
for reference_name in reference_assemblies:
if reference_name in cls._reference_names_to_species:
- raise ValueError("Can't use reference '%s' for both %s and %s" % (
- reference_name,
- species,
- cls._reference_names_to_species[reference_name]))
+ raise ValueError(
+ "Can't use reference '%s' for both %s and %s"
+ % (
+ reference_name,
+ species,
+ cls._reference_names_to_species[reference_name],
+ )
+ )
cls._reference_names_to_species[reference_name] = species
return species
@@ -89,30 +96,36 @@ class Species(Serializable):
self.synonyms = synonyms
self.reference_assemblies = reference_assemblies
self._release_to_genome = {}
- for (genome_name, (start, end)) in self.reference_assemblies.items():
+ for genome_name, (start, end) in self.reference_assemblies.items():
for i in range(start, end + 1):
if i in self._release_to_genome:
raise ValueError(
- "Ensembl release %d already has an associated genome" % i)
+ "Ensembl release %d already has an associated genome" % i
+ )
self._release_to_genome[i] = genome_name
def which_reference(self, ensembl_release):
if ensembl_release not in self._release_to_genome:
- raise ValueError("No genome for %s in Ensembl release %d" % (
- self.latin_name, ensembl_release))
+ raise ValueError(
+ "No genome for %s in Ensembl release %d"
+ % (self.latin_name, ensembl_release)
+ )
return self._release_to_genome[ensembl_release]
def __str__(self):
- return (
- "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s)" % (
- self.latin_name, self.synonyms, self.reference_assemblies))
+ return "Species(latin_name='%s', synonyms=%s, reference_assemblies=%s)" % (
+ self.latin_name,
+ self.synonyms,
+ self.reference_assemblies,
+ )
def __eq__(self, other):
return (
- other.__class__ is Species and
- self.latin_name == other.latin_name and
- self.synonyms == other.synonyms and
- self.reference_assemblies == other.reference_assemblies)
+ other.__class__ is Species
+ and self.latin_name == other.latin_name
+ and self.synonyms == other.synonyms
+ and self.reference_assemblies == other.reference_assemblies
+ )
def to_dict(self):
return {"latin_name": self.latin_name}
@@ -122,9 +135,13 @@ class Species(Serializable):
return cls._latin_names_to_species[state_dict["latin_name"]]
def __hash__(self):
- return hash((self.latin_name,
- tuple(self.synonyms),
- frozenset(self.reference_assemblies.items())))
+ return hash(
+ (
+ self.latin_name,
+ tuple(self.synonyms),
+ frozenset(self.reference_assemblies.items()),
+ )
+ )
def normalize_species_name(name):
@@ -145,7 +162,10 @@ def normalize_species_name(name):
def find_species_by_name(species_name):
latin_name = normalize_species_name(species_name)
if latin_name not in Species._latin_names_to_species:
- raise ValueError("Species not found: %s, for non-Ensembl data see https://github.com/openvax/pyensembl#non-ensembl-data" % (species_name,))
+ raise ValueError(
+ "Species not found: %s, for non-Ensembl data see https://github.com/openvax/pyensembl#non-ensembl-data"
+ % (species_name,)
+ )
return Species._latin_names_to_species[latin_name]
@@ -158,8 +178,11 @@ def check_species_object(species_name_or_object):
elif isinstance(species_name_or_object, str):
return find_species_by_name(species_name_or_object)
else:
- raise ValueError("Unexpected type for species: %s : %s" % (
- species_name_or_object, type(species_name_or_object)))
+ raise ValueError(
+ "Unexpected type for species: %s : %s"
+ % (species_name_or_object, type(species_name_or_object))
+ )
+
human = Species.register(
latin_name="homo_sapiens",
@@ -168,7 +191,8 @@ human = Species.register(
"GRCh38": (76, MAX_ENSEMBL_RELEASE),
"GRCh37": (55, 75),
"NCBI36": (54, 54),
- })
+ },
+)
mouse = Species.register(
latin_name="mus_musculus",
@@ -177,12 +201,14 @@ mouse = Species.register(
"NCBIM37": (54, 67),
"GRCm38": (68, 102),
"GRCm39": (103, MAX_ENSEMBL_RELEASE),
- })
+ },
+)
dog = Species.register(
latin_name="canis_familiaris",
synonyms=["dog"],
- reference_assemblies={"CanFam3.1": (75, MAX_ENSEMBL_RELEASE)})
+ reference_assemblies={"CanFam3.1": (75, MAX_ENSEMBL_RELEASE)},
+)
cat = Species.register(
latin_name="felis_catus",
@@ -191,15 +217,17 @@ cat = Species.register(
"Felis_catus_6.2": (75, 90),
"Felis_catus_8.0": (91, 92),
"Felis_catus_9.0": (93, MAX_ENSEMBL_RELEASE),
-
- })
+ },
+)
chicken = Species.register(
latin_name="gallus_gallus",
synonyms=["chicken"],
reference_assemblies={
"Galgal4": (75, 85),
- "Gallus_gallus-5.0": (86, MAX_ENSEMBL_RELEASE)})
+ "Gallus_gallus-5.0": (86, MAX_ENSEMBL_RELEASE),
+ },
+)
# Does the black rat (Rattus rattus) get used for research too?
brown_rat = Species.register(
@@ -208,70 +236,82 @@ brown_rat = Species.register(
reference_assemblies={
"Rnor_5.0": (75, 79),
"Rnor_6.0": (80, 104),
- "mRatBN7.2": (105, MAX_ENSEMBL_RELEASE)
- })
+ "mRatBN7.2": (105, MAX_ENSEMBL_RELEASE),
+ },
+)
macaque = Species.register(
latin_name="macaca_fascicularis",
synonyms=["macaque", "Crab-eating macaque"],
reference_assemblies={
"Macaca_fascicularis_6.0": (103, MAX_ENSEMBL_RELEASE),
- })
+ },
+)
green_monkey = Species.register(
latin_name="chlorocebus_sabaeus",
synonyms=["green_monkey", "african_green_monkey"],
reference_assemblies={
"ChlSab1.1": (86, MAX_ENSEMBL_RELEASE),
- })
+ },
+)
rhesus = Species.register(
latin_name="macaca_mulatta",
synonyms=["rhesus"],
- reference_assemblies={"Mmul_10": (75, MAX_ENSEMBL_RELEASE)})
+ reference_assemblies={"Mmul_10": (75, MAX_ENSEMBL_RELEASE)},
+)
rabbit = Species.register(
latin_name="oryctolagus_cuniculus",
synonyms=["rabbit"],
- reference_assemblies={"OryCun2.0": (75, MAX_ENSEMBL_RELEASE)})
+ reference_assemblies={"OryCun2.0": (75, MAX_ENSEMBL_RELEASE)},
+)
gerbil = Species.register(
latin_name="meriones_unguiculatus",
synonyms=["gerbil"],
- reference_assemblies={"MunDraft-v1.0": (75, MAX_ENSEMBL_RELEASE)})
+ reference_assemblies={"MunDraft-v1.0": (75, MAX_ENSEMBL_RELEASE)},
+)
syrian_hamster = Species.register(
latin_name="mesocricetus_auratus",
synonyms=["syrian_hamster"],
- reference_assemblies={"MesAur1.0": (75, MAX_ENSEMBL_RELEASE)})
+ reference_assemblies={"MesAur1.0": (75, MAX_ENSEMBL_RELEASE)},
+)
chinese_hamster = Species.register(
latin_name="cricetulus_griseus_chok1gshd",
synonyms=["chinese_hamster"],
- reference_assemblies={"CHOK1GS_HDv1": (75, MAX_ENSEMBL_RELEASE)})
+ reference_assemblies={"CHOK1GS_HDv1": (75, MAX_ENSEMBL_RELEASE)},
+)
naked_mole_rat = Species.register(
latin_name="heterocephalus_glaber_female",
synonyms=["naked_mole_rat"],
- reference_assemblies={"HetGla_female_1.0": (75, MAX_ENSEMBL_RELEASE)})
+ reference_assemblies={"HetGla_female_1.0": (75, MAX_ENSEMBL_RELEASE)},
+)
guinea_pig = Species.register(
latin_name="cavia_porcellus",
synonyms=["guinea_pig"],
- reference_assemblies={"Cavpor3.0": (75, MAX_ENSEMBL_RELEASE)})
+ reference_assemblies={"Cavpor3.0": (75, MAX_ENSEMBL_RELEASE)},
+)
pig = Species.register(
latin_name="sus_scrofa",
synonyms=["pig"],
- reference_assemblies={"Sscrofa11.1": (75, MAX_ENSEMBL_RELEASE)})
+ reference_assemblies={"Sscrofa11.1": (75, MAX_ENSEMBL_RELEASE)},
+)
fly = Species.register(
latin_name="drosophila_melanogaster",
synonyms=["drosophila", "fruit fly", "fly"],
reference_assemblies={
- "BDGP5": (75, 78),
- "BDGP6": (79, 95),
- "BDGP6.22": (96, 98),
- "BDGP6.28": (99, 102),
- "BDGP6.32": (103, MAX_ENSEMBL_RELEASE)
- })
+ "BDGP5": (75, 78),
+ "BDGP6": (79, 95),
+ "BDGP6.22": (96, 98),
+ "BDGP6.28": (99, 102),
+ "BDGP6.32": (103, MAX_ENSEMBL_RELEASE),
+ },
+)
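Species lookup goes through the synonym and Latin-name tables these register
calls populate; for example:

    from pyensembl.species import find_species_by_name, normalize_species_name

    print(normalize_species_name("Homo sapiens"))  # "homo_sapiens"
    human = find_species_by_name("human")          # resolved via the synonym table
    print(human.which_reference(93))               # "GRCh38", per the ranges registered above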
=====================================
pyensembl/version.py
=====================================
@@ -1 +1 @@
-__version__ = '2.2.8'
+__version__ = "2.2.9"
=====================================
setup.py
=====================================
@@ -14,16 +14,17 @@ from __future__ import print_function
import os
import re
+# TODO: replace setup.py with pyproject.toml
from setuptools import setup
package_name = "pyensembl"
current_directory = os.path.dirname(__file__)
-readme_filename = 'README.md'
+readme_filename = "README.md"
readme_path = os.path.join(current_directory, readme_filename)
github_url = "https://github.com/openvax/%s" % package_name
try:
- with open(readme_path, 'r') as f:
+ with open(readme_path, "r") as f:
readme_markdown = f.read()
except IOError as e:
print(e)
@@ -31,19 +32,18 @@ except IOError as e:
readme_markdown = ""
-with open('%s/version.py' % package_name, 'r') as f:
+with open("%s/version.py" % package_name, "r") as f:
version = re.search(
- r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
- f.read(),
- re.MULTILINE).group(1)
+ r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', f.read(), re.MULTILINE
+ ).group(1)
if not version:
- raise RuntimeError('Cannot find version information')
+ raise RuntimeError("Cannot find version information")
-if __name__ == '__main__':
+if __name__ == "__main__":
with open("requirements.txt") as f:
requirements = [l.strip() for l in f]
-
+
setup(
name=package_name,
version=version,
@@ -53,24 +53,22 @@ if __name__ == '__main__':
url=github_url,
license="http://www.apache.org/licenses/LICENSE-2.0.html",
entry_points={
- 'console_scripts': [
- 'pyensembl = %s.shell:run' % package_name
- ],
+ "console_scripts": ["pyensembl = %s.shell:run" % package_name],
},
classifiers=[
- 'Development Status :: 3 - Alpha',
- 'Environment :: Console',
- 'Operating System :: OS Independent',
- 'Intended Audience :: Science/Research',
- 'License :: OSI Approved :: Apache Software License',
- 'Programming Language :: Python',
- 'Topic :: Scientific/Engineering :: Bio-Informatics',
+ "Development Status :: 3 - Alpha",
+ "Environment :: Console",
+ "Operating System :: OS Independent",
+ "Intended Audience :: Science/Research",
+ "License :: OSI Approved :: Apache Software License",
+ "Programming Language :: Python",
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
],
install_requires=requirements,
long_description=readme_markdown,
- long_description_content_type='text/markdown',
+ long_description_content_type="text/markdown",
packages=[package_name],
package_data={
- package_name: ['logging.conf', '../requirements.txt'],
+ package_name: ["logging.conf", "../requirements.txt"],
},
)
View it on GitLab: https://salsa.debian.org/med-team/pyensembl/-/commit/df67ba56bb091511fc4756833fc0770c8de71fb4