[med-svn] [Git][med-team/python-loompy][master] 6 commits: d/watch: version=5
Andreas Tille (@tille)
gitlab@salsa.debian.org
Sat Dec 20 17:54:18 GMT 2025
Andreas Tille pushed to branch master at Debian Med / python-loompy
Commits:
f9a5eca4 by Andreas Tille at 2025-12-20T18:19:14+01:00
d/watch: version=5
- - - - -
f838481e by Andreas Tille at 2025-12-20T18:19:27+01:00
New upstream version 3.0.8+dfsg
- - - - -
6ecae535 by Andreas Tille at 2025-12-20T18:19:27+01:00
New upstream version
- - - - -
798b9ce6 by Andreas Tille at 2025-12-20T18:19:29+01:00
Update upstream source from tag 'upstream/3.0.8+dfsg'
Update to upstream version '3.0.8+dfsg'
with Debian dir 9d793a70f70b71664f1c092cb8a8cf1b0ec76482
- - - - -
dc1755f5 by Andreas Tille at 2025-12-20T18:19:29+01:00
Standards-Version: 4.7.2 (routine-update)
- - - - -
bbec94e0 by Andreas Tille at 2025-12-20T18:44:57+01:00
Refresh patches
- - - - -
19 changed files:
- debian/changelog
- debian/control
- − debian/patches/numpy_2x.patch
- − debian/patches/python3.12.patch
- debian/patches/series
- debian/watch
- doc/kallisto/index.rst
- kallisto/README.md
- loompy/__init__.py
- loompy/_version.py
- loompy/attribute_manager.py
- loompy/bus_file.py
- loompy/loom_layer.py
- loompy/loom_validator.py
- loompy/loompy.py
- loompy/normalize.py
- loompy/utils.py
- loompy/view_manager.py
- notebooks/build_index.ipynb
Changes:
=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+python-loompy (3.0.8+dfsg-1) UNRELEASED; urgency=medium
+
+ * Team upload.
+ * d/watch: version=5
+ * New upstream version
+ * Standards-Version: 4.7.2 (routine-update)
+
+ -- Andreas Tille <tille@debian.org> Sat, 20 Dec 2025 18:19:27 +0100
+
python-loompy (3.0.7+dfsg-4) unstable; urgency=medium
* Team upload.
=====================================
debian/control
=====================================
@@ -14,7 +14,7 @@ Build-Depends: debhelper-compat (= 13),
python3-scipy <!nocheck>,
python3-numpy-groupies <!nocheck>,
python3-pytest <!nocheck>
-Standards-Version: 4.7.0
+Standards-Version: 4.7.2
Vcs-Browser: https://salsa.debian.org/med-team/python-loompy
Vcs-Git: https://salsa.debian.org/med-team/python-loompy.git
Homepage: https://github.com/linnarsson-lab/loompy
=====================================
debian/patches/numpy_2x.patch deleted
=====================================
@@ -1,59 +0,0 @@
---- a/loompy/loom_validator.py
-+++ b/loompy/loom_validator.py
-@@ -232,7 +232,7 @@
- if self.version == "3.0.0":
- expected_dtype = np.object_
- else:
-- expected_dtype = np.string_
-+ expected_dtype = np.bytes_
- delay_print("Row attributes:")
- if self._check("row_attrs" in file, "'row_attrs' group is missing"):
- for ra in file["row_attrs"]:
---- a/loompy/normalize.py
-+++ b/loompy/normalize.py
-@@ -7,20 +7,19 @@
-
- def normalize_attr_strings(a: np.ndarray) -> np.ndarray:
- """
-- Take an np.ndarray of all kinds of string-like elements, and return an array of ascii (np.string_) objects
-+ Take an np.ndarray of all kinds of string-like elements, and return an array of ascii (np.bytes_) objects
- """
- if np.issubdtype(a.dtype, np.object_):
-- # if np.all([type(x) is str for x in a]) or np.all([type(x) is np.str_ for x in a]) or np.all([type(x) is np.unicode_ for x in a]):
-- if np.all([(type(x) is str or type(x) is np.str_ or type(x) is np.unicode_) for x in a]):
-+ if np.all([(type(x) is str or type(x) is np.str_) for x in a]):
- return np.array([x.encode('ascii', 'xmlcharrefreplace') for x in a])
-- elif np.all([type(x) is np.string_ for x in a]) or np.all([type(x) is np.bytes_ for x in a]):
-- return a.astype("string_")
-+ elif np.all([type(x) is np.bytes_ for x in a]):
-+ return a.astype("bytes_")
- else:
- logging.debug(f"Attribute contains mixed object types ({np.unique([str(type(x)) for x in a])}); casting all to string")
-- return np.array([str(x) for x in a], dtype="string_")
-- elif np.issubdtype(a.dtype, np.string_) or np.issubdtype(a.dtype, np.object_):
-+ return np.array([str(x) for x in a], dtype="bytes_")
-+ elif np.issubdtype(a.dtype, np.bytes_) or np.issubdtype(a.dtype, np.object_):
- return a
-- elif np.issubdtype(a.dtype, np.str_) or np.issubdtype(a.dtype, np.unicode_):
-+ elif np.issubdtype(a.dtype, np.str_):
- return np.array([x.encode('ascii', 'xmlcharrefreplace') for x in a])
- else:
- raise ValueError("String values must be object, ascii or unicode.")
-@@ -88,7 +87,7 @@
- scalar = True
- a = np.array([a])
- result: np.ndarray = None # This second clause takes care of attributes stored as variable-length ascii, which can be generated by loomR or Seurat
-- if np.issubdtype(a.dtype, np.string_) or np.issubdtype(a.dtype, np.object_):
-+ if np.issubdtype(a.dtype, np.bytes_) or np.issubdtype(a.dtype, np.object_):
- # First ensure that what we load is valid ascii (i.e. ignore anything outside 7-bit range)
- if hasattr(a, "decode"): # This takes care of Loom files that store strings as UTF8, which comes in as str and doesn't have a decode method
- temp = np.array([x.decode('ascii', 'ignore') for x in a])
-@@ -100,7 +99,7 @@
- except: # Dirty hack to handle UTF-8 non-break-space in scalar strings. TODO: Rewrite this whole method completely!
- if type(a[0]) == np.bytes_:
- result = [ a[0].replace(b'\xc2\xa0', b'') ]
-- elif np.issubdtype(a.dtype, np.str_) or np.issubdtype(a.dtype, np.unicode_):
-+ elif np.issubdtype(a.dtype, np.str_):
- result = np.array(a.astype(str), dtype=object)
- else:
- result = a
=====================================
debian/patches/python3.12.patch deleted
=====================================
@@ -1,15 +0,0 @@
-Author: Andreas Tille <tille@debian.org>
-Last-Update: 2024-12-04
-Description: Fix syntax error with Python3.12
-
---- a/loompy/view_manager.py
-+++ b/loompy/view_manager.py
-@@ -20,7 +20,7 @@ class ViewManager:
- Returns:
- A LoomView object, an in-memory representation of the sliced file
- """
-- if type(slice_) is not tuple or len(slice_) is not 2:
-+ if type(slice_) is not tuple or len(slice_) != 2:
- raise ValueError("Views require slices along two dimensions")
-
- rows = slice_[0]
=====================================
debian/patches/series
=====================================
@@ -1,5 +1,3 @@
tests-open-loom-write-permissions.patch
numpy_1.24.patch
-python3.12.patch
do_not_generate_a_dep_on_pkg_resources.patch
-numpy_2x.patch
=====================================
debian/watch
=====================================
@@ -1,4 +1,7 @@
-version=4
+Version: 5
-opts="repacksuffix=+dfsg,dversionmangle=auto,filenamemangle=s%(?:.*?)?v?(\d[\d.]*)\.tar\.gz%@PACKAGE@-$1.tar.gz%" \
- https://github.com/linnarsson-lab/loompy/tags .*/v?@ANY_VERSION@@ARCHIVE_EXT@
+Template: Github
+Owner: linnarsson-lab
+Project: loompy
+Repack-Suffix: +dfsg
+Dversion-Mangle: auto
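
For reference, the complete new debian/watch in the declarative format-5 style, as assembled from the hunk above:

    Version: 5
    Template: Github
    Owner: linnarsson-lab
    Project: loompy
    Repack-Suffix: +dfsg
    Dversion-Mangle: auto
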
=====================================
doc/kallisto/index.rst
=====================================
@@ -27,18 +27,19 @@ and you should see the following output:
Using the ``loompy fromfq`` command
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-1. Install `kallisto <https://pachterlab.github.io/kallisto/>`_
+1. Install `kallisto <https://pachterlab.github.io/kallisto/>`_ . Please note: Version >= 0.46.0 is required!
---------------------------------------------------------------
The excellent kallisto tool performs ultra-fast pseudoalignment, which loompy uses to count reads (UMIs) on genes.
-2. Download an index (or build your own)
+2. Download an index or build your own. Please note: The pre-built index only works with kallisto version < 0.50.0 !
----------------------------------------
We provide a pre-built index of the `human genome <https://storage.googleapis.com/linnarsson-lab-www-blobs/human_GRCh38_gencode.v31.tar.gz>`_.
.. warning::
This index is really only suitable for use with 10x Chromium data (it makes strong assumptions about the read distribution).
+ This index only works with kallisto version < 0.50.0.
Unzip the index to a directory, which will have the following content:
=====================================
kallisto/README.md
=====================================
@@ -4,12 +4,13 @@ For the human genome see the notebook subdirectory.
## Building the mouse kallisto index
-These instructions work on Linux (tested on CentOS7).
+These instructions work on Linux (tested on Ubuntu 22.04).
-1. Make sure packages bedtools and kallisto are installed on the system.
+1. Make sure packages bedtools and kallisto are installed on the system. (On Ubuntu you can use the apt package manager to install both.)
- bedtools from https://bedtools.readthedocs.io/en/latest/content/installation.html
- kallisto from https://pachterlab.github.io/kallisto/download.html
+
2. Create your working directory, 'cd' there, and put the files above there.
3. Download and preprocess input files:
@@ -21,16 +22,16 @@ These instructions work on Linux (tested on CentOS7).
4. Download "BrowseTF TcoF-DB.xlsx" from https://tools.sschmeier.com/tcof/browse/?type=tcof&species=mouse&class=all# by clicking the "Excel" button. (Main page is https://tools.sschmeier.com/tcof/home/).
Open the file in Excel and save tab-separated as "inputs/TcoF-DB.tsv".
-5. You need to download some annotations for Mouse GRCm38 from BioMart (https://m.ensembl.org/biomart) Open this link in a new browser tab:
+5. You need to download some annotations for Mouse GRCm3x from BioMart (https://m.ensembl.org/biomart) Open this link in a new browser tab:
-http://www.ensembl.org/biomart/martview/7c9b283e3eca26cb81449ec518f4fc14?VIRTUALSCHEMANAME=default&ATTRIBUTES=mmusculus_gene_ensembl.default.feature_page.ensembl_gene_id|mmusculus_gene_ensembl.default.feature_page.ensembl_gene_id_version|mmusculus_gene_ensembl.default.feature_page.ensembl_transcript_id|mmusculus_gene_ensembl.default.feature_page.ensembl_transcript_id_version|mmusculus_gene_ensembl.default.feature_page.ucsc|mmusculus_gene_ensembl.default.feature_page.vega_translation|mmusculus_gene_ensembl.default.feature_page.ccds&FILTERS=&VISIBLEPANEL=resultspanel
+http://www.ensembl.org/biomart/martview/7c9b283e3eca26cb81449ec518f4fc14?VIRTUALSCHEMANAME=default&ATTRIBUTES=mmusculus_gene_ensembl.default.feature_page.ensembl_gene_id|mmusculus_gene_ensembl.default.feature_page.ensembl_gene_id_version|mmusculus_gene_ensembl.default.feature_page.ensembl_transcript_id|mmusculus_gene_ensembl.default.feature_page.ensembl_transcript_id_version|mmusculus_gene_ensembl.default.feature_page.ucsc|mmusculus_gene_ensembl.default.feature_page.entrezgene_trans_name|mmusculus_gene_ensembl.default.feature_page.ccds&FILTERS=&VISIBLEPANEL=resultspanel
On this BioMart page, click the "Go" button, and save the downloaded "mart_export.txt" file as "inputs/mart_export.txt".
The file should contain the following columns in the header:
- Gene stable ID Gene stable ID version Transcript stable ID Transcript stable ID version UCSC Stable ID Vega translation ID CCDS ID
+ Gene stable ID Gene stable ID version Transcript stable ID Transcript stable ID version UCSC Stable ID EntrezGene transcript name ID CCDS ID
- If the link fails, you need to manually select the proper dataset and columns from the https://m.ensembl.org/biomart webpage and download:
- * Select Dataset "Ensembl Genes 101"/"Mouse genes GRCm38".
+ If the link fails, you need to manually select the proper dataset and columns from the [https://m.ensembl.org/biomart/martview](https://www.ensembl.org/biomart/martview) webpage and download:
+ * Select Dataset "Ensembl Genes 1xx"/"Mouse genes GRCm3x".
* Select Attributes as in columns above: First 4 should be auto-selected. Select the following 3 from the "EXTERNAL" section, clicking the 3 boxes in the order above.
* Click "Results", export using "Go" button, and save to "inputs/mart_export.txt".
@@ -61,4 +62,6 @@ http://www.ensembl.org/biomart/martview/7c9b283e3eca26cb81449ec518f4fc14?VIRTUAL
`kallisto index -i gencode.vM23.fragments.idx -k 31 inputs/gencode.vM23.fragments.fa`
+(If you end up with 'counting k-mers ... Killed' you have too little memory in your computer.)
+
10. Refer to the notebook for human for more info on the output.
=====================================
loompy/__init__.py
=====================================
@@ -8,7 +8,7 @@ from .loom_view import LoomView
from .loom_layer import MemoryLoomLayer, LoomLayer
from .to_html import to_html
from .view_manager import ViewManager
-from .loompy import connect, create, create_append, combine, create_from_cellranger, LoomConnection, new, combine_faster, create_from_matrix_market, create_from_star
+from .loompy import connect, create, create_append, combine, create_from_cellranger, LoomConnection, new, combine_faster, create_from_matrix_market, create_from_star, create_from_tsv
from .loom_validator import LoomValidator
from ._version import __version__, loom_spec_version
from .bus_file import create_from_fastq
=====================================
loompy/_version.py
=====================================
@@ -1,2 +1,2 @@
-__version__ = '3.0.7'
+__version__ = '3.0.8'
loom_spec_version = '3.0.0'
=====================================
loompy/attribute_manager.py
=====================================
@@ -1,3 +1,4 @@
+from multiprocessing.sharedctypes import Value
from typing import *
import numpy as np
import h5py
@@ -80,6 +81,8 @@ class AttributeManager:
if type(thing) is slice or type(thing) is np.ndarray or type(thing) is int:
am = AttributeManager(None, axis=self.axis)
for key, val in self.items():
+ if val is None:
+ raise ValueError(key)
am[key] = val[thing]
return am
elif type(thing) is tuple:
=====================================
loompy/bus_file.py
=====================================
@@ -64,7 +64,7 @@ twobit_to_dna_table = {0: "A", 1: "C", 2: "G", 3: "T"}
dna_to_twobit_table = {"A": 0, "C": 1, "G": 2, "T": 3}
-@jit
+@jit(nopython=True)
def twobit_to_dna(twobit: int, size: int) -> str:
result = []
for i in range(size):
@@ -81,7 +81,7 @@ def twobit_to_dna(twobit: int, size: int) -> str:
return "".join(result)
-@jit
+@jit(nopython=True)
def dna_to_twobit(dna: str) -> int:
x = 0
for nt in dna:
@@ -98,7 +98,7 @@ def dna_to_twobit(dna: str) -> int:
return x
-@jit
+@jit(nopython=True)
def twobit_1hamming(twobit: int, size: int) -> List[int]:
result = []
for i in range(size):
@@ -456,6 +456,9 @@ def create_from_fastq(out_file: str, sample_id: str, fastqs: List[str], index_pa
logging.info(f"Creating loom file '{out_file}'")
bus.save(out_file, sample_id, samples_metadata_file)
with connect(out_file) as ds:
+ for ra in ds.row_attrs: # For some reason it happens that e.g. ds.ra.Alias == None, which fails downstream work.
+ if ds.rowattrs[ra] is None:
+ del ds.rowattrs[ra]
ds.attrs.Species = manifest["species"]
ds.attrs.Saturation = seq_sat
if run_info is not None:
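
The @jit changes above opt the three codec helpers into numba's nopython mode, so a function that cannot be compiled fails loudly instead of silently falling back to the deprecated object mode. A minimal sketch of the pattern (toy function, not the upstream helpers):

    from numba import jit

    @jit(nopython=True)                  # same effect as numba.njit
    def sum_twobit_fields(twobit: int, size: int) -> int:
        # Toy stand-in: sum the 2-bit fields of an encoded sequence
        total = 0
        for i in range(size):
            total += (twobit >> (2 * i)) & 3
        return total

    print(sum_twobit_fields(0b1110, 2))  # 2 + 3 -> 5
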
=====================================
loompy/loom_layer.py
=====================================
@@ -100,7 +100,7 @@ class LoomLayer():
self.ds._file.attrs["last_modified"] = timestamp()
self.ds._file.flush()
- def sparse(self, rows: np.ndarray = None, cols: np.ndarray = None) -> scipy.sparse.coo_matrix:
+ def sparse(self, rows: np.ndarray = None, cols: np.ndarray = None, dtype = None) -> scipy.sparse.coo_matrix:
if rows is not None:
if np.issubdtype(rows.dtype, np.bool_):
rows = np.where(rows)[0]
@@ -110,22 +110,35 @@ class LoomLayer():
n_genes = self.ds.shape[0] if rows is None else rows.shape[0]
n_cells = self.ds.shape[1] if cols is None else cols.shape[0]
-
- data: List[np.ndarray] = []
- row: List[np.ndarray] = []
- col: List[np.ndarray] = []
+ # Calculate sparse data length to be able to reserve proper sized arrays beforehand
+ nnonzero = 0
+ for (ix, selection, view) in self.ds.scan(items=cols, axis=1, layers=[self.name], what=["layers"], batch_size=4096):
+ if rows is not None:
+ vals = view.layers[self.name][rows, :]
+ else:
+ vals = view.layers[self.name][:, :]
+ nnonzero += np.count_nonzero(vals)
+ data = np.empty((nnonzero,), dtype=dtype) #(data: List[np.ndarray] = []
+ row = np.empty((nnonzero,), dtype=('uint16' if self.ds.shape[0] < 2**16 else 'uint32')) #row : List[np.ndarray] = []
+ col = np.empty((nnonzero,), dtype=('uint32' if self.ds.shape[1] < 2**32 else 'uint64')) #col: List[np.ndarray] = []
i = 0
- for (ix, selection, view) in self.ds.scan(items=cols, axis=1, layers=[self.name], what=["layers"]):
+ ci = 0
+ for (ix, selection, view) in self.ds.scan(items=cols, axis=1, layers=[self.name], what=["layers"], batch_size=4096):
if rows is not None:
vals = view.layers[self.name][rows, :]
else:
vals = view.layers[self.name][:, :]
+ if dtype:
+ vals = vals.astype(dtype)
nonzeros = np.where(vals != 0)
- data.append(vals[nonzeros])
- row.append(nonzeros[0])
- col.append(nonzeros[1] + i)
+ n = len(nonzeros[0])
+ data[ci:ci+n] = vals[nonzeros] #data.append(vals[nonzeros])
+ row[ci:ci+n] = nonzeros[0] #row.append(nonzeros[0])
+ col[ci:ci+n] = (nonzeros[1]+i) #col.append(nonzeros[1] + i)
+ ci += n
i += selection.shape[0]
- return scipy.sparse.coo_matrix((np.concatenate(data), (np.concatenate(row), np.concatenate(col))), shape=(n_genes, n_cells))
+ return scipy.sparse.coo_matrix((data, (row, col)), shape=(n_genes, n_cells), dtype=dtype)
+ #return scipy.sparse.coo_matrix((np.concatenate(data, dtype=dtype), (np.concatenate(row), np.concatenate(col))), shape=(n_genes, n_cells), dtype=dtype)
def _resize(self, size: Tuple[int, int], axis: int = None) -> None:
"""Resize the dataset, or the specified axis.
=====================================
loompy/loom_validator.py
=====================================
@@ -232,7 +232,7 @@ class LoomValidator:
if self.version == "3.0.0":
expected_dtype = np.object_
else:
- expected_dtype = np.string_
+ expected_dtype = np.bytes_
delay_print("Row attributes:")
if self._check("row_attrs" in file, "'row_attrs' group is missing"):
for ra in file["row_attrs"]:
=====================================
loompy/loompy.py
=====================================
@@ -586,7 +586,7 @@ class LoomConnection:
if layers == "":
layers = [""]
- if (items is not None) and (np.issubdtype(items.dtype, np.bool_)):
+ if (items is not None) and (type(items) != int) and (np.issubdtype(items.dtype, np.bool_)):
items = np.where(items)[0]
ordering: Union[np.ndarray, slice] = None
@@ -599,6 +599,8 @@ class LoomConnection:
ordering = slice(None)
if items is None:
items = np.fromiter(range(self.shape[1]), dtype='int')
+ elif type(items) == int:
+ items = np.fromiter(range(items, self.shape[1]), dtype='int')
cols_per_chunk = batch_size
ix = 0
while ix < self.shape[1]:
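
The new int branch in scan() above makes a plain integer mean "start scanning at this column"; the conversion it performs is simply:

    import numpy as np

    n_cols = 100                        # stand-in for self.shape[1]
    items = 40                          # an int now means: columns 40..99
    items = np.fromiter(range(items, n_cols), dtype='int')
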
@@ -855,7 +857,8 @@ class LoomConnection:
self.col_attrs._permute(ordering)
self.col_graphs._permute(ordering)
- def aggregate(self, out_file: str = None, select: np.ndarray = None, group_by: Union[str, np.ndarray] = "Clusters", aggr_by: str = "mean", aggr_ca_by: Dict[str, str] = None) -> np.ndarray:
+ def aggregate(self, out_file: str = None, select: np.ndarray = None, group_by: Union[str, np.ndarray] = "Clusters", \
+ aggr_by: str = "mean", aggr_ca_by: Dict[str, str] = None, layer: str = "", aggr_ca_if_equal: bool = False) -> np.ndarray:
"""
Aggregate the Loom file by applying aggregation functions to the main matrix as well as to the column attributes
@@ -865,6 +868,8 @@ class LoomConnection:
group_by The column attribute to group by, or an np.ndarray of integer group labels
aggr_by The aggregation function for the main matrix
aggr_ca_by A dictionary of aggregation functions for the column attributes (or None to skip)
+ layer The name of the layer to aggregate. Defaults to main layer
+ aggr_ca_if_equal If True, scalar column attributes not in aggr_ca_by will be transferred when they have the same value within each aggregate group
Returns:
m Aggregated main matrix
@@ -888,6 +893,13 @@ class LoomConnection:
labels = (self.ca[group_by]).astype('int')
_, zero_strt_sort_noholes_lbls = np.unique(labels, return_inverse=True)
n_groups = len(set(labels))
+ if aggr_ca_if_equal:
+ for key in self.ca.keys():
+ if (aggr_ca_by is None or key not in aggr_ca_by) and np.isscalar(self.ca[key][0]):
+ value_by_pos = self.ca[key]
+ nvalues_per_lbl = npg.aggregate(zero_strt_sort_noholes_lbls, value_by_pos, lambda v: len(set(v)))
+ if np.all(nvalues_per_lbl == 1):
+ ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls, self.ca[key], func='first', fill_value=self.ca[key][0])
if aggr_ca_by is not None:
for key in self.ca.keys():
if key not in aggr_ca_by:
@@ -911,8 +923,8 @@ class LoomConnection:
ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls, self.ca[key], func=func, fill_value=self.ca[key][0])
m = np.empty((self.shape[0], n_groups))
- for (_, selection, view) in self.scan(axis=0, layers=[""]):
- vals_aggr = npg.aggregate(zero_strt_sort_noholes_lbls, view[:, :], func=aggr_by, axis=1, fill_value=0)
+ for (_, selection, view) in self.scan(axis=0, layers=[layer]):
+ vals_aggr = npg.aggregate(zero_strt_sort_noholes_lbls, view[layer][:, :], func=aggr_by, axis=1, fill_value=0)
m[selection, :] = vals_aggr
if out_file is not None:
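
Both the new aggr_ca_if_equal branch and the per-group matrix reduction lean on numpy_groupies. A small sketch of the two calls, with toy data:

    import numpy as np
    import numpy_groupies as npg

    labels = np.array([0, 0, 1, 1, 1])           # group label per column
    counts = np.array([[1., 3., 2., 4., 6.],
                       [0., 2., 1., 1., 1.]])

    # Group-wise mean along columns, as in the main-matrix loop
    m = npg.aggregate(labels, counts, func="mean", axis=1, fill_value=0)
    # m == [[2., 4.],
    #       [1., 1.]]

    # The aggr_ca_if_equal test: one distinct value per group?
    age = np.array([5, 5, 9, 9, 9])
    nvals = npg.aggregate(labels, age, func=lambda v: len(set(v)))
    safe_to_transfer = np.all(nvals == 1)         # True here
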
@@ -1081,14 +1093,15 @@ def create(filename: str, layers: Union[np.ndarray, Dict[str, np.ndarray], loomp
raise ve
-def create_from_cellranger(indir: str, outdir: str = None, genome: str = None) -> str:
+def create_from_cellranger(indir: str, outdir: str = None, genome: str = None, file_attrs: Dict[str, str] = None, selected_cellids: List[str] = None) -> str:
"""
Create a .loom file from 10X Genomics cellranger output
Args:
indir (str): path to the cellranger output folder (the one that contains 'outs')
- outdir (str): output folder wher the new loom file should be saved (default to indir)
+ outdir (str): output folder where the new loom file should be saved (default to indir)
genome (str): genome build to load (e.g. 'mm10'; if None, determine species from outs folder)
+ file_attrs: dict of global file attributes, or None
Returns:
path (str): Full path to the created loom file.
@@ -1137,25 +1150,127 @@ def create_from_cellranger(indir: str, outdir: str = None, genome: str = None) -
col_attrs["ClusterID"] = labels.astype('int') - 1
path = os.path.join(outdir, sampleid + ".loom")
- create(path, matrix, row_attrs, col_attrs, file_attrs={"Genome": genome})
+ if file_attrs == None:
+ file_attrs = {}
+ if not "Genome" in file_attrs:
+ if genome is None:
+ invocationfile = os.path.join(indir, "_invocation")
+ for line in open(invocationfile):
+ m = re.search('reference_path.+"([^"]+)"', line)
+ if m:
+ genome = m.group(1)
+ break
+ if genome:
+ file_attrs["Genome"] = genome
+ cmdlinefile = os.path.join(indir, "_cmdline")
+ if not "Cmdline" in file_attrs and os.path.exists(cmdlinefile):
+ cmdline = open(cmdlinefile).read().strip()
+ file_attrs["Cmdline"] = cmdline
+ versionsfile = os.path.join(indir, "_versions")
+ if not "Versions" in file_attrs and os.path.exists(versionsfile):
+ versions = open(versionsfile).read().strip().replace("\n", " ")
+ file_attrs["Versions"] = versions
+ create(path, matrix, row_attrs, col_attrs, file_attrs=file_attrs)
return path
+def create_from_tsv(out_file: str, tsv_file: str, row_metadata_loomfile: str = None, row_metadata_attr: str = "Accession", delim: str = "\t", \
+ dtype: str = "float32", sample_id: str = "", file_attrs: Dict[str, str] = None, \
+ col_metadata_tsv: str = None, metadata_delim: str = '\t') -> None:
+ """
+ Create a .loom file from .tsv file
+
+ Args:
+ out_file: path to the newly created .loom file (will be overwritten if it exists)
+ tsv_file: input tab separated data matrix file. Header line should contain cell IDs.
+ row_metadata_loomfile: path to loomfile that will supply gene data and their order. Use to make the new loomfile conform with existing loomfile(s).
+ row_metadata_attr: row attribute of row_metadata_loomfile to use to match with gene IDs of tsv_file
+ delim: delimiter of expression matrix file
+ dtype: requested type of loomfile data matrix
+ sample_id: string to use as prefix for cell IDs, or nothing if header fields already include sample IDs
+ file_attrs: dict of global loomfile attributes
+ col_metadata_tsv: metadata for cells. Header line should be names of attributes. First column should be CellIDs. Order has to match data matrix file.
+ metadata_delim: delimiter of tsv metadata file
+ """
+ id2rowidx = None
+ row_attrs = {}
+ if row_metadata_loomfile:
+ with loompy.connect(row_metadata_loomfile, "r") as ds:
+ for attr in ("Accession", "Gene", "Chromosome", "Start", "End"):
+ if attr in ds.ra:
+ row_attrs[attr] = ds.ra[attr][:]
+ nrows = ds.shape[0]
+ id2rowidx = { n : i for i, n in enumerate(row_attrs[row_metadata_attr]) }
+ with (gzip.open(tsv_file, "rt") if tsv_file.endswith(".gz") else open(tsv_file, "r")) as fd:
+ headerrow = fd.readline().rstrip().split(delim)
+ datarow1 = fd.readline().rstrip().split(delim)
+ headerfirstcellid = 1 if len(datarow1)==len(headerrow) else 0
+ if not row_metadata_loomfile:
+ nrows = 1
+ for line in fd:
+ nrows += 1
+ geneids = []
+ with (gzip.open(tsv_file, "rt") if tsv_file.endswith(".gz") else open(tsv_file, "r")) as fd:
+ headerrow = fd.readline().rstrip().split(delim)
+ headerrow = [re.sub(r'^"(.+)"$', r'\1', f) for f in headerrow]
+ cellids = np.array([ sample_id + cellid for cellid in headerrow[headerfirstcellid:] ]).astype('str')
+ matrix = np.zeros([nrows, len(cellids)], dtype=dtype)
+ nlines = nnomatch = 0
+ for inrowidx, line in enumerate(fd):
+ nlines += 1
+ row = line.rstrip().split(delim)
+ geneid = re.sub(r'^"(.+)"$', r'\1', row[0])
+ if id2rowidx:
+ if not geneid in id2rowidx:
+ nnomatch += 1
+ continue
+ rowidx = id2rowidx[geneid]
+ else:
+ rowidx = inrowidx
+ datarow = [ float(v) for v in row[1:] ]
+ matrix[rowidx, :] = np.array(datarow).astype(dtype)
+ geneids.append(geneid)
+ if len(row_attrs) == 0:
+ row_attrs['Gene'] = geneids
+ col_attrs = {"CellID": cellids}
+ if col_metadata_tsv:
+ with (gzip.open(col_metadata_tsv, "rt") if col_metadata_tsv.endswith(".gz") else open(col_metadata_tsv, "r")) as fd:
+ cm_attrs = fd.readline().rstrip().split(metadata_delim)
+ cm_attrs = [re.sub(r'^"(.+)"$', r'\1', a) for a in cm_attrs]
+ for cm_attr in cm_attrs:
+ col_attrs[cm_attr] = []
+ cmrowidx = 0
+ line = fd.readline()
+ while line:
+ cm_values = line.rstrip().split(metadata_delim)
+ cm_values = [re.sub(r'^"(.+)"$', r'\1', v) for v in cm_values]
+ cmcellid = cm_values[0]
+ if cmcellid != cellids[cmrowidx]:
+ raise ValueError("CellID %s at row %s in %s does not match with corresponding header column %s of %s" % \
+ (cmcellid, cmrowidx, col_metadata_tsv, cellids[cmrowidx], tsv_file))
+ cm_idx0 = 0 if len(cm_values) == len(cm_attrs) else 1
+ for colidx, cm_attr in enumerate(cm_attrs):
+ col_attrs[cm_attr].append(cm_values[cm_idx0 + colidx])
+ cmrowidx += 1
+ line = fd.readline()
+ create(out_file, matrix, row_attrs, col_attrs, file_attrs=file_attrs)
+ print("No match for %s / %s genes in input file. Size of output loomfile: %s" % (nnomatch, nlines, matrix.shape) )
+
def create_from_matrix_market(out_file: str, sample_id: str, layer_paths: Dict[str, str], row_metadata_path: str, column_metadata_path: str, delim: str = "\t", skip_row_headers: bool = False, skip_colums_headers: bool = False, file_attrs: Dict[str, str] = None, matrix_transposed: bool = False) -> None:
"""
Create a .loom file from .mtx matrix market format
Args:
- out_file: path to the newly created .loom file (will be overwritten if it exists)
- sample_id: string to use as prefix for cell IDs
- layer_paths: dict mapping layer names to paths to the corresponding matrix file (usually with .mtx extension)
- row_metadata_path: path to the row (usually genes) metadata file
- column_metadata_path: path to the column (usually cells) metadata file
- delim: delimiter used for metadata (default: "\t")
- skip_row_headers: if true, skip first line in rows metadata file
- skip_column_headers: if true, skip first line in columns metadata file
- file_attrs: dict of global file attributes, or None
- matrix_transposed: if true, the main matrix is transposed
+ out_file: path to the newly created .loom file (will be overwritten if it exists)
+ sample_id: string to use as prefix for cell IDs
+ layer_paths: dict mapping layer names to paths to the corresponding matrix file (usually with .mtx extension)
+ row_metadata_path: path to the row (usually genes) metadata file
+ column_metadata_path: path to the column (usually cells) metadata file
+ delim: delimiter used for metadata (default: "\t")
+ skip_row_headers: if true, skip first line in rows metadata file
+ skip_column_headers: if true, skip first line in columns metadata file
+ file_attrs: dict of global loomfile attributes, or None
+ matrix_transposed: if true, the main matrix is transposed
Remarks:
layer_paths should typically map the empty string to a matrix market file: {"": "path/to/filename.mtx"}.
@@ -1205,7 +1320,7 @@ def create_from_matrix_market(out_file: str, sample_id: str, layer_paths: Dict[s
def create_from_star(indir : str, outfile : str, sample_id : str, \
cell_filter : str = "star", expected_n_cells : int = 0, min_total_umis : int = 0, ambient_pthreshold : float = 1.0, \
- sample_metadata_file : str = None, gtf_file : str = None, main_layer : str = "velosum", extra_layers = None):
+ sample_metadata_file : str = None, gtf_file : str = None, main_layer : str = "velosum", extra_layers = None, file_attrs = None):
"""
Create a .loom file from STARsolo output
Args:
@@ -1223,6 +1338,7 @@ def create_from_star(indir : str, outfile : str, sample_id : str, \
main_layer(str): The STAR output matrix to use for the main loom layer (e.g. 'Gene', or 'GeneFull').
'velosum' adds up spliced, unspliced, and ambiguous.
extra_layers(list): Names of additional layers to add in data (e.g. 'GeneFull').
+ file_attrs: dict of global file attributes, or None
Returns:
nothing
@@ -1233,7 +1349,7 @@ def create_from_star(indir : str, outfile : str, sample_id : str, \
if main_layer in ("Gene", "GeneFull", "GeneFull_ExonOverIntron", "GeneFull_Ex50pAS") and main_layer not in gtypes:
gtypes.append(main_layer)
subdir = "raw" if cell_filter == "none" else "filtered"
- velodir = os.path.join(indir, "Solo.out", "Velocyto", subdir)
+ velodir = os.path.join(indir, "Solo.out", "Velocyto", subdir) if not indir.endswith("Solo.out") else os.path.join(indir, "Velocyto", subdir)
accessions = [ l.split('\t')[0] for l in open(os.path.join(velodir, "features.tsv")).readlines() ]
genes = [ l.split('\t')[1] for l in open(os.path.join(velodir, "features.tsv")).readlines() ]
gtype2bcs = {}
@@ -1312,14 +1428,17 @@ def create_from_star(indir : str, outfile : str, sample_id : str, \
ca["AmbientPValue"] = valid_pvalue
ca["AmbientUMIs"] = np.full(n_valid_cells, ambient_umis)
if sample_metadata_file:
- sample_metadata = load_sample_metadata(sample_metadata_file, sample_id)
- for key, value in sample_metadata.items():
- ca[key] = np.full(n_valid_cells, value)
+ try:
+ sample_metadata = load_sample_metadata(sample_metadata_file, sample_id)
+ for key, value in sample_metadata.items():
+ ca[key] = np.full(n_valid_cells, value)
+ except:
+ print("No sample metadata for %s in %s. Skipping annotation." % (sample_id, sample_metadata_file))
if gtf_file:
ra = make_row_attrs_from_gene_metadata(gtf_file, accessions)
else:
ra = { "Gene": np.array(genes), "Accession": np.array(accessions) }
- create(filename=outfile, layers=layers, row_attrs=ra, col_attrs=ca)
+ create(filename=outfile, layers=layers, row_attrs=ra, col_attrs=ca, file_attrs=file_attrs)
def create_from_kallistobus(out_file: str, in_dir: str, tr2g_file: str, whitelist_file: str, file_attrs: Dict[str, str] = None, layers: Dict[str, str] = None):
"""
=====================================
loompy/normalize.py
=====================================
@@ -7,20 +7,20 @@ import logging
def normalize_attr_strings(a: np.ndarray) -> np.ndarray:
"""
- Take an np.ndarray of all kinds of string-like elements, and return an array of ascii (np.string_) objects
+ Take an np.ndarray of all kinds of string-like elements, and return an array of ascii (np.bytes_) objects
"""
if np.issubdtype(a.dtype, np.object_):
- # if np.all([type(x) is str for x in a]) or np.all([type(x) is np.str_ for x in a]) or np.all([type(x) is np.unicode_ for x in a]):
- if np.all([(type(x) is str or type(x) is np.str_ or type(x) is np.unicode_) for x in a]):
+ # if np.all([type(x) is str for x in a]) or np.all([type(x) is np.str_ for x in a]):
+ if np.all([(type(x) is str or type(x) is np.str_) for x in a]):
return np.array([x.encode('ascii', 'xmlcharrefreplace') for x in a])
- elif np.all([type(x) is np.string_ for x in a]) or np.all([type(x) is np.bytes_ for x in a]):
- return a.astype("string_")
+ elif np.all([type(x) is np.bytes_ for x in a]):
+ return a.astype("bytes_")
else:
logging.debug(f"Attribute contains mixed object types ({np.unique([str(type(x)) for x in a])}); casting all to string")
- return np.array([str(x) for x in a], dtype="string_")
- elif np.issubdtype(a.dtype, np.string_) or np.issubdtype(a.dtype, np.object_):
+ return np.array([str(x) for x in a], dtype="bytes_")
+ elif np.issubdtype(a.dtype, np.bytes_) or np.issubdtype(a.dtype, np.object_):
return a
- elif np.issubdtype(a.dtype, np.str_) or np.issubdtype(a.dtype, np.unicode_):
+ elif np.issubdtype(a.dtype, np.str_):
return np.array([x.encode('ascii', 'xmlcharrefreplace') for x in a])
else:
raise ValueError("String values must be object, ascii or unicode.")
@@ -88,9 +88,9 @@ def materialize_attr_values(a: np.ndarray) -> np.ndarray:
scalar = True
a = np.array([a])
result: np.ndarray = None # This second clause takes care of attributes stored as variable-length ascii, which can be generated by loomR or Seurat
- if np.issubdtype(a.dtype, np.string_) or np.issubdtype(a.dtype, np.object_):
+ if np.issubdtype(a.dtype, np.bytes_) or np.issubdtype(a.dtype, np.object_):
# First ensure that what we load is valid ascii (i.e. ignore anything outside 7-bit range)
- if hasattr(a, "decode"): # This takes care of Loom files that store strings as UTF8, which comes in as str and doesn't have a decode method
+ if len(a) > 0 and hasattr(a[0], "decode"): # This takes care of Loom files that store strings as UTF8, which comes in as str and doesn't have a decode method
temp = np.array([x.decode('ascii', 'ignore') for x in a])
else:
temp = a
@@ -100,7 +100,7 @@ def materialize_attr_values(a: np.ndarray) -> np.ndarray:
except: # Dirty hack to handle UTF-8 non-break-space in scalar strings. TODO: Rewrite this whole method completely!
if type(a[0]) == np.bytes_:
result = [ a[0].replace(b'\xc2\xa0', b'') ]
- elif np.issubdtype(a.dtype, np.str_) or np.issubdtype(a.dtype, np.unicode_):
+ elif np.issubdtype(a.dtype, np.str_):
result = np.array(a.astype(str), dtype=object)
else:
result = a
@@ -108,3 +108,4 @@ def materialize_attr_values(a: np.ndarray) -> np.ndarray:
return result[0]
else:
return result
+
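
The churn in normalize.py is the NumPy 2.0 cleanup: the long-deprecated aliases np.string_ and np.unicode_ were removed, and np.bytes_ / np.str_ are the surviving names. A minimal illustration:

    import numpy as np

    a = np.array([b"GeneA", b"GeneB"])

    # NumPy < 2.0 spelled this np.issubdtype(a.dtype, np.string_);
    # that alias is gone in 2.x.
    print(np.issubdtype(a.dtype, np.bytes_))   # True  (ascii/bytes strings)
    print(np.issubdtype(a.dtype, np.str_))     # False (these are not unicode)
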
=====================================
loompy/utils.py
=====================================
@@ -3,6 +3,7 @@ from inspect import currentframe, getouterframes
import datetime
from h5py import File as HDF5File
from .normalize import materialize_attr_values
+import numpy as np
def deprecated(message: str) -> None:
@@ -16,11 +17,16 @@ def timestamp() -> str:
def get_loom_spec_version(f: HDF5File) -> str:
+ version = "0.0.0"
if "attrs" in f and "LOOM_SPEC_VERSION" in f["/attrs"]:
- return materialize_attr_values(f["/attrs"]["LOOM_SPEC_VERSION"][()])
+ version = materialize_attr_values(f["/attrs"]["LOOM_SPEC_VERSION"][()])
+ if type(version) == np.ndarray:
+ version = version[0]
if "LOOM_SPEC_VERSION" in f.attrs:
- return materialize_attr_values(f.attrs["LOOM_SPEC_VERSION"])
- return "0.0.0"
+ version = materialize_attr_values(f.attrs["LOOM_SPEC_VERSION"])
+ if type(version) == np.ndarray:
+ version = version[0]
+ return version
def compare_loom_spec_version(f: HDF5File, v: str) -> int:
=====================================
loompy/view_manager.py
=====================================
@@ -20,7 +20,7 @@ class ViewManager:
Returns:
A LoomView object, an in-memory representation of the sliced file
"""
- if type(slice_) is not tuple or len(slice_) is not 2:
+ if type(slice_) is not tuple or len(slice_) != 2:
raise ValueError("Views require slices along two dimensions")
rows = slice_[0]
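
The one-character change above matters because `is` tests object identity, not equality: `len(slice_) is not 2` only works by accident of CPython's small-int caching, and Python 3.8+ flags literal `is` comparisons with a SyntaxWarning (which stricter tooling on 3.12 surfaces as an error). The value comparison says what the code means:

    slice_ = (slice(0, 5), slice(None))

    # Old: len(slice_) is not 2   -- identity test, SyntaxWarning on 3.8+
    # New: value comparison, always correct
    if type(slice_) is not tuple or len(slice_) != 2:
        raise ValueError("Views require slices along two dimensions")
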
=====================================
notebooks/build_index.ipynb
=====================================
@@ -19,12 +19,13 @@
"conda install -c conda-forge biopython # Alternative, for Anaconda\n",
"```\n",
"\n",
- "Install [gawk](https://www.gnu.org/software/gawk/) (on Linux, this is typically already installed), [bedtools](https://bedtools.readthedocs.io/en/latest/), and [kallisto](https://pachterlab.github.io/kallisto/) (instructions below are for macOS using [Brew](https://brew.s)):\n",
+ "Install [gawk](https://www.gnu.org/software/gawk/) (on Linux, this is typically already installed), [bedtools](https://bedtools.readthedocs.io/en/latest/), and [kallisto](https://pachterlab.github.io/kallisto/ Visit https://github.com/pachterlab/kallisto to download and install latest version) (instructions below are for macOS using [Brew](https://brew.s)):\n",
"```\n",
"brew install gawk\n",
"brew install bedtools\n",
"brew install kallisto\n",
"```\n",
+ "On Ubuntu you would do ```sudo apt install gawk kallisto bedtools```\n",
"\n",
"### Download genome data\n",
"\n",
@@ -38,11 +39,14 @@
"\n",
"#### Download additional gene metadata from [HGNC](https://www.genenames.org/download/statistics-and-files/)\n",
"\n",
- "Get the “Complete HGNC dataset” as TXT, a file named [hgnc_complete_set.txt](ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/json/hgnc_complete_set.txt)\n",
+ "Get the “Complete HGNC dataset” as TXT, a file named [hgnc_complete_set.txt](http://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt)\n",
"\n",
"#### Download 10x Chromium barcode whitelists\n",
"\n",
- "Chromium cell barcodes are generated from a set of known sequences, which differ by version of the Chromium kits. Get three files from the [cellranger GitHub repository](https://github.com/10XGenomics/cellranger/tree/master/lib/python/cellranger/barcodes) and rename them like this:\n",
+ "Chromium cell barcodes are generated from a set of known sequences, which differ by version of the Chromium kits.\n",
+ "Get three files from the (maybe dead) [cellranger GitHub repository](https://github.com/10XGenomics/cellranger/tree/master/lib/python/cellranger/barcodes)\n",
+ "Working alternative links: [Teich lab v2](https://teichlab.github.io/scg_lib_structs/data/10X-Genomics/737K-august-2016.txt.gz) [Teich lab v3](https://teichlab.github.io/scg_lib_structs/data/10X-Genomics/3M-february-2018.txt.gz) [github v1](https://github.com/ucdavis-bioinformatics/proc10xG/blob/master/barcodes/737K-april-2014_rc.txt)\n",
+ "Rename them like this:\n",
"\n",
"```\n",
"3M-february-2018.txt.gz -> (unzip) -> 10xv3_whitelist.txt\n",
@@ -121,7 +125,7 @@
"\n",
"### Run the following scripts\n",
"\n",
- "Before starting, set `d` to the full path of your index directory, and `extent` to the desired window size (windows are the sequences upstream of a polyadenylation site or a genomic polyA/T sequence that are used to build the index)."
+ "Before starting, set `d` to the full path, ending with a '/', of your index directory (the one containing the `inputs` dir), and `extent` to the desired window size (windows are the sequences upstream of a polyadenylation site or a genomic polyA/T sequence that are used to build the index)."
]
},
{
View it on GitLab: https://salsa.debian.org/med-team/python-loompy/-/compare/50c43d9b30a47dfeda12590397fc8fdcd8161048...bbec94e06089aa4d7c7b94f7452aa7c586b1ea8c