[med-svn] [Git][python-team/packages/python-pynndescent][upstream] New upstream version 0.5.6
Andreas Tille (@tille)
gitlab at salsa.debian.org
Fri Feb 18 06:25:39 GMT 2022
Andreas Tille pushed to branch upstream at Debian Python Team / packages / python-pynndescent
Commits:
a90e87ae by Andreas Tille at 2022-02-18T07:11:26+01:00
New upstream version 0.5.6
- - - - -
9 changed files:
- PKG-INFO
- README.rst
- pynndescent.egg-info/PKG-INFO
- pynndescent/distances.py
- pynndescent/pynndescent_.py
- pynndescent/rp_trees.py
- pynndescent/sparse.py
- pynndescent/tests/test_distances.py
- setup.py
Changes:
=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 1.2
Name: pynndescent
-Version: 0.5.5
+Version: 0.5.6
Summary: Nearest Neighbor Descent
Home-page: http://github.com/lmcinnes/pynndescent
Author: Leland McInnes
@@ -13,12 +13,9 @@ Description: .. image:: doc/pynndescent_logo.png
:align: center
:alt: PyNNDescent Logo
- .. image:: https://travis-ci.com/lmcinnes/pynndescent.svg
- :target: https://travis-ci.com/lmcinnes/pynndescent
- :alt: Travis Build Status
- .. image:: https://ci.appveyor.com/api/projects/status/github/lmcinnes/pynndescent?branch=master&svg=true
- :target: https://ci.appveyor.com/project/lmcinnes/pynndescent
- :alt: AppVeyor Build Status
+ .. image:: https://dev.azure.com/lelandmcinnes/UMAP%20project%20builds/_apis/build/status/lmcinnes.pynndescent?branchName=master
+ :target: https://dev.azure.com/lelandmcinnes/UMAP%20project%20builds/_build/latest?definitionId=2&branchName=master
+ :alt: Azure Pipelines Build Status
.. image:: https://img.shields.io/lgtm/alerts/g/lmcinnes/pynndescent.svg
:target: https://lgtm.com/projects/g/lmcinnes/pynndescent/alerts
:alt: LGTM Alerts
=====================================
README.rst
=====================================
@@ -3,12 +3,9 @@
:align: center
:alt: PyNNDescent Logo
-.. image:: https://travis-ci.com/lmcinnes/pynndescent.svg
- :target: https://travis-ci.com/lmcinnes/pynndescent
- :alt: Travis Build Status
-.. image:: https://ci.appveyor.com/api/projects/status/github/lmcinnes/pynndescent?branch=master&svg=true
- :target: https://ci.appveyor.com/project/lmcinnes/pynndescent
- :alt: AppVeyor Build Status
+.. image:: https://dev.azure.com/lelandmcinnes/UMAP%20project%20builds/_apis/build/status/lmcinnes.pynndescent?branchName=master
+ :target: https://dev.azure.com/lelandmcinnes/UMAP%20project%20builds/_build/latest?definitionId=2&branchName=master
+ :alt: Azure Pipelines Build Status
.. image:: https://img.shields.io/lgtm/alerts/g/lmcinnes/pynndescent.svg
:target: https://lgtm.com/projects/g/lmcinnes/pynndescent/alerts
:alt: LGTM Alerts
=====================================
pynndescent.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 1.2
Name: pynndescent
-Version: 0.5.5
+Version: 0.5.6
Summary: Nearest Neighbor Descent
Home-page: http://github.com/lmcinnes/pynndescent
Author: Leland McInnes
@@ -13,12 +13,9 @@ Description: .. image:: doc/pynndescent_logo.png
:align: center
:alt: PyNNDescent Logo
- .. image:: https://travis-ci.com/lmcinnes/pynndescent.svg
- :target: https://travis-ci.com/lmcinnes/pynndescent
- :alt: Travis Build Status
- .. image:: https://ci.appveyor.com/api/projects/status/github/lmcinnes/pynndescent?branch=master&svg=true
- :target: https://ci.appveyor.com/project/lmcinnes/pynndescent
- :alt: AppVeyor Build Status
+ .. image:: https://dev.azure.com/lelandmcinnes/UMAP%20project%20builds/_apis/build/status/lmcinnes.pynndescent?branchName=master
+ :target: https://dev.azure.com/lelandmcinnes/UMAP%20project%20builds/_build/latest?definitionId=2&branchName=master
+ :alt: Azure Pipelines Build Status
.. image:: https://img.shields.io/lgtm/alerts/g/lmcinnes/pynndescent.svg
:target: https://lgtm.com/projects/g/lmcinnes/pynndescent/alerts
:alt: LGTM Alerts
=====================================
pynndescent/distances.py
=====================================
@@ -51,7 +51,6 @@ def euclidean(x, y):
"dim": numba.types.intp,
"i": numba.types.uint16,
},
-
)
def squared_euclidean(x, y):
r"""Squared euclidean distance.
@@ -235,7 +234,6 @@ def jaccard(x, y):
"dim": numba.types.intp,
"i": numba.types.uint16,
},
-
)
def alternative_jaccard(x, y):
num_non_zero = 0.0
@@ -421,7 +419,6 @@ def cosine(x, y):
"dim": numba.types.intp,
"i": numba.types.uint16,
},
-
)
def alternative_cosine(x, y):
result = 0.0
@@ -452,7 +449,6 @@ def alternative_cosine(x, y):
"dim": numba.types.intp,
"i": numba.types.uint16,
},
-
)
def dot(x, y):
result = 0.0
@@ -480,7 +476,6 @@ def dot(x, y):
"dim": numba.types.intp,
"i": numba.types.uint16,
},
-
)
def alternative_dot(x, y):
result = 0.0
@@ -815,6 +810,66 @@ def jensen_shannon_divergence(x, y):
return result
+@numba.njit()
+def wasserstein_1d(x, y, p=1):
+ x_sum = 0.0
+ y_sum = 0.0
+ for i in range(x.shape[0]):
+ x_sum += x[i]
+ y_sum += y[i]
+
+ x_cdf = x / x_sum
+ y_cdf = y / y_sum
+
+ for i in range(1, x_cdf.shape[0]):
+ x_cdf[i] += x_cdf[i - 1]
+ y_cdf[i] += y_cdf[i - 1]
+
+ return minkowski(x_cdf, y_cdf, p)
+
+
+@numba.njit()
+def circular_kantorovich(x, y, p=1):
+ x_sum = 0.0
+ y_sum = 0.0
+ for i in range(x.shape[0]):
+ x_sum += x[i]
+ y_sum += y[i]
+
+ x_cdf = x / x_sum
+ y_cdf = y / y_sum
+
+ for i in range(1, x_cdf.shape[0]):
+ x_cdf[i] += x_cdf[i - 1]
+ y_cdf[i] += y_cdf[i - 1]
+
+ mu = np.median((x_cdf - y_cdf) ** p)
+
+ # Now we just want minkowski distance on the CDFs shifted by mu
+ result = 0.0
+ if p > 2:
+ for i in range(x_cdf.shape[0]):
+ result += np.abs(x_cdf[i] - y_cdf[i] - mu) ** p
+
+ return result ** (1.0 / p)
+
+ elif p == 2:
+ for i in range(x_cdf.shape[0]):
+ val = x_cdf[i] - y_cdf[i] - mu
+ result += val * val
+
+ return np.sqrt(result)
+
+ elif p == 1:
+ for i in range(x_cdf.shape[0]):
+ result += np.abs(x_cdf[i] - y_cdf[i] - mu)
+
+ return result
+
+ else:
+ raise ValueError("Invalid p supplied to Kantorovich distance")
+
+
@numba.njit()
def symmetric_kl_divergence(x, y):
result = 0.0
@@ -873,6 +928,12 @@ named_distances = {
"hellinger": hellinger,
"kantorovich": kantorovich,
"wasserstein": kantorovich,
+ "wasserstein_1d": wasserstein_1d,
+ "wasserstein-1d": wasserstein_1d,
+ "kantorovich-1d": wasserstein_1d,
+ "kantorovich_1d": wasserstein_1d,
+ "circular_kantorovich": circular_kantorovich,
+ "circular_wasserstein": circular_kantorovich,
"sinkhorn": sinkhorn,
"jensen-shannon": jensen_shannon_divergence,
"jensen_shannon": jensen_shannon_divergence,
=====================================
pynndescent/pynndescent_.py
=====================================
@@ -9,7 +9,7 @@ import numpy as np
from sklearn.utils import check_random_state, check_array
from sklearn.preprocessing import normalize
from sklearn.base import BaseEstimator, TransformerMixin
-from scipy.sparse import csr_matrix, coo_matrix, isspmatrix_csr, vstack as sparse_vstack
+from scipy.sparse import csr_matrix, coo_matrix, isspmatrix_csr, vstack as sparse_vstack, issparse
import heapq
@@ -58,6 +58,11 @@ FLOAT32_EPS = np.finfo(np.float32).eps
EMPTY_GRAPH = make_heap(1, 1)
+def is_c_contiguous(array_like):
+ flags = getattr(array_like, "flags", None)
+ return flags is not None and flags["C_CONTIGUOUS"]
+
+
@numba.njit(parallel=True, cache=True)
def generate_leaf_updates(leaf_block, dist_thresholds, data, dist):
@@ -496,7 +501,7 @@ def resort_tree_indices(tree, tree_order):
return new_tree
-class NNDescent(object):
+class NNDescent:
"""NNDescent for fast approximate nearest neighbor queries. NNDescent is
very flexible and supports a wide variety of distances, including
non-metric distances. NNDescent also scales well against high dimensional
@@ -505,7 +510,7 @@ class NNDescent(object):
Parameters
----------
- data: array os shape (n_samples, n_features)
+ data: array of shape (n_samples, n_features)
The training data set to find nearest neighbors in.
metric: string or callable (optional, default='euclidean')
@@ -534,6 +539,7 @@ class NNDescent(object):
* sokalsneath
* yule
* hellinger
+ * wasserstein-1d
Metrics that take arguments (such as minkowski, mahalanobis etc.)
can have arguments passed via the metric_kwds dictionary. At this
time care must be taken and dictionary elements must be ordered
@@ -595,14 +601,11 @@ class NNDescent(object):
The ``'alternative'`` algorithm has been deprecated and is no longer
available.
- low_memory: boolean (optional, default=False)
+ low_memory: boolean (optional, default=True)
Whether to use a lower memory, but more computationally expensive
- approach to index construction. This defaults to false as for most
- cases it speeds index construction, but if you are having issues
- with excessive memory use for your dataset consider setting this
- to True.
+ approach to index construction.
- max_candidates: int (optional, default=20)
+ max_candidates: int (optional, default=None)
Internally each "self-join" keeps a maximum number of candidates (
nearest neighbors and reverse nearest neighbors) to be considered.
This value controls this aspect of the algorithm. Larger values will
@@ -633,6 +636,11 @@ class NNDescent(object):
result in a significantly smaller index, particularly useful for saving,
but will remove information that might otherwise be useful.
+ parallel_batch_queries: bool (optional, default=False)
+ Whether to use parallelism when processing batched queries. This can be
+ useful for large batches of queries on multicore machines, but it degrades
+ performance for single queries, making it a poor choice for streaming use.
+
verbose: bool (optional, default=False)
Whether to print status data during the computation.
"""
@@ -656,7 +664,8 @@ class NNDescent(object):
n_iters=None,
delta=0.001,
n_jobs=None,
- compressed=True,
+ compressed=False,
+ parallel_batch_queries=False,
verbose=False,
):
@@ -681,8 +690,14 @@ class NNDescent(object):
self.dim = data.shape[1]
self.n_jobs = n_jobs
self.compressed = compressed
+ self.parallel_batch_queries = parallel_batch_queries
self.verbose = verbose
+ if getattr(data, "dtype", None) == np.float32 and (issparse(data) or is_c_contiguous(data)):
+ copy_on_normalize = True
+ else:
+ copy_on_normalize = False
+
data = check_array(data, dtype=np.float32, accept_sparse="csr", order="C")
self._raw_data = data
@@ -739,7 +754,7 @@ class NNDescent(object):
self._angular_trees = False
if metric == "dot":
- data = normalize(data, norm="l2", copy=False)
+ data = normalize(data, norm="l2", copy=copy_on_normalize)
self.rng_state = current_random_state.randint(INT32_MIN, INT32_MAX, 3).astype(
np.int64
@@ -1171,6 +1186,7 @@ class NNDescent(object):
indices = self._search_graph.indices
dist = self._distance_func
n_neighbors = self.n_neighbors
+ parallel_search = self.parallel_batch_queries
@numba.njit(
fastmath=True,
@@ -1193,6 +1209,7 @@ class NNDescent(object):
"distance_bound": numba.types.float32,
"seed_scale": numba.types.float32,
},
+ parallel=self.parallel_batch_queries,
)
def search_closure(query_points, k, epsilon, visited, rng_state):
@@ -1200,8 +1217,14 @@ class NNDescent(object):
distance_scale = 1.0 + epsilon
internal_rng_state = np.copy(rng_state)
- for i in range(query_points.shape[0]):
- visited[:] = 0
+ for i in numba.prange(query_points.shape[0]):
+ # Avoid races on visited if parallel
+ if parallel_search:
+ visited_nodes = np.zeros_like(visited)
+ else:
+ visited_nodes = visited
+ visited_nodes[:] = 0
+
if dist == alternative_dot or dist == alternative_cosine:
norm = np.sqrt((query_points[i] ** 2).sum())
if norm > 0.0:
@@ -1225,24 +1248,24 @@ class NNDescent(object):
for j in range(n_initial_points):
candidate = candidate_indices[j]
- d = dist(data[candidate], current_query)
+ d = np.float32(dist(data[candidate], current_query))
# indices are guaranteed different
simple_heap_push(heap_priorities, heap_indices, d, candidate)
heapq.heappush(seed_set, (d, candidate))
- mark_visited(visited, candidate)
+ mark_visited(visited_nodes, candidate)
if n_random_samples > 0:
for j in range(n_random_samples):
candidate = np.int32(
np.abs(tau_rand_int(internal_rng_state)) % data.shape[0]
)
- if has_been_visited(visited, candidate) == 0:
- d = dist(data[candidate], current_query)
+ if has_been_visited(visited_nodes, candidate) == 0:
+ d = np.float32(dist(data[candidate], current_query))
simple_heap_push(
heap_priorities, heap_indices, d, candidate
)
heapq.heappush(seed_set, (d, candidate))
- mark_visited(visited, candidate)
+ mark_visited(visited_nodes, candidate)
############ Search ##############
distance_bound = distance_scale * heap_priorities[0]
@@ -1256,10 +1279,10 @@ class NNDescent(object):
candidate = indices[j]
- if has_been_visited(visited, candidate) == 0:
- mark_visited(visited, candidate)
+ if has_been_visited(visited_nodes, candidate) == 0:
+ mark_visited(visited_nodes, candidate)
- d = dist(data[candidate], current_query)
+ d = np.float32(dist(data[candidate], current_query))
if d < distance_bound:
simple_heap_push(
@@ -1332,6 +1355,7 @@ class NNDescent(object):
indices = self._search_graph.indices
dist = self._distance_func
n_neighbors = self.n_neighbors
+ parallel_search = self.parallel_batch_queries
@numba.njit(
fastmath=True,
@@ -1350,6 +1374,7 @@ class NNDescent(object):
"distance_scale": numba.types.float32,
"seed_scale": numba.types.float32,
},
+ parallel=self.parallel_batch_queries,
)
def search_closure(
query_inds, query_indptr, query_data, k, epsilon, visited, rng_state
@@ -1361,8 +1386,13 @@ class NNDescent(object):
distance_scale = 1.0 + epsilon
internal_rng_state = np.copy(rng_state)
- for i in range(n_query_points):
- visited[:] = 0
+ for i in numba.prange(n_query_points):
+ # Avoid races on visited if parallel
+ if parallel_search:
+ visited_nodes = np.zeros_like(visited)
+ else:
+ visited_nodes = visited
+ visited_nodes[:] = 0
current_query_inds = query_inds[query_indptr[i] : query_indptr[i + 1]]
current_query_data = query_data[query_indptr[i] : query_indptr[i + 1]]
@@ -1398,20 +1428,20 @@ class NNDescent(object):
data_indptr[candidate] : data_indptr[candidate + 1]
]
- d = dist(
+ d = np.float32(dist(
from_inds, from_data, current_query_inds, current_query_data
- )
+ ))
# indices are guaranteed different
simple_heap_push(heap_priorities, heap_indices, d, candidate)
heapq.heappush(seed_set, (d, candidate))
- mark_visited(visited, candidate)
+ mark_visited(visited_nodes, candidate)
if n_random_samples > 0:
for j in range(n_random_samples):
candidate = np.int32(
np.abs(tau_rand_int(internal_rng_state)) % n_index_points
)
- if has_been_visited(visited, candidate) == 0:
+ if has_been_visited(visited_nodes, candidate) == 0:
from_inds = data_inds[
data_indptr[candidate] : data_indptr[candidate + 1]
]
@@ -1419,18 +1449,18 @@ class NNDescent(object):
data_indptr[candidate] : data_indptr[candidate + 1]
]
- d = dist(
+ d = np.float32(dist(
from_inds,
from_data,
current_query_inds,
current_query_data,
- )
+ ))
simple_heap_push(
heap_priorities, heap_indices, d, candidate
)
heapq.heappush(seed_set, (d, candidate))
- mark_visited(visited, candidate)
+ mark_visited(visited_nodes, candidate)
############ Search ##############
distance_bound = distance_scale * heap_priorities[0]
@@ -1444,8 +1474,8 @@ class NNDescent(object):
candidate = indices[j]
- if has_been_visited(visited, candidate) == 0:
- mark_visited(visited, candidate)
+ if has_been_visited(visited_nodes, candidate) == 0:
+ mark_visited(visited_nodes, candidate)
from_inds = data_inds[
data_indptr[candidate] : data_indptr[candidate + 1]
@@ -1454,12 +1484,12 @@ class NNDescent(object):
data_indptr[candidate] : data_indptr[candidate + 1]
]
- d = dist(
+ d = np.float32(dist(
from_inds,
from_data,
current_query_inds,
current_query_data,
- )
+ ))
if d < distance_bound:
simple_heap_push(
@@ -1604,6 +1634,9 @@ class NNDescent(object):
return indices, dists
def update(self, X):
+ if not hasattr(self, "_search_graph"):
+ self._init_search_graph()
+
current_random_state = check_random_state(self.random_state)
rng_state = current_random_state.randint(INT32_MIN, INT32_MAX, 3).astype(
np.int64
@@ -1705,6 +1738,8 @@ class PyNNDescentTransformer(BaseEstimator, TransformerMixin):
* sokalmichener
* sokalsneath
* yule
+ * hellinger
+ * wasserstein-1d
Metrics that take arguments (such as minkowski, mahalanobis etc.)
can have arguments passed via the metric_kwds dictionary. At this
time care must be taken and dictionary elements must be ordered
@@ -1794,6 +1829,11 @@ class PyNNDescentTransformer(BaseEstimator, TransformerMixin):
and less accurate searching. Don't tweak this value unless you know
what you're doing.
+ parallel_batch_queries: bool (optional, default=False)
+ Whether to use parallelism when processing batched queries. This can be
+ useful for large batches of queries on multicore machines, but it degrades
+ performance for single queries, making it a poor choice for streaming use.
+
verbose: bool (optional, default=False)
Whether to print status data during the computation.
@@ -1825,6 +1865,7 @@ class PyNNDescentTransformer(BaseEstimator, TransformerMixin):
max_candidates=None,
n_iters=None,
early_termination_value=0.001,
+ parallel_batch_queries=False,
verbose=False,
):
@@ -1844,6 +1885,7 @@ class PyNNDescentTransformer(BaseEstimator, TransformerMixin):
self.n_iters = n_iters
self.early_termination_value = early_termination_value
self.n_jobs = n_jobs
+ self.parallel_batch_queries = parallel_batch_queries
self.verbose = verbose
def fit(self, X, compress_index=True):
@@ -1892,6 +1934,7 @@ class PyNNDescentTransformer(BaseEstimator, TransformerMixin):
delta=self.early_termination_value,
n_jobs=self.n_jobs,
compressed=compress_index,
+ parallel_batch_queries=self.parallel_batch_queries,
verbose=self.verbose,
)
=====================================
pynndescent/rp_trees.py
=====================================
@@ -8,7 +8,7 @@ import numpy as np
import numba
import scipy.sparse
-from pynndescent.sparse import sparse_mul, sparse_diff, sparse_sum, arr_intersect
+from pynndescent.sparse import sparse_mul, sparse_diff, sparse_sum, arr_intersect, sparse_dot_product
from pynndescent.utils import tau_rand_int, norm
import joblib
@@ -908,10 +908,7 @@ def sparse_select_side(hyperplane, offset, point_inds, point_data, rng_state):
hyperplane_inds = hyperplane[0, :hyperplane_size].astype(np.int32)
hyperplane_data = hyperplane[1, :hyperplane_size]
- _, aux_data = sparse_mul(hyperplane_inds, hyperplane_data, point_inds, point_data)
-
- for val in aux_data:
- margin += val
+ margin += sparse_dot_product(hyperplane_inds, hyperplane_data, point_inds, point_data)
if abs(margin) < EPS:
side = tau_rand_int(rng_state) % 2
=====================================
pynndescent/sparse.py
=====================================
@@ -53,6 +53,60 @@ def arr_intersect(ar1, ar2):
aux.sort()
return aux[:-1][aux[1:] == aux[:-1]]
+# Some things require size of intersection; do this quickly; assume sorted arrays for speed
+@numba.njit(
+ [
+ "i4(i4[:],i4[:])",
+ numba.types.int32(
+ numba.types.Array(numba.types.int32, 1, "C", readonly=True),
+ numba.types.Array(numba.types.int32, 1, "C", readonly=True),
+ ),
+ ],
+ locals={
+ "i1": numba.uint16,
+ "i2": numba.uint16,
+ }
+)
+def fast_intersection_size(ar1, ar2):
+ if ar1.shape[0] == 0 or ar2.shape[0] == 0:
+ return 0
+
+ # NOTE: We assume arrays are sorted; if they are not this will break
+ i1 = 0
+ i2 = 0
+ limit1 = ar1.shape[0] - 1
+ limit2 = ar2.shape[0] - 1
+ j1 = ar1[i1]
+ j2 = ar2[i2]
+
+ result = 0
+
+ while True:
+ if j1 == j2:
+ result += 1
+ if i1 < limit1:
+ i1 += 1
+ j1 = ar1[i1]
+ else:
+ break
+
+ if i2 < limit2:
+ i2 += 1
+ j2 = ar2[i2]
+ else:
+ break
+
+ elif j1 < j2 and i1 < limit1:
+ i1 += 1
+ j1 = ar1[i1]
+ elif j2 < j1 and i2 < limit2:
+ i2 += 1
+ j2 = ar2[i2]
+ else:
+ break
+
+ return result
+
@numba.njit(
[
@@ -147,7 +201,6 @@ def sparse_sum(ind1, data1, ind2, data2):
def sparse_diff(ind1, data1, ind2, data2):
return sparse_sum(ind1, data1, ind2, -data2)
-
@numba.njit(
[
# "Tuple((i4[::1],f4[::1]))(i4[::1],f4[::1],i4[::1],f4[::1])",
@@ -199,6 +252,63 @@ def sparse_mul(ind1, data1, ind2, data2):
return result_ind, result_data
+@numba.njit(
+ [
+ # "Tuple((i4[::1],f4[::1]))(i4[::1],f4[::1],i4[::1],f4[::1])",
+ numba.types.float32(
+ numba.types.Array(numba.types.int32, 1, "C", readonly=True),
+ numba.types.Array(numba.types.float32, 1, "C", readonly=True),
+ numba.types.Array(numba.types.int32, 1, "C", readonly=True),
+ numba.types.Array(numba.types.float32, 1, "C", readonly=True),
+ )
+ ],
+ fastmath=True,
+ locals={
+ "result": numba.types.float32,
+ "val": numba.types.float32,
+ "i1": numba.types.uint16,
+ "i2": numba.types.uint16,
+ "j1": numba.types.int32,
+ "j2": numba.types.int32,
+ },
+ cache=True,
+)
+def sparse_dot_product(ind1, data1, ind2, data2):
+ dim1 = ind1.shape[0]
+ dim2 = ind2.shape[0]
+
+ result = 0.0
+
+ i1 = 0
+ i2 = 0
+ j1 = ind1[i1]
+ j2 = ind2[i2]
+
+ # pass through both index lists
+ while True:
+ if j1 == j2:
+ val = data1[i1] * data2[i2]
+ result += val
+ i1 += 1
+ if i1 >= dim1:
+ return result
+ j1 = ind1[i1]
+ i2 += 1
+ if i2 >= dim2:
+ return result
+ j2 = ind2[i2]
+ elif j1 < j2:
+ i1 += 1
+ if i1 >= dim1:
+ return result
+ j1 = ind1[i1]
+ else:
+ i2 += 1
+ if i2 >= dim2:
+ return result
+ j2 = ind2[i2]
+
+ return result # unreachable
# Return dense vectors supported on the union of the non-zero valued indices
@numba.njit()
@@ -380,8 +490,8 @@ def sparse_bray_curtis(ind1, data1, ind2, data2): # pragma: no cover
@numba.njit()
def sparse_jaccard(ind1, data1, ind2, data2):
- num_non_zero = arr_union(ind1, ind2).shape[0]
- num_equal = arr_intersect(ind1, ind2).shape[0]
+ num_equal = fast_intersection_size(ind1, ind2)
+ num_non_zero = ind1.shape[0] + ind2.shape[0] - num_equal
if num_non_zero == 0:
return 0.0
@@ -403,24 +513,28 @@ def sparse_jaccard(ind1, data1, ind2, data2):
locals={"num_non_zero": numba.types.intp, "num_equal": numba.types.intp},
)
def sparse_alternative_jaccard(ind1, data1, ind2, data2):
- num_non_zero = arr_union(ind1, ind2).shape[0]
- num_equal = arr_intersect(ind1, ind2).shape[0]
+ num_equal = fast_intersection_size(ind1, ind2)
+ num_non_zero = ind1.shape[0] + ind2.shape[0] - num_equal
if num_non_zero == 0:
return 0.0
+ elif num_equal == 0:
+ return FLOAT32_MAX
else:
return -np.log2(num_equal / num_non_zero)
+ # return (num_non_zero - num_equal) / num_equal
@numba.vectorize(fastmath=True)
def correct_alternative_jaccard(v):
return 1.0 - pow(2.0, -v)
+ # return v / (v + 1)
@numba.njit()
def sparse_matching(ind1, data1, ind2, data2, n_features):
- num_true_true = arr_intersect(ind1, ind2).shape[0]
- num_non_zero = arr_union(ind1, ind2).shape[0]
+ num_true_true = fast_intersection_size(ind1, ind2)
+ num_non_zero = ind1.shape[0] + ind2.shape[0] - num_true_true
num_not_equal = num_non_zero - num_true_true
return float(num_not_equal) / n_features
@@ -428,8 +542,8 @@ def sparse_matching(ind1, data1, ind2, data2, n_features):
@numba.njit()
def sparse_dice(ind1, data1, ind2, data2):
- num_true_true = arr_intersect(ind1, ind2).shape[0]
- num_non_zero = arr_union(ind1, ind2).shape[0]
+ num_true_true = fast_intersection_size(ind1, ind2)
+ num_non_zero = ind1.shape[0] + ind2.shape[0] - num_true_true
num_not_equal = num_non_zero - num_true_true
if num_not_equal == 0.0:
@@ -440,8 +554,8 @@ def sparse_dice(ind1, data1, ind2, data2):
@numba.njit()
def sparse_kulsinski(ind1, data1, ind2, data2, n_features):
- num_true_true = arr_intersect(ind1, ind2).shape[0]
- num_non_zero = arr_union(ind1, ind2).shape[0]
+ num_true_true = fast_intersection_size(ind1, ind2)
+ num_non_zero = ind1.shape[0] + ind2.shape[0] - num_true_true
num_not_equal = num_non_zero - num_true_true
if num_not_equal == 0:
@@ -454,8 +568,8 @@ def sparse_kulsinski(ind1, data1, ind2, data2, n_features):
@numba.njit()
def sparse_rogers_tanimoto(ind1, data1, ind2, data2, n_features):
- num_true_true = arr_intersect(ind1, ind2).shape[0]
- num_non_zero = arr_union(ind1, ind2).shape[0]
+ num_true_true = fast_intersection_size(ind1, ind2)
+ num_non_zero = ind1.shape[0] + ind2.shape[0] - num_true_true
num_not_equal = num_non_zero - num_true_true
return (2.0 * num_not_equal) / (n_features + num_not_equal)
@@ -466,7 +580,7 @@ def sparse_russellrao(ind1, data1, ind2, data2, n_features):
if ind1.shape[0] == ind2.shape[0] and np.all(ind1 == ind2):
return 0.0
- num_true_true = arr_intersect(ind1, ind2).shape[0]
+ num_true_true = fast_intersection_size(ind1, ind2)
if num_true_true == np.sum(data1 != 0) and num_true_true == np.sum(data2 != 0):
return 0.0
@@ -476,8 +590,8 @@ def sparse_russellrao(ind1, data1, ind2, data2, n_features):
@numba.njit()
def sparse_sokal_michener(ind1, data1, ind2, data2, n_features):
- num_true_true = arr_intersect(ind1, ind2).shape[0]
- num_non_zero = arr_union(ind1, ind2).shape[0]
+ num_true_true = fast_intersection_size(ind1, ind2)
+ num_non_zero = ind1.shape[0] + ind2.shape[0] - num_true_true
num_not_equal = num_non_zero - num_true_true
return (2.0 * num_not_equal) / (n_features + num_not_equal)
@@ -485,8 +599,8 @@ def sparse_sokal_michener(ind1, data1, ind2, data2, n_features):
@numba.njit()
def sparse_sokal_sneath(ind1, data1, ind2, data2):
- num_true_true = arr_intersect(ind1, ind2).shape[0]
- num_non_zero = arr_union(ind1, ind2).shape[0]
+ num_true_true = fast_intersection_size(ind1, ind2)
+ num_non_zero = ind1.shape[0] + ind2.shape[0] - num_true_true
num_not_equal = num_non_zero - num_true_true
if num_not_equal == 0.0:
@@ -553,11 +667,7 @@ def sparse_correct_alternative_cosine(d):
@numba.njit()
def sparse_dot(ind1, data1, ind2, data2):
- _, aux_data = sparse_mul(ind1, data1, ind2, data2)
- result = 0.0
-
- for val in aux_data:
- result += val
+ result = sparse_dot_product(ind1, data1, ind2, data2)
return 1.0 - result
@@ -567,16 +677,10 @@ def sparse_dot(ind1, data1, ind2, data2):
fastmath=True,
locals={
"result": numba.types.float32,
- "dim": numba.types.intp,
- "i": numba.types.uint16,
},
)
def sparse_alternative_dot(ind1, data1, ind2, data2):
- _, aux_data = sparse_mul(ind1, data1, ind2, data2)
- result = 0.0
- dim = len(aux_data)
- for i in range(dim):
- result += aux_data[i]
+ result = sparse_dot_product(ind1, data1, ind2, data2)
if result <= 0.0:
return FLOAT32_MAX
@@ -758,6 +862,65 @@ def sparse_kantorovich(ind1, data1, ind2, data2, ground_metric=dummy_ground_metr
return kantorovich(data1, data2, cost_matrix)
+@numba.njit()
+def sparse_wasserstein_1d(ind1, data1, ind2, data2, p=1):
+ result = 0.0
+ old_ind = 0
+ delta = 0.0
+ i1 = 0
+ i2 = 0
+ cdf1 = 0.0
+ cdf2 = 0.0
+ l1_norm_x = np.sum(data1)
+ l1_norm_y = np.sum(data2)
+
+ norm = lambda x, p: np.power(np.abs(x), p)
+
+ # pass through both index lists
+ while i1 < ind1.shape[0] and i2 < ind2.shape[0]:
+ j1 = ind1[i1]
+ j2 = ind2[i2]
+
+ if j1 == j2:
+ result += delta * (j1 - old_ind)
+ cdf1 += data1[i1] / l1_norm_x
+ cdf2 += data2[i2] / l1_norm_y
+ delta = norm(cdf1 - cdf2, p)
+ old_ind = j1
+ i1 += 1
+ i2 += 1
+ elif j1 < j2:
+ result += delta * (j1 - old_ind)
+ cdf1 += data1[i1] / l1_norm_x
+ delta = norm(cdf1 - cdf2, p)
+ old_ind = j1
+ i1 += 1
+ else:
+ result += delta * (j2 - old_ind)
+ cdf2 += data2[i2] / l1_norm_y
+ delta = norm(cdf1 - cdf2, p)
+ old_ind = j2
+ i2 += 1
+ # pass over the tails
+ while i1 < ind1.shape[0]:
+ j1 = ind1[i1]
+ result += delta * (j1 - old_ind)
+ cdf1 += data1[i1] / l1_norm_x
+ delta = norm(cdf1 - cdf2, p)
+ old_ind = j1
+ i1 += 1
+
+ while i2 < ind2.shape[0]:
+ j2 = ind2[i2]
+ result += delta * (j2 - old_ind)
+ cdf2 += data2[i2] / l1_norm_y
+ delta = norm(cdf1 - cdf2, p)
+ old_ind = j2
+ i2 += 1
+
+ return np.power(result, 1.0 / p)
+
+
# Because of the EPS values and the need to normalize after adding them (and then average those for jensen_shannon)
# it seems like we might as well just take the dense union (dense vectors supported on the union of indices)
# and call the dense distance functions
@@ -917,6 +1080,10 @@ sparse_named_distances = {
# Distribution distances
"kantorovich": sparse_kantorovich,
"wasserstein": sparse_kantorovich,
+ "wasserstein_1d": sparse_wasserstein_1d,
+ "wasserstein-1d": sparse_wasserstein_1d,
+ "kantorovich-1d": sparse_wasserstein_1d,
+ "kantorovich-1d": sparse_wasserstein_1d,
"hellinger": sparse_hellinger,
"jensen-shannon": sparse_jensen_shannon_divergence,
"jensen_shannon": sparse_jensen_shannon_divergence,
=====================================
pynndescent/tests/test_distances.py
=====================================
@@ -366,9 +366,29 @@ def test_sparse_jensen_shannon():
/ 2.0
)
d2 = spdist.sparse_jensen_shannon_divergence(
- sparse_test_data[i].indices,
- sparse_test_data[i].data,
- sparse_test_data[j].indices,
- sparse_test_data[j].data,
- )
+ sparse_test_data[i].indices,
+ sparse_test_data[i].data,
+ sparse_test_data[j].indices,
+ sparse_test_data[j].data,
+ )
assert np.isclose(d1, d2, rtol=1e-3)
+
+
+@pytest.mark.parametrize("p", [1.0, 2.0, 3.0, 0.5])
+def test_wasserstein_1d(p):
+ test_data = np.random.random(size=(10, 100))
+ # sparsify
+ test_data[test_data <= 0.5] = 0.0
+ sparse_test_data = csr_matrix(test_data)
+
+ for i in range(test_data.shape[0]):
+ for j in range(i + 1, test_data.shape[0]):
+ d1 = dist.wasserstein_1d(test_data[i], test_data[j], p)
+ d2 = spdist.sparse_wasserstein_1d(
+ sparse_test_data[i].indices,
+ sparse_test_data[i].data,
+ sparse_test_data[j].indices,
+ sparse_test_data[j].data,
+ p,
+ )
+ assert np.isclose(d1, d2)
=====================================
setup.py
=====================================
@@ -8,7 +8,7 @@ def readme():
configuration = {
"name": "pynndescent",
- "version": "0.5.5",
+ "version": "0.5.6",
"description": "Nearest Neighbor Descent",
"long_description": readme(),
"classifiers": [
View it on GitLab: https://salsa.debian.org/python-team/packages/python-pynndescent/-/commit/a90e87aed0b939012379dc24250bf52fe3744ddf