[med-svn] [Git][python-team/packages/python-pynndescent][upstream] New upstream version 0.5.6

Andreas Tille (@tille) gitlab at salsa.debian.org
Fri Feb 18 06:25:39 GMT 2022



Andreas Tille pushed to branch upstream at Debian Python Team / packages / python-pynndescent


Commits:
a90e87ae by Andreas Tille at 2022-02-18T07:11:26+01:00
New upstream version 0.5.6
- - - - -


9 changed files:

- PKG-INFO
- README.rst
- pynndescent.egg-info/PKG-INFO
- pynndescent/distances.py
- pynndescent/pynndescent_.py
- pynndescent/rp_trees.py
- pynndescent/sparse.py
- pynndescent/tests/test_distances.py
- setup.py


Changes:

=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 1.2
 Name: pynndescent
-Version: 0.5.5
+Version: 0.5.6
 Summary: Nearest Neighbor Descent
 Home-page: http://github.com/lmcinnes/pynndescent
 Author: Leland McInnes
@@ -13,12 +13,9 @@ Description: .. image:: doc/pynndescent_logo.png
           :align: center
           :alt: PyNNDescent Logo
         
-        .. image:: https://travis-ci.com/lmcinnes/pynndescent.svg
-            :target: https://travis-ci.com/lmcinnes/pynndescent
-            :alt: Travis Build Status
-        .. image:: https://ci.appveyor.com/api/projects/status/github/lmcinnes/pynndescent?branch=master&svg=true
-            :target: https://ci.appveyor.com/project/lmcinnes/pynndescent
-            :alt: AppVeyor Build Status
+        .. image:: https://dev.azure.com/lelandmcinnes/UMAP%20project%20builds/_apis/build/status/lmcinnes.pynndescent?branchName=master
+            :target: https://dev.azure.com/lelandmcinnes/UMAP%20project%20builds/_build/latest?definitionId=2&branchName=master
+            :alt: Azure Pipelines Build Status
         .. image:: https://img.shields.io/lgtm/alerts/g/lmcinnes/pynndescent.svg
             :target: https://lgtm.com/projects/g/lmcinnes/pynndescent/alerts
             :alt: LGTM Alerts


=====================================
README.rst
=====================================
@@ -3,12 +3,9 @@
   :align: center
   :alt: PyNNDescent Logo
 
-.. image:: https://travis-ci.com/lmcinnes/pynndescent.svg
-    :target: https://travis-ci.com/lmcinnes/pynndescent
-    :alt: Travis Build Status
-.. image:: https://ci.appveyor.com/api/projects/status/github/lmcinnes/pynndescent?branch=master&svg=true
-    :target: https://ci.appveyor.com/project/lmcinnes/pynndescent
-    :alt: AppVeyor Build Status
+.. image:: https://dev.azure.com/lelandmcinnes/UMAP%20project%20builds/_apis/build/status/lmcinnes.pynndescent?branchName=master
+    :target: https://dev.azure.com/lelandmcinnes/UMAP%20project%20builds/_build/latest?definitionId=2&branchName=master
+    :alt: Azure Pipelines Build Status
 .. image:: https://img.shields.io/lgtm/alerts/g/lmcinnes/pynndescent.svg
     :target: https://lgtm.com/projects/g/lmcinnes/pynndescent/alerts
     :alt: LGTM Alerts


=====================================
pynndescent.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 1.2
 Name: pynndescent
-Version: 0.5.5
+Version: 0.5.6
 Summary: Nearest Neighbor Descent
 Home-page: http://github.com/lmcinnes/pynndescent
 Author: Leland McInnes
@@ -13,12 +13,9 @@ Description: .. image:: doc/pynndescent_logo.png
           :align: center
           :alt: PyNNDescent Logo
         
-        .. image:: https://travis-ci.com/lmcinnes/pynndescent.svg
-            :target: https://travis-ci.com/lmcinnes/pynndescent
-            :alt: Travis Build Status
-        .. image:: https://ci.appveyor.com/api/projects/status/github/lmcinnes/pynndescent?branch=master&svg=true
-            :target: https://ci.appveyor.com/project/lmcinnes/pynndescent
-            :alt: AppVeyor Build Status
+        .. image:: https://dev.azure.com/lelandmcinnes/UMAP%20project%20builds/_apis/build/status/lmcinnes.pynndescent?branchName=master
+            :target: https://dev.azure.com/lelandmcinnes/UMAP%20project%20builds/_build/latest?definitionId=2&branchName=master
+            :alt: Azure Pipelines Build Status
         .. image:: https://img.shields.io/lgtm/alerts/g/lmcinnes/pynndescent.svg
             :target: https://lgtm.com/projects/g/lmcinnes/pynndescent/alerts
             :alt: LGTM Alerts


=====================================
pynndescent/distances.py
=====================================
@@ -51,7 +51,6 @@ def euclidean(x, y):
         "dim": numba.types.intp,
         "i": numba.types.uint16,
     },
-
 )
 def squared_euclidean(x, y):
     r"""Squared euclidean distance.
@@ -235,7 +234,6 @@ def jaccard(x, y):
         "dim": numba.types.intp,
         "i": numba.types.uint16,
     },
-
 )
 def alternative_jaccard(x, y):
     num_non_zero = 0.0
@@ -421,7 +419,6 @@ def cosine(x, y):
         "dim": numba.types.intp,
         "i": numba.types.uint16,
     },
-
 )
 def alternative_cosine(x, y):
     result = 0.0
@@ -452,7 +449,6 @@ def alternative_cosine(x, y):
         "dim": numba.types.intp,
         "i": numba.types.uint16,
     },
-
 )
 def dot(x, y):
     result = 0.0
@@ -480,7 +476,6 @@ def dot(x, y):
         "dim": numba.types.intp,
         "i": numba.types.uint16,
     },
-
 )
 def alternative_dot(x, y):
     result = 0.0
@@ -815,6 +810,66 @@ def jensen_shannon_divergence(x, y):
     return result
 
 
+@numba.njit()
+def wasserstein_1d(x, y, p=1):
+    x_sum = 0.0
+    y_sum = 0.0
+    for i in range(x.shape[0]):
+        x_sum += x[i]
+        y_sum += y[i]
+
+    x_cdf = x / x_sum
+    y_cdf = y / y_sum
+
+    for i in range(1, x_cdf.shape[0]):
+        x_cdf[i] += x_cdf[i - 1]
+        y_cdf[i] += y_cdf[i - 1]
+
+    return minkowski(x_cdf, y_cdf, p)
+
+
+@numba.njit()
+def circular_kantorovich(x, y, p=1):
+    x_sum = 0.0
+    y_sum = 0.0
+    for i in range(x.shape[0]):
+        x_sum += x[i]
+        y_sum += y[i]
+
+    x_cdf = x / x_sum
+    y_cdf = y / y_sum
+
+    for i in range(1, x_cdf.shape[0]):
+        x_cdf[i] += x_cdf[i - 1]
+        y_cdf[i] += y_cdf[i - 1]
+
+    mu = np.median((x_cdf - y_cdf) ** p)
+
+    # Now we just want minkowski distance on the CDFs shifted by mu
+    result = 0.0
+    if p > 2:
+        for i in range(x_cdf.shape[0]):
+            result += np.abs(x_cdf[i] - y_cdf[i] - mu) ** p
+
+        return result ** (1.0 / p)
+
+    elif p == 2:
+        for i in range(x_cdf.shape[0]):
+            val = x_cdf[i] - y_cdf[i] - mu
+            result += val * val
+
+        return np.sqrt(result)
+
+    elif p == 1:
+        for i in range(x_cdf.shape[0]):
+            result += np.abs(x_cdf[i] - y_cdf[i] - mu)
+
+        return result
+
+    else:
+        raise ValueError("Invalid p supplied to Kantorvich distance")
+
+
 @numba.njit()
 def symmetric_kl_divergence(x, y):
     result = 0.0
@@ -873,6 +928,12 @@ named_distances = {
     "hellinger": hellinger,
     "kantorovich": kantorovich,
     "wasserstein": kantorovich,
+    "wasserstein_1d": wasserstein_1d,
+    "wasserstein-1d": wasserstein_1d,
+    "kantorovich-1d": wasserstein_1d,
+    "kantorovich_1d": wasserstein_1d,
+    "circular_kantorovich": circular_kantorovich,
+    "circular_wasserstein": circular_kantorovich,
     "sinkhorn": sinkhorn,
     "jensen-shannon": jensen_shannon_divergence,
     "jensen_shannon": jensen_shannon_divergence,


=====================================
pynndescent/pynndescent_.py
=====================================
@@ -9,7 +9,7 @@ import numpy as np
 from sklearn.utils import check_random_state, check_array
 from sklearn.preprocessing import normalize
 from sklearn.base import BaseEstimator, TransformerMixin
-from scipy.sparse import csr_matrix, coo_matrix, isspmatrix_csr, vstack as sparse_vstack
+from scipy.sparse import csr_matrix, coo_matrix, isspmatrix_csr, vstack as sparse_vstack, issparse
 
 import heapq
 
@@ -58,6 +58,11 @@ FLOAT32_EPS = np.finfo(np.float32).eps
 EMPTY_GRAPH = make_heap(1, 1)
 
 
+def is_c_contiguous(array_like):
+    flags = getattr(array_like, "flags", None)
+    return flags is not None and flags["C_CONTIGUOUS"]
+
+
 @numba.njit(parallel=True, cache=True)
 def generate_leaf_updates(leaf_block, dist_thresholds, data, dist):
 
@@ -496,7 +501,7 @@ def resort_tree_indices(tree, tree_order):
     return new_tree
 
 
-class NNDescent(object):
+class NNDescent:
     """NNDescent for fast approximate nearest neighbor queries. NNDescent is
     very flexible and supports a wide variety of distances, including
     non-metric distances. NNDescent also scales well against high dimensional
@@ -505,7 +510,7 @@ class NNDescent(object):
 
     Parameters
     ----------
-    data: array os shape (n_samples, n_features)
+    data: array of shape (n_samples, n_features)
         The training data set to find nearest neighbors in.
 
     metric: string or callable (optional, default='euclidean')
@@ -534,6 +539,7 @@ class NNDescent(object):
             * sokalsneath
             * yule
             * hellinger
+            * wasserstein-1d
         Metrics that take arguments (such as minkowski, mahalanobis etc.)
         can have arguments passed via the metric_kwds dictionary. At this
         time care must be taken and dictionary elements must be ordered
@@ -595,14 +601,11 @@ class NNDescent(object):
         The ``'alternative'`` algorithm has been deprecated and is no longer
         available.
 
-    low_memory: boolean (optional, default=False)
+    low_memory: boolean (optional, default=True)
         Whether to use a lower memory, but more computationally expensive
-        approach to index construction. This defaults to false as for most
-        cases it speeds index construction, but if you are having issues
-        with excessive memory use for your dataset consider setting this
-        to True.
+        approach to index construction.
 
-    max_candidates: int (optional, default=20)
+    max_candidates: int (optional, default=None)
         Internally each "self-join" keeps a maximum number of candidates (
         nearest neighbors and reverse nearest neighbors) to be considered.
         This value controls this aspect of the algorithm. Larger values will
@@ -633,6 +636,11 @@ class NNDescent(object):
         result in a significantly smaller index, particularly useful for saving,
         but will remove information that might otherwise be useful.
 
+    parallel_batch_queries: bool (optional, default=False)
+        Whether to process batched queries in parallel. This can be useful for
+        large batches of queries on multicore machines, but it degrades
+        performance for single queries, making it a poor choice for streaming use.
+
     verbose: bool (optional, default=False)
         Whether to print status data during the computation.
     """
@@ -656,7 +664,8 @@ class NNDescent(object):
         n_iters=None,
         delta=0.001,
         n_jobs=None,
-        compressed=True,
+        compressed=False,
+        parallel_batch_queries=False,
         verbose=False,
     ):
 
@@ -681,8 +690,14 @@ class NNDescent(object):
         self.dim = data.shape[1]
         self.n_jobs = n_jobs
         self.compressed = compressed
+        self.parallel_batch_queries = parallel_batch_queries
         self.verbose = verbose
 
+        if getattr(data, "dtype", None) == np.float32 and (issparse(data) or is_c_contiguous(data)):
+            copy_on_normalize = True
+        else:
+            copy_on_normalize = False
+
         data = check_array(data, dtype=np.float32, accept_sparse="csr", order="C")
         self._raw_data = data
 
@@ -739,7 +754,7 @@ class NNDescent(object):
             self._angular_trees = False
 
         if metric == "dot":
-            data = normalize(data, norm="l2", copy=False)
+            data = normalize(data, norm="l2", copy=copy_on_normalize)
 
         self.rng_state = current_random_state.randint(INT32_MIN, INT32_MAX, 3).astype(
             np.int64
@@ -1171,6 +1186,7 @@ class NNDescent(object):
         indices = self._search_graph.indices
         dist = self._distance_func
         n_neighbors = self.n_neighbors
+        parallel_search = self.parallel_batch_queries
 
         @numba.njit(
             fastmath=True,
@@ -1193,6 +1209,7 @@ class NNDescent(object):
                 "distance_bound": numba.types.float32,
                 "seed_scale": numba.types.float32,
             },
+            parallel=self.parallel_batch_queries,
         )
         def search_closure(query_points, k, epsilon, visited, rng_state):
 
@@ -1200,8 +1217,14 @@ class NNDescent(object):
             distance_scale = 1.0 + epsilon
             internal_rng_state = np.copy(rng_state)
 
-            for i in range(query_points.shape[0]):
-                visited[:] = 0
+            for i in numba.prange(query_points.shape[0]):
+                # Avoid races on visited if parallel
+                if parallel_search:
+                    visited_nodes = np.zeros_like(visited)
+                else:
+                    visited_nodes = visited
+                    visited_nodes[:] = 0
+
                 if dist == alternative_dot or dist == alternative_cosine:
                     norm = np.sqrt((query_points[i] ** 2).sum())
                     if norm > 0.0:
@@ -1225,24 +1248,24 @@ class NNDescent(object):
 
                 for j in range(n_initial_points):
                     candidate = candidate_indices[j]
-                    d = dist(data[candidate], current_query)
+                    d = np.float32(dist(data[candidate], current_query))
                     # indices are guaranteed different
                     simple_heap_push(heap_priorities, heap_indices, d, candidate)
                     heapq.heappush(seed_set, (d, candidate))
-                    mark_visited(visited, candidate)
+                    mark_visited(visited_nodes, candidate)
 
                 if n_random_samples > 0:
                     for j in range(n_random_samples):
                         candidate = np.int32(
                             np.abs(tau_rand_int(internal_rng_state)) % data.shape[0]
                         )
-                        if has_been_visited(visited, candidate) == 0:
-                            d = dist(data[candidate], current_query)
+                        if has_been_visited(visited_nodes, candidate) == 0:
+                            d = np.float32(dist(data[candidate], current_query))
                             simple_heap_push(
                                 heap_priorities, heap_indices, d, candidate
                             )
                             heapq.heappush(seed_set, (d, candidate))
-                            mark_visited(visited, candidate)
+                            mark_visited(visited_nodes, candidate)
 
                 ############ Search ##############
                 distance_bound = distance_scale * heap_priorities[0]
@@ -1256,10 +1279,10 @@ class NNDescent(object):
 
                         candidate = indices[j]
 
-                        if has_been_visited(visited, candidate) == 0:
-                            mark_visited(visited, candidate)
+                        if has_been_visited(visited_nodes, candidate) == 0:
+                            mark_visited(visited_nodes, candidate)
 
-                            d = dist(data[candidate], current_query)
+                            d = np.float32(dist(data[candidate], current_query))
 
                             if d < distance_bound:
                                 simple_heap_push(
@@ -1332,6 +1355,7 @@ class NNDescent(object):
         indices = self._search_graph.indices
         dist = self._distance_func
         n_neighbors = self.n_neighbors
+        parallel_search = self.parallel_batch_queries
 
         @numba.njit(
             fastmath=True,
@@ -1350,6 +1374,7 @@ class NNDescent(object):
                 "distance_scale": numba.types.float32,
                 "seed_scale": numba.types.float32,
             },
+            parallel=self.parallel_batch_queries,
         )
         def search_closure(
             query_inds, query_indptr, query_data, k, epsilon, visited, rng_state
@@ -1361,8 +1386,13 @@ class NNDescent(object):
             distance_scale = 1.0 + epsilon
             internal_rng_state = np.copy(rng_state)
 
-            for i in range(n_query_points):
-                visited[:] = 0
+            for i in numba.prange(n_query_points):
+                # Avoid races on visited if parallel
+                if parallel_search:
+                    visited_nodes = np.zeros_like(visited)
+                else:
+                    visited_nodes = visited
+                    visited_nodes[:] = 0
 
                 current_query_inds = query_inds[query_indptr[i] : query_indptr[i + 1]]
                 current_query_data = query_data[query_indptr[i] : query_indptr[i + 1]]
@@ -1398,20 +1428,20 @@ class NNDescent(object):
                         data_indptr[candidate] : data_indptr[candidate + 1]
                     ]
 
-                    d = dist(
+                    d = np.float32(dist(
                         from_inds, from_data, current_query_inds, current_query_data
-                    )
+                    ))
                     # indices are guaranteed different
                     simple_heap_push(heap_priorities, heap_indices, d, candidate)
                     heapq.heappush(seed_set, (d, candidate))
-                    mark_visited(visited, candidate)
+                    mark_visited(visited_nodes, candidate)
 
                 if n_random_samples > 0:
                     for j in range(n_random_samples):
                         candidate = np.int32(
                             np.abs(tau_rand_int(internal_rng_state)) % n_index_points
                         )
-                        if has_been_visited(visited, candidate) == 0:
+                        if has_been_visited(visited_nodes, candidate) == 0:
                             from_inds = data_inds[
                                 data_indptr[candidate] : data_indptr[candidate + 1]
                             ]
@@ -1419,18 +1449,18 @@ class NNDescent(object):
                                 data_indptr[candidate] : data_indptr[candidate + 1]
                             ]
 
-                            d = dist(
+                            d = np.float32(dist(
                                 from_inds,
                                 from_data,
                                 current_query_inds,
                                 current_query_data,
-                            )
+                            ))
 
                             simple_heap_push(
                                 heap_priorities, heap_indices, d, candidate
                             )
                             heapq.heappush(seed_set, (d, candidate))
-                            mark_visited(visited, candidate)
+                            mark_visited(visited_nodes, candidate)
 
                 ############ Search ##############
                 distance_bound = distance_scale * heap_priorities[0]
@@ -1444,8 +1474,8 @@ class NNDescent(object):
 
                         candidate = indices[j]
 
-                        if has_been_visited(visited, candidate) == 0:
-                            mark_visited(visited, candidate)
+                        if has_been_visited(visited_nodes, candidate) == 0:
+                            mark_visited(visited_nodes, candidate)
 
                             from_inds = data_inds[
                                 data_indptr[candidate] : data_indptr[candidate + 1]
@@ -1454,12 +1484,12 @@ class NNDescent(object):
                                 data_indptr[candidate] : data_indptr[candidate + 1]
                             ]
 
-                            d = dist(
+                            d = np.float32(dist(
                                 from_inds,
                                 from_data,
                                 current_query_inds,
                                 current_query_data,
-                            )
+                            ))
 
                             if d < distance_bound:
                                 simple_heap_push(
@@ -1604,6 +1634,9 @@ class NNDescent(object):
         return indices, dists
 
     def update(self, X):
+        if not hasattr(self, "_search_graph"):
+            self._init_search_graph()
+
         current_random_state = check_random_state(self.random_state)
         rng_state = current_random_state.randint(INT32_MIN, INT32_MAX, 3).astype(
             np.int64
@@ -1705,6 +1738,8 @@ class PyNNDescentTransformer(BaseEstimator, TransformerMixin):
             * sokalmichener
             * sokalsneath
             * yule
+            * hellinger
+            * wasserstein-1d
         Metrics that take arguments (such as minkowski, mahalanobis etc.)
         can have arguments passed via the metric_kwds dictionary. At this
         time care must be taken and dictionary elements must be ordered
@@ -1794,6 +1829,11 @@ class PyNNDescentTransformer(BaseEstimator, TransformerMixin):
         and less accurate searching. Don't tweak this value unless you know
         what you're doing.
 
+    parallel_batch_queries: bool (optional, default=False)
+        Whether to process batched queries in parallel. This can be useful for
+        large batches of queries on multicore machines, but it degrades
+        performance for single queries, making it a poor choice for streaming use.
+
     verbose: bool (optional, default=False)
         Whether to print status data during the computation.
 
@@ -1825,6 +1865,7 @@ class PyNNDescentTransformer(BaseEstimator, TransformerMixin):
         max_candidates=None,
         n_iters=None,
         early_termination_value=0.001,
+        parallel_batch_queries=False,
         verbose=False,
     ):
 
@@ -1844,6 +1885,7 @@ class PyNNDescentTransformer(BaseEstimator, TransformerMixin):
         self.n_iters = n_iters
         self.early_termination_value = early_termination_value
         self.n_jobs = n_jobs
+        self.parallel_batch_queries = parallel_batch_queries
         self.verbose = verbose
 
     def fit(self, X, compress_index=True):
@@ -1892,6 +1934,7 @@ class PyNNDescentTransformer(BaseEstimator, TransformerMixin):
             delta=self.early_termination_value,
             n_jobs=self.n_jobs,
             compressed=compress_index,
+            parallel_batch_queries=self.parallel_batch_queries,
             verbose=self.verbose,
         )
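
A short usage sketch of the new parallel_batch_queries flag (random data and sizes are illustrative; NNDescent.query is the existing query API). The flag switches the compiled search loop to numba.prange, giving each query its own visited buffer to avoid races:

    import numpy as np
    from pynndescent import NNDescent

    data = np.random.random((10000, 20)).astype(np.float32)

    # Worthwhile for large query batches on multicore machines;
    # counterproductive for single, streaming queries.
    index = NNDescent(data, metric="euclidean", parallel_batch_queries=True)

    queries = np.random.random((512, 20)).astype(np.float32)
    neighbors, distances = index.query(queries, k=10)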
 


=====================================
pynndescent/rp_trees.py
=====================================
@@ -8,7 +8,7 @@ import numpy as np
 import numba
 import scipy.sparse
 
-from pynndescent.sparse import sparse_mul, sparse_diff, sparse_sum, arr_intersect
+from pynndescent.sparse import sparse_mul, sparse_diff, sparse_sum, arr_intersect, sparse_dot_product
 from pynndescent.utils import tau_rand_int, norm
 import joblib
 
@@ -908,10 +908,7 @@ def sparse_select_side(hyperplane, offset, point_inds, point_data, rng_state):
     hyperplane_inds = hyperplane[0, :hyperplane_size].astype(np.int32)
     hyperplane_data = hyperplane[1, :hyperplane_size]
 
-    _, aux_data = sparse_mul(hyperplane_inds, hyperplane_data, point_inds, point_data)
-
-    for val in aux_data:
-        margin += val
+    margin += sparse_dot_product(hyperplane_inds, hyperplane_data, point_inds, point_data)
 
     if abs(margin) < EPS:
         side = tau_rand_int(rng_state) % 2
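
For reference, the semantics of the sparse_dot_product helper used here, stated as a hypothetical plain-NumPy equivalent (not part of the package), under the same assumption the numba helper makes, namely sorted CSR-style index arrays:

    import numpy as np

    def sparse_dot_reference(ind1, data1, ind2, data2):
        # Dot product of two sparse vectors given their sorted index and
        # data arrays; mirrors the merge loop in pynndescent.sparse.
        _, i1, i2 = np.intersect1d(ind1, ind2, return_indices=True)
        return float(np.dot(data1[i1], data2[i2]))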


=====================================
pynndescent/sparse.py
=====================================
@@ -53,6 +53,60 @@ def arr_intersect(ar1, ar2):
     aux.sort()
     return aux[:-1][aux[1:] == aux[:-1]]
 
+# Some metrics only need the size of the index intersection; compute it
+# directly, assuming sorted index arrays for speed.
+@numba.njit(
+    [
+        "i4(i4[:],i4[:])",
+        numba.types.int32(
+            numba.types.Array(numba.types.int32, 1, "C", readonly=True),
+            numba.types.Array(numba.types.int32, 1, "C", readonly=True),
+        ),
+    ],
+    locals={
+        "i1": numba.uint16,
+        "i2": numba.uint16,
+    }
+)
+def fast_intersection_size(ar1, ar2):
+    if ar1.shape[0] == 0 or ar2.shape[0] == 0:
+        return 0
+
+    # NOTE: We assume arrays are sorted; if they are not this will break
+    i1 = 0
+    i2 = 0
+    limit1 = ar1.shape[0] - 1
+    limit2 = ar2.shape[0] - 1
+    j1 = ar1[i1]
+    j2 = ar2[i2]
+
+    result = 0
+
+    while True:
+        if j1 == j2:
+            result += 1
+            if i1 < limit1:
+                i1 += 1
+                j1 = ar1[i1]
+            else:
+                break
+
+            if i2 < limit2:
+                i2 += 1
+                j2 = ar2[i2]
+            else:
+                break
+
+        elif j1 < j2 and i1 < limit1:
+            i1 += 1
+            j1 = ar1[i1]
+        elif j2 < j1 and i2 < limit2:
+            i2 += 1
+            j2 = ar2[i2]
+        else:
+            break
+
+    return result
+
 
 @numba.njit(
     [
@@ -147,7 +201,6 @@ def sparse_sum(ind1, data1, ind2, data2):
 def sparse_diff(ind1, data1, ind2, data2):
     return sparse_sum(ind1, data1, ind2, -data2)
 
-
 @numba.njit(
     [
         # "Tuple((i4[::1],f4[::1]))(i4[::1],f4[::1],i4[::1],f4[::1])",
@@ -199,6 +252,63 @@ def sparse_mul(ind1, data1, ind2, data2):
 
     return result_ind, result_data
 
+@numba.njit(
+    [
+        # "Tuple((i4[::1],f4[::1]))(i4[::1],f4[::1],i4[::1],f4[::1])",
+        numba.types.float32(
+            numba.types.Array(numba.types.int32, 1, "C", readonly=True),
+            numba.types.Array(numba.types.float32, 1, "C", readonly=True),
+            numba.types.Array(numba.types.int32, 1, "C", readonly=True),
+            numba.types.Array(numba.types.float32, 1, "C", readonly=True),
+        )
+    ],
+    fastmath=True,
+    locals={
+        "result": numba.types.float32,
+        "val": numba.types.float32,
+        "i1": numba.types.uint16,
+        "i2": numba.types.uint16,
+        "j1": numba.types.int32,
+        "j2": numba.types.int32,
+    },
+    cache=True,
+)
+def sparse_dot_product(ind1, data1, ind2, data2):
+    dim1 = ind1.shape[0]
+    dim2 = ind2.shape[0]
+
+    result = 0.0
+
+    i1 = 0
+    i2 = 0
+    j1 = ind1[i1]
+    j2 = ind2[i2]
+
+    # pass through both index lists
+    while True:
+        if j1 == j2:
+            val = data1[i1] * data2[i2]
+            result += val
+            i1 += 1
+            if i1 >= dim1:
+                return result
+            j1 = ind1[i1]
+            i2 += 1
+            if i2 >= dim2:
+                return result
+            j2 = ind2[i2]
+        elif j1 < j2:
+            i1 += 1
+            if i1 >= dim1:
+                return result
+            j1 = ind1[i1]
+        else:
+            i2 += 1
+            if i2 >= dim2:
+                return result
+            j2 = ind2[i2]
+
+    return result # unreachable
 
 # Return dense vectors supported on the union of the non-zero valued indices
 @numba.njit()
@@ -380,8 +490,8 @@ def sparse_bray_curtis(ind1, data1, ind2, data2):  # pragma: no cover
 
 @numba.njit()
 def sparse_jaccard(ind1, data1, ind2, data2):
-    num_non_zero = arr_union(ind1, ind2).shape[0]
-    num_equal = arr_intersect(ind1, ind2).shape[0]
+    num_equal = fast_intersection_size(ind1, ind2)
+    num_non_zero = ind1.shape[0] + ind2.shape[0] - num_equal
 
     if num_non_zero == 0:
         return 0.0
@@ -403,24 +513,28 @@ def sparse_jaccard(ind1, data1, ind2, data2):
     locals={"num_non_zero": numba.types.intp, "num_equal": numba.types.intp},
 )
 def sparse_alternative_jaccard(ind1, data1, ind2, data2):
-    num_non_zero = arr_union(ind1, ind2).shape[0]
-    num_equal = arr_intersect(ind1, ind2).shape[0]
+    num_equal = fast_intersection_size(ind1, ind2)
+    num_non_zero = ind1.shape[0] + ind2.shape[0] - num_equal
 
     if num_non_zero == 0:
         return 0.0
+    elif num_equal == 0:
+        return FLOAT32_MAX
     else:
         return -np.log2(num_equal / num_non_zero)
+        # return (num_non_zero - num_equal) / num_equal
 
 
 @numba.vectorize(fastmath=True)
 def correct_alternative_jaccard(v):
     return 1.0 - pow(2.0, -v)
+    # return v / (v + 1)
 
 
 @numba.njit()
 def sparse_matching(ind1, data1, ind2, data2, n_features):
-    num_true_true = arr_intersect(ind1, ind2).shape[0]
-    num_non_zero = arr_union(ind1, ind2).shape[0]
+    num_true_true = fast_intersection_size(ind1, ind2)
+    num_non_zero = ind1.shape[0] + ind2.shape[0] - num_true_true
     num_not_equal = num_non_zero - num_true_true
 
     return float(num_not_equal) / n_features
@@ -428,8 +542,8 @@ def sparse_matching(ind1, data1, ind2, data2, n_features):
 
 @numba.njit()
 def sparse_dice(ind1, data1, ind2, data2):
-    num_true_true = arr_intersect(ind1, ind2).shape[0]
-    num_non_zero = arr_union(ind1, ind2).shape[0]
+    num_true_true = fast_intersection_size(ind1, ind2)
+    num_non_zero = ind1.shape[0] + ind2.shape[0] - num_true_true
     num_not_equal = num_non_zero - num_true_true
 
     if num_not_equal == 0.0:
@@ -440,8 +554,8 @@ def sparse_dice(ind1, data1, ind2, data2):
 
 @numba.njit()
 def sparse_kulsinski(ind1, data1, ind2, data2, n_features):
-    num_true_true = arr_intersect(ind1, ind2).shape[0]
-    num_non_zero = arr_union(ind1, ind2).shape[0]
+    num_true_true = fast_intersection_size(ind1, ind2)
+    num_non_zero = ind1.shape[0] + ind2.shape[0] - num_true_true
     num_not_equal = num_non_zero - num_true_true
 
     if num_not_equal == 0:
@@ -454,8 +568,8 @@ def sparse_kulsinski(ind1, data1, ind2, data2, n_features):
 
 @numba.njit()
 def sparse_rogers_tanimoto(ind1, data1, ind2, data2, n_features):
-    num_true_true = arr_intersect(ind1, ind2).shape[0]
-    num_non_zero = arr_union(ind1, ind2).shape[0]
+    num_true_true = fast_intersection_size(ind1, ind2)
+    num_non_zero = ind1.shape[0] + ind2.shape[0] - num_true_true
     num_not_equal = num_non_zero - num_true_true
 
     return (2.0 * num_not_equal) / (n_features + num_not_equal)
@@ -466,7 +580,7 @@ def sparse_russellrao(ind1, data1, ind2, data2, n_features):
     if ind1.shape[0] == ind2.shape[0] and np.all(ind1 == ind2):
         return 0.0
 
-    num_true_true = arr_intersect(ind1, ind2).shape[0]
+    num_true_true = fast_intersection_size(ind1, ind2)
 
     if num_true_true == np.sum(data1 != 0) and num_true_true == np.sum(data2 != 0):
         return 0.0
@@ -476,8 +590,8 @@ def sparse_russellrao(ind1, data1, ind2, data2, n_features):
 
 @numba.njit()
 def sparse_sokal_michener(ind1, data1, ind2, data2, n_features):
-    num_true_true = arr_intersect(ind1, ind2).shape[0]
-    num_non_zero = arr_union(ind1, ind2).shape[0]
+    num_true_true = fast_intersection_size(ind1, ind2)
+    num_non_zero = ind1.shape[0] + ind2.shape[0] - num_true_true
     num_not_equal = num_non_zero - num_true_true
 
     return (2.0 * num_not_equal) / (n_features + num_not_equal)
@@ -485,8 +599,8 @@ def sparse_sokal_michener(ind1, data1, ind2, data2, n_features):
 
 @numba.njit()
 def sparse_sokal_sneath(ind1, data1, ind2, data2):
-    num_true_true = arr_intersect(ind1, ind2).shape[0]
-    num_non_zero = arr_union(ind1, ind2).shape[0]
+    num_true_true = fast_intersection_size(ind1, ind2)
+    num_non_zero = ind1.shape[0] + ind2.shape[0] - num_true_true
     num_not_equal = num_non_zero - num_true_true
 
     if num_not_equal == 0.0:
@@ -553,11 +667,7 @@ def sparse_correct_alternative_cosine(d):
 
 @numba.njit()
 def sparse_dot(ind1, data1, ind2, data2):
-    _, aux_data = sparse_mul(ind1, data1, ind2, data2)
-    result = 0.0
-
-    for val in aux_data:
-        result += val
+    result = sparse_dot_product(ind1, data1, ind2, data2)
 
     return 1.0 - result
 
@@ -567,16 +677,10 @@ def sparse_dot(ind1, data1, ind2, data2):
     fastmath=True,
     locals={
         "result": numba.types.float32,
-        "dim": numba.types.intp,
-        "i": numba.types.uint16,
     },
 )
 def sparse_alternative_dot(ind1, data1, ind2, data2):
-    _, aux_data = sparse_mul(ind1, data1, ind2, data2)
-    result = 0.0
-    dim = len(aux_data)
-    for i in range(dim):
-        result += aux_data[i]
+    result = sparse_dot_product(ind1, data1, ind2, data2)
 
     if result <= 0.0:
         return FLOAT32_MAX
@@ -758,6 +862,65 @@ def sparse_kantorovich(ind1, data1, ind2, data2, ground_metric=dummy_ground_metr
     return kantorovich(data1, data2, cost_matrix)
 
 
+@numba.njit()
+def sparse_wasserstein_1d(ind1, data1, ind2, data2, p=1):
+    result = 0.0
+    old_ind = 0
+    delta = 0.0
+    i1 = 0
+    i2 = 0
+    cdf1 = 0.0
+    cdf2 = 0.0
+    l1_norm_x = np.sum(data1)
+    l1_norm_y = np.sum(data2)
+
+    norm = lambda x, p: np.power(np.abs(x), p)
+
+    # pass through both index lists
+    while i1 < ind1.shape[0] and i2 < ind2.shape[0]:
+        j1 = ind1[i1]
+        j2 = ind2[i2]
+
+        if j1 == j2:
+            result += delta * (j1 - old_ind)
+            cdf1 += data1[i1] / l1_norm_x
+            cdf2 += data2[i2] / l1_norm_y
+            delta = norm(cdf1 - cdf2, p)
+            old_ind = j1
+            i1 += 1
+            i2 += 1
+        elif j1 < j2:
+            result += delta * (j1 - old_ind)
+            cdf1 += data1[i1] / l1_norm_x
+            delta = norm(cdf1 - cdf2, p)
+            old_ind = j1
+            i1 += 1
+        else:
+            result += delta * (j2 - old_ind)
+            cdf2 += data2[i2] / l1_norm_y
+            delta = norm(cdf1 - cdf2, p)
+            old_ind = j2
+            i2 += 1
+    # pass over the tails
+    while i1 < ind1.shape[0]:
+        j1 = ind1[i1]
+        result += delta * (j1 - old_ind)
+        cdf1 += data1[i1] / l1_norm_x
+        delta = norm(cdf1 - cdf2, p)
+        old_ind = j1
+        i1 += 1
+
+    while i2 < ind2.shape[0]:
+        j2 = ind2[i2]
+        result += delta * (j2 - old_ind)
+        cdf2 += data2[i2] / l1_norm_y
+        delta = norm(cdf1 - cdf2, p)
+        old_ind = j2
+        i2 += 1
+
+    return np.power(result, 1.0 / p)
+
+
 # Because of the EPS values and the need to normalize after adding them (and then average those for jensen_shannon)
 # it seems like we might as well just take the dense union (dense vectors supported on the union of indices)
 # and call the dense distance functions
@@ -917,6 +1080,10 @@ sparse_named_distances = {
     # Distribution distances
     "kantorovich": sparse_kantorovich,
     "wasserstein": sparse_kantorovich,
+    "wasserstein_1d": sparse_wasserstein_1d,
+    "wasserstein-1d": sparse_wasserstein_1d,
+    "kantorovich-1d": sparse_wasserstein_1d,
+    "kantorovich-1d": sparse_wasserstein_1d,
     "hellinger": sparse_hellinger,
     "jensen-shannon": sparse_jensen_shannon_divergence,
     "jensen_shannon": sparse_jensen_shannon_divergence,


=====================================
pynndescent/tests/test_distances.py
=====================================
@@ -366,9 +366,29 @@ def test_sparse_jensen_shannon():
                 / 2.0
             )
             d2 = spdist.sparse_jensen_shannon_divergence(
-                    sparse_test_data[i].indices,
-                    sparse_test_data[i].data,
-                    sparse_test_data[j].indices,
-                    sparse_test_data[j].data,
-                )
+                sparse_test_data[i].indices,
+                sparse_test_data[i].data,
+                sparse_test_data[j].indices,
+                sparse_test_data[j].data,
+            )
             assert np.isclose(d1, d2, rtol=1e-3)
+
+
+@pytest.mark.parametrize("p", [1.0, 2.0, 3.0, 0.5])
+def test_wasserstein_1d(p):
+    test_data = np.random.random(size=(10, 100))
+    # sparsify
+    test_data[test_data <= 0.5] = 0.0
+    sparse_test_data = csr_matrix(test_data)
+
+    for i in range(test_data.shape[0]):
+        for j in range(i + 1, test_data.shape[0]):
+            d1 = dist.wasserstein_1d(test_data[i], test_data[j], p)
+            d2 = spdist.sparse_wasserstein_1d(
+                sparse_test_data[i].indices,
+                sparse_test_data[i].data,
+                sparse_test_data[j].indices,
+                sparse_test_data[j].data,
+                p,
+            )
+            assert np.isclose(d1, d2)


=====================================
setup.py
=====================================
@@ -8,7 +8,7 @@ def readme():
 
 configuration = {
     "name": "pynndescent",
-    "version": "0.5.5",
+    "version": "0.5.6",
     "description": "Nearest Neighbor Descent",
     "long_description": readme(),
     "classifiers": [



View it on GitLab: https://salsa.debian.org/python-team/packages/python-pynndescent/-/commit/a90e87aed0b939012379dc24250bf52fe3744ddf
