[med-svn] [Git][python-team/packages/python-pynndescent][master] 4 commits: New upstream version 0.5.8

Andreas Tille (@tille) gitlab at salsa.debian.org
Tue Jan 17 10:13:15 GMT 2023



Andreas Tille pushed to branch master at Debian Python Team / packages / python-pynndescent


Commits:
82697f2f by Andreas Tille at 2023-01-11T07:34:24+01:00
New upstream version 0.5.8
- - - - -
e3c78e85 by Andreas Tille at 2023-01-11T07:34:24+01:00
routine-update: New upstream version

- - - - -
097e4e29 by Andreas Tille at 2023-01-11T07:34:26+01:00
Update upstream source from tag 'upstream/0.5.8'

Update to upstream version '0.5.8'
with Debian dir 6081afd397d2606d639ba07e0ec411b5b91a2cc6
- - - - -
9321c99e by Andreas Tille at 2023-01-11T07:34:26+01:00
routine-update: Standards-Version: 4.6.2

- - - - -


16 changed files:

- PKG-INFO
- debian/changelog
- debian/control
- pynndescent.egg-info/PKG-INFO
- pynndescent.egg-info/requires.txt
- pynndescent/__init__.py
- pynndescent/distances.py
- pynndescent/pynndescent_.py
- pynndescent/rp_trees.py
- pynndescent/sparse.py
- pynndescent/tests/conftest.py
- pynndescent/tests/test_distances.py
- pynndescent/tests/test_pynndescent_.py
- pynndescent/tests/test_rank.py
- pynndescent/utils.py
- setup.py


Changes:

=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 1.2
 Name: pynndescent
-Version: 0.5.7
+Version: 0.5.8
 Summary: Nearest Neighbor Descent
 Home-page: http://github.com/lmcinnes/pynndescent
 Author: Leland McInnes


=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+python-pynndescent (0.5.8-1) UNRELEASED; urgency=medium
+
+  * New upstream version
+  * Standards-Version: 4.6.2 (routine-update)
+
+ -- Andreas Tille <tille at debian.org>  Wed, 11 Jan 2023 07:34:24 +0100
+
 python-pynndescent (0.5.7-1) unstable; urgency=medium
 
   * New upstream version


=====================================
debian/control
=====================================
@@ -13,7 +13,7 @@ Build-Depends: debhelper-compat (= 13),
                python3-scipy <!nocheck>,
                python3-sklearn <!nocheck>,
                python3-pytest <!nocheck>
-Standards-Version: 4.6.1
+Standards-Version: 4.6.2
 Vcs-Browser: https://salsa.debian.org/python-team/packages/python-pynndescent
 Vcs-Git: https://salsa.debian.org/python-team/packages/python-pynndescent.git
 Homepage: https://github.com/lmcinnes/pynndescent/


=====================================
pynndescent.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 1.2
 Name: pynndescent
-Version: 0.5.7
+Version: 0.5.8
 Summary: Nearest Neighbor Descent
 Home-page: http://github.com/lmcinnes/pynndescent
 Author: Leland McInnes


=====================================
pynndescent.egg-info/requires.txt
=====================================
@@ -3,3 +3,6 @@ scipy>=1.0
 numba>=0.51.2
 llvmlite>=0.30
 joblib>=0.11
+
+[:python_version < "3.8"]
+importlib-metadata>=4.8.1


=====================================
pynndescent/__init__.py
=====================================
@@ -1,7 +1,14 @@
-import pkg_resources
+import sys
+
 import numba
+
 from .pynndescent_ import NNDescent, PyNNDescentTransformer
 
+if sys.version_info[:2] >= (3, 8):
+    import importlib.metadata as importlib_metadata
+else:
+    import importlib_metadata
+
 # Workaround: https://github.com/numba/numba/issues/3341
 if numba.config.THREADING_LAYER == "omp":
     try:
@@ -12,4 +19,4 @@ if numba.config.THREADING_LAYER == "omp":
         # might be a missing symbol due to e.g. tbb libraries missing
         numba.config.THREADING_LAYER = "workqueue"
 
-__version__ = pkg_resources.get_distribution("pynndescent").version
+__version__ = importlib_metadata.version("pynndescent")
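
A note on the hunk above: the pkg_resources lookup is replaced by the
stdlib importlib.metadata, with the importlib-metadata backport (now
declared in requires.txt and setup.py for Python < 3.8) as the fallback.
A minimal sketch of the same pattern, assuming the "pynndescent"
distribution is installed:

    import sys

    if sys.version_info[:2] >= (3, 8):
        import importlib.metadata as importlib_metadata
    else:
        import importlib_metadata  # backport package for older interpreters

    # version() returns the installed distribution's version string and
    # raises importlib_metadata.PackageNotFoundError if it is not installed.
    __version__ = importlib_metadata.version("pynndescent")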


=====================================
pynndescent/distances.py
=====================================
@@ -359,7 +359,7 @@ def haversine(x, y):
         raise ValueError("haversine is only defined for 2 dimensional graph_data")
     sin_lat = np.sin(0.5 * (x[0] - y[0]))
     sin_long = np.sin(0.5 * (x[1] - y[1]))
-    result = np.sqrt(sin_lat ** 2 + np.cos(x[0]) * np.cos(y[0]) * sin_long ** 2)
+    result = np.sqrt(sin_lat**2 + np.cos(x[0]) * np.cos(y[0]) * sin_long**2)
     return 2.0 * np.arcsin(result)
 
 
@@ -565,8 +565,8 @@ def correlation(x, y):
     for i in range(x.shape[0]):
         shifted_x = x[i] - mu_x
         shifted_y = y[i] - mu_y
-        norm_x += shifted_x ** 2
-        norm_y += shifted_y ** 2
+        norm_x += shifted_x**2
+        norm_y += shifted_y**2
         dot_product += shifted_x * shifted_y
 
     if norm_x == 0.0 and norm_y == 0.0:
@@ -744,11 +744,11 @@ def kantorovich(x, y, cost=_dummy_cost, max_iter=100000):
     #     print("WARNING: RESULT MIGHT BE INACCURATE\nMax number of iteration reached!")
     if solve_status == ProblemStatus.INFEASIBLE:
         raise ValueError(
-            "Optimal transport problem was INFEASIBLE. Please check " "inputs."
+            "Optimal transport problem was INFEASIBLE. Please check inputs."
         )
     elif solve_status == ProblemStatus.UNBOUNDED:
         raise ValueError(
-            "Optimal transport problem was UNBOUNDED. Please check " "inputs."
+            "Optimal transport problem was UNBOUNDED. Please check inputs."
         )
     result = total_cost(node_arc_data.flow, node_arc_data.cost)
 


=====================================
pynndescent/pynndescent_.py
=====================================
@@ -9,7 +9,13 @@ import numpy as np
 from sklearn.utils import check_random_state, check_array
 from sklearn.preprocessing import normalize
 from sklearn.base import BaseEstimator, TransformerMixin
-from scipy.sparse import csr_matrix, coo_matrix, isspmatrix_csr, vstack as sparse_vstack, issparse
+from scipy.sparse import (
+    csr_matrix,
+    coo_matrix,
+    isspmatrix_csr,
+    vstack as sparse_vstack,
+    issparse,
+)
 
 import heapq
 
@@ -31,6 +37,7 @@ from pynndescent.utils import (
     apply_graph_updates_high_memory,
     apply_graph_updates_low_memory,
     initalize_heap_from_graph_indices,
+    initalize_heap_from_graph_indices_and_distances,
     sparse_initalize_heap_from_graph_indices,
 )
 
@@ -588,6 +595,16 @@ class NNDescent:
     tree_init: bool (optional, default=True)
         Whether to use random projection trees for initialization.
 
+    init_graph: np.ndarray (optional, default=None)
+        2D array of indices of candidate neighbours of the shape
+        (data.shape[0], n_neighbours). If the j-th neighbour of the i-th
+        instance is unknown, use init_graph[i, j] = -1
+
+    init_dist: np.ndarray (optional, default=None)
+        2D array with the same shape as init_graph,
+        such that metric(data[i], data[init_graph[i, j]]) equals
+        init_dist[i, j]
+
     random_state: int, RandomState instance or None, optional (default: None)
         If int, random_state is the seed used by the random number generator;
         If RandomState instance, random_state is the random number generator;
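
For the new init_graph/init_dist parameters documented above, a minimal
usage sketch (not taken from upstream; the dataset and the exact k-NN
computed with scikit-learn are illustrative assumptions):

    import numpy as np
    from sklearn.neighbors import NearestNeighbors
    from pynndescent import NNDescent

    data = np.random.random((1000, 8)).astype(np.float32)  # placeholder data

    # Candidate neighbours and their distances from some earlier computation;
    # unknown entries would be marked with init_graph[i, j] = -1.
    init_dist, init_graph = NearestNeighbors(n_neighbors=10).fit(data).kneighbors(data)

    # init_dist must have the same shape as init_graph and hold
    # metric(data[i], data[init_graph[i, j]]) for every entry.
    index = NNDescent(
        data,
        metric="euclidean",
        n_neighbors=10,
        init_graph=init_graph,
        init_dist=init_dist.astype(np.float32),
    )

When init_dist is omitted, the distances are recomputed from init_graph
with the configured metric (the initalize_heap_from_graph_indices branch
below).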
@@ -658,6 +675,7 @@ class NNDescent:
         n_search_trees=1,
         tree_init=True,
         init_graph=None,
+        init_dist=None,
         random_state=None,
         low_memory=True,
         max_candidates=None,
@@ -676,6 +694,7 @@ class NNDescent:
             n_iters = max(5, int(round(np.log2(data.shape[0]))))
 
         self.n_trees = n_trees
+        self.n_trees_after_update = max(1, int(np.round(self.n_trees / 3)))
         self.n_neighbors = n_neighbors
         self.metric = metric
         self.metric_kwds = metric_kwds
@@ -693,7 +712,9 @@ class NNDescent:
         self.parallel_batch_queries = parallel_batch_queries
         self.verbose = verbose
 
-        if getattr(data, "dtype", None) == np.float32 and (issparse(data) or is_c_contiguous(data)):
+        if getattr(data, "dtype", None) == np.float32 and (
+            issparse(data) or is_c_contiguous(data)
+        ):
             copy_on_normalize = True
         else:
             copy_on_normalize = False
@@ -880,9 +901,18 @@ class NNDescent:
                 if init_graph.shape[0] != self._raw_data.shape[0]:
                     raise ValueError("Init graph size does not match dataset size!")
                 _init_graph = make_heap(init_graph.shape[0], self.n_neighbors)
-                _init_graph = initalize_heap_from_graph_indices(
-                    _init_graph, init_graph, data, self._distance_func
-                )
+                if init_dist is None:
+                    _init_graph = initalize_heap_from_graph_indices(
+                        _init_graph, init_graph, data, self._distance_func
+                    )
+                elif init_graph.shape != init_dist.shape:
+                    raise ValueError(
+                        "The shapes of init graph and init distances do not match!"
+                    )
+                else:
+                    _init_graph = initalize_heap_from_graph_indices_and_distances(
+                        _init_graph, init_graph, init_dist
+                    )
 
             if verbose:
                 print(ts(), "NN descent for", str(n_iters), "iterations")
@@ -905,8 +935,8 @@ class NNDescent:
         if np.any(self._neighbor_graph[0] < 0):
             warn(
                 "Failed to correctly find n_neighbors for some samples."
-                "Results may be less than ideal. Try re-running with"
-                "different parameters."
+                " Results may be less than ideal. Try re-running with"
+                " different parameters."
             )
 
         numba.set_num_threads(self._original_num_threads)
@@ -960,7 +990,9 @@ class NNDescent:
                         self._angular_trees,
                     )
                     self._search_forest = [
-                        convert_tree_format(tree, self._raw_data.shape[0])
+                        convert_tree_format(
+                            tree, self._raw_data.shape[0], self._raw_data.shape[1]
+                        )
                         for tree in rp_forest
                     ]
                 else:
@@ -979,7 +1011,9 @@ class NNDescent:
                 best_trees = [self._rp_forest[idx] for idx in best_tree_indices]
                 del self._rp_forest
                 self._search_forest = [
-                    convert_tree_format(tree, self._raw_data.shape[0])
+                    convert_tree_format(
+                        tree, self._raw_data.shape[0], self._raw_data.shape[1]
+                    )
                     for tree in best_trees
                 ]
 
@@ -1132,7 +1166,9 @@ class NNDescent:
             if self._is_sparse:
                 self._raw_data = self._raw_data[self._vertex_order, :]
             else:
-                self._raw_data = np.ascontiguousarray(self._raw_data[self._vertex_order, :])
+                self._raw_data = np.ascontiguousarray(
+                    self._raw_data[self._vertex_order, :]
+                )
 
             tree_order = np.argsort(self._vertex_order)
             self._search_forest = tuple(
@@ -1186,6 +1222,7 @@ class NNDescent:
 
             self._tree_search = tree_search_closure
         else:
+
             @numba.njit()
             def tree_search_closure(point, rng_state):
                 return (0, 0)
@@ -1316,9 +1353,12 @@ class NNDescent:
             return result
 
         self._search_function = search_closure
-        self._deheap_function = numba.njit(parallel=self.parallel_batch_queries)(
-            deheap_sort.py_func
-        )
+        if hasattr(deheap_sort, "py_func"):
+            self._deheap_function = numba.njit(parallel=self.parallel_batch_queries)(
+                deheap_sort.py_func
+            )
+        else:
+            self._deheap_function = deheap_sort
 
         # Force compilation of the search function (hardcoded k, epsilon)
         query_data = self._raw_data[:1]
@@ -1367,6 +1407,7 @@ class NNDescent:
 
             self._tree_search = sparse_tree_search_closure
         else:
+
             @numba.njit()
             def sparse_tree_search_closure(point_inds, point_data, rng_state):
                 return (0, 0)
@@ -1426,7 +1467,7 @@ class NNDescent:
                 current_query_data = query_data[query_indptr[i] : query_indptr[i + 1]]
 
                 if dist == alternative_dot or dist == alternative_cosine:
-                    norm = np.sqrt((current_query_data ** 2).sum())
+                    norm = np.sqrt((current_query_data**2).sum())
                     if norm > 0.0:
                         current_query_data = current_query_data / norm
                     else:
@@ -1456,9 +1497,11 @@ class NNDescent:
                         data_indptr[candidate] : data_indptr[candidate + 1]
                     ]
 
-                    d = np.float32(dist(
-                        from_inds, from_data, current_query_inds, current_query_data
-                    ))
+                    d = np.float32(
+                        dist(
+                            from_inds, from_data, current_query_inds, current_query_data
+                        )
+                    )
                     # indices are guaranteed different
                     simple_heap_push(heap_priorities, heap_indices, d, candidate)
                     heapq.heappush(seed_set, (d, candidate))
@@ -1477,12 +1520,14 @@ class NNDescent:
                                 data_indptr[candidate] : data_indptr[candidate + 1]
                             ]
 
-                            d = np.float32(dist(
-                                from_inds,
-                                from_data,
-                                current_query_inds,
-                                current_query_data,
-                            ))
+                            d = np.float32(
+                                dist(
+                                    from_inds,
+                                    from_data,
+                                    current_query_inds,
+                                    current_query_data,
+                                )
+                            )
 
                             simple_heap_push(
                                 heap_priorities, heap_indices, d, candidate
@@ -1512,12 +1557,14 @@ class NNDescent:
                                 data_indptr[candidate] : data_indptr[candidate + 1]
                             ]
 
-                            d = np.float32(dist(
-                                from_inds,
-                                from_data,
-                                current_query_inds,
-                                current_query_data,
-                            ))
+                            d = np.float32(
+                                dist(
+                                    from_inds,
+                                    from_data,
+                                    current_query_inds,
+                                    current_query_data,
+                                )
+                            )
 
                             if d < distance_bound:
                                 simple_heap_push(
@@ -1536,9 +1583,12 @@ class NNDescent:
             return result
 
         self._search_function = search_closure
-        self._deheap_function = numba.njit(parallel=self.parallel_batch_queries)(
-            deheap_sort.py_func
-        )
+        if hasattr(deheap_sort, "py_func"):
+            self._deheap_function = numba.njit(parallel=self.parallel_batch_queries)(
+                deheap_sort.py_func
+            )
+        else:
+            self._deheap_function = deheap_sort
 
         # Force compilation of the search function (hardcoded k, epsilon)
         query_data = self._raw_data[:1]
@@ -1665,29 +1715,120 @@ class NNDescent:
 
         return indices, dists
 
-    def update(self, X):
+    def update(self, xs_fresh=None, xs_updated=None, updated_indices=None):
+        """
+        Updates the index with a) fresh data (that is appended to
+        the existing data), and b) data that was only updated (but should not be appended
+        to the existing data).
+
+        Not applicable to sparse data yet.
+
+        Parameters
+        ----------
+        xs_fresh: np.ndarray (optional, default=None)
+            2D array of the shape (n_fresh, dim) where dim is the dimension
+            of the data from which we built self.
+
+        xs_updated: np.ndarray (optional, default=None)
+            2D array of the shape (n_updates, dim) where dim is the dimension
+            of the data from which we built self.
+
+        updated_indices: array-like of size n_updates (optional, default=None)
+            Something that is convertible to a list of ints.
+            If self is currently built from xs, then xs[updated_indices[i]]
+            will be replaced by xs_updated[i].
+
+        Returns
+        -------
+            None
+        """
         current_random_state = check_random_state(self.random_state)
         rng_state = current_random_state.randint(INT32_MIN, INT32_MAX, 3).astype(
             np.int64
         )
-        X = check_array(X, dtype=np.float32, accept_sparse="csr", order="C")
-
+        error_sparse_to_do = NotImplementedError("Sparse update not complete yet")
+        # input checks
+        if xs_updated is not None:
+            xs_updated = check_array(
+                xs_updated, dtype=np.float32, accept_sparse="csr", order="C"
+            )
+            if updated_indices is None:
+                raise ValueError(
+                    "If xs_updated are provided, updated_indices must also be provided!"
+                )
+            if self._is_sparse:
+                raise error_sparse_to_do
+            else:
+                try:
+                    updated_indices = list(map(int, updated_indices))
+                except (TypeError, ValueError):
+                    raise ValueError(
+                        "Could not convert updated indices to list of int(s)."
+                    )
+                n1 = len(updated_indices)
+                n2 = xs_updated.shape[0]
+                if n1 != n2:
+                    raise ValueError(
+                        f"Number of updated indices ({n1}) must match "
+                        f"number of rows of xs_updated ({n2})."
+                    )
+        else:
+            if updated_indices is not None:
+                warn(
+                    "xs_updated not provided, while update_indices provided. "
+                    "They will be ignored."
+                )
+                updated_indices = None
+        if updated_indices is None:
+            # make an empty iterable instead
+            xs_updated = []
+            updated_indices = []
+        if xs_fresh is None:
+            if self._is_sparse:
+                xs_fresh = csr_matrix(
+                    ([], [], []), shape=(0, self._raw_data.shape[1]), dtype=np.float32
+                )
+            else:
+                xs_fresh = np.zeros((0, self._raw_data.shape[1]), dtype=np.float32)
+        else:
+            xs_fresh = check_array(
+                xs_fresh, dtype=np.float32, accept_sparse="csr", order="C"
+            )
+        # data preparation
         if hasattr(self, "_vertex_order"):
             original_order = np.argsort(self._vertex_order)
         else:
             original_order = np.ones(self._raw_data.shape[0], dtype=np.bool_)
-
         if self._is_sparse:
-            self._raw_data = sparse_vstack([self._raw_data, X])
+            self._raw_data = sparse_vstack([self._raw_data, xs_fresh])
+            if updated_indices:
+                # cannot be reached due to the check above,
+                # but will leave this here as a marker
+                raise error_sparse_to_do
         else:
-            self._raw_data = np.ascontiguousarray(
-                np.vstack([self._raw_data[original_order, :], X])
-            )
-
+            self._raw_data = self._raw_data[original_order, :]
+            for x_updated, i_fresh in zip(xs_updated, updated_indices):
+                self._raw_data[i_fresh] = x_updated
+            self._raw_data = np.ascontiguousarray(np.vstack([self._raw_data, xs_fresh]))
+            ns, ds = self._neighbor_graph
+            n_examples, n_neighbors = ns.shape
+            indices_set = set(updated_indices)  # for fast "is element" checks
+            for i in range(n_examples):
+                # maybe update whole row
+                if i in indices_set:
+                    ns[i] = -1
+                    ds[i] = np.inf
+                    continue
+                # maybe update some columns
+                for j in range(n_neighbors):
+                    if ns[i, j] in indices_set:
+                        ns[i, j] = -1
+                        ds[i, j] = np.inf
+        # update neighbors
         if self._is_sparse:
-            raise NotImplementedError("Sparse update not complete yet")
+            raise error_sparse_to_do
         else:
-            self.n_trees = int(np.round(self.n_trees / 3))
+            self.n_trees = self.n_trees_after_update
             self._rp_forest = make_forest(
                 self._raw_data,
                 self.n_neighbors,
@@ -1728,9 +1869,9 @@ class NNDescent:
             # Remove search graph and search function
             # and rerun prepare if it was run previously
             if (
-                    hasattr(self, "_search_graph") or
-                    hasattr(self, "_search_function") or
-                    hasattr(self, "_search_forest")
+                hasattr(self, "_search_graph")
+                or hasattr(self, "_search_function")
+                or hasattr(self, "_search_forest")
             ):
                 if hasattr(self, "_search_graph"):
                     del self._search_graph
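
The reworked update() above distinguishes rows appended to the index
(xs_fresh) from in-place replacements of existing rows (xs_updated plus
updated_indices), and rebuilds the forest with n_trees_after_update trees.
A brief usage sketch, assuming a dense float32 index; the array names are
illustrative only:

    import numpy as np
    from pynndescent import NNDescent

    xs = np.random.random((500, 5)).astype(np.float32)  # placeholder data
    index = NNDescent(xs, n_neighbors=10, random_state=42)

    # a) append brand-new rows
    xs_fresh = np.random.random((50, 5)).astype(np.float32)
    index.update(xs_fresh=xs_fresh)

    # b) overwrite rows 0, 2, ..., 18 in place; their cached neighbours are
    #    invalidated (set to -1 / inf) and recomputed by the NN-descent pass
    xs_updated = np.random.random((10, 5)).astype(np.float32)
    index.update(xs_updated=xs_updated, updated_indices=list(range(0, 20, 2)))

    # c) both kinds of change in a single call
    index.update(xs_fresh=xs_fresh, xs_updated=xs_updated,
                 updated_indices=list(range(0, 20, 2)))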


=====================================
pynndescent/rp_trees.py
=====================================
@@ -8,7 +8,13 @@ import numpy as np
 import numba
 import scipy.sparse
 
-from pynndescent.sparse import sparse_mul, sparse_diff, sparse_sum, arr_intersect, sparse_dot_product
+from pynndescent.sparse import (
+    sparse_mul,
+    sparse_diff,
+    sparse_sum,
+    arr_intersect,
+    sparse_dot_product,
+)
 from pynndescent.utils import tau_rand_int, norm
 import joblib
 
@@ -908,7 +914,9 @@ def sparse_select_side(hyperplane, offset, point_inds, point_data, rng_state):
     hyperplane_inds = hyperplane[0, :hyperplane_size].astype(np.int32)
     hyperplane_data = hyperplane[1, :hyperplane_size]
 
-    margin += sparse_dot_product(hyperplane_inds, hyperplane_data, point_inds, point_data)
+    margin += sparse_dot_product(
+        hyperplane_inds, hyperplane_data, point_inds, point_data
+    )
 
     if abs(margin) < EPS:
         side = tau_rand_int(rng_state) % 2
@@ -1131,36 +1139,17 @@ def num_nodes_and_leaves(tree):
     return n_nodes, n_leaves
 
 
-@numba.njit(cache=True)
-def dense_hyperplane_dim(hyperplanes):
-    for i in range(len(hyperplanes)):
-        if hyperplanes[i].shape[0] > 1:
-            return hyperplanes[i].shape[0]
-
-    raise ValueError("No hyperplanes of adequate size were found!")
-
-
-@numba.njit(cache=True)
-def sparse_hyperplane_dim(hyperplanes):
-    max_dim = 0
-    for i in range(len(hyperplanes)):
-        if hyperplanes[i].shape[1] > max_dim:
-            max_dim = hyperplanes[i].shape[1]
-    return max_dim
-
-
-def convert_tree_format(tree, data_size):
-
+def convert_tree_format(tree, data_size, data_dim):
     n_nodes, n_leaves = num_nodes_and_leaves(tree)
     is_sparse = False
     if tree.hyperplanes[0].ndim == 1:
         # dense hyperplanes
-        hyperplane_dim = dense_hyperplane_dim(tree.hyperplanes)
+        hyperplane_dim = data_dim
         hyperplanes = np.zeros((n_nodes, hyperplane_dim), dtype=np.float32)
     else:
         # sparse hyperplanes
         is_sparse = True
-        hyperplane_dim = sparse_hyperplane_dim(tree.hyperplanes)
+        hyperplane_dim = data_dim
         hyperplanes = np.zeros((n_nodes, 2, hyperplane_dim), dtype=np.float32)
         hyperplanes[:, 0, :] = -1
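
The convert_tree_format change above takes the hyperplane dimension from
the caller instead of scanning the trees (the removed dense_hyperplane_dim
raised an error on trees that never split, cf. the new test_tree_no_split).
A condensed sketch of the new call, mirroring the updated call sites in
pynndescent_.py; raw_data and rp_forest are placeholder names:

    search_forest = [
        convert_tree_format(tree, raw_data.shape[0], raw_data.shape[1])
        for tree in rp_forest
    ]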
 


=====================================
pynndescent/sparse.py
=====================================
@@ -53,6 +53,7 @@ def arr_intersect(ar1, ar2):
     aux.sort()
     return aux[:-1][aux[1:] == aux[:-1]]
 
+
 # Some things require size of intersection; do this quickly; assume sorted arrays for speed
 @numba.njit(
     [
@@ -65,7 +66,7 @@ def arr_intersect(ar1, ar2):
     locals={
         "i1": numba.uint16,
         "i2": numba.uint16,
-    }
+    },
 )
 def fast_intersection_size(ar1, ar2):
     if ar1.shape[0] == 0 or ar2.shape[0] == 0:
@@ -201,6 +202,7 @@ def sparse_sum(ind1, data1, ind2, data2):
 def sparse_diff(ind1, data1, ind2, data2):
     return sparse_sum(ind1, data1, ind2, -data2)
 
+
 @numba.njit(
     [
         # "Tuple((i4[::1],f4[::1]))(i4[::1],f4[::1],i4[::1],f4[::1])",
@@ -252,6 +254,7 @@ def sparse_mul(ind1, data1, ind2, data2):
 
     return result_ind, result_data
 
+
 @numba.njit(
     [
         # "Tuple((i4[::1],f4[::1]))(i4[::1],f4[::1],i4[::1],f4[::1])",
@@ -308,7 +311,8 @@ def sparse_dot_product(ind1, data1, ind2, data2):
                 return result
             j2 = ind2[i2]
 
-    return result # unreachable
+    return result  # unreachable
+
 
 # Return dense vectors supported on the union of the non-zero valued indices
 @numba.njit()
@@ -717,10 +721,10 @@ def sparse_correlation(ind1, data1, ind2, data2, n_features):
         shifted_data2[i] = data2[i] - mu_y
 
     norm1 = np.sqrt(
-        (norm(shifted_data1) ** 2) + (n_features - ind1.shape[0]) * (mu_x ** 2)
+        (norm(shifted_data1) ** 2) + (n_features - ind1.shape[0]) * (mu_x**2)
     )
     norm2 = np.sqrt(
-        (norm(shifted_data2) ** 2) + (n_features - ind2.shape[0]) * (mu_y ** 2)
+        (norm(shifted_data2) ** 2) + (n_features - ind2.shape[0]) * (mu_y**2)
     )
 
     dot_prod_inds, dot_prod_data = sparse_mul(ind1, shifted_data1, ind2, shifted_data2)


=====================================
pynndescent/tests/conftest.py
=====================================
@@ -61,3 +61,35 @@ def cosine_hang_data():
     this_dir = os.path.dirname(os.path.abspath(__file__))
     data_path = os.path.join(this_dir, "test_data/cosine_hang.npy")
     return np.load(data_path)
+
+
+@pytest.fixture
+def small_data():
+    return np.random.uniform(40, 5, size=(20, 5))
+
+
+@pytest.fixture
+def sparse_small_data():
+    # Too low dim might cause more than one empty row,
+    # which might decrease the computed performance
+    return sparse.random(40, 32, density=0.5, format="csr")
+
+
+@pytest.fixture
+def update_data():
+    np.random.seed(12345)
+    xs_orig = np.random.uniform(0, 1, size=(1000, 5))
+    xs_fresh = np.random.uniform(0, 1, size=xs_orig.shape)
+    xs_fresh_small = np.random.uniform(0, 1, size=(100, xs_orig.shape[1]))
+    xs_for_complete_update = np.random.uniform(0, 1, size=xs_orig.shape)
+    updates = [
+        (xs_orig, None, None, None),
+        (xs_orig, xs_fresh, None, None),
+        (xs_orig, None, xs_for_complete_update, list(range(xs_orig.shape[0]))),
+        (xs_orig, None, -xs_orig[0:50:2], list(range(0, 50, 2))),
+        (xs_orig, None, -xs_orig[0:500:2], list(range(0, 500, 2))),
+        (xs_orig, xs_fresh, xs_for_complete_update, list(range(xs_orig.shape[0]))),
+        (xs_orig, xs_fresh_small, -xs_orig[0:50:2], list(range(0, 50, 2))),
+        (xs_orig, xs_fresh, -xs_orig[0:500:2], list(range(0, 500, 2))),
+    ]
+    return updates


=====================================
pynndescent/tests/test_distances.py
=====================================
@@ -48,7 +48,7 @@ def test_spatial_check(spatial_data, metric):
     assert_array_almost_equal(
         test_matrix,
         dist_matrix,
-        err_msg="Distances don't match " "for metric {}".format(metric),
+        err_msg="Distances don't match for metric {}".format(metric),
     )
 
 
@@ -88,7 +88,7 @@ def test_binary_check(binary_data, metric):
     assert_array_almost_equal(
         test_matrix,
         dist_matrix,
-        err_msg="Distances don't match " "for metric {}".format(metric),
+        err_msg="Distances don't match for metric {}".format(metric),
     )
 
 
@@ -154,7 +154,7 @@ def test_sparse_spatial_check(sparse_spatial_data, metric, decimal=6):
     assert_array_almost_equal(
         test_matrix,
         dist_matrix,
-        err_msg="Sparse distances don't match " "for metric {}".format(metric),
+        err_msg="Sparse distances don't match for metric {}".format(metric),
         decimal=decimal,
     )
 
@@ -219,7 +219,7 @@ def test_sparse_binary_check(sparse_binary_data, metric):
     assert_array_almost_equal(
         test_matrix,
         dist_matrix,
-        err_msg="Sparse distances don't match " "for metric {}".format(metric),
+        err_msg="Sparse distances don't match for metric {}".format(metric),
     )
 
 
@@ -238,7 +238,7 @@ def test_seuclidean(spatial_data):
     assert_array_almost_equal(
         test_matrix,
         dist_matrix,
-        err_msg="Distances don't match " "for metric seuclidean",
+        err_msg="Distances don't match for metric seuclidean",
     )
 
 
@@ -260,7 +260,7 @@ def test_weighted_minkowski(spatial_data):
     assert_array_almost_equal(
         test_matrix,
         dist_matrix,
-        err_msg="Distances don't match " "for metric weighted_minkowski",
+        err_msg="Distances don't match for metric weighted_minkowski",
     )
 
 
@@ -279,7 +279,7 @@ def test_mahalanobis(spatial_data):
     assert_array_almost_equal(
         test_matrix,
         dist_matrix,
-        err_msg="Distances don't match " "for metric mahalanobis",
+        err_msg="Distances don't match for metric mahalanobis",
     )
 
 
@@ -299,7 +299,7 @@ def test_haversine(spatial_data):
     assert_array_almost_equal(
         test_matrix,
         dist_matrix,
-        err_msg="Distances don't match " "for metric haversine",
+        err_msg="Distances don't match for metric haversine",
     )
 
 
@@ -309,7 +309,7 @@ def test_spearmanr():
 
     scipy_expected = stats.spearmanr(x, y)
     r = dist.spearmanr(x, y)
-    assert_array_equal(r, scipy_expected.correlation)
+    assert_array_almost_equal(r, scipy_expected.correlation)
 
 
 def test_alternative_distances():


=====================================
pynndescent/tests/test_pynndescent_.py
=====================================
@@ -28,9 +28,9 @@ def test_nn_descent_neighbor_accuracy(nn_data, seed):
         num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (nn_data.shape[0] * 10)
-    assert percent_correct >= 0.98, (
-        "NN-descent did not get 99% " "accuracy on nearest neighbors"
-    )
+    assert (
+        percent_correct >= 0.98
+    ), "NN-descent did not get 99% accuracy on nearest neighbors"
 
 
 def test_angular_nn_descent_neighbor_accuracy(nn_data, seed):
@@ -47,9 +47,9 @@ def test_angular_nn_descent_neighbor_accuracy(nn_data, seed):
         num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (nn_data.shape[0] * 10)
-    assert percent_correct >= 0.98, (
-        "NN-descent did not get 99% " "accuracy on nearest neighbors"
-    )
+    assert (
+        percent_correct >= 0.98
+    ), "NN-descent did not get 99% accuracy on nearest neighbors"
 
 
 @pytest.mark.skipif(
@@ -69,9 +69,9 @@ def test_sparse_nn_descent_neighbor_accuracy(sparse_nn_data, seed):
         num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (sparse_nn_data.shape[0] * 10)
-    assert percent_correct >= 0.85, (
-        "Sparse NN-descent did not get 95% " "accuracy on nearest neighbors"
-    )
+    assert (
+        percent_correct >= 0.85
+    ), "Sparse NN-descent did not get 95% accuracy on nearest neighbors"
 
 
 @pytest.mark.skipif(
@@ -92,9 +92,9 @@ def test_sparse_angular_nn_descent_neighbor_accuracy(sparse_nn_data):
         num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (sparse_nn_data.shape[0] * 10)
-    assert percent_correct >= 0.85, (
-        "Sparse angular NN-descent did not get 98% " "accuracy on nearest neighbors"
-    )
+    assert (
+        percent_correct >= 0.85
+    ), "Sparse angular NN-descent did not get 98% accuracy on nearest neighbors"
 
 
 def test_nn_descent_query_accuracy(nn_data):
@@ -109,9 +109,9 @@ def test_nn_descent_query_accuracy(nn_data):
         num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (true_indices.shape[0] * 10)
-    assert percent_correct >= 0.95, (
-        "NN-descent query did not get 95% " "accuracy on nearest neighbors"
-    )
+    assert (
+        percent_correct >= 0.95
+    ), "NN-descent query did not get 95% accuracy on nearest neighbors"
 
 
 def test_nn_descent_query_accuracy_angular(nn_data):
@@ -126,9 +126,9 @@ def test_nn_descent_query_accuracy_angular(nn_data):
         num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (true_indices.shape[0] * 10)
-    assert percent_correct >= 0.95, (
-        "NN-descent query did not get 95% " "accuracy on nearest neighbors"
-    )
+    assert (
+        percent_correct >= 0.95
+    ), "NN-descent query did not get 95% accuracy on nearest neighbors"
 
 
 def test_sparse_nn_descent_query_accuracy(sparse_nn_data):
@@ -145,9 +145,9 @@ def test_sparse_nn_descent_query_accuracy(sparse_nn_data):
         num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (true_indices.shape[0] * 10)
-    assert percent_correct >= 0.95, (
-        "Sparse NN-descent query did not get 95% " "accuracy on nearest neighbors"
-    )
+    assert (
+        percent_correct >= 0.95
+    ), "Sparse NN-descent query did not get 95% accuracy on nearest neighbors"
 
 
 def test_sparse_nn_descent_query_accuracy_angular(sparse_nn_data):
@@ -164,9 +164,9 @@ def test_sparse_nn_descent_query_accuracy_angular(sparse_nn_data):
         num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (true_indices.shape[0] * 10)
-    assert percent_correct >= 0.95, (
-        "Sparse NN-descent query did not get 95% " "accuracy on nearest neighbors"
-    )
+    assert (
+        percent_correct >= 0.95
+    ), "Sparse NN-descent query did not get 95% accuracy on nearest neighbors"
 
 
 def test_transformer_equivalence(nn_data):
@@ -209,9 +209,9 @@ def test_random_state_none(nn_data, spatial_data):
         num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (spatial_data.shape[0] * 10)
-    assert percent_correct >= 0.99, (
-        "NN-descent did not get 99% " "accuracy on nearest neighbors"
-    )
+    assert (
+        percent_correct >= 0.99
+    ), "NN-descent did not get 99% accuracy on nearest neighbors"
 
 
 def test_deterministic():
@@ -282,9 +282,9 @@ def test_deduplicated_data_behaves_normally(seed, cosine_hang_data):
         num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
 
     proportion_correct = num_correct / (data.shape[0] * n_neighbors)
-    assert proportion_correct >= 0.95, (
-        "NN-descent did not get 95%" " accuracy on nearest neighbors"
-    )
+    assert (
+        proportion_correct >= 0.95
+    ), "NN-descent did not get 95% accuracy on nearest neighbors"
 
 
 def test_output_when_verbose_is_true(spatial_data, seed):
@@ -436,10 +436,11 @@ def test_joblib_dump():
     np.testing.assert_equal(neighbors1, neighbors2)
     np.testing.assert_equal(distances1, distances2)
 
+
 @pytest.mark.parametrize("metric", ["euclidean", "cosine"])
 def test_update_no_prepare_query_accuracy(nn_data, metric):
     nnd = NNDescent(nn_data[200:800], metric=metric, n_neighbors=10, random_state=None)
-    nnd.update(nn_data[800:])
+    nnd.update(xs_fresh=nn_data[800:])
 
     knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)
 
@@ -455,12 +456,19 @@ def test_update_no_prepare_query_accuracy(nn_data, metric):
         "NN-descent query did not get 95% " "accuracy on nearest neighbors"
     )
 
+
 @pytest.mark.parametrize("metric", ["euclidean", "cosine"])
 def test_update_w_prepare_query_accuracy(nn_data, metric):
-    nnd = NNDescent(nn_data[200:800], metric=metric, n_neighbors=10, random_state=None, compressed=False)
+    nnd = NNDescent(
+        nn_data[200:800],
+        metric=metric,
+        n_neighbors=10,
+        random_state=None,
+        compressed=False,
+    )
     nnd.prepare()
 
-    nnd.update(nn_data[800:])
+    nnd.update(xs_fresh=nn_data[800:])
     nnd.prepare()
 
     knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)
@@ -477,6 +485,101 @@ def test_update_w_prepare_query_accuracy(nn_data, metric):
         "NN-descent query did not get 95% " "accuracy on nearest neighbors"
     )
 
+
+@pytest.mark.parametrize("metric", ["euclidean", "cosine"])
+def test_update_w_prepare_query_accuracy(nn_data, metric):
+    nnd = NNDescent(
+        nn_data[200:800],
+        metric=metric,
+        n_neighbors=10,
+        random_state=None,
+        compressed=False,
+    )
+    nnd.prepare()
+
+    nnd.update(xs_fresh=nn_data[800:])
+    nnd.prepare()
+
+    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)
+
+    true_nnd = NearestNeighbors(metric=metric).fit(nn_data[200:])
+    true_indices = true_nnd.kneighbors(nn_data[:200], 10, return_distance=False)
+
+    num_correct = 0.0
+    for i in range(true_indices.shape[0]):
+        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+
+    percent_correct = num_correct / (true_indices.shape[0] * 10)
+    assert percent_correct >= 0.95, (
+        "NN-descent query did not get 95% " "accuracy on nearest neighbors"
+    )
+
+
+def evaluate_predictions(neighbors_true, neighbors_computed, n_neighbors):
+    n_correct = 0
+    n_all = neighbors_true.shape[0] * n_neighbors
+    for i in range(neighbors_true.shape[0]):
+        n_correct += np.sum(np.in1d(neighbors_true[i], neighbors_computed[i]))
+    return n_correct / n_all
+
+
+@pytest.mark.parametrize("metric", ["manhattan", "euclidean", "cosine"])
+@pytest.mark.parametrize("case", list(range(8)))  # the number of cases in update_data
+def test_update_with_changed_data(update_data, case, metric):
+    def evaluate(nn_descent, xs_to_fit, xs_to_query):
+        true_nn = NearestNeighbors(metric=metric, n_neighbors=k).fit(xs_to_fit)
+        neighbors, _ = nn_descent.query(xs_to_query, k=k)
+        neighbors_expected = true_nn.kneighbors(xs_to_query, k, return_distance=False)
+        p_correct = evaluate_predictions(neighbors_expected, neighbors, k)
+        assert p_correct >= 0.95, (
+            "NN-descent query did not get 95% " "accuracy on nearest neighbors"
+        )
+
+    k = 10
+    xs_orig, xs_fresh, xs_updated, indices_updated = update_data[case]
+    queries1 = xs_orig
+
+    # original
+    index = NNDescent(xs_orig, metric=metric, n_neighbors=40, random_state=1234)
+    index.prepare()
+    evaluate(index, xs_orig, queries1)
+    # updated
+    index.update(
+        xs_fresh=xs_fresh, xs_updated=xs_updated, updated_indices=indices_updated
+    )
+    if xs_fresh is not None:
+        xs = np.vstack((xs_orig, xs_fresh))
+        queries2 = np.vstack((queries1, xs_fresh))
+    else:
+        xs = xs_orig
+        queries2 = queries1
+    if indices_updated is not None:
+        xs[indices_updated] = xs_updated
+    evaluate(index, xs, queries2)
+    if indices_updated is not None:
+        evaluate(index, xs, xs_updated)
+
+
+@pytest.mark.parametrize("n_trees", [1, 2, 3, 10])
+def test_tree_numbers_after_multiple_updates(n_trees):
+    trees_after_update = max(1, int(np.round(n_trees / 3)))
+
+    nnd = NNDescent(np.array([[1.0]]), n_neighbors=1, n_trees=n_trees)
+
+    assert nnd.n_trees == n_trees, "NN-descent update changed the number of trees"
+    assert (
+        nnd.n_trees_after_update == trees_after_update
+    ), "The value of the n_trees_after_update in NN-descent after update(s) is wrong"
+    for i in range(5):
+        nnd.update(xs_fresh=np.array([[i]], dtype=np.float64))
+        assert (
+            nnd.n_trees == trees_after_update
+        ), "The value of the n_trees in NN-descent after update(s) is wrong"
+        assert (
+            nnd.n_trees_after_update == trees_after_update
+        ), "The value of the n_trees_after_update in NN-descent after update(s) is wrong"
+
+
 @pytest.mark.parametrize("metric", ["euclidean", "cosine"])
 def test_tree_init_false(nn_data, metric):
     nnd = NNDescent(
@@ -497,3 +600,66 @@ def test_tree_init_false(nn_data, metric):
     assert percent_correct >= 0.95, (
         "NN-descent query did not get 95% " "accuracy on nearest neighbors"
     )
+
+
+@pytest.mark.parametrize(
+    "metric", ["euclidean", "manhattan"]
+)  # cosine makes no sense for 1D
+def test_one_dimensional_data(nn_data, metric):
+    nnd = NNDescent(
+        nn_data[200:, :1],
+        metric=metric,
+        n_neighbors=20,
+        random_state=None,
+        tree_init=False,
+    )
+    nnd.prepare()
+
+    knn_indices, _ = nnd.query(nn_data[:200, :1], k=10, epsilon=0.2)
+
+    true_nnd = NearestNeighbors(metric=metric).fit(nn_data[200:, :1])
+    true_indices = true_nnd.kneighbors(nn_data[:200, :1], 10, return_distance=False)
+
+    num_correct = 0.0
+    for i in range(true_indices.shape[0]):
+        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+
+    percent_correct = num_correct / (true_indices.shape[0] * 10)
+    assert percent_correct >= 0.95, (
+        "NN-descent query did not get 95% " "accuracy on nearest neighbors"
+    )
+
+
+@pytest.mark.parametrize("metric", ["euclidean", "cosine"])
+def test_tree_no_split(small_data, sparse_small_data, metric):
+    k = 10
+    for data, data_type in zip([small_data, sparse_small_data], ["dense", "sparse"]):
+        n_instances = data.shape[0]
+        leaf_size = n_instances + 1  # just to be safe
+        data_train = data[n_instances // 2 :]
+        data_test = data[: n_instances // 2]
+
+        nnd = NNDescent(
+            data_train,
+            metric=metric,
+            n_neighbors=data_train.shape[0] - 1,
+            random_state=None,
+            tree_init=True,
+            leaf_size=leaf_size,
+        )
+        nnd.prepare()
+        knn_indices, _ = nnd.query(data_test, k=k, epsilon=0.2)
+
+        true_nnd = NearestNeighbors(metric=metric).fit(data_train)
+        true_indices = true_nnd.kneighbors(data_test, k, return_distance=False)
+
+        num_correct = 0.0
+        for i in range(true_indices.shape[0]):
+            num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+
+        percent_correct = num_correct / (true_indices.shape[0] * k)
+        assert (
+            percent_correct >= 0.95
+        ), "NN-descent query did not get 95% for accuracy on nearest neighbors on {} data".format(
+            data_type
+        )


=====================================
pynndescent/tests/test_rank.py
=====================================
@@ -74,15 +74,15 @@ def test_rankdata_object_string():
 
 
 def test_large_int():
-    data = np.array([2 ** 60, 2 ** 60 + 1], dtype=np.uint64)
+    data = np.array([2**60, 2**60 + 1], dtype=np.uint64)
     r = rankdata(data)
     assert_array_equal(r, [1.0, 2.0])
 
-    data = np.array([2 ** 60, 2 ** 60 + 1], dtype=np.int64)
+    data = np.array([2**60, 2**60 + 1], dtype=np.int64)
     r = rankdata(data)
     assert_array_equal(r, [1.0, 2.0])
 
-    data = np.array([2 ** 60, -(2 ** 60) + 1], dtype=np.int64)
+    data = np.array([2**60, -(2**60) + 1], dtype=np.int64)
     r = rankdata(data)
     assert_array_equal(r, [2.0, 1.0])
 


=====================================
pynndescent/utils.py
=====================================
@@ -225,7 +225,7 @@ def siftdown(heap1, heap2, elt):
 
 @numba.njit(parallel=True, cache=False)
 def deheap_sort(indices, distances):
-    """Given two arrays representing a heap (indices and distances), reorder the 
+    """Given two arrays representing a heap (indices and distances), reorder the
      arrays by increasing distance. This is effectively just the second half of
      heap sort (the first half not being required since we already have the
      graph_data in a heap).
@@ -689,6 +689,20 @@ def initalize_heap_from_graph_indices(heap, graph_indices, data, metric):
     return heap
 
 
+@numba.njit(cache=True)
+def initalize_heap_from_graph_indices_and_distances(
+    heap, graph_indices, graph_distances
+):
+    for i in range(graph_indices.shape[0]):
+        for idx in range(graph_indices.shape[1]):
+            j = graph_indices[i, idx]
+            if j >= 0:
+                d = graph_distances[i, idx]
+                checked_flagged_heap_push(heap[1][i], heap[0][i], heap[2][i], d, j, 1)
+
+    return heap
+
+
 @numba.njit(parallel=True, cache=False)
 def sparse_initalize_heap_from_graph_indices(
     heap, graph_indices, data_indptr, data_indices, data_vals, metric


=====================================
setup.py
=====================================
@@ -8,7 +8,7 @@ def readme():
 
 configuration = {
     "name": "pynndescent",
-    "version": "0.5.7",
+    "version": "0.5.8",
     "description": "Nearest Neighbor Descent",
     "long_description": readme(),
     "classifiers": [
@@ -42,6 +42,7 @@ configuration = {
         "numba >= 0.51.2",
         "llvmlite >= 0.30",
         "joblib >= 0.11",
+        'importlib-metadata >= 4.8.1; python_version < "3.8"',
     ],
     "ext_modules": [],
     "cmdclass": {},



View it on GitLab: https://salsa.debian.org/python-team/packages/python-pynndescent/-/compare/305851b8a14d762988dce6522c23db17ffc0d550...9321c99e1c0542865e15028a76ad4e3e2ba71272
