[med-svn] [Git][python-team/packages/python-pynndescent][upstream] New upstream version 0.5.13

Michael R. Crusoe (@crusoe) gitlab at salsa.debian.org
Mon Sep 1 16:10:49 BST 2025



Michael R. Crusoe pushed to branch upstream at Debian Python Team / packages / python-pynndescent


Commits:
32f80e10 by Michael R. Crusoe at 2025-09-01T15:55:06+02:00
New upstream version 0.5.13
- - - - -


9 changed files:

- PKG-INFO
- pynndescent.egg-info/PKG-INFO
- pynndescent/distances.py
- pynndescent/pynndescent_.py
- pynndescent/rp_trees.py
- pynndescent/tests/test_distances.py
- pynndescent/tests/test_pynndescent_.py
- pynndescent/utils.py
- setup.py


Changes:

=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pynndescent
-Version: 0.5.11
+Version: 0.5.13
 Summary: Nearest Neighbor Descent
 Home-page: http://github.com/lmcinnes/pynndescent
 Author: Leland McInnes
@@ -20,9 +20,10 @@ Classifier: Operating System :: Microsoft :: Windows
 Classifier: Operating System :: POSIX
 Classifier: Operating System :: Unix
 Classifier: Operating System :: MacOS
-Classifier: Programming Language :: Python :: 3.6
-Classifier: Programming Language :: Python :: 3.7
-Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 License-File: LICENSE
 
 .. image:: doc/pynndescent_logo.png


=====================================
pynndescent.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pynndescent
-Version: 0.5.11
+Version: 0.5.13
 Summary: Nearest Neighbor Descent
 Home-page: http://github.com/lmcinnes/pynndescent
 Author: Leland McInnes
@@ -20,9 +20,10 @@ Classifier: Operating System :: Microsoft :: Windows
 Classifier: Operating System :: POSIX
 Classifier: Operating System :: Unix
 Classifier: Operating System :: MacOS
-Classifier: Programming Language :: Python :: 3.6
-Classifier: Programming Language :: Python :: 3.7
-Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 License-File: LICENSE
 
 .. image:: doc/pynndescent_logo.png


=====================================
pynndescent/distances.py
=====================================
@@ -22,6 +22,11 @@ _dummy_cost = np.zeros((2, 2), dtype=np.float64)
 FLOAT32_EPS = np.finfo(np.float32).eps
 FLOAT32_MAX = np.finfo(np.float32).max
 
+popcnt = np.array(
+    [bin(i).count('1') for i in range(256)],
+    dtype=np.float32
+)
+
 
 @numba.njit(fastmath=True)
 def euclidean(x, y):
@@ -890,6 +895,65 @@ def symmetric_kl_divergence(x, y):
     return result
 
 
+@numba.njit(
+    [
+        "f4(u1[::1],u1[::1])",
+        numba.types.float32(
+            numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+            numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+        ),
+    ],
+    fastmath=True,
+    locals={
+        "result": numba.types.float32,
+        "intersection": numba.types.uint8,
+        "dim": numba.types.intp,
+        "i": numba.types.uint16,
+    },
+)
+def bit_hamming(x, y):
+    result = 0.0
+    dim = x.shape[0]
+
+    for i in range(dim):
+        intersection = x[i] ^ y[i]
+        result += popcnt[intersection]
+
+    return result
+
+
+@numba.njit(
+    [
+        "f4(u1[::1],u1[::1])",
+        numba.types.float32(
+            numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+            numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+        ),
+    ],
+    fastmath=True,
+    locals={
+        "result": numba.types.float32,
+        "denom": numba.types.float32,
+        "and_": numba.types.uint8,
+        "or_": numba.types.uint8,
+        "dim": numba.types.intp,
+        "i": numba.types.uint16,
+    },
+)
+def bit_jaccard(x, y):
+    result = 0.0
+    denom = 0.0
+    dim = x.shape[0]
+
+    for i in range(dim):
+        and_ = x[i] & y[i]
+        or_ = x[i] | y[i]
+        result += popcnt[and_]
+        denom += popcnt[or_]
+
+    return -np.log(result / denom)
+
+
 named_distances = {
     # general minkowski distances
     "euclidean": euclidean,
@@ -946,6 +1010,8 @@ named_distances = {
     "sokalsneath": sokal_sneath,
     "sokalmichener": sokal_michener,
     "yule": yule,
+    "bit_hamming": bit_hamming,
+    "bit_jaccard": bit_jaccard,
 }
 
 # Some distances have a faster to compute alternative that


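A note on the two new metrics: each uint8 entry packs 8 binary features, and
distances are computed with bytewise bitwise operations plus the 8-bit popcount
lookup table added above. A minimal sketch of the same idea outside numba,
checked against np.unpackbits (the helper name bit_hamming_ref is ours, for
illustration only):

    import numpy as np

    popcnt = np.array([bin(i).count("1") for i in range(256)], dtype=np.float32)

    def bit_hamming_ref(x, y):
        # XOR marks differing bits; the table counts them per byte.
        return float(popcnt[x ^ y].sum())

    rng = np.random.default_rng(0)
    x = rng.integers(0, 256, size=16, dtype=np.uint8)
    y = rng.integers(0, 256, size=16, dtype=np.uint8)
    assert bit_hamming_ref(x, y) == np.sum(np.unpackbits(x) != np.unpackbits(y))
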
=====================================
pynndescent/pynndescent_.py
=====================================
@@ -49,6 +49,7 @@ from pynndescent.rp_trees import (
     denumbaify_tree,
     renumbaify_tree,
     select_side,
+    select_side_bit,
     sparse_select_side,
     score_linked_tree,
 )
@@ -728,7 +729,13 @@ class NNDescent:
         else:
             copy_on_normalize = False
 
-        data = check_array(data, dtype=np.float32, accept_sparse="csr", order="C")
+        if metric in ("bit_hamming", "bit_jaccard"):
+            data = check_array(data, dtype=np.uint8, order="C")
+            self._input_dtype = np.uint8
+        else:
+            data = check_array(data, dtype=np.float32, accept_sparse="csr", order="C")
+            self._input_dtype = np.float32
+
         self._raw_data = data
 
         if not tree_init or n_trees == 0 or init_graph is not None:
@@ -744,32 +751,9 @@ class NNDescent:
         current_random_state = check_random_state(self.random_state)
 
         self._distance_correction = None
-
-        if callable(metric):
-            _distance_func = metric
-        elif metric in pynnd_dist.named_distances:
-            if metric in pynnd_dist.fast_distance_alternatives:
-                _distance_func = pynnd_dist.fast_distance_alternatives[metric]["dist"]
-                self._distance_correction = pynnd_dist.fast_distance_alternatives[
-                    metric
-                ]["correction"]
-            else:
-                _distance_func = pynnd_dist.named_distances[metric]
-        else:
-            raise ValueError("Metric is neither callable, " + "nor a recognised string")
-
-        # Create a partial function for distances with arguments
-        if len(self._dist_args) > 0:
-            dist_args = self._dist_args
-
-            @numba.njit()
-            def _partial_dist_func(x, y):
-                return _distance_func(x, y, *dist_args)
-
-            self._distance_func = _partial_dist_func
-        else:
-            self._distance_func = _distance_func
-
+        
+        self._set_distance_func()
+        
         if metric in (
             "cosine",
             "dot",
@@ -778,10 +762,17 @@ class NNDescent:
             "jaccard",
             "hellinger",
             "hamming",
+            "bit_hamming",
+            "bit_jaccard",
         ):
             self._angular_trees = True
+            if metric in ("bit_hamming", "bit_jaccard"):
+                self._bit_trees = True
+            else:
+                self._bit_trees = False
         else:
             self._angular_trees = False
+            self._bit_trees = False
 
         if metric == "dot":
             data = normalize(data, norm="l2", copy=copy_on_normalize)
@@ -809,6 +800,7 @@ class NNDescent:
                 current_random_state,
                 self.n_jobs,
                 self._angular_trees,
+                self._bit_trees,
                 max_depth=self.max_rptree_depth,
             )
             leaf_array = rptree_leaf_array(self._rp_forest)
@@ -952,6 +944,32 @@ class NNDescent:
 
         numba.set_num_threads(self._original_num_threads)
 
+    def _set_distance_func(self):
+        if callable(self.metric):
+            _distance_func = self.metric
+        elif self.metric in pynnd_dist.named_distances:
+            if self.metric in pynnd_dist.fast_distance_alternatives:
+                _distance_func = pynnd_dist.fast_distance_alternatives[self.metric]["dist"]
+                self._distance_correction = pynnd_dist.fast_distance_alternatives[
+                    self.metric
+                ]["correction"]
+            else:
+                _distance_func = pynnd_dist.named_distances[self.metric]
+        else:
+            raise ValueError("Metric is neither callable, " + "nor a recognised string")
+
+        # Create a partial function for distances with arguments
+        if len(self._dist_args) > 0:
+            dist_args = self._dist_args
+
+            @numba.njit()
+            def _partial_dist_func(x, y):
+                return _distance_func(x, y, *dist_args)
+
+            self._distance_func = _partial_dist_func
+        else:
+            self._distance_func = _distance_func
+            
     def __getstate__(self):
         if not hasattr(self, "_search_graph"):
             self._init_search_graph()
@@ -970,6 +988,7 @@ class NNDescent:
 
     def __setstate__(self, d):
         self.__dict__ = d
+        self._set_distance_func()
         self._search_forest = tuple(
             [renumbaify_tree(tree) for tree in d["_search_forest"]]
         )
@@ -1210,27 +1229,50 @@ class NNDescent:
             tree_indices = self._search_forest[0].indices
             tree_children = self._search_forest[0].children
 
-            @numba.njit(
-                [
-                    numba.types.Array(numba.types.int32, 1, "C", readonly=True)(
-                        numba.types.Array(numba.types.float32, 1, "C", readonly=True),
-                        numba.types.Array(numba.types.int64, 1, "C", readonly=False),
-                    )
-                ],
-                locals={"node": numba.types.uint32, "side": numba.types.boolean},
-            )
-            def tree_search_closure(point, rng_state):
-                node = 0
-                while tree_children[node, 0] > 0:
-                    side = select_side(
-                        tree_hyperplanes[node], tree_offsets[node], point, rng_state
-                    )
-                    if side == 0:
-                        node = tree_children[node, 0]
-                    else:
-                        node = tree_children[node, 1]
+            if self._bit_trees:
+                @numba.njit(
+                    [
+                        numba.types.Array(numba.types.int32, 1, "C", readonly=True)(
+                            numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+                            numba.types.Array(numba.types.int64, 1, "C", readonly=False),
+                        )
+                    ],
+                    locals={"node": numba.types.uint32, "side": numba.types.boolean},
+                )
+                def tree_search_closure(point, rng_state):
+                    node = 0
+                    while tree_children[node, 0] > 0:
+                        side = select_side_bit(
+                            tree_hyperplanes[node], tree_offsets[node], point, rng_state
+                        )
+                        if side == 0:
+                            node = tree_children[node, 0]
+                        else:
+                            node = tree_children[node, 1]
 
-                return -tree_children[node]
+                    return -tree_children[node]
+            else:
+                @numba.njit(
+                    [
+                        numba.types.Array(numba.types.int32, 1, "C", readonly=True)(
+                            numba.types.Array(numba.types.float32, 1, "C", readonly=True),
+                            numba.types.Array(numba.types.int64, 1, "C", readonly=False),
+                        )
+                    ],
+                    locals={"node": numba.types.uint32, "side": numba.types.boolean},
+                )
+                def tree_search_closure(point, rng_state):
+                    node = 0
+                    while tree_children[node, 0] > 0:
+                        side = select_side(
+                            tree_hyperplanes[node], tree_offsets[node], point, rng_state
+                        )
+                        if side == 0:
+                            node = tree_children[node, 0]
+                        else:
+                            node = tree_children[node, 1]
+
+                    return -tree_children[node]
 
             self._tree_search = tree_search_closure
         else:
@@ -1252,10 +1294,15 @@ class NNDescent:
         n_neighbors = self.n_neighbors
         parallel_search = self.parallel_batch_queries
 
+        if dist == pynnd_dist.bit_hamming or dist == pynnd_dist.bit_jaccard:
+            data_type = numba.types.uint8[::1]
+        else:
+            data_type = numba.types.float32[::1]
+
         @numba.njit(
             fastmath=True,
             locals={
-                "current_query": numba.types.float32[::1],
+                "current_query": data_type,
                 "i": numba.types.uint32,
                 "j": numba.types.uint32,
                 "heap_priorities": numba.types.float32[::1],
@@ -1267,7 +1314,7 @@ class NNDescent:
                 "visited": numba.types.uint8[::1],
                 "indices": numba.types.int32[::1],
                 "indptr": numba.types.int32[::1],
-                "data": numba.types.float32[:, ::1],
+                "data": data_type,
                 "heap_size": numba.types.int16,
                 "distance_scale": numba.types.float32,
                 "distance_bound": numba.types.float32,
@@ -1693,7 +1740,11 @@ class NNDescent:
             if not hasattr(self, "_search_function"):
                 self._init_search_function()
 
-            query_data = np.asarray(query_data).astype(np.float32, order="C")
+            if self.metric in ("bit_hamming", "bit_jaccard"):
+                query_data = np.asarray(query_data).astype(np.uint8, order="C")
+            else:
+                query_data = np.asarray(query_data).astype(np.float32, order="C")
+
             indices, dists, _ = self._search_function(
                 query_data, k, epsilon, self._visited, self.search_rng_state
             )
@@ -1762,7 +1813,7 @@ class NNDescent:
         # input checks
         if xs_updated is not None:
             xs_updated = check_array(
-                xs_updated, dtype=np.float32, accept_sparse="csr", order="C"
+                xs_updated, dtype=self._input_dtype, accept_sparse="csr", order="C"
             )
             if updated_indices is None:
                 raise ValueError(
@@ -1798,13 +1849,13 @@ class NNDescent:
         if xs_fresh is None:
             if self._is_sparse:
                 xs_fresh = csr_matrix(
-                    ([], [], []), shape=(0, self._raw_data.shape[1]), dtype=np.float32
+                    ([], [], []), shape=(0, self._raw_data.shape[1]), dtype=self._input_dtype
                 )
             else:
-                xs_fresh = np.zeros((0, self._raw_data.shape[1]), dtype=np.float32)
+                xs_fresh = np.zeros((0, self._raw_data.shape[1]), dtype=self._input_dtype)
         else:
             xs_fresh = check_array(
-                xs_fresh, dtype=np.float32, accept_sparse="csr", order="C"
+                xs_fresh, dtype=self._input_dtype, accept_sparse="csr", order="C"
             )
         # data preparation
         if hasattr(self, "_vertex_order"):


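A hedged usage sketch of the behavior added above: with metric set to
"bit_hamming" or "bit_jaccard", NNDescent now validates its input as
C-contiguous uint8 rather than float32, so bit-packed data can be produced
with np.packbits (the data shapes and parameter values here are illustrative):

    import numpy as np
    from pynndescent import NNDescent

    rng = np.random.default_rng(42)
    bool_data = rng.random((500, 128)) > 0.5   # 500 samples, 128 binary features
    packed = np.packbits(bool_data, axis=1)    # shape (500, 16), dtype uint8

    index = NNDescent(packed, metric="bit_hamming", n_neighbors=15)
    neighbors, distances = index.query(packed[:5], k=10)

Note that bit_hamming returns raw differing-bit counts rather than the
normalized Hamming fraction; the new test_bit_hamming below divides by the
total number of bits before comparing against scikit-learn.
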
=====================================
pynndescent/rp_trees.py
=====================================
@@ -33,10 +33,15 @@ FlatTree = namedtuple(
 
 dense_hyperplane_type = numba.float32[::1]
 sparse_hyperplane_type = numba.float64[:, ::1]
+bit_hyperplane_type = numba.uint8[::1]
 offset_type = numba.float64
 children_type = numba.typeof((np.int32(-1), np.int32(-1)))
 point_indices_type = numba.int32[::1]
 
+popcnt = np.array(
+    [bin(i).count('1') for i in range(256)],
+    dtype=np.float32
+)
 
 @numba.njit(
     numba.types.Tuple(
@@ -171,6 +176,136 @@ def angular_random_projection_split(data, indices, rng_state):
     return indices_left, indices_right, hyperplane_vector, 0.0
 
 
+@numba.njit(
+    numba.types.Tuple(
+        (numba.int32[::1], numba.int32[::1], bit_hyperplane_type, offset_type)
+    )(numba.uint8[:, ::1], numba.int32[::1], numba.int64[::1]),
+    locals={
+        "n_left": numba.uint32,
+        "n_right": numba.uint32,
+        "hyperplane_vector": numba.uint8[::1],
+        "hyperplane_offset": numba.float32,
+        "margin": numba.float32,
+        "d": numba.uint32,
+        "i": numba.uint32,
+        "left_index": numba.uint32,
+        "right_index": numba.uint32,
+    },
+    fastmath=True,
+    nogil=True,
+    cache=True,
+)
+def angular_bitpacked_random_projection_split(data, indices, rng_state):
+    """Given a set of ``graph_indices`` for graph_data points from ``graph_data``, create
+    a random hyperplane to split the graph_data, returning two arrays graph_indices
+    that fall on either side of the hyperplane. This is the basis for a
+    random projection tree, which simply uses this splitting recursively.
+    This particular split uses cosine distance to determine the hyperplane
+    and which side each graph_data sample falls on.
+    Parameters
+    ----------
+    data: array of shape (n_samples, n_features)
+        The original graph_data to be split
+    indices: array of shape (tree_node_size,)
+        The graph_indices of the elements in the ``graph_data`` array that are to
+        be split in the current operation.
+    rng_state: array of int64, shape (3,)
+        The internal state of the rng
+    Returns
+    -------
+    indices_left: array
+        The elements of ``graph_indices`` that fall on the "left" side of the
+        random hyperplane.
+    indices_right: array
+        The elements of ``graph_indices`` that fall on the "left" side of the
+        random hyperplane.
+    """
+    dim = data.shape[1]
+
+    # Select two random points, set the hyperplane between them
+    left_index = tau_rand_int(rng_state) % indices.shape[0]
+    right_index = tau_rand_int(rng_state) % indices.shape[0]
+    right_index += left_index == right_index
+    right_index = right_index % indices.shape[0]
+    left = indices[left_index]
+    right = indices[right_index]
+
+    left_norm = 0.0
+    right_norm = 0.0
+
+    # Compute the normal vector to the hyperplane (the vector between
+    # the two points)
+    hyperplane_vector = np.empty(dim * 2, dtype=np.uint8)
+    positive_hyperplane_component = hyperplane_vector[:dim]
+    negative_hyperplane_component = hyperplane_vector[dim:]
+
+    for d in range(dim):
+        xor_vector = (data[left, d]) ^ (data[right, d])
+        positive_hyperplane_component[d] = xor_vector & (data[left, d])
+        negative_hyperplane_component[d] = xor_vector & (data[right, d])
+
+    hyperplane_norm = 0.0
+
+    for d in range(dim):
+        hyperplane_norm += popcnt[hyperplane_vector[d]]
+        left_norm += popcnt[data[left, d]]
+        right_norm += popcnt[data[right, d]]
+
+    # For each point compute the margin (project into normal vector)
+    # If we are on lower side of the hyperplane put in one pile, otherwise
+    # put it in the other pile (if we hit hyperplane on the nose, flip a coin)
+    n_left = 0
+    n_right = 0
+    side = np.empty(indices.shape[0], np.int8)
+    for i in range(indices.shape[0]):
+        margin = 0.0
+        for d in range(dim):
+            margin += popcnt[positive_hyperplane_component[d] & data[indices[i], d]]
+            margin -= popcnt[negative_hyperplane_component[d] & data[indices[i], d]]
+
+        if abs(margin) < EPS:
+            side[i] = tau_rand_int(rng_state) % 2
+            if side[i] == 0:
+                n_left += 1
+            else:
+                n_right += 1
+        elif margin > 0:
+            side[i] = 0
+            n_left += 1
+        else:
+            side[i] = 1
+            n_right += 1
+
+    # If all points end up on one side, something went wrong numerically
+    # In this case, assign points randomly; they are likely very close anyway
+    if n_left == 0 or n_right == 0:
+        n_left = 0
+        n_right = 0
+        for i in range(indices.shape[0]):
+            side[i] = tau_rand_int(rng_state) % 2
+            if side[i] == 0:
+                n_left += 1
+            else:
+                n_right += 1
+
+    # Now that we have the counts allocate arrays
+    indices_left = np.empty(n_left, dtype=np.int32)
+    indices_right = np.empty(n_right, dtype=np.int32)
+
+    # Populate the arrays with graph_indices according to which side they fell on
+    n_left = 0
+    n_right = 0
+    for i in range(side.shape[0]):
+        if side[i] == 0:
+            indices_left[n_left] = indices[i]
+            n_left += 1
+        else:
+            indices_right[n_right] = indices[i]
+            n_right += 1
+
+    return indices_left, indices_right, hyperplane_vector, 0.0
+
+
 @numba.njit(
     numba.types.Tuple(
         (numba.int32[::1], numba.int32[::1], dense_hyperplane_type, offset_type)
@@ -678,6 +813,73 @@ def make_angular_tree(
 
     return
 
+@numba.njit(
+    nogil=True,
+    locals={
+        "children": numba.types.ListType(children_type),
+        "left_node_num": numba.types.int32,
+        "right_node_num": numba.types.int32,
+    },
+)
+def make_bit_tree(
+    data,
+    indices,
+    hyperplanes,
+    offsets,
+    children,
+    point_indices,
+    rng_state,
+    leaf_size=30,
+    max_depth=200,
+):
+    if indices.shape[0] > leaf_size and max_depth > 0:
+        (
+            left_indices,
+            right_indices,
+            hyperplane,
+            offset,
+        ) = angular_bitpacked_random_projection_split(data, indices, rng_state)
+
+        make_bit_tree(
+            data,
+            left_indices,
+            hyperplanes,
+            offsets,
+            children,
+            point_indices,
+            rng_state,
+            leaf_size,
+            max_depth - 1,
+        )
+
+        left_node_num = len(point_indices) - 1
+
+        make_bit_tree(
+            data,
+            right_indices,
+            hyperplanes,
+            offsets,
+            children,
+            point_indices,
+            rng_state,
+            leaf_size,
+            max_depth - 1,
+        )
+
+        right_node_num = len(point_indices) - 1
+
+        hyperplanes.append(hyperplane)
+        offsets.append(offset)
+        children.append((np.int32(left_node_num), np.int32(right_node_num)))
+        point_indices.append(np.array([-1], dtype=np.int32))
+    else:
+        hyperplanes.append(np.array([255], dtype=np.uint8))
+        offsets.append(-np.inf)
+        children.append((np.int32(-1), np.int32(-1)))
+        point_indices.append(indices)
+
+    return
+
 
 @numba.njit(
     nogil=True,
@@ -824,7 +1026,6 @@ def make_sparse_angular_tree(
 @numba.njit(nogil=True)
 def make_dense_tree(data, rng_state, leaf_size=30, angular=False, max_depth=200):
     indices = np.arange(data.shape[0]).astype(np.int32)
-
     hyperplanes = numba.typed.List.empty_list(dense_hyperplane_type)
     offsets = numba.typed.List.empty_list(offset_type)
     children = numba.typed.List.empty_list(children_type)
@@ -918,6 +1119,38 @@ def make_sparse_tree(
     return FlatTree(hyperplanes, offsets, children, point_indices, max_leaf_size)
 
 
+@numba.njit(nogil=True)
+def make_dense_bit_tree(data, rng_state, leaf_size=30, angular=False, max_depth=200):
+    indices = np.arange(data.shape[0]).astype(np.int32)
+
+    hyperplanes = numba.typed.List.empty_list(bit_hyperplane_type)
+    offsets = numba.typed.List.empty_list(offset_type)
+    children = numba.typed.List.empty_list(children_type)
+    point_indices = numba.typed.List.empty_list(point_indices_type)
+
+    if angular:
+        make_bit_tree(
+            data,
+            indices,
+            hyperplanes,
+            offsets,
+            children,
+            point_indices,
+            rng_state,
+            leaf_size,
+            max_depth=max_depth,
+        )
+    else:
+        raise NotImplementedError("Euclidean bit trees are not implemented yet.")
+
+    max_leaf_size = leaf_size
+    for points in point_indices:
+        if len(points) > max_leaf_size:
+            max_leaf_size = numba.int32(len(points))
+
+    result = FlatTree(hyperplanes, offsets, children, point_indices, max_leaf_size)
+    return result
+
 @numba.njit(
     [
         "b1(f4[::1],f4,f4[::1],i8[::1])",
@@ -954,6 +1187,43 @@ def select_side(hyperplane, offset, point, rng_state):
         return 1
 
 
+@numba.njit(
+    [
+        "b1(u1[::1],f4,u1[::1],i8[::1])",
+        numba.types.boolean(
+            numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+            numba.types.float32,
+            numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+            numba.types.Array(numba.types.int64, 1, "C", readonly=False),
+        ),
+    ],
+    fastmath=True,
+    locals={
+        "margin": numba.types.float32,
+        "dim": numba.types.intp,
+        "d": numba.types.uint16,
+    },
+    cache=True,
+)
+def select_side_bit(hyperplane, offset, point, rng_state):
+    margin = offset
+    dim = point.shape[0]
+    for d in range(dim):
+        margin += popcnt[hyperplane[d] & point[d]]
+        margin -= popcnt[hyperplane[dim + d] & point[d]]
+
+    if abs(margin) < EPS:
+        side = np.abs(tau_rand_int(rng_state)) % 2
+        if side == 0:
+            return 0
+        else:
+            return 1
+    elif margin > 0:
+        return 0
+    else:
+        return 1
+
+
 @numba.njit(
     [
         "i4[::1](f4[::1],f4[:,::1],f4[::1],i4[:,::1],i4[::1],i8[::1])",
@@ -981,6 +1251,32 @@ def search_flat_tree(point, hyperplanes, offsets, children, indices, rng_state):
     return indices[-children[node, 0] : -children[node, 1]]
 
 
+@numba.njit(
+    [
+        "i4[::1](u1[::1],u1[:,::1],f4[::1],i4[:,::1],i4[::1],i8[::1])",
+        numba.types.Array(numba.types.int32, 1, "C", readonly=True)(
+            numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+            numba.types.Array(numba.types.uint8, 2, "C", readonly=True),
+            numba.types.Array(numba.types.float32, 1, "C", readonly=True),
+            numba.types.Array(numba.types.int32, 2, "C", readonly=True),
+            numba.types.Array(numba.types.int32, 1, "C", readonly=True),
+            numba.types.Array(numba.types.int64, 1, "C", readonly=False),
+        ),
+    ],
+    locals={"node": numba.types.uint32, "side": numba.types.boolean},
+    cache=True,
+)
+def search_flat_bit_tree(point, hyperplanes, offsets, children, indices, rng_state):
+    node = 0
+    while children[node, 0] > 0:
+        side = select_side_bit(hyperplanes[node], offsets[node], point, rng_state)
+        if side == 0:
+            node = children[node, 0]
+        else:
+            node = children[node, 1]
+
+    return indices[-children[node, 0] : -children[node, 1]]
+
 @numba.njit(fastmath=True, cache=True)
 def sparse_select_side(hyperplane, offset, point_inds, point_data, rng_state):
     margin = offset
@@ -1034,6 +1330,7 @@ def make_forest(
     random_state,
     n_jobs=None,
     angular=False,
+    bit_tree=False,
     max_depth=200,
 ):
     """Build a random projection forest with ``n_trees``.
@@ -1076,6 +1373,17 @@ def make_forest(
                 )
                 for i in range(n_trees)
             )
+        elif bit_tree:
+            result = joblib.Parallel(n_jobs=n_jobs, require="sharedmem")(
+                joblib.delayed(make_dense_bit_tree)(
+                    data,
+                    rng_states[i],
+                    leaf_size,
+                    angular,
+                    max_depth=max_depth
+                )
+                for i in range(n_trees)
+            )
         else:
             result = joblib.Parallel(n_jobs=n_jobs, require="sharedmem")(
                 joblib.delayed(make_dense_tree)(
@@ -1130,7 +1438,7 @@ def rptree_leaf_array(rp_forest):
         return np.array([[-1]])
 
 
-@numba.njit()
+#@numba.njit()
 def recursive_convert(
     tree, hyperplanes, offsets, children, indices, node_num, leaf_start, tree_node
 ):
@@ -1229,8 +1537,11 @@ def convert_tree_format(tree, data_size, data_dim):
     is_sparse = False
     if tree.hyperplanes[0].ndim == 1:
         # dense hyperplanes
-        hyperplane_dim = data_dim
-        hyperplanes = np.zeros((n_nodes, hyperplane_dim), dtype=np.float32)
+        if tree.hyperplanes[0].dtype == np.uint8:
+            hyperplane_dim = data_dim * 2
+        else:
+            hyperplane_dim = data_dim
+        hyperplanes = np.zeros((n_nodes, hyperplane_dim), dtype=tree.hyperplanes[0].dtype)
     else:
         # sparse hyperplanes
         is_sparse = True


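To illustrate the traversal support added above: a bit-packed hyperplane of
length 2 * dim stores a positive component in its first half and a negative
component in its second half, and select_side_bit scores a point by its
popcount overlap with each half. A rough NumPy restatement of the margin
(ignoring the stored offset, which these splits set to 0.0; margin_ref is
our name):

    import numpy as np

    popcnt = np.array([bin(i).count("1") for i in range(256)], dtype=np.float32)

    def margin_ref(hyperplane, point):
        dim = point.shape[0]
        pos, neg = hyperplane[:dim], hyperplane[dim:]
        # Overlap with the positive half pushes toward side 0,
        # overlap with the negative half toward side 1.
        return float(popcnt[pos & point].sum() - popcnt[neg & point].sum())

As in the float32 version, a margin within EPS of zero sends the point to a
random side.
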
=====================================
pynndescent/tests/test_distances.py
=====================================
@@ -172,7 +172,9 @@ def test_sparse_spatial_check(sparse_spatial_data, metric, decimal=6):
 )
 def test_sparse_binary_check(sparse_binary_data, metric):
     if metric in spdist.sparse_named_distances:
-        dist_matrix = pairwise_distances(np.asarray(sparse_binary_data.todense()), metric=metric)
+        dist_matrix = pairwise_distances(
+            np.asarray(sparse_binary_data.todense()), metric=metric
+        )
     if metric in ("jaccard", "dice", "sokalsneath"):
         dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
     if metric == "russellrao":
@@ -394,3 +396,37 @@ def test_wasserstein_1d(p):
                 p,
             )
             assert np.isclose(d1, d2)
+
+
+def test_bit_hamming():
+    test_data = np.random.randint(0, 255, size=(10, 100), dtype=np.uint8)
+    unpacked_data = np.zeros(
+        (test_data.shape[0], test_data.shape[1] * 8), dtype=np.float32
+    )
+    for i in range(unpacked_data.shape[0]):
+        for j in range(unpacked_data.shape[1]):
+            unpacked_data[i, j] = (test_data[i, j // 8] & (1 << (j % 8))) > 0
+
+    all_pairs = pairwise_distances(unpacked_data, metric="hamming")
+    for i in range(test_data.shape[0]):
+        for j in range(i + 1, test_data.shape[0]):
+            d1 = dist.bit_hamming(test_data[i], test_data[j]) / (test_data.shape[1] * 8)
+            d2 = all_pairs[i, j]
+            assert np.isclose(d1, d2)
+
+
+def test_bit_jaccard():
+    test_data = np.random.randint(0, 255, size=(10, 100), dtype=np.uint8)
+    unpacked_data = np.zeros(
+        (test_data.shape[0], test_data.shape[1] * 8), dtype=np.float32
+    )
+    for i in range(unpacked_data.shape[0]):
+        for j in range(unpacked_data.shape[1]):
+            unpacked_data[i, j] = (test_data[i, j // 8] & (1 << (j % 8))) > 0
+
+    all_pairs = pairwise_distances(unpacked_data, metric="jaccard")
+    for i in range(test_data.shape[0]):
+        for j in range(i + 1, test_data.shape[0]):
+            d1 = 1.0 - np.exp(-dist.bit_jaccard(test_data[i], test_data[j]))
+            d2 = all_pairs[i, j]
+            assert np.isclose(d1, d2)


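The bit_jaccard test above depends on the metric returning a log-transformed
similarity rather than the conventional Jaccard distance: bit_jaccard computes
-log(|x AND y| / |x OR y|), so 1 - exp(-d) recovers the standard distance. A
short sketch of the equivalence (jaccard_from_bits is our name):

    import numpy as np

    def jaccard_from_bits(x, y):
        # x, y are bit-packed uint8 vectors.
        inter = np.unpackbits(x & y).sum()
        union = np.unpackbits(x | y).sum()
        return 1.0 - inter / union     # equals 1 - exp(-bit_jaccard(x, y))
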
=====================================
pynndescent/tests/test_pynndescent_.py
=====================================
@@ -25,7 +25,7 @@ def test_nn_descent_neighbor_accuracy(nn_data, seed):
 
     num_correct = 0.0
     for i in range(nn_data.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (nn_data.shape[0] * 10)
     assert (
@@ -44,7 +44,7 @@ def test_angular_nn_descent_neighbor_accuracy(nn_data, seed):
 
     num_correct = 0.0
     for i in range(nn_data.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (nn_data.shape[0] * 10)
     assert (
@@ -52,8 +52,34 @@ def test_angular_nn_descent_neighbor_accuracy(nn_data, seed):
     ), "NN-descent did not get 99% accuracy on nearest neighbors"
 
 
+def test_bitpacked_nn_descent_neighbor_accuracy(nn_data, seed):
+    bitpacked_data = (nn_data * 256).astype(np.uint8)
+    unpacked_data = np.zeros(
+        (bitpacked_data.shape[0], bitpacked_data.shape[1] * 8), dtype=np.float32
+    )
+    for i in range(unpacked_data.shape[0]):
+        for j in range(unpacked_data.shape[1]):
+            unpacked_data[i, j] = (bitpacked_data[i, j // 8] & (1 << (j % 8))) > 0
+
+    knn_indices, _ = NNDescent(
+        bitpacked_data, "bit_jaccard", {}, 10, random_state=np.random.RandomState(seed)
+    )._neighbor_graph
+
+    nn_finder = NearestNeighbors(n_neighbors=10, metric="jaccard").fit(unpacked_data)
+    true_indices = nn_finder.kneighbors(unpacked_data, 10, return_distance=False)
+
+    num_correct = 0.0
+    for i in range(nn_data.shape[0]):
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
+
+    percent_correct = num_correct / (nn_data.shape[0] * 10)
+    assert (
+        percent_correct >= 0.60
+    ), "NN-descent did not get 60% accuracy on nearest neighbors"
+
+
 @pytest.mark.skipif(
-    list(map(int, scipy.version.version.split("."))) < [1, 3, 0],
+    list(map(int, re.findall(r"[0-9]+\.[0-9]+\.?[0-9]*", scipy.version.version)[0].split("."))) < [1, 3, 0],
     reason="requires scipy >= 1.3.0",
 )
 def test_sparse_nn_descent_neighbor_accuracy(sparse_nn_data, seed):
@@ -66,7 +92,7 @@ def test_sparse_nn_descent_neighbor_accuracy(sparse_nn_data, seed):
 
     num_correct = 0.0
     for i in range(sparse_nn_data.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (sparse_nn_data.shape[0] * 10)
     assert (
@@ -89,7 +115,7 @@ def test_sparse_angular_nn_descent_neighbor_accuracy(sparse_nn_data):
 
     num_correct = 0.0
     for i in range(sparse_nn_data.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (sparse_nn_data.shape[0] * 10)
     assert (
@@ -106,7 +132,7 @@ def test_nn_descent_query_accuracy(nn_data):
 
     num_correct = 0.0
     for i in range(true_indices.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (true_indices.shape[0] * 10)
     assert (
@@ -123,7 +149,7 @@ def test_nn_descent_query_accuracy_angular(nn_data):
 
     num_correct = 0.0
     for i in range(true_indices.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (true_indices.shape[0] * 10)
     assert (
@@ -142,7 +168,7 @@ def test_sparse_nn_descent_query_accuracy(sparse_nn_data):
 
     num_correct = 0.0
     for i in range(true_indices.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (true_indices.shape[0] * 10)
     assert (
@@ -161,7 +187,7 @@ def test_sparse_nn_descent_query_accuracy_angular(sparse_nn_data):
 
     num_correct = 0.0
     for i in range(true_indices.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (true_indices.shape[0] * 10)
     assert (
@@ -169,6 +195,35 @@ def test_sparse_nn_descent_query_accuracy_angular(sparse_nn_data):
     ), "Sparse NN-descent query did not get 95% accuracy on nearest neighbors"
 
 
+def test_bitpacked_nn_descent_query_accuracy(nn_data):
+    bitpacked_data = (nn_data * 256).astype(np.uint8)
+    unpacked_data = np.zeros(
+        (bitpacked_data.shape[0], bitpacked_data.shape[1] * 8), dtype=np.float32
+    )
+    for i in range(unpacked_data.shape[0]):
+        for j in range(unpacked_data.shape[1]):
+            unpacked_data[i, j] = (bitpacked_data[i, j // 8] & (1 << (j % 8))) > 0
+
+    nnd = NNDescent(
+        bitpacked_data[200:], "bit_jaccard", n_neighbors=50, random_state=None
+    )
+    knn_indices, _ = nnd.query(bitpacked_data[:200], k=10, epsilon=0.36)
+
+    nn = NearestNeighbors(metric="jaccard").fit(unpacked_data[200:])
+    true_indices = nn.kneighbors(
+        unpacked_data[:200], n_neighbors=10, return_distance=False
+    )
+
+    num_correct = 0.0
+    for i in range(true_indices.shape[0]):
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
+
+    percent_correct = num_correct / (true_indices.shape[0] * 10)
+    assert (
+        percent_correct >= 0.80
+    ), "Sparse NN-descent query did not get 95% accuracy on nearest neighbors"
+
+
 def test_transformer_equivalence(nn_data):
     N_NEIGHBORS = 15
     EPSILON = 0.15
@@ -206,7 +261,7 @@ def test_random_state_none(nn_data, spatial_data):
 
     num_correct = 0.0
     for i in range(nn_data.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (spatial_data.shape[0] * 10)
     assert (
@@ -279,7 +334,7 @@ def test_deduplicated_data_behaves_normally(seed, cosine_hang_data):
 
     num_correct = 0
     for i in range(data.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     proportion_correct = num_correct / (data.shape[0] * n_neighbors)
     assert (
@@ -287,7 +342,9 @@ def test_deduplicated_data_behaves_normally(seed, cosine_hang_data):
     ), "NN-descent did not get 95% accuracy on nearest neighbors"
 
 
-def test_rp_trees_should_not_stack_overflow_with_near_duplicate_data(seed, cosine_near_duplicates_data):
+def test_rp_trees_should_not_stack_overflow_with_near_duplicate_data(
+    seed, cosine_near_duplicates_data
+):
 
     n_neighbors = 10
     knn_indices, _ = NNDescent(
@@ -467,7 +524,7 @@ def test_update_no_prepare_query_accuracy(nn_data, metric):
 
     num_correct = 0.0
     for i in range(true_indices.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (true_indices.shape[0] * 10)
     assert percent_correct >= 0.95, (
@@ -496,7 +553,7 @@ def test_update_w_prepare_query_accuracy(nn_data, metric):
 
     num_correct = 0.0
     for i in range(true_indices.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (true_indices.shape[0] * 10)
     assert percent_correct >= 0.95, (
@@ -525,7 +582,7 @@ def test_update_w_prepare_query_accuracy(nn_data, metric):
 
     num_correct = 0.0
     for i in range(true_indices.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (true_indices.shape[0] * 10)
     assert percent_correct >= 0.95, (
@@ -537,7 +594,7 @@ def evaluate_predictions(neighbors_true, neigbhors_computed, n_neighbors):
     n_correct = 0
     n_all = neighbors_true.shape[0] * n_neighbors
     for i in range(neighbors_true.shape[0]):
-        n_correct += np.sum(np.in1d(neighbors_true[i], neigbhors_computed[i]))
+        n_correct += np.sum(np.isin(neighbors_true[i], neigbhors_computed[i]))
     return n_correct / n_all
 
 
@@ -612,7 +669,7 @@ def test_tree_init_false(nn_data, metric):
 
     num_correct = 0.0
     for i in range(true_indices.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (true_indices.shape[0] * 10)
     assert percent_correct >= 0.95, (
@@ -640,7 +697,7 @@ def test_one_dimensional_data(nn_data, metric):
 
     num_correct = 0.0
     for i in range(true_indices.shape[0]):
-        num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
     percent_correct = num_correct / (true_indices.shape[0] * 10)
     assert percent_correct >= 0.95, (
@@ -673,7 +730,7 @@ def test_tree_no_split(small_data, sparse_small_data, metric):
 
         num_correct = 0.0
         for i in range(true_indices.shape[0]):
-            num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+            num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
 
         percent_correct = num_correct / (true_indices.shape[0] * k)
         assert (
@@ -682,7 +739,12 @@ def test_tree_no_split(small_data, sparse_small_data, metric):
             data_type
         )
 
-@pytest.mark.skipif('NUMBA_DISABLE_JIT' in os.environ, reason="Too expensive for disabled Numba")
+
+@pytest.mark.skipif(
+    "NUMBA_DISABLE_JIT" in os.environ, reason="Too expensive for disabled Numba"
+)
 def test_bad_data():
-    data = np.sqrt(np.load("pynndescent/tests/test_data/pynndescent_bug_np.npz")['arr_0'])
+    data = np.sqrt(
+        np.load("pynndescent/tests/test_data/pynndescent_bug_np.npz")["arr_0"]
+    )
     index = NNDescent(data, metric="cosine")


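The revised skipif guard above is needed because development builds of SciPy
report versions such as "1.14.0.dev0+2081.g4f6c1be" (an illustrative string),
where mapping int over every dot-separated token raises ValueError. The regex
extracts the leading numeric release first; roughly:

    import re

    version = "1.14.0.dev0+2081.g4f6c1be"
    release = re.findall(r"[0-9]+\.[0-9]+\.?[0-9]*", version)[0]   # "1.14.0"
    assert list(map(int, release.split("."))) == [1, 14, 0]
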
=====================================
pynndescent/utils.py
=====================================
@@ -192,7 +192,7 @@ def make_heap(n_points, size):
     heap: An ndarray suitable for passing to other numba enabled heap functions.
     """
     indices = np.full((int(n_points), int(size)), -1, dtype=np.int32)
-    distances = np.full((int(n_points), int(size)), np.infty, dtype=np.float32)
+    distances = np.full((int(n_points), int(size)), np.inf, dtype=np.float32)
     flags = np.zeros((int(n_points), int(size)), dtype=np.uint8)
     result = (indices, distances, flags)
 


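The utils.py change is a small compatibility fix: np.infty is one of the
aliases removed in NumPy 2.0, while np.inf works across versions:

    import numpy as np

    # np.infty raises AttributeError on NumPy >= 2.0; np.inf does not.
    distances = np.full((4, 8), np.inf, dtype=np.float32)
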
=====================================
setup.py
=====================================
@@ -8,7 +8,7 @@ def readme():
 
 configuration = {
     "name": "pynndescent",
-    "version": "0.5.11",
+    "version": "0.5.13",
     "description": "Nearest Neighbor Descent",
     "long_description": readme(),
     "classifiers": [
@@ -23,9 +23,10 @@ configuration = {
         "Operating System :: POSIX",
         "Operating System :: Unix",
         "Operating System :: MacOS",
-        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
     ],
     "keywords": "nearest neighbor, knn, ANN",
     "url": "http://github.com/lmcinnes/pynndescent",



View it on GitLab: https://salsa.debian.org/python-team/packages/python-pynndescent/-/commit/32f80e101963de4d4d68ea86562fd4d0a172082e
