[med-svn] [Git][python-team/packages/python-pynndescent][master] 8 commits: New upstream version
Michael R. Crusoe (@crusoe)
gitlab@salsa.debian.org
Mon Sep 1 16:10:38 BST 2025
Michael R. Crusoe pushed to branch master at Debian Python Team / packages / python-pynndescent
Commits:
9149462f by Michael R. Crusoe at 2025-09-01T15:55:05+02:00
New upstream version
- - - - -
32f80e10 by Michael R. Crusoe at 2025-09-01T15:55:06+02:00
New upstream version 0.5.13
- - - - -
5b80b2d0 by Michael R. Crusoe at 2025-09-01T15:55:09+02:00
Update upstream source from tag 'upstream/0.5.13'
Update to upstream version '0.5.13'
with Debian dir 5d8bd7a33c1614eff797c17e50aad3d179a554e7
- - - - -
dd8dc432 by Michael R. Crusoe at 2025-09-01T15:55:09+02:00
Standards-Version: 4.7.2 (routine-update)
- - - - -
36441338 by Michael R. Crusoe at 2025-09-01T15:57:08+02:00
Refreshed the patches.
- - - - -
0287b613 by Michael R. Crusoe at 2025-09-01T16:12:27+02:00
New upstream release fixes many build failures. Closes: #1090284, #1058483, #1063636
- - - - -
bc9f5422 by Michael R. Crusoe at 2025-09-01T16:12:43+02:00
upload to unstable
- - - - -
7280b697 by Michael R. Crusoe at 2025-09-01T16:15:01+02:00
routine-update: Ready to upload to unstable
- - - - -
18 changed files:
- PKG-INFO
- debian/changelog
- debian/control
- debian/patches/arm.patch
- debian/patches/test-load-pynndescent_bug_np.npz-from-relative-path.patch
- − pynndescent.egg-info/PKG-INFO
- − pynndescent.egg-info/SOURCES.txt
- − pynndescent.egg-info/dependency_links.txt
- − pynndescent.egg-info/not-zip-safe
- − pynndescent.egg-info/requires.txt
- − pynndescent.egg-info/top_level.txt
- pynndescent/distances.py
- pynndescent/pynndescent_.py
- pynndescent/rp_trees.py
- pynndescent/tests/test_distances.py
- pynndescent/tests/test_pynndescent_.py
- pynndescent/utils.py
- setup.py
Changes:
=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: pynndescent
-Version: 0.5.11
+Version: 0.5.13
Summary: Nearest Neighbor Descent
Home-page: http://github.com/lmcinnes/pynndescent
Author: Leland McInnes
@@ -20,9 +20,10 @@ Classifier: Operating System :: Microsoft :: Windows
Classifier: Operating System :: POSIX
Classifier: Operating System :: Unix
Classifier: Operating System :: MacOS
-Classifier: Programming Language :: Python :: 3.6
-Classifier: Programming Language :: Python :: 3.7
-Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
License-File: LICENSE
.. image:: doc/pynndescent_logo.png
=====================================
debian/changelog
=====================================
@@ -1,3 +1,11 @@
+python-pynndescent (0.5.13-1) unstable; urgency=medium
+
+ * New upstream version. Closes: #1090284, #1058483, #1063636
+ * Standards-Version: 4.7.2 (routine-update)
+ * Refreshed the patches.
+
+ -- Michael R. Crusoe <crusoe@debian.org> Mon, 01 Sep 2025 16:12:31 +0200
+
python-pynndescent (0.5.11-1) unstable; urgency=medium
[ Andreas Tille ]
=====================================
debian/control
=====================================
@@ -13,7 +13,7 @@ Build-Depends: debhelper-compat (= 13),
python3-scipy <!nocheck>,
python3-setuptools,
python3-sklearn <!nocheck>
-Standards-Version: 4.6.2
+Standards-Version: 4.7.2
Vcs-Browser: https://salsa.debian.org/python-team/packages/python-pynndescent
Vcs-Git: https://salsa.debian.org/python-team/packages/python-pynndescent.git
Homepage: https://github.com/lmcinnes/pynndescent/
=====================================
debian/patches/arm.patch
=====================================
@@ -8,11 +8,9 @@ Subject: Some tests fail on arm, skip.
pynndescent/tests/test_rank.py | 4 ++++
3 files changed, 18 insertions(+)
-diff --git a/pynndescent/tests/test_distances.py b/pynndescent/tests/test_distances.py
-index 83898a1..d366446 100644
---- a/pynndescent/tests/test_distances.py
-+++ b/pynndescent/tests/test_distances.py
-@@ -9,6 +9,7 @@ from scipy.version import full_version as scipy_full_version
+--- python-pynndescent.orig/pynndescent/tests/test_distances.py
++++ python-pynndescent/pynndescent/tests/test_distances.py
+@@ -9,6 +9,7 @@
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import BallTree
from sklearn.preprocessing import normalize
@@ -20,7 +18,7 @@ index 83898a1..d366446 100644
@pytest.mark.parametrize(
-@@ -106,6 +107,9 @@ def test_binary_check(binary_data, metric):
+@@ -106,6 +107,9 @@
],
)
def test_sparse_spatial_check(sparse_spatial_data, metric, decimal=6):
@@ -30,7 +28,7 @@ index 83898a1..d366446 100644
if metric in spdist.sparse_named_distances:
dist_matrix = pairwise_distances(
np.asarray(sparse_spatial_data.todense()).astype(np.float32), metric=metric
-@@ -331,6 +335,9 @@ def test_alternative_distances():
+@@ -333,6 +337,9 @@
def test_jensen_shannon():
@@ -40,7 +38,7 @@ index 83898a1..d366446 100644
test_data = np.random.random(size=(10, 50))
test_data = normalize(test_data, norm="l1")
for i in range(test_data.shape[0]):
-@@ -347,6 +354,9 @@ def test_jensen_shannon():
+@@ -349,6 +356,9 @@
def test_sparse_jensen_shannon():
@@ -50,11 +48,9 @@ index 83898a1..d366446 100644
test_data = np.random.random(size=(10, 100))
# sparsify
test_data[test_data <= 0.5] = 0.0
-diff --git a/pynndescent/tests/test_pynndescent_.py b/pynndescent/tests/test_pynndescent_.py
-index b36fded..6be146b 100644
---- a/pynndescent/tests/test_pynndescent_.py
-+++ b/pynndescent/tests/test_pynndescent_.py
-@@ -11,9 +11,13 @@ from sklearn.preprocessing import normalize
+--- python-pynndescent.orig/pynndescent/tests/test_pynndescent_.py
++++ python-pynndescent/pynndescent/tests/test_pynndescent_.py
+@@ -11,9 +11,13 @@
import pickle
import joblib
import scipy
@@ -68,10 +64,8 @@ index b36fded..6be146b 100644
def test_nn_descent_neighbor_accuracy(nn_data, seed):
knn_indices, _ = NNDescent(
-diff --git a/pynndescent/tests/test_rank.py b/pynndescent/tests/test_rank.py
-index e75d96a..6e89798 100644
---- a/pynndescent/tests/test_rank.py
-+++ b/pynndescent/tests/test_rank.py
+--- python-pynndescent.orig/pynndescent/tests/test_rank.py
++++ python-pynndescent/pynndescent/tests/test_rank.py
@@ -1,9 +1,13 @@
import pytest
import numpy as np
=====================================
debian/patches/test-load-pynndescent_bug_np.npz-from-relative-path.patch
=====================================
@@ -15,10 +15,8 @@ Signed-off-by: Benjamin Drung <benjamin.drung@canonical.com>
pynndescent/tests/test_pynndescent_.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
-diff --git a/pynndescent/tests/test_pynndescent_.py b/pynndescent/tests/test_pynndescent_.py
-index 6be146b..cc59d14 100644
---- a/pynndescent/tests/test_pynndescent_.py
-+++ b/pynndescent/tests/test_pynndescent_.py
+--- python-pynndescent.orig/pynndescent/tests/test_pynndescent_.py
++++ python-pynndescent/pynndescent/tests/test_pynndescent_.py
@@ -1,6 +1,7 @@
import os
import io
@@ -27,11 +25,13 @@ index 6be146b..cc59d14 100644
import pytest
from contextlib import redirect_stdout
-@@ -688,5 +689,6 @@ def test_tree_no_split(small_data, sparse_small_data, metric):
-
- @pytest.mark.skipif('NUMBA_DISABLE_JIT' in os.environ, reason="Too expensive for disabled Numba")
+@@ -748,7 +749,8 @@
+ "NUMBA_DISABLE_JIT" in os.environ, reason="Too expensive for disabled Numba"
+ )
def test_bad_data():
-- data = np.sqrt(np.load("pynndescent/tests/test_data/pynndescent_bug_np.npz")['arr_0'])
+ test_data_dir = pathlib.Path(__file__).parent / "test_data"
-+ data = np.sqrt(np.load(test_data_dir / "pynndescent_bug_np.npz")['arr_0'])
+ data = np.sqrt(
+- np.load("pynndescent/tests/test_data/pynndescent_bug_np.npz")["arr_0"]
++ np.load(test_data_dir / "pynndescent_bug_np.npz")["arr_0"]
+ )
index = NNDescent(data, metric="cosine")
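For context, the substance of this patch is unchanged by the refresh: the test-data directory is resolved relative to the test module rather than the process working directory, so the test passes no matter where pytest is invoked from. The pattern in isolation (a minimal sketch):

    import pathlib

    # __file__ is the test module itself; test_data sits next to it, so the
    # lookup no longer depends on the current working directory.
    test_data_dir = pathlib.Path(__file__).parent / "test_data"
    bug_file = test_data_dir / "pynndescent_bug_np.npz"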
=====================================
pynndescent.egg-info/PKG-INFO deleted
=====================================
@@ -1,226 +0,0 @@
-Metadata-Version: 2.1
-Name: pynndescent
-Version: 0.5.11
-Summary: Nearest Neighbor Descent
-Home-page: http://github.com/lmcinnes/pynndescent
-Author: Leland McInnes
-Author-email: leland.mcinnes@gmail.com
-Maintainer: Leland McInnes
-Maintainer-email: leland.mcinnes@gmail.com
-License: BSD
-Keywords: nearest neighbor,knn,ANN
-Classifier: Development Status :: 3 - Alpha
-Classifier: Intended Audience :: Science/Research
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved
-Classifier: Programming Language :: Python
-Classifier: Topic :: Software Development
-Classifier: Topic :: Scientific/Engineering
-Classifier: Operating System :: Microsoft :: Windows
-Classifier: Operating System :: POSIX
-Classifier: Operating System :: Unix
-Classifier: Operating System :: MacOS
-Classifier: Programming Language :: Python :: 3.6
-Classifier: Programming Language :: Python :: 3.7
-Classifier: Programming Language :: Python :: 3.8
-License-File: LICENSE
-
-.. image:: doc/pynndescent_logo.png
- :width: 600
- :align: center
- :alt: PyNNDescent Logo
-
-.. image:: https://dev.azure.com/TutteInstitute/build-pipelines/_apis/build/status%2Flmcinnes.pynndescent?branchName=master
- :target: https://dev.azure.com/TutteInstitute/build-pipelines/_build?definitionId=17
- :alt: Azure Pipelines Build Status
-.. image:: https://readthedocs.org/projects/pynndescent/badge/?version=latest
- :target: https://pynndescent.readthedocs.io/en/latest/?badge=latest
- :alt: Documentation Status
-
-===========
-PyNNDescent
-===========
-
-PyNNDescent is a Python nearest neighbor descent for approximate nearest neighbors.
-It provides a python implementation of Nearest Neighbor
-Descent for k-neighbor-graph construction and approximate nearest neighbor
-search, as per the paper:
-
-Dong, Wei, Charikar Moses, and Kai Li.
-*"Efficient k-nearest neighbor graph construction for generic similarity
-measures."*
-Proceedings of the 20th international conference on World wide web. ACM, 2011.
-
-This library supplements that approach with the use of random projection trees for
-initialisation. This can be particularly useful for the metrics that are
-amenable to such approaches (euclidean, minkowski, angular, cosine, etc.). Graph
-diversification is also performed, pruning the longest edges of any triangles in the
-graph.
-
-Currently this library targets relatively high accuracy
-(80%-100% accuracy rate) approximate nearest neighbor searches.
-
---------------------
-Why use PyNNDescent?
---------------------
-
-PyNNDescent provides fast approximate nearest neighbor queries. The
-`ann-benchmarks <https://github.com/erikbern/ann-benchmarks>`_ system puts it
-solidly in the mix of top performing ANN libraries:
-
-**SIFT-128 Euclidean**
-
-.. image:: https://pynndescent.readthedocs.io/en/latest/_images/sift.png
- :alt: ANN benchmark performance for SIFT 128 dataset
-
-**NYTimes-256 Angular**
-
-.. image:: https://pynndescent.readthedocs.io/en/latest/_images/nytimes.png
- :alt: ANN benchmark performance for NYTimes 256 dataset
-
-While PyNNDescent is among fastest ANN library, it is also both easy to install (pip
-and conda installable) with no platform or compilation issues, and is very flexible,
-supporting a wide variety of distance metrics by default:
-
-**Minkowski style metrics**
-
-- euclidean
-- manhattan
-- chebyshev
-- minkowski
-
-**Miscellaneous spatial metrics**
-
-- canberra
-- braycurtis
-- haversine
-
-**Normalized spatial metrics**
-
-- mahalanobis
-- wminkowski
-- seuclidean
-
-**Angular and correlation metrics**
-
-- cosine
-- dot
-- correlation
-- spearmanr
-- tsss
-- true_angular
-
-**Probability metrics**
-
-- hellinger
-- wasserstein
-
-**Metrics for binary data**
-
-- hamming
-- jaccard
-- dice
-- russelrao
-- kulsinski
-- rogerstanimoto
-- sokalmichener
-- sokalsneath
-- yule
-
-and also custom user defined distance metrics while still retaining performance.
-
-PyNNDescent also integrates well with Scikit-learn, including providing support
-for the KNeighborTransformer as a drop in replacement for algorithms
-that make use of nearest neighbor computations.
-
-----------------------
-How to use PyNNDescent
-----------------------
-
-PyNNDescent aims to have a very simple interface. It is similar to (but more
-limited than) KDTrees and BallTrees in ``sklearn``. In practice there are
-only two operations -- index construction, and querying an index for nearest
-neighbors.
-
-To build a new search index on some training data ``data`` you can do something
-like
-
-.. code:: python
-
- from pynndescent import NNDescent
- index = NNDescent(data)
-
-You can then use the index for searching (and can pickle it to disk if you
-wish). To search a pynndescent index for the 15 nearest neighbors of a test data
-set ``query_data`` you can do something like
-
-.. code:: python
-
- index.query(query_data, k=15)
-
-and that is pretty much all there is to it. You can find more details in the
-`documentation <https://pynndescent.readthedocs.org>`_.
-
-----------
-Installing
-----------
-
-PyNNDescent is designed to be easy to install being a pure python module with
-relatively light requirements:
-
-* numpy
-* scipy
-* scikit-learn >= 0.22
-* numba >= 0.51
-
-all of which should be pip or conda installable. The easiest way to install should be
-via conda:
-
-.. code:: bash
-
- conda install -c conda-forge pynndescent
-
-or via pip:
-
-.. code:: bash
-
- pip install pynndescent
-
-To manually install this package:
-
-.. code:: bash
-
- wget https://github.com/lmcinnes/pynndescent/archive/master.zip
- unzip master.zip
- rm master.zip
- cd pynndescent-master
- python setup.py install
-
-----------------
-Help and Support
-----------------
-
-This project is still young. The documentation is still growing. In the meantime please
-`open an issue <https://github.com/lmcinnes/pynndescent/issues/new>`_
-and I will try to provide any help and guidance that I can. Please also check
-the docstrings on the code, which provide some descriptions of the parameters.
-
--------
-License
--------
-
-The pynndescent package is 2-clause BSD licensed. Enjoy.
-
-------------
-Contributing
-------------
-
-Contributions are more than welcome! There are lots of opportunities
-for potential projects, so please get in touch if you would like to
-help out. Everything from code to notebooks to
-examples and documentation are all *equally valuable* so please don't feel
-you can't contribute. To contribute please `fork the project <https://github.com/lmcinnes/pynndescent/issues#fork-destination-box>`_ make your changes and
-submit a pull request. We will do our best to work through any issues with
-you and get your code merged into the main branch.
-
-
=====================================
pynndescent.egg-info/SOURCES.txt deleted
=====================================
@@ -1,31 +0,0 @@
-CODE_OF_CONDUCT.md
-CONTRIBUTING.md
-LICENSE
-MANIFEST.in
-README.rst
-requirements.txt
-setup.py
-pynndescent/__init__.py
-pynndescent/distances.py
-pynndescent/graph_utils.py
-pynndescent/optimal_transport.py
-pynndescent/pynndescent_.py
-pynndescent/rp_trees.py
-pynndescent/sparse.py
-pynndescent/sparse_nndescent.py
-pynndescent/threaded_rp_trees.py
-pynndescent/utils.py
-pynndescent.egg-info/PKG-INFO
-pynndescent.egg-info/SOURCES.txt
-pynndescent.egg-info/dependency_links.txt
-pynndescent.egg-info/not-zip-safe
-pynndescent.egg-info/requires.txt
-pynndescent.egg-info/top_level.txt
-pynndescent/tests/__init__.py
-pynndescent/tests/conftest.py
-pynndescent/tests/test_distances.py
-pynndescent/tests/test_pynndescent_.py
-pynndescent/tests/test_rank.py
-pynndescent/tests/test_data/cosine_hang.npy
-pynndescent/tests/test_data/cosine_near_duplicates.npy
-pynndescent/tests/test_data/pynndescent_bug_np.npz
\ No newline at end of file
=====================================
pynndescent.egg-info/dependency_links.txt deleted
=====================================
@@ -1 +0,0 @@
-
=====================================
pynndescent.egg-info/not-zip-safe deleted
=====================================
@@ -1 +0,0 @@
-
=====================================
pynndescent.egg-info/requires.txt deleted
=====================================
@@ -1,8 +0,0 @@
-scikit-learn>=0.18
-scipy>=1.0
-numba>=0.51.2
-llvmlite>=0.30
-joblib>=0.11
-
-[:python_version < "3.8"]
-importlib-metadata>=4.8.1
=====================================
pynndescent.egg-info/top_level.txt deleted
=====================================
@@ -1 +0,0 @@
-pynndescent
=====================================
pynndescent/distances.py
=====================================
@@ -22,6 +22,11 @@ _dummy_cost = np.zeros((2, 2), dtype=np.float64)
FLOAT32_EPS = np.finfo(np.float32).eps
FLOAT32_MAX = np.finfo(np.float32).max
+popcnt = np.array(
+ [bin(i).count('1') for i in range(256)],
+ dtype=np.float32
+)
+
@numba.njit(fastmath=True)
def euclidean(x, y):
@@ -890,6 +895,65 @@ def symmetric_kl_divergence(x, y):
return result
+@numba.njit(
+ [
+ "f4(u1[::1],u1[::1])",
+ numba.types.float32(
+ numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+ numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+ ),
+ ],
+ fastmath=True,
+ locals={
+ "result": numba.types.float32,
+ "intersection": numba.types.uint8,
+ "dim": numba.types.intp,
+ "i": numba.types.uint16,
+ },
+)
+def bit_hamming(x, y):
+ result = 0.0
+ dim = x.shape[0]
+
+ for i in range(dim):
+ intersection = x[i] ^ y[i]
+ result += popcnt[intersection]
+
+ return result
+
+
+@numba.njit(
+ [
+ "f4(u1[::1],u1[::1])",
+ numba.types.float32(
+ numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+ numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+ ),
+ ],
+ fastmath=True,
+ locals={
+ "result": numba.types.float32,
+ "denom": numba.types.float32,
+ "and_": numba.types.uint8,
+ "or_": numba.types.uint8,
+ "dim": numba.types.intp,
+ "i": numba.types.uint16,
+ },
+)
+def bit_jaccard(x, y):
+ result = 0.0
+ denom = 0.0
+ dim = x.shape[0]
+
+ for i in range(dim):
+ and_ = x[i] & y[i]
+ or_ = x[i] | y[i]
+ result += popcnt[and_]
+ denom += popcnt[or_]
+
+ return -np.log(result / denom)
+
+
named_distances = {
# general minkowski distances
"euclidean": euclidean,
@@ -946,6 +1010,8 @@ named_distances = {
"sokalsneath": sokal_sneath,
"sokalmichener": sokal_michener,
"yule": yule,
+ "bit_hamming": bit_hamming,
+ "bit_jaccard": bit_jaccard,
}
# Some distances have a faster to compute alternative that
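In the bit-packed metrics added above, each uint8 entry carries 8 boolean features, and the 256-entry popcnt lookup table converts a byte produced by XOR (for bit_hamming) or AND/OR (for bit_jaccard) into a bit count. A minimal pure-NumPy sketch of the same idea (illustrative only; the shipped functions are the numba-compiled ones above):

    import numpy as np

    # 256-entry popcount table, exactly as defined in the patch
    popcnt = np.array([bin(i).count("1") for i in range(256)], dtype=np.float32)

    def bit_hamming_demo(x, y):
        # XOR each byte pair, then count the differing bits via the table
        return popcnt[x ^ y].sum()

    bits_a = np.array([1, 0, 1, 1, 0, 0, 1, 0], dtype=np.uint8)
    bits_b = np.array([1, 1, 1, 0, 0, 0, 1, 0], dtype=np.uint8)
    a, b = np.packbits(bits_a), np.packbits(bits_b)  # 8 bits -> 1 byte each
    print(bit_hamming_demo(a, b))  # 2.0: the vectors differ in two bit positions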
=====================================
pynndescent/pynndescent_.py
=====================================
@@ -49,6 +49,7 @@ from pynndescent.rp_trees import (
denumbaify_tree,
renumbaify_tree,
select_side,
+ select_side_bit,
sparse_select_side,
score_linked_tree,
)
@@ -728,7 +729,13 @@ class NNDescent:
else:
copy_on_normalize = False
- data = check_array(data, dtype=np.float32, accept_sparse="csr", order="C")
+ if metric in ("bit_hamming", "bit_jaccard"):
+ data = check_array(data, dtype=np.uint8, order="C")
+ self._input_dtype = np.uint8
+ else:
+ data = check_array(data, dtype=np.float32, accept_sparse="csr", order="C")
+ self._input_dtype = np.float32
+
self._raw_data = data
if not tree_init or n_trees == 0 or init_graph is not None:
@@ -744,32 +751,9 @@ class NNDescent:
current_random_state = check_random_state(self.random_state)
self._distance_correction = None
-
- if callable(metric):
- _distance_func = metric
- elif metric in pynnd_dist.named_distances:
- if metric in pynnd_dist.fast_distance_alternatives:
- _distance_func = pynnd_dist.fast_distance_alternatives[metric]["dist"]
- self._distance_correction = pynnd_dist.fast_distance_alternatives[
- metric
- ]["correction"]
- else:
- _distance_func = pynnd_dist.named_distances[metric]
- else:
- raise ValueError("Metric is neither callable, " + "nor a recognised string")
-
- # Create a partial function for distances with arguments
- if len(self._dist_args) > 0:
- dist_args = self._dist_args
-
- @numba.njit()
- def _partial_dist_func(x, y):
- return _distance_func(x, y, *dist_args)
-
- self._distance_func = _partial_dist_func
- else:
- self._distance_func = _distance_func
-
+
+ self._set_distance_func()
+
if metric in (
"cosine",
"dot",
@@ -778,10 +762,17 @@ class NNDescent:
"jaccard",
"hellinger",
"hamming",
+ "bit_hamming",
+ "bit_jaccard",
):
self._angular_trees = True
+ if metric in ("bit_hamming", "bit_jaccard"):
+ self._bit_trees = True
+ else:
+ self._bit_trees = False
else:
self._angular_trees = False
+ self._bit_trees = False
if metric == "dot":
data = normalize(data, norm="l2", copy=copy_on_normalize)
@@ -809,6 +800,7 @@ class NNDescent:
current_random_state,
self.n_jobs,
self._angular_trees,
+ self._bit_trees,
max_depth=self.max_rptree_depth,
)
leaf_array = rptree_leaf_array(self._rp_forest)
@@ -952,6 +944,32 @@ class NNDescent:
numba.set_num_threads(self._original_num_threads)
+ def _set_distance_func(self):
+ if callable(self.metric):
+ _distance_func = self.metric
+ elif self.metric in pynnd_dist.named_distances:
+ if self.metric in pynnd_dist.fast_distance_alternatives:
+ _distance_func = pynnd_dist.fast_distance_alternatives[self.metric]["dist"]
+ self._distance_correction = pynnd_dist.fast_distance_alternatives[
+ self.metric
+ ]["correction"]
+ else:
+ _distance_func = pynnd_dist.named_distances[self.metric]
+ else:
+ raise ValueError("Metric is neither callable, " + "nor a recognised string")
+
+ # Create a partial function for distances with arguments
+ if len(self._dist_args) > 0:
+ dist_args = self._dist_args
+
+ @numba.njit()
+ def _partial_dist_func(x, y):
+ return _distance_func(x, y, *dist_args)
+
+ self._distance_func = _partial_dist_func
+ else:
+ self._distance_func = _distance_func
+
def __getstate__(self):
if not hasattr(self, "_search_graph"):
self._init_search_graph()
@@ -970,6 +988,7 @@ class NNDescent:
def __setstate__(self, d):
self.__dict__ = d
+ self._set_distance_func()
self._search_forest = tuple(
[renumbaify_tree(tree) for tree in d["_search_forest"]]
)
@@ -1210,27 +1229,50 @@ class NNDescent:
tree_indices = self._search_forest[0].indices
tree_children = self._search_forest[0].children
- @numba.njit(
- [
- numba.types.Array(numba.types.int32, 1, "C", readonly=True)(
- numba.types.Array(numba.types.float32, 1, "C", readonly=True),
- numba.types.Array(numba.types.int64, 1, "C", readonly=False),
- )
- ],
- locals={"node": numba.types.uint32, "side": numba.types.boolean},
- )
- def tree_search_closure(point, rng_state):
- node = 0
- while tree_children[node, 0] > 0:
- side = select_side(
- tree_hyperplanes[node], tree_offsets[node], point, rng_state
- )
- if side == 0:
- node = tree_children[node, 0]
- else:
- node = tree_children[node, 1]
+ if self._bit_trees:
+ @numba.njit(
+ [
+ numba.types.Array(numba.types.int32, 1, "C", readonly=True)(
+ numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+ numba.types.Array(numba.types.int64, 1, "C", readonly=False),
+ )
+ ],
+ locals={"node": numba.types.uint32, "side": numba.types.boolean},
+ )
+ def tree_search_closure(point, rng_state):
+ node = 0
+ while tree_children[node, 0] > 0:
+ side = select_side_bit(
+ tree_hyperplanes[node], tree_offsets[node], point, rng_state
+ )
+ if side == 0:
+ node = tree_children[node, 0]
+ else:
+ node = tree_children[node, 1]
- return -tree_children[node]
+ return -tree_children[node]
+ else:
+ @numba.njit(
+ [
+ numba.types.Array(numba.types.int32, 1, "C", readonly=True)(
+ numba.types.Array(numba.types.float32, 1, "C", readonly=True),
+ numba.types.Array(numba.types.int64, 1, "C", readonly=False),
+ )
+ ],
+ locals={"node": numba.types.uint32, "side": numba.types.boolean},
+ )
+ def tree_search_closure(point, rng_state):
+ node = 0
+ while tree_children[node, 0] > 0:
+ side = select_side(
+ tree_hyperplanes[node], tree_offsets[node], point, rng_state
+ )
+ if side == 0:
+ node = tree_children[node, 0]
+ else:
+ node = tree_children[node, 1]
+
+ return -tree_children[node]
self._tree_search = tree_search_closure
else:
@@ -1252,10 +1294,15 @@ class NNDescent:
n_neighbors = self.n_neighbors
parallel_search = self.parallel_batch_queries
+ if dist == pynnd_dist.bit_hamming or dist == pynnd_dist.bit_jaccard:
+ data_type = numba.types.uint8[::1]
+ else:
+ data_type = numba.types.float32[::1]
+
@numba.njit(
fastmath=True,
locals={
- "current_query": numba.types.float32[::1],
+ "current_query": data_type,
"i": numba.types.uint32,
"j": numba.types.uint32,
"heap_priorities": numba.types.float32[::1],
@@ -1267,7 +1314,7 @@ class NNDescent:
"visited": numba.types.uint8[::1],
"indices": numba.types.int32[::1],
"indptr": numba.types.int32[::1],
- "data": numba.types.float32[:, ::1],
+ "data": data_type,
"heap_size": numba.types.int16,
"distance_scale": numba.types.float32,
"distance_bound": numba.types.float32,
@@ -1693,7 +1740,11 @@ class NNDescent:
if not hasattr(self, "_search_function"):
self._init_search_function()
- query_data = np.asarray(query_data).astype(np.float32, order="C")
+ if self.metric in ("bit_hamming", "bit_jaccard"):
+ query_data = np.asarray(query_data).astype(np.uint8, order="C")
+ else:
+ query_data = np.asarray(query_data).astype(np.float32, order="C")
+
indices, dists, _ = self._search_function(
query_data, k, epsilon, self._visited, self.search_rng_state
)
@@ -1762,7 +1813,7 @@ class NNDescent:
# input checks
if xs_updated is not None:
xs_updated = check_array(
- xs_updated, dtype=np.float32, accept_sparse="csr", order="C"
+ xs_updated, dtype=self._input_dtype, accept_sparse="csr", order="C"
)
if updated_indices is None:
raise ValueError(
@@ -1798,13 +1849,13 @@ class NNDescent:
if xs_fresh is None:
if self._is_sparse:
xs_fresh = csr_matrix(
- ([], [], []), shape=(0, self._raw_data.shape[1]), dtype=np.float32
+ ([], [], []), shape=(0, self._raw_data.shape[1]), dtype=self._input_dtype
)
else:
- xs_fresh = np.zeros((0, self._raw_data.shape[1]), dtype=np.float32)
+ xs_fresh = np.zeros((0, self._raw_data.shape[1]), dtype=self._input_dtype)
else:
xs_fresh = check_array(
- xs_fresh, dtype=np.float32, accept_sparse="csr", order="C"
+ xs_fresh, dtype=self._input_dtype, accept_sparse="csr", order="C"
)
# data preparation
if hasattr(self, "_vertex_order"):
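Taken together, these changes let NNDescent accept bit-packed uint8 input whenever the metric is "bit_hamming" or "bit_jaccard", while every other metric keeps the float32 path. A usage sketch modelled on the new tests further down in this diff (data shapes and parameters are illustrative):

    import numpy as np
    from pynndescent import NNDescent

    rng = np.random.default_rng(42)
    bits = rng.integers(0, 2, size=(1000, 256), dtype=np.uint8)  # boolean features
    packed = np.packbits(bits, axis=1)  # shape (1000, 32): 8 features per byte

    # Index construction now accepts uint8 data for the bit-packed metrics
    index = NNDescent(packed, metric="bit_hamming", n_neighbors=15)

    # query() likewise casts query data to uint8 for these metrics (see above)
    neighbors, distances = index.query(packed[:5], k=5)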
=====================================
pynndescent/rp_trees.py
=====================================
@@ -33,10 +33,15 @@ FlatTree = namedtuple(
dense_hyperplane_type = numba.float32[::1]
sparse_hyperplane_type = numba.float64[:, ::1]
+bit_hyperplane_type = numba.uint8[::1]
offset_type = numba.float64
children_type = numba.typeof((np.int32(-1), np.int32(-1)))
point_indices_type = numba.int32[::1]
+popcnt = np.array(
+ [bin(i).count('1') for i in range(256)],
+ dtype=np.float32
+)
@numba.njit(
numba.types.Tuple(
@@ -171,6 +176,136 @@ def angular_random_projection_split(data, indices, rng_state):
return indices_left, indices_right, hyperplane_vector, 0.0
+@numba.njit(
+ numba.types.Tuple(
+ (numba.int32[::1], numba.int32[::1], bit_hyperplane_type, offset_type)
+ )(numba.uint8[:, ::1], numba.int32[::1], numba.int64[::1]),
+ locals={
+ "n_left": numba.uint32,
+ "n_right": numba.uint32,
+ "hyperplane_vector": numba.uint8[::1],
+ "hyperplane_offset": numba.float32,
+ "margin": numba.float32,
+ "d": numba.uint32,
+ "i": numba.uint32,
+ "left_index": numba.uint32,
+ "right_index": numba.uint32,
+ },
+ fastmath=True,
+ nogil=True,
+ cache=True,
+)
+def angular_bitpacked_random_projection_split(data, indices, rng_state):
+ """Given a set of ``graph_indices`` for graph_data points from ``graph_data``, create
+ a random hyperplane to split the graph_data, returning two arrays graph_indices
+ that fall on either side of the hyperplane. This is the basis for a
+ random projection tree, which simply uses this splitting recursively.
+ This particular split uses cosine distance to determine the hyperplane
+ and which side each graph_data sample falls on.
+ Parameters
+ ----------
+ data: array of shape (n_samples, n_features)
+ The original graph_data to be split
+ indices: array of shape (tree_node_size,)
+ The graph_indices of the elements in the ``graph_data`` array that are to
+ be split in the current operation.
+ rng_state: array of int64, shape (3,)
+ The internal state of the rng
+ Returns
+ -------
+ indices_left: array
+ The elements of ``graph_indices`` that fall on the "left" side of the
+ random hyperplane.
+ indices_right: array
+ The elements of ``graph_indices`` that fall on the "right" side of the
+ random hyperplane.
+ """
+ dim = data.shape[1]
+
+ # Select two random points, set the hyperplane between them
+ left_index = tau_rand_int(rng_state) % indices.shape[0]
+ right_index = tau_rand_int(rng_state) % indices.shape[0]
+ right_index += left_index == right_index
+ right_index = right_index % indices.shape[0]
+ left = indices[left_index]
+ right = indices[right_index]
+
+ left_norm = 0.0
+ right_norm = 0.0
+
+ # Compute the normal vector to the hyperplane (the vector between
+ # the two points)
+ hyperplane_vector = np.empty(dim * 2, dtype=np.uint8)
+ positive_hyperplane_component = hyperplane_vector[:dim]
+ negative_hyperplane_component = hyperplane_vector[dim:]
+
+ for d in range(dim):
+ xor_vector = (data[left, d]) ^ (data[right, d])
+ positive_hyperplane_component[d] = xor_vector & (data[left, d])
+ negative_hyperplane_component[d] = xor_vector & (data[right, d])
+
+ hyperplane_norm = 0.0
+
+ for d in range(dim):
+ hyperplane_norm += popcnt[hyperplane_vector[d]]
+ left_norm += popcnt[data[left, d]]
+ right_norm += popcnt[data[right, d]]
+
+ # For each point compute the margin (project into normal vector)
+ # If we are on lower side of the hyperplane put in one pile, otherwise
+ # put it in the other pile (if we hit hyperplane on the nose, flip a coin)
+ n_left = 0
+ n_right = 0
+ side = np.empty(indices.shape[0], np.int8)
+ for i in range(indices.shape[0]):
+ margin = 0.0
+ for d in range(dim):
+ margin += popcnt[positive_hyperplane_component[d] & data[indices[i], d]]
+ margin -= popcnt[negative_hyperplane_component[d] & data[indices[i], d]]
+
+ if abs(margin) < EPS:
+ side[i] = tau_rand_int(rng_state) % 2
+ if side[i] == 0:
+ n_left += 1
+ else:
+ n_right += 1
+ elif margin > 0:
+ side[i] = 0
+ n_left += 1
+ else:
+ side[i] = 1
+ n_right += 1
+
+ # If all points end up on one side, something went wrong numerically
+ # In this case, assign points randomly; they are likely very close anyway
+ if n_left == 0 or n_right == 0:
+ n_left = 0
+ n_right = 0
+ for i in range(indices.shape[0]):
+ side[i] = tau_rand_int(rng_state) % 2
+ if side[i] == 0:
+ n_left += 1
+ else:
+ n_right += 1
+
+ # Now that we have the counts allocate arrays
+ indices_left = np.empty(n_left, dtype=np.int32)
+ indices_right = np.empty(n_right, dtype=np.int32)
+
+ # Populate the arrays with graph_indices according to which side they fell on
+ n_left = 0
+ n_right = 0
+ for i in range(side.shape[0]):
+ if side[i] == 0:
+ indices_left[n_left] = indices[i]
+ n_left += 1
+ else:
+ indices_right[n_right] = indices[i]
+ n_right += 1
+
+ return indices_left, indices_right, hyperplane_vector, 0.0
+
+
@numba.njit(
numba.types.Tuple(
(numba.int32[::1], numba.int32[::1], dense_hyperplane_type, offset_type)
@@ -678,6 +813,73 @@ def make_angular_tree(
return
+@numba.njit(
+ nogil=True,
+ locals={
+ "children": numba.types.ListType(children_type),
+ "left_node_num": numba.types.int32,
+ "right_node_num": numba.types.int32,
+ },
+)
+def make_bit_tree(
+ data,
+ indices,
+ hyperplanes,
+ offsets,
+ children,
+ point_indices,
+ rng_state,
+ leaf_size=30,
+ max_depth=200,
+):
+ if indices.shape[0] > leaf_size and max_depth > 0:
+ (
+ left_indices,
+ right_indices,
+ hyperplane,
+ offset,
+ ) = angular_bitpacked_random_projection_split(data, indices, rng_state)
+
+ make_bit_tree(
+ data,
+ left_indices,
+ hyperplanes,
+ offsets,
+ children,
+ point_indices,
+ rng_state,
+ leaf_size,
+ max_depth - 1,
+ )
+
+ left_node_num = len(point_indices) - 1
+
+ make_bit_tree(
+ data,
+ right_indices,
+ hyperplanes,
+ offsets,
+ children,
+ point_indices,
+ rng_state,
+ leaf_size,
+ max_depth - 1,
+ )
+
+ right_node_num = len(point_indices) - 1
+
+ hyperplanes.append(hyperplane)
+ offsets.append(offset)
+ children.append((np.int32(left_node_num), np.int32(right_node_num)))
+ point_indices.append(np.array([-1], dtype=np.int32))
+ else:
+ hyperplanes.append(np.array([255], dtype=np.uint8))
+ offsets.append(-np.inf)
+ children.append((np.int32(-1), np.int32(-1)))
+ point_indices.append(indices)
+
+ return
+
@numba.njit(
nogil=True,
@@ -824,7 +1026,6 @@ def make_sparse_angular_tree(
@numba.njit(nogil=True)
def make_dense_tree(data, rng_state, leaf_size=30, angular=False, max_depth=200):
indices = np.arange(data.shape[0]).astype(np.int32)
-
hyperplanes = numba.typed.List.empty_list(dense_hyperplane_type)
offsets = numba.typed.List.empty_list(offset_type)
children = numba.typed.List.empty_list(children_type)
@@ -918,6 +1119,38 @@ def make_sparse_tree(
return FlatTree(hyperplanes, offsets, children, point_indices, max_leaf_size)
+@numba.njit(nogil=True)
+def make_dense_bit_tree(data, rng_state, leaf_size=30, angular=False, max_depth=200):
+ indices = np.arange(data.shape[0]).astype(np.int32)
+
+ hyperplanes = numba.typed.List.empty_list(bit_hyperplane_type)
+ offsets = numba.typed.List.empty_list(offset_type)
+ children = numba.typed.List.empty_list(children_type)
+ point_indices = numba.typed.List.empty_list(point_indices_type)
+
+ if angular:
+ make_bit_tree(
+ data,
+ indices,
+ hyperplanes,
+ offsets,
+ children,
+ point_indices,
+ rng_state,
+ leaf_size,
+ max_depth=max_depth,
+ )
+ else:
+ raise NotImplementedError("Euclidean bit trees are not implemented yet.")
+
+ max_leaf_size = leaf_size
+ for points in point_indices:
+ if len(points) > max_leaf_size:
+ max_leaf_size = numba.int32(len(points))
+
+ result = FlatTree(hyperplanes, offsets, children, point_indices, max_leaf_size)
+ return result
+
@numba.njit(
[
"b1(f4[::1],f4,f4[::1],i8[::1])",
@@ -954,6 +1187,43 @@ def select_side(hyperplane, offset, point, rng_state):
return 1
+@numba.njit(
+ [
+ "b1(u1[::1],f4,u1[::1],i8[::1])",
+ numba.types.boolean(
+ numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+ numba.types.float32,
+ numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+ numba.types.Array(numba.types.int64, 1, "C", readonly=False),
+ ),
+ ],
+ fastmath=True,
+ locals={
+ "margin": numba.types.float32,
+ "dim": numba.types.intp,
+ "d": numba.types.uint16,
+ },
+ cache=True,
+)
+def select_side_bit(hyperplane, offset, point, rng_state):
+ margin = offset
+ dim = point.shape[0]
+ for d in range(dim):
+ margin += popcnt[hyperplane[d] & point[d]]
+ margin -= popcnt[hyperplane[dim + d] & point[d]]
+
+ if abs(margin) < EPS:
+ side = np.abs(tau_rand_int(rng_state)) % 2
+ if side == 0:
+ return 0
+ else:
+ return 1
+ elif margin > 0:
+ return 0
+ else:
+ return 1
+
+
@numba.njit(
[
"i4[::1](f4[::1],f4[:,::1],f4[::1],i4[:,::1],i4[::1],i8[::1])",
@@ -981,6 +1251,32 @@ def search_flat_tree(point, hyperplanes, offsets, children, indices, rng_state):
return indices[-children[node, 0] : -children[node, 1]]
+@numba.njit(
+ [
+ "i4[::1](u1[::1],u1[:,::1],f4[::1],i4[:,::1],i4[::1],i8[::1])",
+ numba.types.Array(numba.types.int32, 1, "C", readonly=True)(
+ numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
+ numba.types.Array(numba.types.uint8, 2, "C", readonly=True),
+ numba.types.Array(numba.types.float32, 1, "C", readonly=True),
+ numba.types.Array(numba.types.int32, 2, "C", readonly=True),
+ numba.types.Array(numba.types.int32, 1, "C", readonly=True),
+ numba.types.Array(numba.types.int64, 1, "C", readonly=False),
+ ),
+ ],
+ locals={"node": numba.types.uint32, "side": numba.types.boolean},
+ cache=True,
+)
+def search_flat_bit_tree(point, hyperplanes, offsets, children, indices, rng_state):
+ node = 0
+ while children[node, 0] > 0:
+ side = select_side_bit(hyperplanes[node], offsets[node], point, rng_state)
+ if side == 0:
+ node = children[node, 0]
+ else:
+ node = children[node, 1]
+
+ return indices[-children[node, 0] : -children[node, 1]]
+
@numba.njit(fastmath=True, cache=True)
def sparse_select_side(hyperplane, offset, point_inds, point_data, rng_state):
margin = offset
@@ -1034,6 +1330,7 @@ def make_forest(
random_state,
n_jobs=None,
angular=False,
+ bit_tree=False,
max_depth=200,
):
"""Build a random projection forest with ``n_trees``.
@@ -1076,6 +1373,17 @@ def make_forest(
)
for i in range(n_trees)
)
+ elif bit_tree:
+ result = joblib.Parallel(n_jobs=n_jobs, require="sharedmem")(
+ joblib.delayed(make_dense_bit_tree)(
+ data,
+ rng_states[i],
+ leaf_size,
+ angular,
+ max_depth=max_depth
+ )
+ for i in range(n_trees)
+ )
else:
result = joblib.Parallel(n_jobs=n_jobs, require="sharedmem")(
joblib.delayed(make_dense_tree)(
@@ -1130,7 +1438,7 @@ def rptree_leaf_array(rp_forest):
return np.array([[-1]])
-@numba.njit()
+#@numba.njit()
def recursive_convert(
tree, hyperplanes, offsets, children, indices, node_num, leaf_start, tree_node
):
@@ -1229,8 +1537,11 @@ def convert_tree_format(tree, data_size, data_dim):
is_sparse = False
if tree.hyperplanes[0].ndim == 1:
# dense hyperplanes
- hyperplane_dim = data_dim
- hyperplanes = np.zeros((n_nodes, hyperplane_dim), dtype=np.float32)
+ if tree.hyperplanes[0].dtype == np.uint8:
+ hyperplane_dim = data_dim * 2
+ else:
+ hyperplane_dim = data_dim
+ hyperplanes = np.zeros((n_nodes, hyperplane_dim), dtype=tree.hyperplanes[0].dtype)
else:
# sparse hyperplanes
is_sparse = True
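A note on the layout the bit trees use: a hyperplane is stored as two dim-byte masks concatenated, bits present only in the left anchor point followed by bits present only in the right anchor point, which is why convert_tree_format above doubles hyperplane_dim for uint8 hyperplanes. select_side_bit then popcounts a query's overlap with each half and compares. A small NumPy sketch of that margin computation (the offset term and random tie-break are omitted):

    import numpy as np

    popcnt = np.array([bin(i).count("1") for i in range(256)], dtype=np.float32)

    def select_side_bit_demo(hyperplane, point):
        # hyperplane holds 2*dim bytes: [positive component | negative component]
        dim = point.shape[0]
        pos, neg = hyperplane[:dim], hyperplane[dim:]
        margin = popcnt[pos & point].sum() - popcnt[neg & point].sum()
        return 0 if margin > 0 else 1  # the real code breaks exact ties randomly

    left = np.packbits([1, 1, 0, 0, 1, 0, 1, 0])
    right = np.packbits([0, 1, 1, 0, 0, 0, 1, 1])
    xor = left ^ right
    hyperplane = np.concatenate([xor & left, xor & right])  # as in the split above
    print(select_side_bit_demo(hyperplane, left))   # 0: the left anchor goes left
    print(select_side_bit_demo(hyperplane, right))  # 1: the right anchor goes right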
=====================================
pynndescent/tests/test_distances.py
=====================================
@@ -172,7 +172,9 @@ def test_sparse_spatial_check(sparse_spatial_data, metric, decimal=6):
)
def test_sparse_binary_check(sparse_binary_data, metric):
if metric in spdist.sparse_named_distances:
- dist_matrix = pairwise_distances(np.asarray(sparse_binary_data.todense()), metric=metric)
+ dist_matrix = pairwise_distances(
+ np.asarray(sparse_binary_data.todense()), metric=metric
+ )
if metric in ("jaccard", "dice", "sokalsneath"):
dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
if metric == "russellrao":
@@ -394,3 +396,37 @@ def test_wasserstein_1d(p):
p,
)
assert np.isclose(d1, d2)
+
+
+def test_bit_hamming():
+ test_data = np.random.randint(0, 255, size=(10, 100), dtype=np.uint8)
+ unpacked_data = np.zeros(
+ (test_data.shape[0], test_data.shape[1] * 8), dtype=np.float32
+ )
+ for i in range(unpacked_data.shape[0]):
+ for j in range(unpacked_data.shape[1]):
+ unpacked_data[i, j] = (test_data[i, j // 8] & (1 << (j % 8))) > 0
+
+ all_pairs = pairwise_distances(unpacked_data, metric="hamming")
+ for i in range(test_data.shape[0]):
+ for j in range(i + 1, test_data.shape[0]):
+ d1 = dist.bit_hamming(test_data[i], test_data[j]) / (test_data.shape[1] * 8)
+ d2 = all_pairs[i, j]
+ assert np.isclose(d1, d2)
+
+
+def test_bit_jaccard():
+ test_data = np.random.randint(0, 255, size=(10, 100), dtype=np.uint8)
+ unpacked_data = np.zeros(
+ (test_data.shape[0], test_data.shape[1] * 8), dtype=np.float32
+ )
+ for i in range(unpacked_data.shape[0]):
+ for j in range(unpacked_data.shape[1]):
+ unpacked_data[i, j] = (test_data[i, j // 8] & (1 << (j % 8))) > 0
+
+ all_pairs = pairwise_distances(unpacked_data, metric="jaccard")
+ for i in range(test_data.shape[0]):
+ for j in range(i + 1, test_data.shape[0]):
+ d1 = 1.0 - np.exp(-dist.bit_jaccard(test_data[i], test_data[j]))
+ d2 = all_pairs[i, j]
+ assert np.isclose(d1, d2)
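As these tests encode, bit_jaccard does not return the Jaccard distance directly: it returns -log(|x AND y| / |x OR y|), a monotone transform that preserves neighbor ordering, and the conventional distance is recovered afterwards through the 1 - exp(-d) correction. A standalone check of that correspondence on a single pair (pure NumPy):

    import numpy as np

    popcnt = np.array([bin(i).count("1") for i in range(256)], dtype=np.float64)

    x = np.packbits([1, 1, 0, 1, 0, 0, 1, 0])
    y = np.packbits([1, 0, 0, 1, 1, 0, 1, 1])

    intersection = popcnt[x & y].sum()  # bits set in both vectors
    union = popcnt[x | y].sum()         # bits set in either vector
    bit_jaccard = -np.log(intersection / union)
    jaccard = 1.0 - intersection / union  # standard Jaccard distance

    assert np.isclose(1.0 - np.exp(-bit_jaccard), jaccard)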
=====================================
pynndescent/tests/test_pynndescent_.py
=====================================
@@ -25,7 +25,7 @@ def test_nn_descent_neighbor_accuracy(nn_data, seed):
num_correct = 0.0
for i in range(nn_data.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (nn_data.shape[0] * 10)
assert (
@@ -44,7 +44,7 @@ def test_angular_nn_descent_neighbor_accuracy(nn_data, seed):
num_correct = 0.0
for i in range(nn_data.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (nn_data.shape[0] * 10)
assert (
@@ -52,8 +52,34 @@ def test_angular_nn_descent_neighbor_accuracy(nn_data, seed):
), "NN-descent did not get 99% accuracy on nearest neighbors"
+def test_bitpacked_nn_descent_neighbor_accuracy(nn_data, seed):
+ bitpacked_data = (nn_data * 256).astype(np.uint8)
+ unpacked_data = np.zeros(
+ (bitpacked_data.shape[0], bitpacked_data.shape[1] * 8), dtype=np.float32
+ )
+ for i in range(unpacked_data.shape[0]):
+ for j in range(unpacked_data.shape[1]):
+ unpacked_data[i, j] = (bitpacked_data[i, j // 8] & (1 << (j % 8))) > 0
+
+ knn_indices, _ = NNDescent(
+ bitpacked_data, "bit_jaccard", {}, 10, random_state=np.random.RandomState(seed)
+ )._neighbor_graph
+
+ nn_finder = NearestNeighbors(n_neighbors=10, metric="jaccard").fit(unpacked_data)
+ true_indices = nn_finder.kneighbors(unpacked_data, 10, return_distance=False)
+
+ num_correct = 0.0
+ for i in range(nn_data.shape[0]):
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
+
+ percent_correct = num_correct / (nn_data.shape[0] * 10)
+ assert (
+ percent_correct >= 0.60
+ ), "NN-descent did not get 60% accuracy on nearest neighbors"
+
+
@pytest.mark.skipif(
- list(map(int, scipy.version.version.split("."))) < [1, 3, 0],
+ list(map(int, re.findall(r"[0-9]+\.[0-9]+\.?[0-9]*", scipy.version.version)[0].split("."))) < [1, 3, 0],
reason="requires scipy >= 1.3.0",
)
def test_sparse_nn_descent_neighbor_accuracy(sparse_nn_data, seed):
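The rewritten skipif guard above exists because scipy development and pre-release builds carry version strings whose components are not plain integers, which made the old split-and-int parsing raise. A quick illustration with a hypothetical version string:

    import re

    version = "1.15.0rc1"  # hypothetical pre-release version string
    # The old parsing fails here: int("0rc1") raises ValueError
    #   list(map(int, version.split(".")))
    numeric = re.findall(r"[0-9]+\.[0-9]+\.?[0-9]*", version)[0]
    print(list(map(int, numeric.split("."))))  # [1, 15, 0]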
@@ -66,7 +92,7 @@ def test_sparse_nn_descent_neighbor_accuracy(sparse_nn_data, seed):
num_correct = 0.0
for i in range(sparse_nn_data.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (sparse_nn_data.shape[0] * 10)
assert (
@@ -89,7 +115,7 @@ def test_sparse_angular_nn_descent_neighbor_accuracy(sparse_nn_data):
num_correct = 0.0
for i in range(sparse_nn_data.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (sparse_nn_data.shape[0] * 10)
assert (
@@ -106,7 +132,7 @@ def test_nn_descent_query_accuracy(nn_data):
num_correct = 0.0
for i in range(true_indices.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (true_indices.shape[0] * 10)
assert (
@@ -123,7 +149,7 @@ def test_nn_descent_query_accuracy_angular(nn_data):
num_correct = 0.0
for i in range(true_indices.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (true_indices.shape[0] * 10)
assert (
@@ -142,7 +168,7 @@ def test_sparse_nn_descent_query_accuracy(sparse_nn_data):
num_correct = 0.0
for i in range(true_indices.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (true_indices.shape[0] * 10)
assert (
@@ -161,7 +187,7 @@ def test_sparse_nn_descent_query_accuracy_angular(sparse_nn_data):
num_correct = 0.0
for i in range(true_indices.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (true_indices.shape[0] * 10)
assert (
@@ -169,6 +195,35 @@ def test_sparse_nn_descent_query_accuracy_angular(sparse_nn_data):
), "Sparse NN-descent query did not get 95% accuracy on nearest neighbors"
+def test_bitpacked_nn_descent_query_accuracy(nn_data):
+ bitpacked_data = (nn_data * 256).astype(np.uint8)
+ unpacked_data = np.zeros(
+ (bitpacked_data.shape[0], bitpacked_data.shape[1] * 8), dtype=np.float32
+ )
+ for i in range(unpacked_data.shape[0]):
+ for j in range(unpacked_data.shape[1]):
+ unpacked_data[i, j] = (bitpacked_data[i, j // 8] & (1 << (j % 8))) > 0
+
+ nnd = NNDescent(
+ bitpacked_data[200:], "bit_jaccard", n_neighbors=50, random_state=None
+ )
+ knn_indices, _ = nnd.query(bitpacked_data[:200], k=10, epsilon=0.36)
+
+ nn = NearestNeighbors(metric="jaccard").fit(unpacked_data[200:])
+ true_indices = nn.kneighbors(
+ unpacked_data[:200], n_neighbors=10, return_distance=False
+ )
+
+ num_correct = 0.0
+ for i in range(true_indices.shape[0]):
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
+
+ percent_correct = num_correct / (true_indices.shape[0] * 10)
+ assert (
+ percent_correct >= 0.80
+ ), "Sparse NN-descent query did not get 95% accuracy on nearest neighbors"
+
+
def test_transformer_equivalence(nn_data):
N_NEIGHBORS = 15
EPSILON = 0.15
@@ -206,7 +261,7 @@ def test_random_state_none(nn_data, spatial_data):
num_correct = 0.0
for i in range(nn_data.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (spatial_data.shape[0] * 10)
assert (
@@ -279,7 +334,7 @@ def test_deduplicated_data_behaves_normally(seed, cosine_hang_data):
num_correct = 0
for i in range(data.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
proportion_correct = num_correct / (data.shape[0] * n_neighbors)
assert (
@@ -287,7 +342,9 @@ def test_deduplicated_data_behaves_normally(seed, cosine_hang_data):
), "NN-descent did not get 95% accuracy on nearest neighbors"
-def test_rp_trees_should_not_stack_overflow_with_near_duplicate_data(seed, cosine_near_duplicates_data):
+def test_rp_trees_should_not_stack_overflow_with_near_duplicate_data(
+ seed, cosine_near_duplicates_data
+):
n_neighbors = 10
knn_indices, _ = NNDescent(
@@ -467,7 +524,7 @@ def test_update_no_prepare_query_accuracy(nn_data, metric):
num_correct = 0.0
for i in range(true_indices.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (true_indices.shape[0] * 10)
assert percent_correct >= 0.95, (
@@ -496,7 +553,7 @@ def test_update_w_prepare_query_accuracy(nn_data, metric):
num_correct = 0.0
for i in range(true_indices.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (true_indices.shape[0] * 10)
assert percent_correct >= 0.95, (
@@ -525,7 +582,7 @@ def test_update_w_prepare_query_accuracy(nn_data, metric):
num_correct = 0.0
for i in range(true_indices.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (true_indices.shape[0] * 10)
assert percent_correct >= 0.95, (
@@ -537,7 +594,7 @@ def evaluate_predictions(neighbors_true, neigbhors_computed, n_neighbors):
n_correct = 0
n_all = neighbors_true.shape[0] * n_neighbors
for i in range(neighbors_true.shape[0]):
- n_correct += np.sum(np.in1d(neighbors_true[i], neigbhors_computed[i]))
+ n_correct += np.sum(np.isin(neighbors_true[i], neigbhors_computed[i]))
return n_correct / n_all
@@ -612,7 +669,7 @@ def test_tree_init_false(nn_data, metric):
num_correct = 0.0
for i in range(true_indices.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (true_indices.shape[0] * 10)
assert percent_correct >= 0.95, (
@@ -640,7 +697,7 @@ def test_one_dimensional_data(nn_data, metric):
num_correct = 0.0
for i in range(true_indices.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (true_indices.shape[0] * 10)
assert percent_correct >= 0.95, (
@@ -673,7 +730,7 @@ def test_tree_no_split(small_data, sparse_small_data, metric):
num_correct = 0.0
for i in range(true_indices.shape[0]):
- num_correct += np.sum(np.in1d(true_indices[i], knn_indices[i]))
+ num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))
percent_correct = num_correct / (true_indices.shape[0] * k)
assert (
@@ -682,7 +739,12 @@ def test_tree_no_split(small_data, sparse_small_data, metric):
data_type
)
-@pytest.mark.skipif('NUMBA_DISABLE_JIT' in os.environ, reason="Too expensive for disabled Numba")
+
+@pytest.mark.skipif(
+ "NUMBA_DISABLE_JIT" in os.environ, reason="Too expensive for disabled Numba"
+)
def test_bad_data():
- data = np.sqrt(np.load("pynndescent/tests/test_data/pynndescent_bug_np.npz")['arr_0'])
+ data = np.sqrt(
+ np.load("pynndescent/tests/test_data/pynndescent_bug_np.npz")["arr_0"]
+ )
index = NNDescent(data, metric="cosine")
=====================================
pynndescent/utils.py
=====================================
@@ -192,7 +192,7 @@ def make_heap(n_points, size):
heap: An ndarray suitable for passing to other numba enabled heap functions.
"""
indices = np.full((int(n_points), int(size)), -1, dtype=np.int32)
- distances = np.full((int(n_points), int(size)), np.infty, dtype=np.float32)
+ distances = np.full((int(n_points), int(size)), np.inf, dtype=np.float32)
flags = np.zeros((int(n_points), int(size)), dtype=np.uint8)
result = (indices, distances, flags)
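Background on this one-line change: np.infty was a long-standing alias for np.inf that NumPy 2.0 removed, so the old spelling raises AttributeError on current NumPy while the value is unchanged:

    import numpy as np

    # np.inf works on every NumPy version; np.infty is gone as of NumPy 2.0
    distances = np.full((3, 4), np.inf, dtype=np.float32)
    print(distances[0, 0])  # inf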
=====================================
setup.py
=====================================
@@ -8,7 +8,7 @@ def readme():
configuration = {
"name": "pynndescent",
- "version": "0.5.11",
+ "version": "0.5.13",
"description": "Nearest Neighbor Descent",
"long_description": readme(),
"classifiers": [
@@ -23,9 +23,10 @@ configuration = {
"Operating System :: POSIX",
"Operating System :: Unix",
"Operating System :: MacOS",
- "Programming Language :: Python :: 3.6",
- "Programming Language :: Python :: 3.7",
- "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
],
"keywords": "nearest neighbor, knn, ANN",
"url": "http://github.com/lmcinnes/pynndescent",
View it on GitLab: https://salsa.debian.org/python-team/packages/python-pynndescent/-/compare/1f3919a49f59e210154317c180b27ed67e7eac1d...7280b697553424a970b2e2c3fde159dfa56e741f