[med-svn] [Git][med-team/hnswlib][upstream] New upstream version 0.6.0

Andreas Tille (@tille) gitlab at salsa.debian.org
Sun Jan 16 19:18:04 GMT 2022



Andreas Tille pushed to branch upstream at Debian Med / hnswlib


Commits:
c6215c32 by Andreas Tille at 2022-01-16T18:54:36+01:00
New upstream version 0.6.0
- - - - -


21 changed files:

- + .github/workflows/build.yml
- .gitignore
- − .travis.yml
- ALGO_PARAMS.md
- CMakeLists.txt
- README.md
- + TESTING_RECALL.md
- + examples/git_tester.py
- + examples/speedtest.py
- hnswlib/hnswalg.h
- hnswlib/hnswlib.h
- hnswlib/space_ip.h
- hnswlib/space_l2.h
- hnswlib/visited_list_pool.h
- + python_bindings/__init__.py
- python_bindings/bindings.cpp
- python_bindings/tests/bindings_test_labels.py
- python_bindings/tests/bindings_test_pickle.py
- + python_bindings/tests/bindings_test_recall.py
- + python_bindings/tests/bindings_test_spaces.py
- setup.py


Changes:

=====================================
.github/workflows/build.yml
=====================================
@@ -0,0 +1,22 @@
+name: HNSW CI
+
+on: [push, pull_request]
+
+jobs:
+  test:
+    runs-on: ${{matrix.os}}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+        python-version: ['3.6', '3.7', '3.8', '3.9']
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          
+      - name: Build and install
+        run: python -m pip install .
+      
+      - name: Test
+        run: python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"


=====================================
.gitignore
=====================================
@@ -6,3 +6,6 @@ python_bindings/tests/__pycache__/
 *.pyd
 hnswlib.cpython*.so
 var/
+.idea/
+.vscode/
+


=====================================
.travis.yml deleted
=====================================
@@ -1,63 +0,0 @@
-language: python
-
-jobs:
-  include:
-    - name: Linux Python 3.6
-      os: linux
-      python: 3.6
-    
-    - name: Linux Python 3.7
-      os: linux
-      python: 3.7
-
-    - name: Linux Python 3.8
-      os: linux
-      python: 3.8
-
-    - name: Linux Python 3.9
-      os: linux
-      python: 3.9
-
-    - name: Windows Python 3.6
-      os: windows
-      language: shell    # 'language: python' is an error on Travis CI Windows
-      before_install:
-        - choco install python --version 3.6.0
-        - python -m pip install --upgrade pip
-        - python --version
-      env: PATH=/c/Python36:/c/Python36/Scripts:$PATH
-    
-    - name: Windows Python 3.7
-      os: windows
-      language: shell    # 'language: python' is an error on Travis CI Windows
-      before_install:
-        - choco install python --version 3.7.0
-        - python -m pip install --upgrade pip
-        - python --version
-      env: PATH=/c/Python37:/c/Python37/Scripts:$PATH
-
-    - name: Windows Python 3.8
-      os: windows
-      language: shell    # 'language: python' is an error on Travis CI Windows
-      before_install:
-        - choco install python --version 3.8.0
-        - python -m pip install --upgrade pip
-        - python --version
-      env: PATH=/c/Python38:/c/Python38/Scripts:$PATH
-
-    - name: Windows Python 3.9
-      os: windows
-      language: shell    # 'language: python' is an error on Travis CI Windows
-      before_install:
-        - choco install python --version 3.9.0
-        - python -m pip install --upgrade pip
-        - python --version
-      env: PATH=/c/Python39:/c/Python39/Scripts:$PATH
-
-install:
-  - |
-    python -m pip install .
-
-script:
-  - |
-    python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"


=====================================
ALGO_PARAMS.md
=====================================
@@ -9,6 +9,8 @@ The ```knn_query``` function returns two numpy arrays, containing labels and dis
 elements for the queries. Note that in case the algorithm is not able to find ```k``` neighbors for all of the queries
 (this can be due to problems with the graph or ```k``` > size of the dataset), an exception is thrown.
 
+An example of tuning the parameters can be found in [TESTING_RECALL.md](TESTING_RECALL.md).
+
 ## Construction parameters:
 * ```M``` - the number of bi-directional links created for every new element during construction. Reasonable range for ```M``` 
 is 2-100. Higher ```M``` works better on datasets with high intrinsic dimensionality and/or high recall, while low ```M``` works 
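
As a quick, illustrative sketch (not part of the upstream diff) of where the parameters documented here enter the Python API; the dataset size and values are placeholders:

```python
import hnswlib
import numpy as np

dim = 16
data = np.float32(np.random.random((10000, dim)))

p = hnswlib.Index(space='l2', dim=dim)
# Construction parameters: M and ef_construction are fixed when the index is initialized.
p.init_index(max_elements=len(data), M=16, ef_construction=200)
p.add_items(data)

# Query-time parameter: ef trades accuracy for speed and can be changed between queries.
p.set_ef(50)
labels, distances = p.knn_query(data[:5], k=3)  # throws if k neighbors cannot be found
```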


=====================================
CMakeLists.txt
=====================================
@@ -1,28 +1,27 @@
 cmake_minimum_required (VERSION 2.6)
-project (hnsw_lib)
+project(hnsw_lib
+    LANGUAGES CXX)
 
-include_directories("${PROJECT_BINARY_DIR}")
+add_library(hnswlib INTERFACE)
+target_include_directories(hnswlib INTERFACE .) 
 
+if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
+    set(CMAKE_CXX_STANDARD 11)
 
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+      SET( CMAKE_CXX_FLAGS  "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize")
+    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+      SET( CMAKE_CXX_FLAGS  "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
+    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+      SET( CMAKE_CXX_FLAGS  "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
+    endif()
 
-set(SOURCE_EXE main.cpp)           
+    add_executable(test_updates examples/updates_test.cpp)
+    target_link_libraries(test_updates hnswlib)
 
-set(SOURCE_LIB sift_1b.cpp)
+    add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp)
+    target_link_libraries(searchKnnCloserFirst_test hnswlib)
 
-add_library(sift_test STATIC ${SOURCE_LIB})
-
-
-add_executable(main ${SOURCE_EXE})
-if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-  SET( CMAKE_CXX_FLAGS  "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize")
-elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-SET( CMAKE_CXX_FLAGS  "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
-elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-  SET( CMAKE_CXX_FLAGS  "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
+    add_executable(main main.cpp sift_1b.cpp)
+    target_link_libraries(main hnswlib)
 endif()
-
-add_executable(test_updates examples/updates_test.cpp)
-
-add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp)
-
-target_link_libraries(main sift_test) 


=====================================
README.md
=====================================
@@ -3,21 +3,20 @@ Header-only C++ HNSW implementation with python bindings.
 
 **NEWS:**
 
-* **Hnswlib is now 0.5.2**. Bugfixes - thanks [@marekhanus](https://github.com/marekhanus) for fixing the missing arguments, adding support for python 3.8, 3.9 in Travis, improving python wrapper and fixing typos/code style; [@apoorv-sharma](https://github.com/apoorv-sharma) for fixing the bug int the insertion/deletion logic; [@shengjun1985](https://github.com/shengjun1985) for simplifying the memory reallocation logic; [@TakaakiFuruse](https://github.com/TakaakiFuruse) for improved description of `add_items`; [@psobot ](https://github.com/psobot) for improving error handling; [@ShuAiii](https://github.com/ShuAiii) for reporting the bug in the python interface
+**version 0.6** 
+* Thanks to [@dyashuni](https://github.com/dyashuni), hnswlib now uses GitHub Actions for CI, and there is a search speedup in some scenarios with deletions. `unmark_deleted(label)` is now also part of the python interface (note that it now throws an exception on double deletions). 
+* Thanks to [@slice4e](https://github.com/slice4e), we now support AVX512; thanks to [@LTLA](https://github.com/LTLA), the CMake interface for the library has been updated. 
+* Thanks to [@alonre24](https://github.com/alonre24), we now have python bindings for brute-force search (and examples for recall tuning: [TESTING_RECALL.md](TESTING_RECALL.md)). 
+* Thanks to [@dorosy-yeong](https://github.com/dorosy-yeong), a bug in the handling of large quantities of deleted elements and large K has been fixed. 
 
-* **Hnswlib is now 0.5.0**. Added support for pickling indices, support for PEP-517 and PEP-518 building, small speedups, bug and documentation fixes. Many thanks to [@dbespalov](https://github.com/dbespalov), [@dyashuni](https://github.com/dyashuni), [@groodt](https://github.com/groodt),[@uestc-lfs](https://github.com/uestc-lfs), [@vinnitu](https://github.com/vinnitu), [@fabiencastan](https://github.com/fabiencastan), [@JinHai-CN](https://github.com/JinHai-CN), [@js1010](https://github.com/js1010)!
-
-* **Thanks to Apoorv Sharma [@apoorv-sharma](https://github.com/apoorv-sharma), hnswlib now supports true element updates (the interface remained the same, but when you the performance/memory should not degrade as you update the element embeddings).**
-
-* **Thanks to Dmitry [@2ooom](https://github.com/2ooom), hnswlib got a boost in performance for vector dimensions that are not multiple of 4** 
+  
 
-* **Thanks to Louis Abraham ([@louisabraham](https://github.com/louisabraham)) hnswlib can now be installed via pip!**
 
-Highlights:
-1) Lightweight, header-only, no dependencies other than C++ 11.
-2) Interfaces for C++, python and R (https://github.com/jlmelville/rcpphnsw).
+### Highlights:
+1) Lightweight, header-only, no dependencies other than C++ 11
+2) Interfaces for C++, Java, Python and R (https://github.com/jlmelville/rcpphnsw).
 3) Has full support for incremental index construction. Has support for element deletions 
-(currently, without actual freeing of the memory).
+(by marking them in the index). The index is picklable.
 4) Can work with custom user defined distances (C++).
 5) Significantly less memory footprint and faster build time compared to current nmslib's implementation.
 
@@ -53,7 +52,9 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib.
      - If the index already has elements with the same labels, their features will be updated. Note that the update procedure is slower than insertion of a new element, but more memory- and query-efficient.
    * Thread-safe with other `add_items` calls, but not with `knn_query`.
     
-* `mark_deleted(label)`  - marks the element as deleted, so it will be omitted from search results.
+* `mark_deleted(label)`  - marks the element as deleted, so it will be omitted from search results. Throws an exception if it is already deleted.
+
+* `unmark_deleted(label)`  - unmarks the element as deleted, so it will not be omitted from search results.
 
 * `resize_index(new_size)` - changes the maximum capacity of the index. Not thread safe with `add_items` and `knn_query`.
 
@@ -225,6 +226,15 @@ pip install .
 or you can install via pip:
 `pip install hnswlib`
 
+
+### For developers 
+
+When making changes, please run the tests (and add a test to `python_bindings/tests` if there is new functionality):
+```bash
+python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
+```
+
+
 ### Other implementations
 * Non-metric space library (nmslib) - main library (python, C++), supports exotic distances: https://github.com/nmslib/nmslib
 * Faiss library by facebook, uses own HNSW  implementation for coarse quantization (python, C++):
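
For illustration (not part of the upstream diff), a minimal sketch of the `mark_deleted` / `unmark_deleted` behavior described in the README changes above; index size and labels are arbitrary:

```python
import hnswlib
import numpy as np

dim = 8
data = np.float32(np.random.random((100, dim)))

p = hnswlib.Index(space='l2', dim=dim)
p.init_index(max_elements=100, M=16, ef_construction=100)
p.add_items(data, np.arange(100))

p.mark_deleted(0)      # label 0 is now omitted from search results
# p.mark_deleted(0)    # repeating this would throw: double deletions now raise an exception
p.unmark_deleted(0)    # label 0 is returned by queries again
```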


=====================================
TESTING_RECALL.md
=====================================
@@ -0,0 +1,91 @@
+# Testing recall
+
+Selecting HNSW parameters for a specific use case highly impacts the search quality. One way to test the quality of the constructed index is to compare the HNSW search results to the actual results (i.e., the actual `k` nearest neighbors).
+For that purpose, the API enables creating a simple "brute-force" index in which vectors are stored as is, and searching for the `k` nearest neighbors to a query vector requires going over the entire index.  
+Comparing HNSW and brute-force results may help to find the desired HNSW parameters for achieving a satisfying recall, given the index size and data dimension.
+
+### Brute force index API
+`hnswlib.BFIndex(space, dim)` creates a non-initialized index in space `space` with integer dimension `dim`.
+
+`hnswlib.BFIndex` methods:
+
+`init_index(max_elements)` initializes the index with no elements.
+
+`max_elements` defines the maximum number of elements that can be stored in the structure.
+
+`add_items(data, ids)` inserts the data (numpy array of vectors, shape:`N*dim`) into the structure.
+`ids` is an optional N-sized numpy array of integer labels for all elements in `data`.
+
+`delete_vector(label)` deletes the element associated with the given `label` so it will be omitted from search results.
+
+`knn_query(data, k = 1)` makes a batch query for the `k` closest elements for each element of
+`data` (shape:`N*dim`). Returns two numpy arrays (labels and distances), each of shape `N*k`.
+
+`load_index(path_to_index, max_elements = 0)` loads the index from persistence to the uninitialized index.
+
+`save_index(path_to_index)` saves the index to persistence.
+
+### Measuring recall example
+
+```
+import hnswlib
+import numpy as np
+
+dim = 32
+num_elements = 100000
+k = 10
+nun_queries = 10
+
+# Generating sample data
+data = np.float32(np.random.random((num_elements, dim)))
+
+# Declaring index
+hnsw_index = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
+bf_index = hnswlib.BFIndex(space='l2', dim=dim)
+
+# Initing both hnsw and brute force indices
+# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
+# during insertion of an element.
+# The capacity can be increased by saving/loading the index, see below.
+#
+# hnsw construction params:
+# ef_construction - controls index search speed/build speed tradeoff
+#
+# M - is tightly connected with internal dimensionality of the data. Strongly affects the memory consumption (~M)
+# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction
+
+hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16)
+bf_index.init_index(max_elements=num_elements)
+
+# Controlling the recall for hnsw by setting ef:
+# higher ef leads to better accuracy, but slower search
+hnsw_index.set_ef(200)
+
+# Set number of threads used during batch search/construction in hnsw
+# By default using all available cores
+hnsw_index.set_num_threads(1)
+
+print("Adding batch of %d elements" % (len(data)))
+hnsw_index.add_items(data)
+bf_index.add_items(data)
+
+print("Indices built")
+
+# Generating query data
+query_data = np.float32(np.random.random((nun_queries, dim)))
+
+# Query the elements and measure recall:
+labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k)
+labels_bf, distances_bf = bf_index.knn_query(query_data, k)
+
+# Measure recall
+correct = 0
+for i in range(nun_queries):
+    for label in labels_hnsw[i]:
+        for correct_label in labels_bf[i]:
+            if label == correct_label:
+                correct += 1
+                break
+
+print("recall is :", float(correct)/(k*nun_queries))
+```


=====================================
examples/git_tester.py
=====================================
@@ -0,0 +1,16 @@
+from pydriller import Repository
+import os 
+import datetime
+os.system("cp examples/speedtest.py examples/speedtest2.py")
+for commit in Repository('.', from_tag="v0.5.2").traverse_commits():
+    print(commit.hash)
+    print(commit.msg)
+    
+    os.system(f"git checkout {commit.hash}; rm -rf build; ")
+    os.system("python -m pip install .")
+    os.system(f'python examples/speedtest2.py -n "{commit.msg}" -d 4 -t 1')
+    os.system(f'python examples/speedtest2.py -n "{commit.msg}" -d 64 -t 1')
+    os.system(f'python examples/speedtest2.py -n "{commit.msg}" -d 128 -t 1')
+    os.system(f'python examples/speedtest2.py -n "{commit.msg}" -d 4 -t 24')
+    os.system(f'python examples/speedtest2.py -n "{commit.msg}" -d 128 -t 24')
+


=====================================
examples/speedtest.py
=====================================
@@ -0,0 +1,62 @@
+import hnswlib
+import numpy as np
+import os.path
+import time
+import argparse
+
+# Parse the benchmark parameters: vector dimensionality (-d), run label (-n), and thread count (-t).
+ap = argparse.ArgumentParser()
+ap.add_argument('-d')
+ap.add_argument('-n')
+ap.add_argument('-t')
+args = ap.parse_args()
+dim = int(args.d)
+name = args.n
+threads=int(args.t)
+num_elements = 1000000 * 4//dim
+
+# Generating sample data
+np.random.seed(1)
+data = np.float32(np.random.random((num_elements, dim)))
+
+
+index_path=f'speed_index{dim}.bin'
+# Declaring index
+p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
+
+if not os.path.isfile(index_path) :
+
+    p.init_index(max_elements=num_elements, ef_construction=100, M=16)
+
+    # Controlling the recall by setting ef:
+    # higher ef leads to better accuracy, but slower search
+    p.set_ef(10)
+
+    # Set number of threads used during batch search/construction
+    # By default using all available cores
+    p.set_num_threads(12)
+
+    p.add_items(data)
+
+    # Serializing and deleting the index:
+
+    print("Saving index to '%s'" % index_path)
+    p.save_index(index_path)
+p.set_num_threads(threads)
+times=[]
+time.sleep(10)
+p.set_ef(100)
+for _ in range(3):
+    p.load_index(index_path)
+    for _ in range(10):
+        t0=time.time()
+        labels, distances = p.knn_query(data, k=1)
+        tt=time.time()-t0
+        times.append(tt)
+        print(f"{tt} seconds")    
+str_out=f"mean time:{np.mean(times)}, median time:{np.median(times)}, std time {np.std(times)} {name}"
+print(str_out)
+with open (f"log_{dim}_t{threads}.txt","a") as f:
+    f.write(str_out+"\n")
+    f.flush()
+
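
For reference, `examples/git_tester.py` above drives this script once per commit; run standalone it takes the same flags, e.g. `python examples/speedtest.py -d 64 -n "baseline" -t 1` (`-d` vector dimensionality, `-n` a label written into the log file, `-t` number of query threads), assuming hnswlib is installed.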


=====================================
hnswlib/hnswalg.h
=====================================
@@ -18,7 +18,6 @@ namespace hnswlib {
     public:
         static const tableint max_update_element_locks = 65536;
         HierarchicalNSW(SpaceInterface<dist_t> *s) {
-
         }
 
         HierarchicalNSW(SpaceInterface<dist_t> *s, const std::string &location, bool nmslib = false, size_t max_elements=0) {
@@ -29,7 +28,7 @@ namespace hnswlib {
                 link_list_locks_(max_elements), link_list_update_locks_(max_update_element_locks), element_levels_(max_elements) {
             max_elements_ = max_elements;
 
-            has_deletions_=false;
+            num_deleted_ = 0;
             data_size_ = s->get_data_size();
             fstdistfunc_ = s->get_dist_func();
             dist_func_param_ = s->get_dist_func_param();
@@ -56,8 +55,6 @@ namespace hnswlib {
 
             visited_list_pool_ = new VisitedListPool(1, max_elements);
 
-
-
             //initializations for special treatment of the first node
             enterpoint_node_ = -1;
             maxlevel_ = -1;
@@ -92,6 +89,7 @@ namespace hnswlib {
         size_t cur_element_count;
         size_t size_data_per_element_;
         size_t size_links_per_element_;
+        size_t num_deleted_;
 
         size_t M_;
         size_t maxM_;
@@ -112,20 +110,15 @@ namespace hnswlib {
         std::vector<std::mutex> link_list_update_locks_;
         tableint enterpoint_node_;
 
-
         size_t size_links_level0_;
         size_t offsetData_, offsetLevel0_;
 
-
         char *data_level0_memory_;
         char **linkLists_;
         std::vector<int> element_levels_;
 
         size_t data_size_;
 
-        bool has_deletions_;
-
-
         size_t label_offset_;
         DISTFUNC<dist_t> fstdistfunc_;
         void *dist_func_param_;
@@ -182,7 +175,7 @@ namespace hnswlib {
 
             while (!candidateSet.empty()) {
                 std::pair<dist_t, tableint> curr_el_pair = candidateSet.top();
-                if ((-curr_el_pair.first) > lowerBound) {
+                if ((-curr_el_pair.first) > lowerBound && top_candidates.size() == ef_construction_) {
                     break;
                 }
                 candidateSet.pop();
@@ -271,7 +264,7 @@ namespace hnswlib {
 
                 std::pair<dist_t, tableint> current_node_pair = candidate_set.top();
 
-                if ((-current_node_pair.first) > lowerBound) {
+                if ((-current_node_pair.first) > lowerBound && (top_candidates.size() == ef || has_deletions == false)) {
                     break;
                 }
                 candidate_set.pop();
@@ -547,7 +540,7 @@ namespace hnswlib {
                 }
             }
 
-            if (has_deletions_) {
+            if (num_deleted_) {
                 std::priority_queue<std::pair<dist_t, tableint  >> top_candidates1=searchBaseLayerST<true>(currObj, query_data,
                                                                                                            ef_);
                 top_candidates.swap(top_candidates1);
@@ -623,8 +616,6 @@ namespace hnswlib {
         }
 
         void loadIndex(const std::string &location, SpaceInterface<dist_t> *s, size_t max_elements_i=0) {
-
-
             std::ifstream input(location, std::ios::binary);
 
             if (!input.is_open())
@@ -639,7 +630,7 @@ namespace hnswlib {
             readBinaryPOD(input, max_elements_);
             readBinaryPOD(input, cur_element_count);
 
-            size_t max_elements=max_elements_i;
+            size_t max_elements = max_elements_i;
             if(max_elements < cur_element_count)
                 max_elements = max_elements_;
             max_elements_ = max_elements;
@@ -688,26 +679,19 @@ namespace hnswlib {
 
             input.seekg(pos,input.beg);
 
-
             data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_);
             if (data_level0_memory_ == nullptr)
                 throw std::runtime_error("Not enough memory: loadIndex failed to allocate level0");
             input.read(data_level0_memory_, cur_element_count * size_data_per_element_);
 
-
-
-
             size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);
 
-
             size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint);
             std::vector<std::mutex>(max_elements).swap(link_list_locks_);
             std::vector<std::mutex>(max_update_element_locks).swap(link_list_update_locks_);
 
-
             visited_list_pool_ = new VisitedListPool(1, max_elements);
 
-
             linkLists_ = (char **) malloc(sizeof(void *) * max_elements);
             if (linkLists_ == nullptr)
                 throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists");
@@ -731,11 +715,9 @@ namespace hnswlib {
                 }
             }
 
-            has_deletions_=false;
-
             for (size_t i = 0; i < cur_element_count; i++) {
                 if(isMarkedDeleted(i))
-                    has_deletions_=true;
+                    num_deleted_ += 1;
             }
 
             input.close();
@@ -744,7 +726,7 @@ namespace hnswlib {
         }
 
         template<typename data_t>
-        std::vector<data_t> getDataByLabel(labeltype label)
+        std::vector<data_t> getDataByLabel(labeltype label) const
         {
             tableint label_c;
             auto search = label_lookup_.find(label);
@@ -765,19 +747,19 @@ namespace hnswlib {
         }
 
         static const unsigned char DELETE_MARK = 0x01;
-//        static const unsigned char REUSE_MARK = 0x10;
+        // static const unsigned char REUSE_MARK = 0x10;
         /**
          * Marks an element with the given label deleted, does NOT really change the current graph.
          * @param label
          */
         void markDelete(labeltype label)
         {
-            has_deletions_=true;
             auto search = label_lookup_.find(label);
             if (search == label_lookup_.end()) {
                 throw std::runtime_error("Label not found");
             }
-            markDeletedInternal(search->second);
+            tableint internalId = search->second;
+            markDeletedInternal(internalId);
         }
 
         /**
@@ -786,8 +768,31 @@ namespace hnswlib {
          * @param internalId
          */
         void markDeletedInternal(tableint internalId) {
-            unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId))+2;
-            *ll_cur |= DELETE_MARK;
+            assert(internalId < cur_element_count);
+            if (!isMarkedDeleted(internalId))
+            {
+                unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId))+2;
+                *ll_cur |= DELETE_MARK;
+                num_deleted_ += 1;
+            }
+            else
+            {
+                throw std::runtime_error("The requested to delete element is already deleted");
+            }
+        }
+
+        /**
+         * Remove the deleted mark of the node, does NOT really change the current graph.
+         * @param label
+         */
+        void unmarkDelete(labeltype label)
+        {
+            auto search = label_lookup_.find(label);
+            if (search == label_lookup_.end()) {
+                throw std::runtime_error("Label not found");
+            }
+            tableint internalId = search->second;
+            unmarkDeletedInternal(internalId);
         }
 
         /**
@@ -795,8 +800,17 @@ namespace hnswlib {
          * @param internalId
          */
         void unmarkDeletedInternal(tableint internalId) {
-            unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId))+2;
-            *ll_cur &= ~DELETE_MARK;
+            assert(internalId < cur_element_count);
+            if (isMarkedDeleted(internalId))
+            {
+                unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId))+2;
+                *ll_cur &= ~DELETE_MARK;
+                num_deleted_ -= 1;
+            }
+            else
+            {
+                throw std::runtime_error("The requested to undelete element is not deleted");
+            }
         }
 
         /**
@@ -857,8 +871,8 @@ namespace hnswlib {
                 }
 
                 for (auto&& neigh : sNeigh) {
-//                    if (neigh == internalId)
-//                        continue;
+                    // if (neigh == internalId)
+                    //     continue;
 
                     std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> candidates;
                     size_t size = sCand.find(neigh) == sCand.end() ? sCand.size() : sCand.size() - 1; // sCand guaranteed to have size >= 1
@@ -1133,7 +1147,7 @@ namespace hnswlib {
             }
 
             std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates;
-            if (has_deletions_) {
+            if (num_deleted_) {
                 top_candidates=searchBaseLayerST<true,true>(
                         currObj, query_data, std::max(ef_, k));
             }


=====================================
hnswlib/hnswlib.h
=====================================
@@ -4,6 +4,9 @@
 #define USE_SSE
 #ifdef __AVX__
 #define USE_AVX
+#ifdef __AVX512F__
+#define USE_AVX512
+#endif
 #endif
 #endif
 #endif
@@ -16,10 +19,16 @@
 #include <x86intrin.h>
 #endif
 
+#if defined(USE_AVX512)
+#include <immintrin.h>
+#endif
+
 #if defined(__GNUC__)
 #define PORTABLE_ALIGN32 __attribute__((aligned(32)))
+#define PORTABLE_ALIGN64 __attribute__((aligned(64)))
 #else
 #define PORTABLE_ALIGN32 __declspec(align(32))
+#define PORTABLE_ALIGN64 __declspec(align(64))
 #endif
 #endif
 


=====================================
hnswlib/space_ip.h
=====================================
@@ -124,7 +124,40 @@ namespace hnswlib {
 
 #endif
 
-#if defined(USE_AVX)
+
+#if defined(USE_AVX512)
+
+    static float
+    InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        float PORTABLE_ALIGN64 TmpRes[16];
+        float *pVect1 = (float *) pVect1v;
+        float *pVect2 = (float *) pVect2v;
+        size_t qty = *((size_t *) qty_ptr);
+
+        size_t qty16 = qty / 16;
+
+
+        const float *pEnd1 = pVect1 + 16 * qty16;
+
+        __m512 sum512 = _mm512_set1_ps(0);
+
+        while (pVect1 < pEnd1) {
+            //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
+
+            __m512 v1 = _mm512_loadu_ps(pVect1);
+            pVect1 += 16;
+            __m512 v2 = _mm512_loadu_ps(pVect2);
+            pVect2 += 16;
+            sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2));
+        }
+
+        _mm512_store_ps(TmpRes, sum512);
+        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15];
+
+        return 1.0f - sum;
+    }
+
+#elif defined(USE_AVX)
 
     static float
     InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
@@ -211,7 +244,7 @@ namespace hnswlib {
 
 #endif
 
-#if defined(USE_SSE) || defined(USE_AVX)
+#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512)
     static float
     InnerProductSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         size_t qty = *((size_t *) qty_ptr);
@@ -249,7 +282,7 @@ namespace hnswlib {
     public:
         InnerProductSpace(size_t dim) {
             fstdistfunc_ = InnerProduct;
-    #if defined(USE_AVX) || defined(USE_SSE)
+    #if defined(USE_AVX) || defined(USE_SSE) || defined(USE_AVX512)
             if (dim % 16 == 0)
                 fstdistfunc_ = InnerProductSIMD16Ext;
             else if (dim % 4 == 0)


=====================================
hnswlib/space_l2.h
=====================================
@@ -19,7 +19,41 @@ namespace hnswlib {
         return (res);
     }
 
-#if defined(USE_AVX)
+#if defined(USE_AVX512)
+
+    // Favor using AVX512 if available.
+    static float
+    L2SqrSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        float *pVect1 = (float *) pVect1v;
+        float *pVect2 = (float *) pVect2v;
+        size_t qty = *((size_t *) qty_ptr);
+        float PORTABLE_ALIGN64 TmpRes[16];
+        size_t qty16 = qty >> 4;
+
+        const float *pEnd1 = pVect1 + (qty16 << 4);
+
+        __m512 diff, v1, v2;
+        __m512 sum = _mm512_set1_ps(0);
+
+        while (pVect1 < pEnd1) {
+            v1 = _mm512_loadu_ps(pVect1);
+            pVect1 += 16;
+            v2 = _mm512_loadu_ps(pVect2);
+            pVect2 += 16;
+            diff = _mm512_sub_ps(v1, v2);
+            // sum = _mm512_fmadd_ps(diff, diff, sum);
+            sum = _mm512_add_ps(sum, _mm512_mul_ps(diff, diff));
+        }
+
+        _mm512_store_ps(TmpRes, sum);
+        float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] +
+                TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] +
+                TmpRes[13] + TmpRes[14] + TmpRes[15];
+
+        return (res);
+}
+
+#elif defined(USE_AVX)
 
     // Favor using AVX if available.
     static float
@@ -106,7 +140,7 @@ namespace hnswlib {
     }
 #endif
 
-#if defined(USE_SSE) || defined(USE_AVX)
+#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512)
     static float
     L2SqrSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         size_t qty = *((size_t *) qty_ptr);
@@ -174,7 +208,7 @@ namespace hnswlib {
     public:
         L2Space(size_t dim) {
             fstdistfunc_ = L2Sqr;
-        #if defined(USE_SSE) || defined(USE_AVX)
+        #if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512)
             if (dim % 16 == 0)
                 fstdistfunc_ = L2SqrSIMD16Ext;
             else if (dim % 4 == 0)
@@ -278,4 +312,4 @@ namespace hnswlib {
     };
 
 
-}
\ No newline at end of file
+}


=====================================
hnswlib/visited_list_pool.h
=====================================
@@ -2,6 +2,7 @@
 
 #include <mutex>
 #include <string.h>
+#include <deque>
 
 namespace hnswlib {
     typedef unsigned short int vl_type;


=====================================
python_bindings/__init__.py
=====================================


=====================================
python_bindings/bindings.cpp
=====================================
@@ -70,16 +70,14 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn
             std::rethrow_exception(lastException);
         }
     }
-
-
 }
 
-    inline void assert_true(bool expr, const std::string & msg) {
-      if (expr == false)
-        throw std::runtime_error("Unpickle Error: "+msg);
-      return;
-    }
 
+inline void assert_true(bool expr, const std::string & msg) {
+    if (expr == false)
+        throw std::runtime_error("Unpickle Error: "+msg);
+    return;
+}
 
 
 template<typename dist_t, typename data_t=float>
@@ -141,14 +139,12 @@ public:
         seed=random_seed;
     }
 
-
     void set_ef(size_t ef) {
       default_ef=ef;
       if (appr_alg)
         appr_alg->ef_ = ef;
     }
 
-
     void set_num_threads(int num_threads) {
         this->num_threads_default = num_threads;
     }
@@ -207,14 +203,14 @@ public:
         if (!ids_.is_none()) {
             py::array_t < size_t, py::array::c_style | py::array::forcecast > items(ids_);
             auto ids_numpy = items.request();
-            if(ids_numpy.ndim==1 && ids_numpy.shape[0]==rows) {
+            if(ids_numpy.ndim == 1 && ids_numpy.shape[0] == rows) {
                 std::vector<size_t> ids1(ids_numpy.shape[0]);
                 for (size_t i = 0; i < ids1.size(); i++) {
                     ids1[i] = items.data()[i];
                 }
                 ids.swap(ids1);
             }
-            else if(ids_numpy.ndim==0 && rows==1) {
+            else if(ids_numpy.ndim == 0 && rows == 1) {
                 ids.push_back(*items.data());
             }
             else
@@ -227,7 +223,7 @@ public:
           int start = 0;
           if (!ep_added) {
             size_t id = ids.size() ? ids.at(0) : (cur_l);
-            float *vector_data=(float *) items.data(0);
+            float *vector_data = (float *) items.data(0);
             std::vector<float> norm_array(dim);
             if(normalize){
               normalize_vector(vector_data, norm_array.data());
@@ -279,7 +275,6 @@ public:
     }
 
     std::vector<hnswlib::labeltype> getIdsList() {
-
         std::vector<hnswlib::labeltype> ids;
 
         for(auto kv : appr_alg->label_lookup_) {
@@ -290,9 +285,6 @@ public:
 
 
     py::dict getAnnData() const { /* WARNING: Index::getAnnData is not thread-safe with Index::addItems */
-
-
-
       std::unique_lock <std::mutex> templock(appr_alg->global);
 
       unsigned int level0_npy_size = appr_alg->cur_element_count * appr_alg->size_data_per_element_;
@@ -369,7 +361,7 @@ public:
                       "mult"_a=appr_alg->mult_,
                       "ef_construction"_a=appr_alg->ef_construction_,
                       "ef"_a=appr_alg->ef_,
-                      "has_deletions"_a=appr_alg->has_deletions_,
+                      "has_deletions"_a=(bool)appr_alg->num_deleted_,
                       "size_links_per_element"_a=appr_alg->size_links_per_element_,
 
                       "label_lookup_external"_a=py::array_t<hnswlib::labeltype>(
@@ -402,10 +394,7 @@ public:
                               {sizeof(char)}, // C-style contiguous strides for double
                               link_list_npy, // the data pointer
                               free_when_done_ll)
-
                     );
-
-
     }
 
 
@@ -431,7 +420,6 @@ public:
 
 
     static Index<float> * createFromParams(const py::dict d) {
-
       // check serialization version
       assert_true(((int)py::int_(Index<float>::ser_version)) >= d["ser_version"].cast<int>(), "Invalid serialization version!");
 
@@ -466,8 +454,6 @@ public:
     }
 
     void setAnnData(const py::dict d) { /* WARNING: Index::setAnnData is not thread-safe with Index::addItems */
-
-
       std::unique_lock <std::mutex> templock(appr_alg->global);
 
       assert_true(appr_alg->offsetLevel0_ == d["offset_level0"].cast<size_t>(), "Invalid value of offsetLevel0_ ");
@@ -489,7 +475,6 @@ public:
       assert_true(appr_alg->ef_construction_ == d["ef_construction"].cast<size_t>(), "Invalid value of ef_construction_ ");
 
       appr_alg->ef_ = d["ef"].cast<size_t>();
-      appr_alg->has_deletions_=d["has_deletions"].cast<bool>();
 
       assert_true(appr_alg->size_links_per_element_ == d["size_links_per_element"].cast<size_t>(), "Invalid value of size_links_per_element_ ");
 
@@ -535,10 +520,20 @@ public:
 
           }
       }
+
+      // set num_deleted
+      appr_alg->num_deleted_ = 0;
+      bool has_deletions = d["has_deletions"].cast<bool>();
+      if (has_deletions)
+      {
+        for (size_t i = 0; i < appr_alg->cur_element_count; i++) {
+          if(appr_alg->isMarkedDeleted(i))
+            appr_alg->num_deleted_ += 1;
+        }
+      }
 }
 
     py::object knnQuery_return_numpy(py::object input, size_t k = 1, int num_threads = -1) {
-
         py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input);
         auto buffer = items.request();
         hnswlib::labeltype *data_numpy_l;
@@ -561,7 +556,6 @@ public:
                 features = buffer.shape[0];
             }
 
-
             // avoid using threads when the number of searches is small:
 
             if(rows<=num_threads*4){
@@ -609,7 +603,6 @@ public:
                             }
                 );
             }
-
         }
         py::capsule free_when_done_l(data_numpy_l, [](void *f) {
             delete[] f;
@@ -618,7 +611,6 @@ public:
             delete[] f;
         });
 
-
         return py::make_tuple(
                 py::array_t<hnswlib::labeltype>(
                         {rows, k}, // shape
@@ -638,6 +630,10 @@ public:
         appr_alg->markDelete(label);
     }
 
+    void unmarkDeleted(size_t label) {
+        appr_alg->unmarkDelete(label);
+    }
+
     void resizeIndex(size_t new_size) {
         appr_alg->resizeIndex(new_size);
     }
@@ -649,10 +645,190 @@ public:
     size_t getCurrentCount() const {
         return appr_alg->cur_element_count;
     }
-
 };
 
+template<typename dist_t, typename data_t=float>
+class BFIndex {
+public:
+    BFIndex(const std::string &space_name, const int dim) :
+            space_name(space_name), dim(dim) {
+        normalize=false;
+        if(space_name=="l2") {
+            space = new hnswlib::L2Space(dim);
+        }
+        else if(space_name=="ip") {
+            space = new hnswlib::InnerProductSpace(dim);
+        }
+        else if(space_name=="cosine") {
+            space = new hnswlib::InnerProductSpace(dim);
+            normalize=true;
+        } else {
+            throw new std::runtime_error("Space name must be one of l2, ip, or cosine.");
+        }
+        alg = NULL;
+        index_inited = false;
+    }
+
+    static const int ser_version = 1; // serialization version
+
+    std::string space_name;
+    int dim;
+    bool index_inited;
+    bool normalize;
+
+    hnswlib::labeltype cur_l;
+    hnswlib::BruteforceSearch<dist_t> *alg;
+    hnswlib::SpaceInterface<float> *space;
+
+    ~BFIndex() {
+        delete space;
+        if (alg)
+            delete alg;
+    }
+
+    void init_new_index(const size_t maxElements) {
+        if (alg) {
+            throw new std::runtime_error("The index is already initiated.");
+        }
+        cur_l = 0;
+        alg = new hnswlib::BruteforceSearch<dist_t>(space, maxElements);
+        index_inited = true;
+    }
+
+    void normalize_vector(float *data, float *norm_array){
+        float norm=0.0f;
+        for(int i=0;i<dim;i++)
+            norm+=data[i]*data[i];
+        norm= 1.0f / (sqrtf(norm) + 1e-30f);
+        for(int i=0;i<dim;i++)
+            norm_array[i]=data[i]*norm;
+    }
+
+    void addItems(py::object input, py::object ids_ = py::none()) {
+        py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input);
+        auto buffer = items.request();
+        size_t rows, features;
+
+        if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array");
+        if (buffer.ndim == 2) {
+            rows = buffer.shape[0];
+            features = buffer.shape[1];
+        } else {
+            rows = 1;
+            features = buffer.shape[0];
+        }
+
+        if (features != dim)
+            throw std::runtime_error("wrong dimensionality of the vectors");
+
+        std::vector<size_t> ids;
+
+        if (!ids_.is_none()) {
+            py::array_t < size_t, py::array::c_style | py::array::forcecast > items(ids_);
+            auto ids_numpy = items.request();
+            if (ids_numpy.ndim == 1 && ids_numpy.shape[0] == rows) {
+                std::vector<size_t> ids1(ids_numpy.shape[0]);
+                for (size_t i = 0; i < ids1.size(); i++) {
+                    ids1[i] = items.data()[i];
+                }
+                ids.swap(ids1);
+            } else if (ids_numpy.ndim == 0 && rows == 1) {
+                ids.push_back(*items.data());
+            } else
+                throw std::runtime_error("wrong dimensionality of the labels");
+        }
+        {
+
+            for (size_t row = 0; row < rows; row++) {
+                size_t id = ids.size() ? ids.at(row) : cur_l + row;
+                if (!normalize) {
+                    alg->addPoint((void *) items.data(row), (size_t) id);
+                } else {
+                    std::vector<float> normalized_vector(dim);
+                    normalize_vector((float *)items.data(row), normalized_vector.data());
+                    alg->addPoint((void *) normalized_vector.data(), (size_t) id);
+                }
+            }
+            cur_l+=rows;
+        }
+    }
+
+    void deleteVector(size_t label) {
+        alg->removePoint(label);
+    }
+
+    void saveIndex(const std::string &path_to_index) {
+        alg->saveIndex(path_to_index);
+    }
+
+    void loadIndex(const std::string &path_to_index, size_t max_elements) {
+        if (alg) {
+            std::cerr<<"Warning: Calling load_index for an already inited index. Old index is being deallocated.";
+            delete alg;
+        }
+        alg = new hnswlib::BruteforceSearch<dist_t>(space, path_to_index);
+        cur_l = alg->cur_element_count;
+        index_inited = true;
+    }
+
+    py::object knnQuery_return_numpy(py::object input, size_t k = 1) {
+
+        py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input);
+        auto buffer = items.request();
+        hnswlib::labeltype *data_numpy_l;
+        dist_t *data_numpy_d;
+        size_t rows, features;
+        {
+            py::gil_scoped_release l;
+
+            if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array");
+            if (buffer.ndim == 2) {
+                rows = buffer.shape[0];
+                features = buffer.shape[1];
+            } else {
+                rows = 1;
+                features = buffer.shape[0];
+            }
+
+            data_numpy_l = new hnswlib::labeltype[rows * k];
+            data_numpy_d = new dist_t[rows * k];
 
+            for (size_t row = 0; row < rows; row++) {
+                std::priority_queue<std::pair<dist_t, hnswlib::labeltype >> result = alg->searchKnn(
+                        (void *) items.data(row), k);
+                for (int i = k - 1; i >= 0; i--) {
+                    auto &result_tuple = result.top();
+                    data_numpy_d[row * k + i] = result_tuple.first;
+                    data_numpy_l[row * k + i] = result_tuple.second;
+                    result.pop();
+                }
+            }
+        }
+
+        py::capsule free_when_done_l(data_numpy_l, [](void *f) {
+            delete[] f;
+        });
+        py::capsule free_when_done_d(data_numpy_d, [](void *f) {
+            delete[] f;
+        });
+
+
+        return py::make_tuple(
+                py::array_t<hnswlib::labeltype>(
+                        {rows, k}, // shape
+                        {k * sizeof(hnswlib::labeltype),
+                         sizeof(hnswlib::labeltype)}, // C-style contiguous strides for double
+                        data_numpy_l, // the data pointer
+                        free_when_done_l),
+                py::array_t<dist_t>(
+                        {rows, k}, // shape
+                        {k * sizeof(dist_t), sizeof(dist_t)}, // C-style contiguous strides for double
+                        data_numpy_d, // the data pointer
+                        free_when_done_d));
+
+    }
+
+};
 
 PYBIND11_PLUGIN(hnswlib) {
         py::module m("hnswlib");
@@ -672,6 +848,7 @@ PYBIND11_PLUGIN(hnswlib) {
         .def("save_index", &Index<float>::saveIndex, py::arg("path_to_index"))
         .def("load_index", &Index<float>::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0)
         .def("mark_deleted", &Index<float>::markDeleted, py::arg("label"))
+        .def("unmark_deleted", &Index<float>::unmarkDeleted, py::arg("label"))
         .def("resize_index", &Index<float>::resizeIndex, py::arg("new_size"))
         .def("get_max_elements", &Index<float>::getMaxElements)
         .def("get_current_count", &Index<float>::getCurrentCount)
@@ -716,5 +893,16 @@ PYBIND11_PLUGIN(hnswlib) {
             return "<hnswlib.Index(space='" + a.space_name + "', dim="+std::to_string(a.dim)+")>";
         });
 
+        py::class_<BFIndex<float>>(m, "BFIndex")
+        .def(py::init<const std::string &, const int>(), py::arg("space"), py::arg("dim"))
+        .def("init_index", &BFIndex<float>::init_new_index, py::arg("max_elements"))
+        .def("knn_query", &BFIndex<float>::knnQuery_return_numpy, py::arg("data"), py::arg("k")=1)
+        .def("add_items", &BFIndex<float>::addItems, py::arg("data"), py::arg("ids") = py::none())
+        .def("delete_vector", &BFIndex<float>::deleteVector, py::arg("label"))
+        .def("save_index", &BFIndex<float>::saveIndex, py::arg("path_to_index"))
+        .def("load_index", &BFIndex<float>::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0)
+        .def("__repr__", [](const BFIndex<float> &a) {
+            return "<hnswlib.BFIndex(space='" + a.space_name + "', dim="+std::to_string(a.dim)+")>";
+        });
         return m.ptr();
 }


=====================================
python_bindings/tests/bindings_test_labels.py
=====================================
@@ -8,7 +8,7 @@ import hnswlib
 
 class RandomSelfTestCase(unittest.TestCase):
     def testRandomSelf(self):
-        for idx in range(16):
+        for idx in range(2):
             print("\n**** Index save-load test ****\n")
 
             np.random.seed(idx)
@@ -94,23 +94,23 @@ class RandomSelfTestCase(unittest.TestCase):
             self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0)
 
             # Delete data1
-            labels1, _ = p.knn_query(data1, k=1)
+            labels1_deleted, _ = p.knn_query(data1, k=1)
 
-            for l in labels1:
+            for l in labels1_deleted:
                 p.mark_deleted(l[0])
             labels2, _ = p.knn_query(data2, k=1)
-            items=p.get_items(labels2)
+            items = p.get_items(labels2)
             diff_with_gt_labels = np.mean(np.abs(data2-items))
             self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3) # console
 
             labels1_after, _ = p.knn_query(data1, k=1)
             for la in labels1_after:
-                for lb in labels1:
+                for lb in labels1_deleted:
                     if la[0] == lb[0]:
                         self.assertTrue(False)
             print("All the data in data1 are removed")
 
-            # checking saving/loading index with elements marked as deleted
+            # Checking saving/loading index with elements marked as deleted
             del_index_path = "with_deleted.bin"
             p.save_index(del_index_path)
             p = hnswlib.Index(space='l2', dim=dim)
@@ -119,9 +119,16 @@ class RandomSelfTestCase(unittest.TestCase):
 
             labels1_after, _ = p.knn_query(data1, k=1)
             for la in labels1_after:
-                for lb in labels1:
+                for lb in labels1_deleted:
                     if la[0] == lb[0]:
                         self.assertTrue(False)
 
+            # Unmark deleted data
+            for l in labels1_deleted:
+                p.unmark_deleted(l[0])
+            labels_restored, _ = p.knn_query(data1, k=1)
+            self.assertAlmostEqual(np.mean(labels_restored.reshape(-1) == np.arange(len(data1))), 1.0, 3)
+            print("All the data in data1 are restored")
+
         os.remove(index_path)
         os.remove(del_index_path)


=====================================
python_bindings/tests/bindings_test_pickle.py
=====================================
@@ -124,13 +124,12 @@ def test_space_main(self, space, dim):
 class PickleUnitTests(unittest.TestCase):
 
     def setUp(self):
+        self.ef_construction = 200
+        self.M = 32
+        self.ef = 400
 
-        self.ef_construction = 725
-        self.M = 64
-        self.ef = 725
-
-        self.num_elements = 5000
-        self.num_test_elements = 200
+        self.num_elements = 1000
+        self.num_test_elements = 100
 
         self.num_threads = 4
         self.k = 25
@@ -143,10 +142,10 @@ class PickleUnitTests(unittest.TestCase):
                                  # i.e., number of values that are (d1-d2)**2>1e-3
 
     def test_inner_product_space(self):
-        test_space_main(self, 'ip', 48)
+        test_space_main(self, 'ip', 16)
 
     def test_l2_space(self):
-        test_space_main(self, 'l2', 153)
+        test_space_main(self, 'l2', 53)
 
     def test_cosine_space(self):
-        test_space_main(self, 'cosine', 512)
+        test_space_main(self, 'cosine', 32)


=====================================
python_bindings/tests/bindings_test_recall.py
=====================================
@@ -0,0 +1,88 @@
+import hnswlib
+import numpy as np
+
+dim = 32
+num_elements = 100000
+k = 10
+nun_queries = 10
+
+# Generating sample data
+data = np.float32(np.random.random((num_elements, dim)))
+
+# Declaring index
+hnsw_index = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
+bf_index = hnswlib.BFIndex(space='l2', dim=dim)
+
+# Initing both hnsw and brute force indices
+# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
+# during insertion of an element.
+# The capacity can be increased by saving/loading the index, see below.
+#
+# hnsw construction params:
+# ef_construction - controls index search speed/build speed tradeoff
+#
+# M - is tightly connected with internal dimensionality of the data. Strongly affects the memory consumption (~M)
+# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction
+
+hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16)
+bf_index.init_index(max_elements=num_elements)
+
+# Controlling the recall for hnsw by setting ef:
+# higher ef leads to better accuracy, but slower search
+hnsw_index.set_ef(200)
+
+# Set number of threads used during batch search/construction in hnsw
+# By default using all available cores
+hnsw_index.set_num_threads(1)
+
+print("Adding batch of %d elements" % (len(data)))
+hnsw_index.add_items(data)
+bf_index.add_items(data)
+
+print("Indices built")
+
+# Generating query data
+query_data = np.float32(np.random.random((nun_queries, dim)))
+
+# Query the elements and measure recall:
+labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k)
+labels_bf, distances_bf = bf_index.knn_query(query_data, k)
+
+# Measure recall
+correct = 0
+for i in range(nun_queries):
+    for label in labels_hnsw[i]:
+        for correct_label in labels_bf[i]:
+            if label == correct_label:
+                correct += 1
+                break
+
+print("recall is :", float(correct)/(k*nun_queries))
+
+# test serializing  the brute force index
+index_path = 'bf_index.bin'
+print("Saving index to '%s'" % index_path)
+bf_index.save_index(index_path)
+del bf_index
+
+# Re-initiating, loading the index
+bf_index = hnswlib.BFIndex(space='l2', dim=dim)
+
+print("\nLoading index from '%s'\n" % index_path)
+bf_index.load_index(index_path)
+
+# Query the brute force index again to verify that we get the same results
+labels_bf, distances_bf = bf_index.knn_query(query_data, k)
+
+# Measure recall
+correct = 0
+for i in range(nun_queries):
+    for label in labels_hnsw[i]:
+        for correct_label in labels_bf[i]:
+            if label == correct_label:
+                correct += 1
+                break
+
+print("recall after reloading is :", float(correct)/(k*nun_queries))
+
+


=====================================
python_bindings/tests/bindings_test_spaces.py
=====================================
@@ -0,0 +1,39 @@
+import unittest
+
+import numpy as np
+
+import hnswlib
+
+class RandomSelfTestCase(unittest.TestCase):
+    def testRandomSelf(self):
+
+        data1 = np.asarray([[1, 0, 0],
+                            [0, 1, 0],
+                            [0, 0, 1],
+                            [1, 0, 1],
+                            [1, 1, 1],
+                            ])
+
+        for space, expected_distances in [
+            ('l2', [[0., 1., 2., 2., 2.]]),
+            ('ip', [[-2., -1., 0., 0., 0.]]),
+            ('cosine', [[0, 1.835e-01, 4.23e-01, 4.23e-01, 4.23e-01]])]:
+
+            for rightdim in range(1, 128, 3):
+                for leftdim in range(1, 32, 5):
+                    data2 = np.concatenate(
+                        [np.zeros([data1.shape[0], leftdim]), data1, np.zeros([data1.shape[0], rightdim])], axis=1)
+                    dim = data2.shape[1]
+                    p = hnswlib.Index(space=space, dim=dim)
+                    p.init_index(max_elements=5, ef_construction=100, M=16)
+
+                    p.set_ef(10)
+
+                    p.add_items(data2)
+
+                    # Query the elements for themselves and measure recall:
+                    labels, distances = p.knn_query(np.asarray(data2[-1:]), k=5)
+
+                    
+                    diff=np.mean(np.abs(distances-expected_distances))                    
+                    self.assertAlmostEqual(diff, 0, delta=1e-3)


=====================================
setup.py
=====================================
@@ -7,7 +7,7 @@ import setuptools
 from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext
 
-__version__ = '0.5.2'
+__version__ = '0.6.0'
 
 
 include_dirs = [



View it on GitLab: https://salsa.debian.org/med-team/hnswlib/-/commit/c6215c32397a67ee35133a8ae5f77e886f00c040
