[med-svn] [Git][med-team/hnswlib][upstream] New upstream version 0.8.0
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Thu Dec 14 10:40:08 GMT 2023
Étienne Mollier pushed to branch upstream at Debian Med / hnswlib
Commits:
519411fd by Étienne Mollier at 2023-12-14T10:35:32+01:00
New upstream version 0.8.0
- - - - -
23 changed files:
- .github/workflows/build.yml
- .gitignore
- CMakeLists.txt
- README.md
- examples/cpp/EXAMPLES.md
- + examples/cpp/example_epsilon_search.cpp
- + examples/cpp/example_multivector_search.cpp
- hnswlib/bruteforce.h
- hnswlib/hnswalg.h
- hnswlib/hnswlib.h
- hnswlib/space_ip.h
- + hnswlib/stop_condition.h
- python_bindings/bindings.cpp
- + python_bindings/tests/bindings_test_bf_index.py
- setup.py
- + tests/cpp/epsilon_search_test.cpp
- + tests/cpp/multivector_search_test.cpp
- tests/cpp/sift_1b.cpp
- tests/cpp/updates_test.cpp
- tests/python/bindings_test_getdata.py
- tests/python/bindings_test_replace.py
- + tests/python/draw_git_test_plots.py
- tests/python/git_tester.py
Changes:
=====================================
.github/workflows/build.yml
=====================================
@@ -7,7 +7,7 @@ jobs:
runs-on: ${{matrix.os}}
strategy:
matrix:
- os: [ubuntu-latest, windows-latest]
+ os: [ubuntu-latest, windows-latest, macos-latest]
python-version: ["3.7", "3.8", "3.9", "3.10"]
steps:
- uses: actions/checkout at v3
@@ -28,7 +28,7 @@ jobs:
runs-on: ${{matrix.os}}
strategy:
matrix:
- os: [ubuntu-latest, windows-latest]
+ os: [ubuntu-latest, windows-latest, macos-latest]
steps:
- uses: actions/checkout at v3
- uses: actions/setup-python at v4
@@ -40,10 +40,10 @@ jobs:
mkdir build
cd build
cmake ..
- if [ "$RUNNER_OS" == "Linux" ]; then
- make
- elif [ "$RUNNER_OS" == "Windows" ]; then
+ if [ "$RUNNER_OS" == "Windows" ]; then
cmake --build ./ --config Release
+ else
+ make
fi
shell: bash
@@ -67,10 +67,14 @@ jobs:
./example_mt_search
./example_mt_filter
./example_mt_replace_deleted
+ ./example_multivector_search
+ ./example_epsilon_search
./searchKnnCloserFirst_test
./searchKnnWithFilter_test
./multiThreadLoad_test
./multiThread_replace_test
./test_updates
./test_updates update
+ ./multivector_search_test
+ ./epsilon_search_test
shell: bash
=====================================
.gitignore
=====================================
@@ -10,3 +10,4 @@ var/
.vscode/
.vs/
**.DS_Store
+*.pyc
=====================================
CMakeLists.txt
=====================================
@@ -1,25 +1,68 @@
-cmake_minimum_required (VERSION 2.6)
-project(hnsw_lib
+cmake_minimum_required(VERSION 3.0...3.26)
+
+project(hnswlib
LANGUAGES CXX)
+include(GNUInstallDirs)
+include(CheckCXXCompilerFlag)
+
add_library(hnswlib INTERFACE)
-target_include_directories(hnswlib INTERFACE .)
+add_library(hnswlib::hnswlib ALIAS hnswlib)
+
+target_include_directories(hnswlib INTERFACE
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+ $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+
+# Install
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/hnswlib
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+install(TARGETS hnswlib
+ EXPORT hnswlibTargets)
+
+install(EXPORT hnswlibTargets
+ FILE hnswlibConfig.cmake
+ NAMESPACE hnswlib::
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hnswlib)
+# Examples and tests
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
+ option(HNSWLIB_EXAMPLES "Build examples and tests." ON)
+else()
+ option(HNSWLIB_EXAMPLES "Build examples and tests." OFF)
+endif()
+if(HNSWLIB_EXAMPLES)
set(CMAKE_CXX_STANDARD 11)
- if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
- SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize")
+ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ SET( CMAKE_CXX_FLAGS "-Ofast -std=c++11 -DHAVE_CXX0X -openmp -fpic -ftree-vectorize" )
+ check_cxx_compiler_flag("-march=native" COMPILER_SUPPORT_NATIVE_FLAG)
+ if(COMPILER_SUPPORT_NATIVE_FLAG)
+ SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native" )
+ message("set -march=native flag")
+ else()
+ check_cxx_compiler_flag("-mcpu=apple-m1" COMPILER_SUPPORT_M1_FLAG)
+ if(COMPILER_SUPPORT_M1_FLAG)
+ SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=apple-m1" )
+ message("set -mcpu=apple-m1 flag")
+ endif()
+ endif()
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
+ SET( CMAKE_CXX_FLAGS "-Ofast -lrt -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
- SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
+ SET( CMAKE_CXX_FLAGS "/O2 -DHAVE_CXX0X /W1 /openmp /EHsc" )
endif()
# examples
add_executable(example_search examples/cpp/example_search.cpp)
target_link_libraries(example_search hnswlib)
+ add_executable(example_epsilon_search examples/cpp/example_epsilon_search.cpp)
+ target_link_libraries(example_epsilon_search hnswlib)
+
+ add_executable(example_multivector_search examples/cpp/example_multivector_search.cpp)
+ target_link_libraries(example_multivector_search hnswlib)
+
add_executable(example_filter examples/cpp/example_filter.cpp)
target_link_libraries(example_filter hnswlib)
@@ -36,6 +79,12 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
target_link_libraries(example_mt_replace_deleted hnswlib)
# tests
+ add_executable(multivector_search_test tests/cpp/multivector_search_test.cpp)
+ target_link_libraries(multivector_search_test hnswlib)
+
+ add_executable(epsilon_search_test tests/cpp/epsilon_search_test.cpp)
+ target_link_libraries(epsilon_search_test hnswlib)
+
add_executable(test_updates tests/cpp/updates_test.cpp)
target_link_libraries(test_updates hnswlib)
=====================================
README.md
=====================================
@@ -3,6 +3,15 @@ Header-only C++ HNSW implementation with python bindings, insertions and updates
**NEWS:**
+**version 0.8.0**
+
+* Multi-vector document search and epsilon search (for now, only in C++)
+* By default, there is no statistic aggregation, which speeds up the multi-threaded search (it does not seem like people are using it anyway: [Issue #495](https://github.com/nmslib/hnswlib/issues/495)).
+* Various bugfixes and improvements
+* `get_items` now has a `return_type` parameter, which can be either 'numpy' or 'list'
+
+Full list of changes: https://github.com/nmslib/hnswlib/pull/523
+
**version 0.7.0**
* Added support to filtering (#402, #430) by [@kishorenc](https://github.com/kishorenc)
@@ -79,7 +88,7 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib.
* `set_num_threads(num_threads)` set the default number of cpu threads used during data insertion/querying.
-* `get_items(ids)` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). Note that for cosine similarity it currently returns **normalized** vectors.
+* `get_items(ids, return_type = 'numpy')` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`); if `return_type` is `'list'`, it returns a list of lists instead. Note that for cosine similarity it currently returns **normalized** vectors.
* `get_ids_list()` - returns a list of all elements' ids.
@@ -229,6 +238,8 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat
* filtering during the search with a boolean function
* deleting the elements and reusing the memory of the deleted elements for newly added elements
* multithreaded usage
+* multivector search
+* epsilon search
### Bindings installation
=====================================
examples/cpp/EXAMPLES.md
=====================================
@@ -182,4 +182,8 @@ int main() {
Multithreaded examples:
* Creating index, inserting elements, searching [example_mt_search.cpp](example_mt_search.cpp)
* Filtering during the search with a boolean function [example_mt_filter.cpp](example_mt_filter.cpp)
-* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)
\ No newline at end of file
+* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)
+
+More examples:
+* Multivector search [example_multivector_search.cpp](example_multivector_search.cpp)
+* Epsilon search [example_epsilon_search.cpp](example_epsilon_search.cpp)
\ No newline at end of file
=====================================
examples/cpp/example_epsilon_search.cpp
=====================================
@@ -0,0 +1,66 @@
+#include "../../hnswlib/hnswlib.h"
+
+typedef unsigned int docidtype;
+typedef float dist_t;
+
+int main() {
+ int dim = 16; // Dimension of the elements
+ int max_elements = 10000; // Maximum number of elements, should be known beforehand
+ int M = 16; // Tightly connected with internal dimensionality of the data
+ // strongly affects the memory consumption
+ int ef_construction = 200; // Controls index search speed/build speed tradeoff
+ int min_num_candidates = 100; // Minimum number of candidates to search in the epsilon region
+ // this parameter is similar to ef
+
+ int num_queries = 5;
+ float epsilon2 = 2.0; // Squared distance to query
+
+ // Initing index
+ hnswlib::L2Space space(dim);
+ hnswlib::HierarchicalNSW<dist_t>* alg_hnsw = new hnswlib::HierarchicalNSW<dist_t>(&space, max_elements, M, ef_construction);
+
+ // Generate random data
+ std::mt19937 rng;
+ rng.seed(47);
+ std::uniform_real_distribution<> distrib_real;
+
+ size_t data_point_size = space.get_data_size();
+ char* data = new char[data_point_size * max_elements];
+ for (int i = 0; i < max_elements; i++) {
+ char* point_data = data + i * data_point_size;
+ for (int j = 0; j < dim; j++) {
+ char* vec_data = point_data + j * sizeof(float);
+ float value = distrib_real(rng);
+ *(float*)vec_data = value;
+ }
+ }
+
+ // Add data to index
+ for (int i = 0; i < max_elements; i++) {
+ hnswlib::labeltype label = i;
+ char* point_data = data + i * data_point_size;
+ alg_hnsw->addPoint(point_data, label);
+ }
+
+ // Query random vectors
+ for (int i = 0; i < num_queries; i++) {
+ char* query_data = new char[data_point_size];
+ for (int j = 0; j < dim; j++) {
+ size_t offset = j * sizeof(float);
+ char* vec_data = query_data + offset;
+ float value = distrib_real(rng);
+ *(float*)vec_data = value;
+ }
+ std::cout << "Query #" << i << "\n";
+ hnswlib::EpsilonSearchStopCondition<dist_t> stop_condition(epsilon2, min_num_candidates, max_elements);
+ std::vector<std::pair<float, hnswlib::labeltype>> result =
+ alg_hnsw->searchStopConditionClosest(query_data, stop_condition);
+ size_t num_vectors = result.size();
+ std::cout << "Found " << num_vectors << " vectors\n";
+ delete[] query_data;
+ }
+
+ delete[] data;
+ delete alg_hnsw;
+ return 0;
+}
=====================================
examples/cpp/example_multivector_search.cpp
=====================================
@@ -0,0 +1,83 @@
+#include "../../hnswlib/hnswlib.h"
+
+typedef unsigned int docidtype;
+typedef float dist_t;
+
+int main() {
+ int dim = 16; // Dimension of the elements
+ int max_elements = 10000; // Maximum number of elements, should be known beforehand
+ int M = 16; // Tightly connected with internal dimensionality of the data
+ // strongly affects the memory consumption
+ int ef_construction = 200; // Controls index search speed/build speed tradeoff
+
+ int num_queries = 5;
+ int num_docs = 5; // Number of documents to search
+ int ef_collection = 6; // Number of candidate documents during the search
+                               // Controls the recall: higher ef leads to better accuracy, but slower search
+ docidtype min_doc_id = 0;
+ docidtype max_doc_id = 9;
+
+ // Initing index
+ hnswlib::MultiVectorL2Space<docidtype> space(dim);
+ hnswlib::HierarchicalNSW<dist_t>* alg_hnsw = new hnswlib::HierarchicalNSW<dist_t>(&space, max_elements, M, ef_construction);
+
+ // Generate random data
+ std::mt19937 rng;
+ rng.seed(47);
+ std::uniform_real_distribution<> distrib_real;
+ std::uniform_int_distribution<docidtype> distrib_docid(min_doc_id, max_doc_id);
+
+ size_t data_point_size = space.get_data_size();
+ char* data = new char[data_point_size * max_elements];
+ for (int i = 0; i < max_elements; i++) {
+ // set vector value
+ char* point_data = data + i * data_point_size;
+ for (int j = 0; j < dim; j++) {
+ char* vec_data = point_data + j * sizeof(float);
+ float value = distrib_real(rng);
+ *(float*)vec_data = value;
+ }
+ // set document id
+ docidtype doc_id = distrib_docid(rng);
+ space.set_doc_id(point_data, doc_id);
+ }
+
+ // Add data to index
+ std::unordered_map<hnswlib::labeltype, docidtype> label_docid_lookup;
+ for (int i = 0; i < max_elements; i++) {
+ hnswlib::labeltype label = i;
+ char* point_data = data + i * data_point_size;
+ alg_hnsw->addPoint(point_data, label);
+ label_docid_lookup[label] = space.get_doc_id(point_data);
+ }
+
+ // Query random vectors
+ size_t query_size = dim * sizeof(float);
+ for (int i = 0; i < num_queries; i++) {
+ char* query_data = new char[query_size];
+ for (int j = 0; j < dim; j++) {
+ size_t offset = j * sizeof(float);
+ char* vec_data = query_data + offset;
+ float value = distrib_real(rng);
+ *(float*)vec_data = value;
+ }
+ std::cout << "Query #" << i << "\n";
+ hnswlib::MultiVectorSearchStopCondition<docidtype, dist_t> stop_condition(space, num_docs, ef_collection);
+ std::vector<std::pair<float, hnswlib::labeltype>> result =
+ alg_hnsw->searchStopConditionClosest(query_data, stop_condition);
+ size_t num_vectors = result.size();
+
+ std::unordered_map<docidtype, size_t> doc_counter;
+ for (auto pair: result) {
+ hnswlib::labeltype label = pair.second;
+ docidtype doc_id = label_docid_lookup[label];
+ doc_counter[doc_id] += 1;
+ }
+ std::cout << "Found " << doc_counter.size() << " documents, " << num_vectors << " vectors\n";
+ delete[] query_data;
+ }
+
+ delete[] data;
+ delete alg_hnsw;
+ return 0;
+}
=====================================
hnswlib/bruteforce.h
=====================================
@@ -84,10 +84,16 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {
void removePoint(labeltype cur_external) {
- size_t cur_c = dict_external_to_internal[cur_external];
+ std::unique_lock<std::mutex> lock(index_lock);
- dict_external_to_internal.erase(cur_external);
+ auto found = dict_external_to_internal.find(cur_external);
+ if (found == dict_external_to_internal.end()) {
+ return;
+ }
+
+ dict_external_to_internal.erase(found);
+ size_t cur_c = found->second;
labeltype label = *((labeltype*)(data_ + size_per_element_ * (cur_element_count-1) + data_size_));
dict_external_to_internal[label] = cur_c;
memcpy(data_ + size_per_element_ * cur_c,
@@ -106,7 +112,7 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {
dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_);
labeltype label = *((labeltype*) (data_ + size_per_element_ * i + data_size_));
if ((!isIdAllowed) || (*isIdAllowed)(label)) {
- topResults.push(std::pair<dist_t, labeltype>(dist, label));
+ topResults.emplace(dist, label);
}
}
dist_t lastdist = topResults.empty() ? std::numeric_limits<dist_t>::max() : topResults.top().first;
@@ -115,7 +121,7 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {
if (dist <= lastdist) {
labeltype label = *((labeltype *) (data_ + size_per_element_ * i + data_size_));
if ((!isIdAllowed) || (*isIdAllowed)(label)) {
- topResults.push(std::pair<dist_t, labeltype>(dist, label));
+ topResults.emplace(dist, label);
}
if (topResults.size() > k)
topResults.pop();
=====================================
hnswlib/hnswalg.h
=====================================
@@ -8,6 +8,7 @@
#include <assert.h>
#include <unordered_set>
#include <list>
+#include <memory>
namespace hnswlib {
typedef unsigned int tableint;
@@ -33,7 +34,7 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
double mult_{0.0}, revSize_{0.0};
int maxlevel_{0};
- VisitedListPool *visited_list_pool_{nullptr};
+ std::unique_ptr<VisitedListPool> visited_list_pool_{nullptr};
// Locks operations with element by label value
mutable std::vector<std::mutex> label_op_locks_;
@@ -92,8 +93,8 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
size_t ef_construction = 200,
size_t random_seed = 100,
bool allow_replace_deleted = false)
- : link_list_locks_(max_elements),
- label_op_locks_(MAX_LABEL_OPERATION_LOCKS),
+ : label_op_locks_(MAX_LABEL_OPERATION_LOCKS),
+ link_list_locks_(max_elements),
element_levels_(max_elements),
allow_replace_deleted_(allow_replace_deleted) {
max_elements_ = max_elements;
@@ -101,7 +102,13 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
data_size_ = s->get_data_size();
fstdistfunc_ = s->get_dist_func();
dist_func_param_ = s->get_dist_func_param();
- M_ = M;
+ if ( M <= 10000 ) {
+ M_ = M;
+ } else {
+ HNSWERR << "warning: M parameter exceeds 10000 which may lead to adverse effects." << std::endl;
+ HNSWERR << " Cap to 10000 will be applied for the rest of the processing." << std::endl;
+ M_ = 10000;
+ }
maxM_ = M_;
maxM0_ = M_ * 2;
ef_construction_ = std::max(ef_construction, M_);
@@ -122,7 +129,7 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
cur_element_count = 0;
- visited_list_pool_ = new VisitedListPool(1, max_elements);
+ visited_list_pool_ = std::unique_ptr<VisitedListPool>(new VisitedListPool(1, max_elements));
// initializations for special treatment of the first node
enterpoint_node_ = -1;
@@ -138,13 +145,20 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
~HierarchicalNSW() {
+ clear();
+ }
+
+ void clear() {
free(data_level0_memory_);
+ data_level0_memory_ = nullptr;
for (tableint i = 0; i < cur_element_count; i++) {
if (element_levels_[i] > 0)
free(linkLists_[i]);
}
free(linkLists_);
- delete visited_list_pool_;
+ linkLists_ = nullptr;
+ cur_element_count = 0;
+ visited_list_pool_.reset(nullptr);
}
@@ -291,9 +305,15 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
}
- template <bool has_deletions, bool collect_metrics = false>
+ // bare_bone_search means there is no check for deletions and stop condition is ignored in return of extra performance
+ template <bool bare_bone_search = true, bool collect_metrics = false>
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst>
- searchBaseLayerST(tableint ep_id, const void *data_point, size_t ef, BaseFilterFunctor* isIdAllowed = nullptr) const {
+ searchBaseLayerST(
+ tableint ep_id,
+ const void *data_point,
+ size_t ef,
+ BaseFilterFunctor* isIdAllowed = nullptr,
+ BaseSearchStopCondition<dist_t>* stop_condition = nullptr) const {
VisitedList *vl = visited_list_pool_->getFreeVisitedList();
vl_type *visited_array = vl->mass;
vl_type visited_array_tag = vl->curV;
@@ -302,10 +322,15 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> candidate_set;
dist_t lowerBound;
- if ((!has_deletions || !isMarkedDeleted(ep_id)) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(ep_id)))) {
- dist_t dist = fstdistfunc_(data_point, getDataByInternalId(ep_id), dist_func_param_);
+ if (bare_bone_search ||
+ (!isMarkedDeleted(ep_id) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(ep_id))))) {
+ char* ep_data = getDataByInternalId(ep_id);
+ dist_t dist = fstdistfunc_(data_point, ep_data, dist_func_param_);
lowerBound = dist;
top_candidates.emplace(dist, ep_id);
+ if (!bare_bone_search && stop_condition) {
+ stop_condition->add_point_to_result(getExternalLabel(ep_id), ep_data, dist);
+ }
candidate_set.emplace(-dist, ep_id);
} else {
lowerBound = std::numeric_limits<dist_t>::max();
@@ -316,9 +341,19 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
while (!candidate_set.empty()) {
std::pair<dist_t, tableint> current_node_pair = candidate_set.top();
+ dist_t candidate_dist = -current_node_pair.first;
- if ((-current_node_pair.first) > lowerBound &&
- (top_candidates.size() == ef || (!isIdAllowed && !has_deletions))) {
+ bool flag_stop_search;
+ if (bare_bone_search) {
+ flag_stop_search = candidate_dist > lowerBound;
+ } else {
+ if (stop_condition) {
+ flag_stop_search = stop_condition->should_stop_search(candidate_dist, lowerBound);
+ } else {
+ flag_stop_search = candidate_dist > lowerBound && top_candidates.size() == ef;
+ }
+ }
+ if (flag_stop_search) {
break;
}
candidate_set.pop();
@@ -353,7 +388,14 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
char *currObj1 = (getDataByInternalId(candidate_id));
dist_t dist = fstdistfunc_(data_point, currObj1, dist_func_param_);
- if (top_candidates.size() < ef || lowerBound > dist) {
+ bool flag_consider_candidate;
+ if (!bare_bone_search && stop_condition) {
+ flag_consider_candidate = stop_condition->should_consider_candidate(dist, lowerBound);
+ } else {
+ flag_consider_candidate = top_candidates.size() < ef || lowerBound > dist;
+ }
+
+ if (flag_consider_candidate) {
candidate_set.emplace(-dist, candidate_id);
#ifdef USE_SSE
_mm_prefetch(data_level0_memory_ + candidate_set.top().second * size_data_per_element_ +
@@ -361,11 +403,30 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
_MM_HINT_T0); ////////////////////////
#endif
- if ((!has_deletions || !isMarkedDeleted(candidate_id)) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(candidate_id))))
+ if (bare_bone_search ||
+ (!isMarkedDeleted(candidate_id) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(candidate_id))))) {
top_candidates.emplace(dist, candidate_id);
+ if (!bare_bone_search && stop_condition) {
+ stop_condition->add_point_to_result(getExternalLabel(candidate_id), currObj1, dist);
+ }
+ }
- if (top_candidates.size() > ef)
+ bool flag_remove_extra = false;
+ if (!bare_bone_search && stop_condition) {
+ flag_remove_extra = stop_condition->should_remove_extra();
+ } else {
+ flag_remove_extra = top_candidates.size() > ef;
+ }
+ while (flag_remove_extra) {
+ tableint id = top_candidates.top().second;
top_candidates.pop();
+ if (!bare_bone_search && stop_condition) {
+ stop_condition->remove_point_from_result(getExternalLabel(id), getDataByInternalId(id), dist);
+ flag_remove_extra = stop_condition->should_remove_extra();
+ } else {
+ flag_remove_extra = top_candidates.size() > ef;
+ }
+ }
if (!top_candidates.empty())
lowerBound = top_candidates.top().first;
@@ -380,8 +441,8 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
void getNeighborsByHeuristic2(
- std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> &top_candidates,
- const size_t M) {
+ std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> &top_candidates,
+ const size_t M) {
if (top_candidates.size() < M) {
return;
}
@@ -573,8 +634,7 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
if (new_max_elements < cur_element_count)
throw std::runtime_error("Cannot resize, max element is less than the current number of elements");
- delete visited_list_pool_;
- visited_list_pool_ = new VisitedListPool(1, new_max_elements);
+ visited_list_pool_.reset(new VisitedListPool(1, new_max_elements));
element_levels_.resize(new_max_elements);
@@ -595,6 +655,32 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
max_elements_ = new_max_elements;
}
+ size_t indexFileSize() const {
+ size_t size = 0;
+ size += sizeof(offsetLevel0_);
+ size += sizeof(max_elements_);
+ size += sizeof(cur_element_count);
+ size += sizeof(size_data_per_element_);
+ size += sizeof(label_offset_);
+ size += sizeof(offsetData_);
+ size += sizeof(maxlevel_);
+ size += sizeof(enterpoint_node_);
+ size += sizeof(maxM_);
+
+ size += sizeof(maxM0_);
+ size += sizeof(M_);
+ size += sizeof(mult_);
+ size += sizeof(ef_construction_);
+
+ size += cur_element_count * size_data_per_element_;
+
+ for (size_t i = 0; i < cur_element_count; i++) {
+ unsigned int linkListSize = element_levels_[i] > 0 ? size_links_per_element_ * element_levels_[i] : 0;
+ size += sizeof(linkListSize);
+ size += linkListSize;
+ }
+ return size;
+ }
void saveIndex(const std::string &location) {
std::ofstream output(location, std::ios::binary);
@@ -633,6 +719,7 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
if (!input.is_open())
throw std::runtime_error("Cannot open file");
+ clear();
// get file size:
input.seekg(0, input.end);
std::streampos total_filesize = input.tellg();
@@ -698,7 +785,7 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
std::vector<std::mutex>(max_elements).swap(link_list_locks_);
std::vector<std::mutex>(MAX_LABEL_OPERATION_LOCKS).swap(label_op_locks_);
- visited_list_pool_ = new VisitedListPool(1, max_elements);
+ visited_list_pool_.reset(new VisitedListPool(1, max_elements));
linkLists_ = (char **) malloc(sizeof(void *) * max_elements);
if (linkLists_ == nullptr)
@@ -752,7 +839,7 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
size_t dim = *((size_t *) dist_func_param_);
std::vector<data_t> data;
data_t* data_ptr = (data_t*) data_ptrv;
- for (int i = 0; i < dim; i++) {
+ for (size_t i = 0; i < dim; i++) {
data.push_back(*data_ptr);
data_ptr += 1;
}
@@ -1216,11 +1303,12 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
}
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates;
- if (num_deleted_) {
- top_candidates = searchBaseLayerST<true, true>(
+ bool bare_bone_search = !num_deleted_ && !isIdAllowed;
+ if (bare_bone_search) {
+ top_candidates = searchBaseLayerST<true>(
currObj, query_data, std::max(ef_, k), isIdAllowed);
} else {
- top_candidates = searchBaseLayerST<false, true>(
+ top_candidates = searchBaseLayerST<false>(
currObj, query_data, std::max(ef_, k), isIdAllowed);
}
@@ -1236,6 +1324,60 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
}
+ std::vector<std::pair<dist_t, labeltype >>
+ searchStopConditionClosest(
+ const void *query_data,
+ BaseSearchStopCondition<dist_t>& stop_condition,
+ BaseFilterFunctor* isIdAllowed = nullptr) const {
+ std::vector<std::pair<dist_t, labeltype >> result;
+ if (cur_element_count == 0) return result;
+
+ tableint currObj = enterpoint_node_;
+ dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);
+
+ for (int level = maxlevel_; level > 0; level--) {
+ bool changed = true;
+ while (changed) {
+ changed = false;
+ unsigned int *data;
+
+ data = (unsigned int *) get_linklist(currObj, level);
+ int size = getListCount(data);
+ metric_hops++;
+ metric_distance_computations+=size;
+
+ tableint *datal = (tableint *) (data + 1);
+ for (int i = 0; i < size; i++) {
+ tableint cand = datal[i];
+ if (cand < 0 || cand > max_elements_)
+ throw std::runtime_error("cand error");
+ dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
+
+ if (d < curdist) {
+ curdist = d;
+ currObj = cand;
+ changed = true;
+ }
+ }
+ }
+ }
+
+ std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates;
+ top_candidates = searchBaseLayerST<false>(currObj, query_data, 0, isIdAllowed, &stop_condition);
+
+ size_t sz = top_candidates.size();
+ result.resize(sz);
+ while (!top_candidates.empty()) {
+ result[--sz] = top_candidates.top();
+ top_candidates.pop();
+ }
+
+ stop_condition.filter_results(result);
+
+ return result;
+ }
+
+
void checkIntegrity() {
int connections_checked = 0;
std::vector <int > inbound_connections_num(cur_element_count, 0);
@@ -1246,7 +1388,6 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
tableint *data = (tableint *) (ll_cur + 1);
std::unordered_set<tableint> s;
for (int j = 0; j < size; j++) {
- assert(data[j] > 0);
assert(data[j] < cur_element_count);
assert(data[j] != i);
inbound_connections_num[data[j]]++;
=====================================
hnswlib/hnswlib.h
=====================================
@@ -1,4 +1,13 @@
#pragma once
+
+// https://github.com/nmslib/hnswlib/pull/508
+// This allows others to provide their own error stream (e.g. RcppHNSW)
+#ifndef HNSWLIB_ERR_OVERRIDE
+ #define HNSWERR std::cerr
+#else
+ #define HNSWERR HNSWLIB_ERR_OVERRIDE
+#endif
+
#ifndef NO_MANUAL_VECTORIZATION
#if (defined(__SSE__) || _M_IX86_FP > 0 || defined(_M_AMD64) || defined(_M_X64))
#define USE_SSE
@@ -15,7 +24,7 @@
#ifdef _MSC_VER
#include <intrin.h>
#include <stdexcept>
-void cpuid(int32_t out[4], int32_t eax, int32_t ecx) {
+static void cpuid(int32_t out[4], int32_t eax, int32_t ecx) {
__cpuidex(out, eax, ecx);
}
static __int64 xgetbv(unsigned int x) {
@@ -119,6 +128,25 @@ typedef size_t labeltype;
class BaseFilterFunctor {
public:
virtual bool operator()(hnswlib::labeltype id) { return true; }
+ virtual ~BaseFilterFunctor() {};
+};
+
+template<typename dist_t>
+class BaseSearchStopCondition {
+ public:
+ virtual void add_point_to_result(labeltype label, const void *datapoint, dist_t dist) = 0;
+
+ virtual void remove_point_from_result(labeltype label, const void *datapoint, dist_t dist) = 0;
+
+ virtual bool should_stop_search(dist_t candidate_dist, dist_t lowerBound) = 0;
+
+ virtual bool should_consider_candidate(dist_t candidate_dist, dist_t lowerBound) = 0;
+
+ virtual bool should_remove_extra() = 0;
+
+ virtual void filter_results(std::vector<std::pair<dist_t, labeltype >> &candidates) = 0;
+
+ virtual ~BaseSearchStopCondition() {}
};
template <typename T>
@@ -195,5 +223,6 @@ AlgorithmInterface<dist_t>::searchKnnCloserFirst(const void* query_data, size_t
#include "space_l2.h"
#include "space_ip.h"
+#include "stop_condition.h"
#include "bruteforce.h"
#include "hnswalg.h"
=====================================
hnswlib/space_ip.h
=====================================
@@ -157,19 +157,44 @@ InnerProductSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void
__m512 sum512 = _mm512_set1_ps(0);
- while (pVect1 < pEnd1) {
- //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
-
+ size_t loop = qty16 / 4;
+
+ while (loop--) {
__m512 v1 = _mm512_loadu_ps(pVect1);
- pVect1 += 16;
__m512 v2 = _mm512_loadu_ps(pVect2);
+ pVect1 += 16;
+ pVect2 += 16;
+
+ __m512 v3 = _mm512_loadu_ps(pVect1);
+ __m512 v4 = _mm512_loadu_ps(pVect2);
+ pVect1 += 16;
+ pVect2 += 16;
+
+ __m512 v5 = _mm512_loadu_ps(pVect1);
+ __m512 v6 = _mm512_loadu_ps(pVect2);
+ pVect1 += 16;
pVect2 += 16;
- sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2));
+
+ __m512 v7 = _mm512_loadu_ps(pVect1);
+ __m512 v8 = _mm512_loadu_ps(pVect2);
+ pVect1 += 16;
+ pVect2 += 16;
+
+ sum512 = _mm512_fmadd_ps(v1, v2, sum512);
+ sum512 = _mm512_fmadd_ps(v3, v4, sum512);
+ sum512 = _mm512_fmadd_ps(v5, v6, sum512);
+ sum512 = _mm512_fmadd_ps(v7, v8, sum512);
}
- _mm512_store_ps(TmpRes, sum512);
- float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15];
+ while (pVect1 < pEnd1) {
+ __m512 v1 = _mm512_loadu_ps(pVect1);
+ __m512 v2 = _mm512_loadu_ps(pVect2);
+ pVect1 += 16;
+ pVect2 += 16;
+ sum512 = _mm512_fmadd_ps(v1, v2, sum512);
+ }
+ float sum = _mm512_reduce_add_ps(sum512);
return sum;
}
=====================================
hnswlib/stop_condition.h
=====================================
@@ -0,0 +1,276 @@
+#pragma once
+#include "space_l2.h"
+#include "space_ip.h"
+#include <assert.h>
+#include <unordered_map>
+
+namespace hnswlib {
+
+template<typename DOCIDTYPE>
+class BaseMultiVectorSpace : public SpaceInterface<float> {
+ public:
+ virtual DOCIDTYPE get_doc_id(const void *datapoint) = 0;
+
+ virtual void set_doc_id(void *datapoint, DOCIDTYPE doc_id) = 0;
+};
+
+
+template<typename DOCIDTYPE>
+class MultiVectorL2Space : public BaseMultiVectorSpace<DOCIDTYPE> {
+ DISTFUNC<float> fstdistfunc_;
+ size_t data_size_;
+ size_t vector_size_;
+ size_t dim_;
+
+ public:
+ MultiVectorL2Space(size_t dim) {
+ fstdistfunc_ = L2Sqr;
+#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512)
+ #if defined(USE_AVX512)
+ if (AVX512Capable())
+ L2SqrSIMD16Ext = L2SqrSIMD16ExtAVX512;
+ else if (AVXCapable())
+ L2SqrSIMD16Ext = L2SqrSIMD16ExtAVX;
+ #elif defined(USE_AVX)
+ if (AVXCapable())
+ L2SqrSIMD16Ext = L2SqrSIMD16ExtAVX;
+ #endif
+
+ if (dim % 16 == 0)
+ fstdistfunc_ = L2SqrSIMD16Ext;
+ else if (dim % 4 == 0)
+ fstdistfunc_ = L2SqrSIMD4Ext;
+ else if (dim > 16)
+ fstdistfunc_ = L2SqrSIMD16ExtResiduals;
+ else if (dim > 4)
+ fstdistfunc_ = L2SqrSIMD4ExtResiduals;
+#endif
+ dim_ = dim;
+ vector_size_ = dim * sizeof(float);
+ data_size_ = vector_size_ + sizeof(DOCIDTYPE);
+ }
+
+ size_t get_data_size() override {
+ return data_size_;
+ }
+
+ DISTFUNC<float> get_dist_func() override {
+ return fstdistfunc_;
+ }
+
+ void *get_dist_func_param() override {
+ return &dim_;
+ }
+
+ DOCIDTYPE get_doc_id(const void *datapoint) override {
+ return *(DOCIDTYPE *)((char *)datapoint + vector_size_);
+ }
+
+ void set_doc_id(void *datapoint, DOCIDTYPE doc_id) override {
+ *(DOCIDTYPE*)((char *)datapoint + vector_size_) = doc_id;
+ }
+
+ ~MultiVectorL2Space() {}
+};
+
+
+template<typename DOCIDTYPE>
+class MultiVectorInnerProductSpace : public BaseMultiVectorSpace<DOCIDTYPE> {
+ DISTFUNC<float> fstdistfunc_;
+ size_t data_size_;
+ size_t vector_size_;
+ size_t dim_;
+
+ public:
+ MultiVectorInnerProductSpace(size_t dim) {
+ fstdistfunc_ = InnerProductDistance;
+#if defined(USE_AVX) || defined(USE_SSE) || defined(USE_AVX512)
+ #if defined(USE_AVX512)
+ if (AVX512Capable()) {
+ InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX512;
+ InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX512;
+ } else if (AVXCapable()) {
+ InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX;
+ InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX;
+ }
+ #elif defined(USE_AVX)
+ if (AVXCapable()) {
+ InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX;
+ InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX;
+ }
+ #endif
+ #if defined(USE_AVX)
+ if (AVXCapable()) {
+ InnerProductSIMD4Ext = InnerProductSIMD4ExtAVX;
+ InnerProductDistanceSIMD4Ext = InnerProductDistanceSIMD4ExtAVX;
+ }
+ #endif
+
+ if (dim % 16 == 0)
+ fstdistfunc_ = InnerProductDistanceSIMD16Ext;
+ else if (dim % 4 == 0)
+ fstdistfunc_ = InnerProductDistanceSIMD4Ext;
+ else if (dim > 16)
+ fstdistfunc_ = InnerProductDistanceSIMD16ExtResiduals;
+ else if (dim > 4)
+ fstdistfunc_ = InnerProductDistanceSIMD4ExtResiduals;
+#endif
+ vector_size_ = dim * sizeof(float);
+ data_size_ = vector_size_ + sizeof(DOCIDTYPE);
+ }
+
+ size_t get_data_size() override {
+ return data_size_;
+ }
+
+ DISTFUNC<float> get_dist_func() override {
+ return fstdistfunc_;
+ }
+
+ void *get_dist_func_param() override {
+ return &dim_;
+ }
+
+ DOCIDTYPE get_doc_id(const void *datapoint) override {
+ return *(DOCIDTYPE *)((char *)datapoint + vector_size_);
+ }
+
+ void set_doc_id(void *datapoint, DOCIDTYPE doc_id) override {
+ *(DOCIDTYPE*)((char *)datapoint + vector_size_) = doc_id;
+ }
+
+ ~MultiVectorInnerProductSpace() {}
+};
+
+
+template<typename DOCIDTYPE, typename dist_t>
+class MultiVectorSearchStopCondition : public BaseSearchStopCondition<dist_t> {
+ size_t curr_num_docs_;
+ size_t num_docs_to_search_;
+ size_t ef_collection_;
+ std::unordered_map<DOCIDTYPE, size_t> doc_counter_;
+ std::priority_queue<std::pair<dist_t, DOCIDTYPE>> search_results_;
+ BaseMultiVectorSpace<DOCIDTYPE>& space_;
+
+ public:
+ MultiVectorSearchStopCondition(
+ BaseMultiVectorSpace<DOCIDTYPE>& space,
+ size_t num_docs_to_search,
+ size_t ef_collection = 10)
+ : space_(space) {
+ curr_num_docs_ = 0;
+ num_docs_to_search_ = num_docs_to_search;
+ ef_collection_ = std::max(ef_collection, num_docs_to_search);
+ }
+
+ void add_point_to_result(labeltype label, const void *datapoint, dist_t dist) override {
+ DOCIDTYPE doc_id = space_.get_doc_id(datapoint);
+ if (doc_counter_[doc_id] == 0) {
+ curr_num_docs_ += 1;
+ }
+ search_results_.emplace(dist, doc_id);
+ doc_counter_[doc_id] += 1;
+ }
+
+ void remove_point_from_result(labeltype label, const void *datapoint, dist_t dist) override {
+ DOCIDTYPE doc_id = space_.get_doc_id(datapoint);
+ doc_counter_[doc_id] -= 1;
+ if (doc_counter_[doc_id] == 0) {
+ curr_num_docs_ -= 1;
+ }
+ search_results_.pop();
+ }
+
+ bool should_stop_search(dist_t candidate_dist, dist_t lowerBound) override {
+ bool stop_search = candidate_dist > lowerBound && curr_num_docs_ == ef_collection_;
+ return stop_search;
+ }
+
+ bool should_consider_candidate(dist_t candidate_dist, dist_t lowerBound) override {
+ bool flag_consider_candidate = curr_num_docs_ < ef_collection_ || lowerBound > candidate_dist;
+ return flag_consider_candidate;
+ }
+
+ bool should_remove_extra() override {
+ bool flag_remove_extra = curr_num_docs_ > ef_collection_;
+ return flag_remove_extra;
+ }
+
+ void filter_results(std::vector<std::pair<dist_t, labeltype >> &candidates) override {
+ while (curr_num_docs_ > num_docs_to_search_) {
+ dist_t dist_cand = candidates.back().first;
+ dist_t dist_res = search_results_.top().first;
+ assert(dist_cand == dist_res);
+ DOCIDTYPE doc_id = search_results_.top().second;
+ doc_counter_[doc_id] -= 1;
+ if (doc_counter_[doc_id] == 0) {
+ curr_num_docs_ -= 1;
+ }
+ search_results_.pop();
+ candidates.pop_back();
+ }
+ }
+
+ ~MultiVectorSearchStopCondition() {}
+};
+
+
+template<typename dist_t>
+class EpsilonSearchStopCondition : public BaseSearchStopCondition<dist_t> {
+ float epsilon_;
+ size_t min_num_candidates_;
+ size_t max_num_candidates_;
+ size_t curr_num_items_;
+
+ public:
+ EpsilonSearchStopCondition(float epsilon, size_t min_num_candidates, size_t max_num_candidates) {
+ assert(min_num_candidates <= max_num_candidates);
+ epsilon_ = epsilon;
+ min_num_candidates_ = min_num_candidates;
+ max_num_candidates_ = max_num_candidates;
+ curr_num_items_ = 0;
+ }
+
+ void add_point_to_result(labeltype label, const void *datapoint, dist_t dist) override {
+ curr_num_items_ += 1;
+ }
+
+ void remove_point_from_result(labeltype label, const void *datapoint, dist_t dist) override {
+ curr_num_items_ -= 1;
+ }
+
+ bool should_stop_search(dist_t candidate_dist, dist_t lowerBound) override {
+ if (candidate_dist > lowerBound && curr_num_items_ == max_num_candidates_) {
+ // new candidate can't improve found results
+ return true;
+ }
+ if (candidate_dist > epsilon_ && curr_num_items_ >= min_num_candidates_) {
+ // new candidate is out of epsilon region and
+ // minimum number of candidates is checked
+ return true;
+ }
+ return false;
+ }
+
+ bool should_consider_candidate(dist_t candidate_dist, dist_t lowerBound) override {
+ bool flag_consider_candidate = curr_num_items_ < max_num_candidates_ || lowerBound > candidate_dist;
+ return flag_consider_candidate;
+ }
+
+ bool should_remove_extra() {
+ bool flag_remove_extra = curr_num_items_ > max_num_candidates_;
+ return flag_remove_extra;
+ }
+
+ void filter_results(std::vector<std::pair<dist_t, labeltype >> &candidates) override {
+ while (!candidates.empty() && candidates.back().first > epsilon_) {
+ candidates.pop_back();
+ }
+ while (candidates.size() > max_num_candidates_) {
+ candidates.pop_back();
+ }
+ }
+
+ ~EpsilonSearchStopCondition() {}
+};
+} // namespace hnswlib
=====================================
python_bindings/bindings.cpp
=====================================
@@ -218,6 +218,9 @@ class Index {
this->num_threads_default = num_threads;
}
+ size_t indexFileSize() const {
+ return appr_alg->indexFileSize();
+ }
void saveIndex(const std::string &path_to_index) {
appr_alg->saveIndex(path_to_index);
@@ -301,7 +304,11 @@ class Index {
}
- std::vector<std::vector<data_t>> getDataReturnList(py::object ids_ = py::none()) {
+ py::object getData(py::object ids_ = py::none(), std::string return_type = "numpy") {
+ std::vector<std::string> return_types{"numpy", "list"};
+ if (std::find(std::begin(return_types), std::end(return_types), return_type) == std::end(return_types)) {
+ throw std::invalid_argument("return_type should be \"numpy\" or \"list\"");
+ }
std::vector<size_t> ids;
if (!ids_.is_none()) {
py::array_t < size_t, py::array::c_style | py::array::forcecast > items(ids_);
@@ -322,7 +329,12 @@ class Index {
for (auto id : ids) {
data.push_back(appr_alg->template getDataByLabel<data_t>(id));
}
- return data;
+ if (return_type == "list") {
+ return py::cast(data);
+ }
+ if (return_type == "numpy") {
+ return py::array_t< data_t, py::array::c_style | py::array::forcecast >(py::cast(data));
+ }
}
@@ -633,7 +645,7 @@ class Index {
(void*)items.data(row), k, p_idFilter);
if (result.size() != k)
throw std::runtime_error(
- "Cannot return the results in a contigious 2D array. Probably ef or M is too small");
+ "Cannot return the results in a contiguous 2D array. Probably ef or M is too small");
for (int i = k - 1; i >= 0; i--) {
auto& result_tuple = result.top();
data_numpy_d[row * k + i] = result_tuple.first;
@@ -653,7 +665,7 @@ class Index {
(void*)(norm_array.data() + start_idx), k, p_idFilter);
if (result.size() != k)
throw std::runtime_error(
- "Cannot return the results in a contigious 2D array. Probably ef or M is too small");
+ "Cannot return the results in a contiguous 2D array. Probably ef or M is too small");
for (int i = k - 1; i >= 0; i--) {
auto& result_tuple = result.top();
data_numpy_d[row * k + i] = result_tuple.first;
@@ -719,6 +731,7 @@ class BFIndex {
int dim;
bool index_inited;
bool normalize;
+ int num_threads_default;
hnswlib::labeltype cur_l;
hnswlib::BruteforceSearch<dist_t>* alg;
@@ -739,6 +752,8 @@ class BFIndex {
}
alg = NULL;
index_inited = false;
+
+ num_threads_default = std::thread::hardware_concurrency();
}
@@ -749,6 +764,21 @@ class BFIndex {
}
+ size_t getMaxElements() const {
+ return alg->maxelements_;
+ }
+
+
+ size_t getCurrentCount() const {
+ return alg->cur_element_count;
+ }
+
+
+ void set_num_threads(int num_threads) {
+ this->num_threads_default = num_threads;
+ }
+
+
void init_new_index(const size_t maxElements) {
if (alg) {
throw std::runtime_error("The index is already initiated.");
@@ -820,15 +850,19 @@ class BFIndex {
py::object knnQuery_return_numpy(
py::object input,
size_t k = 1,
+ int num_threads = -1,
const std::function<bool(hnswlib::labeltype)>& filter = nullptr) {
py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input);
auto buffer = items.request();
hnswlib::labeltype *data_numpy_l;
dist_t *data_numpy_d;
size_t rows, features;
+
+ if (num_threads <= 0)
+ num_threads = num_threads_default;
+
{
py::gil_scoped_release l;
-
get_input_array_shapes(buffer, &rows, &features);
data_numpy_l = new hnswlib::labeltype[rows * k];
@@ -837,16 +871,16 @@ class BFIndex {
CustomFilterFunctor idFilter(filter);
CustomFilterFunctor* p_idFilter = filter ? &idFilter : nullptr;
- for (size_t row = 0; row < rows; row++) {
+ ParallelFor(0, rows, num_threads, [&](size_t row, size_t threadId) {
std::priority_queue<std::pair<dist_t, hnswlib::labeltype >> result = alg->searchKnn(
- (void *) items.data(row), k, p_idFilter);
+ (void*)items.data(row), k, p_idFilter);
for (int i = k - 1; i >= 0; i--) {
- auto &result_tuple = result.top();
+ auto& result_tuple = result.top();
data_numpy_d[row * k + i] = result_tuple.first;
data_numpy_l[row * k + i] = result_tuple.second;
result.pop();
}
- }
+ });
}
py::capsule free_when_done_l(data_numpy_l, [](void *f) {
@@ -900,10 +934,11 @@ PYBIND11_PLUGIN(hnswlib) {
py::arg("ids") = py::none(),
py::arg("num_threads") = -1,
py::arg("replace_deleted") = false)
- .def("get_items", &Index<float, float>::getDataReturnList, py::arg("ids") = py::none())
+ .def("get_items", &Index<float>::getData, py::arg("ids") = py::none(), py::arg("return_type") = "numpy")
.def("get_ids_list", &Index<float>::getIdsList)
.def("set_ef", &Index<float>::set_ef, py::arg("ef"))
.def("set_num_threads", &Index<float>::set_num_threads, py::arg("num_threads"))
+ .def("index_file_size", &Index<float>::indexFileSize)
.def("save_index", &Index<float>::saveIndex, py::arg("path_to_index"))
.def("load_index",
&Index<float>::loadIndex,
@@ -957,13 +992,22 @@ PYBIND11_PLUGIN(hnswlib) {
py::class_<BFIndex<float>>(m, "BFIndex")
.def(py::init<const std::string &, const int>(), py::arg("space"), py::arg("dim"))
.def("init_index", &BFIndex<float>::init_new_index, py::arg("max_elements"))
- .def("knn_query", &BFIndex<float>::knnQuery_return_numpy, py::arg("data"), py::arg("k") = 1, py::arg("filter") = py::none())
+ .def("knn_query",
+ &BFIndex<float>::knnQuery_return_numpy,
+ py::arg("data"),
+ py::arg("k") = 1,
+ py::arg("num_threads") = -1,
+ py::arg("filter") = py::none())
.def("add_items", &BFIndex<float>::addItems, py::arg("data"), py::arg("ids") = py::none())
.def("delete_vector", &BFIndex<float>::deleteVector, py::arg("label"))
+ .def("set_num_threads", &BFIndex<float>::set_num_threads, py::arg("num_threads"))
.def("save_index", &BFIndex<float>::saveIndex, py::arg("path_to_index"))
.def("load_index", &BFIndex<float>::loadIndex, py::arg("path_to_index"), py::arg("max_elements") = 0)
.def("__repr__", [](const BFIndex<float> &a) {
return "<hnswlib.BFIndex(space='" + a.space_name + "', dim="+std::to_string(a.dim)+")>";
- });
+ })
+ .def("get_max_elements", &BFIndex<float>::getMaxElements)
+ .def("get_current_count", &BFIndex<float>::getCurrentCount)
+ .def_readwrite("num_threads", &BFIndex<float>::num_threads_default);
return m.ptr();
}
=====================================
python_bindings/tests/bindings_test_bf_index.py
=====================================
@@ -0,0 +1,49 @@
+import unittest
+
+import numpy as np
+
+import hnswlib
+
+
+class RandomSelfTestCase(unittest.TestCase):
+ def testBFIndex(self):
+
+ dim = 16
+ num_elements = 10000
+ num_queries = 1000
+ k = 20
+
+ # Generating sample data
+ data = np.float32(np.random.random((num_elements, dim)))
+
+ # Declaring index
+ bf_index = hnswlib.BFIndex(space='l2', dim=dim) # possible options are l2, cosine or ip
+ bf_index.init_index(max_elements=num_elements)
+
+ num_threads = 8
+ bf_index.set_num_threads(num_threads) # by default using all available cores
+
+ print(f"Adding all elements {num_elements}")
+ bf_index.add_items(data)
+
+ self.assertEqual(bf_index.num_threads, num_threads)
+ self.assertEqual(bf_index.get_max_elements(), num_elements)
+ self.assertEqual(bf_index.get_current_count(), num_elements)
+
+ queries = np.float32(np.random.random((num_queries, dim)))
+ print("Searching nearest neighbours")
+ labels, distances = bf_index.knn_query(queries, k=k)
+
+ print("Checking results")
+ for i in range(num_queries):
+ query = queries[i]
+ sq_dists = (data - query)**2
+ dists = np.sum(sq_dists, axis=1)
+ labels_gt = np.argsort(dists)[:k]
+ dists_gt = dists[labels_gt]
+ dists_bf = distances[i]
+ # We could compare labels directly, but numeric differences between the C++ and numpy
+ # distance calculations can change the order of labels, therefore we compare distances instead
+ max_diff_with_gt = np.max(np.abs(dists_gt - dists_bf))
+
+ self.assertTrue(max_diff_with_gt < 1e-5)
=====================================
setup.py
=====================================
@@ -8,7 +8,7 @@ import setuptools
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext
-__version__ = '0.7.0'
+__version__ = '0.8.0'
include_dirs = [
@@ -73,22 +73,20 @@ def cpp_flag(compiler):
class BuildExt(build_ext):
"""A custom build extension for adding compiler-specific options."""
+ compiler_flag_native = '-march=native'
c_opts = {
'msvc': ['/EHsc', '/openmp', '/O2'],
- #'unix': ['-O3', '-march=native'], # , '-w'
- 'unix': ['-O3'], # , '-w'
+ 'unix': ['-O3', compiler_flag_native], # , '-w'
}
- if not os.environ.get("HNSWLIB_NO_NATIVE"):
- c_opts['unix'].append('-march=native')
-
link_opts = {
'unix': [],
'msvc': [],
}
+ if os.environ.get("HNSWLIB_NO_NATIVE"):
+ c_opts['unix'].remove(compiler_flag_native)
+
if sys.platform == 'darwin':
- if platform.machine() == 'arm64':
- c_opts['unix'].remove('-march=native')
c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7']
link_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7']
else:
@@ -97,18 +95,35 @@ class BuildExt(build_ext):
def build_extensions(self):
ct = self.compiler.compiler_type
- opts = self.c_opts.get(ct, [])
+ opts = BuildExt.c_opts.get(ct, [])
if ct == 'unix':
opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version())
opts.append(cpp_flag(self.compiler))
if has_flag(self.compiler, '-fvisibility=hidden'):
opts.append('-fvisibility=hidden')
+ if not os.environ.get("HNSWLIB_NO_NATIVE"):
+ # check that native flag is available
+ print('checking avalability of flag:', BuildExt.compiler_flag_native)
+ if not has_flag(self.compiler, BuildExt.compiler_flag_native):
+ print('removing unsupported compiler flag:', BuildExt.compiler_flag_native)
+ opts.remove(BuildExt.compiler_flag_native)
+ # for macos add apple-m1 flag if it's available
+ if sys.platform == 'darwin':
+ m1_flag = '-mcpu=apple-m1'
+ print('checking avalability of flag:', m1_flag)
+ if has_flag(self.compiler, m1_flag):
+ print('adding flag:', m1_flag)
+ opts.append(m1_flag)
+ else:
+ print(f'flag: {m1_flag} is not available')
+ else:
+ print(f'flag: {BuildExt.compiler_flag_native} is available')
elif ct == 'msvc':
opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
for ext in self.extensions:
ext.extra_compile_args.extend(opts)
- ext.extra_link_args.extend(self.link_opts.get(ct, []))
+ ext.extra_link_args.extend(BuildExt.link_opts.get(ct, []))
build_ext.build_extensions(self)
=====================================
tests/cpp/epsilon_search_test.cpp
=====================================
@@ -0,0 +1,114 @@
+#include "assert.h"
+#include "../../hnswlib/hnswlib.h"
+
+typedef unsigned int docidtype;
+typedef float dist_t;
+
+int main() {
+ int dim = 16; // Dimension of the elements
+ int max_elements = 10000; // Maximum number of elements, should be known beforehand
+ int M = 16; // Tightly connected with internal dimensionality of the data
+ // strongly affects the memory consumption
+ int ef_construction = 200; // Controls index search speed/build speed tradeoff
+
+ int num_queries = 100;
+ float epsilon2 = 1.0; // Squared distance to query
+ int max_num_candidates = max_elements; // Upper bound on the number of returned elements in the epsilon region
+ int min_num_candidates = 2000; // Minimum number of candidates to search in the epsilon region
+ // this parameter is similar to ef
+
+ // Initializing index
+ hnswlib::L2Space space(dim);
+ hnswlib::BruteforceSearch<dist_t>* alg_brute = new hnswlib::BruteforceSearch<dist_t>(&space, max_elements);
+ hnswlib::HierarchicalNSW<dist_t>* alg_hnsw = new hnswlib::HierarchicalNSW<dist_t>(&space, max_elements, M, ef_construction);
+
+ // Generate random data
+ std::mt19937 rng;
+ rng.seed(47);
+ std::uniform_real_distribution<> distrib_real;
+
+ float* data = new float[dim * max_elements];
+ for (int i = 0; i < dim * max_elements; i++) {
+ data[i] = distrib_real(rng);
+ }
+
+ // Add data to index
+ std::cout << "Building index ...\n";
+ for (int i = 0; i < max_elements; i++) {
+ hnswlib::labeltype label = i;
+ float* point_data = data + i * dim;
+ alg_hnsw->addPoint(point_data, label);
+ alg_brute->addPoint(point_data, label);
+ }
+ std::cout << "Index is ready\n";
+
+ // Query random vectors
+ for (int i = 0; i < num_queries; i++) {
+ float* query_data = new float[dim];
+ for (int j = 0; j < dim; j++) {
+ query_data[j] = distrib_real(rng);
+ }
+ hnswlib::EpsilonSearchStopCondition<dist_t> stop_condition(epsilon2, min_num_candidates, max_num_candidates);
+ std::vector<std::pair<float, hnswlib::labeltype>> result_hnsw =
+ alg_hnsw->searchStopConditionClosest(query_data, stop_condition);
+
+ // check that returned results are in epsilon region
+ size_t num_vectors = result_hnsw.size();
+ std::unordered_set<hnswlib::labeltype> hnsw_labels;
+ for (auto pair: result_hnsw) {
+ float dist = pair.first;
+ hnswlib::labeltype label = pair.second;
+ hnsw_labels.insert(label);
+ assert(dist >=0 && dist <= epsilon2);
+ }
+ std::priority_queue<std::pair<float, hnswlib::labeltype>> result_brute =
+ alg_brute->searchKnn(query_data, max_elements);
+
+ // check recall
+ std::unordered_set<hnswlib::labeltype> gt_labels;
+ while (!result_brute.empty()) {
+ float dist = result_brute.top().first;
+ hnswlib::labeltype label = result_brute.top().second;
+ if (dist < epsilon2) {
+ gt_labels.insert(label);
+ }
+ result_brute.pop();
+ }
+ float correct = 0;
+ for (const auto& hnsw_label: hnsw_labels) {
+ if (gt_labels.find(hnsw_label) != gt_labels.end()) {
+ correct += 1;
+ }
+ }
+ if (gt_labels.size() == 0) {
+ assert(correct == 0);
+ continue;
+ }
+ float recall = correct / gt_labels.size();
+ assert(recall > 0.95);
+ delete[] query_data;
+ }
+ std::cout << "Recall is OK\n";
+
+ // Query the elements for themselves and check that query can be found
+ float epsilon2_small = 0.0001f;
+ int min_candidates_small = 500;
+ for (size_t i = 0; i < max_elements; i++) {
+ hnswlib::EpsilonSearchStopCondition<dist_t> stop_condition(epsilon2_small, min_candidates_small, max_num_candidates);
+ std::vector<std::pair<float, hnswlib::labeltype>> result =
+ alg_hnsw->searchStopConditionClosest(alg_hnsw->getDataByInternalId(i), stop_condition);
+ size_t num_vectors = result.size();
+ // get closest distance
+ float dist = -1;
+ if (!result.empty()) {
+ dist = result[0].first;
+ }
+ assert(dist == 0);
+ }
+ std::cout << "Small epsilon search is OK\n";
+
+ delete[] data;
+ delete alg_brute;
+ delete alg_hnsw;
+ return 0;
+}
=====================================
tests/cpp/multivector_search_test.cpp
=====================================
@@ -0,0 +1,126 @@
+#include <assert.h>
+#include "../../hnswlib/hnswlib.h"
+
+typedef unsigned int docidtype;
+typedef float dist_t;
+
+int main() {
+ int dim = 16; // Dimension of the elements
+ int max_elements = 1000; // Maximum number of elements, should be known beforehand
+ int M = 16; // Tightly connected with internal dimensionality of the data
+ // strongly affects the memory consumption
+ int ef_construction = 200; // Controls index search speed/build speed tradeoff
+
+ int num_queries = 100;
+ int num_docs = 10; // Number of documents to search
+ int ef_collection = 15; // Number of candidate documents during the search
+ // Controls the recall: higher ef leads to better accuracy, but slower search
+ docidtype min_doc_id = 0;
+ docidtype max_doc_id = 49;
+
+ // Initializing index
+ hnswlib::MultiVectorL2Space<docidtype> space(dim);
+ hnswlib::BruteforceSearch<dist_t>* alg_brute = new hnswlib::BruteforceSearch<dist_t>(&space, max_elements);
+ hnswlib::HierarchicalNSW<dist_t>* alg_hnsw = new hnswlib::HierarchicalNSW<dist_t>(&space, max_elements, M, ef_construction);
+
+ // Generate random data
+ std::mt19937 rng;
+ rng.seed(47);
+ std::uniform_real_distribution<> distrib_real;
+ std::uniform_int_distribution<docidtype> distrib_docid(min_doc_id, max_doc_id);
+
+ size_t data_point_size = space.get_data_size();
+ char* data = new char[data_point_size * max_elements];
+ for (int i = 0; i < max_elements; i++) {
+ // set vector value
+ char* point_data = data + i * data_point_size;
+ for (int j = 0; j < dim; j++) {
+ char* vec_data = point_data + j * sizeof(float);
+ float value = distrib_real(rng);
+ *(float*)vec_data = value;
+ }
+ // set document id
+ docidtype doc_id = distrib_docid(rng);
+ space.set_doc_id(point_data, doc_id);
+ }
+
+ // Add data to index
+ std::unordered_map<hnswlib::labeltype, docidtype> label_docid_lookup;
+ for (int i = 0; i < max_elements; i++) {
+ hnswlib::labeltype label = i;
+ char* point_data = data + i * data_point_size;
+ alg_hnsw->addPoint(point_data, label);
+ alg_brute->addPoint(point_data, label);
+ label_docid_lookup[label] = space.get_doc_id(point_data);
+ }
+
+ // Query random vectors and check overall recall
+ float correct = 0;
+ float total_num_elements = 0;
+ size_t query_size = dim * sizeof(float);
+ for (int i = 0; i < num_queries; i++) {
+ char* query_data = new char[query_size];
+ for (int j = 0; j < dim; j++) {
+ size_t offset = j * sizeof(float);
+ char* vec_data = query_data + offset;
+ float value = distrib_real(rng);
+ *(float*)vec_data = value;
+ }
+ hnswlib::MultiVectorSearchStopCondition<docidtype, dist_t> stop_condition(space, num_docs, ef_collection);
+ std::vector<std::pair<dist_t, hnswlib::labeltype>> hnsw_results =
+ alg_hnsw->searchStopConditionClosest(query_data, stop_condition);
+
+ // check number of found documents
+ std::unordered_set<docidtype> hnsw_docs;
+ std::unordered_set<hnswlib::labeltype> hnsw_labels;
+ for (auto pair: hnsw_results) {
+ hnswlib::labeltype label = pair.second;
+ hnsw_labels.emplace(label);
+ docidtype doc_id = label_docid_lookup[label];
+ hnsw_docs.emplace(doc_id);
+ }
+ assert(hnsw_docs.size() == num_docs);
+
+ // Check overall recall
+ std::vector<std::pair<dist_t, hnswlib::labeltype>> gt_results =
+ alg_brute->searchKnnCloserFirst(query_data, max_elements);
+ std::unordered_set<docidtype> gt_docs;
+ for (int i = 0; i < gt_results.size(); i++) {
+ if (gt_docs.size() == num_docs) {
+ break;
+ }
+ hnswlib::labeltype gt_label = gt_results[i].second;
+ if (hnsw_labels.find(gt_label) != hnsw_labels.end()) {
+ correct += 1;
+ }
+ docidtype gt_doc_id = label_docid_lookup[gt_label];
+ gt_docs.emplace(gt_doc_id);
+ total_num_elements += 1;
+ }
+ delete[] query_data;
+ }
+ float recall = correct / total_num_elements;
+ std::cout << "random elements search recall : " << recall << "\n";
+ assert(recall > 0.95);
+
+ // Query the elements for themselves and measure recall
+ correct = 0;
+ for (int i = 0; i < max_elements; i++) {
+ hnswlib::MultiVectorSearchStopCondition<docidtype, dist_t> stop_condition(space, num_docs, ef_collection);
+ std::vector<std::pair<float, hnswlib::labeltype>> result =
+ alg_hnsw->searchStopConditionClosest(data + i * data_point_size, stop_condition);
+ hnswlib::labeltype label = -1;
+ if (!result.empty()) {
+ label = result[0].second;
+ }
+ if (label == i) correct++;
+ }
+ recall = correct / max_elements;
+ std::cout << "same elements search recall : " << recall << "\n";
+ assert(recall > 0.99);
+
+ delete[] data;
+ delete alg_brute;
+ delete alg_hnsw;
+ return 0;
+}
=====================================
tests/cpp/sift_1b.cpp
=====================================
@@ -250,11 +250,11 @@ void sift_test1B() {
size_t vecdim = 128;
char path_index[1024];
char path_gt[1024];
- char *path_q = "../bigann/bigann_query.bvecs";
- char *path_data = "../bigann/bigann_base.bvecs";
- sprintf(path_index, "sift1b_%dm_ef_%d_M_%d.bin", subset_size_milllions, efConstruction, M);
+ const char *path_q = "../bigann/bigann_query.bvecs";
+ const char *path_data = "../bigann/bigann_base.bvecs";
+ snprintf(path_index, sizeof(path_index), "sift1b_%dm_ef_%d_M_%d.bin", subset_size_milllions, efConstruction, M);
- sprintf(path_gt, "../bigann/gnd/idx_%dM.ivecs", subset_size_milllions);
+ snprintf(path_gt, sizeof(path_gt), "../bigann/gnd/idx_%dM.ivecs", subset_size_milllions);
unsigned char *massb = new unsigned char[vecdim];
=====================================
tests/cpp/updates_test.cpp
=====================================
@@ -239,7 +239,7 @@ int main(int argc, char **argv) {
for (int b = 1; b < dummy_data_multiplier; b++) {
std::cout << "Update iteration " << b << "\n";
char cpath[1024];
- sprintf(cpath, "batch_dummy_%02d.bin", b);
+ snprintf(cpath, sizeof(cpath), "batch_dummy_%02d.bin", b);
std::vector<float> dummy_batchb = load_batch<float>(path + cpath, N * d);
ParallelFor(0, N, num_threads, [&](size_t i, size_t threadId) {
=====================================
tests/python/bindings_test_getdata.py
=====================================
@@ -45,5 +45,11 @@ class RandomSelfTestCase(unittest.TestCase):
self.assertRaises(ValueError, lambda: p.get_items(labels[0]))
# After adding them, all labels should be retrievable
- returned_items = p.get_items(labels)
- self.assertSequenceEqual(data.tolist(), returned_items)
+ returned_items_np = p.get_items(labels)
+ self.assertTrue((data == returned_items_np).all())
+
+ # check returned type of get_items
+ self.assertTrue(isinstance(returned_items_np, np.ndarray))
+ returned_items_list = p.get_items(labels, return_type="list")
+ self.assertTrue(isinstance(returned_items_list, list))
+ self.assertTrue(isinstance(returned_items_list[0], list))
=====================================
tests/python/bindings_test_replace.py
=====================================
@@ -94,10 +94,10 @@ class RandomSelfTestCase(unittest.TestCase):
remaining_data = comb_data[remaining_labels_list]
returned_items = hnsw_index.get_items(remaining_labels_list)
- self.assertSequenceEqual(remaining_data.tolist(), returned_items)
+ self.assertTrue((remaining_data == returned_items).all())
returned_items = hnsw_index.get_items(labels3_tr)
- self.assertSequenceEqual(data3_tr.tolist(), returned_items)
+ self.assertTrue((data3_tr == returned_items).all())
# Check index serialization
# Delete batch 3
=====================================
tests/python/draw_git_test_plots.py
=====================================
@@ -0,0 +1,48 @@
+import os
+import glob
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+def plot_data_from_file(file_path):
+ # Load the data, assuming the last column is text
+ data = pd.read_csv(file_path, header=None)
+ rep_size=len(set(data[data.columns[-1]]))
+ data.drop(data.columns[-1], axis=1, inplace=True) # Drop the last column (text)
+
+ # Number of numerical columns
+ num_columns = data.shape[1]
+
+ # Create a subplot for each column
+ fig, axes = plt.subplots(num_columns, 1, figsize=(10, 6 * num_columns))
+
+ # In case there is only one column, axes will not be an array, so we convert it
+ if num_columns == 1:
+ axes = [axes]
+
+ for i, ax in enumerate(axes):
+ idx=0
+ ax.scatter(np.asarray(data.index,dtype=np.int64)%rep_size, data[i], label=f'Column {i+1}')
+ ax.set_title(f'Column {i+1}')
+ ax.set_xlabel('ID Number')
+ ax.set_ylabel('Value')
+ ax.legend()
+ ax.grid(True)
+
+ plt.tight_layout()
+ plt.suptitle(f'Data from {os.path.basename(file_path)}')
+
+ # Save the plot to a file
+ plt.savefig(file_path.replace('.txt', '.png'))
+ plt.close()
+
+def scan_and_plot(directory):
+ # Scan for .txt files in the given directory
+ txt_files = glob.glob(os.path.join(directory, '*.txt'))
+
+ # Process each file
+ for file in txt_files:
+ print(f'Processing {file}...')
+ plot_data_from_file(file)
+ print(f'Plot saved for {file}')
+# Replace 'your_folder_path' with the path to the folder containing the .txt files
+scan_and_plot('./')
\ No newline at end of file
=====================================
tests/python/git_tester.py
=====================================
@@ -9,16 +9,18 @@ speedtest_src_path = os.path.join("tests", "python", "speedtest.py")
speedtest_copy_path = os.path.join("tests", "python", "speedtest2.py")
shutil.copyfile(speedtest_src_path, speedtest_copy_path) # the file has to be outside of git
-commits = list(Repository('.', from_tag="v0.6.2").traverse_commits())
+commits = list(Repository('.', from_tag="v0.7.0").traverse_commits())
print("Found commits:")
for idx, commit in enumerate(commits):
name = commit.msg.replace('\n', ' ').replace('\r', ' ')
print(idx, commit.hash, name)
for commit in commits:
- name = commit.msg.replace('\n', ' ').replace('\r', ' ').replace(",", ";")
+ commit_time = commit.author_date.strftime("%Y-%m-%d %H:%M:%S")
+ author_name = commit.author.name
+ name = "auth:"+author_name+"_"+commit_time+"_msg:"+commit.msg.replace('\n', ' ').replace('\r', ' ').replace(",", ";")
print("\nProcessing", commit.hash, name)
-
+
if os.path.exists("build"):
shutil.rmtree("build")
os.system(f"git checkout {commit.hash}")
@@ -43,10 +45,11 @@ for commit in commits:
print("build failed!!!!")
continue
- # os.system(f'python {speedtest_copy_path} -n "{hash[:4]}_{name}" -d 32 -t 1')
+
os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 16 -t 1')
os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 16 -t 64')
- # os.system(f'python {speedtest_copy_path} -n "{name}" -d 64 -t 1')
- # os.system(f'python {speedtest_copy_path} -n "{name}" -d 128 -t 1')
- # os.system(f'python {speedtest_copy_path} -n "{name}" -d 4 -t 24')
- # os.system(f'python {speedtest_copy_path} -n "{name}" -d 128 -t 24')
+ os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 4 -t 1')
+ os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 4 -t 64')
+ os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 128 -t 1')
+ os.system(f'python {speedtest_copy_path} -n "{commit.hash[:4]}_{name}" -d 128 -t 64')
+
View it on GitLab: https://salsa.debian.org/med-team/hnswlib/-/commit/519411fd2a7837c28e639b9faa415a188f208357
--
View it on GitLab: https://salsa.debian.org/med-team/hnswlib/-/commit/519411fd2a7837c28e639b9faa415a188f208357
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20231214/f6e4dd1c/attachment-0001.htm>
More information about the debian-med-commit
mailing list