-
Notifications
You must be signed in to change notification settings - Fork 655
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #348 from nmslib/develop
0.6.0 release
- Loading branch information
Showing
21 changed files
with
738 additions
and
184 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Continuous-integration workflow: installs the package with pip and runs the
# Python unit tests for every OS / Python version combination in the matrix.
name: HNSW CI

# Run on every push and on every pull request.
on:
  - push
  - pull_request

jobs:
  test:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os:
          - ubuntu-latest
          - windows-latest
        # Versions are quoted so that YAML keeps them as strings.
        python-version:
          - '3.6'
          - '3.7'
          - '3.8'
          - '3.9'
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      - name: Build and install
        run: python -m pip install .

      - name: Test
        run: python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,3 +6,6 @@ python_bindings/tests/__pycache__/ | |
*.pyd | ||
hnswlib.cpython*.so | ||
var/ | ||
.idea/ | ||
.vscode/ | ||
|
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,28 +1,27 @@ | ||
cmake_minimum_required (VERSION 2.6) | ||
project (hnsw_lib) | ||
project(hnsw_lib | ||
LANGUAGES CXX) | ||
|
||
include_directories("${PROJECT_BINARY_DIR}") | ||
add_library(hnswlib INTERFACE) | ||
target_include_directories(hnswlib INTERFACE .) | ||
|
||
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) | ||
set(CMAKE_CXX_STANDARD 11) | ||
|
||
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") | ||
SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize") | ||
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") | ||
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" ) | ||
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") | ||
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" ) | ||
endif() | ||
|
||
set(SOURCE_EXE main.cpp) | ||
add_executable(test_updates examples/updates_test.cpp) | ||
target_link_libraries(test_updates hnswlib) | ||
|
||
set(SOURCE_LIB sift_1b.cpp) | ||
add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp) | ||
target_link_libraries(searchKnnCloserFirst_test hnswlib) | ||
|
||
add_library(sift_test STATIC ${SOURCE_LIB}) | ||
|
||
|
||
add_executable(main ${SOURCE_EXE}) | ||
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") | ||
SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize") | ||
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") | ||
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" ) | ||
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") | ||
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" ) | ||
add_executable(main main.cpp sift_1b.cpp) | ||
target_link_libraries(main hnswlib) | ||
endif() | ||
|
||
add_executable(test_updates examples/updates_test.cpp) | ||
|
||
add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp) | ||
|
||
target_link_libraries(main sift_test) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
# Testing recall | ||
|
||
The choice of HNSW parameters for a specific use case strongly affects the search quality. One way to test the quality of the constructed index is to compare the HNSW search results to the actual results (i.e., the actual `k` nearest neighbors). | ||
For that purpose, the API enables creating a simple "brute-force" index in which vectors are stored as is, so searching for the `k` nearest neighbors of a query vector requires going over the entire index. | ||
Comparing the HNSW results with the brute-force results helps in finding HNSW parameters that achieve a satisfactory recall for the given index size and data dimension. | ||
|
||
### Brute force index API | ||
`hnswlib.BFIndex(space, dim)` creates a non-initialized index in space `space` with integer dimension `dim`. | ||
|
||
`hnswlib.BFIndex` methods: | ||
|
||
`init_index(max_elements)` initializes the index with no elements. | ||
|
||
`max_elements` defines the maximum number of elements that can be stored in the structure. | ||
|
||
`add_items(data, ids)` inserts the data (numpy array of vectors, shape:`N*dim`) into the structure. | ||
`ids` is an optional N-size numpy array of integer labels for all elements in `data`. | ||
|
||
`delete_vector(label)` deletes the element associated with the given `label` so it will be omitted from search results. | ||
|
||
`knn_query(data, k = 1)` makes a batch query for the `k` closest elements for each element of the | ||
`data` (shape:`N*dim`). Returns two numpy arrays, the labels and the distances (each of shape:`N*k`). | ||
|
||
`load_index(path_to_index, max_elements = 0)` loads the index from persistence to the uninitialized index. | ||
|
||
`save_index(path_to_index)` saves the index to persistence. | ||
|
||
### Measuring recall example | ||
|
||
``` | ||
import hnswlib
import numpy as np

# Example: measure the recall of an HNSW index by comparing its answers with
# the exact answers of a brute-force index built over the same data.
dim = 32
num_elements = 100000
k = 10
num_queries = 10

# Generating sample data
data = np.float32(np.random.random((num_elements, dim)))

# Declaring index
hnsw_index = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
bf_index = hnswlib.BFIndex(space='l2', dim=dim)

# Initing both hnsw and brute force indices
# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
# during insertion of an element.
# The capacity can be increased by saving/loading the index.
#
# hnsw construction params:
# ef_construction - controls index search speed/build speed tradeoff
#
# M - is tightly connected with internal dimensionality of the data. Strongly affects the memory consumption (~M)
# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction
hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16)
bf_index.init_index(max_elements=num_elements)

# Controlling the recall for hnsw by setting ef:
# higher ef leads to better accuracy, but slower search
hnsw_index.set_ef(200)

# Set number of threads used during batch search/construction in hnsw
# By default using all available cores
hnsw_index.set_num_threads(1)

print("Adding batch of %d elements" % (len(data)))
hnsw_index.add_items(data)
bf_index.add_items(data)
print("Indices built")

# Generating query data
query_data = np.float32(np.random.random((num_queries, dim)))

# Query the elements and measure recall:
labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k)
labels_bf, distances_bf = bf_index.knn_query(query_data, k)

# Measure recall: count how many labels returned by HNSW also appear in the
# brute-force (exact) result for the same query.
correct = 0
for i in range(num_queries):
    exact_labels = set(labels_bf[i])
    correct += sum(1 for label in labels_hnsw[i] if label in exact_labels)

print("recall is :", float(correct)/(k*num_queries))
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
"""Benchmark every commit reachable from tag v0.5.2.

For each commit the script checks it out, removes the old build directory,
reinstalls the package with pip and runs the speed test for several
(dimension, thread count) configurations, labelling each run with the
commit message.
"""
from pydriller import Repository
import os
import datetime
import shutil
import subprocess

# Run the benchmark from a copy of the script.
# NOTE(review): presumably so the same benchmark code is used no matter which
# commit is checked out - confirm.
shutil.copy("examples/speedtest.py", "examples/speedtest2.py")

# (vector dimension, number of query threads) pairs benchmarked per commit.
BENCHMARK_CONFIGS = [(4, 1), (64, 1), (128, 1), (4, 24), (128, 24)]

for commit in Repository('.', from_tag="v0.5.2").traverse_commits():
    print(commit.hash)
    print(commit.msg)

    # Argument lists (no shell) are used throughout, so a commit message that
    # contains quotes, `$`, backticks or newlines is passed through verbatim
    # instead of being interpreted by the shell.
    # Failures are not fatal (no check=True), matching a best-effort sweep:
    # a commit that does not build is simply skipped over.
    subprocess.run(["git", "checkout", commit.hash])
    shutil.rmtree("build", ignore_errors=True)
    subprocess.run(["python", "-m", "pip", "install", "."])

    for dim, threads in BENCHMARK_CONFIGS:
        subprocess.run([
            "python", "examples/speedtest2.py",
            "-n", commit.msg,
            "-d", str(dim),
            "-t", str(threads),
        ])
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import hnswlib | ||
import numpy as np | ||
import os.path | ||
import time | ||
import argparse | ||
|
||
# Command-line options:
#   -d  dimension of the generated vectors (required)
#   -n  free-form label appended to the logged result line
#   -t  number of threads used for the timed queries (required)
# -d and -t are validated and converted by argparse, so a missing or
# non-numeric value produces a usage error instead of a TypeError on int(None).
ap = argparse.ArgumentParser()
ap.add_argument('-d', type=int, required=True)
ap.add_argument('-n')
ap.add_argument('-t', type=int, required=True)
args = ap.parse_args()
dim = args.d
name = args.n
threads = args.t
# The element count shrinks as the dimension grows (4,000,000 // dim).
num_elements = 1000000 * 4 // dim

# Generating sample data (fixed seed, so every run sees the same vectors)
np.random.seed(1)
data = np.float32(np.random.random((num_elements, dim)))

index_path = f'speed_index{dim}.bin'
# Declaring index
p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip

# Build and save the index only when no saved index exists for this dimension.
if not os.path.isfile(index_path):

    p.init_index(max_elements=num_elements, ef_construction=100, M=16)

    # Controlling the recall by setting ef:
    # higher ef leads to better accuracy, but slower search
    p.set_ef(10)

    # Set number of threads used during batch search/construction
    # By default using all available cores
    p.set_num_threads(12)

    p.add_items(data)

    # Serializing the index:
    print("Saving index to '%s'" % index_path)
    p.save_index(index_path)

p.set_num_threads(threads)
times = []
# NOTE(review): presumably a pause to let the machine settle before timing - confirm.
time.sleep(10)
# NOTE(review): ef is set before load_index is called below; confirm that the
# setting survives reloading the index.
p.set_ef(100)
for _ in range(3):
    p.load_index(index_path)
    for _ in range(10):
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        t0 = time.perf_counter()
        labels, distances = p.knn_query(data, k=1)
        tt = time.perf_counter() - t0
        times.append(tt)
        print(f"{tt} seconds")

str_out = f"mean time:{np.mean(times)}, median time:{np.median(times)}, std time {np.std(times)} {name}"
print(str_out)
# Append the summary line to a per-configuration log file.
with open(f"log_{dim}_t{threads}.txt", "a") as f:
    f.write(str_out + "\n")
|
Oops, something went wrong.