Skip to content

Commit

Permalink
Merge pull request #197 from AleksandrKent/160-feature-c-bindings-and…
Browse files Browse the repository at this point in the history
…-semantic-kernel-integration

Refactor of C# Bindings
  • Loading branch information
ashvardanian authored Aug 14, 2023
2 parents f569c3b + 4f76d17 commit 166beda
Show file tree
Hide file tree
Showing 61 changed files with 3,303 additions and 829 deletions.
323 changes: 323 additions & 0 deletions .editorconfig

Large diffs are not rendered by default.

49 changes: 49 additions & 0 deletions .github/workflows/csharp-build-and-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# This workflow builds the USearch native libraries, then builds and tests the C# USearch wrapper.
# Currently, for development purposes, the `if` condition specified below restricts it to the fork only.

name: Test C# wrapper

# Triggered only by pushes to the feature branch.
on:
  push:
    branches:
      - "160-feature-c-bindings-and-semantic-kernel-integration"

# Names of the per-platform scripts that the steps below run from the `csharp` directory.
env:
  LINUX_OSX_SCRIPT: build_and_test.sh
  WINDOWS_SCRIPT: build_and_test.cmd

jobs:
  matrix-test:
    # Development-time guard: the job is skipped unless it runs in this repository.
    if: github.repository == 'AleksandrKent/usearch'
    runs-on: ${{ matrix.os }}
    strategy:
      # Let the remaining operating systems finish even if one of them fails.
      fail-fast: false
      matrix:
        os:
          - ubuntu-latest
          - windows-latest
          - macos-latest
    steps:
      # Check out the feature branch, then fetch all nested submodules.
      - uses: actions/checkout@v3
        with:
          ref: "160-feature-c-bindings-and-semantic-kernel-integration"
      - shell: bash
        run: |
          git submodule update --init --recursive
      - name: Setup .NET
        uses: actions/setup-dotnet@v3
        with:
          dotnet-version: 6.0.x

      # Ubuntu and macOS steps
      - name: Run ${{ env.LINUX_OSX_SCRIPT }} on Ubuntu or macOS
        if: matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest'
        shell: bash
        run: |
          cd ./csharp
          "./${{ env.LINUX_OSX_SCRIPT }}"

      # Windows step
      - name: Run ${{ env.WINDOWS_SCRIPT }} on Windows
        if: matrix.os == 'windows-latest'
        shell: cmd
        run: |
          cd .\csharp
          ${{ env.WINDOWS_SCRIPT }}
18 changes: 17 additions & 1 deletion .github/workflows/prerelease.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,22 @@ jobs:
toolchain: stable
override: true

test_golang:
  name: Test GoLang
  runs-on: ubuntu-latest
  steps:
    # Fetch the sources, then pull every nested submodule.
    - uses: actions/checkout@v3
    - run: git submodule update --init --recursive
    - name: Set up Go
      uses: actions/setup-go@v4
      with:
        go-version: '1.15'
    # Builds the shared C library, moves it into the `golang` directory,
    # lists that directory and runs the Go tests from there.
    - name: Build C library for cGo
      run: |
        make -C ./c libusearch_c.so
        mv ./c/libusearch_c.so ./golang/libusearch_c.so
        cd golang && ls && go test -v
test_java:
name: Test Java
runs-on: ubuntu-latest
Expand Down Expand Up @@ -160,7 +176,7 @@ jobs:
sudo apt install -y nodejs
git clone https://github.com/emscripten-core/emsdk.git
./emsdk/emsdk install latest
- name: Build USearch by Emscripten
- name: Build USearch using Emscripten
run: |
./emsdk/emsdk activate latest && source ./emsdk/emsdk_env.sh
emcmake cmake -DUSEARCH_BUILD_BENCHMARK=0 -DUSEARCH_BUILD_WASM=1 -B ./build -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -s TOTAL_MEMORY=64MB" && emmake make -C ./build
Expand Down
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ __pycache__
dist/
wheelhouse/

# Python Environments
venv/

# JS wrappers
package-lock.json
node_modules/
Expand All @@ -39,14 +42,18 @@ node_modules/
Cargo.lock
target/

# Java binings
# Java bindings
.gradle
*.class

# ObjC and Swift
.build
.swiftpm

# C# builds
[Bb]in/
[Oo]bj/

# Prerequisites
*.d

Expand Down Expand Up @@ -82,3 +89,4 @@ target/
*.exe
*.out
*.app

8 changes: 8 additions & 0 deletions .releaserc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
{
"preset": "eslint",
"releaseRules": [
{
"tag": "Break",
"release": "major"
},
{
"tag": "Add",
"release": "minor"
Expand All @@ -35,6 +39,10 @@
{
"preset": "eslint",
"releaseRules": [
{
"tag": "Break",
"release": "major"
},
{
"tag": "Add",
"release": "minor"
Expand Down
8 changes: 7 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -157,5 +157,11 @@
"reportMissingImports": "none"
},
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
"python.testing.pytestEnabled": true,
"[go]": {
"editor.insertSpaces": true,
"editor.formatOnSave": true,
"editor.defaultFormatter": "golang.go"
},
"dotnet.defaultSolution": "csharp/libusearch-wrapper.sln",
}
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This `usearch` CMake file is heavily inspired by the following CMake file:
# https://github.com/nlohmann/json/blob/develop/CMakeLists.txt
cmake_minimum_required(VERSION 3.1)
project(usearch VERSION 0.6.0 LANGUAGES CXX)
project(usearch VERSION 0.6.0 LANGUAGES C CXX)

# Determine if USearch is built as a subproject (using `add_subdirectory`) or if it is the main project
set(USEARCH_IS_MAIN_PROJECT OFF)
Expand All @@ -14,6 +14,7 @@ option(USEARCH_INSTALL "Install CMake targets" OFF)
option(USEARCH_BUILD_TEST "Compile a native unit test in C++" ${USEARCH_IS_MAIN_PROJECT})
option(USEARCH_BUILD_BENCHMARK "Compile a native benchmark in C++" ${USEARCH_IS_MAIN_PROJECT})
option(USEARCH_BUILD_WOLFRAM "Compile Wolfram Language bindings" OFF)
option(USEARCH_BUILD_CLIB "Compile a native library for the C 99 interface" OFF)

# Includes
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
Expand Down Expand Up @@ -122,3 +123,7 @@ endif()
if(${USEARCH_BUILD_WASM})
add_subdirectory(wasm)
endif()

# Add the `c` subdirectory to the build when USEARCH_BUILD_CLIB is enabled.
if(${USEARCH_BUILD_CLIB})
    add_subdirectory(c)
endif()
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "usearch"
version = "0.22.3"
version = "1.1.0"
authors = ["Ash Vardanian <[email protected]>"]
description = "Smaller & Faster Single-File Vector Search Engine from Unum"
edition = "2021"
Expand Down
63 changes: 51 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ Vector Search Engine<br/>
<p align="center">
Euclidean • Angular • Jaccard • Hamming • Haversine • User-Defined Metrics
<br/>
<a href="https://unum-cloud.github.io/usearch/cpp">C++11</a> •
<a href="https://unum-cloud.github.io/usearch/python">Python</a> •
<a href="https://unum-cloud.github.io/usearch/cpp">C++ 11</a> •
<a href="https://unum-cloud.github.io/usearch/python">Python 3</a> •
<a href="https://unum-cloud.github.io/usearch/javascript">JavaScript</a> •
<a href="https://unum-cloud.github.io/usearch/java">Java</a> •
<a href="https://unum-cloud.github.io/usearch/rust">Rust</a> •
<a href="https://unum-cloud.github.io/usearch/c">C99</a> •
<a href="https://unum-cloud.github.io/usearch/c">C 99</a> •
<a href="https://unum-cloud.github.io/usearch/objective-c">Objective-C</a> •
<a href="https://unum-cloud.github.io/usearch/swift">Swift</a> •
<a href="https://unum-cloud.github.io/usearch/golang">GoLang</a> •
Expand All @@ -41,7 +41,7 @@ Linux • MacOS • Windows • Docker • WebAssembly
- ✅ SIMD-optimized and [user-defined metrics](#user-defined-functions) with JIT compilation.
- ✅ Variable dimensionality vectors for unique applications, including search over compressed data.
- ✅ Bitwise Tanimoto and Sorensen coefficients for [Genomics and Chemistry applications](#usearch--rdkit--molecular-search).
- ✅ Hardware-agnostic `f16` & `f8` - [half-precision & quarter-precision support](#memory-efficiency-downcasting-and-quantization).
- ✅ Hardware-agnostic `f16` & `i8` - [half-precision & quarter-precision support](#memory-efficiency-downcasting-and-quantization).
- ✅ [View large indexes from disk](#disk-based-indexes) without loading into RAM.
- ✅ Space-efficient point-clouds with `uint40_t`, accommodating 4B+ size.
- ✅ Compatible with OpenMP and custom "executors", for fine-grained control over CPU utilization.
Expand Down Expand Up @@ -79,7 +79,7 @@ from usearch.index import Index
index = Index(
ndim=3, # Define the number of dimensions in input vectors
metric='cos', # Choose 'l2sq', 'haversine' or other metric, default = 'ip'
dtype='f32', # Quantize to 'f16' or 'f8' if needed, default = 'f32'
dtype='f32', # Quantize to 'f16' or 'i8' if needed, default = 'f32'
connectivity=16, # Optional: How frequent should the connections in the graph be
expansion_add=128, # Optional: Control the recall of indexing
expansion_search=64, # Optional: Control the quality of search
Expand Down Expand Up @@ -121,10 +121,10 @@ Those, however, are only sometimes reliable, can significantly affect the statis
![USearch uint40_t support](https://github.com/unum-cloud/usearch/blob/main/assets/usearch-neighbor-types.png?raw=true)

Instead, we have focused on high-precision arithmetic over low-precision downcasted vectors.
The same index, and `add` and `search` operations will automatically down-cast or up-cast between `f32_t`, `f16_t`, `f64_t`, and `f8_t` representations, even if the hardware doesn't natively support it.
The same index, and `add` and `search` operations will automatically down-cast or up-cast between `f32_t`, `f16_t`, `f64_t`, and `i8_t` representations, even if the hardware doesn't natively support it.
Continuing the topic of memory efficiency, we provide a `uint40_t` to allow collection with over 4B+ vectors without allocating 8 bytes for every neighbor reference in the proximity graph.

| | FAISS, `f32` | USearch, `f32` | USearch, `f16` | USearch, `f8` |
| | FAISS, `f32` | USearch, `f32` | USearch, `f16` | USearch, `i8` |
| :----------- | -----------: | -------------: | -------------: | ----------------: |
| Batch Insert | 16 K/s | 73 K/s | 100 K/s | 104 K/s **+550%** |
| Batch Search | 82 K/s | 103 K/s | 113 K/s | 134 K/s **+63%** |
Expand Down Expand Up @@ -159,7 +159,46 @@ other_view = Index(ndim=..., metric=CompiledMetric(...))
other_view.view("index.usearch")
```

## Joins
## Exact, Approximate, and Multi-Index Lookups

Approximate search methods, such as HNSW, are predominantly used when an exact brute-force search becomes too resource-intensive.
This typically occurs when you have millions of entries in a collection.
For smaller collections, we offer a more direct approach with the `search` method.

```py
from usearch.index import search, MetricKind, Matches, BatchMatches
import numpy as np

# Generate 10'000 random vectors with 1024 dimensions
vectors = np.random.rand(10_000, 1024).astype(np.float32)
vector = np.random.rand(1024).astype(np.float32)

one_in_many: Matches = search(vectors, vector, 50, MetricKind.L2sq, exact=True)
many_in_many: BatchMatches = search(vectors, vectors, 50, MetricKind.L2sq, exact=True)
```

By passing the `exact=True` argument, the system bypasses indexing altogether and performs a brute-force search through the entire dataset using SIMD-optimized similarity metrics from [SimSIMD](https://github.com/ashvardanian/simsimd).
When compared to FAISS's `IndexFlatL2` in Google Colab, **[USearch may offer up to a 20x performance improvement](https://github.com/unum-cloud/usearch/issues/176#issuecomment-1666650778)**:

- `faiss.IndexFlatL2`: **55.3 ms**.
- `usearch.index.search`: **2.54 ms**.

For larger workloads targeting billions or even trillions of vectors, parallel multi-index lookups become invaluable.
These lookups prevent the need to construct a single, massive index, allowing users to query multiple smaller ones instead.

```py
from usearch.index import Indexes

multi_index = Indexes(
indexes: Iterable[usearch.index.Index] = [...],
paths: Iterable[os.PathLike] = [...],
view: bool = False,
threads: int = 0,
)
multi_index.search(...)
```

## Joins, One-to-One, One-to-Many, and Many-to-Many Mappings

One of the big questions these days is how will AI change the world of databases and data management.
Most databases are still struggling to implement high-quality fuzzy search, and the only kind of joins they know are deterministic.
Expand Down Expand Up @@ -189,9 +228,9 @@ Broader functionality is ported per request.
| :---------------------- | :----: | :------: | :---: | :---: | :--------: | :---: | :----: | :---: |
| Add, search |||||||||
| Save, load, view |||||||||
| Join |||||||||
| User-defined metrics |||||||||
| Variable-length vectors |||||||||
| Joins |||||||||
| Variable-length vectors |||||||||
| 4B+ capacities |||||||||

## Application Examples
Expand Down Expand Up @@ -281,9 +320,9 @@ matches = index.search(fingerprints, 10)
## Integrations

- [x] GPT-Cache.
- [ ] LangChain.
- [ ] Microsoft Semantic Kernel.
- [x] LangChain.
- [ ] ClickHouse.
- [ ] Microsoft Semantic Kernel.

## Citations

Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.22.3
1.1.0
53 changes: 53 additions & 0 deletions c/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@


# Header search paths used by the C binding target: the repository-level
# `include` directory, the sibling `fp16`, `robin-map` and `simsimd` include
# directories, and this directory itself.
set(USEARCH_PUNNED_INCLUDE_DIRS
    "${CMAKE_CURRENT_SOURCE_DIR}/../include"
    "${CMAKE_CURRENT_SOURCE_DIR}/../fp16/include"
    "${CMAKE_CURRENT_SOURCE_DIR}/../robin-map/include"
    "${CMAKE_CURRENT_SOURCE_DIR}/../simsimd/include"
    "${CMAKE_CURRENT_SOURCE_DIR}/"
)

# Choose the flavor of the `usearch_c` library: STATIC when USEARCH_BUILD_STATIC
# is enabled, SHARED otherwise.
# A better way to allow building either static or shared libraries is discussed in:
# https://alexreinking.com/blog/building-a-dual-shared-and-static-library-with-cmake.html
if(${USEARCH_BUILD_STATIC})
    set(usearch_c_library_type STATIC)

    # Static builds also extend the C++ flags, depending on the compiler in use.
    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
        set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -static-libstdc++")
    elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -static")
    endif()
else()
    set(usearch_c_library_type SHARED)
endif()

# The library is built from a single translation unit, `lib.cpp`.
add_library(usearch_c ${usearch_c_library_type} "${CMAKE_CURRENT_SOURCE_DIR}/lib.cpp")

# Build settings for the `usearch_c` library target, set in a single call.
set_target_properties(usearch_c PROPERTIES
    POSITION_INDEPENDENT_CODE ON
    # Language standards for the C++ and C sources of the target.
    CXX_STANDARD 11
    C_STANDARD 99
    # Place every kind of build artifact (executables/DLLs, static archives,
    # shared libraries) directly in the top-level build directory.
    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
)

# Header search paths are needed only to build the library itself, hence PRIVATE.
# This was previously registered twice with identical arguments; once is enough.
target_include_directories(usearch_c PRIVATE ${USEARCH_PUNNED_INCLUDE_DIRS})

# Optional, flag-driven adjustments to the `usearch_c` target.

# NOTE(review): the option name mentions ASAN, but what gets linked here is
# `gcov`, GCC's coverage library, not a sanitizer runtime. Confirm the intent.
if(${USEARCH_DEBUG_BUILD_ASAN})
    target_link_libraries(usearch_c PRIVATE gcov)
endif()

# Forward each enabled feature toggle to the sources as a `<NAME>=1`
# preprocessor definition.
foreach(usearch_c_feature USEARCH_USE_SIMSIMD USEARCH_LOOKUP_LABEL)
    if(${${usearch_c_feature}})
        target_compile_definitions(usearch_c PRIVATE ${usearch_c_feature}=1)
    endif()
endforeach()

# Optional test executable for the C interface, built from `test.c` and
# emitted into the top-level build directory.
if(${USEARCH_BUILD_TEST})
    add_executable(test_c "${CMAKE_CURRENT_SOURCE_DIR}/test.c")

    # Use the keyword signature: the keyword-less form has legacy semantics and
    # cannot be mixed with keyword calls on the same target. PRIVATE is enough
    # because nothing links against an executable.
    target_link_libraries(test_c PRIVATE usearch_c)
    set_target_properties(test_c PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
endif()
Loading

0 comments on commit 166beda

Please sign in to comment.