From 679a8139805a056d904d782fb6e0fe8577ce5dc5 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 17 Oct 2024 06:58:04 +0000 Subject: [PATCH] Add: `u8` kernels & FMA --- README.md | 19 +-- c/lib.c | 17 +++ include/simsimd/binary.h | 29 ++-- include/simsimd/dot.h | 103 ++++++++++++++- include/simsimd/fma.h | 39 ++++++ include/simsimd/simsimd.h | 113 +++++++++++++++- include/simsimd/spatial.h | 269 ++++++++++++++++++++++++++++++++++++-- scripts/bench.cxx | 32 +++-- scripts/test.py | 38 ++++-- 9 files changed, 602 insertions(+), 57 deletions(-) create mode 100644 include/simsimd/fma.h diff --git a/README.md b/README.md index 9aeba874..8d1fd0c5 100644 --- a/README.md +++ b/README.md @@ -91,15 +91,16 @@ You can learn more about the technical implementation details in the following b ## Benchmarks For reference, we use 1536-dimensional vectors, like the embeddings produced by the OpenAI Ada API. -Comparing the serial code throughput produced by GCC 12 to hand-optimized kernels in SimSIMD, we see the following single-core improvements: +Comparing the serial code throughput produced by GCC 12 to hand-optimized kernels in SimSIMD, we see the following single-core improvements for the two most common vector-vector similarity metrics - the Cosine similarity and the Euclidean distance: -| Type | Apple M2 Pro | AMD Genoa | AWS Graviton 4 | -| :----- | ---------------------------------: | ---------------------------------: | ---------------------------------: | -| `f64` | 18.5 → 28.8 GB/s
+ 56 % | 21.9 → 41.4 GB/s
+ 89 % | 20.7 → 41.3 GB/s
+ 99 % | -| `f32` | 9.2 → 29.6 GB/s
+ 221 % | 10.9 → 95.8 GB/s
+ 779 % | 4.9 → 41.9 GB/s
+ 755 % | -| `f16` | 4.6 → 14.6 GB/s
+ 217 % | 3.1 → 108.4 GB/s
+ 3,397 % | 5.4 → 39.3 GB/s
+ 627 % | -| `bf16` | 4.6 → 26.3 GB/s
+ 472 % | 0.8 → 59.5 GB/s
+7,437 % | 2.5 → 29.9 GB/s
+ 1,096 % | -| `i8` | 25.8 → 47.1 GB/s
+ 83 % | 33.1 → 65.3 GB/s
+ 97 % | 35.2 → 43.5 GB/s
+ 24 % | +| Type | Apple M2 Pro | Intel Sapphire Rapids | AWS Graviton 4 | +| :----- | ----------------------------: | -------------------------------: | ------------------------------: | +| `f64` | 18.5 → 28.8 GB/s
+ 56 % | 21.9 → 41.4 GB/s
+ 89 % | 20.7 → 41.3 GB/s
+ 99 % | +| `f32` | 9.2 → 29.6 GB/s
+ 221 % | 10.9 → 95.8 GB/s
+ 779 % | 4.9 → 41.9 GB/s
+ 755 % | +| `f16` | 4.6 → 14.6 GB/s
+ 217 % | 3.1 → 108.4 GB/s
+ 3,397 % | 5.4 → 39.3 GB/s
+ 627 % | +| `bf16` | 4.6 → 26.3 GB/s
+ 472 % | 0.8 → 59.5 GB/s
+7,437 % | 2.5 → 29.9 GB/s
+ 1,096 % | +| `i8` | 25.8 → 47.1 GB/s
+ 83 % | 33.1 → 65.3 GB/s
+ 97 % | 35.2 → 43.5 GB/s
+ 24 % | +| `u8` | | 32.5 → 66.5 GB/s
+ 105 % | | Similar speedups are often observed even when compared to BLAS and LAPACK libraries underlying most numerical computing libraries, including NumPy and SciPy in Python. Broader benchmarking results: @@ -112,7 +113,7 @@ Broader benchmarking results: The package is intended to replace the usage of `numpy.inner`, `numpy.dot`, and `scipy.spatial.distance`. Aside from drastic performance improvements, SimSIMD significantly improves accuracy in mixed precision setups. -NumPy and SciPy, processing `i8` or `f16` vectors, will use the same types for accumulators, while SimSIMD can combine `i8` enumeration, `i16` multiplication, and `i32` accumulation to avoid overflows entirely. +NumPy and SciPy, processing `i8`, `u8` or `f16` vectors, will use the same types for accumulators, while SimSIMD can combine `i8` enumeration, `i16` multiplication, and `i32` accumulation to avoid overflows entirely. The same applies to processing `f16` and `bf16` values with `f32` precision. ### Installation diff --git a/c/lib.c b/c/lib.c index 5ecb1970..81f3e8a2 100644 --- a/c/lib.c +++ b/c/lib.c @@ -108,6 +108,8 @@ extern "C" { } // Dot products +SIMSIMD_DECLARATION_DENSE(dot, i8, i8) +SIMSIMD_DECLARATION_DENSE(dot, u8, u8) SIMSIMD_DECLARATION_DENSE(dot, f16, f16) SIMSIMD_DECLARATION_DENSE(dot, bf16, bf16) SIMSIMD_DECLARATION_DENSE(dot, f32, f32) @@ -123,16 +125,19 @@ SIMSIMD_DECLARATION_DENSE(vdot, f64c, f64) // Spatial distances SIMSIMD_DECLARATION_DENSE(cos, i8, i8) +SIMSIMD_DECLARATION_DENSE(cos, u8, u8) SIMSIMD_DECLARATION_DENSE(cos, f16, f16) SIMSIMD_DECLARATION_DENSE(cos, bf16, bf16) SIMSIMD_DECLARATION_DENSE(cos, f32, f32) SIMSIMD_DECLARATION_DENSE(cos, f64, f64) SIMSIMD_DECLARATION_DENSE(l2sq, i8, i8) +SIMSIMD_DECLARATION_DENSE(l2sq, u8, u8) SIMSIMD_DECLARATION_DENSE(l2sq, f16, f16) SIMSIMD_DECLARATION_DENSE(l2sq, bf16, bf16) SIMSIMD_DECLARATION_DENSE(l2sq, f32, f32) SIMSIMD_DECLARATION_DENSE(l2sq, f64, f64) SIMSIMD_DECLARATION_DENSE(l2, i8, i8) +SIMSIMD_DECLARATION_DENSE(l2, u8, u8) SIMSIMD_DECLARATION_DENSE(l2, f16, f16) SIMSIMD_DECLARATION_DENSE(l2, bf16, bf16) SIMSIMD_DECLARATION_DENSE(l2, f32, f32) @@ -199,10 +204,13 @@ SIMSIMD_DYNAMIC simsimd_capability_t simsimd_capabilities(void) { void* dummy = 0; // Dense: + simsimd_dot_i8((simsimd_i8_t*)dummy, (simsimd_i8_t*)dummy, 0, dummy_results); + simsimd_dot_u8((simsimd_u8_t*)dummy, (simsimd_u8_t*)dummy, 0, dummy_results); simsimd_dot_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results); simsimd_dot_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results); simsimd_dot_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results); simsimd_dot_f64((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results); + simsimd_dot_f16c((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results); simsimd_dot_bf16c((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results); simsimd_dot_f32c((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results); @@ -211,23 +219,32 @@ SIMSIMD_DYNAMIC simsimd_capability_t simsimd_capabilities(void) { simsimd_vdot_bf16c((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results); simsimd_vdot_f32c((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results); simsimd_vdot_f64c((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results); + simsimd_cos_i8((simsimd_i8_t*)dummy, (simsimd_i8_t*)dummy, 0, dummy_results); + simsimd_cos_u8((simsimd_u8_t*)dummy, (simsimd_u8_t*)dummy, 0, dummy_results); simsimd_cos_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, 
dummy_results);
     simsimd_cos_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
     simsimd_cos_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
     simsimd_cos_f64((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results);
+    simsimd_l2sq_i8((simsimd_i8_t*)dummy, (simsimd_i8_t*)dummy, 0, dummy_results);
+    simsimd_l2sq_u8((simsimd_u8_t*)dummy, (simsimd_u8_t*)dummy, 0, dummy_results);
     simsimd_l2sq_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
     simsimd_l2sq_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
     simsimd_l2sq_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
     simsimd_l2sq_f64((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results);
+    simsimd_l2_i8((simsimd_i8_t*)dummy, (simsimd_i8_t*)dummy, 0, dummy_results);
+    simsimd_l2_u8((simsimd_u8_t*)dummy, (simsimd_u8_t*)dummy, 0, dummy_results);
     simsimd_l2_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
     simsimd_l2_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
     simsimd_l2_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
     simsimd_l2_f64((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results);
+    simsimd_hamming_b8((simsimd_b8_t*)dummy, (simsimd_b8_t*)dummy, 0, dummy_results);
     simsimd_jaccard_b8((simsimd_b8_t*)dummy, (simsimd_b8_t*)dummy, 0, dummy_results);
+    simsimd_kl_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
     simsimd_kl_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
     simsimd_kl_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
diff --git a/include/simsimd/binary.h b/include/simsimd/binary.h
index 2f6a8059..7165e568 100644
--- a/include/simsimd/binary.h
+++ b/include/simsimd/binary.h
@@ -319,7 +319,20 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c
                                            simsimd_distance_t* result) {
     simsimd_size_t intersection = 0, union_ = 0;
-    // It's harder to squeeze out performance from tiny representations, so we unroll the loops for binary metrics.
+    //? On such vectors we can clearly see that the CPU struggles to perform this many parallel
+    //? population counts, because the throughput of Jaccard and Hamming in this case starts to differ.
+    //? One optimization, aside from Harley-Seal transforms, can be using "shuffles" for nibble-popcount
+    //? lookups, to utilize other ports on the CPU.
+    //? https://github.com/ashvardanian/SimSIMD/pull/138
+    //
+    // - `_mm512_popcnt_epi64` maps to `VPOPCNTQ (ZMM, K, ZMM)`:
+    //   - On Ice Lake: 3 cycles latency, ports: 1*p5
+    //   - On Genoa: 2 cycles latency, ports: 1*FP01
+    // - `_mm512_shuffle_epi8` maps to `VPSHUFB (ZMM, ZMM, ZMM)`:
+    //   - On Ice Lake: 1 cycle latency, ports: 1*p5
+    //   - On Genoa: 2 cycles latency, ports: 1*FP12
+    //
+    // It's harder to squeeze out performance from tiny representations, so we unroll the loops for binary metrics.
     if (n_words <= 64) { // Up to 512 bits.
         __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
         __m512i a_vec = _mm512_maskz_loadu_epi8(mask, a);
         __m512i b_vec = _mm512_maskz_loadu_epi8(mask, b);
@@ -341,20 +354,6 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c
         intersection = _mm512_reduce_add_epi64(_mm512_add_epi64(and2_count_vec, and1_count_vec));
         union_ = _mm512_reduce_add_epi64(_mm512_add_epi64(or2_count_vec, or1_count_vec));
     } else if (n_words <= 196) { // Up to 1568 bits.
- // TODO: On such vectors we can clearly see that the CPU struggles to perform this many parallel - // population counts, because the throughput of Jaccard and Hamming in this case starts to differ. - // One optimization, aside from Harley-Seal transforms can be using "shuffles" for nibble-popcount - // lookups, to utilize other ports on the CPU. - // https://github.com/ashvardanian/SimSIMD/pull/138 - // - // On Ice Lake: - // - `VPOPCNTQ (ZMM, K, ZMM)` can only execute on port 5, which is a bottleneck. - // - `VPSHUFB (ZMM, ZMM, ZMM)` can only run on the same port 5 as well! - // On Zen4: - // - `VPOPCNTQ (ZMM, K, ZMM)` can run on ports: 0, 1. - // - `VPSHUFB (ZMM, ZMM, ZMM)` can run on ports: 1, 2. - // https://uops.info/table.html?search=VPOPCNTQ%20(ZMM%2C%20K%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on - // https://uops.info/table.html?search=VPSHUFB%20(ZMM%2C%20ZMM%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 128); __m512i a1_vec = _mm512_loadu_epi8(a); __m512i b1_vec = _mm512_loadu_epi8(b); diff --git a/include/simsimd/dot.h b/include/simsimd/dot.h index 0b790e98..112a659f 100644 --- a/include/simsimd/dot.h +++ b/include/simsimd/dot.h @@ -13,6 +13,7 @@ * - 32-bit IEEE floating point numbers * - 16-bit IEEE floating point numbers * - 16-bit brain floating point numbers + * - 8-bit unsigned integers * - 8-bit signed integers * * For hardware architectures: @@ -54,6 +55,7 @@ SIMSIMD_PUBLIC void simsimd_dot_bf16c_serial(simsimd_bf16_t const* a, simsimd_bf SIMSIMD_PUBLIC void simsimd_vdot_bf16c_serial(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result); SIMSIMD_PUBLIC void simsimd_dot_i8_serial(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result); +SIMSIMD_PUBLIC void simsimd_dot_u8_serial(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result); /* Double-precision serial backends for all numeric types. * For single-precision computation check out the "*_serial" counterparts of those "*_accurate" functions. 
@@ -83,6 +85,7 @@ SIMSIMD_PUBLIC void simsimd_dot_f16c_neon(simsimd_f16_t const* a, simsimd_f16_t SIMSIMD_PUBLIC void simsimd_vdot_f16c_neon(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* results); SIMSIMD_PUBLIC void simsimd_dot_i8_neon(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result); +SIMSIMD_PUBLIC void simsimd_dot_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result); SIMSIMD_PUBLIC void simsimd_dot_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result); SIMSIMD_PUBLIC void simsimd_dot_bf16c_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* results); @@ -120,6 +123,7 @@ SIMSIMD_PUBLIC void simsimd_vdot_f16c_haswell(simsimd_f16_t const* a, simsimd_f1 SIMSIMD_PUBLIC void simsimd_dot_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result); SIMSIMD_PUBLIC void simsimd_dot_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result); +SIMSIMD_PUBLIC void simsimd_dot_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result); /* SIMD-powered backends for various generations of AVX512 CPUs. * Skylake is handy, as it supports masked loads and other operations, avoiding the need for the tail loop. @@ -136,6 +140,7 @@ SIMSIMD_PUBLIC void simsimd_dot_f32c_skylake(simsimd_f32_t const* a, simsimd_f32 SIMSIMD_PUBLIC void simsimd_vdot_f32c_skylake(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* result); SIMSIMD_PUBLIC void simsimd_dot_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* result); +SIMSIMD_PUBLIC void simsimd_dot_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* result); SIMSIMD_PUBLIC void simsimd_dot_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result); SIMSIMD_PUBLIC void simsimd_dot_bf16c_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* result); @@ -212,6 +217,7 @@ SIMSIMD_MAKE_COMPLEX_DOT(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_dot SIMSIMD_MAKE_COMPLEX_VDOT(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_vdot_bf16c_serial SIMSIMD_MAKE_DOT(serial, i8, i64, SIMSIMD_DEREFERENCE) // simsimd_dot_i8_serial +SIMSIMD_MAKE_DOT(serial, u8, i64, SIMSIMD_DEREFERENCE) // simsimd_dot_u8_serial SIMSIMD_MAKE_DOT(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_dot_f32_accurate SIMSIMD_MAKE_COMPLEX_DOT(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_dot_f32c_accurate @@ -366,6 +372,27 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_neon(simsimd_i8_t const* a, simsimd_i8_t cons *result = ab; } +SIMSIMD_PUBLIC void simsimd_dot_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, + simsimd_distance_t* result) { + + uint32x4_t ab_vec = vdupq_n_u32(0); + simsimd_size_t i = 0; + for (; i + 16 <= n; i += 16) { + uint8x16_t a_vec = vld1q_u8(a + i); + uint8x16_t b_vec = vld1q_u8(b + i); + ab_vec = vdotq_u32(ab_vec, a_vec, b_vec); + } + + // Take care of the tail: + uint32_t ab = vaddvq_u32(ab_vec); + for (; i < n; ++i) { + uint32_t ai = a[i], bi = b[i]; + ab += ai * bi; + } + + *result = ab; +} + #pragma clang attribute pop #pragma GCC pop_options #endif @@ -1115,12 +1142,47 @@ SIMSIMD_PUBLIC void 
simsimd_dot_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t c __m256i a_i8_vec = _mm256_lddqu_si256((__m256i const*)(a + i)); __m256i b_i8_vec = _mm256_lddqu_si256((__m256i const*)(b + i)); - // Unpack int8 to int16 + // Upcast `int8` to `int16` __m256i a_i16_low_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a_i8_vec, 0)); __m256i a_i16_high_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a_i8_vec, 1)); __m256i b_i16_low_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b_i8_vec, 0)); __m256i b_i16_high_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b_i8_vec, 1)); + // Multiply and accumulate at int16 level, accumulate at `int32` level + ab_i32_low_vec = _mm256_add_epi32(ab_i32_low_vec, _mm256_madd_epi16(a_i16_low_vec, b_i16_low_vec)); + ab_i32_high_vec = _mm256_add_epi32(ab_i32_high_vec, _mm256_madd_epi16(a_i16_high_vec, b_i16_high_vec)); + } + + // Horizontal sum across the 256-bit register + int ab = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(ab_i32_low_vec, ab_i32_high_vec)); + + // Take care of the tail: + for (; i < n; ++i) + ab += (int)(a[i]) * b[i]; + *result = ab; +} + +SIMSIMD_PUBLIC void simsimd_dot_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, + simsimd_distance_t* result) { + + __m256i ab_i32_low_vec = _mm256_setzero_si256(); + __m256i ab_i32_high_vec = _mm256_setzero_si256(); + __m256i const zeros_vec = _mm256_setzero_si256(); + + // AVX2 has no instructions for unsigned 8-bit integer dot-products, + // but it has a weird instruction for mixed signed-unsigned 8-bit dot-product. + simsimd_size_t i = 0; + for (; i + 32 <= n; i += 32) { + __m256i a_u8_vec = _mm256_lddqu_si256((__m256i const*)(a + i)); + __m256i b_u8_vec = _mm256_lddqu_si256((__m256i const*)(b + i)); + + // Upcast `uint8` to `int16`. Unlike the signed version, we can use the unpacking + // instructions instead of extracts, as they are much faster and more efficient. + __m256i a_i16_low_vec = _mm256_unpacklo_epi8(a_u8_vec, zeros_vec); + __m256i a_i16_high_vec = _mm256_unpackhi_epi8(a_u8_vec, zeros_vec); + __m256i b_i16_low_vec = _mm256_unpacklo_epi8(b_u8_vec, zeros_vec); + __m256i b_i16_high_vec = _mm256_unpackhi_epi8(b_u8_vec, zeros_vec); + // Multiply and accumulate at int16 level, accumulate at int32 level ab_i32_low_vec = _mm256_add_epi32(ab_i32_low_vec, _mm256_madd_epi16(a_i16_low_vec, b_i16_low_vec)); ab_i32_high_vec = _mm256_add_epi32(ab_i32_high_vec, _mm256_madd_epi16(a_i16_high_vec, b_i16_high_vec)); @@ -1699,6 +1761,45 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const *result = _mm512_reduce_add_epi32(ab_i32_vec); } +SIMSIMD_PUBLIC void simsimd_dot_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, + simsimd_distance_t* result) { + __m512i ab_i32_low_vec = _mm512_setzero_si512(); + __m512i ab_i32_high_vec = _mm512_setzero_si512(); + __m512i const zeros_vec = _mm512_setzero_si512(); + __m512i a_i16_low_vec, a_i16_high_vec, b_i16_low_vec, b_i16_high_vec; + __m512i a_u8_vec, b_u8_vec; + +simsimd_dot_u8_ice_cycle: + if (n < 64) { + __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n); + a_u8_vec = _mm512_maskz_loadu_epi8(mask, a); + b_u8_vec = _mm512_maskz_loadu_epi8(mask, b); + n = 0; + } else { + a_u8_vec = _mm512_loadu_si512(a); + b_u8_vec = _mm512_loadu_si512(b); + a += 64, b += 64, n -= 64; + } + + // Upcast `uint8` to `int16`. Unlike the signed version, we can use the unpacking + // instructions instead of extracts, as they are much faster and more efficient. 
+ a_i16_low_vec = _mm512_unpacklo_epi8(a_u8_vec, zeros_vec); + a_i16_high_vec = _mm512_unpackhi_epi8(a_u8_vec, zeros_vec); + b_i16_low_vec = _mm512_unpacklo_epi8(b_u8_vec, zeros_vec); + b_i16_high_vec = _mm512_unpackhi_epi8(b_u8_vec, zeros_vec); + // Unfortunately we can't use the `_mm512_dpbusd_epi32` intrinsics here either, + // as it's asymmetric with respect to the sign of the input arguments: + // Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + // So we have to use the `_mm512_dpwssd_epi32` intrinsics instead, upcasting + // to 16-bit beforehand. + ab_i32_low_vec = _mm512_dpwssd_epi32(ab_i32_low_vec, a_i16_low_vec, b_i16_low_vec); + ab_i32_high_vec = _mm512_dpwssd_epi32(ab_i32_high_vec, a_i16_high_vec, b_i16_high_vec); + if (n) + goto simsimd_dot_u8_ice_cycle; + + *result = _mm512_reduce_add_epi32(_mm512_add_epi32(ab_i32_low_vec, ab_i32_high_vec)); +} + #pragma clang attribute pop #pragma GCC pop_options #endif // SIMSIMD_TARGET_ICE diff --git a/include/simsimd/fma.h b/include/simsimd/fma.h new file mode 100644 index 00000000..858d0496 --- /dev/null +++ b/include/simsimd/fma.h @@ -0,0 +1,39 @@ +/** + * @file fma.h + * @brief SIMD-accelerated mixed-precision Fused-Multiply-Add operations. + * @author Ash Vardanian + * @date October 16, 2024 + * + * Contains following element-wise operations: + * - Weighted Sum: Oq[i] = Alpha * X[i] + Beta * Z[i] + * - FMA or Fused-Multiply-Add: O[i] = Alpha * X[i] * Y[i] + Beta * Z[i] + * + * For datatypes: + * - 64-bit IEEE floating point numbers + * - 32-bit IEEE floating point numbers + * - 16-bit IEEE floating point numbers + * - 16-bit brain floating point numbers + * - 8-bit unsigned integers + * - 8-bit signed integers + * + * For hardware architectures: + * - Arm: NEON, SVE + * - x86: Haswell, Ice Lake, Skylake, Genoa, Sapphire + * + * x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/ + * Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/ + */ +#ifndef SIMSIMD_FMA_H +#define SIMSIMD_FMA_H + +#include "types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/simsimd/simsimd.h b/include/simsimd/simsimd.h index e4013078..c14f56fb 100644 --- a/include/simsimd/simsimd.h +++ b/include/simsimd/simsimd.h @@ -549,7 +549,6 @@ SIMSIMD_INTERNAL void _simsimd_find_metric_punned_implementation( // case simsimd_datatype_i16_k: break; case simsimd_datatype_i32_k: break; case simsimd_datatype_i64_k: break; - case simsimd_datatype_u8_k: break; case simsimd_datatype_u64_k: break; // Double-precision floating-point vectors @@ -819,7 +818,7 @@ SIMSIMD_INTERNAL void _simsimd_find_metric_punned_implementation( // break; } - // Single-byte integer vectors + // Single-byte signed integer vectors case simsimd_datatype_i8_k: { #if SIMSIMD_TARGET_NEON_I8 if (viable & simsimd_cap_neon_i8_k) @@ -863,6 +862,50 @@ SIMSIMD_INTERNAL void _simsimd_find_metric_punned_implementation( // break; } + // Single-byte unsigned integer vectors + case simsimd_datatype_u8_k: { +#if SIMSIMD_TARGET_NEON_I8 + if (viable & simsimd_cap_neon_i8_k) + switch (kind) { + case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_u8_neon, *c = simsimd_cap_neon_i8_k; return; + case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_u8_neon, *c = simsimd_cap_neon_i8_k; return; + case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_u8_neon, *c = simsimd_cap_neon_i8_k; return; + case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_u8_neon, *c = simsimd_cap_neon_i8_k; return; + default: 
break; + } +#endif +#if SIMSIMD_TARGET_ICE + if (viable & simsimd_cap_ice_k) + switch (kind) { + case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_u8_ice, *c = simsimd_cap_ice_k; return; + case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_u8_ice, *c = simsimd_cap_ice_k; return; + case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_u8_ice, *c = simsimd_cap_ice_k; return; + case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_u8_ice, *c = simsimd_cap_ice_k; return; + default: break; + } +#endif +#if SIMSIMD_TARGET_HASWELL + if (viable & simsimd_cap_haswell_k) + switch (kind) { + case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_u8_haswell, *c = simsimd_cap_haswell_k; return; + case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_u8_haswell, *c = simsimd_cap_haswell_k; return; + case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_u8_haswell, *c = simsimd_cap_haswell_k; return; + case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_u8_haswell, *c = simsimd_cap_haswell_k; return; + default: break; + } +#endif + + if (viable & simsimd_cap_serial_k) + switch (kind) { + case simsimd_metric_dot_k: *m = (m_t)&simsimd_dot_u8_serial, *c = simsimd_cap_serial_k; return; + case simsimd_metric_cos_k: *m = (m_t)&simsimd_cos_u8_serial, *c = simsimd_cap_serial_k; return; + case simsimd_metric_l2sq_k: *m = (m_t)&simsimd_l2sq_u8_serial, *c = simsimd_cap_serial_k; return; + case simsimd_metric_l2_k: *m = (m_t)&simsimd_l2_u8_serial, *c = simsimd_cap_serial_k; return; + default: break; + } + + break; + } // Binary vectors case simsimd_datatype_b8_k: { @@ -1270,6 +1313,8 @@ SIMSIMD_DYNAMIC void simsimd_vdot_f64c(simsimd_f64_t const* a, simsimd_f64_t con */ SIMSIMD_DYNAMIC void simsimd_cos_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_DYNAMIC void simsimd_cos_u8(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, + simsimd_distance_t* d); SIMSIMD_DYNAMIC void simsimd_cos_f16(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_DYNAMIC void simsimd_cos_bf16(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, @@ -1280,6 +1325,8 @@ SIMSIMD_DYNAMIC void simsimd_cos_f64(simsimd_f64_t const* a, simsimd_f64_t const simsimd_distance_t* d); SIMSIMD_DYNAMIC void simsimd_l2sq_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_DYNAMIC void simsimd_l2sq_u8(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, + simsimd_distance_t* d); SIMSIMD_DYNAMIC void simsimd_l2sq_f16(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_DYNAMIC void simsimd_l2sq_bf16(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, @@ -1408,6 +1455,30 @@ SIMSIMD_PUBLIC void simsimd_find_metric_punned( // * @note The dot product is zero if and only if the two vectors are orthogonal. * @note Defined only for floating-point and integer data types. 
*/ +SIMSIMD_PUBLIC void simsimd_dot_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, + simsimd_distance_t* d) { +#if SIMSIMD_TARGET_NEON_F16 + simsimd_dot_i8_neon(a, b, n, d); +#elif SIMSIMD_TARGET_ICE + simsimd_dot_i8_ice(a, b, n, d); +#elif SIMSIMD_TARGET_HASWELL + simsimd_dot_i8_haswell(a, b, n, d); +#else + simsimd_dot_i8_serial(a, b, n, d); +#endif +} +SIMSIMD_PUBLIC void simsimd_dot_u8(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, + simsimd_distance_t* d) { +#if SIMSIMD_TARGET_NEON_F16 + simsimd_dot_u8_neon(a, b, n, d); +#elif SIMSIMD_TARGET_ICE + simsimd_dot_u8_ice(a, b, n, d); +#elif SIMSIMD_TARGET_HASWELL + simsimd_dot_u8_haswell(a, b, n, d); +#else + simsimd_dot_u8_serial(a, b, n, d); +#endif +} SIMSIMD_PUBLIC void simsimd_dot_f16(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d) { #if SIMSIMD_TARGET_SVE_F16 @@ -1422,7 +1493,6 @@ SIMSIMD_PUBLIC void simsimd_dot_f16(simsimd_f16_t const* a, simsimd_f16_t const* simsimd_dot_f16_serial(a, b, n, d); #endif } - SIMSIMD_PUBLIC void simsimd_dot_bf16(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d) { #if SIMSIMD_TARGET_GENOA @@ -1435,7 +1505,6 @@ SIMSIMD_PUBLIC void simsimd_dot_bf16(simsimd_bf16_t const* a, simsimd_bf16_t con simsimd_dot_bf16_serial(a, b, n, d); #endif } - SIMSIMD_PUBLIC void simsimd_dot_f32(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t n, simsimd_distance_t* d) { #if SIMSIMD_TARGET_SVE @@ -1582,6 +1651,18 @@ SIMSIMD_PUBLIC void simsimd_cos_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_cos_i8_serial(a, b, n, d); #endif } +SIMSIMD_PUBLIC void simsimd_cos_u8(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, + simsimd_distance_t* d) { +#if SIMSIMD_TARGET_NEON + simsimd_cos_u8_neon(a, b, n, d); +#elif SIMSIMD_TARGET_ICE + simsimd_cos_u8_ice(a, b, n, d); +#elif SIMSIMD_TARGET_HASWELL + simsimd_cos_u8_haswell(a, b, n, d); +#else + simsimd_cos_u8_serial(a, b, n, d); +#endif +} SIMSIMD_PUBLIC void simsimd_cos_f16(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d) { #if SIMSIMD_TARGET_SVE_F16 @@ -1648,6 +1729,18 @@ SIMSIMD_PUBLIC void simsimd_l2sq_i8(simsimd_i8_t const* a, simsimd_i8_t const* b simsimd_l2sq_i8_serial(a, b, n, d); #endif } +SIMSIMD_PUBLIC void simsimd_l2sq_u8(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, + simsimd_distance_t* d) { +#if SIMSIMD_TARGET_NEON + simsimd_l2sq_u8_neon(a, b, n, d); +#elif SIMSIMD_TARGET_ICE + simsimd_l2sq_u8_ice(a, b, n, d); +#elif SIMSIMD_TARGET_HASWELL + simsimd_l2sq_u8_haswell(a, b, n, d); +#else + simsimd_l2sq_u8_serial(a, b, n, d); +#endif +} SIMSIMD_PUBLIC void simsimd_l2sq_f16(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d) { #if SIMSIMD_TARGET_SVE_F16 @@ -1714,6 +1807,18 @@ SIMSIMD_PUBLIC void simsimd_l2_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_l2_i8_serial(a, b, n, d); #endif } +SIMSIMD_PUBLIC void simsimd_l2_u8(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, + simsimd_distance_t* d) { +#if SIMSIMD_TARGET_NEON + simsimd_l2_u8_neon(a, b, n, d); +#elif SIMSIMD_TARGET_ICE + simsimd_l2_u8_ice(a, b, n, d); +#elif SIMSIMD_TARGET_HASWELL + simsimd_l2_u8_haswell(a, b, n, d); +#else + simsimd_l2_u8_serial(a, b, n, d); +#endif +} SIMSIMD_PUBLIC void simsimd_l2_f16(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d) { #if SIMSIMD_TARGET_SVE_F16 diff --git 
a/include/simsimd/spatial.h b/include/simsimd/spatial.h index 69f93595..61632bd6 100644 --- a/include/simsimd/spatial.h +++ b/include/simsimd/spatial.h @@ -13,6 +13,7 @@ * - 32-bit IEEE floating point numbers * - 16-bit IEEE floating point numbers * - 16-bit brain floating point numbers + * - 8-bit unsigned integral numbers * - 8-bit signed integral numbers * - 4-bit signed integral numbers * @@ -55,6 +56,9 @@ SIMSIMD_PUBLIC void simsimd_cos_bf16_serial(simsimd_bf16_t const* a, simsimd_bf1 SIMSIMD_PUBLIC void simsimd_l2_i8_serial(simsimd_i8_t const* a, simsimd_i8_t const*, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_l2sq_i8_serial(simsimd_i8_t const* a, simsimd_i8_t const*, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_cos_i8_serial(simsimd_i8_t const* a, simsimd_i8_t const*, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_PUBLIC void simsimd_l2_u8_serial(simsimd_u8_t const* a, simsimd_u8_t const*, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_PUBLIC void simsimd_l2sq_u8_serial(simsimd_u8_t const* a, simsimd_u8_t const*, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_PUBLIC void simsimd_cos_u8_serial(simsimd_u8_t const* a, simsimd_u8_t const*, simsimd_size_t n, simsimd_distance_t* d); /* Double-precision serial backends for all numeric types. * For single-precision computation check out the "*_serial" counterparts of those "*_accurate" functions. @@ -67,10 +71,6 @@ SIMSIMD_PUBLIC void simsimd_l2sq_f16_accurate(simsimd_f16_t const* a, simsimd_f1 SIMSIMD_PUBLIC void simsimd_cos_f16_accurate(simsimd_f16_t const* a, simsimd_f16_t const*, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_l2_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const*, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_l2sq_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const*, simsimd_size_t n, simsimd_distance_t* d); -SIMSIMD_PUBLIC void simsimd_cos_bf16_accurate(simsimd_bf16_t const* a, simsimd_bf16_t const*, simsimd_size_t n, simsimd_distance_t* d); -SIMSIMD_PUBLIC void simsimd_l2_i8_accurate(simsimd_i8_t const* a, simsimd_i8_t const*, simsimd_size_t n, simsimd_distance_t* d); -SIMSIMD_PUBLIC void simsimd_l2sq_i8_accurate(simsimd_i8_t const* a, simsimd_i8_t const*, simsimd_size_t n, simsimd_distance_t* d); -SIMSIMD_PUBLIC void simsimd_cos_i8_accurate(simsimd_i8_t const* a, simsimd_i8_t const*, simsimd_size_t n, simsimd_distance_t* d); /* SIMD-powered backends for Arm NEON, mostly using 32-bit arithmetic over 128-bit words. 
* By far the most portable backend, covering most Arm v8 devices, over a billion phones, and almost all @@ -91,6 +91,9 @@ SIMSIMD_PUBLIC void simsimd_cos_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_ SIMSIMD_PUBLIC void simsimd_l2_i8_neon(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_l2sq_i8_neon(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_PUBLIC void simsimd_l2_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_PUBLIC void simsimd_l2sq_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_PUBLIC void simsimd_cos_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d); /* SIMD-powered backends for Arm SVE, mostly using 32-bit arithmetic over variable-length platform-defined word sizes. * Designed for Arm Graviton 3, Microsoft Cobalt, as well as Nvidia Grace and newer Ampere Altra CPUs. @@ -117,6 +120,9 @@ SIMSIMD_PUBLIC void simsimd_cos_f64_sve(simsimd_f64_t const* a, simsimd_f64_t co SIMSIMD_PUBLIC void simsimd_l2_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_l2sq_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_cos_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_PUBLIC void simsimd_l2_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_PUBLIC void simsimd_l2sq_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_PUBLIC void simsimd_cos_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_l2_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_l2sq_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_cos_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n, simsimd_distance_t* d); @@ -148,6 +154,9 @@ SIMSIMD_PUBLIC void simsimd_cos_i4x2_ice(simsimd_i4x2_t const* a, simsimd_i4x2_t SIMSIMD_PUBLIC void simsimd_l2_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_l2sq_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_cos_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_PUBLIC void simsimd_l2_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_PUBLIC void simsimd_l2sq_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d); +SIMSIMD_PUBLIC void simsimd_cos_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_l2_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d); SIMSIMD_PUBLIC void simsimd_l2sq_bf16_genoa(simsimd_bf16_t const* 
a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
 SIMSIMD_PUBLIC void simsimd_cos_bf16_genoa(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n, simsimd_distance_t* d);
@@ -224,6 +233,10 @@ SIMSIMD_MAKE_COS(serial, i8, i32, SIMSIMD_DEREFERENCE) // simsimd_cos_i8_serial
 SIMSIMD_MAKE_L2SQ(serial, i8, i32, SIMSIMD_DEREFERENCE) // simsimd_l2sq_i8_serial
 SIMSIMD_MAKE_L2(serial, i8, i32, SIMSIMD_DEREFERENCE)   // simsimd_l2_i8_serial
 
+SIMSIMD_MAKE_COS(serial, u8, i32, SIMSIMD_DEREFERENCE)  // simsimd_cos_u8_serial
+SIMSIMD_MAKE_L2SQ(serial, u8, i32, SIMSIMD_DEREFERENCE) // simsimd_l2sq_u8_serial
+SIMSIMD_MAKE_L2(serial, u8, i32, SIMSIMD_DEREFERENCE)   // simsimd_l2_u8_serial
+
 SIMSIMD_MAKE_COS(accurate, f32, f64, SIMSIMD_DEREFERENCE)  // simsimd_cos_f32_accurate
 SIMSIMD_MAKE_L2SQ(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_l2sq_f32_accurate
 SIMSIMD_MAKE_L2(accurate, f32, f64, SIMSIMD_DEREFERENCE)   // simsimd_l2_f32_accurate
@@ -236,10 +249,6 @@ SIMSIMD_MAKE_COS(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_cos_bf16_
 SIMSIMD_MAKE_L2SQ(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_l2sq_bf16_accurate
 SIMSIMD_MAKE_L2(accurate, bf16, f64, SIMSIMD_BF16_TO_F32)   // simsimd_l2_bf16_accurate
 
-SIMSIMD_MAKE_COS(accurate, i8, i32, SIMSIMD_DEREFERENCE)  // simsimd_cos_i8_accurate
-SIMSIMD_MAKE_L2SQ(accurate, i8, i32, SIMSIMD_DEREFERENCE) // simsimd_l2sq_i8_accurate
-SIMSIMD_MAKE_L2(accurate, i8, i32, SIMSIMD_DEREFERENCE)   // simsimd_l2_i8_accurate
-
 #if SIMSIMD_TARGET_ARM
 #if SIMSIMD_TARGET_NEON
 #pragma GCC push_options
@@ -716,6 +725,56 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const* a, simsimd_i8_t cons
     *result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
 }
 
+SIMSIMD_PUBLIC void simsimd_l2_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n,
+                                       simsimd_distance_t* result) {
+    simsimd_l2sq_u8_neon(a, b, n, result);
+    *result = _simsimd_sqrt_f32_neon(*result);
+}
+SIMSIMD_PUBLIC void simsimd_l2sq_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n,
+                                         simsimd_distance_t* result) {
+    uint32x4_t d2_vec = vdupq_n_u32(0);
+    simsimd_size_t i = 0;
+    for (; i + 16 <= n; i += 16) {
+        uint8x16_t a_vec = vld1q_u8(a + i);
+        uint8x16_t b_vec = vld1q_u8(b + i);
+        uint8x16_t d_vec = vabdq_u8(a_vec, b_vec);
+        d2_vec = vdotq_u32(d2_vec, d_vec, d_vec);
+    }
+    uint32_t d2 = vaddvq_u32(d2_vec);
+    for (; i < n; ++i) {
+        int32_t d = (int32_t)a[i] - b[i];
+        d2 += (uint32_t)(d * d);
+    }
+    *result = d2;
+}
+
+SIMSIMD_PUBLIC void simsimd_cos_u8_neon(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n,
+                                        simsimd_distance_t* result) {
+
+    simsimd_size_t i = 0;
+    uint32x4_t ab_vec = vdupq_n_u32(0);
+    uint32x4_t a2_vec = vdupq_n_u32(0);
+    uint32x4_t b2_vec = vdupq_n_u32(0);
+    for (; i + 16 <= n; i += 16) {
+        uint8x16_t a_vec = vld1q_u8(a + i);
+        uint8x16_t b_vec = vld1q_u8(b + i);
+        ab_vec = vdotq_u32(ab_vec, a_vec, b_vec);
+        a2_vec = vdotq_u32(a2_vec, a_vec, a_vec);
+        b2_vec = vdotq_u32(b2_vec, b_vec, b_vec);
+    }
+    uint32_t ab = vaddvq_u32(ab_vec);
+    uint32_t a2 = vaddvq_u32(a2_vec);
+    uint32_t b2 = vaddvq_u32(b2_vec);
+
+    // Take care of the tail:
+    for (; i < n; ++i) {
+        uint32_t ai = a[i], bi = b[i];
+        ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
+    }
+
+    *result = _simsimd_cos_normalize_f32_neon(ab, a2, b2);
+}
+
 #pragma clang attribute pop
 #pragma GCC pop_options
 #endif // SIMSIMD_TARGET_NEON_I8
@@ -1214,6 +1273,106 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_haswell(simsimd_i8_t const* a, simsimd_i8_t c
         __m256i b_i16_low_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b_i8_vec, 0));
         __m256i b_i16_high_vec = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b_i8_vec, 1));
 
+        // Multiply and accumulate as `int16`, accumulate products as `int32`:
+        ab_i32_low_vec = _mm256_add_epi32(ab_i32_low_vec, _mm256_madd_epi16(a_i16_low_vec, b_i16_low_vec));
+        ab_i32_high_vec = _mm256_add_epi32(ab_i32_high_vec, _mm256_madd_epi16(a_i16_high_vec, b_i16_high_vec));
+        a2_i32_low_vec = _mm256_add_epi32(a2_i32_low_vec, _mm256_madd_epi16(a_i16_low_vec, a_i16_low_vec));
+        a2_i32_high_vec = _mm256_add_epi32(a2_i32_high_vec, _mm256_madd_epi16(a_i16_high_vec, a_i16_high_vec));
+        b2_i32_low_vec = _mm256_add_epi32(b2_i32_low_vec, _mm256_madd_epi16(b_i16_low_vec, b_i16_low_vec));
+        b2_i32_high_vec = _mm256_add_epi32(b2_i32_high_vec, _mm256_madd_epi16(b_i16_high_vec, b_i16_high_vec));
+    }
+
+    // Further reduce to a single sum for each vector
+    int ab = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(ab_i32_low_vec, ab_i32_high_vec));
+    int a2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(a2_i32_low_vec, a2_i32_high_vec));
+    int b2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(b2_i32_low_vec, b2_i32_high_vec));
+
+    // Take care of the tail:
+    for (; i < n; ++i) {
+        int ai = a[i], bi = b[i];
+        ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
+    }
+
+    *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2);
+}
+
+SIMSIMD_PUBLIC void simsimd_l2_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n,
+                                          simsimd_distance_t* result) {
+    simsimd_l2sq_u8_haswell(a, b, n, result);
+    *result = _simsimd_sqrt_f32_haswell(*result);
+}
+SIMSIMD_PUBLIC void simsimd_l2sq_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n,
+                                            simsimd_distance_t* result) {
+
+    __m256i d2_i32_low_vec = _mm256_setzero_si256();
+    __m256i d2_i32_high_vec = _mm256_setzero_si256();
+    __m256i const zeros_vec = _mm256_setzero_si256();
+
+    simsimd_size_t i = 0;
+    for (; i + 32 <= n; i += 32) {
+        __m256i a_u8_vec = _mm256_lddqu_si256((__m256i const*)(a + i));
+        __m256i b_u8_vec = _mm256_lddqu_si256((__m256i const*)(b + i));
+
+        // Subtracting unsigned vectors in AVX2 is done with saturating subtraction in both directions, OR-ed together:
+        __m256i d_u8_vec = _mm256_or_si256(_mm256_subs_epu8(a_u8_vec, b_u8_vec), _mm256_subs_epu8(b_u8_vec, a_u8_vec));
+
+        // Upcast `uint8` to `int16`. Unlike the signed version, we can use the unpacking
+        // instructions instead of extracts, as they are much faster and more efficient.
+        __m256i d_i16_low_vec = _mm256_unpacklo_epi8(d_u8_vec, zeros_vec);
+        __m256i d_i16_high_vec = _mm256_unpackhi_epi8(d_u8_vec, zeros_vec);
+
+        // Multiply and accumulate at `int16` level, accumulate at `int32` level:
+        d2_i32_low_vec = _mm256_add_epi32(d2_i32_low_vec, _mm256_madd_epi16(d_i16_low_vec, d_i16_low_vec));
+        d2_i32_high_vec = _mm256_add_epi32(d2_i32_high_vec, _mm256_madd_epi16(d_i16_high_vec, d_i16_high_vec));
+    }
+
+    // Accumulate the 32-bit integers from `d2_i32_high_vec` and `d2_i32_low_vec`
+    int d2 = _simsimd_reduce_i32x8_haswell(_mm256_add_epi32(d2_i32_low_vec, d2_i32_high_vec));
+
+    // Take care of the tail:
+    for (; i < n; ++i) {
+        int d = (int)(a[i]) - b[i];
+        d2 += d * d;
+    }
+
+    *result = (simsimd_f64_t)d2;
+}
+
+SIMSIMD_PUBLIC void simsimd_cos_u8_haswell(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n,
+                                           simsimd_distance_t* result) {
+
+    __m256i ab_i32_low_vec = _mm256_setzero_si256();
+    __m256i ab_i32_high_vec = _mm256_setzero_si256();
+    __m256i a2_i32_low_vec = _mm256_setzero_si256();
+    __m256i a2_i32_high_vec = _mm256_setzero_si256();
+    __m256i b2_i32_low_vec = _mm256_setzero_si256();
+    __m256i b2_i32_high_vec = _mm256_setzero_si256();
+    __m256i const zeros_vec = _mm256_setzero_si256();
+
+    // AVX2 has no instruction for unsigned 8-bit integer dot-products either.
+    // The closest option is the mixed signed-unsigned 8-bit dot-product:
+    //
+    //      __m256i ab_i16_vec = _mm256_maddubs_epi16(a_u8_vec, b_u8_vec);
+    //
+    // The problem with that approach is that `_mm256_maddubs_epi16` treats its second
+    // argument as signed bytes, so any value above 127 in `b` would be misinterpreted
+    // as negative, and the adjacent 16-bit sums saturate for large inputs.
+    // This can easily lead to noticeable numerical errors in the final result.
+    //
+    // So, just like in the unsigned dot-product kernel, we upcast both arguments to
+    // 16 bits with zero-extending unpacks and accumulate with `_mm256_madd_epi16`.
+    simsimd_size_t i = 0;
+    for (; i + 32 <= n; i += 32) {
+        __m256i a_u8_vec = _mm256_lddqu_si256((__m256i const*)(a + i));
+        __m256i b_u8_vec = _mm256_lddqu_si256((__m256i const*)(b + i));
+
+        // Upcast `uint8` to `int16`. Unlike the signed version, we can use the unpacking
+        // instructions instead of extracts, as they are much faster and more efficient.
+ __m256i a_i16_low_vec = _mm256_unpacklo_epi8(a_u8_vec, zeros_vec); + __m256i a_i16_high_vec = _mm256_unpackhi_epi8(a_u8_vec, zeros_vec); + __m256i b_i16_low_vec = _mm256_unpacklo_epi8(b_u8_vec, zeros_vec); + __m256i b_i16_high_vec = _mm256_unpackhi_epi8(b_u8_vec, zeros_vec); + // Multiply and accumulate as `int16`, accumulate products as `int32` ab_i32_low_vec = _mm256_add_epi32(ab_i32_low_vec, _mm256_madd_epi16(a_i16_low_vec, b_i16_low_vec)); ab_i32_high_vec = _mm256_add_epi32(ab_i32_high_vec, _mm256_madd_epi16(a_i16_high_vec, b_i16_high_vec)); @@ -1675,8 +1834,8 @@ SIMSIMD_PUBLIC void simsimd_l2sq_i8_ice(simsimd_i8_t const* a, simsimd_i8_t cons b_i16_vec = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(mask, b)); n = 0; } else { - a_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256(a)); - b_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256(b)); + a_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const*)a)); + b_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const*)b)); a += 32, b += 32, n -= 32; } d_i16s_vec = _mm512_sub_epi16(a_i16_vec, b_i16_vec); @@ -1701,8 +1860,8 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const b_i16_vec = _mm512_cvtepi8_epi16(_mm256_maskz_loadu_epi8(mask, b)); n = 0; } else { - a_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256(a)); - b_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256(b)); + a_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const*)a)); + b_i16_vec = _mm512_cvtepi8_epi16(_mm256_lddqu_si256((__m256i const*)b)); a += 32, b += 32, n -= 32; } @@ -1756,6 +1915,92 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const int b2 = _mm512_reduce_add_epi32(b2_i32_vec); *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2); } +SIMSIMD_PUBLIC void simsimd_l2_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, + simsimd_distance_t* result) { + simsimd_l2sq_u8_ice(a, b, n, result); + *result = _simsimd_sqrt_f32_haswell(*result); +} +SIMSIMD_PUBLIC void simsimd_l2sq_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, + simsimd_distance_t* result) { + __m512i d2_i32_low_vec = _mm512_setzero_si512(); + __m512i d2_i32_high_vec = _mm512_setzero_si512(); + __m512i const zeros_vec = _mm512_setzero_si512(); + __m512i d_i16_low_vec, d_i16_high_vec; + __m512i a_u8_vec, b_u8_vec, d_u8_vec; + +simsimd_l2sq_u8_ice_cycle: + if (n < 64) { + __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n); + a_u8_vec = _mm512_maskz_loadu_epi8(mask, a); + b_u8_vec = _mm512_maskz_loadu_epi8(mask, b); + n = 0; + } else { + a_u8_vec = _mm512_loadu_si512(a); + b_u8_vec = _mm512_loadu_si512(b); + a += 64, b += 64, n -= 64; + } + + // Substracting unsigned vectors in AVX-512 is done by saturating subtraction: + d_u8_vec = _mm512_or_si512(_mm512_subs_epu8(a_u8_vec, b_u8_vec), _mm512_subs_epu8(b_u8_vec, a_u8_vec)); + d_i16_low_vec = _mm512_unpacklo_epi8(d_u8_vec, zeros_vec); + d_i16_high_vec = _mm512_unpackhi_epi8(d_u8_vec, zeros_vec); + + // Multiply and accumulate at `int16` level, accumulate at `int32` level: + d2_i32_low_vec = _mm512_dpwssd_epi32(d2_i32_low_vec, d_i16_low_vec, d_i16_low_vec); + d2_i32_high_vec = _mm512_dpwssd_epi32(d2_i32_high_vec, d_i16_high_vec, d_i16_high_vec); + if (n) + goto simsimd_l2sq_u8_ice_cycle; + + *result = _mm512_reduce_add_epi32(_mm512_add_epi32(d2_i32_low_vec, d2_i32_high_vec)); +} + +SIMSIMD_PUBLIC void simsimd_cos_u8_ice(simsimd_u8_t const* a, simsimd_u8_t const* b, simsimd_size_t n, + simsimd_distance_t* result) { + 
+ __m512i ab_i32_low_vec = _mm512_setzero_si512(); + __m512i ab_i32_high_vec = _mm512_setzero_si512(); + __m512i a2_i32_low_vec = _mm512_setzero_si512(); + __m512i a2_i32_high_vec = _mm512_setzero_si512(); + __m512i b2_i32_low_vec = _mm512_setzero_si512(); + __m512i b2_i32_high_vec = _mm512_setzero_si512(); + __m512i const zeros_vec = _mm512_setzero_si512(); + __m512i a_i16_low_vec, a_i16_high_vec, b_i16_low_vec, b_i16_high_vec; + __m512i a_u8_vec, b_u8_vec; + +simsimd_cos_u8_ice_cycle: + if (n < 64) { + __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n); + a_u8_vec = _mm512_maskz_loadu_epi8(mask, a); + b_u8_vec = _mm512_maskz_loadu_epi8(mask, b); + n = 0; + } else { + a_u8_vec = _mm512_loadu_si512(a); + b_u8_vec = _mm512_loadu_si512(b); + a += 64, b += 64, n -= 64; + } + + // Upcast `uint8` to `int16`. Unlike the signed version, we can use the unpacking + // instructions instead of extracts, as they are much faster and more efficient. + a_i16_low_vec = _mm512_unpacklo_epi8(a_u8_vec, zeros_vec); + a_i16_high_vec = _mm512_unpackhi_epi8(a_u8_vec, zeros_vec); + b_i16_low_vec = _mm512_unpacklo_epi8(b_u8_vec, zeros_vec); + b_i16_high_vec = _mm512_unpackhi_epi8(b_u8_vec, zeros_vec); + + // Multiply and accumulate as `int16`, accumulate products as `int32`: + ab_i32_low_vec = _mm512_dpwssds_epi32(ab_i32_low_vec, a_i16_low_vec, b_i16_low_vec); + ab_i32_high_vec = _mm512_dpwssds_epi32(ab_i32_high_vec, a_i16_high_vec, b_i16_high_vec); + a2_i32_low_vec = _mm512_dpwssds_epi32(a2_i32_low_vec, a_i16_low_vec, a_i16_low_vec); + a2_i32_high_vec = _mm512_dpwssds_epi32(a2_i32_high_vec, a_i16_high_vec, a_i16_high_vec); + b2_i32_low_vec = _mm512_dpwssds_epi32(b2_i32_low_vec, b_i16_low_vec, b_i16_low_vec); + b2_i32_high_vec = _mm512_dpwssds_epi32(b2_i32_high_vec, b_i16_high_vec, b_i16_high_vec); + if (n) + goto simsimd_cos_u8_ice_cycle; + + int ab = _mm512_reduce_add_epi32(_mm512_add_epi32(ab_i32_low_vec, ab_i32_high_vec)); + int a2 = _mm512_reduce_add_epi32(_mm512_add_epi32(a2_i32_low_vec, a2_i32_high_vec)); + int b2 = _mm512_reduce_add_epi32(_mm512_add_epi32(b2_i32_low_vec, b2_i32_high_vec)); + *result = _simsimd_cos_normalize_f32_haswell(ab, a2, b2); +} SIMSIMD_PUBLIC void simsimd_l2_i4x2_ice(simsimd_i4x2_t const* a, simsimd_i4x2_t const* b, simsimd_size_t n_words, simsimd_distance_t* result) { diff --git a/scripts/bench.cxx b/scripts/bench.cxx index 93f7761e..b438e669 100644 --- a/scripts/bench.cxx +++ b/scripts/bench.cxx @@ -656,9 +656,13 @@ int main(int argc, char** argv) { dense_("cos_f64_neon", simsimd_cos_f64_neon, simsimd_cos_f64_serial); dense_("l2sq_f64_neon", simsimd_l2sq_f64_neon, simsimd_l2sq_f64_serial); - dense_("cos_i8_neon", simsimd_cos_i8_neon, simsimd_cos_i8_accurate); + dense_("cos_i8_neon", simsimd_cos_i8_neon, simsimd_cos_i8_serial); + dense_("l2sq_i8_neon", simsimd_l2sq_i8_neon, simsimd_l2sq_i8_serial); dense_("dot_i8_neon", simsimd_dot_i8_neon, simsimd_dot_i8_serial); - dense_("l2sq_i8_neon", simsimd_l2sq_i8_neon, simsimd_l2sq_i8_accurate); + + dense_("cos_u8_neon", simsimd_cos_u8_neon, simsimd_cos_u8_serial); + dense_("l2sq_u8_neon", simsimd_l2sq_u8_neon, simsimd_l2sq_u8_serial); + dense_("dot_u8_neon", simsimd_dot_u8_neon, simsimd_dot_u8_serial); dense_("hamming_b8_neon", simsimd_hamming_b8_neon, simsimd_hamming_b8_serial); dense_("jaccard_b8_neon", simsimd_jaccard_b8_neon, simsimd_jaccard_b8_serial); @@ -724,9 +728,13 @@ int main(int argc, char** argv) { dense_("cos_bf16_haswell", simsimd_cos_bf16_haswell, simsimd_cos_bf16_accurate); dense_("l2sq_bf16_haswell", 
simsimd_l2sq_bf16_haswell, simsimd_l2sq_bf16_accurate); - dense_("cos_i8_haswell", simsimd_cos_i8_haswell, simsimd_cos_i8_accurate); + dense_("cos_i8_haswell", simsimd_cos_i8_haswell, simsimd_cos_i8_serial); + dense_("l2sq_i8_haswell", simsimd_l2sq_i8_haswell, simsimd_l2sq_i8_serial); dense_("dot_i8_haswell", simsimd_dot_i8_haswell, simsimd_dot_i8_serial); - dense_("l2sq_i8_haswell", simsimd_l2sq_i8_haswell, simsimd_l2sq_i8_accurate); + + dense_("cos_u8_haswell", simsimd_cos_u8_haswell, simsimd_cos_u8_serial); + dense_("l2sq_u8_haswell", simsimd_l2sq_u8_haswell, simsimd_l2sq_u8_serial); + dense_("dot_u8_haswell", simsimd_dot_u8_haswell, simsimd_dot_u8_serial); dense_("hamming_b8_haswell", simsimd_hamming_b8_haswell, simsimd_hamming_b8_serial); dense_("jaccard_b8_haswell", simsimd_jaccard_b8_haswell, simsimd_jaccard_b8_serial); @@ -767,9 +775,13 @@ int main(int argc, char** argv) { #endif #if SIMSIMD_TARGET_ICE - dense_("cos_i8_ice", simsimd_cos_i8_ice, simsimd_cos_i8_accurate); + dense_("cos_i8_ice", simsimd_cos_i8_ice, simsimd_cos_i8_serial); + dense_("l2sq_i8_ice", simsimd_l2sq_i8_ice, simsimd_l2sq_i8_serial); dense_("dot_i8_ice", simsimd_dot_i8_ice, simsimd_dot_i8_serial); - dense_("l2sq_i8_ice", simsimd_l2sq_i8_ice, simsimd_l2sq_i8_accurate); + + dense_("cos_u8_ice", simsimd_cos_u8_ice, simsimd_cos_u8_serial); + dense_("l2sq_u8_ice", simsimd_l2sq_u8_ice, simsimd_l2sq_u8_serial); + dense_("dot_u8_ice", simsimd_dot_u8_ice, simsimd_dot_u8_serial); dense_("dot_f64_skylake", simsimd_dot_f64_skylake, simsimd_dot_f64_serial); dense_("cos_f64_skylake", simsimd_cos_f64_skylake, simsimd_cos_f64_serial); @@ -831,9 +843,13 @@ int main(int argc, char** argv) { dense_("cos_f64_serial", simsimd_cos_f64_serial, simsimd_cos_f64_serial); dense_("l2sq_f64_serial", simsimd_l2sq_f64_serial, simsimd_l2sq_f64_serial); - dense_("cos_i8_serial", simsimd_cos_i8_serial, simsimd_cos_i8_accurate); + dense_("cos_i8_serial", simsimd_cos_i8_serial, simsimd_cos_i8_serial); + dense_("l2sq_i8_serial", simsimd_l2sq_i8_serial, simsimd_l2sq_i8_serial); dense_("dot_i8_serial", simsimd_dot_i8_serial, simsimd_dot_i8_serial); - dense_("l2sq_i8_serial", simsimd_l2sq_i8_serial, simsimd_l2sq_i8_accurate); + + dense_("cos_u8_serial", simsimd_cos_u8_serial, simsimd_cos_u8_serial); + dense_("l2sq_u8_serial", simsimd_l2sq_u8_serial, simsimd_l2sq_u8_serial); + dense_("dot_u8_serial", simsimd_dot_u8_serial, simsimd_dot_u8_serial); dense_("dot_f64c_serial", simsimd_dot_f64c_serial, simsimd_dot_f64c_serial); dense_("dot_f32c_serial", simsimd_dot_f32c_serial, simsimd_dot_f32c_accurate); diff --git a/scripts/test.py b/scripts/test.py index 4076374e..2cff738c 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -51,7 +51,6 @@ import simsimd as simd - # NumPy is available on most platforms and is required for most tests. # When using PyPy on some platforms NumPy has internal issues, that will # raise a weird error, not an `ImportError`. That's why we intentionally @@ -356,10 +355,22 @@ def collect_warnings(message: str, stats: dict): # We will run all the tests many times using different instruction sets under the hood. 
available_capabilities: Dict[str, str] = simd.get_capabilities() possible_x86_capabilities: List[str] = ["haswell", "ice", "skylake", "sapphire", "turin", "genoa", "sierra"] -possible_arm_capabilities: List[str] = ["neon", "neon_f16", "neon_bf16", "neon_i8", "sve", "sve_f16", "sve_bf16", "sve_i8"] +possible_arm_capabilities: List[str] = [ + "neon", + "neon_f16", + "neon_bf16", + "neon_i8", + "sve", + "sve_f16", + "sve_bf16", + "sve_i8", +] possible_x86_capabilities: List[str] = [c for c in possible_x86_capabilities if available_capabilities[c]] possible_arm_capabilities: List[str] = [c for c in possible_arm_capabilities if available_capabilities[c]] -possible_capabilities: List[str] = possible_x86_capabilities if platform.machine() == "x86_64" else possible_arm_capabilities +possible_capabilities: List[str] = ( + possible_x86_capabilities if platform.machine() == "x86_64" else possible_arm_capabilities +) + def keep_one_capability(cap: str): assert cap in possible_capabilities @@ -446,6 +457,10 @@ def test_pointers_availability(): assert simd.pointer_to_cosine("i8") != 0 assert simd.pointer_to_inner("i8") != 0 + assert simd.pointer_to_sqeuclidean("u8") != 0 + assert simd.pointer_to_cosine("u8") != 0 + assert simd.pointer_to_inner("u8") != 0 + def test_capabilities_list(): """Tests the visibility of hardware capabilities.""" @@ -698,16 +713,21 @@ def test_curved_bf16(ndim, metric, capability, stats_fixture): @pytest.mark.skipif(not numpy_available, reason="NumPy is not installed") @pytest.mark.repeat(50) @pytest.mark.parametrize("ndim", [11, 97, 1536]) +@pytest.mark.parametrize("dtype", ["int8", "uint8"]) @pytest.mark.parametrize("metric", ["inner", "euclidean", "sqeuclidean", "cosine"]) @pytest.mark.parametrize("capability", possible_capabilities) -def test_dense_i8(ndim, metric, capability, stats_fixture): +def test_dense_i8(ndim, dtype, metric, capability, stats_fixture): """Compares various SIMD kernels (like Dot-products, squared Euclidean, and Cosine distances) with their NumPy or baseline counterparts, testing accuracy for small integer types, that can't be directly processed with other tools without overflowing.""" np.random.seed() - a = np.random.randint(-128, 127, size=(ndim), dtype=np.int8) - b = np.random.randint(-128, 127, size=(ndim), dtype=np.int8) + if dtype == "int8": + a = np.random.randint(-128, 127, size=(ndim), dtype=np.int8) + b = np.random.randint(-128, 127, size=(ndim), dtype=np.int8) + else: + a = np.random.randint(0, 255, size=(ndim), dtype=np.uint8) + b = np.random.randint(0, 255, size=(ndim), dtype=np.uint8) keep_one_capability(capability) baseline_kernel, simd_kernel = name_to_kernels(metric) @@ -719,8 +739,10 @@ def test_dense_i8(ndim, metric, capability, stats_fixture): if metric == "inner": assert round(float(result)) == round(float(expected)), f"Expected {expected}, but got {result}" else: - np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL), f"Expected {expected}, but got {result}" - collect_errors(metric, ndim, "int8", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture) + np.testing.assert_allclose( + result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL + ), f"Expected {expected}, but got {result}" + collect_errors(metric, ndim, dtype, accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture) #! Fun fact: SciPy doesn't actually raise an `OverflowError` when overflow happens #! here, instead it raises `ValueError: math domain error` during the `sqrt` operation.
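For context, here is a minimal calling sketch for the new `u8` entry points introduced by this patch. It assumes the usual `<simsimd/simsimd.h>` include path and makes up two small quantized vectors; only the `simsimd_*_u8` functions and the `simsimd_u8_t`/`simsimd_distance_t` types come from the library, everything else is illustrative:

```c
#include <stdio.h>
#include <simsimd/simsimd.h>

int main(void) {
    enum { n = 1536 }; // same dimensionality as the README benchmarks
    simsimd_u8_t a[n], b[n];
    for (int i = 0; i != n; ++i)
        a[i] = (simsimd_u8_t)(i % 251), b[i] = (simsimd_u8_t)((i * 7) % 251);

    simsimd_distance_t dot_product, cosine_distance, sq_euclidean;
    simsimd_dot_u8(a, b, n, &dot_product);     // dispatches to NEON, Ice Lake, Haswell, or serial code
    simsimd_cos_u8(a, b, n, &cosine_distance); // accumulates in `i32` to avoid `u8` overflows
    simsimd_l2sq_u8(a, b, n, &sq_euclidean);

    printf("dot: %.0f, cos: %.6f, l2sq: %.0f\n", dot_product, cosine_distance, sq_euclidean);
    return 0;
}
```

When linked against the dynamic library, these calls go through the runtime capability-based dispatch; with the header-only build, the `#if SIMSIMD_TARGET_*` branches shown in `simsimd.h` above pick the backend at compile time.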