diff --git a/benchs/bench_cppcontrib_sa_decode.cpp b/benchs/bench_cppcontrib_sa_decode.cpp index 524ce17b5a..2de468b3e1 100644 --- a/benchs/bench_cppcontrib_sa_decode.cpp +++ b/benchs/bench_cppcontrib_sa_decode.cpp @@ -20,9 +20,7 @@ #include #include #include -#include #include -#include #include #include diff --git a/benchs/bench_fw/descriptors.py b/benchs/bench_fw/descriptors.py index 7af170e20c..f6164f54e2 100644 --- a/benchs/bench_fw/descriptors.py +++ b/benchs/bench_fw/descriptors.py @@ -214,6 +214,7 @@ class CodecDescriptor(IndexBaseDescriptor): factory: Optional[str] = None construction_params: Optional[List[Dict[str, int]]] = None training_vectors: Optional[DatasetDescriptor] = None + FILENAME_PREFIX: str = "xt" def __post_init__(self): self.get_name() @@ -254,7 +255,7 @@ def name_from_factory(self) -> str: name += f"d_{self.d}.{self.metric.upper()}." if self.factory != "Flat": assert self.training_vectors is not None - name += self.training_vectors.get_filename("xt") + name += self.training_vectors.get_filename(CodecDescriptor.FILENAME_PREFIX) name += IndexBaseDescriptor.param_dict_list_to_name(self.construction_params) return name @@ -278,6 +279,7 @@ def alias(self, benchmark_io: BenchmarkIO): class IndexDescriptor(IndexBaseDescriptor): codec_desc: Optional[CodecDescriptor] = None database_desc: Optional[DatasetDescriptor] = None + FILENAME_PREFIX: str = "xb" def __hash__(self): return hash(str(self)) @@ -290,14 +292,14 @@ def is_built(self): def get_name(self) -> str: if self.desc_name is None: - self.desc_name = self.codec_desc.get_name() + self.database_desc.get_filename(prefix="xb") + self.desc_name = self.codec_desc.get_name() + self.database_desc.get_filename(prefix=IndexDescriptor.FILENAME_PREFIX) return self.desc_name def flat_name(self): if self.flat_desc_name is not None: return self.flat_desc_name - self.flat_desc_name = self.codec_desc.flat_name() + self.database_desc.get_filename(prefix="xb") + self.flat_desc_name = self.codec_desc.flat_name() + self.database_desc.get_filename(prefix=IndexDescriptor.FILENAME_PREFIX) return self.flat_desc_name # alias is used to refer when index is uploaded to blobstore and refered again @@ -313,6 +315,7 @@ class KnnDescriptor(IndexBaseDescriptor): query_dataset: Optional[DatasetDescriptor] = None search_params: Optional[Dict[str, int]] = None reconstruct: bool = False + FILENAME_PREFIX: str = "q" # range metric definitions # key: name # value: one of the following: @@ -340,7 +343,7 @@ def __hash__(self): def get_name(self): name = self.index_desc.get_name() name += IndexBaseDescriptor.param_dict_to_name(self.search_params) - name += self.query_dataset.get_filename("q") + name += self.query_dataset.get_filename(KnnDescriptor.FILENAME_PREFIX) name += f"k_{self.k}." name += f"t_{self.num_threads}." if self.reconstruct: @@ -353,7 +356,7 @@ def flat_name(self): if self.flat_desc_name is not None: return self.flat_desc_name name = self.index_desc.flat_name() - name += self.query_dataset.get_filename("q") + name += self.query_dataset.get_filename(KnnDescriptor.FILENAME_PREFIX) name += f"k_{self.k}." name += f"t_{self.num_threads}." if self.reconstruct: diff --git a/benchs/bench_hamming_computer.cpp b/benchs/bench_hamming_computer.cpp index 9b0f1a74de..641d5ff04b 100644 --- a/benchs/bench_hamming_computer.cpp +++ b/benchs/bench_hamming_computer.cpp @@ -5,7 +5,6 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include #include #include diff --git a/benchs/bench_heap_replace.cpp b/benchs/bench_heap_replace.cpp index 90bdd699cb..1b3700fa3d 100644 --- a/benchs/bench_heap_replace.cpp +++ b/benchs/bench_heap_replace.cpp @@ -5,7 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -#include #include #include diff --git a/contrib/torch_utils.py b/contrib/torch_utils.py index 5568901d84..797c02656c 100644 --- a/contrib/torch_utils.py +++ b/contrib/torch_utils.py @@ -56,6 +56,13 @@ def swig_ptr_from_FloatTensor(x): return faiss.cast_integer_to_float_ptr( x.untyped_storage().data_ptr() + x.storage_offset() * 4) +def swig_ptr_from_BFloat16Tensor(x): + """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ + assert x.is_contiguous() + assert x.dtype == torch.bfloat16 + return faiss.cast_integer_to_void_ptr( + x.untyped_storage().data_ptr() + x.storage_offset() * 2) + def swig_ptr_from_IntTensor(x): """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ @@ -606,8 +613,11 @@ def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRI elif xb.dtype == torch.float16: xb_type = faiss.DistanceDataType_F16 xb_ptr = swig_ptr_from_HalfTensor(xb) + elif xb.dtype == torch.bfloat16: + xb_type = faiss.DistanceDataType_BF16 + xb_ptr = swig_ptr_from_BFloat16Tensor(xb) else: - raise TypeError('xb must be f32 or f16') + raise TypeError('xb must be float32, float16 or bfloat16') nq, d2 = xq.size() assert d2 == d @@ -625,8 +635,11 @@ def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRI elif xq.dtype == torch.float16: xq_type = faiss.DistanceDataType_F16 xq_ptr = swig_ptr_from_HalfTensor(xq) + elif xq.dtype == torch.bfloat16: + xq_type = faiss.DistanceDataType_BF16 + xq_ptr = swig_ptr_from_BFloat16Tensor(xq) else: - raise TypeError('xq must be f32 or f16') + raise TypeError('xq must be float32, float16 or bfloat16') if D is None: D = torch.empty(nq, k, device=xb.device, dtype=torch.float32) diff --git a/demos/demo_imi_flat.cpp b/demos/demo_imi_flat.cpp index 4de4670baf..a7f70bca13 100644 --- a/demos/demo_imi_flat.cpp +++ b/demos/demo_imi_flat.cpp @@ -15,7 +15,6 @@ #include #include #include -#include double elapsed() { struct timeval tv; diff --git a/demos/demo_sift1M.cpp b/demos/demo_sift1M.cpp index 15032c2aca..0235be0af2 100644 --- a/demos/demo_sift1M.cpp +++ b/demos/demo_sift1M.cpp @@ -12,8 +12,6 @@ #include #include -#include -#include #include diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index a89082facd..5e91038eb7 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -125,6 +125,7 @@ set(FAISS_HEADERS IndexIVFPQR.h IndexIVFSpectralHash.h IndexLSH.h + IndexNeuralNetCodec.h IndexLattice.h IndexNNDescent.h IndexNSG.h @@ -168,6 +169,7 @@ set(FAISS_HEADERS impl/ScalarQuantizer.h impl/ThreadedIndex-inl.h impl/ThreadedIndex.h + impl/index_read_utils.h impl/io.h impl/io_macros.h impl/kmeans1d.h @@ -179,13 +181,17 @@ set(FAISS_HEADERS impl/code_distance/code_distance.h impl/code_distance/code_distance-generic.h impl/code_distance/code_distance-avx2.h + impl/code_distance/code_distance-avx512.h + impl/code_distance/code_distance-sve.h invlists/BlockInvertedLists.h invlists/DirectMap.h invlists/InvertedLists.h invlists/InvertedListsIOHook.h + invlists/OnDiskInvertedLists.h utils/AlignedTable.h utils/bf16.h utils/Heap.h + utils/NeuralNet.h utils/WorkerThread.h utils/distances.h utils/extra_distances-inl.h @@ -204,8 +210,10 @@ set(FAISS_HEADERS utils/sorting.h utils/simdlib.h utils/simdlib_avx2.h + 
utils/simdlib_avx512.h utils/simdlib_emulated.h utils/simdlib_neon.h + utils/simdlib_ppc64.h utils/utils.h utils/distances_fused/avx512.h utils/distances_fused/distances_fused.h @@ -216,6 +224,7 @@ set(FAISS_HEADERS utils/approx_topk/mode.h utils/approx_topk_hamming/approx_topk_hamming.h utils/transpose/transpose-avx2-inl.h + utils/transpose/transpose-avx512-inl.h utils/hamming_distance/common.h utils/hamming_distance/generic-inl.h utils/hamming_distance/hamdis-inl.h diff --git a/faiss/IndexAdditiveQuantizer.cpp b/faiss/IndexAdditiveQuantizer.cpp index eb45a9684c..a7ce3fce00 100644 --- a/faiss/IndexAdditiveQuantizer.cpp +++ b/faiss/IndexAdditiveQuantizer.cpp @@ -16,7 +16,6 @@ #include #include #include -#include namespace faiss { diff --git a/faiss/IndexAdditiveQuantizerFastScan.cpp b/faiss/IndexAdditiveQuantizerFastScan.cpp index cb3097ac22..2e3bd7c2f4 100644 --- a/faiss/IndexAdditiveQuantizerFastScan.cpp +++ b/faiss/IndexAdditiveQuantizerFastScan.cpp @@ -8,11 +8,8 @@ #include #include -#include #include -#include - #include #include #include diff --git a/faiss/IndexBinaryFlat.cpp b/faiss/IndexBinaryFlat.cpp index ee15e59f17..f6e2e218c0 100644 --- a/faiss/IndexBinaryFlat.cpp +++ b/faiss/IndexBinaryFlat.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include namespace faiss { diff --git a/faiss/IndexBinaryHNSW.cpp b/faiss/IndexBinaryHNSW.cpp index 73853c4e75..119d6c0438 100644 --- a/faiss/IndexBinaryHNSW.cpp +++ b/faiss/IndexBinaryHNSW.cpp @@ -15,11 +15,6 @@ #include #include -#include -#include - -#include -#include #include #include diff --git a/faiss/IndexBinaryHash.cpp b/faiss/IndexBinaryHash.cpp index 0d514435f0..9cbb4635dd 100644 --- a/faiss/IndexBinaryHash.cpp +++ b/faiss/IndexBinaryHash.cpp @@ -19,7 +19,6 @@ #include #include -#include namespace faiss { diff --git a/faiss/IndexFastScan.cpp b/faiss/IndexFastScan.cpp index cc4f1bddcb..8d10ecc9d9 100644 --- a/faiss/IndexFastScan.cpp +++ b/faiss/IndexFastScan.cpp @@ -17,11 +17,7 @@ #include #include #include -#include -#include #include -#include -#include #include #include diff --git a/faiss/IndexFlat.cpp b/faiss/IndexFlat.cpp index e1fcc6d999..094d268d4c 100644 --- a/faiss/IndexFlat.cpp +++ b/faiss/IndexFlat.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include namespace faiss { diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index 23922cc0b8..6a40196f00 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -19,10 +19,7 @@ #include #include #include -#include -#include -#include #include #include @@ -31,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/faiss/IndexIDMap.cpp b/faiss/IndexIDMap.cpp index 8ad51d7588..7c2a7ff01a 100644 --- a/faiss/IndexIDMap.cpp +++ b/faiss/IndexIDMap.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include diff --git a/faiss/IndexIVF.cpp b/faiss/IndexIVF.cpp index 004bbdc03c..fe6dbfa2c6 100644 --- a/faiss/IndexIVF.cpp +++ b/faiss/IndexIVF.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/faiss/IndexIVFAdditiveQuantizer.cpp b/faiss/IndexIVFAdditiveQuantizer.cpp index 01ae9a072b..866b542441 100644 --- a/faiss/IndexIVFAdditiveQuantizer.cpp +++ b/faiss/IndexIVFAdditiveQuantizer.cpp @@ -16,7 +16,6 @@ #include #include #include -#include namespace faiss { diff --git a/faiss/IndexIVFAdditiveQuantizerFastScan.cpp b/faiss/IndexIVFAdditiveQuantizerFastScan.cpp index 9bc0abd15d..91135a813f 100644 --- a/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +++ 
b/faiss/IndexIVFAdditiveQuantizerFastScan.cpp @@ -11,8 +11,6 @@ #include #include -#include - #include #include @@ -23,7 +21,6 @@ #include #include #include -#include #include namespace faiss { diff --git a/faiss/IndexIVFFastScan.cpp b/faiss/IndexIVFFastScan.cpp index da7f8fb741..6c140c554f 100644 --- a/faiss/IndexIVFFastScan.cpp +++ b/faiss/IndexIVFFastScan.cpp @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp index c1adf981ac..e9d6eead2c 100644 --- a/faiss/IndexIVFPQ.cpp +++ b/faiss/IndexIVFPQ.cpp @@ -22,7 +22,6 @@ #include #include -#include #include diff --git a/faiss/IndexIVFPQFastScan.cpp b/faiss/IndexIVFPQFastScan.cpp index 438edd9057..9d1cdfcae3 100644 --- a/faiss/IndexIVFPQFastScan.cpp +++ b/faiss/IndexIVFPQFastScan.cpp @@ -11,21 +11,17 @@ #include #include -#include - #include #include #include #include #include -#include #include #include #include -#include namespace faiss { diff --git a/faiss/IndexIVFSpectralHash.cpp b/faiss/IndexIVFSpectralHash.cpp index ab8cd8221f..c0a6c0e914 100644 --- a/faiss/IndexIVFSpectralHash.cpp +++ b/faiss/IndexIVFSpectralHash.cpp @@ -19,7 +19,6 @@ #include #include #include -#include namespace faiss { diff --git a/faiss/IndexLSH.cpp b/faiss/IndexLSH.cpp index 1f44c6116c..a2d29f8173 100644 --- a/faiss/IndexLSH.cpp +++ b/faiss/IndexLSH.cpp @@ -15,7 +15,6 @@ #include #include -#include namespace faiss { diff --git a/faiss/IndexNSG.cpp b/faiss/IndexNSG.cpp index bd0b00e865..3f92a61397 100644 --- a/faiss/IndexNSG.cpp +++ b/faiss/IndexNSG.cpp @@ -9,8 +9,6 @@ #include -#include - #include #include @@ -18,7 +16,6 @@ #include #include #include -#include #include namespace faiss { diff --git a/faiss/IndexPQFastScan.cpp b/faiss/IndexPQFastScan.cpp index 5eb716df1a..153a881bde 100644 --- a/faiss/IndexPQFastScan.cpp +++ b/faiss/IndexPQFastScan.cpp @@ -8,11 +8,8 @@ #include #include -#include #include -#include - #include #include #include diff --git a/faiss/IndexRefine.cpp b/faiss/IndexRefine.cpp index 01dd97a08b..8bc429a5e9 100644 --- a/faiss/IndexRefine.cpp +++ b/faiss/IndexRefine.cpp @@ -11,8 +11,6 @@ #include #include #include -#include -#include namespace faiss { diff --git a/faiss/IndexScalarQuantizer.cpp b/faiss/IndexScalarQuantizer.cpp index ecb4fefe1e..19c61593fc 100644 --- a/faiss/IndexScalarQuantizer.cpp +++ b/faiss/IndexScalarQuantizer.cpp @@ -14,7 +14,6 @@ #include -#include #include #include #include diff --git a/faiss/MetaIndexes.cpp b/faiss/MetaIndexes.cpp index 3577ca1309..a2f1efbd90 100644 --- a/faiss/MetaIndexes.cpp +++ b/faiss/MetaIndexes.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include #include diff --git a/faiss/gpu/GpuAutoTune.cpp b/faiss/gpu/GpuAutoTune.cpp index 8cdb3210fa..fed0132d79 100644 --- a/faiss/gpu/GpuAutoTune.cpp +++ b/faiss/gpu/GpuAutoTune.cpp @@ -17,10 +17,7 @@ #include #include #include -#include #include -#include -#include namespace faiss { namespace gpu { diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index cb722d5860..e356ac2779 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -30,6 +30,7 @@ #include #include #include +#include #include #if defined USE_NVIDIA_CUVS @@ -231,7 +232,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { FAISS_THROW_IF_NOT_MSG( args.vectorType == args.queryType, "limitation: both vectorType and queryType must currently " - "be the same (F32 or F16"); + "be the same (F32 / F16 / BF16)"); #if defined USE_NVIDIA_CUVS // Note: For
now, cuVS bfknn requires queries and vectors to be same layout @@ -390,6 +391,17 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { bfKnnConvert<float>(prov, args); } else if (args.vectorType == DistanceDataType::F16) { bfKnnConvert<half>(prov, args); + } else if (args.vectorType == DistanceDataType::BF16) { +// no bf16 support for AMD +#ifndef USE_AMD_ROCM + if (prov->getResources()->supportsBFloat16CurrentDevice()) { + bfKnnConvert<__nv_bfloat16>(prov, args); + } else { + FAISS_THROW_MSG("not compiled with bfloat16 support"); + } +#else + FAISS_THROW_MSG("no AMD bfloat16 support"); +#endif } else { FAISS_THROW_MSG("unknown vectorType"); } @@ -456,8 +468,10 @@ void bfKnn_single_query_shard( args.k > 0, "bfKnn_tiling: tiling vectors is only supported for k > 0"); size_t distance_size = args.vectorType == DistanceDataType::F32 ? 4 - : args.vectorType == DistanceDataType::F16 ? 2 - : 0; + : (args.vectorType == DistanceDataType::F16 || + args.vectorType == DistanceDataType::BF16) + ? 2 + : 0; FAISS_THROW_IF_NOT_MSG( distance_size > 0, "bfKnn_tiling: unknown vectorType"); size_t shard_size = vectorsMemoryLimit / (args.dims * distance_size); @@ -514,8 +528,10 @@ void bfKnn_tiling( args.k > 0, "bfKnn_tiling: tiling queries is only supported for k > 0"); size_t distance_size = args.queryType == DistanceDataType::F32 ? 4 - : args.queryType == DistanceDataType::F16 ? 2 - : 0; + : (args.queryType == DistanceDataType::F16 || + args.queryType == DistanceDataType::BF16) + ? 2 + : 0; FAISS_THROW_IF_NOT_MSG( distance_size > 0, "bfKnn_tiling: unknown queryType"); size_t label_size = args.outIndicesType == IndicesDataType::I64 ? 8 diff --git a/faiss/gpu/GpuDistance.h b/faiss/gpu/GpuDistance.h index 7052fc68b0..e4daf5e296 100644 --- a/faiss/gpu/GpuDistance.h +++ b/faiss/gpu/GpuDistance.h @@ -19,6 +19,7 @@ class GpuResourcesProvider; enum class DistanceDataType { F32 = 1, F16, + BF16, }; // Scalar type of the indices data diff --git a/faiss/gpu/GpuResources.cpp b/faiss/gpu/GpuResources.cpp index 83c9a8480c..8d39fa65af 100644 --- a/faiss/gpu/GpuResources.cpp +++ b/faiss/gpu/GpuResources.cpp @@ -161,6 +161,10 @@ GpuMemoryReservation::~GpuMemoryReservation() { GpuResources::~GpuResources() = default; +bool GpuResources::supportsBFloat16CurrentDevice() { + return supportsBFloat16(getCurrentDevice()); +} + cublasHandle_t GpuResources::getBlasHandleCurrentDevice() { return getBlasHandle(getCurrentDevice()); } diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h index 2f01c1781b..c0c851a892 100644 --- a/faiss/gpu/GpuResources.h +++ b/faiss/gpu/GpuResources.h @@ -205,6 +205,9 @@ class GpuResources { /// of demand virtual void initializeForDevice(int device) = 0; + /// Does the given GPU support bfloat16? + virtual bool supportsBFloat16(int device) = 0; + /// Returns the cuBLAS handle that we use for the given device virtual cublasHandle_t getBlasHandle(int device) = 0; @@ -252,6 +255,9 @@ class GpuResources { /// Functions provided by default /// + /// Does the current GPU support bfloat16? + bool supportsBFloat16CurrentDevice(); + /// Calls getBlasHandle with the current device cublasHandle_t getBlasHandleCurrentDevice(); diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index a91c7f693c..39ee38efa9 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -206,6 +206,13 @@ size_t StandardGpuResourcesImpl::getDefaultTempMemForGPU( return requested; } +/// Does the given GPU support bfloat16? 
+bool StandardGpuResourcesImpl::supportsBFloat16(int device) { + initializeForDevice(device); + auto& prop = getDeviceProperties(device); + return prop.major >= 8; +} + void StandardGpuResourcesImpl::noTempMemory() { setTempMemory(0); } @@ -701,6 +708,14 @@ std::shared_ptr<GpuResources> StandardGpuResources::getResources() { return res_; } +bool StandardGpuResources::supportsBFloat16(int device) { + return res_->supportsBFloat16(device); +} + +bool StandardGpuResources::supportsBFloat16CurrentDevice() { + return res_->supportsBFloat16CurrentDevice(); +} + void StandardGpuResources::noTempMemory() { res_->noTempMemory(); } diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index 322a341a00..9c8cf4d55d 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -48,6 +48,9 @@ class StandardGpuResourcesImpl : public GpuResources { ~StandardGpuResourcesImpl() override; + /// Does the given GPU support bfloat16? + bool supportsBFloat16(int device) override; + /// Disable allocation of temporary memory; all temporary memory /// requests will call cudaMalloc / cudaFree at the point of use void noTempMemory(); @@ -199,6 +202,12 @@ class StandardGpuResources : public GpuResourcesProvider { std::shared_ptr<GpuResources> getResources() override; + /// Whether or not the given device supports native bfloat16 arithmetic + bool supportsBFloat16(int device); + /// Whether or not the current device supports native bfloat16 arithmetic + bool supportsBFloat16CurrentDevice(); + /// Disable allocation of temporary memory; all temporary memory /// requests will call cudaMalloc / cudaFree at the point of use void noTempMemory(); diff --git a/faiss/gpu/impl/Distance.cu b/faiss/gpu/impl/Distance.cu index 3ac99b2576..eb2e91e93e 100644 --- a/faiss/gpu/impl/Distance.cu +++ b/faiss/gpu/impl/Distance.cu @@ -504,6 +504,30 @@ void runAllPairwiseL2Distance( outDistances); } +// no bf16 support for AMD +#ifndef USE_AMD_ROCM +void runAllPairwiseL2Distance( + GpuResources* res, + cudaStream_t stream, + Tensor<__nv_bfloat16, 2, true>& vectors, + bool vectorsRowMajor, + Tensor<float, 1, true>* vectorNorms, + Tensor<__nv_bfloat16, 2, true>& queries, + bool queriesRowMajor, + Tensor<float, 2, true>& outDistances) { + runAllPairwiseDistance<__nv_bfloat16>( + true, + res, + stream, + vectors, + vectorsRowMajor, + vectorNorms, + queries, + queriesRowMajor, + outDistances); +} +#endif // USE_AMD_ROCM + void runAllPairwiseIPDistance( GpuResources* res, cudaStream_t stream, @@ -544,6 +568,29 @@ void runAllPairwiseIPDistance( outDistances); } +// no bf16 support for AMD +#ifndef USE_AMD_ROCM +void runAllPairwiseIPDistance( + GpuResources* res, + cudaStream_t stream, + Tensor<__nv_bfloat16, 2, true>& vectors, + bool vectorsRowMajor, + Tensor<__nv_bfloat16, 2, true>& queries, + bool queriesRowMajor, + Tensor<float, 2, true>& outDistances) { + runAllPairwiseDistance<__nv_bfloat16>( + false, + res, + stream, + vectors, + vectorsRowMajor, + nullptr, + queries, + queriesRowMajor, + outDistances); +} +#endif // USE_AMD_ROCM + void runL2Distance( GpuResources* res, cudaStream_t stream, @@ -596,6 +643,35 @@ void runL2Distance( ignoreOutDistances); } +// no bf16 support for AMD +#ifndef USE_AMD_ROCM +void runL2Distance( + GpuResources* res, + cudaStream_t stream, + Tensor<__nv_bfloat16, 2, true>& vectors, + bool vectorsRowMajor, + Tensor<float, 1, true>* vectorNorms, + Tensor<__nv_bfloat16, 2, true>& queries, + bool queriesRowMajor, + int k, + Tensor<float, 2, true>& outDistances, + Tensor<idx_t, 2, true>& outIndices, + bool ignoreOutDistances) { + runL2Distance<__nv_bfloat16>( + res, + stream, + vectors, 
vectorsRowMajor, + vectorNorms, + queries, + queriesRowMajor, + k, + outDistances, + outIndices, + ignoreOutDistances); +} +#endif // USE_AMD_ROCM + void runIPDistance( GpuResources* res, cudaStream_t stream, @@ -640,5 +716,30 @@ void runIPDistance( outIndices); } +// no bf16 support for AMD +#ifndef USE_AMD_ROCM +void runIPDistance( + GpuResources* res, + cudaStream_t stream, + Tensor<__nv_bfloat16, 2, true>& vectors, + bool vectorsRowMajor, + Tensor<__nv_bfloat16, 2, true>& queries, + bool queriesRowMajor, + int k, + Tensor<float, 2, true>& outDistances, + Tensor<idx_t, 2, true>& outIndices) { + runIPDistance<__nv_bfloat16>( + res, + stream, + vectors, + vectorsRowMajor, + queries, + queriesRowMajor, + k, + outDistances, + outIndices); +} +#endif // USE_AMD_ROCM + } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/Distance.cuh b/faiss/gpu/impl/Distance.cuh index 17d21f4d9a..d8e1d5c239 100644 --- a/faiss/gpu/impl/Distance.cuh +++ b/faiss/gpu/impl/Distance.cuh @@ -41,6 +41,19 @@ void runAllPairwiseL2Distance( bool queriesRowMajor, Tensor<float, 2, true>& outDistances); +// no bf16 support for AMD +#ifndef USE_AMD_ROCM +void runAllPairwiseL2Distance( + GpuResources* res, + cudaStream_t stream, + Tensor<__nv_bfloat16, 2, true>& vectors, + bool vectorsRowMajor, + Tensor<float, 1, true>* vectorNorms, + Tensor<__nv_bfloat16, 2, true>& queries, + bool queriesRowMajor, + Tensor<float, 2, true>& outDistances); +#endif // USE_AMD_ROCM + void runAllPairwiseIPDistance( GpuResources* res, cudaStream_t stream, @@ -59,6 +72,18 @@ void runAllPairwiseIPDistance( bool queriesRowMajor, Tensor<float, 2, true>& outDistances); +// no bf16 support for AMD +#ifndef USE_AMD_ROCM +void runAllPairwiseIPDistance( + GpuResources* res, + cudaStream_t stream, + Tensor<__nv_bfloat16, 2, true>& vectors, + bool vectorsRowMajor, + Tensor<__nv_bfloat16, 2, true>& queries, + bool queriesRowMajor, + Tensor<float, 2, true>& outDistances); +#endif // USE_AMD_ROCM + /// Calculates brute-force L2 distance between `vectors` and /// `queries`, returning the k closest results seen void runL2Distance( @@ -91,6 +116,22 @@ void runL2Distance( Tensor<idx_t, 2, true>& outIndices, bool ignoreOutDistances = false); +// no bf16 support for AMD +#ifndef USE_AMD_ROCM +void runL2Distance( + GpuResources* resources, + cudaStream_t stream, + Tensor<__nv_bfloat16, 2, true>& vectors, + bool vectorsRowMajor, + Tensor<float, 1, true>* vectorNorms, + Tensor<__nv_bfloat16, 2, true>& queries, + bool queriesRowMajor, + int k, + Tensor<float, 2, true>& outDistances, + Tensor<idx_t, 2, true>& outIndices, + bool ignoreOutDistances = false); +#endif // USE_AMD_ROCM + /// Calculates brute-force inner product distance between `vectors` /// and `queries`, returning the k closest results seen void runIPDistance( @@ -115,6 +156,20 @@ void runIPDistance( Tensor<float, 2, true>& outDistances, Tensor<idx_t, 2, true>& outIndices); +// no bf16 support for AMD +#ifndef USE_AMD_ROCM +void runIPDistance( + GpuResources* resources, + cudaStream_t stream, + Tensor<__nv_bfloat16, 2, true>& vectors, + bool vectorsRowMajor, + Tensor<__nv_bfloat16, 2, true>& queries, + bool queriesRowMajor, + int k, + Tensor<float, 2, true>& outDistances, + Tensor<idx_t, 2, true>& outIndices); +#endif // USE_AMD_ROCM + // // General distance implementation, assumes that all arguments are on the // device. 
This is the top-level internal distance function to call to dispatch diff --git a/faiss/gpu/impl/GeneralDistance.cuh b/faiss/gpu/impl/GeneralDistance.cuh index cc60794a04..208d3a81bf 100644 --- a/faiss/gpu/impl/GeneralDistance.cuh +++ b/faiss/gpu/impl/GeneralDistance.cuh @@ -151,10 +151,10 @@ __launch_bounds__(TILE_SIZE* TILE_SIZE) __global__ void generalDistance( bool kInBounds = k < query.getSize(1); queryTileBase[threadIdx.x + i * TILE_SIZE] = - kInBounds ? queryBase[k] : ConvertTo<T>::to(0); + kInBounds ? queryBase[k] : ConvertTo<T>::to(0.0f); vecTileBase[threadIdx.x + i * TILE_SIZE] = - kInBounds ? vecBase[k] : ConvertTo<T>::to(0); + kInBounds ? vecBase[k] : ConvertTo<T>::to(0.0f); } __syncthreads(); @@ -185,10 +185,10 @@ __launch_bounds__(TILE_SIZE* TILE_SIZE) __global__ void generalDistance( for (idx_t k = threadIdx.x; k < limit; k += TILE_SIZE) { // Load query tile queryTileBase[threadIdx.x] = - queryThreadInBounds ? queryBase[k] : ConvertTo<T>::to(0); + queryThreadInBounds ? queryBase[k] : ConvertTo<T>::to(0.0f); vecTileBase[threadIdx.x] = - vecThreadInBoundsLoad ? vecBase[k] : ConvertTo<T>::to(0); + vecThreadInBoundsLoad ? vecBase[k] : ConvertTo<T>::to(0.0f); __syncthreads(); @@ -211,11 +211,11 @@ __launch_bounds__(TILE_SIZE* TILE_SIZE) __global__ void generalDistance( // Load query tile queryTileBase[threadIdx.x] = queryThreadInBounds && kInBounds ? queryBase[k] - : ConvertTo<T>::to(0); + : ConvertTo<T>::to(0.0f); vecTileBase[threadIdx.x] = vecThreadInBoundsLoad && kInBounds ? vecBase[k] - : ConvertTo<T>::to(0); + : ConvertTo<T>::to(0.0f); __syncthreads(); diff --git a/faiss/gpu/impl/GpuScalarQuantizer.cuh b/faiss/gpu/impl/GpuScalarQuantizer.cuh index cb7454cf11..c2d781419d 100644 --- a/faiss/gpu/impl/GpuScalarQuantizer.cuh +++ b/faiss/gpu/impl/GpuScalarQuantizer.cuh @@ -154,7 +154,7 @@ struct Codec { inline __device__ void decode(void* data, idx_t vec, int d, float* out) const { half* p = (half*)&((uint8_t*)data)[vec * bytesPerVec]; - out[0] = Convert<half, float>()(p[d]); + out[0] = ConvertTo<float>::to(p[d]); } inline __device__ float decodePartial( @@ -172,7 +172,7 @@ struct Codec { int d, float v[kDimPerIter]) const { half* p = (half*)&((uint8_t*)data)[vec * bytesPerVec]; - p[d] = Convert<float, half>()(v[0]); + p[d] = ConvertTo<half>::to(v[0]); } inline __device__ void encodePartial( @@ -191,11 +191,11 @@ struct Codec { static constexpr int kEncodeBits = 16; inline __device__ EncodeT encodeNew(int dim, float v) const { - return Convert<float, half>()(v); + return ConvertTo<half>::to(v); } inline __device__ float decodeNew(int dim, EncodeT v) const { - return Convert<half, float>()(v); + return ConvertTo<float>::to(v); } int bytesPerVec; diff --git a/faiss/gpu/impl/L2Norm.cu b/faiss/gpu/impl/L2Norm.cu index 66eb06d0d7..262fa19153 100644 --- a/faiss/gpu/impl/L2Norm.cu +++ b/faiss/gpu/impl/L2Norm.cu @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -276,5 +275,18 @@ void runL2Norm( runL2Norm<half, half2>(input, inputRowMajor, output, normSquared, stream); } +// no bf16 support for AMD +#ifndef USE_AMD_ROCM +void runL2Norm( + Tensor<__nv_bfloat16, 2, true>& input, + bool inputRowMajor, + Tensor<float, 1, true>& output, + bool normSquared, + cudaStream_t stream) { + runL2Norm<__nv_bfloat16, __nv_bfloat162>( + input, inputRowMajor, output, normSquared, stream); +} +#endif + } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/L2Norm.cuh b/faiss/gpu/impl/L2Norm.cuh index fa798b75b7..79aef4f131 100644 --- a/faiss/gpu/impl/L2Norm.cuh +++ b/faiss/gpu/impl/L2Norm.cuh @@ -7,7 +7,7 @@ #pragma once -#include +#include #include namespace faiss { @@ -27,5 +27,15 @@ void 
runL2Norm( bool normSquared, cudaStream_t stream); +// no bf16 support for AMD +#ifndef USE_AMD_ROCM +void runL2Norm( + Tensor<__nv_bfloat16, 2, true>& input, + bool inputRowMajor, + Tensor<float, 1, true>& output, + bool normSquared, + cudaStream_t stream); +#endif + } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/VectorResidual.cu b/faiss/gpu/impl/VectorResidual.cu index cba7d9073c..425036552d 100644 --- a/faiss/gpu/impl/VectorResidual.cu +++ b/faiss/gpu/impl/VectorResidual.cu @@ -114,10 +114,8 @@ __global__ void gatherReconstructByIds( auto vec = vecs[id]; auto outVec = out[blockIdx.x]; - Convert<T, float> conv; - for (idx_t i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { - outVec[i] = id == idx_t(-1) ? 0.0f : conv(vec[i]); + outVec[i] = id == idx_t(-1) ? 0.0f : ConvertTo<float>::to(vec[i]); } } @@ -131,10 +129,8 @@ __global__ void gatherReconstructByRange( auto vec = vecs[id]; auto outVec = out[blockIdx.x]; - Convert<T, float> conv; - for (idx_t i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { - outVec[i] = id == idx_t(-1) ? 0.0f : conv(vec[i]); + outVec[i] = id == idx_t(-1) ? 0.0f : ConvertTo<float>::to(vec[i]); } } diff --git a/faiss/gpu/perf/PerfIVFPQAdd.cpp b/faiss/gpu/perf/PerfIVFPQAdd.cpp index 90caefcd6a..08aaf01a9a 100644 --- a/faiss/gpu/perf/PerfIVFPQAdd.cpp +++ b/faiss/gpu/perf/PerfIVFPQAdd.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include DEFINE_int32(batches, 10, "number of batches of vectors to add"); diff --git a/faiss/gpu/test/TestGpuDistance.cu b/faiss/gpu/test/TestGpuDistance.cu index 1c9a43ca18..6c5526b1c5 100644 --- a/faiss/gpu/test/TestGpuDistance.cu +++ b/faiss/gpu/test/TestGpuDistance.cu @@ -32,6 +32,13 @@ #include #include +enum class TestThresholds { + Normal, + BF16, + // Linf has worse error than the other metrics for bf16 + BF16_Linf, +}; + void evaluate_bfknn( faiss::gpu::GpuDistanceParams& args, faiss::gpu::GpuResourcesProvider* res, @@ -43,16 +50,39 @@ void evaluate_bfknn( int k, bool colMajorVecs, bool colMajorQueries, - faiss::MetricType metric) { + faiss::MetricType metric, + TestThresholds thresh = TestThresholds::Normal) { using namespace faiss::gpu; bfKnn(res, args); std::stringstream str; - str << "using cuVS " << args.use_cuvs << "metric " << metric + str << "using cuVS " << args.use_cuvs << " metric " << metric << " colMajorVecs " << colMajorVecs << " colMajorQueries " << colMajorQueries; + float maxRelativeError; + float pctMaxDiff1; + float pctMaxDiffN; + + switch (thresh) { + case TestThresholds::Normal: + maxRelativeError = 6e-3f; + pctMaxDiff1 = 0.1f; + pctMaxDiffN = 0.015f; + break; + case TestThresholds::BF16: + maxRelativeError = 1.5e-2f; + pctMaxDiff1 = 0.3f; + pctMaxDiffN = 0.1f; + break; + case TestThresholds::BF16_Linf: + maxRelativeError = 1.5e-2f; + pctMaxDiff1 = 0.53f; + pctMaxDiffN = 0.2f; + break; + } + compareLists( cpuDistance.data(), cpuIndices.data(), @@ -64,9 +94,9 @@ false, false, true, - 6e-3f, - 0.1f, - 0.015f); + maxRelativeError, + pctMaxDiff1, + pctMaxDiffN); } void testTransposition( @@ -82,6 +112,10 @@ StandardGpuResources res; res.noTempMemory(); + // The transpose and distance code assumes the desired device is already set + DeviceScope scope(device); + auto stream = res.getDefaultStream(device); + int dim = randVal(20, 150); int numVecs = randVal(10, 30000); int numQuery = randVal(1, 1024); @@ -120,10 +154,6 @@ cpuIndex.search( numQuery, queries.data(), k, cpuDistance.data(), cpuIndices.data()); - // The transpose and distance code assumes 
the desired device is already set - DeviceScope scope(device); - auto stream = res.getDefaultStream(device); - // Copy input data to GPU, and pre-transpose both vectors and queries for // passing auto gpuVecs = toDeviceNonTemporary<float, 2>( @@ -191,12 +221,161 @@ metric); } +void testTransposition_bf16( + bool colMajorVecs, + bool colMajorQueries, + faiss::MetricType metric, + bool use_raft = false, + float metricArg = 0) { + using namespace faiss::gpu; + +#ifdef USE_AMD_ROCM + std::cout << "skipping bfloat16 test (no bfloat16 support on AMD)\n"; + EXPECT_TRUE(true); + return; +#else + int device = randVal(0, getNumDevices() - 1); + + StandardGpuResources res; + if (!res.supportsBFloat16(device)) { + std::cout << "skipping bfloat16 test (no bfloat16 support on device)\n"; + return; + } + + res.noTempMemory(); + // The transpose and distance code assumes the desired device is already set + DeviceScope scope(device); + auto stream = res.getDefaultStream(device); + + int dim = randVal(20, 150); + int numVecs = randVal(10, 30000); + int numQuery = randVal(1, 1024); + int k = std::min(numVecs, randVal(20, 70)); + + // Input data for CPU + std::vector<float> vecs = randVecs(numVecs, dim); + std::vector<float> queries = randVecs(numQuery, dim); + + if ((metric == faiss::MetricType::METRIC_JensenShannon) || + (metric == faiss::MetricType::METRIC_Jaccard)) { + // make values positive + for (auto& v : vecs) { + v = std::abs(v); + if (v == 0) { + v = 1e-6; + } + } + + for (auto& q : queries) { + q = std::abs(q); + if (q == 0) { + q = 1e-6; + } + } + } + + // The CPU index is our reference for the results + faiss::IndexFlat cpuIndex(dim, metric); + cpuIndex.metric_arg = metricArg; + cpuIndex.add(numVecs, vecs.data()); + + std::vector<float> cpuDistance(numQuery * k, 0); + std::vector<faiss::idx_t> cpuIndices(numQuery * k, -1); + + cpuIndex.search( + numQuery, queries.data(), k, cpuDistance.data(), cpuIndices.data()); + + // Convert float32 data to bfloat16 via truncation, not rounding + // (just copy high 2 bytes) + std::vector<uint16_t> bf16_vecs(vecs.size()); + std::vector<uint16_t> bf16_queries(queries.size()); + + auto fn_f32_bf16 = [](float v) { + uint32_t vi; + std::memcpy(&vi, &v, sizeof(uint32_t)); + return uint16_t(vi >> 16); + }; + + std::transform(vecs.begin(), vecs.end(), bf16_vecs.begin(), fn_f32_bf16); + std::transform( queries.begin(), queries.end(), bf16_queries.begin(), fn_f32_bf16); + + // Copy input data to GPU, and pre-transpose both vectors and queries for + // passing. Just use uint16_t in lieu of __nv_bfloat16 + auto gpuVecs = toDeviceNonTemporary<uint16_t, 2>( + res.getResources().get(), + device, + bf16_vecs.data(), + stream, + {numVecs, dim}); + auto gpuQueries = toDeviceNonTemporary<uint16_t, 2>( + res.getResources().get(), + device, + bf16_queries.data(), + stream, + {numQuery, dim}); + + DeviceTensor<uint16_t, 2, true> vecsT( + res.getResources().get(), + makeDevAlloc(AllocType::Other, stream), + {dim, numVecs}); + runTransposeAny(gpuVecs, 0, 1, vecsT, stream); + + DeviceTensor<uint16_t, 2, true> queriesT( + res.getResources().get(), + makeDevAlloc(AllocType::Other, stream), + {dim, numQuery}); + runTransposeAny(gpuQueries, 0, 1, queriesT, stream); + + std::vector<float> gpuDistance(numQuery * k, 0); + std::vector<faiss::idx_t> gpuIndices(numQuery * k, -1); + + GpuDistanceParams args; + args.metric = metric; + args.metricArg = metricArg; + args.k = k; + args.dims = dim; + args.vectors = colMajorVecs ? vecsT.data() : gpuVecs.data(); + args.vectorType = DistanceDataType::BF16; + args.vectorsRowMajor = !colMajorVecs; + args.numVectors = numVecs; + args.queries = colMajorQueries ? 
queriesT.data() : gpuQueries.data(); + args.queryType = DistanceDataType::BF16; + args.queriesRowMajor = !colMajorQueries; + args.numQueries = numQuery; + args.outDistances = gpuDistance.data(); + args.outIndices = gpuIndices.data(); + args.device = device; + + evaluate_bfknn( + args, + &res, + cpuDistance, + cpuIndices, + gpuDistance, + gpuIndices, + numQuery, + k, + colMajorVecs, + colMajorQueries, + metric, + metric == faiss::MetricType::METRIC_Linf ? TestThresholds::BF16_Linf + : TestThresholds::BF16); +#endif +} + // Test different memory layouts for brute-force k-NN TEST(TestGpuDistance, Transposition_RR) { testTransposition(false, false, faiss::MetricType::METRIC_L2); testTransposition(false, false, faiss::MetricType::METRIC_INNER_PRODUCT); } +TEST(TestGpuDistance, Transposition_RR_BF16) { + testTransposition_bf16(false, false, faiss::MetricType::METRIC_L2); + testTransposition_bf16( + false, false, faiss::MetricType::METRIC_INNER_PRODUCT); +} + #if defined USE_NVIDIA_CUVS TEST(TestCuvsGpuDistance, Transposition_RR) { testTransposition(false, false, faiss::MetricType::METRIC_L2, true); @@ -209,6 +388,10 @@ TEST(TestGpuDistance, Transposition_RC) { testTransposition(false, true, faiss::MetricType::METRIC_L2); } +TEST(TestGpuDistance, Transposition_RC_BF16) { + testTransposition_bf16(false, true, faiss::MetricType::METRIC_L2); +} + #if defined USE_NVIDIA_CUVS TEST(TestCuvsGpuDistance, Transposition_RC) { testTransposition(false, true, faiss::MetricType::METRIC_L2, true); @@ -219,6 +402,10 @@ TEST(TestGpuDistance, Transposition_CR) { testTransposition(true, false, faiss::MetricType::METRIC_L2); } +TEST(TestGpuDistance, Transposition_CR_BF16) { + testTransposition_bf16(true, false, faiss::MetricType::METRIC_L2); +} + #if defined USE_NVIDIA_CUVS TEST(TestCuvsGpuDistance, Transposition_CR) { testTransposition(true, false, faiss::MetricType::METRIC_L2, true); @@ -229,6 +416,10 @@ TEST(TestGpuDistance, Transposition_CC) { testTransposition(true, true, faiss::MetricType::METRIC_L2); } +TEST(TestGpuDistance, Transposition_CC_BF16) { + testTransposition_bf16(true, true, faiss::MetricType::METRIC_L2); +} + #if defined USE_NVIDIA_CUVS TEST(TestCuvsGpuDistance, Transposition_CC) { testTransposition(true, true, faiss::MetricType::METRIC_L2, true); @@ -239,6 +430,10 @@ TEST(TestGpuDistance, L1) { testTransposition(false, false, faiss::MetricType::METRIC_L1); } +TEST(TestGpuDistance, L1_BF16) { + testTransposition_bf16(false, false, faiss::MetricType::METRIC_L1); +} + #if defined USE_NVIDIA_CUVS TEST(TestCuvsGpuDistance, L1) { testTransposition(false, false, faiss::MetricType::METRIC_L1, true); @@ -257,10 +452,18 @@ TEST(TestCuvsGpuDistance, L1_RC) { } #endif +TEST(TestGpuDistance, L1_RC_BF16) { + testTransposition_bf16(false, true, faiss::MetricType::METRIC_L1); +} + TEST(TestGpuDistance, L1_CR) { testTransposition(true, false, faiss::MetricType::METRIC_L1); } +TEST(TestGpuDistance, L1_CR_BF16) { + testTransposition_bf16(true, false, faiss::MetricType::METRIC_L1); +} + #if defined USE_NVIDIA_CUVS TEST(TestCuvsGpuDistance, L1_CR) { testTransposition(true, false, faiss::MetricType::METRIC_L1, true); @@ -271,6 +474,10 @@ TEST(TestGpuDistance, L1_CC) { testTransposition(true, true, faiss::MetricType::METRIC_L1); } +TEST(TestGpuDistance, L1_CC_BF16) { + testTransposition_bf16(true, true, faiss::MetricType::METRIC_L1); +} + #if defined USE_NVIDIA_CUVS TEST(TestCuvsGpuDistance, L1_CC) { testTransposition(true, true, faiss::MetricType::METRIC_L1, true); @@ -289,10 +496,19 @@ TEST(TestCuvsGpuDistance, Linf) 
{ } #endif +TEST(TestGpuDistance, Linf_BF16) { + testTransposition_bf16(false, false, faiss::MetricType::METRIC_Linf); +} + TEST(TestGpuDistance, Lp) { testTransposition(false, false, faiss::MetricType::METRIC_Lp, false, 3); } +TEST(TestGpuDistance, Lp_BF16) { + testTransposition_bf16( + false, false, faiss::MetricType::METRIC_Lp, false, 3); +} + #if defined USE_NVIDIA_CUVS TEST(TestCuvsGpuDistance, Lp) { testTransposition(false, false, faiss::MetricType::METRIC_Lp, true, 3); } @@ -303,6 +519,10 @@ TEST(TestGpuDistance, Canberra) { testTransposition(false, false, faiss::MetricType::METRIC_Canberra); } +TEST(TestGpuDistance, Canberra_BF16) { + testTransposition_bf16(false, false, faiss::MetricType::METRIC_Canberra); +} + #if defined USE_NVIDIA_CUVS TEST(TestCuvsGpuDistance, Canberra) { testTransposition(false, false, faiss::MetricType::METRIC_Canberra, true); } #endif @@ -313,10 +533,19 @@ TEST(TestGpuDistance, BrayCurtis) { testTransposition(false, false, faiss::MetricType::METRIC_BrayCurtis); } +TEST(TestGpuDistance, BrayCurtis_BF16) { + testTransposition_bf16(false, false, faiss::MetricType::METRIC_BrayCurtis); +} + TEST(TestGpuDistance, JensenShannon) { testTransposition(false, false, faiss::MetricType::METRIC_JensenShannon); } +TEST(TestGpuDistance, JensenShannon_BF16) { + testTransposition_bf16( + false, false, faiss::MetricType::METRIC_JensenShannon); +} + #if defined USE_NVIDIA_CUVS TEST(TestCuvsGpuDistance, JensenShannon) { testTransposition( @@ -328,6 +557,10 @@ TEST(TestGpuDistance, Jaccard) { testTransposition(false, false, faiss::MetricType::METRIC_Jaccard); } +TEST(TestGpuDistance, Jaccard_BF16) { + testTransposition_bf16(false, false, faiss::MetricType::METRIC_Jaccard); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/faiss/gpu/test/TestUtils.cpp b/faiss/gpu/test/TestUtils.cpp index 38807d9cf7..1357cfcb46 100644 --- a/faiss/gpu/test/TestUtils.cpp +++ b/faiss/gpu/test/TestUtils.cpp @@ -5,7 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -#include #include #include #include @@ -18,6 +17,77 @@ namespace faiss { namespace gpu { +inline float half2float(const unsigned short h) { + unsigned int sign = ((static_cast<unsigned int>(h) >> 15U) & 1U); + unsigned int exponent = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU); + unsigned int mantissa = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U); + float f; + if (exponent == 0x1fU) { /* NaN or Inf */ + /* discard sign of a NaN */ + sign = ((mantissa != 0U) ? (sign >> 1U) : sign); + mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U); + exponent = 0xffU; + } else if (exponent == 0U) { /* Denorm or Zero */ + if (mantissa != 0U) { + unsigned int msb; + exponent = 0x71U; + do { + msb = (mantissa & 0x400000U); + mantissa <<= 1U; /* normalize */ + --exponent; + } while (msb == 0U); + mantissa &= 0x7fffffU; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70U; + } + const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa); + std::memcpy(&f, &u, sizeof(u)); + return f; +} + +unsigned short float2half(const float f) { + unsigned int sign; + unsigned int remainder; + unsigned int x; + unsigned int u; + unsigned int result; + (void)std::memcpy(&x, &f, sizeof(f)); + + u = (x & 0x7fffffffU); + sign = ((x >> 16U) & 0x8000U); + // NaN/+Inf/-Inf + if (u >= 0x7f800000U) { + remainder = 0U; + result = ((u == 0x7f800000U) ? 
(sign | 0x7c00U) : 0x7fffU); + } else if (u > 0x477fefffU) { // Overflows + remainder = 0x80000000U; + result = (sign | 0x7bffU); + } else if (u >= 0x38800000U) { // Normal numbers + remainder = u << 19U; + u -= 0x38000000U; + result = (sign | (u >> 13U)); + } else if (u < 0x33000001U) { // +0/-0 + remainder = u; + result = sign; + } else { // Denormal numbers + const unsigned int exponent = u >> 23U; + const unsigned int shift = 0x7eU - exponent; + unsigned int mantissa = (u & 0x7fffffU); + mantissa |= 0x800000U; + remainder = mantissa << (32U - shift); + result = (sign | (mantissa >> shift)); + result &= 0x0000FFFFU; + } + + if ((remainder > 0x80000000U) || + ((remainder == 0x80000000U) && ((result & 0x1U) != 0U))) { + return static_cast<unsigned short>(result) + 1; + } else { + return static_cast<unsigned short>(result); + } +} + inline float relativeError(float a, float b) { return std::abs(a - b) / (0.5f * (std::abs(a) + std::abs(b))); } @@ -78,7 +148,7 @@ std::vector<uint8_t> randBinaryVecs(size_t num, size_t dim) { std::vector<float> roundToHalf(const std::vector<float>& v) { auto out = std::vector<float>(v.size()); for (int i = 0; i < v.size(); ++i) { - out[i] = __half2float(__float2half(v[i])); + out[i] = half2float(float2half(v[i])); } return out; diff --git a/faiss/gpu/utils/ConversionOperators.cuh b/faiss/gpu/utils/ConversionOperators.cuh index bbaac78f64..f0ab1ea1fd 100644 --- a/faiss/gpu/utils/ConversionOperators.cuh +++ b/faiss/gpu/utils/ConversionOperators.cuh @@ -22,30 +22,14 @@ namespace gpu { // Conversion utilities // -template <typename From, typename To> -struct Convert { - inline __device__ To operator()(From v) const { - return (To)v; - } -}; - -template <> -struct Convert<float, half> { - inline __device__ half operator()(float v) const { - return __float2half(v); - } -}; - -template <> -struct Convert<half, float> { - inline __device__ float operator()(half v) const { - return __half2float(v); +template <typename T> +struct ConvertTo { + template <typename U> + static inline __device__ T to(U v) { + return T(v); } }; -template <typename T> -struct ConvertTo {}; - template <> struct ConvertTo<float> { static inline __device__ float to(float v) { @@ -54,6 +38,12 @@ struct ConvertTo<float> { static inline __device__ float to(half v) { return __half2float(v); } + +#ifndef USE_AMD_ROCM + static inline __device__ float to(__nv_bfloat16 v) { + return __bfloat162float(v); + } +#endif // !USE_AMD_ROCM }; template <> @@ -106,6 +96,31 @@ struct ConvertTo { } }; +// no bf16 support for AMD +#ifndef USE_AMD_ROCM + +template <> +struct ConvertTo<__nv_bfloat16> { + static inline __device__ __nv_bfloat16 to(float v) { + return __float2bfloat16(v); + } + static inline __device__ __nv_bfloat16 to(half v) { + return __float2bfloat16(__half2float(v)); + } + static inline __device__ __nv_bfloat16 to(__nv_bfloat16 v) { + return v; + } +}; + +#endif // USE_AMD_ROCM + +template <typename From, typename To> +struct Convert { + inline __device__ To operator()(From v) const { + return ConvertTo<To>::to(v); + } +}; + // Tensor conversion template <typename From, typename To> void runConvert(const From* in, To* out, size_t num, cudaStream_t stream) { diff --git a/faiss/gpu/utils/Float16.cuh b/faiss/gpu/utils/Float16.cuh index 449829de66..6a1f779eab 100644 --- a/faiss/gpu/utils/Float16.cuh +++ b/faiss/gpu/utils/Float16.cuh @@ -16,7 +16,21 @@ #define FAISS_USE_FULL_FLOAT16 1 #endif // __CUDA_ARCH__ types +// Some compute capabilities have full bfloat16 ALUs. 
+// FIXME: no support in ROCm yet +#if __CUDA_ARCH__ >= 800 // || defined(USE_AMD_ROCM) +#define FAISS_USE_FULL_BFLOAT16 1 +#endif // __CUDA_ARCH__ types + #include <cuda_fp16.h> +#if !defined(USE_AMD_ROCM) +#include <cuda_bf16.h> +#endif +// #else +// FIXME: no support in ROCm yet +// #include +// #include +// #endif // !defined(USE_AMD_ROCM) namespace faiss { namespace gpu { diff --git a/faiss/gpu/utils/MathOperators.cuh b/faiss/gpu/utils/MathOperators.cuh index d825233c0d..9239c735f6 100644 --- a/faiss/gpu/utils/MathOperators.cuh +++ b/faiss/gpu/utils/MathOperators.cuh @@ -13,7 +13,7 @@ // // Templated wrappers to express math for different scalar and vector // types, so kernels can have the same written form but can operate -// over half and float, and on vector types transparently +// over half, bfloat16 and float, and on vector types transparently // namespace faiss { @@ -556,5 +556,240 @@ struct Math { } }; +#ifndef USE_AMD_ROCM + +template <> +struct Math<__nv_bfloat16> { + typedef __nv_bfloat16 ScalarType; + + static inline __device__ __nv_bfloat16 + add(__nv_bfloat16 a, __nv_bfloat16 b) { +#ifdef FAISS_USE_FULL_BFLOAT16 + return __hadd(a, b); +#else + return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b)); +#endif + } + + static inline __device__ __nv_bfloat16 + sub(__nv_bfloat16 a, __nv_bfloat16 b) { +#ifdef FAISS_USE_FULL_BFLOAT16 + return __hsub(a, b); +#else + return __float2bfloat16(__bfloat162float(a) - __bfloat162float(b)); +#endif + } + + static inline __device__ __nv_bfloat16 + mul(__nv_bfloat16 a, __nv_bfloat16 b) { +#ifdef FAISS_USE_FULL_BFLOAT16 + return __hmul(a, b); +#else + return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b)); +#endif + } + + static inline __device__ __nv_bfloat16 neg(__nv_bfloat16 v) { +#ifdef FAISS_USE_FULL_BFLOAT16 + return __hneg(v); +#else + return __float2bfloat16(-__bfloat162float(v)); +#endif + } + + static inline __device__ float reduceAdd(__nv_bfloat16 v) { + return ConvertTo<float>::to(v); + } + + static inline __device__ bool lt(__nv_bfloat16 a, __nv_bfloat16 b) { +#ifdef FAISS_USE_FULL_BFLOAT16 + return __hlt(a, b); +#else + return __bfloat162float(a) < __bfloat162float(b); +#endif + } + + static inline __device__ bool gt(__nv_bfloat16 a, __nv_bfloat16 b) { +#ifdef FAISS_USE_FULL_BFLOAT16 + return __hgt(a, b); +#else + return __bfloat162float(a) > __bfloat162float(b); +#endif + } + + static inline __device__ bool eq(__nv_bfloat16 a, __nv_bfloat16 b) { +#ifdef FAISS_USE_FULL_BFLOAT16 + return __heq(a, b); +#else + return __bfloat162float(a) == __bfloat162float(b); +#endif + } + + static inline __device__ __nv_bfloat16 zero() { +#if CUDA_VERSION >= 9000 + return 0.0f; +#else + __nv_bfloat16 h; + h.x = 0; + return h; +#endif + } +}; + +template <> +struct Math<__nv_bfloat162> { + typedef __nv_bfloat16 ScalarType; + +#ifndef FAISS_USE_FULL_BFLOAT16 + // define a few conversion functions that don't exist on cuda 11 + // this overrides their definition in cuda 12 but we use native bf16 on this + // platform anyway. 
+ static inline __device__ float2 __bfloat1622float2(__nv_bfloat162 a) { + float2 af; + af.x = __bfloat162float(a.x); + af.y = __bfloat162float(a.y); + return af; + } + + static inline __device__ __nv_bfloat162 __float22bfloat162_rn(float2 af) { + __nv_bfloat162 a; + a.x = __float2bfloat16_rn(af.x); + a.y = __float2bfloat16_rn(af.y); + return a; + } + + static inline __device__ __nv_bfloat162 + __bfloat162bfloat162(__nv_bfloat16 v) { + __nv_bfloat162 a; + a.x = v; + a.y = v; + return a; + } +#endif + + static inline __device__ __nv_bfloat162 + add(__nv_bfloat162 a, __nv_bfloat162 b) { +#ifdef FAISS_USE_FULL_BFLOAT16 + return __hadd2(a, b); +#else + float2 af = __bfloat1622float2(a); + float2 bf = __bfloat1622float2(b); + + af.x += bf.x; + af.y += bf.y; + + return __float22bfloat162_rn(af); +#endif + } + + static inline __device__ __nv_bfloat162 + sub(__nv_bfloat162 a, __nv_bfloat162 b) { +#ifdef FAISS_USE_FULL_BFLOAT16 + return __hsub2(a, b); +#else + float2 af = __bfloat1622float2(a); + float2 bf = __bfloat1622float2(b); + + af.x -= bf.x; + af.y -= bf.y; + + return __float22bfloat162_rn(af); +#endif + } + + static inline __device__ __nv_bfloat162 + add(__nv_bfloat162 a, __nv_bfloat16 b) { +#ifdef FAISS_USE_FULL_BFLOAT16 + __nv_bfloat162 b2 = __bfloat162bfloat162(b); + return __hadd2(a, b2); +#else + float2 af = __bfloat1622float2(a); + float bf = __bfloat162float(b); + + af.x += bf; + af.y += bf; + + return __float22bfloat162_rn(af); +#endif + } + + static inline __device__ __nv_bfloat162 + sub(__nv_bfloat162 a, __nv_bfloat16 b) { +#ifdef FAISS_USE_FULL_BFLOAT16 + __nv_bfloat162 b2 = __bfloat162bfloat162(b); + return __hsub2(a, b2); +#else + float2 af = __bfloat1622float2(a); + float bf = __bfloat162float(b); + + af.x -= bf; + af.y -= bf; + + return __float22bfloat162_rn(af); +#endif + } + + static inline __device__ __nv_bfloat162 + mul(__nv_bfloat162 a, __nv_bfloat162 b) { +#ifdef FAISS_USE_FULL_BFLOAT16 + return __hmul2(a, b); +#else + float2 af = __bfloat1622float2(a); + float2 bf = __bfloat1622float2(b); + + af.x *= bf.x; + af.y *= bf.y; + + return __float22bfloat162_rn(af); +#endif + } + + static inline __device__ __nv_bfloat162 + mul(__nv_bfloat162 a, __nv_bfloat16 b) { +#ifdef FAISS_USE_FULL_BFLOAT16 + __nv_bfloat162 b2 = __bfloat162bfloat162(b); + return __hmul2(a, b2); +#else + float2 af = __bfloat1622float2(a); + float bf = __bfloat162float(b); + + af.x *= bf; + af.y *= bf; + + return __float22bfloat162_rn(af); +#endif + } + + static inline __device__ __nv_bfloat162 neg(__nv_bfloat162 v) { +#ifdef FAISS_USE_FULL_BFLOAT16 + return __hneg2(v); +#else + float2 vf = __bfloat1622float2(v); + vf.x = -vf.x; + vf.y = -vf.y; + + return __float22bfloat162_rn(vf); +#endif + } + + static inline __device__ float reduceAdd(__nv_bfloat162 v) { + float2 vf = __bfloat1622float2(v); + vf.x += vf.y; + + return vf.x; + } + + // not implemented for vector types + // static inline __device__ bool lt(__nv_bfloat162 a, __nv_bfloat162 b); + // static inline __device__ bool gt(__nv_bfloat162 a, __nv_bfloat162 b); + // static inline __device__ bool eq(__nv_bfloat162 a, __nv_bfloat162 b); + + static inline __device__ __nv_bfloat162 zero() { + return __bfloat162bfloat162(Math<__nv_bfloat16>::zero()); + } +}; + +#endif // !USE_AMD_ROCM + } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/utils/MatrixMult-inl.cuh b/faiss/gpu/utils/MatrixMult-inl.cuh index 98fd0956cd..2c85d7244d 100644 --- a/faiss/gpu/utils/MatrixMult-inl.cuh +++ b/faiss/gpu/utils/MatrixMult-inl.cuh @@ -21,6 +21,7 @@ template 
struct GetCudaType; #ifdef USE_AMD_ROCM + template <> struct GetCudaType<float> { static constexpr hipblasDatatype_t Type = HIPBLAS_R_32F; @@ -30,7 +31,15 @@ template <> struct GetCudaType<half> { static constexpr hipblasDatatype_t Type = HIPBLAS_R_16F; }; + +// FIXME: no AMD support for bf16 +// template <> +// struct GetCudaType<__nv_bfloat16> { +// static constexpr hipblasDatatype_t Type = HIPBLAS_R_16B; +// }; + #else + template <> struct GetCudaType<float> { static constexpr cudaDataType_t Type = CUDA_R_32F; @@ -40,6 +49,12 @@ template <> struct GetCudaType<half> { static constexpr cudaDataType_t Type = CUDA_R_16F; }; + +template <> +struct GetCudaType<__nv_bfloat16> { + static constexpr cudaDataType_t Type = CUDA_R_16BF; +}; + #endif template diff --git a/faiss/impl/AdditiveQuantizer.cpp b/faiss/impl/AdditiveQuantizer.cpp index fb58a49600..ff4ead427a 100644 --- a/faiss/impl/AdditiveQuantizer.cpp +++ b/faiss/impl/AdditiveQuantizer.cpp @@ -24,7 +24,6 @@ #include #include #include -#include extern "C" { diff --git a/faiss/impl/ProductAdditiveQuantizer.cpp b/faiss/impl/ProductAdditiveQuantizer.cpp index 248f3a617a..1764b3c60a 100644 --- a/faiss/impl/ProductAdditiveQuantizer.cpp +++ b/faiss/impl/ProductAdditiveQuantizer.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include @@ -20,7 +19,6 @@ #include #include #include -#include extern "C" { diff --git a/faiss/impl/ResidualQuantizer.h b/faiss/impl/ResidualQuantizer.h index 9ad5902244..572232fda4 100644 --- a/faiss/impl/ResidualQuantizer.h +++ b/faiss/impl/ResidualQuantizer.h @@ -112,7 +112,7 @@ struct ResidualQuantizer : AdditiveQuantizer { /** lower-level encode function * - * @param n number of vectors to hanlde + * @param n number of vectors to handle * @param residuals vectors to encode, size (n, beam_size, d) * @param beam_size input beam size * @param new_beam_size output beam size (should be <= K * beam_size) diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index 121b8cf979..c7887a4f53 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -13,9 +13,6 @@ #include #include -#include -#include - #include #include #include diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index 9addd6d22f..0118ef4711 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -13,13 +13,9 @@ #include #include -#include -#include - #include #include -#include #include #include diff --git a/faiss/impl/kmeans1d.cpp b/faiss/impl/kmeans1d.cpp index 1ad07c6ed3..0f607fd7ad 100644 --- a/faiss/impl/kmeans1d.cpp +++ b/faiss/impl/kmeans1d.cpp @@ -6,7 +6,6 @@ */ #include -#include #include #include #include diff --git a/faiss/impl/lattice_Zn.cpp b/faiss/impl/lattice_Zn.cpp index 3b6f217675..6ffd3c1c18 100644 --- a/faiss/impl/lattice_Zn.cpp +++ b/faiss/impl/lattice_Zn.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp index 546d3a886c..8ff4bfec7c 100644 --- a/faiss/index_factory.cpp +++ b/faiss/index_factory.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/faiss/invlists/OnDiskInvertedLists.cpp b/faiss/invlists/OnDiskInvertedLists.cpp index 725367ef16..eb0a2193c8 100644 --- a/faiss/invlists/OnDiskInvertedLists.cpp +++ b/faiss/invlists/OnDiskInvertedLists.cpp @@ -15,7 +15,6 @@ #include #include -#include #include #include diff --git a/faiss/utils/distances_simd.cpp b/faiss/utils/distances_simd.cpp index ecf7e74146..627300a5bc 100644 --- a/faiss/utils/distances_simd.cpp +++ 
b/faiss/utils/distances_simd.cpp @@ -3759,7 +3759,7 @@ void fvec_add(size_t d, const float* a, float b, float* c) { size_t i; simd8float32 bv(b); for (i = 0; i + 7 < d; i += 8) { - simd8float32 ci, ai, bi; + simd8float32 ci, ai; ai.loadu(a + i); ci = ai + bv; ci.storeu(c + i); diff --git a/faiss/utils/extra_distances.cpp b/faiss/utils/extra_distances.cpp index c2faea4bcb..ee2cd51f72 100644 --- a/faiss/utils/extra_distances.cpp +++ b/faiss/utils/extra_distances.cpp @@ -15,7 +15,6 @@ #include #include -#include #include namespace faiss { diff --git a/perf_tests/bench_scalar_quantizer_accuracy.cpp b/perf_tests/bench_scalar_quantizer_accuracy.cpp index 1e70690f94..bde9939f66 100644 --- a/perf_tests/bench_scalar_quantizer_accuracy.cpp +++ b/perf_tests/bench_scalar_quantizer_accuracy.cpp @@ -14,7 +14,6 @@ #include #include #include -#include using namespace faiss; DEFINE_uint32(d, 128, "dimension"); diff --git a/perf_tests/bench_scalar_quantizer_decode.cpp b/perf_tests/bench_scalar_quantizer_decode.cpp index 02b2adcf04..16d1502ae5 100644 --- a/perf_tests/bench_scalar_quantizer_decode.cpp +++ b/perf_tests/bench_scalar_quantizer_decode.cpp @@ -14,7 +14,6 @@ #include #include #include -#include using namespace faiss; DEFINE_uint32(d, 128, "dimension"); diff --git a/perf_tests/bench_scalar_quantizer_distance.cpp b/perf_tests/bench_scalar_quantizer_distance.cpp index a6990ea442..14945c58c4 100644 --- a/perf_tests/bench_scalar_quantizer_distance.cpp +++ b/perf_tests/bench_scalar_quantizer_distance.cpp @@ -14,7 +14,6 @@ #include #include #include -#include using namespace faiss; DEFINE_uint32(d, 128, "dimension"); diff --git a/perf_tests/bench_scalar_quantizer_encode.cpp b/perf_tests/bench_scalar_quantizer_encode.cpp index 8e6f8c5ec8..0e4909c96d 100644 --- a/perf_tests/bench_scalar_quantizer_encode.cpp +++ b/perf_tests/bench_scalar_quantizer_encode.cpp @@ -15,7 +15,6 @@ #include #include #include -#include using namespace faiss; DEFINE_uint32(d, 128, "dimension"); diff --git a/tests/test_approx_topk.cpp b/tests/test_approx_topk.cpp index e5c2c1df6a..ff4ee1f2ea 100644 --- a/tests/test_approx_topk.cpp +++ b/tests/test_approx_topk.cpp @@ -17,7 +17,6 @@ #include -#include #include #include diff --git a/tests/test_common_ivf_empty_index.cpp b/tests/test_common_ivf_empty_index.cpp index 13f01979df..eeaa839760 100644 --- a/tests/test_common_ivf_empty_index.cpp +++ b/tests/test_common_ivf_empty_index.cpp @@ -7,7 +7,6 @@ #include -#include #include #include #include diff --git a/tests/test_contrib.py b/tests/test_contrib.py index 33bca7a4be..a7b0b09155 100644 --- a/tests/test_contrib.py +++ b/tests/test_contrib.py @@ -3,28 +3,34 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import faiss -import unittest -import numpy as np -import platform import os -import random +import platform import shutil import tempfile +import unittest +from contextlib import contextmanager -from faiss.contrib import datasets -from faiss.contrib import inspect_tools -from faiss.contrib import evaluation -from faiss.contrib import ivf_tools -from faiss.contrib import clustering -from faiss.contrib import big_batch_search -from faiss.contrib.ondisk import merge_ondisk +import faiss +import numpy as np from common_faiss_tests import get_dataset_2 -from faiss.contrib.exhaustive_search import \ - knn_ground_truth, knn, range_ground_truth, \ - range_search_max_results, exponential_query_iterator -from contextlib import contextmanager + +from faiss.contrib import ( + big_batch_search, + clustering, + datasets, + evaluation, + inspect_tools, + ivf_tools, +) +from faiss.contrib.exhaustive_search import ( + exponential_query_iterator, + knn, + knn_ground_truth, + range_ground_truth, + range_search_max_results, +) +from faiss.contrib.ondisk import merge_ondisk class TestComputeGT(unittest.TestCase): @@ -664,7 +670,10 @@ def test_sort(self): np.testing.assert_equal(Inew, Iref) def test_hnsw_permute(self): - """ make sure HNSW permutation works (useful when used as coarse quantizer) """ + """ + make sure HNSW permutation works + (useful when used as coarse quantizer) + """ ds = datasets.SyntheticDataset(32, 0, 1000, 50) index = faiss.index_factory(ds.d, "HNSW32,Flat") index.add(ds.get_database()) @@ -692,8 +701,10 @@ def test_code_set(self): np.sort(codes[inserted], axis=None)) -@unittest.skipIf(platform.system() == 'Windows', - 'OnDiskInvertedLists is unsupported on Windows.') +@unittest.skipIf( + platform.system() == 'Windows', + 'OnDiskInvertedLists is unsupported on Windows.' +) class TestMerge(unittest.TestCase): @contextmanager def temp_directory(self): diff --git a/tests/test_dealloc_invlists.cpp b/tests/test_dealloc_invlists.cpp index 53d57cbf63..8ad278c8d1 100644 --- a/tests/test_dealloc_invlists.cpp +++ b/tests/test_dealloc_invlists.cpp @@ -18,7 +18,6 @@ #include #include #include -#include using namespace faiss; diff --git a/tests/test_graph_based.py b/tests/test_graph_based.py index 57bf877fbb..1f840e6cac 100644 --- a/tests/test_graph_based.py +++ b/tests/test_graph_based.py @@ -335,7 +335,7 @@ def test_build_invalid_knng(self): """Make some invalid entries in the input knn graph. It would cause a warning but IndexNSG should be able - to handel this. + to handle this. 
""" knn_graph = self.make_knn_graph(faiss.METRIC_L2) knn_graph[:100, 5] = -111 diff --git a/tests/test_hnsw.cpp b/tests/test_hnsw.cpp index e3b8325c6a..c546a76778 100644 --- a/tests/test_hnsw.cpp +++ b/tests/test_hnsw.cpp @@ -17,7 +17,6 @@ #include #include #include -#include int reference_pop_min(faiss::HNSW::MinimaxHeap& heap, float* vmin_out) { assert(heap.k > 0); diff --git a/tests/test_ivf_index.cpp b/tests/test_ivf_index.cpp index 5f8d2ab0c7..21d8897d40 100644 --- a/tests/test_ivf_index.cpp +++ b/tests/test_ivf_index.cpp @@ -8,9 +8,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -20,7 +17,6 @@ #include #include #include -#include namespace { diff --git a/tests/test_ivfpq_codec.cpp b/tests/test_ivfpq_codec.cpp index 297d6b4cf8..c9e9a038cb 100644 --- a/tests/test_ivfpq_codec.cpp +++ b/tests/test_ivfpq_codec.cpp @@ -16,7 +16,6 @@ #include #include #include -#include namespace { diff --git a/tests/test_ivfpq_indexing.cpp b/tests/test_ivfpq_indexing.cpp index 4f659d09fb..d277c6acc4 100644 --- a/tests/test_ivfpq_indexing.cpp +++ b/tests/test_ivfpq_indexing.cpp @@ -13,7 +13,6 @@ #include #include -#include TEST(IVFPQ, accuracy) { // dimension of the vectors to index diff --git a/tests/test_lowlevel_ivf.cpp b/tests/test_lowlevel_ivf.cpp index fceaec442b..3d7dd43ede 100644 --- a/tests/test_lowlevel_ivf.cpp +++ b/tests/test_lowlevel_ivf.cpp @@ -21,9 +21,7 @@ #include #include #include -#include #include -#include using namespace faiss; diff --git a/tests/test_ondisk_ivf.cpp b/tests/test_ondisk_ivf.cpp index 726fae0a59..4f3d3a3032 100644 --- a/tests/test_ondisk_ivf.cpp +++ b/tests/test_ondisk_ivf.cpp @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/tests/test_transfer_invlists.cpp b/tests/test_transfer_invlists.cpp index 30c0ad0d54..f275eb68e6 100644 --- a/tests/test_transfer_invlists.cpp +++ b/tests/test_transfer_invlists.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include