diff --git a/.github/actions/build_conda/action.yml b/.github/actions/build_conda/action.yml index 71153231b9..7dff484f2a 100644 --- a/.github/actions/build_conda/action.yml +++ b/.github/actions/build_conda/action.yml @@ -34,8 +34,8 @@ runs: - name: Install conda build tools shell: ${{ steps.choose_shell.outputs.shell }} run: | - conda update -y -q conda - conda install -y -q conda-build + conda install -y -q "conda!=24.11.0" + conda install -y -q "conda-build!=24.11.0" - name: Enable anaconda uploads if: inputs.label != '' shell: ${{ steps.choose_shell.outputs.shell }} @@ -80,7 +80,7 @@ runs: working-directory: conda run: | conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \ - -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge + -c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia - name: Conda build (GPU w/ cuVS) w/ anaconda upload if: inputs.label != '' && inputs.cuda != '' && inputs.cuvs != '' shell: ${{ steps.choose_shell.outputs.shell }} @@ -89,4 +89,4 @@ runs: PACKAGE_TYPE: ${{ inputs.label }} run: | conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \ - --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge + --user pytorch --label ${{ inputs.label }} -c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ac1387c438..4d38b583d6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -238,8 +238,8 @@ jobs: with: label: main cuda: "12.1.1" - linux-x86_64-GPU-CUVS-packages-CUDA12-1-1: - name: Linux x86_64 GPU w/ cuVS packages (CUDA 12.1.1) + linux-x86_64-GPU-CUVS-packages-CUDA12-4-0: + name: Linux x86_64 GPU w/ cuVS packages (CUDA 12.4.0) if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: 4-core-ubuntu-gpu-t4 env: @@ -257,7 +257,7 @@ jobs: with: label: main cuvs: "ON" - cuda: "12.1.1" + cuda: "12.4.0" windows-x86_64-packages: name: Windows x86_64 packages if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index c3929683b8..ef1e8d2357 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -73,8 +73,8 @@ jobs: with: label: nightly cuda: "12.1.1" - linux-x86_64-GPU-CUVS-CUDA12-1-1-nightly: - name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 12.1.1) + linux-x86_64-GPU-CUVS-CUDA12-4-0-nightly: + name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 12.4.0) runs-on: 4-core-ubuntu-gpu-t4 env: CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" @@ -90,7 +90,7 @@ jobs: with: label: nightly cuvs: "ON" - cuda: "12.1.1" + cuda: "12.4.0" windows-x86_64-nightly: name: Windows x86_64 nightlies runs-on: windows-2019 diff --git a/conda/faiss-gpu-cuvs/meta.yaml b/conda/faiss-gpu-cuvs/meta.yaml index 6c43d19a7c..d37ac56a3c 100644 --- a/conda/faiss-gpu-cuvs/meta.yaml +++ b/conda/faiss-gpu-cuvs/meta.yaml @@ -9,7 +9,7 @@ {% if cudatoolkit == '11.8.0' %} {% set cuda_constraints=">=11.8,<12" %} {% set libcublas_constraints=">=11.11,<12" %} -{% elif cudatoolkit == '12.1.1' %} +{% elif cudatoolkit == '12.4.0' %} {% set cuda_constraints=">=12.1,<13" %} {% set libcublas_constraints=">=12.1,<13" %} {% endif %} @@ -48,7 +48,7 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 # [linux64] - llvm-openmp # [osx] - - cmake >=3.24.0 + - cmake >=3.26.4 - make # [not win] - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - mkl =2023 # [x86_64] diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index e80477f1a0..f36091d26a 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -402,16 +402,11 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { } else if (args.vectorType == DistanceDataType::F16) { bfKnnConvert(prov, args); } else if (args.vectorType == DistanceDataType::BF16) { -// no bf16 support for AMD -#ifndef USE_AMD_ROCM if (prov->getResources()->supportsBFloat16CurrentDevice()) { bfKnnConvert<__nv_bfloat16>(prov, args); } else { FAISS_THROW_MSG("not compiled with bfloat16 support"); } -#else - FAISS_THROW_MSG("no AMD bfloat16 support"); -#endif } else { FAISS_THROW_MSG("unknown vectorType"); } diff --git a/faiss/gpu/hipify.sh b/faiss/gpu/hipify.sh index 2b65854205..9bc4773bb0 100755 --- a/faiss/gpu/hipify.sh +++ b/faiss/gpu/hipify.sh @@ -3,6 +3,9 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +# +# Usage: ./gpu/hipify.sh +# function hipify_dir() { @@ -10,10 +13,36 @@ function hipify_dir() cd "$1" || exit echo "Hipifying $(pwd)" + if [ -d ./gpu-tmp ]; then + #Clearing out any leftover files and directories + echo "Removing old ./gpu-tmp" + rm -rf ./gpu-tmp + fi + + if [ -d ./gpu ]; then + #Making a temp directory to implement pre hipify rules + echo "Creating ./gpu-tmp" + cp -r ./gpu ./gpu-tmp + + # adjust __nv_bfloat162 before hipify because of inaccurate conversions + # adjust __nv_bfloat16 before hipify because of inaccurate conversions + for ext in hip cuh h cpp c cu cuh + do + while IFS= read -r -d '' src + do + sed -i 's@__nv_bfloat162@__hip_bfloat162@' "$src" + sed -i 's@__nv_bfloat16@__hip_bfloat16@' "$src" + done < <(find ./gpu-tmp -name "*.$ext" -print0) + done + else + echo "Can't find the gpu/ dir" + exit + fi + # create all destination directories for hipified files into sibling 'gpu-rocm' directory while IFS= read -r -d '' src do - dst="${src//gpu/gpu-rocm}" + dst="${src//gpu-tmp/gpu-rocm}" if [ -d $dst ]; then #Clearing out any leftover files and directories @@ -24,7 +53,7 @@ function hipify_dir() #Making directories echo "Creating $dst" mkdir -p "$dst" - done < <(find ./gpu -type d -print0) + done < <(find ./gpu-tmp -type d -print0) # run hipify-perl against all *.cu *.cuh *.h *.cpp files, no renaming # run all files in parallel to speed up @@ -32,9 +61,9 @@ function hipify_dir() do while IFS= read -r -d '' src do - dst="${src//\.\/gpu/\.\/gpu-rocm}" + dst="${src//\.\/gpu-tmp/\.\/gpu-rocm}" hipify-perl -o="$dst.tmp" "$src" & - done < <(find ./gpu -name "*.$ext" -print0) + done < <(find ./gpu-tmp -name "*.$ext" -print0) done wait @@ -45,6 +74,12 @@ function hipify_dir() mv "$src" "$dst" done < <(find ./gpu-rocm -name "*.cu.tmp" -print0) + if [ -d ./gpu-tmp ]; then + #Clearing out any leftover files and directories + echo "Removing ./gpu-tmp" + rm -rf ./gpu-tmp + fi + # replace header include statements "& outDistances); -// no bf16 support for AMD -#ifndef USE_AMD_ROCM void runAllPairwiseL2Distance( GpuResources* res, cudaStream_t stream, @@ -52,7 +50,6 @@ void runAllPairwiseL2Distance( Tensor<__nv_bfloat16, 2, true>& queries, bool queriesRowMajor, Tensor& outDistances); -#endif // USE_AMD_ROCM void runAllPairwiseIPDistance( GpuResources* res, @@ -72,8 +69,6 @@ void runAllPairwiseIPDistance( bool queriesRowMajor, Tensor& outDistances); -// no bf16 support for AMD -#ifndef USE_AMD_ROCM void runAllPairwiseIPDistance( GpuResources* res, cudaStream_t stream, @@ -82,7 +77,6 @@ void runAllPairwiseIPDistance( Tensor<__nv_bfloat16, 2, true>& queries, bool queriesRowMajor, Tensor& outDistances); -#endif // USE_AMD_ROCM /// Calculates brute-force L2 distance between `vectors` and /// `queries`, returning the k closest results seen @@ -116,8 +110,6 @@ void runL2Distance( Tensor& outIndices, bool ignoreOutDistances = false); -// no bf16 support for AMD -#ifndef USE_AMD_ROCM void runL2Distance( GpuResources* resources, cudaStream_t stream, @@ -130,7 +122,6 @@ void runL2Distance( Tensor& outDistances, Tensor& outIndices, bool ignoreOutDistances = false); -#endif // USE_AMD_ROCM /// Calculates brute-force inner product distance between `vectors` /// and `queries`, returning the k closest results seen @@ -156,8 +147,6 @@ void runIPDistance( Tensor& outDistances, Tensor& outIndices); -// no bf16 support for AMD -#ifndef USE_AMD_ROCM void runIPDistance( GpuResources* resources, cudaStream_t stream, @@ -168,7 +157,6 @@ void runIPDistance( int k, Tensor& outDistances, Tensor& outIndices); -#endif // USE_AMD_ROCM // // General distance implementation, assumes that all arguments are on the diff --git a/faiss/gpu/impl/L2Norm.cu b/faiss/gpu/impl/L2Norm.cu index 262fa19153..e76a0831ff 100644 --- a/faiss/gpu/impl/L2Norm.cu +++ b/faiss/gpu/impl/L2Norm.cu @@ -275,8 +275,6 @@ void runL2Norm( runL2Norm(input, inputRowMajor, output, normSquared, stream); } -// no bf16 support for AMD -#ifndef USE_AMD_ROCM void runL2Norm( Tensor<__nv_bfloat16, 2, true>& input, bool inputRowMajor, @@ -286,7 +284,6 @@ void runL2Norm( runL2Norm<__nv_bfloat16, __nv_bfloat162>( input, inputRowMajor, output, normSquared, stream); } -#endif } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/L2Norm.cuh b/faiss/gpu/impl/L2Norm.cuh index 79aef4f131..abd35fd120 100644 --- a/faiss/gpu/impl/L2Norm.cuh +++ b/faiss/gpu/impl/L2Norm.cuh @@ -27,15 +27,12 @@ void runL2Norm( bool normSquared, cudaStream_t stream); -// no bf16 support for AMD -#ifndef USE_AMD_ROCM void runL2Norm( Tensor<__nv_bfloat16, 2, true>& input, bool inputRowMajor, Tensor& output, bool normSquared, cudaStream_t stream); -#endif } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/utils/ConversionOperators.cuh b/faiss/gpu/utils/ConversionOperators.cuh index f0ab1ea1fd..524b6ae21b 100644 --- a/faiss/gpu/utils/ConversionOperators.cuh +++ b/faiss/gpu/utils/ConversionOperators.cuh @@ -38,12 +38,9 @@ struct ConvertTo { static inline __device__ float to(half v) { return __half2float(v); } - -#ifndef USE_AMD_ROCM static inline __device__ float to(__nv_bfloat16 v) { return __bfloat162float(v); } -#endif // !USE_AMD_ROCM }; template <> @@ -96,9 +93,6 @@ struct ConvertTo { } }; -// no bf16 support for AMD -#ifndef USE_AMD_ROCM - template <> struct ConvertTo<__nv_bfloat16> { static inline __device__ __nv_bfloat16 to(float v) { @@ -112,8 +106,6 @@ struct ConvertTo<__nv_bfloat16> { } }; -#endif // USE_AMD_ROCM - template struct Convert { inline __device__ To operator()(From v) const { diff --git a/faiss/gpu/utils/Float16.cuh b/faiss/gpu/utils/Float16.cuh index 6a1f779eab..cfe405c794 100644 --- a/faiss/gpu/utils/Float16.cuh +++ b/faiss/gpu/utils/Float16.cuh @@ -12,25 +12,22 @@ #include // Some compute capabilities have full float16 ALUs. -#if __CUDA_ARCH__ >= 530 || defined(USE_AMD_ROCM) +#if __CUDA_ARCH__ >= 530 #define FAISS_USE_FULL_FLOAT16 1 #endif // __CUDA_ARCH__ types // Some compute capabilities have full bfloat16 ALUs. -// FIXME: no support in ROCm yet -#if __CUDA_ARCH__ >= 800 // || defined(USE_AMD_ROCM) +#if __CUDA_ARCH__ >= 800 || defined(USE_AMD_ROCM) #define FAISS_USE_FULL_BFLOAT16 1 #endif // __CUDA_ARCH__ types -#include #if !defined(USE_AMD_ROCM) #include -#endif -// #else -// FIXME: no support in ROCm yet -// #include -// #include -// #endif // !defined(USE_AMD_ROCM) +#include +#else +#include +#include +#endif // !defined(USE_AMD_ROCM) namespace faiss { namespace gpu { diff --git a/faiss/gpu/utils/MathOperators.cuh b/faiss/gpu/utils/MathOperators.cuh index 9239c735f6..ce1e234a07 100644 --- a/faiss/gpu/utils/MathOperators.cuh +++ b/faiss/gpu/utils/MathOperators.cuh @@ -556,8 +556,6 @@ struct Math { } }; -#ifndef USE_AMD_ROCM - template <> struct Math<__nv_bfloat16> { typedef __nv_bfloat16 ScalarType; @@ -626,7 +624,7 @@ struct Math<__nv_bfloat16> { } static inline __device__ __nv_bfloat16 zero() { -#if CUDA_VERSION >= 9000 +#if CUDA_VERSION >= 9000 || defined(USE_AMD_ROCM) return 0.0f; #else __nv_bfloat16 h; @@ -789,7 +787,5 @@ struct Math<__nv_bfloat162> { } }; -#endif // !USE_AMD_ROCM - } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/utils/MatrixMult-inl.cuh b/faiss/gpu/utils/MatrixMult-inl.cuh index 2c85d7244d..b28ea56dab 100644 --- a/faiss/gpu/utils/MatrixMult-inl.cuh +++ b/faiss/gpu/utils/MatrixMult-inl.cuh @@ -32,11 +32,10 @@ struct GetCudaType { static constexpr hipblasDatatype_t Type = HIPBLAS_R_16F; }; -// FIXME: no AMD support for bf16 -// template <> -// struct GetCudaType<__nv_bfloat16> { -// static constexpr hipblasDatatype_t Type = HIPBLAS_R_16B; -// }; +template <> +struct GetCudaType<__hip_bfloat16> { + static constexpr hipblasDatatype_t Type = HIPBLAS_R_16B; +}; #else