diff --git a/.github/actions/build_conda/action.yml b/.github/actions/build_conda/action.yml
index 71153231b9..7dff484f2a 100644
--- a/.github/actions/build_conda/action.yml
+++ b/.github/actions/build_conda/action.yml
@@ -34,8 +34,8 @@ runs:
     - name: Install conda build tools
       shell: ${{ steps.choose_shell.outputs.shell }}
       run: |
-        conda update -y -q conda
-        conda install -y -q conda-build
+        conda install -y -q "conda!=24.11.0"
+        conda install -y -q "conda-build!=24.11.0"
     - name: Enable anaconda uploads
       if: inputs.label != ''
       shell: ${{ steps.choose_shell.outputs.shell }}
@@ -80,7 +80,7 @@ runs:
       working-directory: conda
       run: |
         conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \
-            -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge
+            -c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia
     - name: Conda build (GPU w/ cuVS) w/ anaconda upload
       if: inputs.label != '' && inputs.cuda != '' && inputs.cuvs != ''
       shell: ${{ steps.choose_shell.outputs.shell }}
@@ -89,4 +89,4 @@ runs:
         PACKAGE_TYPE: ${{ inputs.label }}
       run: |
         conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \
-            --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge
+            --user pytorch --label ${{ inputs.label }} -c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ac1387c438..4d38b583d6 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -238,8 +238,8 @@ jobs:
         with:
           label: main
           cuda: "12.1.1"
-  linux-x86_64-GPU-CUVS-packages-CUDA12-1-1:
-    name: Linux x86_64 GPU w/ cuVS packages (CUDA 12.1.1)
+  linux-x86_64-GPU-CUVS-packages-CUDA12-4-0:
+    name: Linux x86_64 GPU w/ cuVS packages (CUDA 12.4.0)
     if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
     runs-on: 4-core-ubuntu-gpu-t4
     env:
@@ -257,7 +257,7 @@ jobs:
         with:
           label: main
           cuvs: "ON"
-          cuda: "12.1.1"
+          cuda: "12.4.0"
   windows-x86_64-packages:
     name: Windows x86_64 packages
     if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index c3929683b8..ef1e8d2357 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -73,8 +73,8 @@ jobs:
         with:
           label: nightly
           cuda: "12.1.1"
-  linux-x86_64-GPU-CUVS-CUDA12-1-1-nightly:
-    name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 12.1.1)
+  linux-x86_64-GPU-CUVS-CUDA12-4-0-nightly:
+    name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 12.4.0)
     runs-on: 4-core-ubuntu-gpu-t4
     env:
       CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
@@ -90,7 +90,7 @@ jobs:
         with:
           label: nightly
           cuvs: "ON"
-          cuda: "12.1.1"
+          cuda: "12.4.0"
   windows-x86_64-nightly:
     name: Windows x86_64 nightlies
     runs-on: windows-2019
diff --git a/conda/faiss-gpu-cuvs/meta.yaml b/conda/faiss-gpu-cuvs/meta.yaml
index 6c43d19a7c..d37ac56a3c 100644
--- a/conda/faiss-gpu-cuvs/meta.yaml
+++ b/conda/faiss-gpu-cuvs/meta.yaml
@@ -9,7 +9,7 @@
 {% if cudatoolkit == '11.8.0' %}
 {% set cuda_constraints=">=11.8,<12" %}
 {% set libcublas_constraints=">=11.11,<12" %}
-{% elif cudatoolkit == '12.1.1' %}
+{% elif cudatoolkit == '12.4.0' %}
 {% set cuda_constraints=">=12.1,<13" %}
 {% set libcublas_constraints=">=12.1,<13" %}
 {% endif %}
@@ -48,7 +48,7 @@ outputs:
         - {{ compiler('cxx') }}
         - sysroot_linux-64  # [linux64]
         - llvm-openmp  # [osx]
-        - cmake >=3.24.0
+        - cmake >=3.26.4
         - make  # [not win]
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl =2023  # [x86_64]
diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index e80477f1a0..f36091d26a 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -402,16 +402,11 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
     } else if (args.vectorType == DistanceDataType::F16) {
         bfKnnConvert<half>(prov, args);
     } else if (args.vectorType == DistanceDataType::BF16) {
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
         if (prov->getResources()->supportsBFloat16CurrentDevice()) {
             bfKnnConvert<__nv_bfloat16>(prov, args);
         } else {
             FAISS_THROW_MSG("not compiled with bfloat16 support");
         }
-#else
-        FAISS_THROW_MSG("no AMD bfloat16 support");
-#endif
     } else {
         FAISS_THROW_MSG("unknown vectorType");
     }
diff --git a/faiss/gpu/hipify.sh b/faiss/gpu/hipify.sh
index 2b65854205..9bc4773bb0 100755
--- a/faiss/gpu/hipify.sh
+++ b/faiss/gpu/hipify.sh
@@ -3,6 +3,9 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+#
+# Usage: ./gpu/hipify.sh
+#
 
 function hipify_dir()
 {
@@ -10,10 +13,36 @@ function hipify_dir()
     cd "$1" || exit
     echo "Hipifying $(pwd)"
 
+    if [ -d ./gpu-tmp ]; then
+        #Clearing out any leftover files and directories
+        echo "Removing old ./gpu-tmp"
+        rm -rf ./gpu-tmp
+    fi
+
+    if [ -d ./gpu ]; then
+        #Making a temp directory to implement pre hipify rules
+        echo "Creating ./gpu-tmp"
+        cp -r ./gpu ./gpu-tmp
+
+        # adjust __nv_bfloat162 before hipify because of inaccurate conversions
+        # adjust __nv_bfloat16 before hipify because of inaccurate conversions
+        for ext in hip cuh h cpp c cu cuh
+        do
+            while IFS= read -r -d '' src
+            do
+                sed -i 's@__nv_bfloat162@__hip_bfloat162@' "$src"
+                sed -i 's@__nv_bfloat16@__hip_bfloat16@' "$src"
+            done <   <(find ./gpu-tmp -name "*.$ext" -print0)
+        done
+    else
+        echo "Can't find the gpu/ dir"
+        exit
+    fi
+
     # create all destination directories for hipified files into sibling 'gpu-rocm' directory
     while IFS= read -r -d '' src
     do
-        dst="${src//gpu/gpu-rocm}"
+        dst="${src//gpu-tmp/gpu-rocm}"
 
         if [ -d $dst ]; then
             #Clearing out any leftover files and directories
@@ -24,7 +53,7 @@ function hipify_dir()
         #Making directories
         echo "Creating $dst"
         mkdir -p "$dst"
-    done <   <(find ./gpu -type d -print0)
+    done <   <(find ./gpu-tmp -type d -print0)
 
     # run hipify-perl against all *.cu *.cuh *.h *.cpp files, no renaming
     # run all files in parallel to speed up
@@ -32,9 +61,9 @@ function hipify_dir()
     do
         while IFS= read -r -d '' src
         do
-            dst="${src//\.\/gpu/\.\/gpu-rocm}"
+            dst="${src//\.\/gpu-tmp/\.\/gpu-rocm}"
             hipify-perl -o="$dst.tmp" "$src" &
-        done <   <(find ./gpu -name "*.$ext" -print0)
+        done <   <(find ./gpu-tmp -name "*.$ext" -print0)
     done
     wait
 
@@ -45,6 +74,12 @@ function hipify_dir()
         mv "$src" "$dst"
     done <   <(find ./gpu-rocm -name "*.cu.tmp" -print0)
 
+    if [ -d ./gpu-tmp ]; then
+        #Clearing out any leftover files and directories
+        echo "Removing ./gpu-tmp"
+        rm -rf ./gpu-tmp
+    fi
+
     # replace header include statements "<faiss/gpu/" with "<faiss/gpu-rocm"
     # replace thrust::cuda::par with thrust::hip::par
     # adjust header path location for hipblas.h to avoid unnecessary deprecation warnings
diff --git a/faiss/gpu/impl/Distance.cu b/faiss/gpu/impl/Distance.cu
index eb2e91e93e..9d943b15c9 100644
--- a/faiss/gpu/impl/Distance.cu
+++ b/faiss/gpu/impl/Distance.cu
@@ -504,8 +504,6 @@ void runAllPairwiseL2Distance(
             outDistances);
 }
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runAllPairwiseL2Distance(
         GpuResources* res,
         cudaStream_t stream,
@@ -526,7 +524,6 @@ void runAllPairwiseL2Distance(
             queriesRowMajor,
             outDistances);
 }
-#endif // USE_AMD_ROCM
 
 void runAllPairwiseIPDistance(
         GpuResources* res,
@@ -568,8 +565,6 @@ void runAllPairwiseIPDistance(
             outDistances);
 }
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runAllPairwiseIPDistance(
         GpuResources* res,
         cudaStream_t stream,
@@ -589,7 +584,6 @@ void runAllPairwiseIPDistance(
             queriesRowMajor,
             outDistances);
 }
-#endif // USE_AMD_ROCM
 
 void runL2Distance(
         GpuResources* res,
@@ -643,8 +637,6 @@ void runL2Distance(
             ignoreOutDistances);
 }
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runL2Distance(
         GpuResources* res,
         cudaStream_t stream,
@@ -670,7 +662,6 @@ void runL2Distance(
             outIndices,
             ignoreOutDistances);
 }
-#endif // USE_AMD_ROCM
 
 void runIPDistance(
         GpuResources* res,
@@ -716,8 +707,6 @@ void runIPDistance(
             outIndices);
 }
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runIPDistance(
         GpuResources* res,
         cudaStream_t stream,
@@ -739,7 +728,6 @@ void runIPDistance(
             outDistances,
             outIndices);
 }
-#endif // USE_AMD_ROCM
 
 } // namespace gpu
 } // namespace faiss
diff --git a/faiss/gpu/impl/Distance.cuh b/faiss/gpu/impl/Distance.cuh
index d8e1d5c239..07f2f028f2 100644
--- a/faiss/gpu/impl/Distance.cuh
+++ b/faiss/gpu/impl/Distance.cuh
@@ -41,8 +41,6 @@ void runAllPairwiseL2Distance(
         bool queriesRowMajor,
         Tensor<float, 2, true>& outDistances);
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runAllPairwiseL2Distance(
         GpuResources* res,
         cudaStream_t stream,
@@ -52,7 +50,6 @@ void runAllPairwiseL2Distance(
         Tensor<__nv_bfloat16, 2, true>& queries,
         bool queriesRowMajor,
         Tensor<float, 2, true>& outDistances);
-#endif // USE_AMD_ROCM
 
 void runAllPairwiseIPDistance(
         GpuResources* res,
@@ -72,8 +69,6 @@ void runAllPairwiseIPDistance(
         bool queriesRowMajor,
         Tensor<float, 2, true>& outDistances);
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runAllPairwiseIPDistance(
         GpuResources* res,
         cudaStream_t stream,
@@ -82,7 +77,6 @@ void runAllPairwiseIPDistance(
         Tensor<__nv_bfloat16, 2, true>& queries,
         bool queriesRowMajor,
         Tensor<float, 2, true>& outDistances);
-#endif // USE_AMD_ROCM
 
 /// Calculates brute-force L2 distance between `vectors` and
 /// `queries`, returning the k closest results seen
@@ -116,8 +110,6 @@ void runL2Distance(
         Tensor<idx_t, 2, true>& outIndices,
         bool ignoreOutDistances = false);
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runL2Distance(
         GpuResources* resources,
         cudaStream_t stream,
@@ -130,7 +122,6 @@ void runL2Distance(
         Tensor<float, 2, true>& outDistances,
         Tensor<idx_t, 2, true>& outIndices,
         bool ignoreOutDistances = false);
-#endif // USE_AMD_ROCM
 
 /// Calculates brute-force inner product distance between `vectors`
 /// and `queries`, returning the k closest results seen
@@ -156,8 +147,6 @@ void runIPDistance(
         Tensor<float, 2, true>& outDistances,
         Tensor<idx_t, 2, true>& outIndices);
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runIPDistance(
         GpuResources* resources,
         cudaStream_t stream,
@@ -168,7 +157,6 @@ void runIPDistance(
         int k,
         Tensor<float, 2, true>& outDistances,
         Tensor<idx_t, 2, true>& outIndices);
-#endif // USE_AMD_ROCM
 
 //
 // General distance implementation, assumes that all arguments are on the
diff --git a/faiss/gpu/impl/L2Norm.cu b/faiss/gpu/impl/L2Norm.cu
index 262fa19153..e76a0831ff 100644
--- a/faiss/gpu/impl/L2Norm.cu
+++ b/faiss/gpu/impl/L2Norm.cu
@@ -275,8 +275,6 @@ void runL2Norm(
     runL2Norm<half, half2>(input, inputRowMajor, output, normSquared, stream);
 }
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runL2Norm(
         Tensor<__nv_bfloat16, 2, true>& input,
         bool inputRowMajor,
@@ -286,7 +284,6 @@ void runL2Norm(
     runL2Norm<__nv_bfloat16, __nv_bfloat162>(
             input, inputRowMajor, output, normSquared, stream);
 }
-#endif
 
 } // namespace gpu
 } // namespace faiss
diff --git a/faiss/gpu/impl/L2Norm.cuh b/faiss/gpu/impl/L2Norm.cuh
index 79aef4f131..abd35fd120 100644
--- a/faiss/gpu/impl/L2Norm.cuh
+++ b/faiss/gpu/impl/L2Norm.cuh
@@ -27,15 +27,12 @@ void runL2Norm(
         bool normSquared,
         cudaStream_t stream);
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
 void runL2Norm(
         Tensor<__nv_bfloat16, 2, true>& input,
         bool inputRowMajor,
         Tensor<float, 1, true>& output,
         bool normSquared,
         cudaStream_t stream);
-#endif
 
 } // namespace gpu
 } // namespace faiss
diff --git a/faiss/gpu/utils/ConversionOperators.cuh b/faiss/gpu/utils/ConversionOperators.cuh
index f0ab1ea1fd..524b6ae21b 100644
--- a/faiss/gpu/utils/ConversionOperators.cuh
+++ b/faiss/gpu/utils/ConversionOperators.cuh
@@ -38,12 +38,9 @@ struct ConvertTo<float> {
     static inline __device__ float to(half v) {
         return __half2float(v);
     }
-
-#ifndef USE_AMD_ROCM
     static inline __device__ float to(__nv_bfloat16 v) {
         return __bfloat162float(v);
     }
-#endif // !USE_AMD_ROCM
 };
 
 template <>
@@ -96,9 +93,6 @@ struct ConvertTo<Half4> {
     }
 };
 
-// no bf16 support for AMD
-#ifndef USE_AMD_ROCM
-
 template <>
 struct ConvertTo<__nv_bfloat16> {
     static inline __device__ __nv_bfloat16 to(float v) {
@@ -112,8 +106,6 @@ struct ConvertTo<__nv_bfloat16> {
     }
 };
 
-#endif // USE_AMD_ROCM
-
 template <typename From, typename To>
 struct Convert {
     inline __device__ To operator()(From v) const {
diff --git a/faiss/gpu/utils/Float16.cuh b/faiss/gpu/utils/Float16.cuh
index 6a1f779eab..cfe405c794 100644
--- a/faiss/gpu/utils/Float16.cuh
+++ b/faiss/gpu/utils/Float16.cuh
@@ -12,25 +12,22 @@
 #include <faiss/gpu/utils/DeviceUtils.h>
 
 // Some compute capabilities have full float16 ALUs.
-#if __CUDA_ARCH__ >= 530 || defined(USE_AMD_ROCM)
+#if __CUDA_ARCH__ >= 530
 #define FAISS_USE_FULL_FLOAT16 1
 #endif // __CUDA_ARCH__ types
 
 // Some compute capabilities have full bfloat16 ALUs.
-// FIXME: no support in ROCm yet
-#if __CUDA_ARCH__ >= 800 // || defined(USE_AMD_ROCM)
+#if __CUDA_ARCH__ >= 800 || defined(USE_AMD_ROCM)
 #define FAISS_USE_FULL_BFLOAT16 1
 #endif // __CUDA_ARCH__ types
 
-#include <cuda_fp16.h>
 #if !defined(USE_AMD_ROCM)
 #include <cuda_bf16.h>
-#endif
-// #else
-//  FIXME: no support in ROCm yet
-//  #include <amd_hip_bf16.h>
-//  #include <amd_hip_fp16.h>
-// #endif // !defined(USE_AMD_ROCM)
+#include <cuda_fp16.h>
+#else
+#include <hip/hip_bf16.h>
+#include <hip/hip_fp16.h>
+#endif // !defined(USE_AMD_ROCM)
 
 namespace faiss {
 namespace gpu {
diff --git a/faiss/gpu/utils/MathOperators.cuh b/faiss/gpu/utils/MathOperators.cuh
index 9239c735f6..ce1e234a07 100644
--- a/faiss/gpu/utils/MathOperators.cuh
+++ b/faiss/gpu/utils/MathOperators.cuh
@@ -556,8 +556,6 @@ struct Math<Half8> {
     }
 };
 
-#ifndef USE_AMD_ROCM
-
 template <>
 struct Math<__nv_bfloat16> {
     typedef __nv_bfloat16 ScalarType;
@@ -626,7 +624,7 @@ struct Math<__nv_bfloat16> {
     }
 
     static inline __device__ __nv_bfloat16 zero() {
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= 9000 || defined(USE_AMD_ROCM)
         return 0.0f;
 #else
         __nv_bfloat16 h;
@@ -789,7 +787,5 @@ struct Math<__nv_bfloat162> {
     }
 };
 
-#endif // !USE_AMD_ROCM
-
 } // namespace gpu
 } // namespace faiss
diff --git a/faiss/gpu/utils/MatrixMult-inl.cuh b/faiss/gpu/utils/MatrixMult-inl.cuh
index 2c85d7244d..b28ea56dab 100644
--- a/faiss/gpu/utils/MatrixMult-inl.cuh
+++ b/faiss/gpu/utils/MatrixMult-inl.cuh
@@ -32,11 +32,10 @@ struct GetCudaType<half> {
     static constexpr hipblasDatatype_t Type = HIPBLAS_R_16F;
 };
 
-// FIXME: no AMD support for bf16
-// template <>
-// struct GetCudaType<__nv_bfloat16> {
-//     static constexpr hipblasDatatype_t Type = HIPBLAS_R_16B;
-// };
+template <>
+struct GetCudaType<__hip_bfloat16> {
+    static constexpr hipblasDatatype_t Type = HIPBLAS_R_16B;
+};
 
 #else