Skip to content

Commit

Permalink
ROCm support for bfloat16 (#4039)
Browse files Browse the repository at this point in the history
Summary:
Updated the hipify script to handle bfloat16 conversion and removed the code that had disabled bfloat16 support for ROCm.

Pull Request resolved: #4039

Reviewed By: junjieqi

Differential Revision: D66413190

Pulled By: asadoughi

fbshipit-source-id: d1564f87e3c3466ff929dfd639bd544318371148
  • Loading branch information
ItsPitt authored and facebook-github-bot committed Nov 25, 2024
1 parent d1ae64e commit 37f52dc
Show file tree
Hide file tree
Showing 10 changed files with 51 additions and 67 deletions.
5 changes: 0 additions & 5 deletions faiss/gpu/GpuDistance.cu
Original file line number Diff line number Diff line change
Expand Up @@ -402,16 +402,11 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
} else if (args.vectorType == DistanceDataType::F16) {
bfKnnConvert<half>(prov, args);
} else if (args.vectorType == DistanceDataType::BF16) {
// no bf16 support for AMD
#ifndef USE_AMD_ROCM
if (prov->getResources()->supportsBFloat16CurrentDevice()) {
bfKnnConvert<__nv_bfloat16>(prov, args);
} else {
FAISS_THROW_MSG("not compiled with bfloat16 support");
}
#else
FAISS_THROW_MSG("no AMD bfloat16 support");
#endif
} else {
FAISS_THROW_MSG("unknown vectorType");
}
Expand Down
43 changes: 39 additions & 4 deletions faiss/gpu/hipify.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,46 @@
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Usage: ./gpu/hipify.sh
#

function hipify_dir()
{
# print dir name
cd "$1" || exit
echo "Hipifying $(pwd)"

if [ -d ./gpu-tmp ]; then
#Clearing out any leftover files and directories
echo "Removing old ./gpu-tmp"
rm -rf ./gpu-tmp
fi

if [ -d ./gpu ]; then
#Making a temp directory to implement pre hipify rules
echo "Creating ./gpu-tmp"
cp -r ./gpu ./gpu-tmp

# adjust __nv_bfloat162 before hipify because of inaccurate conversions
# adjust __nv_bfloat16 before hipify because of inaccurate conversions
for ext in hip cuh h cpp c cu cuh
do
while IFS= read -r -d '' src
do
sed -i 's@__nv_bfloat162@__hip_bfloat162@' "$src"
sed -i 's@__nv_bfloat16@__hip_bfloat16@' "$src"
done < <(find ./gpu-tmp -name "*.$ext" -print0)
done
else
echo "Can't find the gpu/ dir"
exit
fi

# create all destination directories for hipified files into sibling 'gpu-rocm' directory
while IFS= read -r -d '' src
do
dst="${src//gpu/gpu-rocm}"
dst="${src//gpu-tmp/gpu-rocm}"

if [ -d $dst ]; then
#Clearing out any leftover files and directories
Expand All @@ -24,17 +53,17 @@ function hipify_dir()
#Making directories
echo "Creating $dst"
mkdir -p "$dst"
done < <(find ./gpu -type d -print0)
done < <(find ./gpu-tmp -type d -print0)

# run hipify-perl against all *.cu *.cuh *.h *.cpp files, no renaming
# run all files in parallel to speed up
for ext in cu cuh h cpp c
do
while IFS= read -r -d '' src
do
dst="${src//\.\/gpu/\.\/gpu-rocm}"
dst="${src//\.\/gpu-tmp/\.\/gpu-rocm}"
hipify-perl -o="$dst.tmp" "$src" &
done < <(find ./gpu -name "*.$ext" -print0)
done < <(find ./gpu-tmp -name "*.$ext" -print0)
done
wait

Expand All @@ -45,6 +74,12 @@ function hipify_dir()
mv "$src" "$dst"
done < <(find ./gpu-rocm -name "*.cu.tmp" -print0)

if [ -d ./gpu-tmp ]; then
#Clearing out any leftover files and directories
echo "Removing ./gpu-tmp"
rm -rf ./gpu-tmp
fi

# replace header include statements "<faiss/gpu/" with "<faiss/gpu-rocm"
# replace thrust::cuda::par with thrust::hip::par
# adjust header path location for hipblas.h to avoid unnecessary deprecation warnings
Expand Down
12 changes: 0 additions & 12 deletions faiss/gpu/impl/Distance.cu
Original file line number Diff line number Diff line change
Expand Up @@ -504,8 +504,6 @@ void runAllPairwiseL2Distance(
outDistances);
}

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runAllPairwiseL2Distance(
GpuResources* res,
cudaStream_t stream,
Expand All @@ -526,7 +524,6 @@ void runAllPairwiseL2Distance(
queriesRowMajor,
outDistances);
}
#endif // USE_AMD_ROCM

void runAllPairwiseIPDistance(
GpuResources* res,
Expand Down Expand Up @@ -568,8 +565,6 @@ void runAllPairwiseIPDistance(
outDistances);
}

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runAllPairwiseIPDistance(
GpuResources* res,
cudaStream_t stream,
Expand All @@ -589,7 +584,6 @@ void runAllPairwiseIPDistance(
queriesRowMajor,
outDistances);
}
#endif // USE_AMD_ROCM

void runL2Distance(
GpuResources* res,
Expand Down Expand Up @@ -643,8 +637,6 @@ void runL2Distance(
ignoreOutDistances);
}

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runL2Distance(
GpuResources* res,
cudaStream_t stream,
Expand All @@ -670,7 +662,6 @@ void runL2Distance(
outIndices,
ignoreOutDistances);
}
#endif // USE_AMD_ROCM

void runIPDistance(
GpuResources* res,
Expand Down Expand Up @@ -716,8 +707,6 @@ void runIPDistance(
outIndices);
}

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runIPDistance(
GpuResources* res,
cudaStream_t stream,
Expand All @@ -739,7 +728,6 @@ void runIPDistance(
outDistances,
outIndices);
}
#endif // USE_AMD_ROCM

} // namespace gpu
} // namespace faiss
12 changes: 0 additions & 12 deletions faiss/gpu/impl/Distance.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ void runAllPairwiseL2Distance(
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances);

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runAllPairwiseL2Distance(
GpuResources* res,
cudaStream_t stream,
Expand All @@ -52,7 +50,6 @@ void runAllPairwiseL2Distance(
Tensor<__nv_bfloat16, 2, true>& queries,
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances);
#endif // USE_AMD_ROCM

void runAllPairwiseIPDistance(
GpuResources* res,
Expand All @@ -72,8 +69,6 @@ void runAllPairwiseIPDistance(
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances);

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runAllPairwiseIPDistance(
GpuResources* res,
cudaStream_t stream,
Expand All @@ -82,7 +77,6 @@ void runAllPairwiseIPDistance(
Tensor<__nv_bfloat16, 2, true>& queries,
bool queriesRowMajor,
Tensor<float, 2, true>& outDistances);
#endif // USE_AMD_ROCM

/// Calculates brute-force L2 distance between `vectors` and
/// `queries`, returning the k closest results seen
Expand Down Expand Up @@ -116,8 +110,6 @@ void runL2Distance(
Tensor<idx_t, 2, true>& outIndices,
bool ignoreOutDistances = false);

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runL2Distance(
GpuResources* resources,
cudaStream_t stream,
Expand All @@ -130,7 +122,6 @@ void runL2Distance(
Tensor<float, 2, true>& outDistances,
Tensor<idx_t, 2, true>& outIndices,
bool ignoreOutDistances = false);
#endif // USE_AMD_ROCM

/// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen
Expand All @@ -156,8 +147,6 @@ void runIPDistance(
Tensor<float, 2, true>& outDistances,
Tensor<idx_t, 2, true>& outIndices);

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runIPDistance(
GpuResources* resources,
cudaStream_t stream,
Expand All @@ -168,7 +157,6 @@ void runIPDistance(
int k,
Tensor<float, 2, true>& outDistances,
Tensor<idx_t, 2, true>& outIndices);
#endif // USE_AMD_ROCM

//
// General distance implementation, assumes that all arguments are on the
Expand Down
3 changes: 0 additions & 3 deletions faiss/gpu/impl/L2Norm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -275,8 +275,6 @@ void runL2Norm(
runL2Norm<half, half2>(input, inputRowMajor, output, normSquared, stream);
}

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runL2Norm(
Tensor<__nv_bfloat16, 2, true>& input,
bool inputRowMajor,
Expand All @@ -286,7 +284,6 @@ void runL2Norm(
runL2Norm<__nv_bfloat16, __nv_bfloat162>(
input, inputRowMajor, output, normSquared, stream);
}
#endif

} // namespace gpu
} // namespace faiss
3 changes: 0 additions & 3 deletions faiss/gpu/impl/L2Norm.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,12 @@ void runL2Norm(
bool normSquared,
cudaStream_t stream);

// no bf16 support for AMD
#ifndef USE_AMD_ROCM
void runL2Norm(
Tensor<__nv_bfloat16, 2, true>& input,
bool inputRowMajor,
Tensor<float, 1, true>& output,
bool normSquared,
cudaStream_t stream);
#endif

} // namespace gpu
} // namespace faiss
8 changes: 0 additions & 8 deletions faiss/gpu/utils/ConversionOperators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,9 @@ struct ConvertTo<float> {
static inline __device__ float to(half v) {
return __half2float(v);
}

#ifndef USE_AMD_ROCM
static inline __device__ float to(__nv_bfloat16 v) {
return __bfloat162float(v);
}
#endif // !USE_AMD_ROCM
};

template <>
Expand Down Expand Up @@ -96,9 +93,6 @@ struct ConvertTo<Half4> {
}
};

// no bf16 support for AMD
#ifndef USE_AMD_ROCM

template <>
struct ConvertTo<__nv_bfloat16> {
static inline __device__ __nv_bfloat16 to(float v) {
Expand All @@ -112,8 +106,6 @@ struct ConvertTo<__nv_bfloat16> {
}
};

#endif // USE_AMD_ROCM

template <typename From, typename To>
struct Convert {
inline __device__ To operator()(From v) const {
Expand Down
17 changes: 7 additions & 10 deletions faiss/gpu/utils/Float16.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,22 @@
#include <faiss/gpu/utils/DeviceUtils.h>

// Some compute capabilities have full float16 ALUs.
#if __CUDA_ARCH__ >= 530 || defined(USE_AMD_ROCM)
#if __CUDA_ARCH__ >= 530
#define FAISS_USE_FULL_FLOAT16 1
#endif // __CUDA_ARCH__ types

// Some compute capabilities have full bfloat16 ALUs.
// FIXME: no support in ROCm yet
#if __CUDA_ARCH__ >= 800 // || defined(USE_AMD_ROCM)
#if __CUDA_ARCH__ >= 800 || defined(USE_AMD_ROCM)
#define FAISS_USE_FULL_BFLOAT16 1
#endif // __CUDA_ARCH__ types

#include <cuda_fp16.h>
#if !defined(USE_AMD_ROCM)
#include <cuda_bf16.h>
#endif
// #else
// FIXME: no support in ROCm yet
// #include <amd_hip_bf16.h>
// #include <amd_hip_fp16.h>
// #endif // !defined(USE_AMD_ROCM)
#include <cuda_fp16.h>
#else
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
#endif // !defined(USE_AMD_ROCM)

namespace faiss {
namespace gpu {
Expand Down
6 changes: 1 addition & 5 deletions faiss/gpu/utils/MathOperators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -556,8 +556,6 @@ struct Math<Half8> {
}
};

#ifndef USE_AMD_ROCM

template <>
struct Math<__nv_bfloat16> {
typedef __nv_bfloat16 ScalarType;
Expand Down Expand Up @@ -626,7 +624,7 @@ struct Math<__nv_bfloat16> {
}

static inline __device__ __nv_bfloat16 zero() {
#if CUDA_VERSION >= 9000
#if CUDA_VERSION >= 9000 || defined(USE_AMD_ROCM)
return 0.0f;
#else
__nv_bfloat16 h;
Expand Down Expand Up @@ -789,7 +787,5 @@ struct Math<__nv_bfloat162> {
}
};

#endif // !USE_AMD_ROCM

} // namespace gpu
} // namespace faiss
9 changes: 4 additions & 5 deletions faiss/gpu/utils/MatrixMult-inl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,10 @@ struct GetCudaType<half> {
static constexpr hipblasDatatype_t Type = HIPBLAS_R_16F;
};

// FIXME: no AMD support for bf16
// template <>
// struct GetCudaType<__nv_bfloat16> {
// static constexpr hipblasDatatype_t Type = HIPBLAS_R_16B;
// };
template <>
struct GetCudaType<__hip_bfloat16> {
static constexpr hipblasDatatype_t Type = HIPBLAS_R_16B;
};

#else

Expand Down

0 comments on commit 37f52dc

Please sign in to comment.