Problem Description
When we use an indirect function call in the kernel function, as in:
rccl/src/device/common.h, line 348 (commit 53dcfcc)
RCCL limits the stack size to 512:
rccl/src/init.cc, line 1876 (commit 53dcfcc)
whereas the default is 1024 if you do not modify it via the hipDeviceSetLimit API.
At first I thought this modification was not that important, but it does have an impact on performance.
With a stack size of 512, collective communication on small data takes around 10 us.
With a stack size of 1024, however, it takes around 244 us, which is more than 20x the latency.
The code line was introduced in #684, but there is no explanation of the situation there.
Does anybody know why the stack size has an impact on kernel launch overhead?
I also made the following minimal working example, which you may use to reproduce the issue.
#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <hip/hip_runtime.h>

#define CHECK_HIP(res)                                                        \
  do {                                                                        \
    hipError_t err = (res);                                                   \
    if (err != hipSuccess) {                                                  \
      fprintf(stderr, "HIP Error (%s:%d): %s (%s)\n", __FILE__, __LINE__,     \
              hipGetErrorName(err), hipGetErrorString(err));                  \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

// Two trivial device functions that are only reachable through a
// function-pointer table, so the kernel contains an indirect call.
__device__ void subkernel0(int *a) { *a = 0xdeadbee0; }
__device__ void subkernel1(int *a) { *a = 0xdeadbee1; }
__device__ void (*subkernels[])(int *a){subkernel0, subkernel1};

__global__ void mainkernel(int *a) {
  subkernels[0](a);  // indirect call through the device function-pointer table
}

// Launches the kernel repeatedly and returns the average launch+sync time in ns.
size_t measure() {
  int warmup = 30, niter = 100;
  size_t elapsed = 0;
  for (int i = -warmup; i < niter; ++i) {
    int *a;
    CHECK_HIP(hipMalloc(&a, sizeof(int)));
    CHECK_HIP(hipMemset(a, 0, sizeof(int)));
    CHECK_HIP(hipDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    mainkernel<<<1, 1>>>(a);
    CHECK_HIP(hipDeviceSynchronize());
    auto end = std::chrono::high_resolution_clock::now();
    int b;
    CHECK_HIP(hipMemcpy(&b, a, sizeof(int), hipMemcpyDeviceToHost));
    if (b != 0xdeadbee0) {
      printf("Error: b = %x\n", b);
      exit(1);
    }
    CHECK_HIP(hipFree(a));
    if (i >= 0) {  // skip warmup iterations
      elapsed += std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
    }
  }
  return elapsed / niter;
}

size_t getStackSize() {
  size_t curStackSize;
  CHECK_HIP(hipDeviceGetLimit(&curStackSize, hipLimitStackSize));
  return curStackSize;
}

int main() {
  // Sweep the per-thread stack limit and report the average kernel launch time.
  for (int i = 500; i <= 700; ++i) {
    CHECK_HIP(hipDeviceSetLimit(hipLimitStackSize, i));
    printf("Stack size: %zu, Time: %zu ns\n", getStackSize(), measure());
    // printf("%zu,%zu\n", getStackSize(), measure());  // CSV-style output
  }
  return 0;
}
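The example should build with plain hipcc; the file name below is arbitrary and optimization flags may vary with your setup:

hipcc -O2 stack_size_bench.cpp -o stack_size_bench
./stack_size_bench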
Operating System
Ubuntu 20.04.5 LTS (Focal Fossa)
CPU
AMD EPYC 7413 24-Core Processor
GPU
AMD Instinct MI100
ROCm Version
ROCm 6.0.0
ROCm Component
No response
Steps to Reproduce
No response
(Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support
No response
Additional Information
No response
Thank you for the confirmation. We are aware of this issue on the MI100/MI200 series and are working on a fix. In the meantime, we recommend NOT using indirect function calls in RCCL.
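For anyone hitting the same problem in their own kernels, the workaround amounts to replacing the function-pointer dispatch with direct calls. A minimal sketch based on the example above (the switch-based dispatch is just an illustration, not RCCL's actual code):

__device__ void subkernel0(int *a) { *a = 0xdeadbee0; }
__device__ void subkernel1(int *a) { *a = 0xdeadbee1; }

// Direct dispatch: every callee is visible to the compiler, so no device
// function pointers are needed and stack usage can be determined statically.
__global__ void mainkernel_direct(int *a, int which) {
  switch (which) {
    case 0: subkernel0(a); break;
    case 1: subkernel1(a); break;
  }
}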