From d6fe782e05d501c7d14fe0bdc1c86944c8df3305 Mon Sep 17 00:00:00 2001 From: Burlen Loring Date: Mon, 14 Aug 2023 13:53:22 -0700 Subject: [PATCH] test add reproducer for invalid opcode perlmutter --- test/CMakeLists.txt | 15 ++ test/test_hamr_openmp_cuda_interop.cpp | 297 +++++++++++++++++++++++++ 2 files changed, 312 insertions(+) create mode 100644 test/test_hamr_openmp_cuda_interop.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a301cc5..017e0bf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -102,6 +102,21 @@ if (HAMR_ENABLE_OPENMP) set_tests_properties(test_hamr_openmp_allocator PROPERTIES ENVIRONMENT HAMR_VERBOSE=1) endif() +if (HAMR_ENABLE_OPENMP AND HAMR_ENABLE_CUDA AND HAMR_NVHPC_CUDA) + #NOTE: nvhpc is the only compiler that can do cuda and openmp in the same translation unit + # this test will have to be refactored for clang into multiple source files + if (NOT HAMR_NVHPC_CUDA) + set_source_files_properties(test_hamr_openmp_cuda_interop.cpp PROPERTIES LANGUAGE CUDA) + endif() + add_executable(test_hamr_openmp_cuda_interop test_hamr_openmp_cuda_interop.cpp) + if (NOT HAMR_NVHPC_CUDA) + set_target_properties(test_hamr_openmp_cuda_interop PROPERTIES CUDA_ARCHITECTURES "${HAMR_CUDA_ARCHITECTURES}") + endif() + target_link_libraries(test_hamr_openmp_cuda_interop hamr) + add_test(NAME test_hamr_openmp_cuda_interop COMMAND test_hamr_openmp_cuda_interop 0 0) + set_tests_properties(test_hamr_openmp_cuda_interop PROPERTIES ENVIRONMENT HAMR_VERBOSE=1) +endif() + if (HAMR_ENABLE_PYTHON) add_test(NAME test_hamr_buffer_numpy_host diff --git a/test/test_hamr_openmp_cuda_interop.cpp b/test/test_hamr_openmp_cuda_interop.cpp new file mode 100644 index 0000000..c7e5e89 --- /dev/null +++ b/test/test_hamr_openmp_cuda_interop.cpp @@ -0,0 +1,297 @@ +#include "hamr_config.h" +#include "hamr_buffer.h" +#include "hamr_buffer_allocator.h" +#include "hamr_buffer_util.h" + +#include +#include +#include + +#include +#include + +using allocator = hamr::buffer_allocator; +using transfer = hamr::buffer_transfer; + +void example1(size_t nElem, int devId) +{ + // OpenMP allocates this array on device memory + omp_set_default_device(devId); + double *devPtr = (double*)malloc(nElem*sizeof(double)); + #pragma omp target enter data map(alloc: devPtr[0:nElem]) + + // OpenMP initializes the memory on the device + #pragma omp target teams distribute \ + parallel for map(alloc: devPtr[0:nElem]) + for (size_t i = 0; i < nElem; ++i) + devPtr[i] = -3.14; + + // zero-copy construct with the device pointer + hamr::buffer *simData; + #pragma omp target data use_device_addr(devPtr) + { + simData = new hamr::buffer(allocator::openmp, hamr::stream(), + nElem, devId, devPtr, 0); + } + + // do something with the buffer + simData->print(); + + // delete the array + delete simData; + + // now it is safe to deallocate the device memory + #pragma omp target exit data map(release: devPtr[0:nElem]) +} + + +void example2(size_t nElem, int devId) +{ + // allocate device memory + omp_set_default_device(devId); + double *devPtr = (double*)omp_target_alloc(nElem*sizeof(double), devId); + + // wrap it in a shared pointer so it is eventually deallocated + std::shared_ptr spDev(devPtr, + [devId](double *ptr){ omp_target_free(ptr, devId); }); + + // initialize the array on the device + #pragma omp target teams distribute parallel for is_device_ptr(devPtr) + for (size_t i = 0; i < nElem; ++i) + devPtr[i] = -3.14; + + // zero-copy construct with coordinated life cycle management + auto simData = hamr::buffer(allocator::openmp, hamr::stream(), + nElem, devId, spDev); + + // do something with the buffer + simData.print(); +} + +void example3(size_t nElem, int srcDev, int destDev) +{ + // allocate device memory + omp_set_default_device(srcDev); + double *devPtr = (double*)omp_target_alloc(nElem*sizeof(double), srcDev); + + // initialize + #pragma omp target teams distribute parallel for is_device_ptr(devPtr) + for (size_t i = 0; i < nElem; ++i) + devPtr[i] = -3.14; + + // zero-copy construct from a device pointer, and take ownership + auto simData = hamr::buffer(allocator::openmp, hamr::stream(), + nElem, srcDev, devPtr, 1); + + // move to destDev in place + omp_set_default_device(destDev); + simData.move(allocator::openmp); + + // do something with the buffer + simData.print(); +} + +void example4(size_t nElem, int srcDev, int destDev) +{ + // allocate and value initialize on one device using OpenMP + omp_set_default_device(srcDev); + auto simData = hamr::buffer(allocator::openmp, hamr::stream(), + nElem, -3.14); + + // deep-copy to another device using CUDA + cudaSetDevice(destDev); + auto dataCpy = hamr::buffer(allocator::openmp, simData); + + // make sure movement is complete before using + dataCpy.synchronize(); + + // do something with the data + dataCpy.print(); +} + + + +hamr::buffer +add_arrays_mp(int dev, hamr::buffer &a1, hamr::buffer &a2) +{ + // get a view of the incoming data on the device we will use + omp_set_default_device(dev); + +#if defined(STRUCTURED_BINDING) + auto [spa1, pa1] = hamr::get_openmp_accessible(a1); + auto [spa2, pa2] = hamr::get_openmp_accessible(a2); +#else + auto spa1 = a1.get_openmp_accessible(); + auto pa1 = spa1.get(); + + auto spa2 = a2.get_openmp_accessible(); + auto pa2 = spa2.get(); +#endif + + // allocate space for the result + size_t nElem = a1.size(); + + auto a3 = hamr::buffer(allocator::openmp, nElem); + + // direct access to the result since we know it is in place + auto pa3 = a3.data(); + + // do the calculation + #pragma omp target teams distribute parallel for is_device_ptr(pa1, pa2) + for (size_t i = 0; i < nElem; ++i) + pa3[i] = pa2[i] + pa1[i]; + + return a3; +} + +void example5(size_t nElem, int dev1, int dev2) +{ + // this data is located in host main memory + auto a1 = hamr::buffer(allocator::malloc, nElem, 1.0); + + // this data is located in device 1 main memory + omp_set_default_device(dev1); + auto a2 = hamr::buffer(allocator::openmp, hamr::stream(), + transfer::async, nElem, 2.0); + + // do the calculation on device 2 + auto a3 = add_arrays_mp(dev2, a1, a2); + + // do something with the result + a3.print(); +} + + +namespace libA { +__global__ +void add(double *a3, const double *a1, const double *a2, size_t n) +{ + int i = threadIdx.x + blockIdx.x*blockDim.x; + if (i >= n) return; + a3[i] = a1[i] + a2[i]; + //printf("a3[%d]=%g a1[%d]=%g a2[%d]=%g \n", i, a3[i], i, a1[i], i, a2[i]); +} + +hamr::buffer +Add(int dev, const hamr::buffer &a1, const hamr::buffer &a2) +{ + // use this stream for the calculation + cudaStream_t strm = hamr::stream(); + + // get a view of the incoming data on the device we will use + cudaSetDevice(dev); + +#if defined(STRUCTURED_BINDING) + auto [spa1, pa1] = hamr::get_cuda_accessible(a1); + auto [spa2, pa2] = hamr::get_cuda_accessible(a2); +#else + auto spa1 = a1.get_openmp_accessible(); + auto pa1 = spa1.get(); + + auto spa2 = a2.get_openmp_accessible(); + auto pa2 = spa2.get(); +#endif + + // allocate space for the result + size_t nElem = a1.size(); + + auto a3 = hamr::buffer(allocator::cuda_async, strm, transfer::async, nElem); + + // direct access to the result since we know it is in place + auto pa3 = a3.data(); + + // make sure the data in flight, if it was moved, has arrived + a1.synchronize(); + a2.synchronize(); + + // do the calculation + int threads = 128; + int blocks = nElem / threads + ( nElem % threads ? 1 : 0 ); + add<<>>(pa3, pa1, pa2, nElem); + + return a3; +} +} + +namespace libB { +void Write(std::ofstream &ofs, hamr::buffer &a) +{ + // get a view of the data on the host + auto [spA, pA] = hamr::get_host_accessible(a); + + // make sure the data if moved has arrived + a.synchronize(); + + // send the data to the file + size_t nElem = a.size(); + for (size_t i = 0; i < nElem; ++i) + ofs << pA[i] << " "; + ofs << std::endl; +} +} + +void example6(size_t nElem, int dev1, int dev2) +{ + // this data is located in host memory, initialized to 1 + auto a1 = hamr::buffer(allocator::malloc, hamr::stream(), + transfer::async, nElem, 1.0); + + // this data is located in device 1 memory, unitialized + omp_set_default_device(dev1); + auto a2 = hamr::buffer(allocator::openmp, hamr::stream(), + transfer::async, nElem, 1.0); + + // initialize with OpenMP target offload + auto pA2 = a2.data(); + + #pragma omp target teams distribute parallel for is_device_ptr(pA2) + for (size_t i = 0; i < nElem; ++i) + pA2[i] = 2.0; + + // pass data to libA for the calculations + auto a3 = libA::Add(dev2, a1, a2); + + // pass data to libB for I/O + auto ofile = std::ofstream("data.txt"); + libB::Write(ofile, a1); + libB::Write(ofile, a2); + libB::Write(ofile, a3); + ofile.close(); +} + + + + + +int main(int argc, char **argv) +{ + if (argc != 3) + { + std::cerr << "usage: test_openmp_cuda_interop [device id] [device id]" << std::endl; + return -1; + } + + size_t nElem = 64; + int dev = atoi(argv[1]); + int destDev = atoi(argv[2]); + + std::cerr << "zero-copy construct manual life cycle management ... " << std::endl; + example1(nElem, dev); + + std::cerr << "zero-copy construct automatic life cycle management ... " << std::endl; + example2(nElem, dev); + + std::cerr << "move in place ..." << std::endl; + example3(nElem, dev, destDev); + + std::cerr << "deep copy construct on another device ... " << std::endl; + example4(nElem, dev, destDev); + + std::cerr << "add two arrays OpenMP ... " << std::endl; + example5(nElem, dev, destDev); + + std::cerr << "add two arrays OpenMP CUDA interop ... " << std::endl; + example6(nElem, dev, destDev); + + return 0; +}