Skip to content

Commit

Permalink
Fix #194 and add Large Kernel Parameters Sample
Browse files Browse the repository at this point in the history
  • Loading branch information
rnertney committed May 31, 2023
1 parent e612904 commit 8004ad5
Show file tree
Hide file tree
Showing 20 changed files with 924 additions and 51 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This ve
This section describes the release notes for the CUDA Samples on GitHub only.

### CUDA 12.1
* Added JIT LTO Sample
* Added Large Kernel Parameters Sample

### [older versions...](./CHANGELOG.md)

Expand Down
17 changes: 2 additions & 15 deletions Samples/4_CUDA_Libraries/cudaNvSciNvMedia/cuda_consumer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,6 @@ static void cudaImportNvSciImage(cudaExternalResInterop &cudaExtResObj,
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_Layout;
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneBitsPerPixel;
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneOffset;
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlanePitch;
pairArrayOut[numAttrs++].key = NvSciBufImageAttrKey_PlaneAlignedHeight;

uint32_t planePitchs[10];
uint32_t planePixel[10];
uint32_t planeAlignedHeight[10];

checkNvSciErrors(NvSciBufAttrListGetAttrs(attrlist, pairArrayOut, numAttrs));

Expand All @@ -183,13 +177,6 @@ static void cudaImportNvSciImage(cudaExternalResInterop &cudaExtResObj,
cudaExtResObj.planeCount * sizeof(int32_t));
memcpy(cudaExtResObj.planeOffset, (uint64_t *)pairArrayOut[7].value,
cudaExtResObj.planeCount * sizeof(uint64_t));
memcpy(planePixel, (uint32_t *)pairArrayOut[6].value,
cudaExtResObj.planeCount * sizeof(uint32_t));
memcpy(planePitchs, (uint32_t *)pairArrayOut[8].value,
cudaExtResObj.planeCount * sizeof(uint32_t));
memcpy(planeAlignedHeight, (uint32_t *)pairArrayOut[9].value,
cudaExtResObj.planeCount * sizeof(uint32_t));


NvSciBufAttrValImageLayoutType layout =
*(NvSciBufAttrValImageLayoutType *)pairArrayOut[5].value;
Expand All @@ -214,8 +201,8 @@ static void cudaImportNvSciImage(cudaExternalResInterop &cudaExtResObj,
for (int i = 0; i < cudaExtResObj.planeCount; i++) {
cudaExtent extent = {};
memset(&extent, 0, sizeof(extent));
extent.width = planePitchs[i] / (planePixel[i] / 8);
extent.height = planeAlignedHeight[i];
extent.width = cudaExtResObj.imageWidth[i];
extent.height = cudaExtResObj.imageHeight[i];
extent.depth = 0;
cudaChannelFormatDesc desc;
switch (channelCount) {
Expand Down
6 changes: 3 additions & 3 deletions Samples/4_CUDA_Libraries/jitLto/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -384,10 +384,10 @@ else
@echo "Sample is ready - all dependencies have been met"
endif

jitlto.o:jitlto.cpp
jitLto.o:jitLto.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

jitLto: jitlto.o
jitLto: jitLto.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
Expand All @@ -398,7 +398,7 @@ run: build
testrun: build

clean:
rm -f jitLto jitlto.o
rm -f jitLto jitLto.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/jitLto

clobber: clean
251 changes: 251 additions & 0 deletions Samples/4_CUDA_Libraries/jitLto/jitLto.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <cuda.h>
#include <nvJitLink.h>
#include <nvrtc.h>

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>

// Launch configuration for the SAXPY test in main():
// NUM_BLOCKS * NUM_THREADS elements are processed, one per thread.
#define NUM_THREADS 128
#define NUM_BLOCKS 32

// Check an NVRTC call: on failure, print the stringized call and the NVRTC
// error string, then exit(1).
#define NVRTC_SAFE_CALL(x) \
do { \
nvrtcResult result = x; \
if (result != NVRTC_SUCCESS) { \
std::cerr << "\nerror: " #x " failed with error " \
<< nvrtcGetErrorString(result) << '\n'; \
exit(1); \
} \
} while(0)
// Check a CUDA driver API call: on failure, print the stringized call and
// the driver error name, then exit(1).
#define CUDA_SAFE_CALL(x) \
do { \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg; \
cuGetErrorName(result, &msg); \
std::cerr << "\nerror: " #x " failed with error " \
<< msg << '\n'; \
exit(1); \
} \
} while(0)
// Check an nvJitLink call: on failure, print the numeric error code, try to
// retrieve and print the linker's error log from handle h, then exit(1).
// Note: `result` is reused for the log-retrieval calls inside the branch.
#define NVJITLINK_SAFE_CALL(h,x) \
do { \
nvJitLinkResult result = x; \
if (result != NVJITLINK_SUCCESS) { \
std::cerr << "\nerror: " #x " failed with error " \
<< result << '\n'; \
size_t lsize; \
result = nvJitLinkGetErrorLogSize(h, &lsize); \
if (result == NVJITLINK_SUCCESS && lsize > 0) { \
char *log = (char*)malloc(lsize); \
result = nvJitLinkGetErrorLog(h, log); \
if (result == NVJITLINK_SUCCESS) { \
std::cerr << "error log: " << log << '\n'; \
free(log); \
} \
} \
exit(1); \
} \
} while(0)

// CUDA C++ source for the SAXPY kernel, compiled at runtime with NVRTC.
// NOTE: compute() is only *declared* here; its definition lives in
// lto_compute below and is resolved at link time by nvJitLink device LTO.
const char *lto_saxpy = " \n\
extern __device__ float compute(float a, float x, float y); \n\
\n\
extern \"C\" __global__ \n\
void saxpy(float a, float *x, float *y, float *out, size_t n) \n\
{ \n\
size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n\
if (tid < n) { \n\
out[tid] = compute(a, x[tid], y[tid]); \n\
} \n\
} \n";

// Separately-compiled definition of compute(); linking this with lto_saxpy
// exercises cross-translation-unit device LTO.
const char *lto_compute = " \n\
__device__ float compute(float a, float x, float y) { \n\
return a * x + y; \n\
} \n";

// compile code into LTOIR, returning the IR and its size
//
// code:   NUL-terminated CUDA C++ source string
// name:   program name used in NVRTC diagnostics (e.g. "lto_saxpy.cu")
// ltoIR / ltoIRSize: outputs; *ltoIR is allocated with new[] and owned by
//                    the caller (release with delete[])
//
// Any NVRTC failure prints a diagnostic (plus the compile log when
// compilation itself failed) and exits the process.
static void getLTOIR (const char *code, const char *name,
                      char **ltoIR, size_t *ltoIRSize)
{
  // Create an instance of nvrtcProgram with the code string.
  nvrtcProgram prog;
  NVRTC_SAFE_CALL(
    nvrtcCreateProgram(&prog,   // prog
                       code,    // buffer
                       name,    // name
                       0,       // numHeaders
                       NULL,    // headers
                       NULL));  // includeNames

  // specify that LTO IR should be generated for LTO operation
  const char *opts[] = {"-dlto",
                        "--relocatable-device-code=true"};
  nvrtcResult compileResult = nvrtcCompileProgram(prog,  // prog
                                                  2,     // numOptions
                                                  opts); // options
  // Obtain compilation log from the program (printed unconditionally so
  // warnings are visible even on success).
  size_t logSize;
  NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
  char *log = new char[logSize];
  NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
  std::cout << log << '\n';
  delete[] log;
  if (compileResult != NVRTC_SUCCESS) {
    // Release the program handle before exiting; previously it was leaked
    // on this error path.
    NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
    exit(1);
  }
  // Obtain generated LTO IR from the program.
  NVRTC_SAFE_CALL(nvrtcGetLTOIRSize(prog, ltoIRSize));
  *ltoIR = new char[*ltoIRSize];
  NVRTC_SAFE_CALL(nvrtcGetLTOIR(prog, *ltoIR));
  // Destroy the program.
  NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
}

int main(int argc, char *argv[])
{
char *ltoIR1;
char *ltoIR2;
size_t ltoIR1Size;
size_t ltoIR2Size;
// getLTOIR uses nvrtc to get the LTOIR.
// We could also use nvcc offline with -dlto -fatbin
// to generate the IR, but using nvrtc keeps the build simpler.
getLTOIR(lto_saxpy, "lto_saxpy.cu", &ltoIR1, &ltoIR1Size);
getLTOIR(lto_compute, "lto_compute.cu", &ltoIR2, &ltoIR2Size);

CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction kernel;
CUDA_SAFE_CALL(cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));

// Dynamically determine the arch to link for
int major = 0;
int minor = 0;
CUDA_SAFE_CALL(cuDeviceGetAttribute(&major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
int arch = major*10 + minor;
char smbuf[16];
memset(smbuf,0,16);
sprintf(smbuf, "-arch=sm_%d", arch);

// Load the generated LTO IR and link them together
nvJitLinkHandle handle;
const char *lopts[] = {"-lto", smbuf};
NVJITLINK_SAFE_CALL(handle, nvJitLinkCreate(&handle, 2, lopts));

NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR,
(void *)ltoIR1, ltoIR1Size, "lto_saxpy"));
NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR,
(void *)ltoIR2, ltoIR2Size, "lto_compute"));

// The call to nvJitLinkComplete causes linker to link together the two
// LTO IR modules, do optimization on the linked LTO IR,
// and generate cubin from it.
NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle));

// check error log
size_t logSize;
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLogSize(handle, &logSize));
if (logSize > 0) {
char *log = (char*)malloc(logSize+1);
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLog(handle, log));
std::cout << "Error log: " << log << std::endl;
free(log);
}

// get linked cubin
size_t cubinSize;
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize));
void *cubin = malloc(cubinSize);
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin));

NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle));
delete[] ltoIR1;
delete[] ltoIR2;

// cubin is linked, so now load it
CUDA_SAFE_CALL(cuModuleLoadData(&module, cubin));
CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy"));

// Generate input for execution, and create output buffers.
size_t n = NUM_THREADS * NUM_BLOCKS;
size_t bufferSize = n * sizeof(float);
float a = 5.1f;
float *hX = new float[n], *hY = new float[n], *hOut = new float[n];
for (size_t i = 0; i < n; ++i) {
hX[i] = static_cast<float>(i);
hY[i] = static_cast<float>(i * 2);
}
CUdeviceptr dX, dY, dOut;
CUDA_SAFE_CALL(cuMemAlloc(&dX, bufferSize));
CUDA_SAFE_CALL(cuMemAlloc(&dY, bufferSize));
CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize));
CUDA_SAFE_CALL(cuMemcpyHtoD(dX, hX, bufferSize));
CUDA_SAFE_CALL(cuMemcpyHtoD(dY, hY, bufferSize));
// Execute SAXPY.
void *args[] = { &a, &dX, &dY, &dOut, &n };
CUDA_SAFE_CALL(
cuLaunchKernel(kernel,
NUM_BLOCKS, 1, 1, // grid dim
NUM_THREADS, 1, 1, // block dim
0, NULL, // shared mem and stream
args, 0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize());
// Retrieve and print output.
CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize));

for (size_t i = 0; i < n; ++i) {
std::cout << a << " * " << hX[i] << " + " << hY[i]
<< " = " << hOut[i] << '\n';
}
// check last value to verify
if (hOut[n-1] == 29074.5) {
std::cout << "PASSED!\n";
} else {
std::cout << "values not expected?\n";
}
// Release resources.
CUDA_SAFE_CALL(cuMemFree(dX));
CUDA_SAFE_CALL(cuMemFree(dY));
CUDA_SAFE_CALL(cuMemFree(dOut));
CUDA_SAFE_CALL(cuModuleUnload(module));
CUDA_SAFE_CALL(cuCtxDestroy(context));
free(cubin);
delete[] hX;
delete[] hY;
delete[] hOut;
return 0;
}
2 changes: 1 addition & 1 deletion Samples/4_CUDA_Libraries/jitLto/jitLto_vs2017.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="jitlto.cpp" />
<ClCompile Include="jitLto.cpp" />

</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
Expand Down
2 changes: 1 addition & 1 deletion Samples/4_CUDA_Libraries/jitLto/jitLto_vs2019.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="jitlto.cpp" />
<ClCompile Include="jitLto.cpp" />

</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
Expand Down
2 changes: 1 addition & 1 deletion Samples/4_CUDA_Libraries/jitLto/jitLto_vs2022.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="jitlto.cpp" />
<ClCompile Include="jitLto.cpp" />

</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"configurations": [
{
"name": "Linux",
"includePath": [
"${workspaceFolder}/**",
"${workspaceFolder}/../../../Common"
],
"defines": [],
"compilerPath": "/usr/local/cuda/bin/nvcc",
"cStandard": "gnu17",
"cppStandard": "gnu++14",
"intelliSenseMode": "linux-gcc-x64",
"configurationProvider": "ms-vscode.makefile-tools"
}
],
"version": 4
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"recommendations": [
"nvidia.nsight-vscode-edition",
"ms-vscode.cpptools",
"ms-vscode.makefile-tools"
]
}
10 changes: 10 additions & 0 deletions Samples/6_Performance/LargeKernelParameter/.vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"configurations": [
{
"name": "CUDA C++: Launch",
"type": "cuda-gdb",
"request": "launch",
"program": "${workspaceFolder}/LargeKernelParameter"
}
]
}
Loading

0 comments on commit 8004ad5

Please sign in to comment.