Skip to content
Open
51 changes: 47 additions & 4 deletions Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -242,12 +242,23 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)

if (*pInstance == NULL) {
printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
return CUDA_ERROR_UNKNOWN;
exit(EXIT_FAILURE);
}

return CUDA_SUCCESS;
}

CUresult GET_DRIVER_HANDLE(CUDADRIVER *pInstance)
{
    // Look up the handle of the already-loaded CUDA driver DLL. GetModuleHandle
    // does not bump the module's reference count, so no matching release is needed.
    *pInstance = GetModuleHandle(__CudaLibName);
    return (*pInstance != NULL) ? CUDA_SUCCESS : CUDA_ERROR_UNKNOWN;
}

#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \
if (alias == NULL && required) { \
Expand All @@ -269,6 +280,13 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
return CUDA_ERROR_UNKNOWN; \
}

// Resolve an error-reporting entry point (e.g. cuGetErrorString) from the
// driver DLL. Unlike GET_PROC_EX this aborts the process on failure instead of
// returning an error code, because it is invoked from inside the error-report
// path itself, where there is no caller left to handle a failure.
#define GET_PROC_ERROR_FUNCTIONS(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find error function \"%s\" in %s\n", #name, __CudaLibName); \
exit(EXIT_FAILURE); \
}

#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || defined(__MACOSX)

#include <dlfcn.h>
Expand All @@ -293,12 +311,23 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)

if (*pInstance == NULL) {
printf("dlopen \"%s\" failed!\n", __CudaLibName);
return CUDA_ERROR_UNKNOWN;
exit(EXIT_FAILURE);
}

return CUDA_SUCCESS;
}

CUresult GET_DRIVER_HANDLE(CUDADRIVER *pInstance)
{
    // Obtain a handle to the CUDA driver shared library. LOAD_LIBRARY has
    // normally mapped it already, so this dlopen resolves to the existing mapping.
    // NOTE(review): every successful dlopen bumps the library's reference count
    // with no matching dlclose — harmless for this one-shot sample, but worth
    // confirming if this path is ever invoked repeatedly.
    *pInstance = dlopen(__CudaLibName, RTLD_LAZY);
    return (*pInstance != NULL) ? CUDA_SUCCESS : CUDA_ERROR_UNKNOWN;
}

#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, #name); \
if (alias == NULL && required) { \
Expand All @@ -320,6 +349,13 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
return CUDA_ERROR_UNKNOWN; \
}

// Resolve an error-reporting entry point (e.g. cuGetErrorString) from the
// driver shared library. Unlike GET_PROC_EX this aborts the process on failure
// instead of returning an error code, because it is invoked from inside the
// error-report path itself, where there is no caller left to handle a failure.
#define GET_PROC_ERROR_FUNCTIONS(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find error function \"%s\" in %s\n", #name, __CudaLibName); \
exit(EXIT_FAILURE); \
}

#else
#error unsupported platform
#endif
Expand All @@ -338,11 +374,19 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name, name, 1)
#define GET_PROC_V3(name) GET_PROC_EX_V3(name, name, 1)

// Resolve the error-reporting driver entry points (currently cuGetErrorString)
// so that checkCudaErrors can translate CUresult codes into readable strings.
// Returns CUDA_SUCCESS on success, or the GET_DRIVER_HANDLE failure code.
CUresult INIT_ERROR_FUNCTIONS(void)
{
    CUDADRIVER CudaDrvLib;
    CUresult result = GET_DRIVER_HANDLE(&CudaDrvLib);
    if (result != CUDA_SUCCESS) {
        // Bug fix: the original fell through and performed symbol lookup on an
        // invalid (NULL) library handle when the driver could not be located.
        return result;
    }
    GET_PROC_ERROR_FUNCTIONS(cuGetErrorString, cuGetErrorString, 1);
    return CUDA_SUCCESS;
}

CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
{
CUDADRIVER CudaDrvLib;
int driverVer = 1000;

CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));

// cuInit is required; alias it to _cuInit
Expand Down Expand Up @@ -619,6 +663,5 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuGraphicsD3D9RegisterResource);
#endif
}

return CUDA_SUCCESS;
}
10 changes: 10 additions & 0 deletions Samples/0_Introduction/matrixMulDynlinkJIT/helper_cuda_drvapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,21 @@ inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5
#ifndef checkCudaErrors
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)

extern "C" CUresult INIT_ERROR_FUNCTIONS(void);

// These are the inline versions for all of the SDK helper functions
inline void __checkCudaErrors(CUresult err, const char *file, const int line)
{
if (CUDA_SUCCESS != err) {
const char *errorStr = NULL;

if (!cuGetErrorString) {
CUresult result = INIT_ERROR_FUNCTIONS();
if (result != CUDA_SUCCESS) {
printf("CUDA driver API failed");
exit(EXIT_FAILURE);
}
}
cuGetErrorString(err, &errorStr);
fprintf(stderr,
"checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
Expand Down
1 change: 1 addition & 0 deletions Samples/3_CUDA_Features/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@ add_subdirectory(newdelete)
add_subdirectory(ptxjit)
add_subdirectory(simpleCudaGraphs)
add_subdirectory(tf32TensorCoreGemm)
add_subdirectory(trimNotification)
add_subdirectory(warpAggregatedAtomicsCG)
33 changes: 33 additions & 0 deletions Samples/3_CUDA_Features/trimNotification/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
cmake_minimum_required(VERSION 3.20)

# Make the repository's shared CMake helper modules discoverable.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")

project(trimNotification LANGUAGES C CXX CUDA)

find_package(CUDAToolkit REQUIRED)

set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Target the full range of supported SM architectures (Turing through Blackwell).
set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 110 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()

# Include directories and libraries
include_directories(../../../Common)

# Source file
add_executable(trimNotification trimNotification.cpp)

target_include_directories(trimNotification PRIVATE ${CUDAToolkit_INCLUDE_DIRS})

target_link_libraries(trimNotification PUBLIC CUDA::cudart)

# Allow __device__ lambdas in .cu sources (no-op for the .cpp source here).
target_compile_options(trimNotification PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)

# C++20 is required (std::span, std::erase, std::scoped_lock CTAD).
target_compile_features(trimNotification PRIVATE cxx_std_20 cuda_std_20)

set_target_properties(trimNotification PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
33 changes: 33 additions & 0 deletions Samples/3_CUDA_Features/trimNotification/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# trimNotification - Trim Notification

## Description

This example demonstrates use of cudaDeviceRegisterAsyncNotification to manage memory in CUDA. A GPU running in WDDM mode is required for this sample.

## Key Concepts

Memory Management

## Supported SM Architectures

## Supported OSes

Windows

## Supported CPU Architecture

x86_64

## CUDA APIs involved

### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaSetDevice, cudaDeviceRegisterAsyncNotification, cudaDeviceUnregisterAsyncNotification, cudaMalloc, cudaFree

## Dependencies needed to build/run

## Prerequisites

Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.

## References (for more details)
123 changes: 123 additions & 0 deletions Samples/3_CUDA_Features/trimNotification/trimNotification.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cuda_runtime_api.h>
#include <mutex>
#include <span>
#include <stdio.h>
#include <thread>
#include <vector>

constexpr uint32_t memblockSize = 40 * 1024 * 1024; // Size of each device allocation (40 MiB)
constexpr uint32_t stepTime = 10; // The time to wait between allocation steps in ms
constexpr uint64_t maxSteps = 500; // Total allocation attempts before the sample exits

// This sample uses the cudaDeviceRegisterAsyncNotification API to allocate as much video memory as it can.
// A notification to free some video memory is sent whenever the application goes over its budget. If multiple
// instances of this program are launched, all instances will likely end up with similar budgets and in
// steady-state will have similar amounts of video memory allocated.

int main()
{
// This structure will be our 'userData' to pass into the callback. This will give the
// callback state that it can affect without needing to use globals.
struct MemoryInfo
{
std::vector<int *> memblocks;
std::mutex mutex;
bool allowedToGrow = true;
} memoryInfo;

// The function that we will register as a callback. This could also be a standalone function
// instead of a lambda, but, since it is only referenced here, we can use a lambda to avoid
// using the global namespace.
cudaAsyncCallback basicUserCallback =
[](cudaAsyncNotificationInfo_t *notificationInfo, void *userData, cudaAsyncCallbackHandle_t callback) {
MemoryInfo *memoryInfo = (MemoryInfo *)userData;

// Must check the type before accessing the info member of cudaAsyncNotificationInfo_t.
// Otherwise, we could misinterpret notificationInfo->info if a different type of
// notification is sent.
if (notificationInfo->type == cudaAsyncNotificationTypeOverBudget) {
printf("Asked to free %lld bytes of video memory\n", notificationInfo->info.overBudget.bytesOverBudget);
uint64_t numBlocksToFree = (notificationInfo->info.overBudget.bytesOverBudget / memblockSize) + 1;

// The async notification will be on a separate thread, so shared data should be synchronized.
std::scoped_lock lock(memoryInfo->mutex);
memoryInfo->allowedToGrow = false;

// Free the required number of memblocks
std::span<int *> blocksToFree(memoryInfo->memblocks.end() - numBlocksToFree, numBlocksToFree);

for (auto &block : blocksToFree) {
cudaFree(block);
block = nullptr;
}

// Shrink the vector to remove the freed blocks
std::erase(memoryInfo->memblocks, nullptr);
}
};

// Initialize CUDA
const int cudaDevice = 0;
cudaSetDevice(cudaDevice);

// This callback handle is an opaque object that can be used to unregister the notification via
// cudaDeviceUnregisterAsyncNotification and to identify which callback registration a given
// notification corresponds to.
cudaAsyncCallbackHandle_t callback;
cudaDeviceRegisterAsyncNotification(cudaDevice, basicUserCallback, (void *)&memoryInfo, &callback);

// Attempt to allocate a block of memory, then sleep before repeating.
uint64_t stepCounter = 0;
while (stepCounter < maxSteps) {
{
// The async notification will be on a separate thread, so shared data should be synchronized.
std::scoped_lock lock(memoryInfo.mutex);

if (memoryInfo.allowedToGrow) {
int *newMemblock;
cudaError_t cudaStatus = cudaMalloc((void **)&newMemblock, memblockSize);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
memoryInfo.memblocks.push_back(newMemblock);
}
}

std::this_thread::sleep_for(std::chrono::milliseconds(stepTime));
stepCounter++;
}

printf("Current memory allocated: %lld MB\n", memoryInfo.memblocks.size() * memblockSize / 1024 / 1024);

// Unregister callback handle in application cleanup
cudaDeviceUnregisterAsyncNotification(cudaDevice, callback);

return 0;
}