diff --git a/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink.c b/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink.c index 8adc4d87a..f6d375f2b 100644 --- a/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink.c +++ b/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink.c @@ -242,12 +242,23 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance) if (*pInstance == NULL) { printf("LoadLibrary \"%s\" failed!\n", __CudaLibName); - return CUDA_ERROR_UNKNOWN; + exit(EXIT_FAILURE); } return CUDA_SUCCESS; } +CUresult GET_DRIVER_HANDLE(CUDADRIVER *pInstance) +{ + *pInstance = GetModuleHandle(__CudaLibName); + if (*pInstance) { + return CUDA_SUCCESS; + } + else { + return CUDA_ERROR_UNKNOWN; + } +} + #define GET_PROC_EX(name, alias, required) \ alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \ if (alias == NULL && required) { \ @@ -269,6 +280,13 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance) return CUDA_ERROR_UNKNOWN; \ } +#define GET_PROC_ERROR_FUNCTIONS(name, alias, required) \ + alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \ + if (alias == NULL && required) { \ + printf("Failed to find error function \"%s\" in %s\n", #name, __CudaLibName); \ + exit(EXIT_FAILURE); \ + } + #elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || defined(__MACOSX) #include @@ -293,12 +311,23 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance) if (*pInstance == NULL) { printf("dlopen \"%s\" failed!\n", __CudaLibName); - return CUDA_ERROR_UNKNOWN; + exit(EXIT_FAILURE); } return CUDA_SUCCESS; } +CUresult GET_DRIVER_HANDLE(CUDADRIVER *pInstance) +{ + *pInstance = dlopen(__CudaLibName, RTLD_LAZY); + if (*pInstance) { + return CUDA_SUCCESS; + } + else { + return CUDA_ERROR_UNKNOWN; + } +} + #define GET_PROC_EX(name, alias, required) \ alias = (t##name *)dlsym(CudaDrvLib, #name); \ if (alias == NULL && required) { \ @@ -320,6 +349,13 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance) return CUDA_ERROR_UNKNOWN; \ } +#define GET_PROC_ERROR_FUNCTIONS(name, alias, required) \ + alias = (t##name *)dlsym(CudaDrvLib, #name); \ + if (alias == NULL && required) { \ + printf("Failed to find error function \"%s\" in %s\n", #name, __CudaLibName); \ + exit(EXIT_FAILURE); \ + } + #else #error unsupported platform #endif @@ -338,11 +374,19 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance) #define GET_PROC_V2(name) GET_PROC_EX_V2(name, name, 1) #define GET_PROC_V3(name) GET_PROC_EX_V3(name, name, 1) +CUresult INIT_ERROR_FUNCTIONS(void) +{ + CUDADRIVER CudaDrvLib; + CUresult result = CUDA_SUCCESS; + result = GET_DRIVER_HANDLE(&CudaDrvLib); + GET_PROC_ERROR_FUNCTIONS(cuGetErrorString, cuGetErrorString, 1); + return result; +} + CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) { CUDADRIVER CudaDrvLib; int driverVer = 1000; - CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib)); // cuInit is required; alias it to _cuInit @@ -619,6 +663,5 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) GET_PROC(cuGraphicsD3D9RegisterResource); #endif } - return CUDA_SUCCESS; } diff --git a/Samples/0_Introduction/matrixMulDynlinkJIT/helper_cuda_drvapi.h b/Samples/0_Introduction/matrixMulDynlinkJIT/helper_cuda_drvapi.h index 7c61ff2a4..befd53bb2 100644 --- a/Samples/0_Introduction/matrixMulDynlinkJIT/helper_cuda_drvapi.h +++ b/Samples/0_Introduction/matrixMulDynlinkJIT/helper_cuda_drvapi.h @@ -42,11 +42,21 @@ inline int ftoi(float value) { return (value >= 0 ? static_cast(value + 0.5 #ifndef checkCudaErrors #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__) +extern "C" CUresult INIT_ERROR_FUNCTIONS(void); + // These are the inline versions for all of the SDK helper functions inline void __checkCudaErrors(CUresult err, const char *file, const int line) { if (CUDA_SUCCESS != err) { const char *errorStr = NULL; + + if (!cuGetErrorString) { + CUresult result = INIT_ERROR_FUNCTIONS(); + if (result != CUDA_SUCCESS) { + printf("CUDA driver API failed"); + exit(EXIT_FAILURE); + } + } cuGetErrorString(err, &errorStr); fprintf(stderr, "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, " diff --git a/Samples/3_CUDA_Features/CMakeLists.txt b/Samples/3_CUDA_Features/CMakeLists.txt index d7416b242..43df5b289 100644 --- a/Samples/3_CUDA_Features/CMakeLists.txt +++ b/Samples/3_CUDA_Features/CMakeLists.txt @@ -21,4 +21,5 @@ add_subdirectory(newdelete) add_subdirectory(ptxjit) add_subdirectory(simpleCudaGraphs) add_subdirectory(tf32TensorCoreGemm) +add_subdirectory(trimNotification) add_subdirectory(warpAggregatedAtomicsCG) diff --git a/Samples/3_CUDA_Features/trimNotification/CMakeLists.txt b/Samples/3_CUDA_Features/trimNotification/CMakeLists.txt new file mode 100644 index 000000000..d27bc2ccf --- /dev/null +++ b/Samples/3_CUDA_Features/trimNotification/CMakeLists.txt @@ -0,0 +1,33 @@ +cmake_minimum_required(VERSION 3.20) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules") + +project(trimNotification LANGUAGES C CXX CUDA) + +find_package(CUDAToolkit REQUIRED) + +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 110 120) +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") +if(ENABLE_CUDA_DEBUG) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets) +else() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option) +endif() + +# Include directories and libraries +include_directories(../../../Common) + +# Source file +add_executable(trimNotification trimNotification.cpp) + +target_include_directories(trimNotification PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) + +target_link_libraries(trimNotification PUBLIC CUDA::cudart) + +target_compile_options(trimNotification PRIVATE $<$:--extended-lambda>) + +target_compile_features(trimNotification PRIVATE cxx_std_20 cuda_std_20) + +set_target_properties(trimNotification PROPERTIES CUDA_SEPARABLE_COMPILATION ON) diff --git a/Samples/3_CUDA_Features/trimNotification/README.md b/Samples/3_CUDA_Features/trimNotification/README.md new file mode 100644 index 000000000..edf4168dd --- /dev/null +++ b/Samples/3_CUDA_Features/trimNotification/README.md @@ -0,0 +1,33 @@ +# trimNotification - Trim Notification + +## Description + +This example demonstrates use of cudaDeviceRegisterAsyncNotification to manage memory in CUDA. A GPU running in WDDM mode is required for this sample. + +## Key Concepts + +Memory Management + +## Supported SM Architectures + +## Supported OSes + +Windows + +## Supported CPU Architecture + +x86_64 + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaSetDevice, cudaDeviceRegisterAsyncNotification, cudaDeviceUnregisterAsyncNotification, cudaMalloc, cudaFree + +## Dependencies needed to build/run + +## Prerequisites + +Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## References (for more details) diff --git a/Samples/3_CUDA_Features/trimNotification/trimNotification.cpp b/Samples/3_CUDA_Features/trimNotification/trimNotification.cpp new file mode 100644 index 000000000..3033fee0f --- /dev/null +++ b/Samples/3_CUDA_Features/trimNotification/trimNotification.cpp @@ -0,0 +1,123 @@ +/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +constexpr uint32_t memblockSize = 40 * 1024 * 1024; +constexpr uint32_t stepTime = 10; // The time to wait between allocation steps in ms +constexpr uint64_t maxSteps = 500; + +// This sample uses the cudaDeviceRegisterAsyncNotification API to allocate as much video memory as it can. +// A notification to free some video memory is sent whenever the application goes over its budget. If multiple +// instances of this program are launched, all instances will likely end up with similar budgets and in +// steady-state will have similar amounts of video memory allocated. + +int main() +{ + // This structure will be our 'userData' to pass into the callback. This will give the + // callback state that it can affect without needing to use globals. + struct MemoryInfo + { + std::vector memblocks; + std::mutex mutex; + bool allowedToGrow = true; + } memoryInfo; + + // The function that we will register as a callback. This could also be a standalone function + // instead of a lambda, but, since it is only referenced here, we can use a lambda to avoid + // using the global namespace. + cudaAsyncCallback basicUserCallback = + [](cudaAsyncNotificationInfo_t *notificationInfo, void *userData, cudaAsyncCallbackHandle_t callback) { + MemoryInfo *memoryInfo = (MemoryInfo *)userData; + + // Must check the type before accessing the info member of cudaAsyncNotificationInfo_t. + // Otherwise, we could misinterpret notificationInfo->info if a different type of + // notification is sent. + if (notificationInfo->type == cudaAsyncNotificationTypeOverBudget) { + printf("Asked to free %lld bytes of video memory\n", notificationInfo->info.overBudget.bytesOverBudget); + uint64_t numBlocksToFree = (notificationInfo->info.overBudget.bytesOverBudget / memblockSize) + 1; + + // The async notification will be on a separate thread, so shared data should be synchronized. + std::scoped_lock lock(memoryInfo->mutex); + memoryInfo->allowedToGrow = false; + + // Free the required number of memblocks + std::span blocksToFree(memoryInfo->memblocks.end() - numBlocksToFree, numBlocksToFree); + + for (auto &block : blocksToFree) { + cudaFree(block); + block = nullptr; + } + + // Shrink the vector to remove the freed blocks + std::erase(memoryInfo->memblocks, nullptr); + } + }; + + // Initialize CUDA + const int cudaDevice = 0; + cudaSetDevice(cudaDevice); + + // This callback handle is an opaque object that can be used to unregister the notification via + // cudaDeviceUnregisterAsyncNotification and to identify which callback registration a given + // notification corresponds to. + cudaAsyncCallbackHandle_t callback; + cudaDeviceRegisterAsyncNotification(cudaDevice, basicUserCallback, (void *)&memoryInfo, &callback); + + // Attempt to allocate a block of memory, then sleep before repeating. + uint64_t stepCounter = 0; + while (stepCounter < maxSteps) { + { + // The async notification will be on a separate thread, so shared data should be synchronized. + std::scoped_lock lock(memoryInfo.mutex); + + if (memoryInfo.allowedToGrow) { + int *newMemblock; + cudaError_t cudaStatus = cudaMalloc((void **)&newMemblock, memblockSize); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaMalloc failed!"); + } + memoryInfo.memblocks.push_back(newMemblock); + } + } + + std::this_thread::sleep_for(std::chrono::milliseconds(stepTime)); + stepCounter++; + } + + printf("Current memory allocated: %lld MB\n", memoryInfo.memblocks.size() * memblockSize / 1024 / 1024); + + // Unregister callback handle in application cleanup + cudaDeviceUnregisterAsyncNotification(cudaDevice, callback); + + return 0; +}