Skip to content
Open
51 changes: 47 additions & 4 deletions Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -242,12 +242,23 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)

if (*pInstance == NULL) {
printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
return CUDA_ERROR_UNKNOWN;
exit(EXIT_FAILURE);
}

return CUDA_SUCCESS;
}

CUresult GET_DRIVER_HANDLE(CUDADRIVER *pInstance)
{
    // Look up the handle of the already-loaded CUDA driver DLL. GetModuleHandle
    // does not bump the module's reference count, so no matching release is needed.
    *pInstance = GetModuleHandle(__CudaLibName);
    return (*pInstance != NULL) ? CUDA_SUCCESS : CUDA_ERROR_UNKNOWN;
}

#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \
if (alias == NULL && required) { \
Expand All @@ -269,6 +280,13 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
return CUDA_ERROR_UNKNOWN; \
}

// Resolve an error-reporting entry point (e.g. cuGetErrorString) from the
// driver DLL. Unlike GET_PROC_EX this aborts the process on failure instead of
// returning an error code, because it is invoked from inside the error-report
// path itself, where there is no caller left to handle a failure.
#define GET_PROC_ERROR_FUNCTIONS(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find error function \"%s\" in %s\n", #name, __CudaLibName); \
exit(EXIT_FAILURE); \
}

#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || defined(__MACOSX)

#include <dlfcn.h>
Expand All @@ -293,12 +311,23 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)

if (*pInstance == NULL) {
printf("dlopen \"%s\" failed!\n", __CudaLibName);
return CUDA_ERROR_UNKNOWN;
exit(EXIT_FAILURE);
}

return CUDA_SUCCESS;
}

CUresult GET_DRIVER_HANDLE(CUDADRIVER *pInstance)
{
    // Obtain a handle to the CUDA driver shared library. LOAD_LIBRARY has
    // normally mapped it already, so this dlopen resolves to the existing mapping.
    // NOTE(review): every successful dlopen bumps the library's reference count
    // with no matching dlclose — harmless for this one-shot sample, but worth
    // confirming if this path is ever invoked repeatedly.
    *pInstance = dlopen(__CudaLibName, RTLD_LAZY);
    return (*pInstance != NULL) ? CUDA_SUCCESS : CUDA_ERROR_UNKNOWN;
}

#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, #name); \
if (alias == NULL && required) { \
Expand All @@ -320,6 +349,13 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
return CUDA_ERROR_UNKNOWN; \
}

// Resolve an error-reporting entry point (e.g. cuGetErrorString) from the
// driver shared library. Unlike GET_PROC_EX this aborts the process on failure
// instead of returning an error code, because it is invoked from inside the
// error-report path itself, where there is no caller left to handle a failure.
#define GET_PROC_ERROR_FUNCTIONS(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find error function \"%s\" in %s\n", #name, __CudaLibName); \
exit(EXIT_FAILURE); \
}

#else
#error unsupported platform
#endif
Expand All @@ -338,11 +374,19 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name, name, 1)
#define GET_PROC_V3(name) GET_PROC_EX_V3(name, name, 1)

// Resolve the error-reporting driver entry points (currently cuGetErrorString)
// so that checkCudaErrors can translate CUresult codes into readable strings.
// Returns CUDA_SUCCESS on success, or the GET_DRIVER_HANDLE failure code.
CUresult INIT_ERROR_FUNCTIONS(void)
{
    CUDADRIVER CudaDrvLib;
    CUresult result = GET_DRIVER_HANDLE(&CudaDrvLib);
    if (result != CUDA_SUCCESS) {
        // Bug fix: the original fell through and performed symbol lookup on an
        // invalid (NULL) library handle when the driver could not be located.
        return result;
    }
    GET_PROC_ERROR_FUNCTIONS(cuGetErrorString, cuGetErrorString, 1);
    return CUDA_SUCCESS;
}

CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
{
CUDADRIVER CudaDrvLib;
int driverVer = 1000;

CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));

// cuInit is required; alias it to _cuInit
Expand Down Expand Up @@ -619,6 +663,5 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuGraphicsD3D9RegisterResource);
#endif
}

return CUDA_SUCCESS;
}
10 changes: 10 additions & 0 deletions Samples/0_Introduction/matrixMulDynlinkJIT/helper_cuda_drvapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,21 @@ inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5
#ifndef checkCudaErrors
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)

extern "C" CUresult INIT_ERROR_FUNCTIONS(void);

// These are the inline versions for all of the SDK helper functions
inline void __checkCudaErrors(CUresult err, const char *file, const int line)
{
if (CUDA_SUCCESS != err) {
const char *errorStr = NULL;

if (!cuGetErrorString) {
CUresult result = INIT_ERROR_FUNCTIONS();
if (result != CUDA_SUCCESS) {
printf("CUDA driver API failed");
exit(EXIT_FAILURE);
}
}
cuGetErrorString(err, &errorStr);
fprintf(stderr,
"checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
Expand Down
1 change: 1 addition & 0 deletions Samples/3_CUDA_Features/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@ add_subdirectory(newdelete)
add_subdirectory(ptxjit)
add_subdirectory(simpleCudaGraphs)
add_subdirectory(tf32TensorCoreGemm)
add_subdirectory(trimNotification)
add_subdirectory(warpAggregatedAtomicsCG)
33 changes: 33 additions & 0 deletions Samples/3_CUDA_Features/trimNotification/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
cmake_minimum_required(VERSION 3.20)

# Make the repository's shared CMake helper modules discoverable.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")

project(trimNotification LANGUAGES C CXX CUDA)

find_package(CUDAToolkit REQUIRED)

set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Target the full range of supported SM architectures (Turing through Blackwell).
set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 110 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()

# Include directories and libraries
include_directories(../../../Common)

# Source file
add_executable(trimNotification trimNotification.cpp)

target_include_directories(trimNotification PRIVATE ${CUDAToolkit_INCLUDE_DIRS})

target_link_libraries(trimNotification PUBLIC CUDA::cudart)

# Allow __device__ lambdas in .cu sources (no-op for the .cpp source here).
target_compile_options(trimNotification PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)

# C++20 is required (std::span, std::erase, std::scoped_lock CTAD).
target_compile_features(trimNotification PRIVATE cxx_std_20 cuda_std_20)

set_target_properties(trimNotification PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
33 changes: 33 additions & 0 deletions Samples/3_CUDA_Features/trimNotification/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# trimNotification - Trim Notification

## Description

This example demonstrates use of cudaDeviceRegisterAsyncNotification to manage memory in CUDA. A GPU running in WDDM mode is required for this sample.

## Key Concepts

Memory Management

## Supported SM Architectures

## Supported OSes

Windows

## Supported CPU Architecture

x86_64

## CUDA APIs involved

### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaSetDevice, cudaDeviceRegisterAsyncNotification, cudaDeviceUnregisterAsyncNotification, cudaMalloc, cudaFree

## Dependencies needed to build/run

## Prerequisites

Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.

## References (for more details)
123 changes: 123 additions & 0 deletions Samples/3_CUDA_Features/trimNotification/trimNotification.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cuda_runtime_api.h>
#include <mutex>
#include <span>
#include <stdio.h>
#include <thread>
#include <vector>

constexpr uint32_t memblockSize = 40 * 1024 * 1024; // Size of each device allocation (40 MiB)
constexpr uint32_t stepTime = 10; // The time to wait between allocation steps in ms
constexpr uint64_t maxSteps = 500; // Total allocation attempts before the sample exits

// This sample uses the cudaDeviceRegisterAsyncNotification API to allocate as much video memory as it can.
// A notification to free some video memory is sent whenever the application goes over its budget. If multiple
// instances of this program are launched, all instances will likely end up with similar budgets and in
// steady-state will have similar amounts of video memory allocated.

int main()
{
// This structure will be our 'userData' to pass into the callback. This will give the
// callback state that it can affect without needing to use globals.
struct MemoryInfo
{
std::vector<int *> memblocks;
std::mutex mutex;
bool allowedToGrow = true;
} memoryInfo;

// The function that we will register as a callback. This could also be a standalone function
// instead of a lambda, but, since it is only referenced here, we can use a lambda to avoid
// using the global namespace.
cudaAsyncCallback basicUserCallback =
[](cudaAsyncNotificationInfo_t *notificationInfo, void *userData, cudaAsyncCallbackHandle_t callback) {
MemoryInfo *memoryInfo = (MemoryInfo *)userData;

// Must check the type before accessing the info member of cudaAsyncNotificationInfo_t.
// Otherwise, we could misinterpret notificationInfo->info if a different type of
// notification is sent.
if (notificationInfo->type == cudaAsyncNotificationTypeOverBudget) {
printf("Asked to free %lld bytes of video memory\n", notificationInfo->info.overBudget.bytesOverBudget);
uint64_t numBlocksToFree = (notificationInfo->info.overBudget.bytesOverBudget / memblockSize) + 1;

// The async notification will be on a separate thread, so shared data should be synchronized.
std::scoped_lock lock(memoryInfo->mutex);
memoryInfo->allowedToGrow = false;

// Free the required number of memblocks
std::span<int *> blocksToFree(memoryInfo->memblocks.end() - numBlocksToFree, numBlocksToFree);

for (auto &block : blocksToFree) {
cudaFree(block);
block = nullptr;
}

// Shrink the vector to remove the freed blocks
std::erase(memoryInfo->memblocks, nullptr);
}
};

// Initialize CUDA
const int cudaDevice = 0;
cudaSetDevice(cudaDevice);

// This callback handle is an opaque object that can be used to unregister the notification via
// cudaDeviceUnregisterAsyncNotification and to identify which callback registration a given
// notification corresponds to.
cudaAsyncCallbackHandle_t callback;
cudaDeviceRegisterAsyncNotification(cudaDevice, basicUserCallback, (void *)&memoryInfo, &callback);

// Attempt to allocate a block of memory, then sleep before repeating.
uint64_t stepCounter = 0;
while (stepCounter < maxSteps) {
{
// The async notification will be on a separate thread, so shared data should be synchronized.
std::scoped_lock lock(memoryInfo.mutex);

if (memoryInfo.allowedToGrow) {
int *newMemblock;
cudaError_t cudaStatus = cudaMalloc((void **)&newMemblock, memblockSize);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
memoryInfo.memblocks.push_back(newMemblock);
}
}

std::this_thread::sleep_for(std::chrono::milliseconds(stepTime));
stepCounter++;
}

printf("Current memory allocated: %lld MB\n", memoryInfo.memblocks.size() * memblockSize / 1024 / 1024);

// Unregister callback handle in application cleanup
cudaDeviceUnregisterAsyncNotification(cudaDevice, callback);

return 0;
}