Syllo · rocker-zhang · Jun 22, 2026
diff --git a/include/nvtop/extract_gpuinfo_common.h b/include/nvtop/extract_gpuinfo_common.h
@@ -29,6 +29,10 @@
 
 #include "list.h"
 
+// Set by the UI while SM/Tensor activity is actually being plotted, so the NVIDIA backend only samples the
+// (shared, perfmon-counter-backed) NVML GPM metrics when they are displayed.
+extern bool gpuinfo_collect_compute_activity; // defined in src/extract_gpuinfo.c; recomputed by draw_gpu_info_ncurses()
+
 #define STRINGIFY(x) STRINGIFY_HELPER_(x)
 #define STRINGIFY_HELPER_(x) #x
 
@@ -107,6 +111,10 @@ enum gpuinfo_dynamic_info_valid {
   gpuinfo_power_draw_max_valid,
   gpuinfo_effective_load_rate_valid,
   gpuinfo_multi_instance_mode_valid,
+  gpuinfo_sm_util_valid,
+  gpuinfo_tensor_util_valid,
+  gpuinfo_sm_occupancy_valid,
+  gpuinfo_dram_bw_util_valid,
   gpuinfo_dynamic_info_count,
 };
 
@@ -133,6 +141,10 @@ struct gpuinfo_dynamic_info {
   unsigned int power_draw;          // Power usage in milliwatts
   unsigned int power_draw_max;      // Max power usage in milliwatts
   bool multi_instance_mode;          // True if the GPU is in multi-instance mode
+  unsigned int sm_util;              // SM-active % (NVML GPM, = DCGM SM_ACTIVE)
+  unsigned int tensor_util;          // Tensor-pipe-active % (NVML GPM, = DCGM PIPE_TENSOR_ACTIVE)
+  unsigned int sm_occupancy;         // SM occupancy % (NVML GPM)
+  unsigned int dram_bw_util;         // DRAM/HBM bandwidth % (NVML GPM)
   unsigned char valid[(gpuinfo_dynamic_info_count + CHAR_BIT - 1) / CHAR_BIT];
 };
 

diff --git a/include/nvtop/interface_common.h b/include/nvtop/interface_common.h
@@ -33,6 +33,8 @@ enum plot_information {
   plot_gpu_clock_rate,
   plot_gpu_mem_clock_rate,
   plot_effective_load_rate,
+  plot_sm_util,
+  plot_tensor_util,
   plot_information_count
 };
 

diff --git a/src/extract_gpuinfo.c b/src/extract_gpuinfo.c
@@ -37,6 +37,10 @@
 const char drm_pdev[] = "drm-pdev";
 const char drm_client_id[] = "drm-client-id";
 
+// Off until the UI plots SM/Tensor activity; gates NVML GPM sampling so nvtop only touches the shared
+// perfmon counters when those series are actually shown.
+bool gpuinfo_collect_compute_activity = false; // written by draw_gpu_info_ncurses(), read by the NVIDIA dynamic refresh
+
 struct process_info_cache {
   pid_t pid;
   char *cmdline;

diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c
@@ -269,13 +269,60 @@ nvmlReturn_t (*nvmlDeviceGetProcessUtilization)(nvmlDevice_t device, nvmlProcess
                                                 unsigned int *processSamplesCount,
                                                 unsigned long long lastSeenTimeStamp);
 
+// NVML GPM (GPU Performance Monitoring) API — SM-active / Tensor-active etc. Lives inside libnvidia-ml,
+// dlsym'd like the rest (absent on drivers < R510). Struct layouts, metric ids and the version macros (==1)
+// are taken verbatim from nvml.h; metric ids are named, never bare integers.
+typedef struct nvmlGpmSample_st *nvmlGpmSample_t; // sample handle; the struct is never defined here, only passed around
+
+typedef enum {
+  NVML_GPM_METRIC_SM_UTIL = 2,         // feeds gpuinfo_dynamic_info.sm_util (SM-active %)
+  NVML_GPM_METRIC_SM_OCCUPANCY = 3,    // feeds gpuinfo_dynamic_info.sm_occupancy (SM occupancy %)
+  NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5, // feeds gpuinfo_dynamic_info.tensor_util (tensor-pipe-active %)
+  NVML_GPM_METRIC_DRAM_BW_UTIL = 10,   // feeds gpuinfo_dynamic_info.dram_bw_util (DRAM/HBM bandwidth %)
+} nvtop_nvmlGpmMetricId_t;             // only the four metric ids that the refresh code requests
+
+#define NVML_GPM_METRIC_MAX 210 // capacity of nvmlGpmMetricsGet_t.metrics[]
+#define NVML_GPM_METRICS_GET_VERSION 1 // value the caller stores in nvmlGpmMetricsGet_t.version
+
+typedef struct {
+  char *shortName; // presumably filled in by NVML (TODO confirm); not read anywhere in the visible hunks
+  char *longName;  // see shortName
+  char *unit;      // see shortName
+} nvmlGpmMetricMetricInfo_t; // embedded as nvmlGpmMetric_t.metricInfo
+
+typedef struct {
+  unsigned int metricId;   // set by the caller: which metric to compute (an NVML_GPM_METRIC_* id)
+  nvmlReturn_t nvmlReturn; // per-metric status; the refresh code only uses value when this is NVML_SUCCESS
+  double value;            // metric result; the refresh code rounds it to the nearest unsigned percentage
+  nvmlGpmMetricMetricInfo_t metricInfo; // descriptive strings; unused by the visible code
+} nvmlGpmMetric_t;
+
+typedef struct {
+  unsigned int version;    // caller sets this to NVML_GPM_METRICS_GET_VERSION
+  unsigned int numMetrics; // number of leading metrics[] slots the caller filled in (4 in the refresh code)
+  nvmlGpmSample_t sample1; // first sample of the pair; the refresh code passes gpm_prev
+  nvmlGpmSample_t sample2; // second sample of the pair; the refresh code passes gpm_cur
+  nvmlGpmMetric_t metrics[NVML_GPM_METRIC_MAX]; // request/result slots; the refresh code uses slots 0..3
+} nvmlGpmMetricsGet_t;
+
+static nvmlReturn_t (*nvmlGpmSampleAlloc)(nvmlGpmSample_t *gpmSample); // resolved with dlsym at init; NULL-checked before use
+static nvmlReturn_t (*nvmlGpmSampleFree)(nvmlGpmSample_t gpmSample); // called at shutdown on each non-NULL sample
+static nvmlReturn_t (*nvmlGpmSampleGet)(nvmlDevice_t device, nvmlGpmSample_t gpmSample); // takes a sample for device
+static nvmlReturn_t (*nvmlGpmMetricsGet)(nvmlGpmMetricsGet_t *metricsGet); // computes metrics from sample1/sample2
+
 struct gpu_info_nvidia {
   struct gpu_info base;
   struct list_head allocate_list;
 
   nvmlDevice_t gpuhandle;
   bool isInMigMode;
   unsigned long long last_utilization_timestamp;
+  // GPM (SM/Tensor activity): two ping-ponged samples diffed each refresh
+  bool gpm_checked;         // set once the one-time allocation of both samples has been attempted
+  bool gpm_supported;       // both samples allocated and nvmlGpmSampleGet has not returned NOT_SUPPORTED
+  bool gpm_primed;          // gpm_prev holds a sample from the previous refresh, so a diff can be computed
+  nvmlGpmSample_t gpm_prev; // sample from the last successful refresh; freed at shutdown if non-NULL
+  nvmlGpmSample_t gpm_cur;  // sample filled on this refresh, then swapped with gpm_prev; freed at shutdown
 };
 
 static LIST_HEAD(allocations);
@@ -469,6 +516,10 @@ static bool gpuinfo_nvidia_init(void) {
   // These ones might not be available
   nvmlDeviceGetProcessUtilization = dlsym(libnvidia_ml_handle, "nvmlDeviceGetProcessUtilization");
   nvmlDeviceGetMigMode = dlsym(libnvidia_ml_handle, "nvmlDeviceGetMigMode");
+  nvmlGpmSampleAlloc = dlsym(libnvidia_ml_handle, "nvmlGpmSampleAlloc");
+  nvmlGpmSampleFree = dlsym(libnvidia_ml_handle, "nvmlGpmSampleFree");
+  nvmlGpmSampleGet = dlsym(libnvidia_ml_handle, "nvmlGpmSampleGet");
+  nvmlGpmMetricsGet = dlsym(libnvidia_ml_handle, "nvmlGpmMetricsGet");
 
   last_nvml_return_status = nvmlInit();
   if (last_nvml_return_status != NVML_SUCCESS) {
@@ -485,6 +536,19 @@ static bool gpuinfo_nvidia_init(void) {
 }
 
 static void gpuinfo_nvidia_shutdown(void) {
+  // Free GPM samples while libnvidia-ml is still loaded (the symbol vanishes after dlclose).
+  if (nvmlGpmSampleFree) {
+    struct gpu_info_nvidia *gpm_it;
+    list_for_each_entry(gpm_it, &allocations, allocate_list) {
+      if (gpm_it->gpm_prev)
+        nvmlGpmSampleFree(gpm_it->gpm_prev);
+      if (gpm_it->gpm_cur)
+        nvmlGpmSampleFree(gpm_it->gpm_cur);
+      gpm_it->gpm_prev = NULL;
+      gpm_it->gpm_cur = NULL;
+    }
+  }
+
   if (libnvidia_ml_handle) {
     nvmlShutdown();
     dlclose(libnvidia_ml_handle);
@@ -721,6 +785,52 @@ static void gpuinfo_nvidia_refresh_dynamic_info(struct gpu_info *_gpu_info) {
   if (last_nvml_return_status == NVML_SUCCESS)
     SET_VALID(gpuinfo_pcie_tx_valid, dynamic_info->valid);
 
+  // SM / Tensor activity via the GPM API (two-sample diff; NVML returns 0..100 percentages directly), only
+  // while the UI is plotting them. Capability is decided by the actual nvmlGpmSampleGet return rather than
+  // nvmlGpmQueryDeviceSupport, which is unreliable (reports unsupported on Blackwell where GPM works).
+  if (gpuinfo_collect_compute_activity && nvmlGpmSampleAlloc && nvmlGpmSampleGet && nvmlGpmMetricsGet &&
+      !gpu_info->gpm_checked) {
+    gpu_info->gpm_checked = true;
+    if (nvmlGpmSampleAlloc(&gpu_info->gpm_prev) == NVML_SUCCESS &&
+        nvmlGpmSampleAlloc(&gpu_info->gpm_cur) == NVML_SUCCESS)
+      gpu_info->gpm_supported = true;
+  }
+  if (gpuinfo_collect_compute_activity && gpu_info->gpm_supported) {
+    nvmlReturn_t gpm_ret = nvmlGpmSampleGet(device, gpu_info->gpm_cur);
+    if (gpm_ret == NVML_SUCCESS) {
+      if (gpu_info->gpm_primed) {
+        nvmlGpmMetricsGet_t mg = {.version = NVML_GPM_METRICS_GET_VERSION,
+                                  .numMetrics = 4,
+                                  .sample1 = gpu_info->gpm_prev,
+                                  .sample2 = gpu_info->gpm_cur};
+        mg.metrics[0].metricId = NVML_GPM_METRIC_SM_UTIL;
+        mg.metrics[1].metricId = NVML_GPM_METRIC_ANY_TENSOR_UTIL;
+        mg.metrics[2].metricId = NVML_GPM_METRIC_SM_OCCUPANCY;
+        mg.metrics[3].metricId = NVML_GPM_METRIC_DRAM_BW_UTIL;
+        if (nvmlGpmMetricsGet(&mg) == NVML_SUCCESS) {
+          if (mg.metrics[0].nvmlReturn == NVML_SUCCESS)
+            SET_GPUINFO_DYNAMIC(dynamic_info, sm_util, (unsigned)(mg.metrics[0].value + 0.5));
+          if (mg.metrics[1].nvmlReturn == NVML_SUCCESS)
+            SET_GPUINFO_DYNAMIC(dynamic_info, tensor_util, (unsigned)(mg.metrics[1].value + 0.5));
+          if (mg.metrics[2].nvmlReturn == NVML_SUCCESS)
+            SET_GPUINFO_DYNAMIC(dynamic_info, sm_occupancy, (unsigned)(mg.metrics[2].value + 0.5));
+          if (mg.metrics[3].nvmlReturn == NVML_SUCCESS)
+            SET_GPUINFO_DYNAMIC(dynamic_info, dram_bw_util, (unsigned)(mg.metrics[3].value + 0.5));
+        }
+      }
+      nvmlGpmSample_t swap = gpu_info->gpm_prev;
+      gpu_info->gpm_prev = gpu_info->gpm_cur;
+      gpu_info->gpm_cur = swap;
+      gpu_info->gpm_primed = true;
+    } else {
+      gpu_info->gpm_primed = false; // re-prime from a fresh pair on the next round
+      if (gpm_ret == NVML_ERROR_NOT_SUPPORTED)
+        gpu_info->gpm_supported = false; // GPU genuinely lacks GPM (e.g. Ampere) — stop polling it
+    }
+  } else {
+    gpu_info->gpm_primed = false; // collection disabled: drop the prime so a later enable starts fresh
+  }
+
   // Fan speed
   last_nvml_return_status = nvmlDeviceGetFanSpeed(device, &dynamic_info->fan_speed);
   if (last_nvml_return_status == NVML_SUCCESS)

diff --git a/src/interface.c b/src/interface.c
@@ -1736,6 +1736,14 @@ void save_current_data_to_ring(struct list_head *devices, struct nvtop_interface
             data_val = device->dynamic_info.effective_load_rate;
           }
           break;
+        case plot_sm_util:
+          if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, sm_util))
+            data_val = device->dynamic_info.sm_util;
+          break;
+        case plot_tensor_util:
+          if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, tensor_util))
+            data_val = device->dynamic_info.tensor_util;
+          break;
         case plot_information_count:
           break;
         }
@@ -1805,6 +1813,12 @@ static unsigned populate_plot_data_from_ring_buffer(const struct nvtop_interface
         case plot_effective_load_rate:
           snprintf(plot_legend[in_processing], PLOT_MAX_LEGEND_SIZE, "GPU%u eff. load%%", dev_id);
           break;
+        case plot_sm_util:
+          snprintf(plot_legend[in_processing], PLOT_MAX_LEGEND_SIZE, "GPU%u SM%%", dev_id);
+          break;
+        case plot_tensor_util:
+          snprintf(plot_legend[in_processing], PLOT_MAX_LEGEND_SIZE, "GPU%u tensor%%", dev_id);
+          break;
         case plot_information_count:
           break;
         }
@@ -1848,6 +1862,15 @@ static void draw_plots(struct nvtop_interface *interface) {
 
 void draw_gpu_info_ncurses(unsigned devices_count, struct list_head *devices, struct nvtop_interface *interface) {
 
+  // Let the NVIDIA backend sample NVML GPM only when an SM/Tensor plot series is enabled, so an idle nvtop
+  // never arms the shared perfmon counters. Takes effect on the next refresh, which is fine.
+  gpuinfo_collect_compute_activity = false;
+  for (unsigned i = 0; !gpuinfo_collect_compute_activity && i < interface->total_dev_count; ++i) {
+    plot_info_to_draw td = interface->options.gpu_specific_opts[i].to_draw;
+    if (plot_isset_draw_info(plot_sm_util, td) || plot_isset_draw_info(plot_tensor_util, td))
+      gpuinfo_collect_compute_activity = true;
+  }
+
   draw_devices(devices, interface);
   if (!interface->setup_win.visible) {
     draw_plots(interface);

diff --git a/src/interface_options.c b/src/interface_options.c
@@ -202,7 +202,8 @@ static const char device_monitor[] = "Monitor";
 static const char device_shown_value[] = "ShownInfo";
 static const char *device_draw_vals[plot_information_count + 1] = {
     "gpuRate",       "gpuMemRate", "encodeRate",   "decodeRate",      "temperature",
-    "powerDrawRate", "fanSpeed",   "gpuClockRate", "gpuMemClockRate", "effectiveLoadRate", "none"};
+    "powerDrawRate", "fanSpeed",   "gpuClockRate", "gpuMemClockRate", "effectiveLoadRate",
+    "smUtil",        "tensorUtil", "none"}; // presumably indexed by enum plot_information, "none" last — keep in sync
 
 static int nvtop_option_ini_handler(void *user, const char *section, const char *name, const char *value) {
   struct nvtop_option_ini_data *ini_data = (struct nvtop_option_ini_data *)user;

diff --git a/src/interface_setup_win.c b/src/interface_setup_win.c
@@ -81,7 +81,7 @@ static const char *setup_chart_gpu_description      = "Displayed GPU";
 static const char *setup_chart_gpu_value_descriptions[plot_information_count] = {
     "GPU utilization rate", "GPU memory utilization rate",   "GPU encoder rate", "GPU decoder rate",
     "GPU temperature",      "Power draw rate (current/max)", "Fan speed",        "GPU clock rate",
-    "GPU memory clock rate", "Effective load rate"};
+    "GPU memory clock rate", "Effective load rate", "SM active rate", "Tensor active rate"}; // presumably indexed by enum plot_information — keep order in sync
 
 static const char *chart_color_names[] = {"Red", "Cyan", "Green", "Yellow", "Blue", "Magenta", "White"};
 static const unsigned chart_color_names_count = ARRAY_SIZE(chart_color_names);