Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions include/nvtop/extract_gpuinfo_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@

#include "list.h"

// Set by the UI while SM/Tensor activity is actually being plotted, so the NVIDIA backend only samples the
// (shared, perfmon-counter-backed) NVML GPM metrics when they are displayed.
extern bool gpuinfo_collect_compute_activity;

#define STRINGIFY(x) STRINGIFY_HELPER_(x)
#define STRINGIFY_HELPER_(x) #x

Expand Down Expand Up @@ -107,6 +111,10 @@ enum gpuinfo_dynamic_info_valid {
gpuinfo_power_draw_max_valid,
gpuinfo_effective_load_rate_valid,
gpuinfo_multi_instance_mode_valid,
gpuinfo_sm_util_valid,
gpuinfo_tensor_util_valid,
gpuinfo_sm_occupancy_valid,
gpuinfo_dram_bw_util_valid,
gpuinfo_dynamic_info_count,
};

Expand All @@ -133,6 +141,10 @@ struct gpuinfo_dynamic_info {
unsigned int power_draw; // Power usage in milliwatts
unsigned int power_draw_max; // Max power usage in milliwatts
bool multi_instance_mode; // True if the GPU is in multi-instance mode
unsigned int sm_util; // SM-active % (NVML GPM, = DCGM SM_ACTIVE)
unsigned int tensor_util; // Tensor-pipe-active % (NVML GPM, = DCGM PIPE_TENSOR_ACTIVE)
unsigned int sm_occupancy; // SM occupancy % (NVML GPM)
unsigned int dram_bw_util; // DRAM/HBM bandwidth % (NVML GPM)
unsigned char valid[(gpuinfo_dynamic_info_count + CHAR_BIT - 1) / CHAR_BIT];
};

Expand Down
2 changes: 2 additions & 0 deletions include/nvtop/interface_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ enum plot_information {
plot_gpu_clock_rate,
plot_gpu_mem_clock_rate,
plot_effective_load_rate,
plot_sm_util,
plot_tensor_util,
plot_information_count
};

Expand Down
4 changes: 4 additions & 0 deletions src/extract_gpuinfo.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@
const char drm_pdev[] = "drm-pdev";
const char drm_client_id[] = "drm-client-id";

// Off until the UI plots SM/Tensor activity; gates NVML GPM sampling so nvtop only touches the shared
// perfmon counters when those series are actually shown.
bool gpuinfo_collect_compute_activity = false;

struct process_info_cache {
pid_t pid;
char *cmdline;
Expand Down
110 changes: 110 additions & 0 deletions src/extract_gpuinfo_nvidia.c
Original file line number Diff line number Diff line change
Expand Up @@ -269,13 +269,60 @@ nvmlReturn_t (*nvmlDeviceGetProcessUtilization)(nvmlDevice_t device, nvmlProcess
unsigned int *processSamplesCount,
unsigned long long lastSeenTimeStamp);

// NVML GPM (GPU Performance Monitoring) API — SM-active / Tensor-active etc. Lives inside libnvidia-ml,
// dlsym'd like the rest (absent on drivers < R510). Struct layouts, metric ids and the version macros (==1)
// are taken verbatim from nvml.h; metric ids are named, never bare integers.
// Handle to one GPM sample. struct nvmlGpmSample_st is never defined in this file, so a handle can only be
// passed around: to the nvmlGpmSample* entry points and as sample1/sample2 of a metrics query.
typedef struct nvmlGpmSample_st *nvmlGpmSample_t;

// Identifiers of the GPM metrics this backend requests; they are written into nvmlGpmMetric_t.metricId.
// The numeric values have to stay equal to the ones nvml.h assigns to the same names.
// NOTE(review): the nvtop_ prefix on the typedef name presumably avoids a clash with NVML's own
// nvmlGpmMetricId_t -- confirm.
typedef enum {
NVML_GPM_METRIC_SM_UTIL = 2, // stored in gpuinfo_dynamic_info.sm_util
NVML_GPM_METRIC_SM_OCCUPANCY = 3, // stored in gpuinfo_dynamic_info.sm_occupancy
NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5, // stored in gpuinfo_dynamic_info.tensor_util
NVML_GPM_METRIC_DRAM_BW_UTIL = 10, // stored in gpuinfo_dynamic_info.dram_bw_util
} nvtop_nvmlGpmMetricId_t;

// Capacity of nvmlGpmMetricsGet_t.metrics[]; it fixes the size of that struct, so it must agree with nvml.h.
#define NVML_GPM_METRIC_MAX 210
// Value placed in nvmlGpmMetricsGet_t.version when a metrics query is built.
#define NVML_GPM_METRICS_GET_VERSION 1

// Descriptive strings attached to one metric entry (member of nvmlGpmMetric_t). The sampling code in this
// file never reads them; the struct exists so that nvmlGpmMetric_t has the layout declared in nvml.h.
// NOTE(review): presumably filled in by NVML and owned by the library -- confirm before using or freeing.
typedef struct {
char *shortName;
char *longName;
char *unit;
} nvmlGpmMetricMetricInfo_t;

// One entry of a GPM metrics query: the caller picks the metric, nvmlGpmMetricsGet reports status and value.
// Field order and types follow nvml.h and must not be changed.
typedef struct {
unsigned int metricId; // In: which metric to compute (an NVML_GPM_METRIC_* id)
nvmlReturn_t nvmlReturn; // Out: per-metric status; the value is only used when this is NVML_SUCCESS
double value; // Out: metric value; the caller rounds it to an unsigned percentage
nvmlGpmMetricMetricInfo_t metricInfo; // Descriptive strings, unused by this backend
} nvmlGpmMetric_t;

// Argument block for nvmlGpmMetricsGet: metrics are computed between two samples of the same device.
// Field order and types follow nvml.h and must not be changed. With NVML_GPM_METRIC_MAX entries of
// nvmlGpmMetric_t this is a multi-kilobyte object.
typedef struct {
unsigned int version; // Set to NVML_GPM_METRICS_GET_VERSION
unsigned int numMetrics; // Number of leading entries of metrics[] that are requested
nvmlGpmSample_t sample1; // The refresh code passes the older (previous) sample here
nvmlGpmSample_t sample2; // The refresh code passes the newer (current) sample here
nvmlGpmMetric_t metrics[NVML_GPM_METRIC_MAX]; // Requested metrics in, per-metric results out
} nvmlGpmMetricsGet_t;

// GPM entry points, looked up with dlsym() in gpuinfo_nvidia_init among the optional symbols. Any of them
// may be NULL when libnvidia-ml does not export it, so callers test the pointers before calling.
static nvmlReturn_t (*nvmlGpmSampleAlloc)(nvmlGpmSample_t *gpmSample); // Create a sample handle
static nvmlReturn_t (*nvmlGpmSampleFree)(nvmlGpmSample_t gpmSample); // Release a sample handle
static nvmlReturn_t (*nvmlGpmSampleGet)(nvmlDevice_t device, nvmlGpmSample_t gpmSample); // Take a sample
static nvmlReturn_t (*nvmlGpmMetricsGet)(nvmlGpmMetricsGet_t *metricsGet); // Compute metrics from 2 samples

// Per-device state of the NVIDIA backend: the generic gpu_info plus the NVML handle and GPM sampling state.
struct gpu_info_nvidia {
struct gpu_info base;
struct list_head allocate_list; // Link in the file-scope `allocations` list

nvmlDevice_t gpuhandle;
bool isInMigMode;
unsigned long long last_utilization_timestamp;
// GPM (SM/Tensor activity): two samples are kept and swapped after every successful sample, so each
// refresh computes metrics between the previous and the current one.
bool gpm_checked; // Sample allocation has been attempted once; it is not retried afterwards
bool gpm_supported; // Both samples were allocated; cleared when sampling returns NVML_ERROR_NOT_SUPPORTED
bool gpm_primed; // gpm_prev holds a sample from the preceding refresh, so a diff can be computed
nvmlGpmSample_t gpm_prev; // Older sample of the pair; freed at shutdown when non-NULL
nvmlGpmSample_t gpm_cur; // Sample being filled on the current refresh; freed at shutdown when non-NULL
};

static LIST_HEAD(allocations);
Expand Down Expand Up @@ -469,6 +516,10 @@ static bool gpuinfo_nvidia_init(void) {
// These ones might not be available
nvmlDeviceGetProcessUtilization = dlsym(libnvidia_ml_handle, "nvmlDeviceGetProcessUtilization");
nvmlDeviceGetMigMode = dlsym(libnvidia_ml_handle, "nvmlDeviceGetMigMode");
nvmlGpmSampleAlloc = dlsym(libnvidia_ml_handle, "nvmlGpmSampleAlloc");
nvmlGpmSampleFree = dlsym(libnvidia_ml_handle, "nvmlGpmSampleFree");
nvmlGpmSampleGet = dlsym(libnvidia_ml_handle, "nvmlGpmSampleGet");
nvmlGpmMetricsGet = dlsym(libnvidia_ml_handle, "nvmlGpmMetricsGet");

last_nvml_return_status = nvmlInit();
if (last_nvml_return_status != NVML_SUCCESS) {
Expand All @@ -485,6 +536,19 @@ static bool gpuinfo_nvidia_init(void) {
}

static void gpuinfo_nvidia_shutdown(void) {
// Free GPM samples while libnvidia-ml is still loaded (the symbol vanishes after dlclose).
if (nvmlGpmSampleFree) {
struct gpu_info_nvidia *gpm_it;
list_for_each_entry(gpm_it, &allocations, allocate_list) {
if (gpm_it->gpm_prev)
nvmlGpmSampleFree(gpm_it->gpm_prev);
if (gpm_it->gpm_cur)
nvmlGpmSampleFree(gpm_it->gpm_cur);
gpm_it->gpm_prev = NULL;
gpm_it->gpm_cur = NULL;
}
}

if (libnvidia_ml_handle) {
nvmlShutdown();
dlclose(libnvidia_ml_handle);
Expand Down Expand Up @@ -721,6 +785,52 @@ static void gpuinfo_nvidia_refresh_dynamic_info(struct gpu_info *_gpu_info) {
if (last_nvml_return_status == NVML_SUCCESS)
SET_VALID(gpuinfo_pcie_tx_valid, dynamic_info->valid);

// SM / Tensor activity via the GPM API (two-sample diff; NVML returns 0..100 percentages directly), only
// while the UI is plotting them. Capability is decided by the actual nvmlGpmSampleGet return rather than
// nvmlGpmQueryDeviceSupport, which is unreliable (reports unsupported on Blackwell where GPM works).
if (gpuinfo_collect_compute_activity && nvmlGpmSampleAlloc && nvmlGpmSampleGet && nvmlGpmMetricsGet &&
!gpu_info->gpm_checked) {
gpu_info->gpm_checked = true;
if (nvmlGpmSampleAlloc(&gpu_info->gpm_prev) == NVML_SUCCESS &&
nvmlGpmSampleAlloc(&gpu_info->gpm_cur) == NVML_SUCCESS)
gpu_info->gpm_supported = true;
}
if (gpuinfo_collect_compute_activity && gpu_info->gpm_supported) {
nvmlReturn_t gpm_ret = nvmlGpmSampleGet(device, gpu_info->gpm_cur);
if (gpm_ret == NVML_SUCCESS) {
if (gpu_info->gpm_primed) {
nvmlGpmMetricsGet_t mg = {.version = NVML_GPM_METRICS_GET_VERSION,
.numMetrics = 4,
.sample1 = gpu_info->gpm_prev,
.sample2 = gpu_info->gpm_cur};
mg.metrics[0].metricId = NVML_GPM_METRIC_SM_UTIL;
mg.metrics[1].metricId = NVML_GPM_METRIC_ANY_TENSOR_UTIL;
mg.metrics[2].metricId = NVML_GPM_METRIC_SM_OCCUPANCY;
mg.metrics[3].metricId = NVML_GPM_METRIC_DRAM_BW_UTIL;
if (nvmlGpmMetricsGet(&mg) == NVML_SUCCESS) {
if (mg.metrics[0].nvmlReturn == NVML_SUCCESS)
SET_GPUINFO_DYNAMIC(dynamic_info, sm_util, (unsigned)(mg.metrics[0].value + 0.5));
if (mg.metrics[1].nvmlReturn == NVML_SUCCESS)
SET_GPUINFO_DYNAMIC(dynamic_info, tensor_util, (unsigned)(mg.metrics[1].value + 0.5));
if (mg.metrics[2].nvmlReturn == NVML_SUCCESS)
SET_GPUINFO_DYNAMIC(dynamic_info, sm_occupancy, (unsigned)(mg.metrics[2].value + 0.5));
if (mg.metrics[3].nvmlReturn == NVML_SUCCESS)
SET_GPUINFO_DYNAMIC(dynamic_info, dram_bw_util, (unsigned)(mg.metrics[3].value + 0.5));
}
}
nvmlGpmSample_t swap = gpu_info->gpm_prev;
gpu_info->gpm_prev = gpu_info->gpm_cur;
gpu_info->gpm_cur = swap;
gpu_info->gpm_primed = true;
} else {
gpu_info->gpm_primed = false; // re-prime from a fresh pair on the next round
if (gpm_ret == NVML_ERROR_NOT_SUPPORTED)
gpu_info->gpm_supported = false; // GPU genuinely lacks GPM (e.g. Ampere) — stop polling it
}
} else {
gpu_info->gpm_primed = false; // collection disabled: drop the prime so a later enable starts fresh
}

// Fan speed
last_nvml_return_status = nvmlDeviceGetFanSpeed(device, &dynamic_info->fan_speed);
if (last_nvml_return_status == NVML_SUCCESS)
Expand Down
23 changes: 23 additions & 0 deletions src/interface.c
Original file line number Diff line number Diff line change
Expand Up @@ -1736,6 +1736,14 @@ void save_current_data_to_ring(struct list_head *devices, struct nvtop_interface
data_val = device->dynamic_info.effective_load_rate;
}
break;
case plot_sm_util:
if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, sm_util))
data_val = device->dynamic_info.sm_util;
break;
case plot_tensor_util:
if (GPUINFO_DYNAMIC_FIELD_VALID(&device->dynamic_info, tensor_util))
data_val = device->dynamic_info.tensor_util;
break;
case plot_information_count:
break;
}
Expand Down Expand Up @@ -1805,6 +1813,12 @@ static unsigned populate_plot_data_from_ring_buffer(const struct nvtop_interface
case plot_effective_load_rate:
snprintf(plot_legend[in_processing], PLOT_MAX_LEGEND_SIZE, "GPU%u eff. load%%", dev_id);
break;
case plot_sm_util:
snprintf(plot_legend[in_processing], PLOT_MAX_LEGEND_SIZE, "GPU%u SM%%", dev_id);
break;
case plot_tensor_util:
snprintf(plot_legend[in_processing], PLOT_MAX_LEGEND_SIZE, "GPU%u tensor%%", dev_id);
break;
case plot_information_count:
break;
}
Expand Down Expand Up @@ -1848,6 +1862,15 @@ static void draw_plots(struct nvtop_interface *interface) {

void draw_gpu_info_ncurses(unsigned devices_count, struct list_head *devices, struct nvtop_interface *interface) {

// Let the NVIDIA backend sample NVML GPM only when an SM/Tensor plot series is enabled, so an idle nvtop
// never arms the shared perfmon counters. Takes effect on the next refresh, which is fine.
gpuinfo_collect_compute_activity = false;
for (unsigned i = 0; !gpuinfo_collect_compute_activity && i < interface->total_dev_count; ++i) {
plot_info_to_draw td = interface->options.gpu_specific_opts[i].to_draw;
if (plot_isset_draw_info(plot_sm_util, td) || plot_isset_draw_info(plot_tensor_util, td))
gpuinfo_collect_compute_activity = true;
}

draw_devices(devices, interface);
if (!interface->setup_win.visible) {
draw_plots(interface);
Expand Down
3 changes: 2 additions & 1 deletion src/interface_options.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,8 @@ static const char device_monitor[] = "Monitor";
static const char device_shown_value[] = "ShownInfo";
// Configuration-file spellings of the plottable values: one string per plot_information enumerator, in
// enumeration order, followed by the extra "none" entry (hence plot_information_count + 1 elements).
// The table is positional, so a new enumerator needs its string inserted at the matching index.
static const char *device_draw_vals[plot_information_count + 1] = {
"gpuRate", "gpuMemRate", "encodeRate", "decodeRate", "temperature",
"powerDrawRate", "fanSpeed", "gpuClockRate", "gpuMemClockRate", "effectiveLoadRate",
"smUtil", "tensorUtil", "none"};

static int nvtop_option_ini_handler(void *user, const char *section, const char *name, const char *value) {
struct nvtop_option_ini_data *ini_data = (struct nvtop_option_ini_data *)user;
Expand Down
2 changes: 1 addition & 1 deletion src/interface_setup_win.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ static const char *setup_chart_gpu_description = "Displayed GPU";
// Human-readable labels for the plottable values, one per plot_information enumerator in enumeration order.
// The table is positional: keep it in step with the enum whenever an enumerator is added or moved.
static const char *setup_chart_gpu_value_descriptions[plot_information_count] = {
"GPU utilization rate", "GPU memory utilization rate", "GPU encoder rate", "GPU decoder rate",
"GPU temperature", "Power draw rate (current/max)", "Fan speed", "GPU clock rate",
"GPU memory clock rate", "Effective load rate", "SM active rate", "Tensor active rate"};

static const char *chart_color_names[] = {"Red", "Cyan", "Green", "Yellow", "Blue", "Magenta", "White"};
static const unsigned chart_color_names_count = ARRAY_SIZE(chart_color_names);
Expand Down