Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/client/EnvVars.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class EnvVars
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numSubIterations; // Number of subiterations to perform
int numWarmups; // Number of un-timed warmup iterations to perform
int pingpongStride; // Stride in bytes between flag slots for pingpong laps
int showBorders; // Show ASCII box-drawing characters in tables
int showIterations; // Show per-iteration timing info
int useInteractive; // Pause for user-input before starting transfer loop
Expand Down Expand Up @@ -159,6 +160,7 @@ class EnvVars
numIterations = GetEnvVar("NUM_ITERATIONS" , 10);
numSubIterations = GetEnvVar("NUM_SUBITERATIONS" , 1);
numWarmups = GetEnvVar("NUM_WARMUPS" , 3);
pingpongStride = GetEnvVar("PINGPONG_STRIDE" , 8);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , 1);
showBorders = GetEnvVar("SHOW_BORDERS" , 1);
Expand Down Expand Up @@ -348,6 +350,7 @@ class EnvVars
printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n");
printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n");
printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n");
printf(" PINGPONG_STRIDE - Stride in bytes between flag slots for pingpong laps (default 8, must be multiple of 8)\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
#if NIC_EXEC_ENABLED
printf(" ROCE_VERSION - RoCE version (default=2)\n");
Expand Down Expand Up @@ -467,6 +470,8 @@ class EnvVars
"Running %s subiterations", (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)).c_str());
Print("NUM_WARMUPS", numWarmups,
"Running %d warmup iteration(s) per Test", numWarmups);
Print("PINGPONG_STRIDE", pingpongStride,
"Pingpong flag stride %d bytes per lap", pingpongStride);
#if NIC_EXEC_ENABLED
Print("ROCE_VERSION", roceVersion,
"RoCE version is set to %d", roceVersion);
Expand Down Expand Up @@ -621,6 +626,7 @@ class EnvVars
cfg.general.numIterations = numIterations;
cfg.general.numSubIterations = numSubIterations;
cfg.general.numWarmups = numWarmups;
cfg.general.pingpongStride = pingpongStride;
cfg.general.recordPerIteration = showIterations;
cfg.general.useInteractive = useInteractive;

Expand Down
204 changes: 131 additions & 73 deletions src/client/Utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -518,12 +518,16 @@ namespace TransferBench::Utils
size_t numTimedIterations = results.numTimedIterations;
for (auto const& exeInfoPair : results.exeResults) {
ExeResult const& exeResult = exeInfoPair.second;
numRows += 1 + exeResult.transferIdx.size();
int displayCount = 0;
for (int idx : exeResult.transferIdx)
if (transfers[idx].laps >= 0) displayCount++;
numRows += 1 + displayCount;
if (ev.showIterations) {
numRows += (numTimedIterations + 1);
Copy link

Copilot AI Apr 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When `showIterations` is enabled, `numRows` is incremented by `(numTimedIterations + 1)` only once per executor, but the rendering logic appends per-iteration rows for each displayed transfer. Because `TableHelper::Set` silently ignores out-of-bounds writes, this can truncate iteration output whenever an executor has multiple transfers. Update `numRows` to account for `(numTimedIterations + 1)` rows per displayed transfer (regular and pingpong).

Suggested change
numRows += (numTimedIterations + 1);
numRows += displayCount * (numTimedIterations + 1);

Copilot uses AI. Check for mistakes.

// Check that per-iteration information exists
for (int idx : exeResult.transferIdx) {
if (transfers[idx].laps < 0) continue;
TransferResult const& r = results.tfrResults[idx];
if (r.perIterMsec.size() != numTimedIterations) {
Print("[ERROR] Per iteration timing data unavailable: Expected %lu data points, but have %lu\n",
Expand Down Expand Up @@ -569,87 +573,141 @@ namespace TransferBench::Utils
Transfer const& t = transfers[idx];
TransferResult const& r = results.tfrResults[idx];

table.Set(rowIdx, 0, "Transfer %-4d ", idx);
table.Set(rowIdx, 1, "%8.3f GB/s " , r.avgBandwidthGbPerSec);
table.Set(rowIdx, 2, "%8.3f ms " , r.avgDurationMsec);
table.Set(rowIdx, 3, "%12lu bytes " , r.numBytes);

char exeSubIndexStr[32] = "";
if (t.exeSubIndex != -1)
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);

if (isMultiRank) {
table.Set(rowIdx, 4, " %s -> R%d%c%d%s:%d -> %s",
MemDevicesToStr(t.srcs).c_str(),
exeDevice.exeRank, ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
exeSubIndexStr, t.numSubExecs,
MemDevicesToStr(t.dsts).c_str());
if (t.laps < 0) continue;

if (t.laps > 0) {
// Pingpong row: show latency using ping's round-trip delta
double latencyUs = r.avgDurationMsec * 1000.0;
table.Set(rowIdx, 0, "PingPong %-4d ", idx);
table.Set(rowIdx, 1, "%8.3f us " , latencyUs);
table.Set(rowIdx, 2, "%8.3f ms " , r.avgDurationMsec);
table.Set(rowIdx, 3, "%8d laps " , t.laps);

Transfer const& pong = transfers[idx + 1];
if (isMultiRank) {
table.Set(rowIdx, 4, " %s->R%d%c%d->%s <+> %s->R%d%c%d->%s",
MemDevicesToStr(t.srcs).c_str(),
exeDevice.exeRank, ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
MemDevicesToStr(t.dsts).c_str(),
MemDevicesToStr(pong.srcs).c_str(),
pong.exeDevice.exeRank, ExeTypeStr[pong.exeDevice.exeType], pong.exeDevice.exeIndex,
MemDevicesToStr(pong.dsts).c_str());
} else {
table.Set(rowIdx, 4, " %s->%c%d->%s <+> %s->%c%d->%s",
MemDevicesToStr(t.srcs).c_str(),
ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
MemDevicesToStr(t.dsts).c_str(),
MemDevicesToStr(pong.srcs).c_str(),
ExeTypeStr[pong.exeDevice.exeType], pong.exeDevice.exeIndex,
MemDevicesToStr(pong.dsts).c_str());
Comment on lines +586 to +602
Copy link

Copilot AI Apr 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`Transfer const& pong = transfers[idx + 1];` assumes the pong half always exists immediately after the ping index. If a ping transfer is last in the list, this will read out of bounds; if transfer ordering differs (e.g., future parser changes or manual API usage), it will describe the wrong transfer. Add a bounds/consistency check (e.g., `idx + 1 < transfers.size()` and `transfers[idx + 1].laps < 0`) before dereferencing.

Suggested change
Transfer const& pong = transfers[idx + 1];
if (isMultiRank) {
table.Set(rowIdx, 4, " %s->R%d%c%d->%s <+> %s->R%d%c%d->%s",
MemDevicesToStr(t.srcs).c_str(),
exeDevice.exeRank, ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
MemDevicesToStr(t.dsts).c_str(),
MemDevicesToStr(pong.srcs).c_str(),
pong.exeDevice.exeRank, ExeTypeStr[pong.exeDevice.exeType], pong.exeDevice.exeIndex,
MemDevicesToStr(pong.dsts).c_str());
} else {
table.Set(rowIdx, 4, " %s->%c%d->%s <+> %s->%c%d->%s",
MemDevicesToStr(t.srcs).c_str(),
ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
MemDevicesToStr(t.dsts).c_str(),
MemDevicesToStr(pong.srcs).c_str(),
ExeTypeStr[pong.exeDevice.exeType], pong.exeDevice.exeIndex,
MemDevicesToStr(pong.dsts).c_str());
bool const hasPong = (static_cast<size_t>(idx + 1) < transfers.size()) && (transfers[idx + 1].laps < 0);
if (hasPong) {
Transfer const& pong = transfers[idx + 1];
if (isMultiRank) {
table.Set(rowIdx, 4, " %s->R%d%c%d->%s <+> %s->R%d%c%d->%s",
MemDevicesToStr(t.srcs).c_str(),
exeDevice.exeRank, ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
MemDevicesToStr(t.dsts).c_str(),
MemDevicesToStr(pong.srcs).c_str(),
pong.exeDevice.exeRank, ExeTypeStr[pong.exeDevice.exeType], pong.exeDevice.exeIndex,
MemDevicesToStr(pong.dsts).c_str());
} else {
table.Set(rowIdx, 4, " %s->%c%d->%s <+> %s->%c%d->%s",
MemDevicesToStr(t.srcs).c_str(),
ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
MemDevicesToStr(t.dsts).c_str(),
MemDevicesToStr(pong.srcs).c_str(),
ExeTypeStr[pong.exeDevice.exeType], pong.exeDevice.exeIndex,
MemDevicesToStr(pong.dsts).c_str());
}
} else {
if (isMultiRank) {
table.Set(rowIdx, 4, " %s->R%d%c%d->%s",
MemDevicesToStr(t.srcs).c_str(),
exeDevice.exeRank, ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
MemDevicesToStr(t.dsts).c_str());
} else {
table.Set(rowIdx, 4, " %s->%c%d->%s",
MemDevicesToStr(t.srcs).c_str(),
ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
MemDevicesToStr(t.dsts).c_str());
}

Copilot uses AI. Check for mistakes.
}
table.SetCellAlignment(rowIdx, 4, TableHelper::ALIGN_LEFT);
rowIdx++;

if (ev.showIterations) {
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
for (size_t i = 0; i < numTimedIterations; i++) {
times.insert(std::make_pair(r.perIterMsec[i], i+1));
double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
stdDevTime += varTime * varTime;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);

for (auto& time : times) {
double iterUs = time.first * 1000.0;
table.Set(rowIdx, 0, "Iter %03d ", time.second);
table.Set(rowIdx, 1, "%8.3f us ", iterUs);
table.Set(rowIdx, 2, "%8.3f ms ", time.first);
rowIdx++;
}

table.Set(rowIdx, 0, "StandardDev ");
table.Set(rowIdx, 1, "%8.3f us ", stdDevTime * 1000.0);
table.Set(rowIdx, 2, "%8.3f ms ", stdDevTime);
rowIdx++;
table.DrawRowBorder(rowIdx);
}
} else {
table.Set(rowIdx, 4, " %s -> %c%d%s:%d -> %s",
MemDevicesToStr(t.srcs).c_str(),
ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
exeSubIndexStr, t.numSubExecs,
MemDevicesToStr(t.dsts).c_str());
}
table.SetCellAlignment(rowIdx, 4, TableHelper::ALIGN_LEFT);
rowIdx++;

// Show per-iteration timing information
if (ev.showIterations) {

// Compute standard deviation and track iterations by speed
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++) {
times.insert(std::make_pair(r.perIterMsec[i], i+1));
double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
stdDevTime += varTime * varTime;

double iterBandwidthGbs = (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
stdDevBw += varBw * varBw;
// Regular transfer row (laps == 0)
table.Set(rowIdx, 0, "Transfer %-4d ", idx);
table.Set(rowIdx, 1, "%8.3f GB/s " , r.avgBandwidthGbPerSec);
table.Set(rowIdx, 2, "%8.3f ms " , r.avgDurationMsec);
table.Set(rowIdx, 3, "%12lu bytes " , r.numBytes);

char exeSubIndexStr[32] = "";
if (t.exeSubIndex != -1)
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);

if (isMultiRank) {
table.Set(rowIdx, 4, " %s -> R%d%c%d%s:%d -> %s",
MemDevicesToStr(t.srcs).c_str(),
exeDevice.exeRank, ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
exeSubIndexStr, t.numSubExecs,
MemDevicesToStr(t.dsts).c_str());
} else {
table.Set(rowIdx, 4, " %s -> %c%d%s:%d -> %s",
MemDevicesToStr(t.srcs).c_str(),
ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
exeSubIndexStr, t.numSubExecs,
MemDevicesToStr(t.dsts).c_str());
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);

// Loop over iterations (fastest to slowest)
for (auto& time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;

std::set<int> usedXccs;
std::stringstream ss1;
if (exeDevice.exeType == EXE_GPU_GFX) {
if (time.second - 1 < r.perIterCUs.size()) {
ss1 << " CUs: ";
for (auto x : r.perIterCUs[time.second - 1]) {
ss1 << x.first << ":" << std::setfill('0') << std::setw(2) << x.second << " ";
usedXccs.insert(x.first);
table.SetCellAlignment(rowIdx, 4, TableHelper::ALIGN_LEFT);
rowIdx++;

if (ev.showIterations) {
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (size_t i = 0; i < numTimedIterations; i++) {
times.insert(std::make_pair(r.perIterMsec[i], i+1));
double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
stdDevTime += varTime * varTime;

double iterBandwidthGbs = (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);

for (auto& time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;

std::set<int> usedXccs;
std::stringstream ss1;
if (exeDevice.exeType == EXE_GPU_GFX) {
if (time.second - 1 < r.perIterCUs.size()) {
ss1 << " CUs: ";
for (auto x : r.perIterCUs[time.second - 1]) {
ss1 << x.first << ":" << std::setfill('0') << std::setw(2) << x.second << " ";
usedXccs.insert(x.first);
}
}
}
}

std::stringstream ss2;
if (!usedXccs.empty()) {
ss2 << " XCCs:";
for (auto x : usedXccs)
ss2 << " " << x;
std::stringstream ss2;
if (!usedXccs.empty()) {
ss2 << " XCCs:";
for (auto x : usedXccs)
ss2 << " " << x;
}

table.Set(rowIdx, 0, "Iter %03d ", time.second);
table.Set(rowIdx, 1, "%8.3f GB/s ", iterBandwidthGbs);
table.Set(rowIdx, 2, "%8.3f ms ", iterDurationMsec);
table.Set(rowIdx, 3, ss1.str());
table.Set(rowIdx, 4, ss2.str());
rowIdx++;
}

table.Set(rowIdx, 0, "Iter %03d ", time.second);
table.Set(rowIdx, 1, "%8.3f GB/s ", iterBandwidthGbs);
table.Set(rowIdx, 2, "%8.3f ms ", iterDurationMsec);
table.Set(rowIdx, 3, ss1.str());
table.Set(rowIdx, 4, ss2.str());
table.Set(rowIdx, 0, "StandardDev ");
table.Set(rowIdx, 1, "%8.3f GB/s ", stdDevBw);
table.Set(rowIdx, 2, "%8.3f ms ", stdDevTime);
rowIdx++;
table.DrawRowBorder(rowIdx);
}

table.Set(rowIdx, 0, "StandardDev ");
table.Set(rowIdx, 1, "%8.3f GB/s ", stdDevBw);
table.Set(rowIdx, 2, "%8.3f ms ", stdDevTime);
rowIdx++;
table.DrawRowBorder(rowIdx);
}
}
}
Expand Down
Loading