Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 92 additions & 7 deletions TraceLens/Trace2Tree/trace_to_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,16 +695,40 @@ def _preprocess_and_index_events(self) -> None:
# if python_id is not None:
# self.dict_pythonID2UID[python_id] = event[UID]

# Build CPU pid → GPU pid mapping from ac2g flow event pairs.
# In merged multi-rank traces each rank produces overlapping correlation
# IDs, so _get_graph_gpu_events must filter GPU events to the correct
# rank. ac2g "start" events carry the CPU pid; their matching "end"
# events carry the GPU pid — giving us the per-rank CPU↔GPU pid link.
self.cpu_pid_to_gpu_pids = defaultdict(set)
for link_id, start_evt in self.ac2g_event_map["start"].items():
end_evt = self.ac2g_event_map["end"].get(link_id)
if end_evt is not None:
cpu_pid = start_evt.get(PID)
gpu_pid = end_evt.get(PID)
if cpu_pid is not None and gpu_pid is not None:
self.cpu_pid_to_gpu_pids[cpu_pid].add(gpu_pid)

def _nn_module_stack_name_for_event(self, event: Dict[str, Any]) -> str:
    """Return the event's name with any trailing ``_<digits>`` suffix removed.

    NOTE(review): presumably nn.Module stack entries carry an instance
    index suffix (e.g. ``Linear_3``) and stripping it lets events from
    different module instances aggregate under one class name — confirm
    against how callers group by this value.
    """
    # A missing Name key falls back to "" and is returned unchanged.
    name = event.get(TraceLens.util.TraceEventUtils.TraceKeys.Name, "")
    # Only an underscore-plus-digits run anchored at the very end is stripped;
    # interior numeric segments (e.g. "block_2_attn") are left intact.
    return re.sub(r"_\d+$", "", name)

def add_gpu_ops_to_tree(self):
import gc
from collections import deque

UID = TraceLens.util.TraceEventUtils.TraceKeys.UID
Name = TraceLens.util.TraceEventUtils.TraceKeys.Name
events_by_uid = self.events_by_uid
name2event_uids = self.name2event_uids
graph_launch_names = {"cudaGraphLaunch", "hipGraphLaunch"}

# ── Phase B: link each GPU kernel to its immediate runtime parent ──────
# Iterates only runtime_event_uids (pre-filtered in _preprocess_and_index_events)
# rather than all self.events. _get_graph_gpu_events now uses the
# pre-built linking_id_to_gpu_events index (O(1) per graph launch) and
# filters by rank via cpu_pid_to_gpu_pids to avoid cross-rank attribution
# in merged multi-rank traces.
for runtime_uid in self.runtime_event_uids:
runtime_event = events_by_uid[runtime_uid]
if runtime_event["name"] in graph_launch_names:
Expand All @@ -720,12 +744,58 @@ def add_gpu_ops_to_tree(self):
name2event_uids[gpu_evt[Name]].append(gpu_evt_uid)
runtime_event.setdefault("gpu_events", []).append(gpu_evt_uid)

# Walk parent chain to propagate gpu_events
parent_uid = runtime_event.get("parent")
while parent_uid is not None:
parent = events_by_uid[parent_uid]
parent.setdefault("gpu_events", []).append(gpu_evt_uid)
parent_uid = parent.get("parent")
# ── Phase C: single bottom-up propagation of gpu_events ───────────────
# Replace the per-kernel O(depth) ancestor walk with a single BFS
# topological sort followed by a reverse-order list.extend() pass.
# C-level list.extend() is 10-50× faster than individual append() calls
# and avoids the GC pressure of millions of per-kernel allocations.
#
# BFS seeds: self.cpu_root_nodes is already populated by
# build_host_call_stack_tree — no O(N_all) scan needed.
#
# Visited set: in merged multi-rank traces, Phase B can (for any
# remaining cross-rank duplicates) add a GPU event as a child of
# multiple runtime parents. Without a visited set the BFS enqueues
# each such event K times, making traversal O(K² × N_gpu) — a definite
# hang for large merged traces. The visited set reduces this to O(N).
topo_order: list = []
visited: set = set()
q: deque = deque(
events_by_uid[uid] for uid in self.cpu_root_nodes if uid in events_by_uid
)
while q:
ev = q.popleft()
ev_uid = ev[UID]
if ev_uid in visited:
continue
visited.add(ev_uid)
topo_order.append(ev)
for child_uid in ev.get("children", ()):
if child_uid not in visited:
child = events_by_uid.get(child_uid)
if child is not None:
q.append(child)

gc.disable()
try:
for event in reversed(topo_order):
my_gpu = event.get("gpu_events")
if not my_gpu:
continue
parent_uid = event.get("parent")
if parent_uid is None:
continue
parent = events_by_uid.get(parent_uid)
if parent is None:
continue
parent_gpu = parent.get("gpu_events")
if parent_gpu is None:
parent["gpu_events"] = list(my_gpu)
else:
parent_gpu.extend(my_gpu)
finally:
gc.enable()
gc.collect()

def build_tree(self, add_python_func=False, link_fwd_bwd=True) -> None:
print(f"Building tree with add_python_func={add_python_func}")
Expand Down Expand Up @@ -1136,7 +1206,22 @@ def _get_graph_gpu_events(self, graph_launch_evt):
).get(self.linking_key)
if corr is None:
return []
return self.linking_id_to_gpu_events.get(corr, [])
all_gpu_events = self.linking_id_to_gpu_events.get(corr, [])
# In merged multi-rank traces, correlation IDs restart from the same
# range for every rank, so linking_id_to_gpu_events[corr] can contain
# GPU kernels from all K ranks. Use the CPU↔GPU pid mapping derived
# from ac2g flow events to restrict results to the rank that issued
# this graph launch. Fall back to returning all matches when no
# mapping is available (e.g. traces with no regular kernel launches).
cpu_pid = graph_launch_evt.get(TraceLens.util.TraceEventUtils.TraceKeys.PID)
gpu_pids = self.cpu_pid_to_gpu_pids.get(cpu_pid)
if not gpu_pids:
return all_gpu_events
return [
evt
for evt in all_gpu_events
if evt.get(TraceLens.util.TraceEventUtils.TraceKeys.PID) in gpu_pids
]

def _find_corresponding_output_event(self, input_event):
# 1. Get the linking id from the input event
Expand Down
19 changes: 15 additions & 4 deletions TraceLens/TreePerf/tree_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ def compute_perf_metrics(
"dur": kernel["dur"],
"stream": kernel.get("args", {}).get("stream", None),
}
for kernel in list_kernels
for kernel in sorted(list_kernels, key=lambda k: k.get("ts", 0))
]

# Select the appropriate dictionary for FLOPS and memory functions
Expand Down Expand Up @@ -864,15 +864,23 @@ def get_kernel_launchers(self, include_nccl=False):
key=lambda uid: self.tree.get_UID2event(uid).get("ts", 0),
)

events_by_uid = self.tree.events_by_uid
for launcher_uid in sorted_launcher_uids:
kernels = launcher_to_kernels[launcher_uid]
event = self.tree.get_UID2event(launcher_uid)

event["total_direct_kernel_time"] = self.GPUEventAnalyser(
kernels
).compute_metrics()["busy_time"]
event["total_subtree_kernel_time"] = self._compute_subtree_kernel_time_us(
event
# add_gpu_ops_to_tree() propagates every GPU kernel UID up to all
# CPU/runtime ancestors via event["gpu_events"], making subtree
# kernel lookup O(1) instead of a recursive traversal.
subtree_kernel_uids = event.get("gpu_events", [])
subtree_kernels = [events_by_uid[uid] for uid in subtree_kernel_uids]
event["total_subtree_kernel_time"] = (
self.GPUEventAnalyser(subtree_kernels).compute_metrics()["busy_time"]
if subtree_kernels
else 0
)
event["direct_kernel_count"] = len(kernels)
event["kernel_details"] = [
Expand All @@ -881,7 +889,7 @@ def get_kernel_launchers(self, include_nccl=False):
"dur": kernel["dur"],
"stream": kernel.get("args", {}).get("stream", None),
}
for kernel in kernels
for kernel in sorted(kernels, key=lambda k: k.get("ts", 0))
]
event["op category"] = self.op_categorizer(event)
self._compute_overlap_info(event, kernels)
Expand Down Expand Up @@ -1307,6 +1315,9 @@ def _summarize_kernel_stats(series_of_kernel_lists, agg_metrics=["mean"]):
# --- CHANGE: Use the consistent metric name directly ---
kernel_summary[metric_name] = agg_func(dur_arr)

summary_list.sort(
key=lambda k: (k.get("total_duration_us", 0), k.get("name", ""))
)
return summary_list

@staticmethod
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name,param: convNd,param: input_shape,param: filter_shape,param: dtype_input_weight,param: input_stride,param: weight_stride,param: bias,param: stride,param: padding,param: dilation,param: transposed_conv,param: output_padding,param: groups,GFLOPS_first,Data Moved (MB)_first,FLOPS/Byte_first,TB/s_mean,TB/s_median,TB/s_std,TB/s_min,TB/s_max,TFLOPS/s_mean,TFLOPS/s_median,TFLOPS/s_std,TFLOPS/s_min,TFLOPS/s_max,process_name_first,process_label_first,thread_name_first,Compute Spec,kernel_details__summarize_kernel_stats,trunc_kernel_details,Input Dims_first,Input type_first,Input Strides_first,Concrete Inputs_first,Kernel Time (µs)_mean,Kernel Time (µs)_median,Kernel Time (µs)_std,Kernel Time (µs)_min,Kernel Time (µs)_max,Kernel Time (µs)_sum,name_count,UID_first
aten::convolution,conv2d,"(4, 3, 224, 224)","(768, 3, 16, 16)","('c10::BFloat16', 'c10::BFloat16')","(150528, 50176, 224, 1)","(768, 256, 16, 1)",False,"(16, 16)","(0, 0)","(1, 1)",False,"(0, 0)",1,0.924844032,3.421875,257.75342465753425,0.02768590506403035,0.02768590506403035,,0.02768590506403035,0.02768590506403035,7.136136844997193,7.136136844997193,,7.136136844997193,7.136136844997193,python3,CPU,thread 5617 (python3),matrix_bf16,"[{'name': 'Memset (Device)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(2.272), 'mean_duration_us': np.float64(2.272), 'median_duration_us': np.float64(2.272), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(2.272), 'max_duration_us': np.float64(2.272)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16, __nv_bfloat16, float, false, true, (cudnnKernelDataType_t)0>(cudnn::engines_precompiled::nchw2nhwc_params_t<float>, __nv_bfloat16 const*, __nv_bfloat16*)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(10.464), 'mean_duration_us': np.float64(10.464), 'median_duration_us': np.float64(10.464), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(10.464), 'max_duration_us': np.float64(10.464)}, {'name': 'Memset (Device)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(2.112), 'mean_duration_us': np.float64(2.112), 'median_duration_us': np.float64(2.112), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(2.112), 'max_duration_us': np.float64(2.112)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16, __nv_bfloat16, float, false, true, (cudnnKernelDataType_t)0>(cudnn::engines_precompiled::nchw2nhwc_params_t<float>, __nv_bfloat16 const*, __nv_bfloat16*)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(10.4), 'mean_duration_us': np.float64(10.4), 'median_duration_us': np.float64(10.4), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(10.4), 'max_duration_us': 
np.float64(10.4)}, {'name': 'void cutlass__5x_cudnn::Kernel<cutlass_tensorop_bf16_s16816fprop_optimized_bf16_256x64_32x4_nhwc_align8>(cutlass_tensorop_bf16_s16816fprop_optimized_bf16_256x64_32x4_nhwc_align8::Params)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(98.176), 'mean_duration_us': np.float64(98.176), 'median_duration_us': np.float64(98.176), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(98.176), 'max_duration_us': np.float64(98.176)}, {'name': 'void cudnn::engines_precompiled::nhwcToNchwKernel<__nv_bfloat16, __nv_bfloat16, float, true, false, (cudnnKernelDataType_t)0>(cudnn::engines_precompiled::nhwc2nchw_params_t<float>, __nv_bfloat16 const*, __nv_bfloat16*)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(2.208), 'mean_duration_us': np.float64(2.208), 'median_duration_us': np.float64(2.208), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(2.208), 'max_duration_us': np.float64(2.208)}, {'name': 'void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast<at::native::CUDAFunctor_add<c10::BFloat16> >(at::TensorIteratorBase&, at::native::CUDAFunctor_add<c10::BFloat16> const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast<at::native::CUDAFunctor_add<c10::BFloat16> >(at::TensorIteratorBase&, at::native::CUDAFunctor_add<c10::BFloat16> const&)::{lambda(int)#1})', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(3.968), 'mean_duration_us': np.float64(3.968), 'median_duration_us': np.float64(3.968), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(3.968), 'max_duration_us': np.float64(3.968)}]","[{'name': 'Memset (Device)', 'stream': 7, 'mean_duration_us': np.float64(2.27)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16,...', 'stream': 7, 'mean_duration_us': np.float64(10.46)}, {'name': 'Memset (Device)', 'stream': 7, 'mean_duration_us': np.float64(2.11)}, {'name': 'void 
cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16,...', 'stream': 7, 'mean_duration_us': np.float64(10.4)}, {'name': 'void cutlass__5x_cudnn::Kernel<cutlass_tensorop_bf16_s16816fprop...', 'stream': 7, 'mean_duration_us': np.float64(98.18)}, {'name': 'void cudnn::engines_precompiled::nhwcToNchwKernel<__nv_bfloat16,...', 'stream': 7, 'mean_duration_us': np.float64(2.21)}, {'name': 'void at::native::elementwise_kernel<128, 4, at::native::gpu_kern...', 'stream': 7, 'mean_duration_us': np.float64(3.97)}]","[[4, 3, 224, 224], [768, 3, 16, 16], [768], [], [], [], [], [], []]","['c10::BFloat16', 'c10::BFloat16', 'c10::BFloat16', 'ScalarList', 'ScalarList', 'ScalarList', 'Scalar', 'ScalarList', 'Scalar']","[[150528, 50176, 224, 1], [768, 256, 16, 1], [1], [], [], [], [], [], []]","['', '', '', '[16, 16]', '[0, 0]', '[1, 1]', 'False', '[0, 0]', '1']",129.60009765625,129.60009765625,,129.60009765625,129.60009765625,129.60009765625,1,2
aten::convolution,conv2d,"(4, 3, 224, 224)","(768, 3, 16, 16)","('c10::BFloat16', 'c10::BFloat16')","(150528, 50176, 224, 1)","(768, 256, 16, 1)",False,"(16, 16)","(0, 0)","(1, 1)",False,"(0, 0)",1,0.924844032,3.421875,257.75342465753425,0.02768590506403035,0.02768590506403035,,0.02768590506403035,0.02768590506403035,7.136136844997193,7.136136844997193,,7.136136844997193,7.136136844997193,python3,CPU,thread 5617 (python3),matrix_bf16,"[{'name': 'Memset (Device)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(2.112), 'mean_duration_us': np.float64(2.112), 'median_duration_us': np.float64(2.112), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(2.112), 'max_duration_us': np.float64(2.112)}, {'name': 'void cudnn::engines_precompiled::nhwcToNchwKernel<__nv_bfloat16, __nv_bfloat16, float, true, false, (cudnnKernelDataType_t)0>(cudnn::engines_precompiled::nhwc2nchw_params_t<float>, __nv_bfloat16 const*, __nv_bfloat16*)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(2.208), 'mean_duration_us': np.float64(2.208), 'median_duration_us': np.float64(2.208), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(2.208), 'max_duration_us': np.float64(2.208)}, {'name': 'Memset (Device)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(2.272), 'mean_duration_us': np.float64(2.272), 'median_duration_us': np.float64(2.272), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(2.272), 'max_duration_us': np.float64(2.272)}, {'name': 'void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast<at::native::CUDAFunctor_add<c10::BFloat16> >(at::TensorIteratorBase&, at::native::CUDAFunctor_add<c10::BFloat16> const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast<at::native::CUDAFunctor_add<c10::BFloat16> >(at::TensorIteratorBase&, at::native::CUDAFunctor_add<c10::BFloat16> const&)::{lambda(int)#1})', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(3.968), 
'mean_duration_us': np.float64(3.968), 'median_duration_us': np.float64(3.968), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(3.968), 'max_duration_us': np.float64(3.968)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16, __nv_bfloat16, float, false, true, (cudnnKernelDataType_t)0>(cudnn::engines_precompiled::nchw2nhwc_params_t<float>, __nv_bfloat16 const*, __nv_bfloat16*)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(10.4), 'mean_duration_us': np.float64(10.4), 'median_duration_us': np.float64(10.4), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(10.4), 'max_duration_us': np.float64(10.4)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16, __nv_bfloat16, float, false, true, (cudnnKernelDataType_t)0>(cudnn::engines_precompiled::nchw2nhwc_params_t<float>, __nv_bfloat16 const*, __nv_bfloat16*)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(10.464), 'mean_duration_us': np.float64(10.464), 'median_duration_us': np.float64(10.464), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(10.464), 'max_duration_us': np.float64(10.464)}, {'name': 'void cutlass__5x_cudnn::Kernel<cutlass_tensorop_bf16_s16816fprop_optimized_bf16_256x64_32x4_nhwc_align8>(cutlass_tensorop_bf16_s16816fprop_optimized_bf16_256x64_32x4_nhwc_align8::Params)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(98.176), 'mean_duration_us': np.float64(98.176), 'median_duration_us': np.float64(98.176), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(98.176), 'max_duration_us': np.float64(98.176)}]","[{'name': 'Memset (Device)', 'stream': 7, 'mean_duration_us': np.float64(2.11)}, {'name': 'void cudnn::engines_precompiled::nhwcToNchwKernel<__nv_bfloat16,...', 'stream': 7, 'mean_duration_us': np.float64(2.21)}, {'name': 'Memset (Device)', 'stream': 7, 'mean_duration_us': np.float64(2.27)}, {'name': 'void at::native::elementwise_kernel<128, 4, 
at::native::gpu_kern...', 'stream': 7, 'mean_duration_us': np.float64(3.97)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16,...', 'stream': 7, 'mean_duration_us': np.float64(10.4)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16,...', 'stream': 7, 'mean_duration_us': np.float64(10.46)}, {'name': 'void cutlass__5x_cudnn::Kernel<cutlass_tensorop_bf16_s16816fprop...', 'stream': 7, 'mean_duration_us': np.float64(98.18)}]","[[4, 3, 224, 224], [768, 3, 16, 16], [768], [], [], [], [], [], []]","['c10::BFloat16', 'c10::BFloat16', 'c10::BFloat16', 'ScalarList', 'ScalarList', 'ScalarList', 'Scalar', 'ScalarList', 'Scalar']","[[150528, 50176, 224, 1], [768, 256, 16, 1], [1], [], [], [], [], [], []]","['', '', '', '[16, 16]', '[0, 0]', '[1, 1]', 'False', '[0, 0]', '1']",129.60009765625,129.60009765625,,129.60009765625,129.60009765625,129.60009765625,1,2
Loading
Loading