Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 92 additions & 7 deletions TraceLens/Trace2Tree/trace_to_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,16 +695,40 @@ def _preprocess_and_index_events(self) -> None:
# if python_id is not None:
# self.dict_pythonID2UID[python_id] = event[UID]

# Build CPU pid → GPU pid mapping from ac2g flow event pairs.
# In merged multi-rank traces each rank produces overlapping correlation
# IDs, so _get_graph_gpu_events must filter GPU events to the correct
# rank. ac2g "start" events carry the CPU pid; their matching "end"
# events carry the GPU pid — giving us the per-rank CPU↔GPU pid link.
self.cpu_pid_to_gpu_pids = defaultdict(set)
for link_id, start_evt in self.ac2g_event_map["start"].items():
end_evt = self.ac2g_event_map["end"].get(link_id)
if end_evt is not None:
cpu_pid = start_evt.get(PID)
gpu_pid = end_evt.get(PID)
if cpu_pid is not None and gpu_pid is not None:
self.cpu_pid_to_gpu_pids[cpu_pid].add(gpu_pid)

def _nn_module_stack_name_for_event(self, event: Dict[str, Any]) -> str:
    """Return the event's name with any trailing ``_<digits>`` suffix removed.

    NOTE(review): presumably nn.Module stack entries carry an instance
    index suffix (e.g. ``Linear_3``) and stripping it lets events from
    different module instances aggregate under one class name — confirm
    against how callers group by this value.
    """
    # A missing Name key falls back to "" and is returned unchanged.
    name = event.get(TraceLens.util.TraceEventUtils.TraceKeys.Name, "")
    # Only an underscore-plus-digits run anchored at the very end is stripped;
    # interior numeric segments (e.g. "block_2_attn") are left intact.
    return re.sub(r"_\d+$", "", name)

def add_gpu_ops_to_tree(self):
import gc
from collections import deque

UID = TraceLens.util.TraceEventUtils.TraceKeys.UID
Name = TraceLens.util.TraceEventUtils.TraceKeys.Name
events_by_uid = self.events_by_uid
name2event_uids = self.name2event_uids
graph_launch_names = {"cudaGraphLaunch", "hipGraphLaunch"}

# ── Phase B: link each GPU kernel to its immediate runtime parent ──────
# Iterates only runtime_event_uids (pre-filtered in _preprocess_and_index_events)
# rather than all self.events. _get_graph_gpu_events now uses the
# pre-built linking_id_to_gpu_events index (O(1) per graph launch) and
# filters by rank via cpu_pid_to_gpu_pids to avoid cross-rank attribution
# in merged multi-rank traces.
for runtime_uid in self.runtime_event_uids:
runtime_event = events_by_uid[runtime_uid]
if runtime_event["name"] in graph_launch_names:
Expand All @@ -720,12 +744,58 @@ def add_gpu_ops_to_tree(self):
name2event_uids[gpu_evt[Name]].append(gpu_evt_uid)
runtime_event.setdefault("gpu_events", []).append(gpu_evt_uid)

# Walk parent chain to propagate gpu_events
parent_uid = runtime_event.get("parent")
while parent_uid is not None:
parent = events_by_uid[parent_uid]
parent.setdefault("gpu_events", []).append(gpu_evt_uid)
parent_uid = parent.get("parent")
# ── Phase C: single bottom-up propagation of gpu_events ───────────────
# Replace the per-kernel O(depth) ancestor walk with a single BFS
# topological sort followed by a reverse-order list.extend() pass.
# C-level list.extend() is 10-50× faster than individual append() calls
# and avoids the GC pressure of millions of per-kernel allocations.
#
# BFS seeds: self.cpu_root_nodes is already populated by
# build_host_call_stack_tree — no O(N_all) scan needed.
#
# Visited set: in merged multi-rank traces, Phase B can (for any
# remaining cross-rank duplicates) add a GPU event as a child of
# multiple runtime parents. Without a visited set the BFS enqueues
# each such event K times, making traversal O(K² × N_gpu) — a definite
# hang for large merged traces. The visited set reduces this to O(N).
topo_order: list = []
visited: set = set()
q: deque = deque(
events_by_uid[uid] for uid in self.cpu_root_nodes if uid in events_by_uid
)
while q:
ev = q.popleft()
ev_uid = ev[UID]
if ev_uid in visited:
continue
visited.add(ev_uid)
topo_order.append(ev)
for child_uid in ev.get("children", ()):
if child_uid not in visited:
child = events_by_uid.get(child_uid)
if child is not None:
q.append(child)

gc.disable()
try:
for event in reversed(topo_order):
my_gpu = event.get("gpu_events")
if not my_gpu:
continue
parent_uid = event.get("parent")
if parent_uid is None:
continue
parent = events_by_uid.get(parent_uid)
if parent is None:
continue
parent_gpu = parent.get("gpu_events")
if parent_gpu is None:
parent["gpu_events"] = list(my_gpu)
else:
parent_gpu.extend(my_gpu)
finally:
gc.enable()
gc.collect()

def build_tree(self, add_python_func=False, link_fwd_bwd=True) -> None:
print(f"Building tree with add_python_func={add_python_func}")
Expand Down Expand Up @@ -1136,7 +1206,22 @@ def _get_graph_gpu_events(self, graph_launch_evt):
).get(self.linking_key)
if corr is None:
return []
return self.linking_id_to_gpu_events.get(corr, [])
all_gpu_events = self.linking_id_to_gpu_events.get(corr, [])
# In merged multi-rank traces, correlation IDs restart from the same
# range for every rank, so linking_id_to_gpu_events[corr] can contain
# GPU kernels from all K ranks. Use the CPU↔GPU pid mapping derived
# from ac2g flow events to restrict results to the rank that issued
# this graph launch. Fall back to returning all matches when no
# mapping is available (e.g. traces with no regular kernel launches).
cpu_pid = graph_launch_evt.get(TraceLens.util.TraceEventUtils.TraceKeys.PID)
gpu_pids = self.cpu_pid_to_gpu_pids.get(cpu_pid)
if not gpu_pids:
return all_gpu_events
return [
evt
for evt in all_gpu_events
if evt.get(TraceLens.util.TraceEventUtils.TraceKeys.PID) in gpu_pids
]

def _find_corresponding_output_event(self, input_event):
# 1. Get the linking id from the input event
Expand Down
19 changes: 15 additions & 4 deletions TraceLens/TreePerf/tree_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ def compute_perf_metrics(
"dur": kernel["dur"],
"stream": kernel.get("args", {}).get("stream", None),
}
for kernel in list_kernels
for kernel in sorted(list_kernels, key=lambda k: k.get("ts", 0))
]

# Select the appropriate dictionary for FLOPS and memory functions
Expand Down Expand Up @@ -864,15 +864,23 @@ def get_kernel_launchers(self, include_nccl=False):
key=lambda uid: self.tree.get_UID2event(uid).get("ts", 0),
)

events_by_uid = self.tree.events_by_uid
for launcher_uid in sorted_launcher_uids:
kernels = launcher_to_kernels[launcher_uid]
event = self.tree.get_UID2event(launcher_uid)

event["total_direct_kernel_time"] = self.GPUEventAnalyser(
kernels
).compute_metrics()["busy_time"]
event["total_subtree_kernel_time"] = self._compute_subtree_kernel_time_us(
event
# add_gpu_ops_to_tree() propagates every GPU kernel UID up to all
# CPU/runtime ancestors via event["gpu_events"], making subtree
# kernel lookup O(1) instead of a recursive traversal.
subtree_kernel_uids = event.get("gpu_events", [])
subtree_kernels = [events_by_uid[uid] for uid in subtree_kernel_uids]
event["total_subtree_kernel_time"] = (
self.GPUEventAnalyser(subtree_kernels).compute_metrics()["busy_time"]
if subtree_kernels
else 0
)
event["direct_kernel_count"] = len(kernels)
event["kernel_details"] = [
Expand All @@ -881,7 +889,7 @@ def get_kernel_launchers(self, include_nccl=False):
"dur": kernel["dur"],
"stream": kernel.get("args", {}).get("stream", None),
}
for kernel in kernels
for kernel in sorted(kernels, key=lambda k: k.get("ts", 0))
]
event["op category"] = self.op_categorizer(event)
self._compute_overlap_info(event, kernels)
Expand Down Expand Up @@ -1307,6 +1315,9 @@ def _summarize_kernel_stats(series_of_kernel_lists, agg_metrics=["mean"]):
# --- CHANGE: Use the consistent metric name directly ---
kernel_summary[metric_name] = agg_func(dur_arr)

summary_list.sort(
key=lambda k: (k.get("total_duration_us", 0), k.get("name", ""))
)
return summary_list

@staticmethod
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name,param: convNd,param: input_shape,param: filter_shape,param: dtype_input_weight,param: input_stride,param: weight_stride,param: bias,param: stride,param: padding,param: dilation,param: transposed_conv,param: output_padding,param: groups,GFLOPS_first,Data Moved (MB)_first,FLOPS/Byte_first,TB/s_mean,TB/s_median,TB/s_std,TB/s_min,TB/s_max,TFLOPS/s_mean,TFLOPS/s_median,TFLOPS/s_std,TFLOPS/s_min,TFLOPS/s_max,process_name_first,process_label_first,thread_name_first,Compute Spec,kernel_details__summarize_kernel_stats,trunc_kernel_details,Input Dims_first,Input type_first,Input Strides_first,Concrete Inputs_first,Kernel Time (µs)_mean,Kernel Time (µs)_median,Kernel Time (µs)_std,Kernel Time (µs)_min,Kernel Time (µs)_max,Kernel Time (µs)_sum,name_count,UID_first
aten::convolution,conv2d,"(4, 3, 224, 224)","(768, 3, 16, 16)","('c10::BFloat16', 'c10::BFloat16')","(150528, 50176, 224, 1)","(768, 256, 16, 1)",False,"(16, 16)","(0, 0)","(1, 1)",False,"(0, 0)",1,0.924844032,3.421875,257.75342465753425,0.02768590506403035,0.02768590506403035,,0.02768590506403035,0.02768590506403035,7.136136844997193,7.136136844997193,,7.136136844997193,7.136136844997193,python3,CPU,thread 5617 (python3),matrix_bf16,"[{'name': 'Memset (Device)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(2.272), 'mean_duration_us': np.float64(2.272), 'median_duration_us': np.float64(2.272), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(2.272), 'max_duration_us': np.float64(2.272)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16, __nv_bfloat16, float, false, true, (cudnnKernelDataType_t)0>(cudnn::engines_precompiled::nchw2nhwc_params_t<float>, __nv_bfloat16 const*, __nv_bfloat16*)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(10.464), 'mean_duration_us': np.float64(10.464), 'median_duration_us': np.float64(10.464), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(10.464), 'max_duration_us': np.float64(10.464)}, {'name': 'Memset (Device)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(2.112), 'mean_duration_us': np.float64(2.112), 'median_duration_us': np.float64(2.112), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(2.112), 'max_duration_us': np.float64(2.112)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16, __nv_bfloat16, float, false, true, (cudnnKernelDataType_t)0>(cudnn::engines_precompiled::nchw2nhwc_params_t<float>, __nv_bfloat16 const*, __nv_bfloat16*)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(10.4), 'mean_duration_us': np.float64(10.4), 'median_duration_us': np.float64(10.4), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(10.4), 'max_duration_us': 
np.float64(10.4)}, {'name': 'void cutlass__5x_cudnn::Kernel<cutlass_tensorop_bf16_s16816fprop_optimized_bf16_256x64_32x4_nhwc_align8>(cutlass_tensorop_bf16_s16816fprop_optimized_bf16_256x64_32x4_nhwc_align8::Params)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(98.176), 'mean_duration_us': np.float64(98.176), 'median_duration_us': np.float64(98.176), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(98.176), 'max_duration_us': np.float64(98.176)}, {'name': 'void cudnn::engines_precompiled::nhwcToNchwKernel<__nv_bfloat16, __nv_bfloat16, float, true, false, (cudnnKernelDataType_t)0>(cudnn::engines_precompiled::nhwc2nchw_params_t<float>, __nv_bfloat16 const*, __nv_bfloat16*)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(2.208), 'mean_duration_us': np.float64(2.208), 'median_duration_us': np.float64(2.208), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(2.208), 'max_duration_us': np.float64(2.208)}, {'name': 'void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast<at::native::CUDAFunctor_add<c10::BFloat16> >(at::TensorIteratorBase&, at::native::CUDAFunctor_add<c10::BFloat16> const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast<at::native::CUDAFunctor_add<c10::BFloat16> >(at::TensorIteratorBase&, at::native::CUDAFunctor_add<c10::BFloat16> const&)::{lambda(int)#1})', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(3.968), 'mean_duration_us': np.float64(3.968), 'median_duration_us': np.float64(3.968), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(3.968), 'max_duration_us': np.float64(3.968)}]","[{'name': 'Memset (Device)', 'stream': 7, 'mean_duration_us': np.float64(2.27)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16,...', 'stream': 7, 'mean_duration_us': np.float64(10.46)}, {'name': 'Memset (Device)', 'stream': 7, 'mean_duration_us': np.float64(2.11)}, {'name': 'void 
cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16,...', 'stream': 7, 'mean_duration_us': np.float64(10.4)}, {'name': 'void cutlass__5x_cudnn::Kernel<cutlass_tensorop_bf16_s16816fprop...', 'stream': 7, 'mean_duration_us': np.float64(98.18)}, {'name': 'void cudnn::engines_precompiled::nhwcToNchwKernel<__nv_bfloat16,...', 'stream': 7, 'mean_duration_us': np.float64(2.21)}, {'name': 'void at::native::elementwise_kernel<128, 4, at::native::gpu_kern...', 'stream': 7, 'mean_duration_us': np.float64(3.97)}]","[[4, 3, 224, 224], [768, 3, 16, 16], [768], [], [], [], [], [], []]","['c10::BFloat16', 'c10::BFloat16', 'c10::BFloat16', 'ScalarList', 'ScalarList', 'ScalarList', 'Scalar', 'ScalarList', 'Scalar']","[[150528, 50176, 224, 1], [768, 256, 16, 1], [1], [], [], [], [], [], []]","['', '', '', '[16, 16]', '[0, 0]', '[1, 1]', 'False', '[0, 0]', '1']",129.60009765625,129.60009765625,,129.60009765625,129.60009765625,129.60009765625,1,2
aten::convolution,conv2d,"(4, 3, 224, 224)","(768, 3, 16, 16)","('c10::BFloat16', 'c10::BFloat16')","(150528, 50176, 224, 1)","(768, 256, 16, 1)",False,"(16, 16)","(0, 0)","(1, 1)",False,"(0, 0)",1,0.924844032,3.421875,257.75342465753425,0.02768590506403035,0.02768590506403035,,0.02768590506403035,0.02768590506403035,7.136136844997193,7.136136844997193,,7.136136844997193,7.136136844997193,python3,CPU,thread 5617 (python3),matrix_bf16,"[{'name': 'Memset (Device)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(2.112), 'mean_duration_us': np.float64(2.112), 'median_duration_us': np.float64(2.112), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(2.112), 'max_duration_us': np.float64(2.112)}, {'name': 'void cudnn::engines_precompiled::nhwcToNchwKernel<__nv_bfloat16, __nv_bfloat16, float, true, false, (cudnnKernelDataType_t)0>(cudnn::engines_precompiled::nhwc2nchw_params_t<float>, __nv_bfloat16 const*, __nv_bfloat16*)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(2.208), 'mean_duration_us': np.float64(2.208), 'median_duration_us': np.float64(2.208), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(2.208), 'max_duration_us': np.float64(2.208)}, {'name': 'Memset (Device)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(2.272), 'mean_duration_us': np.float64(2.272), 'median_duration_us': np.float64(2.272), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(2.272), 'max_duration_us': np.float64(2.272)}, {'name': 'void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast<at::native::CUDAFunctor_add<c10::BFloat16> >(at::TensorIteratorBase&, at::native::CUDAFunctor_add<c10::BFloat16> const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast<at::native::CUDAFunctor_add<c10::BFloat16> >(at::TensorIteratorBase&, at::native::CUDAFunctor_add<c10::BFloat16> const&)::{lambda(int)#1})', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(3.968), 
'mean_duration_us': np.float64(3.968), 'median_duration_us': np.float64(3.968), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(3.968), 'max_duration_us': np.float64(3.968)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16, __nv_bfloat16, float, false, true, (cudnnKernelDataType_t)0>(cudnn::engines_precompiled::nchw2nhwc_params_t<float>, __nv_bfloat16 const*, __nv_bfloat16*)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(10.4), 'mean_duration_us': np.float64(10.4), 'median_duration_us': np.float64(10.4), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(10.4), 'max_duration_us': np.float64(10.4)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16, __nv_bfloat16, float, false, true, (cudnnKernelDataType_t)0>(cudnn::engines_precompiled::nchw2nhwc_params_t<float>, __nv_bfloat16 const*, __nv_bfloat16*)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(10.464), 'mean_duration_us': np.float64(10.464), 'median_duration_us': np.float64(10.464), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(10.464), 'max_duration_us': np.float64(10.464)}, {'name': 'void cutlass__5x_cudnn::Kernel<cutlass_tensorop_bf16_s16816fprop_optimized_bf16_256x64_32x4_nhwc_align8>(cutlass_tensorop_bf16_s16816fprop_optimized_bf16_256x64_32x4_nhwc_align8::Params)', 'stream': 7, 'count': 1, 'total_duration_us': np.float64(98.176), 'mean_duration_us': np.float64(98.176), 'median_duration_us': np.float64(98.176), 'std_dev_duration_us': np.float64(0.0), 'min_duration_us': np.float64(98.176), 'max_duration_us': np.float64(98.176)}]","[{'name': 'Memset (Device)', 'stream': 7, 'mean_duration_us': np.float64(2.11)}, {'name': 'void cudnn::engines_precompiled::nhwcToNchwKernel<__nv_bfloat16,...', 'stream': 7, 'mean_duration_us': np.float64(2.21)}, {'name': 'Memset (Device)', 'stream': 7, 'mean_duration_us': np.float64(2.27)}, {'name': 'void at::native::elementwise_kernel<128, 4, 
at::native::gpu_kern...', 'stream': 7, 'mean_duration_us': np.float64(3.97)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16,...', 'stream': 7, 'mean_duration_us': np.float64(10.4)}, {'name': 'void cudnn::engines_precompiled::nchwToNhwcKernel<__nv_bfloat16,...', 'stream': 7, 'mean_duration_us': np.float64(10.46)}, {'name': 'void cutlass__5x_cudnn::Kernel<cutlass_tensorop_bf16_s16816fprop...', 'stream': 7, 'mean_duration_us': np.float64(98.18)}]","[[4, 3, 224, 224], [768, 3, 16, 16], [768], [], [], [], [], [], []]","['c10::BFloat16', 'c10::BFloat16', 'c10::BFloat16', 'ScalarList', 'ScalarList', 'ScalarList', 'Scalar', 'ScalarList', 'Scalar']","[[150528, 50176, 224, 1], [768, 256, 16, 1], [1], [], [], [], [], [], []]","['', '', '', '[16, 16]', '[0, 0]', '[1, 1]', 'False', '[0, 0]', '1']",129.60009765625,129.60009765625,,129.60009765625,129.60009765625,129.60009765625,1,2
Loading
Loading