Commits (45)
986793a  1119 (roll-away, Nov 19, 2025)
282a32d  1120 (roll-away, Nov 20, 2025)
0187ae0  1120.2 (roll-away, Nov 20, 2025)
5d46f55  model_path (roll-away, Nov 20, 2025)
39b4139  remove unnecessary files and pre-committed (roll-away, Nov 20, 2025)
b775e46  remove unnecessary files and pre-committed (roll-away, Nov 21, 2025)
44ad76f  1121 remove unnecessary files (roll-away, Nov 21, 2025)
0fc84c4  modify rev version (roll-away, Nov 21, 2025)
19dc60b  modify rev version (roll-away, Nov 21, 2025)
d6eda81  modify rev version (roll-away, Nov 21, 2025)
956ad33  accuracy issues targeted (roll-away, Nov 21, 2025)
8c8070b  test script and modify feature (roll-away, Nov 21, 2025)
ef7d4b6  return set[str] (roll-away, Nov 21, 2025)
181b293  add logfile for test (roll-away, Nov 21, 2025)
2aac268  filter can get the number of kernels in naive_graph_decomposer (roll-away, Nov 24, 2025)
00d5b4b  Merge branch 'PaddlePaddle:develop' into develop (roll-away, Nov 24, 2025)
75c3e61  post extract process feature (roll-away, Nov 25, 2025)
fe89add  remove unnecessary code blocks and variables (roll-away, Nov 25, 2025)
ca860b3  modify the way of counting kernels used (roll-away, Nov 25, 2025)
c21717f  modify the way of counting kernels used (roll-away, Nov 25, 2025)
de54e88  modify script, rename files and variables (roll-away, Nov 25, 2025)
9363023  add failure protection and log output when removing directories (roll-away, Nov 26, 2025)
adff744  Merge branch 'PaddlePaddle:develop' into develop (roll-away, Nov 27, 2025)
ca20508  add a script to check fusability of a given model (roll-away, Dec 1, 2025)
fc0071c  Merge branch 'PaddlePaddle:develop' into develop (roll-away, Dec 1, 2025)
9a28d45  Merge branch 'develop' of github.com:roll-away/GraphNet into develop (roll-away, Dec 1, 2025)
513cc38  add a script to check if a given model is fully fusable (roll-away, Dec 1, 2025)
4847ee3  Merge branch 'PaddlePaddle:develop' into develop (roll-away, Dec 1, 2025)
6538119  Merge branch 'PaddlePaddle:develop' into develop (roll-away, Dec 1, 2025)
22a2772  add a script to check if a given model is fully fusable (roll-away, Dec 1, 2025)
684dba9  a script to check if a given model is fully fusable (roll-away, Dec 1, 2025)
bfe0848  Merge branch 'PaddlePaddle:develop' into develop (roll-away, Dec 1, 2025)
f8cc102  add a script to check if a given model is fully fusionable (roll-away, Dec 1, 2025)
f131cfb  add a script to check if a given model is fully fusionable (roll-away, Dec 1, 2025)
f7f3d2a  add a script to find fully fusionable subgraph (roll-away, Dec 1, 2025)
353e7bd  find the biggest fully fusionable subgraph (roll-away, Dec 2, 2025)
b703458  update new codes (roll-away, Dec 8, 2025)
0b687cf  get fusible subgraph test (roll-away, Dec 8, 2025)
e70b44b  get fusible subgraph test (roll-away, Dec 8, 2025)
7dbb6e9  modify get fully fusible subgraph (roll-away, Dec 9, 2025)
f71b56b  improve fully_fusible_subgraph_extractor.py efficiency (lixinqi, Dec 9, 2025)
93fabbf  Merge pull request #1 from lixinqi/lxq_fusibletest (roll-away, Dec 9, 2025)
6df0cd0  backup code (lixinqi, Dec 9, 2025)
babdde5  Improve efficiency of test/fully_fusible_subgraph_extractor_test.sh (lixinqi, Dec 9, 2025)
48467f7  Merge pull request #2 from lixinqi/lxq_fusibletest (roll-away, Dec 9, 2025)
31 changes: 31 additions & 0 deletions graph_net/test/naive_decomposer_and_post_extract_process_test.sh
@@ -0,0 +1,31 @@
#!/bin/bash
# bash graph_net/test/naive_decomposer_and_post_extract_process_test.sh

GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(os.path.dirname(graph_net.__file__))")

# input model path
MODEL_NAME=resnet18
MODEL_PATH_IN_SAMPLES=/timm/$MODEL_NAME
decorator_config_json_str=$(cat <<EOF
{
"decorator_path": "$GRAPH_NET_ROOT/torch/extractor.py",
"decorator_config": {
"name": "$MODEL_NAME",
"custom_extractor_path": "$GRAPH_NET_ROOT/torch/naive_graph_decomposer.py",
"custom_extractor_config": {
"output_dir": "/tmp/naive_decompose_workspace",
"split_positions": [8, 16, 32],
"group_head_and_tail": true,
"filter_path":"$GRAPH_NET_ROOT/torch/naive_subgraph_filter.py",
"filter_config": {},
"post_extract_process_path":"$GRAPH_NET_ROOT/torch/post_extract_process_count_kernels.py",
"post_extract_process_class_name": "PostExtractProcess"
}
}
}
EOF
)
DECORATOR_CONFIG=$(echo "$decorator_config_json_str" | base64 -w 0)

python3 -m graph_net.torch.run_model --model-path $GRAPH_NET_ROOT/../samples/$MODEL_PATH_IN_SAMPLES --decorator-config=$DECORATOR_CONFIG
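For context, the script packs the decorator config as single-line base64 so it survives argument passing. A minimal sketch of the reverse step, assuming the receiving side decodes it like this (the helper name decode_decorator_config is illustrative, not part of this PR):

import base64
import json

def decode_decorator_config(encoded: str) -> dict:
    # Reverse of `base64 -w 0` in the shell script: decode the bytes, then parse the JSON.
    return json.loads(base64.b64decode(encoded).decode("utf-8"))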
9 changes: 9 additions & 0 deletions graph_net/test/naive_graph_decomposer_test.sh
100644 → 100755
@@ -1,4 +1,13 @@
#!/bin/bash
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
GRAPH_NET_DIR=$(dirname "$SCRIPT_DIR")
PROJECT_ROOT=$(dirname "$GRAPH_NET_DIR")

# Add the project root to the Python path
export PYTHONPATH="$PROJECT_ROOT:$PYTHONPATH"
Collaborator (commenting on lines 2-7):

Forcing users to modify PYTHONPATH is not a good approach. If the script fails, it is the user's responsibility to set PYTHONPATH in .bashrc.

Contributor Author:

Removed; a comment in the script now explains how to run it.

GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(os.path.dirname(graph_net.__file__))")
21 changes: 21 additions & 0 deletions graph_net/torch/naive_graph_decomposer.py
@@ -32,6 +32,8 @@ def make_config(
output_dir="./tmp/naive_decomposer_dir",
filter_path=None,
filter_config=None,
post_extract_process_path=None,
post_extract_process_class_name=None,
):
for pos in split_positions:
assert isinstance(
@@ -44,6 +46,8 @@ def make_config(
"output_dir": output_dir,
"filter_path": filter_path,
"filter_config": filter_config if filter_config is not None else {},
"post_extract_process_path": post_extract_process_path,
"post_extract_process_class_name": post_extract_process_class_name,
}

def __call__(self, gm: torch.fx.GraphModule, sample_inputs):
@@ -71,6 +75,7 @@ def __init__(self, parent_graph_extractor, submodule, seq_no):
self.seq_no = seq_no
self.extracted = False
name = f"{parent_graph_extractor.name}_{self.seq_no}"
self.model_name = name
self.builtin_extractor = BuiltinGraphExtractor(
name=name,
dynamic=False,
@@ -79,11 +84,15 @@ def __init__(self, parent_graph_extractor, submodule, seq_no):
workspace_path=self.parent_graph_extractor.config["output_dir"],
)
self.filter = self.make_filter(self.parent_graph_extractor.config)
self.post_extract_process = self.make_post_extract_process(
self.parent_graph_extractor.config
)

def forward(self, *args):
if not self.extracted:
if self.need_extract(self.submodule, args):
self.builtin_extractor(self.submodule, args)
self._post_extract_process()
self.extracted = True
return self.submodule(*args)

@@ -92,8 +101,20 @@ def need_extract(self, gm, sample_inputs):
return True
return self.filter(gm, sample_inputs)

def _post_extract_process(self):
# The hook is optional: make_post_extract_process returns None when no path is configured.
if self.post_extract_process is None:
return None
model_path = os.path.join(
self.parent_graph_extractor.config["output_dir"], self.model_name
)
return self.post_extract_process(model_path)

def make_filter(self, config):
if config["filter_path"] is None:
return None
module = imp_util.load_module(config["filter_path"])
return module.GraphFilter(config["filter_config"])

def make_post_extract_process(self, config):
if config["post_extract_process_path"] is None:
return None
module = imp_util.load_module(config["post_extract_process_path"])
# Look up the class named in the config instead of hard-coding PostExtractProcess,
# and construct it with the config dict rather than the module path string.
process_class = getattr(module, config["post_extract_process_class_name"])
return process_class(config)
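For reference, the minimal contract a post-extract hook must satisfy: a class constructed with the config dict and called with the extracted model's directory, returning a bool. A hedged sketch (the real hook in this PR is post_extract_process_count_kernels.py; this class name would be passed via post_extract_process_class_name):

class LoggingPostExtractProcess:
    def __init__(self, config):
        # Receives the decomposer config dict from make_post_extract_process.
        self.config = config

    def __call__(self, model_path=None):
        # Return False to signal the extracted model was rejected.
        if model_path is None:
            return False
        print(f"post-extract hook ran on {model_path}")
        return True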
1 change: 0 additions & 1 deletion graph_net/torch/naive_subgraph_filter.py
@@ -3,5 +3,4 @@ def __init__(self, config):
self.config = config

def __call__(self, gm, sample_inputs):
print(f"GraphFilter\n{gm.code}")
return True
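A custom filter module must expose a class named GraphFilter, since make_filter looks it up by that name and constructs it with filter_config. A minimal sketch that keeps only small subgraphs; the "max_nodes" key is an assumed config value for illustration, not one this PR defines:

class GraphFilter:
    def __init__(self, config):
        self.config = config
        # "max_nodes" is an assumed config key, not defined by this PR.
        self.max_nodes = config.get("max_nodes", 64)

    def __call__(self, gm, sample_inputs):
        # Extract only subgraphs with at most max_nodes FX nodes.
        return len(list(gm.graph.nodes)) <= self.max_nodes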
79 changes: 79 additions & 0 deletions graph_net/torch/post_extract_process_count_kernels.py
@@ -0,0 +1,79 @@
from graph_net.torch import utils
import importlib.util
import shutil
import torch
from typing import Type
from torch.profiler import profile, record_function, ProfilerActivity


class PostExtractProcess:
def __init__(self, config):
self.config = config

def __call__(self, model_path=None):
if model_path is None:
return False
# Load the extracted GraphModule class from the generated model.py.
model_class = load_class_from_file(
f"{model_path}/model.py", class_name="GraphModule"
)
assert model_class is not None
model = model_class()
Collaborator @Xreki (Nov 25, 2025):

Lines 26-51 are all unnecessary. You already know model_path, and once the model object is defined at line 58 you can call compile_and_count_kernels(model, state_dict) directly. Before applying torch.compile, you can also run model(**state_dict) once to check whether this sample works and executes successfully.
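A minimal sketch of the compile_and_count_kernels helper the reviewer is suggesting, built from the count_kernels function defined later in this file (the helper itself is not part of this PR):

def compile_and_count_kernels(model, state_dict) -> int:
    # Run eagerly first, as suggested, to verify the sample executes at all.
    model(**state_dict)
    compiled_model = torch.compile(model)
    return count_kernels(compiled_model, state_dict)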

print(f"{model_path=}")

inputs_params = utils.load_converted_from_text(model_path)
params = inputs_params["weight_info"]
state_dict = {k: utils.replay_tensor(v) for k, v in params.items()}

model(**state_dict)
compiled_model = torch.compile(model)
compiled_num_of_kernels = count_kernels(compiled_model, state_dict)
if compiled_num_of_kernels == 1:
print(model_path, "is fully fusible: compiles to a single kernel")
return True
else:
print(model_path, "is not fully fusible")
shutil.rmtree(model_path)
return False


def load_class_from_file(file_path: str, class_name: str) -> Type[torch.nn.Module]:
spec = importlib.util.spec_from_file_location("unnamed", file_path)
unnamed = importlib.util.module_from_spec(spec)
spec.loader.exec_module(unnamed)
model_class = getattr(unnamed, class_name, None)
return model_class


def count_kernels(model, sample_inputs) -> int:
"""
Count the number of CUDA kernel launches performed during a model's forward pass.
Args:
model: the graph module to profile.
sample_inputs: dict of tensors passed to the model as keyword arguments.
Returns:
int: The number of CUDA kernel launches observed.
Behavior:
- Runs the model once inside a PyTorch profiler context.
- Sums the counts of the 'cudaLaunchKernel' (runtime API) and
'cuLaunchKernel' (driver API) events, which correspond to kernel launches.
"""
model.eval()
# Profile one inference pass with the PyTorch profiler.
with profile(
activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU],
record_shapes=True,
) as prof:
with record_function("model_inference"):
output = model(**sample_inputs)
events = prof.key_averages()

total_count = 0
for e in events:
# Count both runtime-API (cudaLaunchKernel) and driver-API (cuLaunchKernel) launches.
if e.key in ("cuLaunchKernel", "cudaLaunchKernel"):
total_count += e.count
return total_count
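A usage sketch of count_kernels, assuming a CUDA device is available; the toy linear model and its inputs are illustrative only:

if __name__ == "__main__":
    # Count kernel launches for a single linear layer on GPU.
    toy_model = torch.nn.Linear(8, 8).cuda()
    toy_inputs = {"input": torch.randn(4, 8, device="cuda")}
    print("kernel launches:", count_kernels(toy_model, toy_inputs))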