1) support resume; 2) fix bugs in torch/backend/range_decomposer_validator_backend.py (#441)

lixinqi · JewelRoam · web-flow · commit 5750ae78ac1f · 2025-12-10T23:23:13.000+08:00
* debug_typical_sequence

* support model-path-prefix in splitting positions

* fix

* fix

* Improve efficiency of test/typical_sequence_decomposer_test.sh

* 1) support resume; 2) fix bugs in torch/backend/range_decomposer_validator_backend.py

---------

Co-authored-by: JewelRoam <2752594773@qq.com>
diff --git a/graph_net/model_path_handler.py b/graph_net/model_path_handler.py
@@ -1,4 +1,5 @@
 import argparse
+import traceback
 from graph_net.imp_util import load_module
 import logging
 import sys
@@ -52,6 +53,9 @@ def handle_model_path_list_in_current_process(handler, args):
         except KeyboardInterrupt:
             print("KeyboardInterrupt")
             return
+        except Exception:
+            print("------------[model_path_handler failed]------------", flush=True)
+            traceback.print_exc()
 
 
 def handle_model_path_list_in_subprocess(args):
diff --git a/graph_net/test/dev_model_list/validation_error_model_list.txt b/graph_net/test/dev_model_list/validation_error_model_list.txt
@@ -1,11 +1,4 @@
-samples/timm/convnextv2_base.fcmae_ft_in1k
-samples/timm/hgnet_tiny.paddle_in1k
-samples/timm/mobilenetv4_conv_aa_large.e230_r384_in12k
-samples/timm/regnety_080_tv.tv2_in1k
-samples/timm/res2net50_14w_8s.in1k
-samples/torchaudio/wavlm_base
 samples/torchgeometric/RECT_L
-samples/torchvision/vgg16_bn
 samples/transformers-auto-model/bge-small-en-v1.5
 samples/transformers-auto-model/distilbert_distilbert-base-multilingual-cased
 samples/transformers-auto-model/OFA-Sys_chinese-clip-vit-large-patch14
@@ -17,4 +10,3 @@ samples/transformers-auto-model/opus-mt-en-tw
 samples/transformers-auto-model/opus-mt-fi-niu
 samples/transformers-auto-model/opus-mt-tc-bible-big-deu_eng_fra_por_spa-bat
 samples/transformers-auto-model/opus-mt-tc-bible-big-gmw-deu_eng_fra_por_spa
-samples/ultralytics/yolov3-tinyu
diff --git a/graph_net/test/typical_sequence_decomposer_test.sh b/graph_net/test/typical_sequence_decomposer_test.sh
@@ -9,9 +9,28 @@ mkdir -p "$DECOMPOSE_PATH"
 # model_list="$GRAPH_NET_ROOT/graph_net/config/small100_torch_samples_list.txt"
 model_list="$GRAPH_NET_ROOT/graph_net/test/dev_model_list/validation_error_model_list.txt"
 
+op_names_extractor_config_json_str=$(cat <<EOF
+{
+    "handler_path": "$GRAPH_NET_ROOT/graph_net/torch/typical_sequence_split_points.py",
+    "handler_class_name": "OpNamesExtractor",
+    "handler_config": {
+        "resume": true,
+        "model_path_prefix": "$GRAPH_NET_ROOT",
+        "output_dir": "$DECOMPOSE_PATH"
+    }
+}
+EOF
+)
+OP_NAMES_EXTRACTOR_CONFIG=$(echo $op_names_extractor_config_json_str | base64 -w 0)
+
+python3 -m graph_net.model_path_handler \
+    --model-path-list $model_list \
+    --handler-config=$OP_NAMES_EXTRACTOR_CONFIG \
+
 python3 -m graph_net.torch.typical_sequence_split_points \
+    --enable-resume \
     --model-list "$model_list" \
-    --model-path-prefix "$GRAPH_NET_ROOT" \
+    --op-names-path-prefix "$DECOMPOSE_PATH" \
     --device "cuda" \
     --window-size 10 \
     --fold-policy default \
@@ -23,6 +42,7 @@ decompose_config_json_str=$(cat <<EOF
     "handler_path": "$GRAPH_NET_ROOT/graph_net/torch/graph_decomposer.py",
     "handler_class_name": "RangeDecomposerExtractor",
     "handler_config": {
+        "resume": true,
         "model_path_prefix": "$GRAPH_NET_ROOT",
         "output_dir": "$DECOMPOSE_PATH",
         "split_results_path": "$DECOMPOSE_PATH/split_results.json",
@@ -37,10 +57,10 @@ DECOMPOSE_CONFIG=$(echo $decompose_config_json_str | base64 -w 0)
 python3 -m graph_net.model_path_handler \
     --model-path-list $model_list \
     --handler-config=$DECOMPOSE_CONFIG \
-    --use-subprocess
 
 test_compiler_config_json_str=$(cat <<EOF
 {
+    "model_path_prefix": "$GRAPH_NET_ROOT",
     "decomposed_root": "$DECOMPOSE_PATH"
 }
 EOF
@@ -53,7 +73,7 @@ python3 -m graph_net.torch.test_compiler \
     --device cuda \
     --config $TEST_COMPILER_CONFIG \
     --model-path-prefix $GRAPH_NET_ROOT \
-    > "$DECOMPOSE_PATH/validation.log" 2>&1
+    2>&1 | tee "$DECOMPOSE_PATH/validation.log"
 
 python3 -m graph_net.plot_ESt \
     --benchmark-path "$DECOMPOSE_PATH/validation.log" \
diff --git a/graph_net/torch/backend/range_decomposer_validator_backend.py b/graph_net/torch/backend/range_decomposer_validator_backend.py
@@ -1,27 +1,41 @@
 import torch
+import inspect
 import torch.nn as nn
 import os
 import importlib.util
 from typing import List
 
 
 class ComposedModel(nn.Module):
-    def __init__(self, subgraph: List[nn.Module]):
+    def __init__(self, subgraphs: List[nn.Module]):
         super().__init__()
-        self.subgraphs = nn.ModuleList(subgraph)
+        self.subgraphs = nn.ModuleList(subgraphs)
 
     def forward(self, **kwargs):
         output = None
         for i, subgraph in enumerate(self.subgraphs):
-            print(f"{i=} subgraph begin")
             if output is None:
-                output = subgraph(**kwargs)
+                output = subgraph(**self._convert_inputs(subgraph, kwargs))
             else:
                 output = subgraph(*output)
-            print(f"{i=} subgraph end")
 
         return output
 
+    def _convert_inputs(self, subgraph, input_kwargs):
+        input_keywords = set(name for name, _ in input_kwargs.items())
+        sub_graph_arg_names = set(inspect.signature(subgraph.forward).parameters)
+        assert (
+            len(sub_graph_arg_names - input_keywords) == 0
+        ), f"{(sub_graph_arg_names - input_keywords)=}"
+        for remainder in input_keywords - sub_graph_arg_names:
+            assert remainder.startswith("s")
+            assert remainder[1:].isdigit()
+        return {
+            name: value
+            for name, value in input_kwargs.items()
+            if name in sub_graph_arg_names
+        }
+
 
 class RangeDecomposerValidatorBackend:
     def _load_model_instance(self, path: str, device: str) -> torch.nn.Module:
@@ -36,40 +50,56 @@ def _load_model_instance(self, path: str, device: str) -> torch.nn.Module:
         instance = ModelClass().to(device)
         return instance
 
-    def _make_config(self, decomposed_root, decomposed_model_name_suffix="_decomposed"):
+    def _make_config(
+        self,
+        model_path_prefix: str,
+        decomposed_root: str,
+        decomposed_dentry: str = "_decomposed",
+    ):
         return {
+            "model_path_prefix": model_path_prefix,
             "decomposed_root": decomposed_root,
-            "decomposed_model_name_suffix": decomposed_model_name_suffix,
+            "decomposed_dentry": decomposed_dentry,
         }
 
+    def _get_rel_model_path(self, model_path) -> str:
+        model_path = os.path.realpath(model_path)
+        model_path_prefix = os.path.realpath(self.config["model_path_prefix"])
+        assert model_path.startswith(model_path_prefix)
+        rel_model_path = model_path[len(model_path_prefix) :]
+        if rel_model_path.startswith("/"):
+            rel_model_path = rel_model_path[1:]
+        assert not rel_model_path.startswith("/")
+        return rel_model_path
+
+    def _get_model_name_order(self, name):
+        lst = name.split("_")
+        if not (len(lst) > 0):
+            return -1
+        if not (lst[-1].isdigit()):
+            return -1
+        return int(lst[-1])
+
     def __call__(self, model: torch.nn.Module) -> torch.nn.Module:
         config = self._make_config(**self.config)
-        model_file_path = model.__class__.__graph_net_file_path__
-        model_dir = os.path.dirname(model_file_path)
-        model_name = os.path.basename(model_dir)
+        model_path = os.path.dirname(model.__class__.__graph_net_file_path__)
+        rel_model_path = self._get_rel_model_path(model_path)
         decomposed_parent_dir = os.path.join(
-            config["decomposed_root"], f"{model_name}_decomposed"
+            config["decomposed_root"], rel_model_path, config["decomposed_dentry"]
         )
         subgraph_paths = []
-        for name in sorted(os.listdir(decomposed_parent_dir)):
+        dentries = os.listdir(decomposed_parent_dir)
+        for name in sorted(dentries, key=self._get_model_name_order):
             full_path = os.path.join(decomposed_parent_dir, name)
-            if os.path.isdir(full_path) and name[-1].isdigit():
+            if os.path.isdir(full_path) and self._get_model_name_order(name) >= 0:
                 subgraph_paths.append(full_path)
 
-        print(
-            f"[RangeDecomposerValidatorBackend] Found subgraphs: {[os.path.basename(p) for p in subgraph_paths]}"
-        )
-
         device = model.__class__.__graph_net_device__
         subgraph_instances = []
 
         for path in subgraph_paths:
             instance = self._load_model_instance(path, device)
             subgraph_instances.append(instance)
-            dir_name = os.path.basename(path)
-            print(
-                f"[RangeDecomposerValidatorBackend] Loaded and instantiated '{dir_name}'"
-            )
 
         composed_model = ComposedModel(subgraph_instances)
         return composed_model.eval()
diff --git a/graph_net/torch/fx_graph_parse_util.py b/graph_net/torch/fx_graph_parse_util.py
@@ -113,7 +113,8 @@ def _get_name_pattern2replacement(names_from_signature, names_from_placeholder):
 
 
 def _rename_placeholder(name, pattern2replacement):
-    assert name[:2] == "L_" or name[:2] == "l_", f"{name=}"
+    if not (name[:2] == "L_" or name[:2] == "l_"):
+        return name
     name = name[2:]
     if name[0] == "l":
         name = "L" + name[1:]
diff --git a/graph_net/torch/graph_decomposer.py b/graph_net/torch/graph_decomposer.py
@@ -1,4 +1,5 @@
 import os
+from pathlib import Path
 import torch
 import json
 from graph_net.torch.decompose_util import convert_to_submodules_graph
@@ -79,7 +80,7 @@ def __call__(self, gm: torch.fx.GraphModule, sample_inputs):
     def get_naive_decomposer_extractor(self, submodule, seq_no):
         return NaiveDecomposerExtractorModule(
             config=self.config,
-            parent_graph_name=self.name,
+            parent_graph_rel_model_path=self.name,
             submodule=submodule,
             seq_no=seq_no,
         )
@@ -145,7 +146,7 @@ def get_naive_decomposer_extractor(self, model_path):
         def fn(submodule, seq_no):
             return NaiveDecomposerExtractorModule(
                 config=self.config,
-                parent_graph_name=os.path.basename(model_path),
+                parent_graph_rel_model_path=os.path.basename(model_path),
                 submodule=submodule,
                 seq_no=seq_no,
             )
@@ -165,6 +166,7 @@ def __init__(self, config: dict = None):
 
     def _make_config(
         self,
+        resume: bool = False,
         split_results_path=None,
         group_head_and_tail=False,
         chain_style=False,
@@ -181,6 +183,7 @@ def _make_config(
                 f"split_results_path should be a valid JSON file path, but got {split_results_path=}"
             )
         return {
+            "resume": resume,
             "split_results_path": split_results_path,
             "group_head_and_tail": group_head_and_tail,
             "chain_style": chain_style,
@@ -190,10 +193,25 @@ def _make_config(
             "model_path_prefix": model_path_prefix,
         }
 
+    def _is_model_handled(self, rel_model_path, split_positions):
+        num_subgraphs = len(split_positions) + 1
+        decomposed_model_path = Path(self.config["output_dir"]) / rel_model_path
+        num_decomposed = len(list(decomposed_model_path.rglob("model.py")))
+        if num_decomposed > 0:
+            assert (
+                num_subgraphs <= num_decomposed
+            ), f"{num_subgraphs=} {num_decomposed=} {str(decomposed_model_path)=}"
+        return num_subgraphs == num_decomposed
+
     def __call__(self, rel_model_path):
         model_path = os.path.join(self.config["model_path_prefix"], rel_model_path)
         split_results = load_json(self.config["split_results_path"])
         split_positions = split_results[rel_model_path]["split_positions"]
+        if self.config["resume"] and self._is_model_handled(
+            rel_model_path, split_positions
+        ):
+            return
+        torch.cuda.empty_cache()
         config = {
             "split_positions": split_positions,
             "group_head_and_tail": self.config.get("group_head_and_tail", False),
@@ -203,16 +221,16 @@ def __call__(self, rel_model_path):
         gm = parse_sole_graph_module(module, inputs)
         rewrited_gm: torch.fx.GraphModule = convert_to_submodules_graph(
             gm,
-            submodule_hook=self.get_naive_decomposer_extractor(model_path),
+            submodule_hook=self.get_naive_decomposer_extractor(rel_model_path),
             **config,
         )
         rewrited_gm(*inputs)
 
-    def get_naive_decomposer_extractor(self, model_path):
+    def get_naive_decomposer_extractor(self, rel_model_path):
         def fn(submodule, seq_no):
             return NaiveDecomposerExtractorModule(
                 config=self.config,
-                parent_graph_name=os.path.basename(model_path),
+                parent_graph_rel_model_path=rel_model_path,
                 submodule=submodule,
                 seq_no=seq_no,
             )
@@ -224,7 +242,7 @@ class NaiveDecomposerExtractorModule(torch.nn.Module):
     def __init__(
         self,
         config: dict,
-        parent_graph_name: str,
+        parent_graph_rel_model_path: str,
         submodule: torch.nn.Module,
         seq_no: int,
     ):
@@ -233,34 +251,28 @@ def __init__(
         self.submodule = submodule
         self.seq_no = seq_no
         self.extracted = False
-        self.parent_graph_name = parent_graph_name
+        self.parent_graph_rel_model_path = parent_graph_rel_model_path
+        parent_graph_model_name = os.path.basename(parent_graph_rel_model_path)
         if self.seq_no is None:
-            self.model_name = parent_graph_name
+            self.model_name = parent_graph_model_name
         else:
-            submodule_name = f"{parent_graph_name}_{self.seq_no}"
+            submodule_name = f"{parent_graph_model_name}_{self.seq_no}"
             self.model_name = submodule_name
         self.builtin_extractor = BuiltinGraphExtractor(
             name=submodule_name,
             dynamic=False,
             mut_graph_codes=[],
             placeholder_auto_rename=False,
             workspace_path=os.path.join(
-                self.config["output_dir"], f"{parent_graph_name}_decomposed"
+                self.config["output_dir"], parent_graph_rel_model_path, "_decomposed"
             ),
         )
         self.filter = self.make_filter(self.config)
 
     def _get_model_path(self):
         return os.path.join(
             self.config["output_dir"],
-            f"{self.parent_graph_name}_decomposed",
-            self.model_name,
-        )
-
-    def _get_model_path(self):
-        return os.path.join(
-            self.config["output_dir"],
-            f"{self.parent_graph_name}_decomposed",
+            f"{self.parent_graph_rel_model_path}/_decomposed",  # mirrors workspace_path
             self.model_name,
         )
 
diff --git a/graph_net/torch/test_compiler.py b/graph_net/torch/test_compiler.py
diff --git a/graph_net/torch/typical_sequence_split_points.py b/graph_net/torch/typical_sequence_split_points.py
diff --git a/graph_net/torch/utils.py b/graph_net/torch/utils.py