
Commit 9639a52

fixed kv head replication in qwen3 moe (#357)
* fixed kv head replication in qwen3 moe
* polish
* polish
1 parent dc44caf commit 9639a52

File tree (3 files changed: +56 -18 lines)

specforge/layers/linear.py
specforge/modeling/target/custom_backend/qwen3_moe.py
tests/test_modeling/test_target/test_custom_backend/test_qwen3_moe_tp.py

specforge/layers/linear.py
Lines changed: 33 additions & 9 deletions

@@ -79,8 +79,10 @@ def __init__(
         bias=True,
         device=None,
         dtype=None,
-        kv_head_replicas=False,
         layout_type: str = "normal",
+        kv_head_replicas=False,
+        kv_head_idx=None,
+        total_num_kv_heads=None,
     ):
         super().__init__()
         factory_kwargs = {"device": device, "dtype": dtype}
@@ -91,7 +93,10 @@ def __init__(

         self.in_features = in_features
         self.out_features = out_features
-        if kv_head_replicas:
+        self.kv_head_replicas = kv_head_replicas
+        self.kv_head_idx = kv_head_idx
+        self.total_num_kv_heads = total_num_kv_heads
+        if self.kv_head_replicas:
             self.out_features_per_shard = out_features
         else:
             self.out_features_per_shard = out_features // self.tp_size
@@ -113,14 +118,33 @@ def shard_state_dict(self, state_dict, *args):
         """
         This is a state dict hook to be triggered before loading the state dict. This will shard the weights and biases according to the layout type.
         """
-        if self.layout_type == "normal":
-            self.handle_normal_layout(state_dict, *args)
-        elif self.layout_type == "merged_qkv":
-            self.handle_merged_qkv(state_dict, *args)
-        elif self.layout_type == "gate_up":
-            self.handle_gate_up_layout(state_dict, *args)
+        if self.kv_head_replicas:
+            assert self.kv_head_idx is not None
+            assert self.layout_type == "normal"
+            self.handle_kv_head_replicas(state_dict, *args)
         else:
-            raise ValueError(f"Invalid layout type: {self.layout_type}")
+            if self.layout_type == "normal":
+                self.handle_normal_layout(state_dict, *args)
+            elif self.layout_type == "merged_qkv":
+                self.handle_merged_qkv(state_dict, *args)
+            elif self.layout_type == "gate_up":
+                self.handle_gate_up_layout(state_dict, *args)
+            else:
+                raise ValueError(f"Invalid layout type: {self.layout_type}")
+
+    def handle_kv_head_replicas(self, state_dict, *args):
+        """
+        This is a special case for GQA where the key/value are split according to the number of kv heads and the head which belongs to this rank.
+        As the TP size is larger than the number of kv heads, we only keep one kv head per rank.
+        """
+        if "weight" in state_dict:
+            state_dict["weight"] = state_dict["weight"].chunk(
+                self.total_num_kv_heads, dim=0
+            )[self.kv_head_idx]
+        if "bias" in state_dict and state_dict["bias"] is not None:
+            state_dict["bias"] = state_dict["bias"].chunk(
+                self.total_num_kv_heads, dim=0
+            )[self.kv_head_idx]

     def handle_normal_layout(self, state_dict, *args):
         """

specforge/modeling/target/custom_backend/qwen3_moe.py
Lines changed: 14 additions & 4 deletions

@@ -78,14 +78,18 @@ def __init__(self, config: Qwen3MoeConfig, layer_idx: int):
         # Calculate head distribution for TP
         self.total_num_heads = config.num_attention_heads
         self.total_num_kv_heads = config.num_key_value_heads
-        self.num_heads = self.total_num_heads // self.tp_size
+        self.num_heads = (
+            self.total_num_heads // self.tp_size
+        )  # this is the number heads per rank

         # Handle KV head replication when tp_size > total_num_kv_heads
         if self.tp_size > self.total_num_kv_heads:
             # In replication mode, each rank gets 1 KV head (replicated across groups)
             self.num_kv_heads = 1
             self.num_kv_head_replicas = self.tp_size // self.total_num_kv_heads
-            self.num_key_value_groups = self.num_heads // self.num_kv_heads
+            self.num_key_value_groups = (
+                self.num_heads // self.num_kv_heads
+            )  # this is size for expanding kv for gqa
             self.kv_head_replicas = True
         else:
             self.num_kv_heads = self.total_num_kv_heads
@@ -103,18 +107,23 @@ def __init__(self, config: Qwen3MoeConfig, layer_idx: int):
             self.num_kv_heads * self.head_dim,
             bias=config.attention_bias,
             kv_head_replicas=self.kv_head_replicas,
+            kv_head_idx=self.tp_rank // self.num_kv_head_replicas,
+            total_num_kv_heads=config.num_key_value_heads,
         )
         self.v_proj = ColumnParallelLinear(
             config.hidden_size,
             self.num_kv_heads * self.head_dim,
             bias=config.attention_bias,
             kv_head_replicas=self.kv_head_replicas,
+            kv_head_idx=self.tp_rank // self.num_kv_head_replicas,
+            total_num_kv_heads=config.num_key_value_heads,
         )
         self.o_proj = RowParallelLinear(
             config.num_attention_heads * self.head_dim,
             config.hidden_size,
             bias=config.attention_bias,
         )
+
         self.q_norm = Qwen3MoeRMSNorm(
             self.head_dim, eps=config.rms_norm_eps
         )  # unlike olmo, only on the head dim!
@@ -193,9 +202,10 @@ def __init__(self, config, intermediate_size=None):

         # Add TP support
         self.tp_group = get_tp_group()
-
         self.gate_proj = ColumnParallelLinear(
-            self.hidden_size, self.intermediate_size, bias=False
+            self.hidden_size,
+            self.intermediate_size,
+            bias=False,
         )
         self.up_proj = ColumnParallelLinear(
             self.hidden_size, self.intermediate_size, bias=False
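The kv_head_idx = self.tp_rank // self.num_kv_head_replicas argument passed to k_proj and v_proj encodes the replication mapping: consecutive groups of ranks share one KV head whenever tp_size exceeds the number of KV heads. A small sketch of that mapping, with head counts made up purely for illustration:

# Illustrative numbers: 8 TP ranks sharing 2 KV heads from the checkpoint.
tp_size, total_num_kv_heads = 8, 2
num_kv_head_replicas = tp_size // total_num_kv_heads  # 4 ranks per KV head

for tp_rank in range(tp_size):
    kv_head_idx = tp_rank // num_kv_head_replicas
    print(f"rank {tp_rank} loads kv head {kv_head_idx}")
# ranks 0-3 -> kv head 0, ranks 4-7 -> kv head 1
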

tests/test_modeling/test_target/test_custom_backend/test_qwen3_moe_tp.py
Lines changed: 9 additions & 5 deletions

@@ -16,7 +16,7 @@
 from tests.utils import get_available_port


-def test_qwen3_moe_tp(rank, world_size, temp_dir, port):
+def test_qwen3_moe_tp(rank, world_size, temp_dir, port, num_heads, num_kv_heads):
     os.environ["RANK"] = str(rank)
     os.environ["WORLD_SIZE"] = str(world_size)
     os.environ["MASTER_ADDR"] = "localhost"
@@ -33,8 +33,8 @@ def test_qwen3_moe_tp(rank, world_size, temp_dir, port):
         moe_intermediate_size=512,
         num_hidden_layers=2,
         max_position_embeddings=1024,
-        num_attention_heads=8,
-        num_key_value_heads=4,
+        num_attention_heads=num_heads,
+        num_key_value_heads=num_kv_heads,
         num_experts=64,
         num_experts_per_tok=8,
         hidden_act="silu",
@@ -93,10 +93,14 @@ def setUp(self):
     def tearDown(self):
         self.temp_dir.cleanup()

-    def test_qwen3_moe_tp(self):
+    def test_qwen3_moe_tp_no_kv_head_replicas(self):
         # Set to 2 as only 2 GPU avaialble in CI
         port = get_available_port()
-        mp.spawn(test_qwen3_moe_tp, nprocs=2, args=(2, self.temp_dir.name, port))
+        mp.spawn(test_qwen3_moe_tp, nprocs=2, args=(2, self.temp_dir.name, port, 8, 4))
+
+    def test_qwen3_moe_tp_kv_head_replicas(self):
+        port = get_available_port()
+        mp.spawn(test_qwen3_moe_tp, nprocs=2, args=(2, self.temp_dir.name, port, 8, 1))


 if __name__ == "__main__":
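The added test case runs with num_heads=8 and num_kv_heads=1 on 2 ranks, which is exactly the tp_size > total_num_kv_heads condition that triggers the replication path. A quick sketch of the values that setup produces, derived from the formulas in the diffs above rather than from the test file itself:

# CI setup from the new test: 2 spawned ranks, a single KV head in the config.
tp_size, total_num_kv_heads = 2, 1
assert tp_size > total_num_kv_heads                    # replication branch is taken
num_kv_head_replicas = tp_size // total_num_kv_heads   # 2 replicas of the single head
for tp_rank in range(tp_size):
    assert tp_rank // num_kv_head_replicas == 0        # both ranks load KV head 0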
