@@ -608,26 +608,26 @@ def forward(
             past_key_values_length = past_key_values[0][0].shape[2]
             seq_length_with_past = seq_length_with_past + past_key_values_length
 
-        if position_ids is None:
-            attention_mask_tensor = (
-                attention_mask
-                if not isinstance(attention_mask, dict)
-                else attention_mask["full_attention"]
+        base_attention_mask = (
+            attention_mask
+            if not isinstance(attention_mask, dict)
+            else attention_mask["full_attention"]
+        )
+        # Cache the raw mask so that SDPA and the RoPE refresh both see the same window-aligned view.
+        if base_attention_mask is not None and base_attention_mask.ndim == 4:
+            base_attention_mask = torch.diagonal(
+                base_attention_mask[:, 0], dim1=1, dim2=2
             )
-            if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
-                attention_mask_tensor = torch.diagonal(
-                    attention_mask_tensor[:, 0], dim1=1, dim2=2
-                )
-                attention_mask_tensor = (
-                    attention_mask_tensor
-                    / torch.finfo(attention_mask_tensor.dtype).min
-                )
-                attention_mask_tensor = (1.0 - attention_mask_tensor).int()
+            base_attention_mask = (
+                base_attention_mask / torch.finfo(base_attention_mask.dtype).min
+            )
+            base_attention_mask = (1.0 - base_attention_mask).int()
 
+        if position_ids is None:
             get_rope_kwargs = {
                 "input_ids": input_ids,
                 "image_grid_thw": image_grid_thw,
-                "attention_mask": attention_mask_tensor,
+                "attention_mask": base_attention_mask,
             }
             if self.target_model_type in {"qwen3_vl", "qwen3_vl_moe"}:
                 get_rope_kwargs["video_grid_thw"] = video_grid_thw
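The hunk above hoists the 4D-to-2D mask collapse out of the `position_ids is None` branch so later steps can reuse the cached `base_attention_mask`. A minimal standalone sketch of that collapse, assuming the usual HF-style additive mask convention; the function name and toy data below are illustrative, not from this PR:

```python
import torch

def collapse_4d_mask_to_2d(mask_4d: torch.Tensor) -> torch.Tensor:
    """Recover a 0/1 keep mask from an additive 4D causal mask.

    Assumes the HF convention: shape (batch, 1, tgt_len, src_len) with 0.0
    where attention is allowed and dtype-min where it is blocked. The diagonal
    of each (tgt, src) slice says whether token i may attend to itself, which
    is exactly the padding information an mRoPE index helper needs, and it
    sidesteps the causal upper triangle entirely.
    """
    diag = torch.diagonal(mask_4d[:, 0], dim1=1, dim2=2)  # (batch, seq)
    blocked = diag / torch.finfo(diag.dtype).min          # 0.0 -> 0, dtype-min -> 1
    return (1.0 - blocked).int()                          # 1 = real token, 0 = padding


# Toy check: batch of 1, length 4, last position padded out.
seq = 4
mask = torch.zeros(1, 1, seq, seq)
mask[:, :, :, -1] = torch.finfo(mask.dtype).min  # block the padded key column
mask[:, :, -1, :] = torch.finfo(mask.dtype).min  # and the padded query row
print(collapse_4d_mask_to_2d(mask))              # tensor([[1, 1, 1, 0]], dtype=torch.int32)
```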
@@ -639,8 +639,18 @@ def forward(
             )
             if rope_deltas is not None:
                 self.rope_deltas = rope_deltas
+            full_attention_mask = (
+                base_attention_mask.clone()
+                if base_attention_mask is not None
+                else None
+            )
         else:
             position_ids = position_ids
+            full_attention_mask = (
+                base_attention_mask.clone()
+                if base_attention_mask is not None
+                else None
+            )
 
         # Step 4: handle attention mask
         if attention_mask is None:
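The `.clone()` in both branches presumably keeps the cached 2D mask and the working copy from sharing storage, since the working copy is re-padded every step further down. A quick illustration of the aliasing it avoids, in plain PyTorch with nothing repo-specific:

```python
import torch

base = torch.ones(1, 6, dtype=torch.int32)

alias = base                 # same storage: in-place edits leak back into the cache
alias[:, -1] = 0
print(base)                  # tensor([[1, 1, 1, 1, 1, 0]], dtype=torch.int32)

base = torch.ones(1, 6, dtype=torch.int32)
copy = base.clone()          # independent storage: the cached mask stays pristine
copy[:, -1] = 0
print(base)                  # tensor([[1, 1, 1, 1, 1, 1]], dtype=torch.int32)
```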
@@ -651,7 +661,7 @@ def forward(
             )
         if self.attention_backend == "sdpa":
             attention_mask = self.draft_model.prepare_decoder_attention_mask(
-                attention_mask=attention_mask,
+                attention_mask=full_attention_mask,
                 hidden_states=hidden_states,
                 batch_size=batch_size,
                 seq_length=seq_length,
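The SDPA branch now feeds `prepare_decoder_attention_mask` the cached 2D keep mask rather than a possibly already-expanded tensor. The repo's helper is not shown in this diff; the sketch below assumes a typical HF-style expansion (0 where attention is allowed, dtype-min where blocked) purely to show the shape contract the call relies on, so every name here is illustrative:

```python
import torch

def expand_to_causal_4d(keep_mask: torch.Tensor, dtype=torch.float32) -> torch.Tensor:
    """Illustrative stand-in for a prepare_decoder_attention_mask-style helper.

    Turns a (batch, seq) 0/1 keep mask into the additive (batch, 1, seq, seq)
    causal mask SDPA consumes. The real helper in the repo may differ; this
    only shows why handing it an already-expanded 4D mask would be wrong.
    """
    bsz, seq = keep_mask.shape
    min_val = torch.finfo(dtype).min
    causal = torch.triu(torch.full((seq, seq), min_val, dtype=dtype), diagonal=1)
    pad = (1.0 - keep_mask.to(dtype))[:, None, None, :] * min_val  # block padded keys
    return (causal[None, None] + pad).clamp(min=min_val)           # (bsz, 1, seq, seq)


keep = torch.tensor([[1, 1, 1, 0]])
print(expand_to_causal_4d(keep).shape)  # torch.Size([1, 1, 4, 4])
```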
@@ -715,6 +725,67 @@ def forward(
             input_ids = padding(input_ids, left=False)
             position_mask = padding(position_mask, left=False)
             loss_mask = padding(loss_mask, left=False)
+            # Shrink the cached mask so SDPA keeps the same view after padding.
+            # The RoPE refresh below reuses this window-aligned copy as well.
+            if full_attention_mask is not None:
+                full_attention_mask = padding(full_attention_mask, left=False)
+
+            if self.attention_backend == "sdpa":
+                attention_mask = self.draft_model.prepare_decoder_attention_mask(
+                    attention_mask=full_attention_mask,
+                    hidden_states=hidden_states,
+                    batch_size=batch_size,
+                    seq_length=seq_length,
+                    past_key_values_length=past_key_values_length,
+                )
+            elif (
+                attention_mask is not None
+                and self.target_model_type in {"qwen3_vl", "qwen3_vl_moe"}
+            ):
+                # qwen3 path carries the un-expanded 2D causal mask directly.
+                attention_mask = padding(attention_mask, left=False)
+
+            next_attention_tensor = (
+                full_attention_mask
+                if full_attention_mask is not None
+                else (
+                    attention_mask
+                    if self.target_model_type in {"qwen3_vl", "qwen3_vl_moe"}
+                    else None
+                )
+            )
+            if (
+                next_attention_tensor is not None
+                and self.target_model_type not in {"qwen3_vl", "qwen3_vl_moe"}
+                and next_attention_tensor.ndim == 4
+            ):
+                # qwen2.5 still produces inverted 4D masks; collapse and flip them before RoPE.
+                next_attention_tensor = torch.diagonal(
+                    next_attention_tensor[:, 0], dim1=1, dim2=2
+                )
+                if next_attention_tensor.dtype.is_floating_point:
+                    next_attention_tensor = (
+                        next_attention_tensor
+                        / torch.finfo(next_attention_tensor.dtype).min
+                    )
+                    next_attention_tensor = (1.0 - next_attention_tensor).int()
+
+            # qwen3_vl expects video grid kwargs rather than second_per_grid_ts; qwen2.5_vl still needs both.
+            rope_kwargs = {
+                "input_ids": input_ids,
+                "image_grid_thw": image_grid_thw,
+                "attention_mask": next_attention_tensor,
+            }
+            if self.target_model_type in {"qwen3_vl", "qwen3_vl_moe"}:
+                rope_kwargs["video_grid_thw"] = video_grid_thw
+            else:
+                rope_kwargs["video_grid_thw"] = video_grid_thw
+                rope_kwargs["second_per_grid_ts"] = second_per_grid_ts
+            position_ids, rope_deltas = self.target_model.model.get_rope_index(
+                **rope_kwargs
+            )
+            if rope_deltas is not None:
+                self.rope_deltas = rope_deltas
         # Flex attention mask shrinking is handled inside attention module
         return plosses, vlosses, acces
 
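Read end to end, the last hunk re-pads the cached mask, rebuilds the SDPA mask, and then recomputes the mRoPE position ids after every padding step. A condensed sketch of that refresh, assuming only the `get_rope_index` call signatures the diff itself relies on; every other name below is illustrative:

```python
import torch

QWEN3_TYPES = {"qwen3_vl", "qwen3_vl_moe"}


def refresh_mrope_position_ids(model, model_type, input_ids, mask,
                               image_grid_thw, video_grid_thw,
                               second_per_grid_ts=None):
    """Condensed sketch of the per-step RoPE refresh added in the last hunk.

    `mask` is whatever the loop is carrying: a 0/1 2D keep mask, or an
    inverted additive 4D mask on the qwen2.5 path. Only the get_rope_index
    call mirrors the diff; this helper and its arguments are illustrative.
    """
    if mask is not None and mask.ndim == 4:
        # Collapse the additive 4D mask back to the 0/1 form get_rope_index expects.
        mask = torch.diagonal(mask[:, 0], dim1=1, dim2=2)
        if mask.dtype.is_floating_point:
            mask = (1.0 - mask / torch.finfo(mask.dtype).min).int()

    rope_kwargs = {
        "input_ids": input_ids,
        "image_grid_thw": image_grid_thw,
        "video_grid_thw": video_grid_thw,
        "attention_mask": mask,
    }
    if model_type not in QWEN3_TYPES:
        # Per the diff, only the qwen2.5-VL signature also takes per-grid timestamps.
        rope_kwargs["second_per_grid_ts"] = second_per_grid_ts

    return model.get_rope_index(**rope_kwargs)
```

In the diff, the returned pair is written back into `position_ids` and `self.rope_deltas` before the next padded step.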