Commit 3d40d9d

Fix invalid Triton code for mixed scalar/block indexing in store operations when block dimension has size 1
Fixes #1256

stack-info: PR: #1258, branch: oulgen/stack/186
1 parent 28cc903 commit 3d40d9d
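
The failure mode, in short: when every block dimension of the tensor being stored to has size 1, those dimensions are dropped from the offset computation, so the generated pointer expression is a scalar while the stored value is still a block, and Triton rejects the rank mismatch. A minimal before/after sketch of the store, borrowing the names from the generated code in test_indexing.expected below (the "before" line is what the old codegen emitted without the reshape, reconstructed for illustration, not copied from the repo):

tl.store(scales + floordiv * 1, v_1, None)                    # before: scalar pointer, value of shape [1] -> invalid
tl.store(scales + floordiv * 1, tl.reshape(v_1, []), None)    # after: value squeezed to rank 0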

File tree

3 files changed: +155 −0 lines changed
helion/_compiler/indexing_strategy.py

Lines changed: 36 additions & 0 deletions

@@ -179,6 +179,42 @@ def codegen_store(
     ) -> ast.AST:
         indexing = SubscriptIndexing.create(state, fake_tensor, subscript, extra_mask)
         name = state.device_function.tensor_arg(fake_tensor).name
+
+        # Compute the effective pointer shape (dimensions that contribute to the offset).
+        # Dimensions where fake_tensor.size(i) == 1 are skipped in the offset computation,
+        # so we need to reshape the value to match the effective pointer shape.
+        env = CompileEnvironment.current()
+        output_size = SubscriptIndexing.compute_shape(fake_tensor, subscript, state)
+
+        # Compute the effective shape after dropping size-1 tensor dimensions.
+        # This matches the logic in SubscriptIndexing.create that skips size-1 dims.
+        effective_shape = []
+        tensor_dim = 0
+        for k in subscript:
+            if k is None:
+                # None adds a dimension of size 1 to output, not from tensor
+                pass
+            elif isinstance(k, int):
+                # Scalar int eliminates the dimension
+                tensor_dim += 1
+            elif isinstance(k, (torch.SymInt, torch.Tensor, slice)):
+                # These consume a tensor dimension
+                if not env.known_equal(fake_tensor.size(tensor_dim), 1):
+                    # This dimension contributes to the pointer;
+                    # find the corresponding output dimension.
+                    if tensor_dim < len(output_size):
+                        effective_shape.append(output_size[tensor_dim])
+                tensor_dim += 1
+
+        # If effective_shape is empty but output_size is not all-1s, we need to reshape
+        # the value to be scalar. Skip reshaping for scalar constants which don't have shape.
+        if not effective_shape and output_size and not isinstance(value, ast.Constant):
+            # Pointer is scalar but value may have shape - squeeze to scalar
+            value = expr_from_string(
+                "tl.reshape({value}, [])",
+                value=value,
+            )
+
         return expr_from_string(
             f"tl.store({name} + {{offset}}, {{value}}, {{mask}})",
             value=value,
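
The loop above classifies each subscript entry the same way SubscriptIndexing.create does. Here is a standalone sketch of that rule with plain Python ints standing in for SymInts and fake tensors; the helper name and toy shapes are hypothetical, for illustration only:

def effective_shape(tensor_shape, subscript, output_size):
    # Mirrors the classification above: None adds an output dim that never
    # touches the pointer, an int consumes a dim without contributing, and a
    # slice/block index contributes only when its dim is not size 1.
    shape = []
    tensor_dim = 0
    for k in subscript:
        if k is None:
            continue
        if isinstance(k, int):
            tensor_dim += 1
            continue
        if tensor_shape[tensor_dim] != 1 and tensor_dim < len(output_size):
            shape.append(output_size[tensor_dim])
        tensor_dim += 1
    return shape

# scales[m_tile, scale_col_idx] with m == 1: the block dim has size 1 and the
# column index is a scalar, so nothing survives -> scalar pointer, and the
# reshape-to-[] branch above fires.
assert effective_shape((1, 2), [slice(None), 0], [1]) == []
# With m == 4 the row dim survives and no reshape is needed.
assert effective_shape((4, 2), [slice(None), 0], [4]) == [4]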

test/test_indexing.expected

Lines changed: 69 additions & 0 deletions

@@ -759,6 +759,75 @@ def masked_store(x: torch.Tensor, *, _launcher=_default_launcher):
     # src[test_indexing.py:N]: return out
     return out
 
+--- assertExpectedJournal(TestIndexing.test_mixed_scalar_block_store_size1_dim)
+from __future__ import annotations
+
+import torch
+import helion.language as hl
+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers
+from torch._inductor.runtime.triton_helpers import math as tl_math
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_kernel_with_mixed_store(x_data, out, scales, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
+    # src[test_indexing.py:N]: for m_tile, n_tile in hl.tile([m, n], block_size=[None, n_block]):
+    num_blocks_0 = 1
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_1 * _BLOCK_SIZE_0
+    # src[test_indexing.py:N]: n_tile.begin, n_tile.end, block_size=BLOCK_SIZE
+    tile_end = offset_0 + _BLOCK_SIZE_0
+    # src[test_indexing.py:N]: for n_tile_local in hl.tile(
+    # src[test_indexing.py:N]: n_tile.begin, n_tile.end, block_size=BLOCK_SIZE
+    # src[test_indexing.py:N]: ):
+    # src[test_indexing.py:N-N]: ...
+    for offset_2 in tl.range(offset_0.to(tl.int32), tile_end.to(tl.int32), _BLOCK_SIZE_2):
+        indices_2 = offset_2 + tl.arange(0, _BLOCK_SIZE_2).to(tl.int32)
+        mask_2 = indices_2 < tile_end
+        # src[test_indexing.py:N]: x_block = x_data[m_tile, n_tile_local]
+        x_block = tl.load(x_data + indices_2[None, :] * 1, mask_2[None, :], other=0)
+        # src[test_indexing.py:N]: row_max = x_block.abs().amax(dim=1)
+        v_0 = tl_math.abs(x_block)
+        _mask_to = tl.where(tl.broadcast_to(mask_2[None, :], [1, _BLOCK_SIZE_2]), v_0, tl.full([], float('-inf'), tl.float32))
+        row_max = tl.cast(tl.max(_mask_to, 1), tl.float32)
+        # src[test_indexing.py:N]: row_value = row_max.to(torch.uint8)
+        v_1 = tl.cast(row_max, tl.uint8)
+        # src[test_indexing.py:N]: out[m_tile, n_tile_local] = x_block * 2.0
+        v_2 = 2.0
+        v_3 = x_block * v_2
+        tl.store(out + indices_2[None, :] * 1, v_3, mask_2[None, :])
+        # src[test_indexing.py:N]: scale_col_idx = n_tile_local.begin // BLOCK_SIZE # scalar
+        floordiv = triton_helpers.div_floor_integer(offset_2, 32)
+        # src[test_indexing.py:N]: scales[m_tile, scale_col_idx] = row_value # row_value is block
+        tl.store(scales + floordiv * 1, tl.reshape(v_1, []), None)
+
+def kernel_with_mixed_store(x_data: torch.Tensor, BLOCK_SIZE: hl.constexpr, *, _launcher=_default_launcher):
+    # src[test_indexing.py:N]: m, n = x_data.shape
+    m, n = x_data.shape
+    # src[test_indexing.py:N]: n = hl.specialize(n)
+    n = 64
+    # src[test_indexing.py:N]: n_scale_cols = (n + BLOCK_SIZE - 1) // BLOCK_SIZE
+    n_scale_cols = (n + 32 - 1) // 32
+    # src[test_indexing.py:N]: scales = x_data.new_empty((m, n_scale_cols), dtype=torch.uint8)
+    scales = x_data.new_empty((m, n_scale_cols), dtype=torch.uint8)
+    # src[test_indexing.py:N]: out = x_data.new_empty(x_data.shape, dtype=torch.float32)
+    out = x_data.new_empty(x_data.shape, dtype=torch.float32)
+    # src[test_indexing.py:N]: for m_tile, n_tile in hl.tile([m, n], block_size=[None, n_block]):
+    _BLOCK_SIZE_0 = 32
+    # src[test_indexing.py:N]: for n_tile_local in hl.tile(
+    # src[test_indexing.py:N]: n_tile.begin, n_tile.end, block_size=BLOCK_SIZE
+    # src[test_indexing.py:N]: ):
+    # src[test_indexing.py:N-N]: ...
+    _BLOCK_SIZE_2 = 32
+    # src[test_indexing.py:N]: for m_tile, n_tile in hl.tile([m, n], block_size=[None, n_block]):
+    # src[test_indexing.py:N]: for n_tile_local in hl.tile(
+    # src[test_indexing.py:N]: n_tile.begin, n_tile.end, block_size=BLOCK_SIZE
+    # src[test_indexing.py:N-N]: ...
+    _launcher(_helion_kernel_with_mixed_store, (1 * triton.cdiv(64, _BLOCK_SIZE_0),), x_data, out, scales, _BLOCK_SIZE_0, _BLOCK_SIZE_2, num_warps=4, num_stages=1)
+    # src[test_indexing.py:N]: return out, scales
+    return (out, scales)
+
 --- assertExpectedJournal(TestIndexing.test_non_consecutive_tensor_indexers_no_broadcast)
 from __future__ import annotations
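
The key line in the journal is the last store in the device kernel: tl.store(scales + floordiv * 1, tl.reshape(v_1, []), None). A self-contained sketch of the same pattern outside Helion (hypothetical kernel and names, assumes Triton and a CUDA device; not part of the commit):

import torch
import triton
import triton.language as tl

@triton.jit
def _row_max_to_scalar(x_ptr, out_ptr, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    x = tl.load(x_ptr + offs[None, :])   # block of shape [1, BLOCK]
    row_max = tl.max(x, 1)               # reduction leaves shape [1]
    # Squeeze to rank 0 so the value matches the scalar out_ptr, as the fix does.
    tl.store(out_ptr, tl.reshape(row_max, []))

x = torch.randn(1, 16, device="cuda")
out = torch.empty((), device="cuda")
_row_max_to_scalar[(1,)](x, out, BLOCK=16)
assert out.item() == x.max().item()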

test/test_indexing.py

Lines changed: 50 additions & 0 deletions

@@ -2161,6 +2161,56 @@ def store_with_mixed_indices(
         torch.testing.assert_close(result, expected)
         self.assertExpectedJournal(code)
 
+    def test_mixed_scalar_block_store_size1_dim(self):
+        """Test store with mixed scalar/block indexing when block dimension has size 1.
+
+        This tests a bug fix where storing a block value with:
+        - One index being a tile/block (e.g., m_tile) over a size-1 dimension
+        - Another index being a scalar (e.g., computed from tile.begin)
+        would generate invalid Triton code because the pointer became scalar
+        but the value was still a block.
+        """
+
+        @helion.kernel(autotune_effort="none")
+        def kernel_with_mixed_store(
+            x_data: torch.Tensor, BLOCK_SIZE: hl.constexpr
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            m, n = x_data.shape
+            n = hl.specialize(n)
+            n_scale_cols = (n + BLOCK_SIZE - 1) // BLOCK_SIZE
+            scales = x_data.new_empty((m, n_scale_cols), dtype=torch.uint8)
+            out = x_data.new_empty(x_data.shape, dtype=torch.float32)
+
+            n_block = hl.register_block_size(BLOCK_SIZE, n)
+
+            for m_tile, n_tile in hl.tile([m, n], block_size=[None, n_block]):
+                for n_tile_local in hl.tile(
+                    n_tile.begin, n_tile.end, block_size=BLOCK_SIZE
+                ):
+                    x_block = x_data[m_tile, n_tile_local]
+
+                    # Compute one value per row in m_tile
+                    row_max = x_block.abs().amax(dim=1)
+                    row_value = row_max.to(torch.uint8)
+
+                    out[m_tile, n_tile_local] = x_block * 2.0
+
+                    # Mixed indexing: block row index + scalar column index
+                    scale_col_idx = n_tile_local.begin // BLOCK_SIZE  # scalar
+                    scales[m_tile, scale_col_idx] = row_value  # row_value is block
+
+            return out, scales
+
+        # Test with m=1 (single row - this was the failing case before the fix).
+        # The fix ensures tl.reshape is applied to squeeze the value to scalar
+        # when the pointer is scalar due to size-1 dimensions being dropped.
+        x1 = torch.randn(1, 64, device=DEVICE, dtype=torch.float32)
+        code, (out1, scales1) = code_and_output(kernel_with_mixed_store, (x1, 32))
+        expected_out1 = x1 * 2.0
+        torch.testing.assert_close(out1, expected_out1)
+        self.assertEqual(scales1.shape, (1, 2))
+        self.assertExpectedJournal(code)
+
 
 if __name__ == "__main__":
     unittest.main()
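
To reproduce just this case from a checkout (assuming pytest is installed and a device that DEVICE resolves to is available; the invocation below is standard pytest usage, not something the commit documents):

python -m pytest test/test_indexing.py -k test_mixed_scalar_block_store_size1_dim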
