From ef46156a9730baa54a39c4796e5b6e4df7671e0e Mon Sep 17 00:00:00 2001
From: Supadchaya Puangpontip
Date: Wed, 17 Dec 2025 21:42:53 -0800
Subject: [PATCH] Use TORCH_CHECK_VALUE in sparse ops (#5215)

Summary:
X-link: https://github.com/facebookresearch/FBGEMM/pull/2212

Change to use [`TORCH_CHECK_VALUE`](https://fburl.com/code/xxeanfi2) for
sparse ops so that the error can be classified correctly and unnecessary
job retries are avoided.

Reviewed By: q10

Differential Revision: D88918840
---
 fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp
index 75bfe2edaa..36f5d152f5 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp
+++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp
@@ -84,7 +84,10 @@ class LookupFunctionBatchedUnaryEmbeddingOp
     auto table_offsets = *savedItr++;
     auto offsets = *savedItr++;
     auto indices = *savedItr++;
-    TORCH_CHECK(grad_outputs.size() == 1);
+    TORCH_CHECK_VALUE(
+        grad_outputs.size() == 1,
+        "Expected grad outputs size to be 1, but got ",
+        grad_outputs.size());
     // .contiguous() is called on the gradient inputs because
     // the batched_unary_embeddings_backward_cuda assumes contiguous inputs.
     // may cause illegal memory access when it is not
@@ -123,7 +126,7 @@ class IndexSelectDim0GPUOp
       const bool skip_indices_sorting_fwd) {
     TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(input, indices);
     // Expect a 1D index tensor
-    TORCH_CHECK(
+    TORCH_CHECK_VALUE(
         indices.dim() == 1, "Index tensor must be 1D, but got ", indices.dim());

     Tensor sorted_indices, orig_indices;
@@ -150,7 +153,7 @@ class IndexSelectDim0GPUOp
   static torch::autograd::variable_list backward(
       torch::autograd::AutogradContext* ctx,
       torch::autograd::variable_list grad_outputs) {
-    TORCH_CHECK(
+    TORCH_CHECK_VALUE(
         grad_outputs.size() == 1,
         "The size of grad_outputs should be 1, but got ",
         grad_outputs.size());
@@ -291,7 +294,7 @@ static torch::autograd::variable_list group_index_select_dim0_forward_impl_gpu(
     const auto& indices = indices_group[i];

     // Verify that all input tensors have the same number of dimensions
-    TORCH_CHECK(
+    TORCH_CHECK_VALUE(
         input_dim == input.dim(),
         "All inputs in group_index_select must have the same number of dimensions. Expect ",
         input_dim,
@@ -308,7 +311,7 @@ static torch::autograd::variable_list group_index_select_dim0_forward_impl_gpu(
     auto num_output_rows_ = indices.size(0);

     // Verify that all input tensors have the same shape[0]
-    TORCH_CHECK(
+    TORCH_CHECK_VALUE(
         num_output_rows == num_output_rows_,
         "The number of indices to be selected must be the same for the entire group of ",
         group_size,
@@ -409,7 +412,7 @@ static torch::autograd::variable_list group_index_select_dim0_forward_impl_gpu(
 static torch::autograd::variable_list group_index_select_dim0_backward_impl_gpu(
     at::TensorList all_inputs,
     c10::SymIntArrayRef output_shape_group_ref) {
-  TORCH_CHECK(
+  TORCH_CHECK_VALUE(
       all_inputs.size() > 2,
       "all_inputs size must be larger than 2, but got ",
       all_inputs.size());
@@ -441,7 +444,7 @@ static torch::autograd::variable_list group_index_select_dim0_backward_impl_gpu(
       saved_data.is_contiguous(), "Tensor saved_data must be contiguous.");
   int64_t* saved_data_ptr = saved_data.data_ptr<int64_t>();
   // Check that the size is the same
-  TORCH_CHECK(
+  TORCH_CHECK_VALUE(
       saved_data_ptr[0] == group_size,
       "The size of saved_data[0] must match group_size. Expect ",
       group_size,
@@ -523,7 +526,7 @@ static torch::autograd::variable_list group_index_select_dim0_backward_impl_gpu(

   // Split to output_group
   auto output_group = group_grad_input.split(grad_input_numels, 0);
-  TORCH_CHECK(
+  TORCH_CHECK_VALUE(
       output_group.size() == static_cast<size_t>(group_size),
       "output_group size must be ",
       group_size,