[ut] add layernorm kernel unit test

Xianzhe Dong · Xianzhe Dong · commit d4c7a5d43537 · 2024-04-17T02:32:46.000-04:00
diff --git a/src/kernels/CMakeLists.txt b/src/kernels/CMakeLists.txt
@@ -1,4 +1,5 @@
 include(cc_library)
+include(cc_binary)
 
 cc_library(
   NAME 
@@ -72,6 +73,24 @@ cc_library(
     torch
 )
 
+# cc_test(
+# NAME
+# layernorm_kernels_test
+# SRCS
+# layernrom_kernels_test.cu
+# layernorm_kernels.cu
+# DEPS
+# DEFINES
+# )
+cc_binary(
+  NAME
+  layernorm_kernels_test
+  SRCS
+  layernrom_kernels_test.cu
+  layernorm_kernels.cu
+  DEPS
+  torch
+)
+
 add_subdirectory(flash_attn)
 add_subdirectory(flash_infer)
-
diff --git a/src/kernels/layernorm_kernels.cu b/src/kernels/layernorm_kernels.cu
@@ -1,8 +1,9 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/torch.h>
+
 #include "dispatch.h"
-#include "reduce_kernel_utils.cuh"
 #include "layernorm_kernels.h"
+#include "reduce_kernel_utils.cuh"
 
 namespace llm::kernel {
 
@@ -116,11 +117,11 @@ __global__ void layer_norm_kernel(T* __restrict__ out,
 // The mean and standard-deviation are calculated over the last dimension
 template <>
 __global__ void layer_norm_kernel<half2>(half2* __restrict__ out,
-                                  const half2* __restrict__ input,
-                                  const half2* __restrict__ weight,
-                                  const half2* __restrict__ bias,
-                                  const float epsilon,
-                                  int n) {
+                                         const half2* __restrict__ input,
+                                         const half2* __restrict__ weight,
+                                         const half2* __restrict__ bias,
+                                         const float epsilon,
+                                         int n) {
   const int tidx = threadIdx.x;
   const int bidx = blockIdx.x;
 
@@ -147,7 +148,6 @@ __global__ void layer_norm_kernel<half2>(half2* __restrict__ out,
   }
   variance = block_reduce_sum<half2>(variance);
   if (tidx == 0) {
-    // const half2 e = make_half2(__float2half(epsilon), __float2half(epsilon));
     s_variance = __hadd(variance.x, variance.y);
     s_variance = __hdiv(s_variance, __float2half((float)n * 2));
     s_variance = __hadd(s_variance, __float2half(epsilon));
@@ -157,16 +157,11 @@ __global__ void layer_norm_kernel<half2>(half2* __restrict__ out,
 
   for (int i = tidx; i < n; i += blockDim.x) {
     const int idx = bidx * n + i;
-    // float local_out =
-    //     (__ldg(&input[idx]) - s_mean) * s_variance * __ldg(&weight[i]);
-    // if (bias != nullptr) {
-    //   local_out += __ldg(&bias[i]);
-    // }
     half2 local_out = __ldg(&input[idx]);
     local_out = __hsub2(local_out, make_half2(s_mean, s_mean));
     local_out = __hmul2(local_out, make_half2(s_variance, s_variance));
     local_out = __hmul2(local_out, __ldg(&weight[i]));
-    if (bias != nullptr){
+    if (bias != nullptr) {
       local_out = __hadd2(local_out, __ldg(&bias[i]));
     }
     out[idx] = local_out;
@@ -199,52 +194,55 @@ void layer_norm(torch::Tensor& out,
 
+// Returns the thread-block size used by the invoke_layernorm_kernel launchers
+// for rows of n elements: n rounded up to a whole number of 32-thread warps
+// and capped at 1024 threads per block. Launching exactly n threads is an
+// invalid configuration for n > 1024 and, when n is not a multiple of 32,
+// leaves the last warp partially populated for the warp-shuffle reductions.
+// NOTE(review): assumes the kernels stride over the row with blockDim.x, as
+// the visible half2 specialization does -- confirm for the generic kernel.
+static inline int layernorm_block_size(int n) {
+  constexpr int kWarpSize = 32;
+  constexpr int kMaxThreadsPerBlock = 1024;
+  if (n >= kMaxThreadsPerBlock) {
+    return kMaxThreadsPerBlock;
+  }
+  // always launch at least one full warp, even for n <= 0
+  const int warps = n > kWarpSize ? (n + kWarpSize - 1) / kWarpSize : 1;
+  return warps * kWarpSize;
+}
+
 template <typename T>
 void invoke_layernorm_kernel(T* out,
-                                  const T* input,
-                                  const T* weight,
-                                  const T* bias,
-                                  const float epsilon,
-                                  int m,
-                                  int n) {
-  layer_norm_kernel<T><<<m, n>>>(out, input, weight, bias, epsilon, n);
+                             const T* input,
+                             const T* weight,
+                             const T* bias,
+                             const float epsilon,
+                             int m,
+                             int n) {
+  layer_norm_kernel<T><<<m, layernorm_block_size(n)>>>(
+      out, input, weight, bias, epsilon, n);
 }
 
 template <>
 void invoke_layernorm_kernel<half2>(half2* out,
-                                  const half2* input,
-                                  const half2* weight,
-                                  const half2* bias,
-                                  const float epsilon,
-                                  int m,
-                                  int n) {
-  layer_norm_kernel<half2><<<m, n>>>(out, input, weight, bias, epsilon, n);
+                                    const half2* input,
+                                    const half2* weight,
+                                    const half2* bias,
+                                    const float epsilon,
+                                    int m,
+                                    int n) {
+  layer_norm_kernel<half2><<<m, layernorm_block_size(n)>>>(
+      out, input, weight, bias, epsilon, n);
 }
 template <>
 void invoke_layernorm_kernel<float>(float* out,
-                                  const float* input,
-                                  const float* weight,
-                                  const float* bias,
-                                  const float epsilon,
-                                  int m,
-                                  int n) {
-  layer_norm_kernel<float><<<m, n>>>(out, input, weight, bias, epsilon, n);
-                                  }
-// void invoke_float_layernorm_kernel(float* out,
-//                                    const float* input,
-//                                    const float* weight,
-//                                    const float* bias,
-//                                    const float epsilon,
-//                                    int m,
-//                                    int n){
-//   layer_norm_kernel<float><<<m, n>>>(out, input, weight, bias, epsilon, n);
-//                                    }
-
-// void invoke_half2_layernorm_kernel(half2* out,
-//                                    const half2* input,
-//                                    const half2* weight,
-//                                    const half2* bias,
-//                                    const float epsilon,
-//                                    int m,
-//                                    int n){
-//   layer_norm_kernel<half2><<<m, n>>>(out, input, weight, bias, epsilon, n);
-// }
-}  // namespace llm::kernel
+                                    const float* input,
+                                    const float* weight,
+                                    const float* bias,
+                                    const float epsilon,
+                                    int m,
+                                    int n) {
+  layer_norm_kernel<float><<<m, layernorm_block_size(n)>>>(
+      out, input, weight, bias, epsilon, n);
+}
+
+}  // namespace llm::kernel
diff --git a/src/kernels/layernorm_kernels.h b/src/kernels/layernorm_kernels.h
@@ -22,20 +22,4 @@ void invoke_layernorm_kernel(T* out,
                              const float epsilon,
                              int m,
                              int n);
-
-// void invoke_float_layernorm_kernel(float* out,
-//                                    const float* input,
-//                                    const float* weight,
-//                                    const float* bias,
-//                                    const float epsilon,
-//                                    int m,
-//                                    int n);
-
-// void invoke_half2_layernorm_kernel(half2* out,
-//                                    const half2* input,
-//                                    const half2* weight,
-//                                    const half2* bias,
-//                                    const float epsilon,
-//                                    int m,
-//                                    int n);
 }  // namespace llm::kernel
diff --git a/src/kernels/layernrom_kernels_test.cu b/src/kernels/layernrom_kernels_test.cu
@@ -0,0 +1,180 @@
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+#include "layernorm_kernels.h"
+
+// Aborts the binary with file/line context when a CUDA runtime call fails,
+// so a failed allocation, copy or kernel launch cannot go unnoticed.
+#define CUDA_CHECK(call)                                             \
+  do {                                                               \
+    const cudaError_t err_ = (call);                                 \
+    if (err_ != cudaSuccess) {                                       \
+      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
+              cudaGetErrorString(err_));                             \
+      std::abort();                                                  \
+    }                                                                \
+  } while (0)
+
+// Owns a device allocation of `count` elements of T; the memory is released
+// when the buffer goes out of scope.
+template <typename T>
+class DeviceBuffer {
+ public:
+  explicit DeviceBuffer(size_t count) : bytes_(count * sizeof(T)) {
+    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&ptr_), bytes_));
+  }
+  ~DeviceBuffer() { cudaFree(ptr_); }  // best effort: never abort in a dtor
+  DeviceBuffer(const DeviceBuffer&) = delete;
+  DeviceBuffer& operator=(const DeviceBuffer&) = delete;
+
+  T* get() const { return ptr_; }
+
+  // Blocking host -> device copy of the whole buffer.
+  void upload(const T* host) {
+    CUDA_CHECK(cudaMemcpy(ptr_, host, bytes_, cudaMemcpyHostToDevice));
+  }
+
+  // Blocking device -> host copy of the whole buffer.
+  void download(T* host) const {
+    CUDA_CHECK(cudaMemcpy(host, ptr_, bytes_, cudaMemcpyDeviceToHost));
+  }
+
+ private:
+  T* ptr_ = nullptr;
+  size_t bytes_ = 0;
+};
+
+// Prints a row-major m x n host matrix, one row per line, followed by an
+// empty line. Every element is cast to float for the "%f" format.
+template <typename T>
+void printMatrix(T* a, int m, int n) {
+  for (int i = 0; i < m; i++) {
+    for (int j = 0; j < n; j++) {
+      printf("%f ", (float)a[i * n + j]);
+    }
+    puts("");
+  }
+  puts("");
+}
+
+// half2 specialization: prints both halves of each element, so a row of n
+// half2 entries shows 2 * n numbers.
+template <>
+void printMatrix<half2>(half2* a, int m, int n) {
+  for (int i = 0; i < m; i++) {
+    for (int j = 0; j < n; j++) {
+      printf(
+          "%f %f ", __half2float(a[i * n + j].x), __half2float(a[i * n + j].y));
+    }
+    puts("");
+  }
+  puts("");
+}
+
+// Runs the half2 layernorm launcher on a 2 x 2 half2 matrix with unit weight
+// and zero bias, then prints the input, weight, bias and output matrices.
+void layernorm_kernel_half2_test() {
+  const float epsilon = 1e-6f;
+  const int m = 2;
+  const int n = 2;
+  const size_t count = static_cast<size_t>(m) * n;
+
+  std::vector<half2> out(count);
+  std::vector<half2> input(count);
+  std::vector<half2> weight(count);
+  std::vector<half2> bias(count);
+
+  for (int i = 0; i < m; i++) {
+    for (int j = 0; j < n; j++) {
+      input[i * n + j] = half2(__float2half((float)(i * n + j * 2)),
+                               __float2half((float)(i * n + j * 2 + 1)));
+      weight[i * n + j] = half2(__float2half(1.0f), __float2half(1.0f));
+      bias[i * n + j] = half2(__float2half(0.0f), __float2half(0.0f));
+    }
+  }
+
+  DeviceBuffer<half2> dout(count);
+  DeviceBuffer<half2> dinput(count);
+  DeviceBuffer<half2> dweight(count);
+  DeviceBuffer<half2> dbias(count);
+  dinput.upload(input.data());
+  dweight.upload(weight.data());
+  dbias.upload(bias.data());
+
+  llm::kernel::invoke_layernorm_kernel<half2>(
+      dout.get(), dinput.get(), dweight.get(), dbias.get(), epsilon, m, n);
+  // A bad launch configuration is only reported through cudaGetLastError();
+  // faults during execution surface at the blocking copy below.
+  CUDA_CHECK(cudaGetLastError());
+
+  dout.download(out.data());
+
+  printf("---------- test half2 layernorm kernel -----------\n");
+  printf("input:\n");
+  printMatrix<half2>(input.data(), m, n);
+  printf("weights:\n");
+  printMatrix<half2>(weight.data(), m, n);
+  printf("bias:\n");
+  printMatrix<half2>(bias.data(), m, n);
+  printf("outputs:\n");
+  printMatrix<half2>(out.data(), m, n);
+}
+
+// Runs the float layernorm launcher on a 2 x 4 float matrix with unit weight
+// and zero bias, then prints the input, weight, bias and output matrices.
+void layernorm_kernel_float_test() {
+  const float epsilon = 1e-6f;
+  const int m = 2;
+  const int n = 4;
+  const size_t count = static_cast<size_t>(m) * n;
+
+  std::vector<float> out(count);
+  std::vector<float> input(count);
+  std::vector<float> weight(count);
+  std::vector<float> bias(count);
+
+  for (int i = 0; i < m; i++) {
+    for (int j = 0; j < n; j++) {
+      input[i * n + j] = (float)(i * n + j);
+      weight[i * n + j] = 1.0f;
+      bias[i * n + j] = 0.0f;
+    }
+  }
+
+  DeviceBuffer<float> dout(count);
+  DeviceBuffer<float> dinput(count);
+  DeviceBuffer<float> dweight(count);
+  DeviceBuffer<float> dbias(count);
+  dinput.upload(input.data());
+  dweight.upload(weight.data());
+  dbias.upload(bias.data());
+
+  llm::kernel::invoke_layernorm_kernel<float>(
+      dout.get(), dinput.get(), dweight.get(), dbias.get(), epsilon, m, n);
+  // A bad launch configuration is only reported through cudaGetLastError();
+  // faults during execution surface at the blocking copy below.
+  CUDA_CHECK(cudaGetLastError());
+
+  dout.download(out.data());
+
+  printf("---------- test float layernorm kernel -----------\n");
+  printf("input:\n");
+  printMatrix<float>(input.data(), m, n);
+  printf("weights:\n");
+  printMatrix<float>(weight.data(), m, n);
+  printf("bias:\n");
+  printMatrix<float>(bias.data(), m, n);
+  printf("outputs:\n");
+  printMatrix<float>(out.data(), m, n);
+}
+
+// Runs both smoke tests; any CUDA runtime failure aborts through CUDA_CHECK.
+int main() {
+  layernorm_kernel_float_test();
+  layernorm_kernel_half2_test();
+  return 0;
+}
diff --git a/src/kernels/reduce_kernel_utils.cuh b/src/kernels/reduce_kernel_utils.cuh
@@ -24,6 +24,36 @@ __inline__ __device__ T warp_reduce_sum(T val) {
   return val;
 }
 
+// Sums a half value across the lanes of one warp (32 threads) with an
+// XOR-shuffle butterfly.
+//   - val: the calling lane's contribution to the sum.
+template <>
+__inline__ __device__ half warp_reduce_sum<half>(half val) {
+  // Each pass pairs a lane with the lane whose index differs in exactly one
+  // bit (16, 8, 4, 2, 1) and adds the partner's partial sum with __hadd.
+#pragma unroll
+  for (int lane_bit = 16; lane_bit > 0; lane_bit >>= 1) {
+    const half partner = __shfl_xor_sync(FINAL_MASK, val, lane_bit, 32);
+    val = __hadd(val, partner);
+  }
+  return val;
+}
+
+// Sums a half2 value across the lanes of one warp (32 threads) with an
+// XOR-shuffle butterfly; both halves are accumulated independently.
+//   - val: the calling lane's contribution to the sum.
+template <>
+__inline__ __device__ half2 warp_reduce_sum<half2>(half2 val) {
+  // Each pass pairs a lane with the lane whose index differs in exactly one
+  // bit (16, 8, 4, 2, 1) and adds the partner's partial sum with __hadd2.
+#pragma unroll
+  for (int lane_bit = 16; lane_bit > 0; lane_bit >>= 1) {
+    const half2 partner = __shfl_xor_sync(FINAL_MASK, val, lane_bit, 32);
+    val = __hadd2(val, partner);
+  }
+  return val;
+}
+
 // performs a parallel reduction operation across the threads within a single
 // warp (32 threads).
 //   - val: The value to be reduced within a warp.
@@ -63,6 +93,35 @@ __inline__ __device__ T block_reduce_sum(T val) {
   return val;
 }
 
+// Block-wide half2 sum; warp 0 produces the total, later warps feed zeros.
+template <>
+__inline__ __device__ half2 block_reduce_sum<half2>(half2 val) {
+  // one slot per warp; a block holds at most 32 warps
+  static __shared__ half2 warp_sums[32];
+  const unsigned int lane_id = threadIdx.x & 0x1f;  // index within the warp
+  const unsigned int warp_id = threadIdx.x >> 5;    // threadIdx.x / 32
+  // number of warps in the block, counting a trailing partial warp
+  const unsigned int num_warps = (blockDim.x + 31u) >> 5;
+
+  // stage 1: every warp reduces its own values
+  val = warp_reduce_sum<half2>(val);
+  if (lane_id == 0) {
+    // lane 0 publishes the warp's partial sum
+    warp_sums[warp_id] = val;
+  }
+  // make all partial sums visible before they are read back
+  __syncthreads();
+
+  // stage 2: the first num_warps threads reload one partial sum each, every
+  // other thread contributes zero, and a second warp reduction adds them up
+  if (threadIdx.x < num_warps) {
+    val = warp_sums[lane_id];
+  } else {
+    val = make_half2(__float2half(0.0f), __float2half(0.0f));
+  }
+  return warp_reduce_sum<half2>(val);
+}
+
 /* Calculate the max of all elements in a thread block */
 template <typename T>
 __inline__ __device__ T block_reduce_max(T val) {