diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..2a46877c76
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,92 @@
+{
+ "1024": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "16384": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "512": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "800": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "8192": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 2,
+ "num_warps": 4
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..7372d5c322
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,92 @@
+{
+ "1": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": false,
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "100": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "16": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "256": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "32": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "64": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 5,
+ "num_warps": 8
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..569382ce2f
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,101 @@
+{
+ "1": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "100": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": true,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": true,
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "16": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": true,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "32": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "64": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": true,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": true,
+ "num_stages": 3,
+ "num_warps": 8
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..1456fd0b4b
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,101 @@
+{
+ "1024": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": true,
+ "num_stages": 4,
+ "num_warps": 8
+ },
+ "128": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "16384": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": true,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "32768": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "512": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": true,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "800": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": true,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "8192": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 2,
+ "num_warps": 4
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..0f5983241f
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,46 @@
+{
+ "1": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 1
+ },
+ "100": {
+ "BLOCK_SIZE": 256,
+ "num_warps": 4
+ },
+ "1024": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 8
+ },
+ "16": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 4
+ },
+ "2048": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 4
+ },
+ "32": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 8
+ },
+ "4096": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 8
+ },
+ "64": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_SIZE": 256,
+ "num_warps": 8
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..3612e98183
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,68 @@
+{
+ "1": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 2,
+ "num_warps": 8
+ },
+ "100": {
+ "BLOCK_DIM": 512,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 4
+ },
+ "1024": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 1
+ },
+ "128": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 2,
+ "num_warps": 8
+ },
+ "16": {
+ "BLOCK_DIM": 512,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 8
+ },
+ "2048": {
+ "BLOCK_DIM": 128,
+ "BLOCK_M": 2,
+ "NUM_STAGE": 1,
+ "num_warps": 1
+ },
+ "256": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 2
+ },
+ "32": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 4
+ },
+ "4096": {
+ "BLOCK_DIM": 128,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 4,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 2
+ },
+ "8": {
+ "BLOCK_DIM": 1024,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 4,
+ "num_warps": 4
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..ff46525471
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,62 @@
+{
+ "1": {
+ "BLOCK_DIM": 128,
+ "BLOCK_M": 16,
+ "NUM_STAGE": 1,
+ "num_warps": 1
+ },
+ "100": {
+ "BLOCK_DIM": 512,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 2,
+ "num_warps": 16
+ },
+ "1024": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 1
+ },
+ "128": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 1
+ },
+ "16": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 2,
+ "num_warps": 16
+ },
+ "2048": {
+ "BLOCK_DIM": 512,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 8
+ },
+ "256": {
+ "BLOCK_DIM": 512,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 2
+ },
+ "32": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 1
+ },
+ "64": {
+ "BLOCK_DIM": 512,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 8
+ },
+ "8": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 2,
+ "num_warps": 16
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..e3eb000004
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,42 @@
+{
+ "1": {
+ "num_stages": 1,
+ "num_warps": 1
+ },
+ "100": {
+ "num_stages": 2,
+ "num_warps": 1
+ },
+ "1024": {
+ "num_stages": 5,
+ "num_warps": 2
+ },
+ "128": {
+ "num_stages": 4,
+ "num_warps": 1
+ },
+ "16": {
+ "num_stages": 1,
+ "num_warps": 1
+ },
+ "2048": {
+ "num_stages": 4,
+ "num_warps": 1
+ },
+ "256": {
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "32": {
+ "num_stages": 5,
+ "num_warps": 1
+ },
+ "64": {
+ "num_stages": 5,
+ "num_warps": 1
+ },
+ "8": {
+ "num_stages": 1,
+ "num_warps": 1
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..9d20b4ea6b
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,46 @@
+{
+ "1": {
+ "num_stages": 4,
+ "num_warps": 2
+ },
+ "100": {
+ "num_stages": 1,
+ "num_warps": 1
+ },
+ "1024": {
+ "num_stages": 5,
+ "num_warps": 2
+ },
+ "128": {
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "16": {
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "2048": {
+ "num_stages": 3,
+ "num_warps": 2
+ },
+ "256": {
+ "num_stages": 2,
+ "num_warps": 2
+ },
+ "32": {
+ "num_stages": 4,
+ "num_warps": 1
+ },
+ "4096": {
+ "num_stages": 3,
+ "num_warps": 2
+ },
+ "64": {
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "8": {
+ "num_stages": 4,
+ "num_warps": 2
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..fdb476db92
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,90 @@
+{
+ "1": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 8,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "100": {
+ "BLOCK_K": 64,
+ "BLOCK_M": 32,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 8
+ },
+ "1024": {
+ "BLOCK_K": 64,
+ "BLOCK_M": 64,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 32,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "16": {
+ "BLOCK_K": 256,
+ "BLOCK_M": 8,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 8
+ },
+ "2048": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 64,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 64,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "32": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 16,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "4096": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 64,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 16,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 8,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 8
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..5f06f89508
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,90 @@
+{
+ "1": {
+ "BLOCK_K": 256,
+ "BLOCK_M": 8,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "100": {
+ "BLOCK_K": 64,
+ "BLOCK_M": 32,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 8
+ },
+ "1024": {
+ "BLOCK_K": 64,
+ "BLOCK_M": 64,
+ "BLOCK_N": 128,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 32,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "16": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 16,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "2048": {
+ "BLOCK_K": 64,
+ "BLOCK_M": 64,
+ "BLOCK_N": 128,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 64,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 8
+ },
+ "32": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 16,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "4096": {
+ "BLOCK_K": 64,
+ "BLOCK_M": 64,
+ "BLOCK_N": 128,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_K": 256,
+ "BLOCK_M": 32,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_K": 256,
+ "BLOCK_M": 8,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 8
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..d0b540f69e
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,62 @@
+{
+ "1024": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 1,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 64,
+ "NUM_STAGES": 2,
+ "num_warps": 4
+ },
+ "16384": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "2048": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 1,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 64,
+ "NUM_STAGES": 1,
+ "num_warps": 4
+ },
+ "512": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 1,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 64,
+ "NUM_STAGES": 1,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 32,
+ "NUM_STAGES": 1,
+ "num_warps": 8
+ },
+ "800": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 2,
+ "num_warps": 4
+ },
+ "8192": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..6c5307023b
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,68 @@
+{
+ "1024": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 64,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "128": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 64,
+ "NUM_STAGES": 4,
+ "num_warps": 4
+ },
+ "16384": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 128,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "2048": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "256": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "32768": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 128,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "512": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 2,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 4,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 128,
+ "NUM_STAGES": 1,
+ "num_warps": 1
+ },
+ "800": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "8192": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 1,
+ "num_warps": 1
+ }
+}
\ No newline at end of file
diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
index 833cc8f4b0..194914d455 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
@@ -5,6 +5,7 @@
from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
from transformers.feature_extraction_utils import BatchFeature
from transformers.utils import TensorType
+from functools import lru_cache
class WhisperFeatureExtractor(SequenceFeatureExtractor):
@@ -47,17 +48,24 @@ def __init__(
mel_scale="slaney",
)
+ # NOTE(review): lru_cache on an instance method makes `self` part of the cache key, so the
+ # cache keeps a reference to this extractor for as long as the entry lives (ruff B019).
+ @lru_cache(maxsize=12)
+ def get_hann_window(self, device: Union[str, torch.device]):
+ """Return a Hann window of length ``self.n_fft`` created on ``device``.
+
+ The result is memoized (at most 12 entries), so repeated calls with an equal
+ ``device`` argument hand back the same tensor object; treat it as read-only.
+ """
+ return torch.hann_window(self.n_fft, device=device)
+
+ # NOTE(review): lru_cache on an instance method makes `self` part of the cache key, so the
+ # cache keeps a reference to this extractor for as long as the entry lives (ruff B019).
+ @lru_cache(maxsize=12)
+ def get_mel_filters(self, device: Union[str, torch.device]):
+ """Return ``self.mel_filters`` as a float32 tensor on ``device``.
+
+ The NumPy array is wrapped with ``torch.from_numpy`` and moved/cast via ``.to``.
+ The result is memoized (at most 12 entries): once a device has been cached, later
+ changes to ``self.mel_filters`` are not reflected, and the returned tensor is the
+ shared cached object, so treat it as read-only.
+ """
+ return torch.from_numpy(self.mel_filters).to(device, torch.float32)
+
def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu") -> np.ndarray:
waveform = torch.from_numpy(waveform).to(device, torch.float32)
- window = torch.hann_window(self.n_fft, device=device)
+ window = self.get_hann_window(device)
if self.dither != 0.0:
waveform += self.dither * torch.randn(waveform.shape, dtype=waveform.dtype, device=waveform.device)
stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
magnitudes = stft[..., :-1].abs() ** 2
-
- mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
+ mel_filters = self.get_mel_filters(device)
mel_spec = mel_filters.T @ magnitudes
log_spec = torch.clamp(mel_spec, min=1e-10).log10()
diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py
index a1419f83ef..1b8fa0110d 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/model.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/model.py
@@ -1,6 +1,8 @@
import os
import json
import librosa
+import copy
+from functools import lru_cache
from io import BytesIO
from lightllm.common.build_utils import repair_config
from lightllm.models.registry import ModelRegistry
@@ -66,6 +68,11 @@ def get_audio_token_length(self, audio: AudioItem):
# print(f"token_num is {token_num} n_samples is {self.n_samples} hop_length is {self.hop_length}")
return token_num
+ # NOTE(review): lru_cache on an instance method keys on (self, prompt): it keeps this
+ # object and up to 128 prompt strings with their encoded results alive (ruff B019).
+ @lru_cache(maxsize=128)
+ def _encode_prompt_text(self, prompt: str):
+ """Encode ``prompt`` with ``self.tokenizer.encode`` and memoize the result.
+
+ Every call with the same prompt returns the same cached object, so it must be
+ copied before being modified (``encode`` deep-copies it right after the call).
+ """
+ origin_ids = self.tokenizer.encode(prompt)
+ return origin_ids
+
def _caclu_audio_token_num(self, input_audio_len: int):
_mel_len = input_audio_len // int(self.hop_length)
input_lengths_leave = _mel_len % 100
@@ -74,7 +81,8 @@ def _caclu_audio_token_num(self, input_audio_len: int):
return output_lengths
def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs):
- origin_ids = self.tokenizer.encode(prompt)
+ origin_ids = self._encode_prompt_text(prompt)
+ origin_ids = copy.deepcopy(origin_ids)
#
->
origin_ids = [token for token in origin_ids if token not in (self.image_token_id, self.audio_token_id)]
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index c51d5dc3bd..03c57126ff 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -2,20 +2,14 @@
import json
import math
import torch
-import rpyc
-import librosa
import numpy as np
-from io import BytesIO
from torch import Tensor, nn
from safetensors import safe_open
from torch.nn import functional as F
from typing import Callable, Optional, Union, List
-from rpyc.utils.classic import obtain
-
from transformers.activations import ACT2FN
from lightllm.server.multimodal_params import AudioItem
-from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor
@@ -341,9 +335,8 @@ def encode(self, audio_items: List[AudioItem]):
if isinstance(item, AudioItem):
uuids.append(item.uuid)
items.append(item)
- audio_data = read_shm(get_shm_name_data(item.uuid))
- audio = BytesIO(audio_data)
- audio, _ = librosa.load(audio, sr=self.processor.sampling_rate)
+ assert self.processor.sampling_rate == 16000
+ audio = item.load_audio_from_shm_payload()
else:
raise ValueError(f"cannot read audio which type is {type(item)}!")
diff --git a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
index c6d099a2d8..2bf325340f 100644
--- a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
+++ b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
@@ -4,9 +4,6 @@
import torch
-from sgl_kernel import causal_conv1d_fwd
-from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
-
def causal_conv1d_fn(
x: torch.Tensor,
@@ -51,6 +48,8 @@ def causal_conv1d_fn(
"""
if activation not in [None, "silu", "swish"]:
raise NotImplementedError("activation must be None, silu, or swish")
+ from sgl_kernel import causal_conv1d_fwd
+
if x.stride(-1) != 1:
x = x.contiguous()
bias = bias.contiguous() if bias is not None else None
@@ -103,6 +102,8 @@ def causal_conv1d_update(
"""
if activation not in [None, "silu", "swish"]:
raise NotImplementedError(f"activation must be None, silu, or swish, actual: {activation}")
+ from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+
activation_val = activation in ["silu", "swish"]
unsqueeze = x.dim() == 2
if unsqueeze:
diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index 4a048074d9..aaa29e1c71 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -1,17 +1,13 @@
import os
import json
-import rpyc
-import librosa
import numpy as np
import torch
import torch.nn.functional as F
-from io import BytesIO
from typing import List, Union
from safetensors.torch import load_file
from transformers.processing_utils import ProcessorMixin
-from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
from lightllm.server.multimodal_params import AudioItem
-from rpyc.utils.classic import obtain
+
# tokenizer_class removed
class WhisperProcessor(ProcessorMixin):
@@ -171,9 +167,7 @@ def encode(self, audio_items: List[AudioItem]):
if isinstance(item, AudioItem):
uuids.append(item.uuid)
items.append(item)
- audio_data = read_shm(get_shm_name_data(item.uuid))
- audio = BytesIO(audio_data)
- audio, _ = librosa.load(audio, sr=16000)
+ audio = item.load_audio_from_shm_payload()
else:
raise ValueError(f"cannot read audio which type is {type(item)}!")
diff --git a/lightllm/server/api_models.py b/lightllm/server/api_models.py
index 7fc2696135..bdf98cd17a 100644
--- a/lightllm/server/api_models.py
+++ b/lightllm/server/api_models.py
@@ -187,7 +187,7 @@ def apply_loaded_defaults(cls, data: Any):
class ChatCompletionRequest(BaseModel):
- model: str
+ model: str = "default"
messages: List[ChatCompletionMessageParam]
function_call: Optional[str] = "none"
temperature: Optional[float] = 1
diff --git a/lightllm/server/embed_cache/impl/naive_memory_cache.py b/lightllm/server/embed_cache/impl/naive_memory_cache.py
index 73a0e0b250..0dad890e2c 100644
--- a/lightllm/server/embed_cache/impl/naive_memory_cache.py
+++ b/lightllm/server/embed_cache/impl/naive_memory_cache.py
@@ -213,6 +213,8 @@ def alloc(self, md5sum_list: list[str], token_num_list: list[int]) -> Optional[l
"token_id": rec.token_id,
"start_index_in_embed_cache": rec.mem_block.start,
"token_num": rec.token_num,
+ "data_ready": rec.data,
+ "embed_ready": rec.embed,
}
)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 8126d76446..c9822ff618 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -124,72 +124,84 @@ def __init__(
self.latest_success_infer_time_mark.set_value(int(time.time()))
return
- async def _alloc_resource(self, items, md5sums, token_nums, datas):
-
- while True:
- records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
-
- if records is None:
- await asyncio.sleep(0.1)
- continue
-
- if isinstance(records, str) and "error" in records:
- logger.error(str(records) + "and try to set --embed_cache_storage_size bigger")
- raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger")
-
- uid_list = []
- for item, rec in zip(items, records):
- item: Union[ImageItem, AudioItem] = item
- item.uuid = rec["id"]
- item.token_id = rec["token_id"]
- item.token_num = rec["token_num"]
- item.start_index_in_embed_cache = rec["start_index_in_embed_cache"]
-
- uid_list.append(rec["id"])
-
- ready_flags = obtain(self.cache_client.root.get_items_data(uid_list))
- update_data_ids = []
-
- for uid, ready, data in zip(uid_list, ready_flags, datas):
- if not ready:
- create_shm(get_shm_name_data(uid), data)
- update_data_ids.append(uid)
+ def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str, **kwargs):
+ if self.args.detail_log:
+ cost_ms = (time.time() - start_time) * 1000.0
+ extras = " ".join(f"{k}:{v}" for k, v in kwargs.items())
+ suffix = f" {extras}" if extras else ""
+ logger.debug(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}")
+ return
- if update_data_ids:
- self.cache_client.root.set_items_data(update_data_ids)
+ async def _alloc_resource(self, items, md5sums, token_nums, datas):
+ if len(items) == 0:
return
- async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
- # 只有 P 和 NORMAL 节点需要真的管理多模态资源
- if self.pd_mode.is_P_or_NORMAL():
+ for _ in range(2000):
# 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity,从而造成死锁的问题。
# 如果不加任何锁,假如请求1和请求2都有6张图片,而cache_capacity为10,
# 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图,将会资源竞争产生死锁。
async with self._resource_lock:
- items, md5sums, tokens_nums, datas = [], [], [], []
- for img in multimodal_params.images:
- self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
- data = img.read()
- # must after init_imageitem_extral_params
- token_num = self.tokenizer.get_image_token_length(img)
- md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
- md5sums.append(md5sum)
- img.md5 = md5sum
- tokens_nums.append(token_num)
- datas.append(data)
- items.append(img)
- for audio in multimodal_params.audios:
- self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
- data = audio.read()
- token_num = self.tokenizer.get_audio_token_length(audio)
- md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(audio.extra_params)))
- md5sums.append(md5sum)
- audio.md5 = md5sum
- tokens_nums.append(token_num)
- datas.append(data)
- items.append(audio)
-
- await self._alloc_resource(items, md5sums, tokens_nums, datas)
+ records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
+ if records is not None:
+ break
+ await asyncio.sleep(0.005)
+
+ # 长时间无法申请到足够资源的时候,则开始进行阻塞式尝试,防止其他请求一起申请相关资源。
+ if records is None:
+ async with self._resource_lock:
+ while records is None:
+ records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
+ if records is not None:
+ break
+ await asyncio.sleep(0.1)
+
+ if isinstance(records, str) and "error" in records:
+ logger.error(str(records) + "and try to set --embed_cache_storage_size bigger")
+ raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger")
+
+ update_data_ids = []
+ for item, rec, data in zip(items, records, datas):
+ item: Union[ImageItem, AudioItem] = item
+ item.uuid = rec["id"]
+ item.token_id = rec["token_id"]
+ item.token_num = rec["token_num"]
+ item.start_index_in_embed_cache = rec["start_index_in_embed_cache"]
+
+ if not rec["data_ready"]:
+ create_shm(get_shm_name_data(rec["id"]), data)
+ update_data_ids.append(rec["id"])
+
+ if update_data_ids:
+ self.cache_client.root.set_items_data(update_data_ids)
+ return
+
+ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
+ # 只有 P 和 NORMAL 节点需要真的管理多模态资源
+ if self.pd_mode.is_P_or_NORMAL():
+ items, md5sums, tokens_nums, datas = [], [], [], []
+ for img in multimodal_params.images:
+ self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
+ data = img.read()
+ # must after init_imageitem_extral_params
+ token_num = self.tokenizer.get_image_token_length(img)
+ md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
+ md5sums.append(md5sum)
+ img.md5 = md5sum
+ tokens_nums.append(token_num)
+ datas.append(data)
+ items.append(img)
+ for audio in multimodal_params.audios:
+ self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
+ data = audio.read()
+ token_num = self.tokenizer.get_audio_token_length(audio)
+ md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(audio.extra_params)))
+ md5sums.append(md5sum)
+ audio.md5 = md5sum
+ tokens_nums.append(token_num)
+ datas.append(data)
+ items.append(audio)
+
+ await self._alloc_resource(items, md5sums, tokens_nums, datas)
return
async def _release_multimodal_resources(self, multimodal_params: MultimodalParams):
@@ -289,6 +301,15 @@ async def generate(
start_time = time.time()
request_headers = request.headers if request is not None else {}
group_request_id = self.alloc_req_id(sampling_params, is_health_req)
+ audio_count = len(multimodal_params.audios) if multimodal_params is not None else 0
+ image_count = len(multimodal_params.images) if multimodal_params is not None else 0
+ self._log_stage_timing(
+ group_request_id,
+ start_time,
+ "received",
+ audio_count=audio_count,
+ image_count=image_count,
+ )
try:
original_multimodal_params = None
@@ -297,11 +318,21 @@ async def generate(
if self.pd_mode.is_P_or_NORMAL():
await multimodal_params.verify_and_preload(request)
+ self._log_stage_timing(
+ group_request_id,
+ start_time,
+ "verify_and_preload_done",
+ )
# 记录请求到达的相关信息
await self._log_req_header(request_headers, group_request_id)
# encode
prompt_ids = await self._encode(prompt, multimodal_params, sampling_params)
+ self._log_stage_timing(
+ group_request_id,
+ start_time,
+ "encode_done",
+ )
prompt_tokens = len(prompt_ids)
# 监控
@@ -310,6 +341,11 @@ async def generate(
self.metric_client.histogram_observe("lightllm_request_input_length", prompt_tokens)
self.metric_client.histogram_observe("lightllm_request_max_new_tokens", sampling_params.max_new_tokens)
prompt_ids = await self._check_and_repair_length(prompt_ids, sampling_params)
+ self._log_stage_timing(
+ group_request_id,
+ start_time,
+ "check_and_repair_length_done",
+ )
if nixl_pd_upload_websocket is not None and not is_health_req and self.pd_mode.is_NP():
# 在 nixl pd 模式下的 p 节点, 为了更好的兼容多模态的推理流程,np 节点需要先上报其 encode 好的 prompt ids 信息,然后
@@ -357,6 +393,11 @@ async def generate(
chunked_prefill_size=self.args.chunked_prefill_size,
)
req_objs.append(req_obj)
+ self._log_stage_timing(
+ group_request_id,
+ start_time,
+ "shm_req_init_done",
+ )
logger.debug(
f"alloc shm_req for req_id {group_request_id}, "
@@ -370,6 +411,11 @@ async def generate(
await self.transfer_to_next_module_or_node(
prompt, sampling_params, original_multimodal_params, req_status.group_req_objs
)
+ self._log_stage_timing(
+ group_request_id,
+ start_time,
+ "request_forwarded",
+ )
results_generator = self._wait_to_token_package(
start_time,
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 05dd479411..6210628751 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -1,14 +1,18 @@
"""Multimodal parameters for text generation."""
+import asyncio
import os
import librosa
import base64
+import numpy as np
from typing import List
from io import BytesIO
from PIL import Image
from fastapi import Request
+from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
from lightllm.utils.multimodal_utils import fetch_resource
from lightllm.utils.log_utils import init_logger
+
logger = init_logger(__name__)
@@ -44,11 +48,19 @@ async def preload(self, request: Request):
raise ValueError(f"cannot read audio which type is {self._type}!")
# check if valid audio bytes
- audio_values, _ = librosa.load(BytesIO(audio_data), sr=16000)
+ audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=16000)
+ audio_values = np.asarray(audio_values, dtype=np.float32)
+
from lightllm.models.whisper.defaults import MIN_AUDIO_LEN
- self.audio_length = max(audio_values.shape[0], MIN_AUDIO_LEN) # 如果音频过短,会被pad到480的长度
- self._preload_data = audio_data
+ if audio_values.shape[0] < MIN_AUDIO_LEN:
+ audio_values = np.pad(
+ audio_values, (0, MIN_AUDIO_LEN - audio_values.shape[0]), mode="constant", constant_values=0.0
+ )
+ logger.warning(f"audio length is too short, pad to {MIN_AUDIO_LEN}")
+
+ self.audio_length = int(audio_values.shape[0])
+ self._preload_data = audio_values.tobytes()
return
except Exception as e:
@@ -79,6 +91,14 @@ def to_origin_dict(self):
ret["data"] = self._data
return ret
+ def load_audio_from_shm_payload(self) -> np.ndarray:
+ audio_data = read_shm(get_shm_name_data(self.uuid))
+ audio_array = np.frombuffer(audio_data, dtype=np.float32)
+ if audio_array.shape[0] != self.audio_length:
+ logger.error(f"audio length is not match, {audio_array.shape[0]} != {self.audio_length}")
+ assert audio_array.shape[0] == self.audio_length
+ return audio_array
+
class ImageItem:
def __init__(self, **kwargs):
@@ -170,10 +190,11 @@ def __init__(
return
async def verify_and_preload(self, request: Request):
- for image in self.images:
- await image.preload(request)
- for audio in self.audios:
- await audio.preload(request)
+ tasks = [image.preload(request) for image in self.images]
+ tasks += [audio.preload(request) for audio in self.audios]
+
+ if tasks:
+ await asyncio.gather(*tasks)
return
def to_dict(self):
diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py
index 0d2705fab2..f5e0b8df9a 100644
--- a/lightllm/server/router/manager.py
+++ b/lightllm/server/router/manager.py
@@ -316,8 +316,7 @@ async def _add_batch(self, batch: Batch):
# 添加新请求
reqs = [r.to_router_rpc_obj() for r in batch.reqs]
while not self.shm_reqs_io_buffer.is_empty():
- await asyncio.sleep(0.02)
-
+ await asyncio.sleep(0.001)
self.shm_reqs_io_buffer.write_obj(reqs)
self.shm_reqs_io_buffer.set_ready()
logger.debug(f"Prefill Batch: {batch.simple_log()} \n")
@@ -326,8 +325,7 @@ async def _add_batch(self, batch: Batch):
async def _aborted_reqs(self, aborted_reqs: List[Req]):
cmds = [AbortedReqCmd(req_id=r.request_id) for r in aborted_reqs]
while not self.shm_reqs_io_buffer.is_empty():
- await asyncio.sleep(0.02)
-
+ await asyncio.sleep(0.001)
self.shm_reqs_io_buffer.write_obj(cmds)
self.shm_reqs_io_buffer.set_ready()
return
@@ -335,8 +333,7 @@ async def _aborted_reqs(self, aborted_reqs: List[Req]):
async def _stop_str_matched_reqs(self, stop_str_matched_reqs: List[Req]):
cmds = [StopStrMatchedReqCmd(req_id=r.request_id) for r in stop_str_matched_reqs]
while not self.shm_reqs_io_buffer.is_empty():
- await asyncio.sleep(0.02)
-
+ await asyncio.sleep(0.001)
self.shm_reqs_io_buffer.write_obj(cmds)
self.shm_reqs_io_buffer.set_ready()
return
diff --git a/lightllm/utils/multimodal_utils.py b/lightllm/utils/multimodal_utils.py
index cca01e126c..4b49ea8891 100644
--- a/lightllm/utils/multimodal_utils.py
+++ b/lightllm/utils/multimodal_utils.py
@@ -5,6 +5,7 @@
from PIL import Image
from io import BytesIO
from fastapi import Request
+from functools import lru_cache
from lightllm.utils.log_utils import init_logger
logger = init_logger(__name__)
@@ -35,23 +36,30 @@ def image2base64(img_str: str):
return base64.b64encode(buffer.getvalue()).decode("utf-8")
+@lru_cache(maxsize=256)
+def _get_xhttp_client(proxy=None):
+ kvargs = _httpx_async_client_proxy_kwargs(proxy)
+ kvargs["limits"] = httpx.Limits(max_connections=10000, max_keepalive_connections=20)
+ return httpx.AsyncClient(**kvargs)
+
+
async def fetch_resource(url, request: Request, timeout, proxy=None):
logger.info(f"Begin to download resource from url: {url}")
start_time = time.time()
- async with httpx.AsyncClient(**_httpx_async_client_proxy_kwargs(proxy)) as client:
- async with client.stream("GET", url, timeout=timeout) as response:
- response.raise_for_status()
- ans_bytes = []
- async for chunk in response.aiter_bytes(chunk_size=1024 * 1024):
- if request is not None and await request.is_disconnected():
- await response.aclose()
- raise Exception("Request disconnected. User cancelled download.")
- ans_bytes.append(chunk)
- # 接收的数据不能大于128M
- if len(ans_bytes) > 128:
- raise Exception(f"url {url} recv data is too big")
-
- content = b"".join(ans_bytes)
+ client = _get_xhttp_client(proxy)
+ async with client.stream("GET", url, timeout=timeout) as response:
+ response.raise_for_status()
+ ans_bytes = []
+ async for chunk in response.aiter_bytes(chunk_size=1024 * 1024):
+ if request is not None and await request.is_disconnected():
+ await response.aclose()
+ raise Exception("Request disconnected. User cancelled download.")
+ ans_bytes.append(chunk)
+ # 接收的数据不能大于128M
+ if len(ans_bytes) > 128:
+ raise Exception(f"url {url} recv data is too big")
+
+ content = b"".join(ans_bytes)
end_time = time.time()
cost_time = end_time - start_time
logger.info(f"Download url {url} resource cost time: {cost_time} seconds")