diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..2a46877c76
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,92 @@
+{
+  "1024": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "16384": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "512": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "800": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "8192": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..7372d5c322
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,92 @@
+{
+  "1": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "100": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 16,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "16": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "256": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "32": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "64": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 5,
+    "num_warps": 8
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..569382ce2f
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,101 @@
+{
+  "1": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "100": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": true,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": true,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "16": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": true,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "32": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "64": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": true,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": true,
+    "num_stages": 3,
+    "num_warps": 8
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..1456fd0b4b
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,101 @@
+{
+  "1024": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": true,
+    "num_stages": 4,
+    "num_warps": 8
+  },
+  "128": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "16384": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": true,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "32768": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "512": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": true,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "800": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": true,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "8192": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..0f5983241f
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,46 @@
+{
+  "1": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 1
+  },
+  "100": {
+    "BLOCK_SIZE": 256,
+    "num_warps": 4
+  },
+  "1024": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 8
+  },
+  "16": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "2048": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "32": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 8
+  },
+  "4096": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 8
+  },
+  "64": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_SIZE": 256,
+    "num_warps": 8
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..3612e98183
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,68 @@
+{
+  "1": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 2,
+    "num_warps": 8
+  },
+  "100": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 4
+  },
+  "1024": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 1
+  },
+  "128": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 2,
+    "num_warps": 8
+  },
+  "16": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 8
+  },
+  "2048": {
+    "BLOCK_DIM": 128,
+    "BLOCK_M": 2,
+    "NUM_STAGE": 1,
+    "num_warps": 1
+  },
+  "256": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 2
+  },
+  "32": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 4
+  },
+  "4096": {
+    "BLOCK_DIM": 128,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 4,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 2
+  },
+  "8": {
+    "BLOCK_DIM": 1024,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 4,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..ff46525471
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,62 @@
+{
+  "1": {
+    "BLOCK_DIM": 128,
+    "BLOCK_M": 16,
+    "NUM_STAGE": 1,
+    "num_warps": 1
+  },
+  "100": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 2,
+    "num_warps": 16
+  },
+  "1024": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 1
+  },
+  "128": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 1
+  },
+  "16": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 2,
+    "num_warps": 16
+  },
+  "2048": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 8
+  },
+  "256": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 2
+  },
+  "32": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 1
+  },
+  "64": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 8
+  },
+  "8": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 2,
+    "num_warps": 16
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..e3eb000004
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,42 @@
+{
+  "1": {
+    "num_stages": 1,
+    "num_warps": 1
+  },
+  "100": {
+    "num_stages": 2,
+    "num_warps": 1
+  },
+  "1024": {
+    "num_stages": 5,
+    "num_warps": 2
+  },
+  "128": {
+    "num_stages": 4,
+    "num_warps": 1
+  },
+  "16": {
+    "num_stages": 1,
+    "num_warps": 1
+  },
+  "2048": {
+    "num_stages": 4,
+    "num_warps": 1
+  },
+  "256": {
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "32": {
+    "num_stages": 5,
+    "num_warps": 1
+  },
+  "64": {
+    "num_stages": 5,
+    "num_warps": 1
+  },
+  "8": {
+    "num_stages": 1,
+    "num_warps": 1
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..9d20b4ea6b
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,46 @@
+{
+  "1": {
+    "num_stages": 4,
+    "num_warps": 2
+  },
+  "100": {
+    "num_stages": 1,
+    "num_warps": 1
+  },
+  "1024": {
+    "num_stages": 5,
+    "num_warps": 2
+  },
+  "128": {
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "16": {
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "2048": {
+    "num_stages": 3,
+    "num_warps": 2
+  },
+  "256": {
+    "num_stages": 2,
+    "num_warps": 2
+  },
+  "32": {
+    "num_stages": 4,
+    "num_warps": 1
+  },
+  "4096": {
+    "num_stages": 3,
+    "num_warps": 2
+  },
+  "64": {
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "8": {
+    "num_stages": 4,
+    "num_warps": 2
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..fdb476db92
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,90 @@
+{
+  "1": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "100": {
+    "BLOCK_K": 64,
+    "BLOCK_M": 32,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 8
+  },
+  "1024": {
+    "BLOCK_K": 64,
+    "BLOCK_M": 64,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 32,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "16": {
+    "BLOCK_K": 256,
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 8
+  },
+  "2048": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 64,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 64,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "32": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 16,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "4096": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 64,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 16,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 8
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..5f06f89508
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,90 @@
+{
+  "1": {
+    "BLOCK_K": 256,
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "100": {
+    "BLOCK_K": 64,
+    "BLOCK_M": 32,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 8
+  },
+  "1024": {
+    "BLOCK_K": 64,
+    "BLOCK_M": 64,
+    "BLOCK_N": 128,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 32,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "16": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 16,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "2048": {
+    "BLOCK_K": 64,
+    "BLOCK_M": 64,
+    "BLOCK_N": 128,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 64,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 8
+  },
+  "32": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 16,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "4096": {
+    "BLOCK_K": 64,
+    "BLOCK_M": 64,
+    "BLOCK_N": 128,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_K": 256,
+    "BLOCK_M": 32,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_K": 256,
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 8
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..d0b540f69e
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,62 @@
+{
+  "1024": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 2,
+    "num_warps": 4
+  },
+  "16384": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "2048": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "512": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 32,
+    "NUM_STAGES": 1,
+    "num_warps": 8
+  },
+  "800": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 2,
+    "num_warps": 4
+  },
+  "8192": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..6c5307023b
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,68 @@
+{
+  "1024": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "128": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 4,
+    "num_warps": 4
+  },
+  "16384": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "2048": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "256": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "32768": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "512": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 2,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 4,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 1,
+    "num_warps": 1
+  },
+  "800": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "8192": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 1,
+    "num_warps": 1
+  }
+}
\ No newline at end of file
diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
index 833cc8f4b0..194914d455 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
@@ -5,6 +5,7 @@
 from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.utils import TensorType
+from functools import lru_cache
 
 
 class WhisperFeatureExtractor(SequenceFeatureExtractor):
@@ -47,17 +48,24 @@ def __init__(
             mel_scale="slaney",
         )
 
+    @lru_cache(maxsize=12)
+    def get_hann_window(self, device: Union[str, torch.device]):
+        return torch.hann_window(self.n_fft, device=device)
+
+    @lru_cache(maxsize=12)
+    def get_mel_filters(self, device: Union[str, torch.device]):
+        return torch.from_numpy(self.mel_filters).to(device, torch.float32)
+
     def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu") -> np.ndarray:
         waveform = torch.from_numpy(waveform).to(device, torch.float32)
-        window = torch.hann_window(self.n_fft, device=device)
+        window = self.get_hann_window(device)
 
         if self.dither != 0.0:
             waveform += self.dither * torch.randn(waveform.shape, dtype=waveform.dtype, device=waveform.device)
 
         stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
         magnitudes = stft[..., :-1].abs() ** 2
-
-        mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
+        mel_filters = self.get_mel_filters(device)
         mel_spec = mel_filters.T @ magnitudes
 
         log_spec = torch.clamp(mel_spec, min=1e-10).log10()
diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py
index a1419f83ef..1b8fa0110d 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/model.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/model.py
@@ -1,6 +1,7 @@
 import os
 import json
 import librosa
+from functools import lru_cache
 from io import BytesIO
 from lightllm.common.build_utils import repair_config
 from lightllm.models.registry import ModelRegistry
@@ -66,6 +67,11 @@ def get_audio_token_length(self, audio: AudioItem):
         # print(f"token_num is {token_num}  n_samples is {self.n_samples} hop_length is {self.hop_length}")
         return token_num
 
+    @lru_cache(maxsize=128)
+    def _encode_prompt_text(self, prompt: str):
+        # Cached entries are shared between requests, so hand out an immutable tuple.
+        return tuple(self.tokenizer.encode(prompt))
+
     def _caclu_audio_token_num(self, input_audio_len: int):
         _mel_len = input_audio_len // int(self.hop_length)
         input_lengths_leave = _mel_len % 100
@@ -74,7 +80,7 @@ def _caclu_audio_token_num(self, input_audio_len: int):
         return output_lengths
 
     def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs):
-        origin_ids = self.tokenizer.encode(prompt)
+        origin_ids = self._encode_prompt_text(prompt)
 
         # <img><image_pad></img> -> <img></img>
         origin_ids = [token for token in origin_ids if token not in (self.image_token_id, self.audio_token_id)]
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index c51d5dc3bd..03c57126ff 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -2,20 +2,14 @@
 import json
 import math
 import torch
-import rpyc
-import librosa
 import numpy as np
-from io import BytesIO
 from torch import Tensor, nn
 from safetensors import safe_open
 from torch.nn import functional as F
 from typing import Callable, Optional, Union, List
-from rpyc.utils.classic import obtain
-
 from transformers.activations import ACT2FN
 
 from lightllm.server.multimodal_params import AudioItem
-from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
 from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor
@@ -341,9 +335,8 @@ def encode(self, audio_items: List[AudioItem]):
             if isinstance(item, AudioItem):
                 uuids.append(item.uuid)
                 items.append(item)
-                audio_data = read_shm(get_shm_name_data(item.uuid))
-                audio = BytesIO(audio_data)
-                audio, _ = librosa.load(audio, sr=self.processor.sampling_rate)
+                assert self.processor.sampling_rate == 16000
+                audio = item.load_audio_from_shm_payload()
             else:
                 raise ValueError(f"cannot read audio which type is {type(item)}!")
 
diff --git a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
index c6d099a2d8..2bf325340f 100644
--- a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
+++ b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
@@ -4,9 +4,6 @@
 
 import torch
 
-from sgl_kernel import causal_conv1d_fwd
-from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
-
 
 def causal_conv1d_fn(
     x: torch.Tensor,
@@ -51,6 +48,8 @@ def causal_conv1d_fn(
     """
     if activation not in [None, "silu", "swish"]:
         raise NotImplementedError("activation must be None, silu, or swish")
+    from sgl_kernel import causal_conv1d_fwd
+
     if x.stride(-1) != 1:
         x = x.contiguous()
     bias = bias.contiguous() if bias is not None else None
@@ -103,6 +102,8 @@ def causal_conv1d_update(
     """
     if activation not in [None, "silu", "swish"]:
         raise NotImplementedError(f"activation must be None, silu, or swish, actual: {activation}")
+    from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+
     activation_val = activation in ["silu", "swish"]
     unsqueeze = x.dim() == 2
     if unsqueeze:
diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index 4a048074d9..aaa29e1c71 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -1,17 +1,13 @@
 import os
 import json
-import rpyc
-import librosa
 import numpy as np
 import torch
 import torch.nn.functional as F
-from io import BytesIO
 from typing import List, Union
 from safetensors.torch import load_file
 from transformers.processing_utils import ProcessorMixin
-from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
 from lightllm.server.multimodal_params import AudioItem
-from rpyc.utils.classic import obtain
+
 
 # tokenizer_class removed
 class WhisperProcessor(ProcessorMixin):
@@ -171,9 +167,7 @@ def encode(self, audio_items: List[AudioItem]):
             if isinstance(item, AudioItem):
                 uuids.append(item.uuid)
                 items.append(item)
-                audio_data = read_shm(get_shm_name_data(item.uuid))
-                audio = BytesIO(audio_data)
-                audio, _ = librosa.load(audio, sr=16000)
+                audio = item.load_audio_from_shm_payload()
             else:
                 raise ValueError(f"cannot read audio which type is {type(item)}!")
 
diff --git a/lightllm/server/api_models.py b/lightllm/server/api_models.py
index 7fc2696135..bdf98cd17a 100644
--- a/lightllm/server/api_models.py
+++ b/lightllm/server/api_models.py
@@ -187,7 +187,7 @@ def apply_loaded_defaults(cls, data: Any):
 
 
 class ChatCompletionRequest(BaseModel):
-    model: str
+    model: str = "default"
     messages: List[ChatCompletionMessageParam]
     function_call: Optional[str] = "none"
     temperature: Optional[float] = 1
diff --git a/lightllm/server/embed_cache/impl/naive_memory_cache.py b/lightllm/server/embed_cache/impl/naive_memory_cache.py
index 73a0e0b250..0dad890e2c 100644
--- a/lightllm/server/embed_cache/impl/naive_memory_cache.py
+++ b/lightllm/server/embed_cache/impl/naive_memory_cache.py
@@ -213,6 +213,8 @@ def alloc(self, md5sum_list: list[str], token_num_list: list[int]) -> Optional[l
                             "token_id": rec.token_id,
                             "start_index_in_embed_cache": rec.mem_block.start,
                             "token_num": rec.token_num,
+                            "data_ready": rec.data,
+                            "embed_ready": rec.embed,
                         }
                     )
 
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 8126d76446..c9822ff618 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -124,72 +124,84 @@ def __init__(
         self.latest_success_infer_time_mark.set_value(int(time.time()))
         return
 
-    async def _alloc_resource(self, items, md5sums, token_nums, datas):
-
-        while True:
-            records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
-
-            if records is None:
-                await asyncio.sleep(0.1)
-                continue
-
-            if isinstance(records, str) and "error" in records:
-                logger.error(str(records) + "and try to set --embed_cache_storage_size bigger")
-                raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger")
-
-            uid_list = []
-            for item, rec in zip(items, records):
-                item: Union[ImageItem, AudioItem] = item
-                item.uuid = rec["id"]
-                item.token_id = rec["token_id"]
-                item.token_num = rec["token_num"]
-                item.start_index_in_embed_cache = rec["start_index_in_embed_cache"]
-
-                uid_list.append(rec["id"])
-
-            ready_flags = obtain(self.cache_client.root.get_items_data(uid_list))
-            update_data_ids = []
-
-            for uid, ready, data in zip(uid_list, ready_flags, datas):
-                if not ready:
-                    create_shm(get_shm_name_data(uid), data)
-                    update_data_ids.append(uid)
+    def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str, **kwargs):
+        """Log elapsed ms since start_time for `stage` at debug level; does nothing unless args.detail_log."""
+        if not self.args.detail_log:
+            return
+        elapsed_ms = (time.time() - start_time) * 1000.0
+        tail = "".join(f" {key}:{val}" for key, val in kwargs.items())
+        logger.debug(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{elapsed_ms:.3f}{tail}")
 
-            if update_data_ids:
-                self.cache_client.root.set_items_data(update_data_ids)
+    async def _alloc_resource(self, items, md5sums, token_nums, datas):
+        if len(items) == 0:
             return
 
-    async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
-        # 只有 P 和 NORMAL 节点需要真的管理多模态资源
-        if self.pd_mode.is_P_or_NORMAL():
+        for _ in range(2000):
             # 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity，从而造成死锁的问题。
             # 如果不加任何锁，假如请求1和请求2都有6张图片，而cache_capacity为10，
             # 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图，将会资源竞争产生死锁。
             async with self._resource_lock:
-                items, md5sums, tokens_nums, datas = [], [], [], []
-                for img in multimodal_params.images:
-                    self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
-                    data = img.read()
-                    # must after init_imageitem_extral_params
-                    token_num = self.tokenizer.get_image_token_length(img)
-                    md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
-                    md5sums.append(md5sum)
-                    img.md5 = md5sum
-                    tokens_nums.append(token_num)
-                    datas.append(data)
-                    items.append(img)
-                for audio in multimodal_params.audios:
-                    self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
-                    data = audio.read()
-                    token_num = self.tokenizer.get_audio_token_length(audio)
-                    md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(audio.extra_params)))
-                    md5sums.append(md5sum)
-                    audio.md5 = md5sum
-                    tokens_nums.append(token_num)
-                    datas.append(data)
-                    items.append(audio)
-
-                await self._alloc_resource(items, md5sums, tokens_nums, datas)
+                records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
+                if records is not None:
+                    break
+                await asyncio.sleep(0.005)
+
+        # If allocation keeps failing, retry while holding the lock so other requests cannot compete for resources.
+        if records is None:
+            async with self._resource_lock:
+                while records is None:
+                    records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
+                    if records is not None:
+                        break
+                    await asyncio.sleep(0.1)
+
+        if isinstance(records, str) and "error" in records:
+            logger.error(str(records) + "and try to set --embed_cache_storage_size bigger")
+            raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger")
+
+        update_data_ids = []
+        for item, rec, data in zip(items, records, datas):
+            item: Union[ImageItem, AudioItem] = item
+            item.uuid = rec["id"]
+            item.token_id = rec["token_id"]
+            item.token_num = rec["token_num"]
+            item.start_index_in_embed_cache = rec["start_index_in_embed_cache"]
+
+            if not rec["data_ready"]:
+                create_shm(get_shm_name_data(rec["id"]), data)
+                update_data_ids.append(rec["id"])
+
+        if update_data_ids:
+            self.cache_client.root.set_items_data(update_data_ids)
+        return
+
+    async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
+        # 只有 P 和 NORMAL 节点需要真的管理多模态资源
+        if self.pd_mode.is_P_or_NORMAL():
+            items, md5sums, tokens_nums, datas = [], [], [], []
+            for img in multimodal_params.images:
+                self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
+                data = img.read()
+                # must after init_imageitem_extral_params
+                token_num = self.tokenizer.get_image_token_length(img)
+                md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
+                md5sums.append(md5sum)
+                img.md5 = md5sum
+                tokens_nums.append(token_num)
+                datas.append(data)
+                items.append(img)
+            for audio in multimodal_params.audios:
+                self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
+                data = audio.read()
+                token_num = self.tokenizer.get_audio_token_length(audio)
+                md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(audio.extra_params)))
+                md5sums.append(md5sum)
+                audio.md5 = md5sum
+                tokens_nums.append(token_num)
+                datas.append(data)
+                items.append(audio)
+
+            await self._alloc_resource(items, md5sums, tokens_nums, datas)
         return
 
     async def _release_multimodal_resources(self, multimodal_params: MultimodalParams):
@@ -289,6 +301,15 @@ async def generate(
         start_time = time.time()
         request_headers = request.headers if request is not None else {}
         group_request_id = self.alloc_req_id(sampling_params, is_health_req)
+        audio_count = len(multimodal_params.audios) if multimodal_params is not None else 0
+        image_count = len(multimodal_params.images) if multimodal_params is not None else 0
+        self._log_stage_timing(
+            group_request_id,
+            start_time,
+            "received",
+            audio_count=audio_count,
+            image_count=image_count,
+        )
 
         try:
             original_multimodal_params = None
@@ -297,11 +318,21 @@ async def generate(
 
             if self.pd_mode.is_P_or_NORMAL():
                 await multimodal_params.verify_and_preload(request)
+                self._log_stage_timing(
+                    group_request_id,
+                    start_time,
+                    "verify_and_preload_done",
+                )
 
             # 记录请求到达的相关信息
             await self._log_req_header(request_headers, group_request_id)
             # encode
             prompt_ids = await self._encode(prompt, multimodal_params, sampling_params)
+            self._log_stage_timing(
+                group_request_id,
+                start_time,
+                "encode_done",
+            )
 
             prompt_tokens = len(prompt_ids)
             # 监控
@@ -310,6 +341,11 @@ async def generate(
                 self.metric_client.histogram_observe("lightllm_request_input_length", prompt_tokens)
                 self.metric_client.histogram_observe("lightllm_request_max_new_tokens", sampling_params.max_new_tokens)
             prompt_ids = await self._check_and_repair_length(prompt_ids, sampling_params)
+            self._log_stage_timing(
+                group_request_id,
+                start_time,
+                "check_and_repair_length_done",
+            )
 
             if nixl_pd_upload_websocket is not None and not is_health_req and self.pd_mode.is_NP():
                 # 在 nixl pd 模式下的 p 节点， 为了更好的兼容多模态的推理流程，np 节点需要先上报其 encode 好的 prompt ids 信息，然后
@@ -357,6 +393,11 @@ async def generate(
                     chunked_prefill_size=self.args.chunked_prefill_size,
                 )
                 req_objs.append(req_obj)
+            self._log_stage_timing(
+                group_request_id,
+                start_time,
+                "shm_req_init_done",
+            )
 
             logger.debug(
                 f"alloc shm_req for req_id {group_request_id}, "
@@ -370,6 +411,11 @@ async def generate(
             await self.transfer_to_next_module_or_node(
                 prompt, sampling_params, original_multimodal_params, req_status.group_req_objs
             )
+            self._log_stage_timing(
+                group_request_id,
+                start_time,
+                "request_forwarded",
+            )
 
             results_generator = self._wait_to_token_package(
                 start_time,
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 05dd479411..6210628751 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -1,14 +1,18 @@
 """Multimodal parameters for text generation."""
+import asyncio
 import os
 import librosa
 import base64
+import numpy as np
 from typing import List
 from io import BytesIO
 from PIL import Image
 from fastapi import Request
+from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
 from lightllm.utils.multimodal_utils import fetch_resource
 from lightllm.utils.log_utils import init_logger
 
+
 logger = init_logger(__name__)
 
 
@@ -44,11 +48,19 @@ async def preload(self, request: Request):
                 raise ValueError(f"cannot read audio which type is {self._type}!")
 
             # check if valid audio bytes
-            audio_values, _ = librosa.load(BytesIO(audio_data), sr=16000)
+            audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=16000)
+            audio_values = np.asarray(audio_values, dtype=np.float32)
+
             from lightllm.models.whisper.defaults import MIN_AUDIO_LEN
 
-            self.audio_length = max(audio_values.shape[0], MIN_AUDIO_LEN)  # 如果音频过短，会被pad到480的长度
-            self._preload_data = audio_data
+            if audio_values.shape[0] < MIN_AUDIO_LEN:
+                audio_values = np.pad(
+                    audio_values, (0, MIN_AUDIO_LEN - audio_values.shape[0]), mode="constant", constant_values=0.0
+                )
+                logger.warning(f"audio length is too short, pad to {MIN_AUDIO_LEN}")
+
+            self.audio_length = int(audio_values.shape[0])
+            self._preload_data = audio_values.tobytes()
             return
 
         except Exception as e:
@@ -79,6 +91,14 @@ def to_origin_dict(self):
         ret["data"] = self._data
         return ret
 
+    def load_audio_from_shm_payload(self) -> np.ndarray:
+        """Return this item's shm payload viewed as float32 samples; raise if the count != audio_length."""
+        audio_array = np.frombuffer(read_shm(get_shm_name_data(self.uuid)), dtype=np.float32)
+        if audio_array.shape[0] != self.audio_length:
+            logger.error(f"audio length is not match, {audio_array.shape[0]} != {self.audio_length}")
+            raise AssertionError(f"audio length is not match, {audio_array.shape[0]} != {self.audio_length}")
+        return audio_array
+
 
 class ImageItem:
     def __init__(self, **kwargs):
@@ -170,10 +190,11 @@ def __init__(
         return
 
     async def verify_and_preload(self, request: Request):
-        for image in self.images:
-            await image.preload(request)
-        for audio in self.audios:
-            await audio.preload(request)
+        tasks = [image.preload(request) for image in self.images]
+        tasks += [audio.preload(request) for audio in self.audios]
+
+        if tasks:
+            await asyncio.gather(*tasks)
         return
 
     def to_dict(self):
diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py
index 0d2705fab2..f5e0b8df9a 100644
--- a/lightllm/server/router/manager.py
+++ b/lightllm/server/router/manager.py
@@ -316,8 +316,7 @@ async def _add_batch(self, batch: Batch):
         # 添加新请求
         reqs = [r.to_router_rpc_obj() for r in batch.reqs]
         while not self.shm_reqs_io_buffer.is_empty():
-            await asyncio.sleep(0.02)
-
+            await asyncio.sleep(0.001)
         self.shm_reqs_io_buffer.write_obj(reqs)
         self.shm_reqs_io_buffer.set_ready()
         logger.debug(f"Prefill Batch: {batch.simple_log()} \n")
@@ -326,8 +325,7 @@ async def _add_batch(self, batch: Batch):
     async def _aborted_reqs(self, aborted_reqs: List[Req]):
         cmds = [AbortedReqCmd(req_id=r.request_id) for r in aborted_reqs]
         while not self.shm_reqs_io_buffer.is_empty():
-            await asyncio.sleep(0.02)
-
+            await asyncio.sleep(0.001)
         self.shm_reqs_io_buffer.write_obj(cmds)
         self.shm_reqs_io_buffer.set_ready()
         return
@@ -335,8 +333,7 @@ async def _aborted_reqs(self, aborted_reqs: List[Req]):
     async def _stop_str_matched_reqs(self, stop_str_matched_reqs: List[Req]):
         cmds = [StopStrMatchedReqCmd(req_id=r.request_id) for r in stop_str_matched_reqs]
         while not self.shm_reqs_io_buffer.is_empty():
-            await asyncio.sleep(0.02)
-
+            await asyncio.sleep(0.001)
         self.shm_reqs_io_buffer.write_obj(cmds)
         self.shm_reqs_io_buffer.set_ready()
         return
diff --git a/lightllm/utils/multimodal_utils.py b/lightllm/utils/multimodal_utils.py
index cca01e126c..4b49ea8891 100644
--- a/lightllm/utils/multimodal_utils.py
+++ b/lightllm/utils/multimodal_utils.py
@@ -5,6 +5,7 @@
 from PIL import Image
 from io import BytesIO
 from fastapi import Request
+from functools import lru_cache
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
@@ -35,23 +36,30 @@ def image2base64(img_str: str):
     return base64.b64encode(buffer.getvalue()).decode("utf-8")
 
 
+@lru_cache(maxsize=256)
+def _get_xhttp_client(proxy=None):
+    """Return an httpx.AsyncClient cached per `proxy` argument (lru_cache keeps up to 256 entries)."""
+    limits = httpx.Limits(max_connections=10000, max_keepalive_connections=20)
+    return httpx.AsyncClient(**{**_httpx_async_client_proxy_kwargs(proxy), "limits": limits})
+
+
 async def fetch_resource(url, request: Request, timeout, proxy=None):
     logger.info(f"Begin to download resource from url: {url}")
     start_time = time.time()
-    async with httpx.AsyncClient(**_httpx_async_client_proxy_kwargs(proxy)) as client:
-        async with client.stream("GET", url, timeout=timeout) as response:
-            response.raise_for_status()
-            ans_bytes = []
-            async for chunk in response.aiter_bytes(chunk_size=1024 * 1024):
-                if request is not None and await request.is_disconnected():
-                    await response.aclose()
-                    raise Exception("Request disconnected. User cancelled download.")
-                ans_bytes.append(chunk)
-                # 接收的数据不能大于128M
-                if len(ans_bytes) > 128:
-                    raise Exception(f"url {url} recv data is too big")
-
-            content = b"".join(ans_bytes)
+    client = _get_xhttp_client(proxy)
+    async with client.stream("GET", url, timeout=timeout) as response:
+        response.raise_for_status()
+        ans_bytes = []
+        async for chunk in response.aiter_bytes(chunk_size=1024 * 1024):
+            if request is not None and await request.is_disconnected():
+                await response.aclose()
+                raise Exception("Request disconnected. User cancelled download.")
+            ans_bytes.append(chunk)
+            # 接收的数据不能大于128M
+            if len(ans_bytes) > 128:
+                raise Exception(f"url {url} recv data is too big")
+
+    content = b"".join(ans_bytes)
     end_time = time.time()
     cost_time = end_time - start_time
     logger.info(f"Download url {url} resource cost time: {cost_time} seconds")