diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..2a46877c76 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,92 @@ +{ + "1024": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "16384": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "512": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 4, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "800": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 5, + "num_warps": 4 + }, + "8192": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..7372d5c322 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,92 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 5, + "num_warps": 4 + }, + "100": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 5, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 4, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "64": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 5, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 5, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..569382ce2f --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,101 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "100": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "NEED_TRANS": true, + "num_stages": 4, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": true, + "num_stages": 2, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "NEED_TRANS": true, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "NEED_TRANS": true, + "num_stages": 3, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": true, + "num_stages": 3, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..1456fd0b4b --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,101 @@ +{ + "1024": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": true, + "num_stages": 4, + "num_warps": 8 + }, + "128": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "16384": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": true, + "num_stages": 3, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "32768": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "512": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 4, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "NEED_TRANS": true, + "num_stages": 3, + "num_warps": 4 + }, + "800": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 64, + "NEED_TRANS": true, + "num_stages": 4, + "num_warps": 4 + }, + "8192": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..0f5983241f --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,46 @@ +{ + "1": { + "BLOCK_SIZE": 128, + "num_warps": 1 + }, + "100": { + "BLOCK_SIZE": 256, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "16": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "4096": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "64": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE": 256, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..3612e98183 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,68 @@ +{ + "1": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 2, + "num_warps": 8 + }, + "100": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "128": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 2, + "num_warps": 8 + }, + "16": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "2048": { + "BLOCK_DIM": 128, + "BLOCK_M": 2, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "256": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 2 + }, + "32": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 4 + }, + "4096": { + "BLOCK_DIM": 128, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 4 + }, + "64": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 2 + }, + "8": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..ff46525471 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,62 @@ +{ + "1": { + "BLOCK_DIM": 128, + "BLOCK_M": 16, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "100": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 2, + "num_warps": 16 + }, + "1024": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "128": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "16": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 2, + "num_warps": 16 + }, + "2048": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "256": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 2 + }, + "32": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "64": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "8": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 2, + "num_warps": 16 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..e3eb000004 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,42 @@ +{ + "1": { + "num_stages": 1, + "num_warps": 1 + }, + "100": { + "num_stages": 2, + "num_warps": 1 + }, + "1024": { + "num_stages": 5, + "num_warps": 2 + }, + "128": { + "num_stages": 4, + "num_warps": 1 + }, + "16": { + "num_stages": 1, + "num_warps": 1 + }, + "2048": { + "num_stages": 4, + "num_warps": 1 + }, + "256": { + "num_stages": 2, + "num_warps": 4 + }, + "32": { + "num_stages": 5, + "num_warps": 1 + }, + "64": { + "num_stages": 5, + "num_warps": 1 + }, + "8": { + "num_stages": 1, + "num_warps": 1 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..9d20b4ea6b --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,46 @@ +{ + "1": { + "num_stages": 4, + "num_warps": 2 + }, + "100": { + "num_stages": 1, + "num_warps": 1 + }, + "1024": { + "num_stages": 5, + "num_warps": 2 + }, + "128": { + "num_stages": 2, + "num_warps": 4 + }, + "16": { + "num_stages": 5, + "num_warps": 4 + }, + "2048": { + "num_stages": 3, + "num_warps": 2 + }, + "256": { + "num_stages": 2, + "num_warps": 2 + }, + "32": { + "num_stages": 4, + "num_warps": 1 + }, + "4096": { + "num_stages": 3, + "num_warps": 2 + }, + "64": { + "num_stages": 3, + "num_warps": 4 + }, + "8": { + "num_stages": 4, + "num_warps": 2 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..fdb476db92 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,90 @@ +{ + "1": { + "BLOCK_K": 128, + "BLOCK_M": 8, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 4 + }, + "100": { + "BLOCK_K": 64, + "BLOCK_M": 32, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 8 + }, + "1024": { + "BLOCK_K": 64, + "BLOCK_M": 64, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 2, + "num_warps": 4 + }, + "128": { + "BLOCK_K": 128, + "BLOCK_M": 32, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 5, + "num_warps": 4 + }, + "16": { + "BLOCK_K": 256, + "BLOCK_M": 8, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 8 + }, + "2048": { + "BLOCK_K": 128, + "BLOCK_M": 64, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_K": 128, + "BLOCK_M": 64, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 8 + }, + "32": { + "BLOCK_K": 128, + "BLOCK_M": 16, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 4 + }, + "4096": { + "BLOCK_K": 128, + "BLOCK_M": 64, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 4 + }, + "64": { + "BLOCK_K": 128, + "BLOCK_M": 16, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 4 + }, + "8": { + "BLOCK_K": 128, + "BLOCK_M": 8, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..5f06f89508 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,90 @@ +{ + "1": { + "BLOCK_K": 256, + "BLOCK_M": 8, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 8 + }, + "100": { + "BLOCK_K": 64, + "BLOCK_M": 32, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 8 + }, + "1024": { + "BLOCK_K": 64, + "BLOCK_M": 64, + "BLOCK_N": 128, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 4 + }, + "128": { + "BLOCK_K": 128, + "BLOCK_M": 32, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 4 + }, + "16": { + "BLOCK_K": 128, + "BLOCK_M": 16, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 5, + "num_warps": 4 + }, + "2048": { + "BLOCK_K": 64, + "BLOCK_M": 64, + "BLOCK_N": 128, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_K": 128, + "BLOCK_M": 64, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 8 + }, + "32": { + "BLOCK_K": 128, + "BLOCK_M": 16, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 4 + }, + "4096": { + "BLOCK_K": 64, + "BLOCK_M": 64, + "BLOCK_N": 128, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 4 + }, + "64": { + "BLOCK_K": 256, + "BLOCK_M": 32, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 4 + }, + "8": { + "BLOCK_K": 256, + "BLOCK_M": 8, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..d0b540f69e --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,62 @@ +{ + "1024": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_M": 1, + "BLOCK_N": 64, + "NUM_STAGES": 2, + "num_warps": 4 + }, + "16384": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "2048": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_M": 1, + "BLOCK_N": 64, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_M": 1, + "BLOCK_N": 64, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "8": { + "BLOCK_M": 1, + "BLOCK_N": 32, + "NUM_STAGES": 1, + "num_warps": 8 + }, + "800": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 2, + "num_warps": 4 + }, + "8192": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..6c5307023b --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,68 @@ +{ + "1024": { + "BLOCK_M": 8, + "BLOCK_N": 64, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "128": { + "BLOCK_M": 1, + "BLOCK_N": 64, + "NUM_STAGES": 4, + "num_warps": 4 + }, + "16384": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "2048": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "256": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "32768": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "512": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 2, + "num_warps": 4 + }, + "64": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 4 + }, + "8": { + "BLOCK_M": 1, + "BLOCK_N": 128, + "NUM_STAGES": 1, + "num_warps": 1 + }, + "800": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "8192": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 1 + } +} \ No newline at end of file diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py index 833cc8f4b0..194914d455 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py +++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py @@ -5,6 +5,7 @@ from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor from transformers.feature_extraction_utils import BatchFeature from transformers.utils import TensorType +from functools import lru_cache class WhisperFeatureExtractor(SequenceFeatureExtractor): @@ -47,17 +48,24 @@ def __init__( mel_scale="slaney", ) + @lru_cache(maxsize=12) + def get_hann_window(self, device: Union[str, torch.device]): + return torch.hann_window(self.n_fft, device=device) + + @lru_cache(maxsize=12) + def get_mel_filters(self, device: Union[str, torch.device]): + return torch.from_numpy(self.mel_filters).to(device, torch.float32) + def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu") -> np.ndarray: waveform = torch.from_numpy(waveform).to(device, torch.float32) - window = torch.hann_window(self.n_fft, device=device) + window = self.get_hann_window(device) if self.dither != 0.0: waveform += self.dither * torch.randn(waveform.shape, dtype=waveform.dtype, device=waveform.device) stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True) magnitudes = stft[..., :-1].abs() ** 2 - - mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32) + mel_filters = self.get_mel_filters(device) mel_spec = mel_filters.T @ magnitudes log_spec = torch.clamp(mel_spec, min=1e-10).log10() diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py index a1419f83ef..1b8fa0110d 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/model.py +++ b/lightllm/models/qwen3_omni_moe_thinker/model.py @@ -1,6 +1,8 @@ import os import json import librosa +import copy +from functools import lru_cache from io import BytesIO from lightllm.common.build_utils import repair_config from lightllm.models.registry import ModelRegistry @@ -66,6 +68,11 @@ def get_audio_token_length(self, audio: AudioItem): # print(f"token_num is {token_num} n_samples is {self.n_samples} hop_length is {self.hop_length}") return token_num + @lru_cache(maxsize=128) + def _encode_prompt_text(self, prompt: str): + origin_ids = self.tokenizer.encode(prompt) + return origin_ids + def _caclu_audio_token_num(self, input_audio_len: int): _mel_len = input_audio_len // int(self.hop_length) input_lengths_leave = _mel_len % 100 @@ -74,7 +81,8 @@ def _caclu_audio_token_num(self, input_audio_len: int): return output_lengths def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs): - origin_ids = self.tokenizer.encode(prompt) + origin_ids = self._encode_prompt_text(prompt) + origin_ids = copy.deepcopy(origin_ids) # -> origin_ids = [token for token in origin_ids if token not in (self.image_token_id, self.audio_token_id)] diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py index c51d5dc3bd..03c57126ff 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py @@ -2,20 +2,14 @@ import json import math import torch -import rpyc -import librosa import numpy as np -from io import BytesIO from torch import Tensor, nn from safetensors import safe_open from torch.nn import functional as F from typing import Callable, Optional, Union, List -from rpyc.utils.classic import obtain - from transformers.activations import ACT2FN from lightllm.server.multimodal_params import AudioItem -from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor @@ -341,9 +335,8 @@ def encode(self, audio_items: List[AudioItem]): if isinstance(item, AudioItem): uuids.append(item.uuid) items.append(item) - audio_data = read_shm(get_shm_name_data(item.uuid)) - audio = BytesIO(audio_data) - audio, _ = librosa.load(audio, sr=self.processor.sampling_rate) + assert self.processor.sampling_rate == 16000 + audio = item.load_audio_from_shm_payload() else: raise ValueError(f"cannot read audio which type is {type(item)}!") diff --git a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py index c6d099a2d8..2bf325340f 100644 --- a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py +++ b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py @@ -4,9 +4,6 @@ import torch -from sgl_kernel import causal_conv1d_fwd -from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel - def causal_conv1d_fn( x: torch.Tensor, @@ -51,6 +48,8 @@ def causal_conv1d_fn( """ if activation not in [None, "silu", "swish"]: raise NotImplementedError("activation must be None, silu, or swish") + from sgl_kernel import causal_conv1d_fwd + if x.stride(-1) != 1: x = x.contiguous() bias = bias.contiguous() if bias is not None else None @@ -103,6 +102,8 @@ def causal_conv1d_update( """ if activation not in [None, "silu", "swish"]: raise NotImplementedError(f"activation must be None, silu, or swish, actual: {activation}") + from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel + activation_val = activation in ["silu", "swish"] unsqueeze = x.dim() == 2 if unsqueeze: diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py index 4a048074d9..aaa29e1c71 100644 --- a/lightllm/models/whisper/whisper_audio.py +++ b/lightllm/models/whisper/whisper_audio.py @@ -1,17 +1,13 @@ import os import json -import rpyc -import librosa import numpy as np import torch import torch.nn.functional as F -from io import BytesIO from typing import List, Union from safetensors.torch import load_file from transformers.processing_utils import ProcessorMixin -from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data from lightllm.server.multimodal_params import AudioItem -from rpyc.utils.classic import obtain + # tokenizer_class removed class WhisperProcessor(ProcessorMixin): @@ -171,9 +167,7 @@ def encode(self, audio_items: List[AudioItem]): if isinstance(item, AudioItem): uuids.append(item.uuid) items.append(item) - audio_data = read_shm(get_shm_name_data(item.uuid)) - audio = BytesIO(audio_data) - audio, _ = librosa.load(audio, sr=16000) + audio = item.load_audio_from_shm_payload() else: raise ValueError(f"cannot read audio which type is {type(item)}!") diff --git a/lightllm/server/api_models.py b/lightllm/server/api_models.py index 7fc2696135..bdf98cd17a 100644 --- a/lightllm/server/api_models.py +++ b/lightllm/server/api_models.py @@ -187,7 +187,7 @@ def apply_loaded_defaults(cls, data: Any): class ChatCompletionRequest(BaseModel): - model: str + model: str = "default" messages: List[ChatCompletionMessageParam] function_call: Optional[str] = "none" temperature: Optional[float] = 1 diff --git a/lightllm/server/embed_cache/impl/naive_memory_cache.py b/lightllm/server/embed_cache/impl/naive_memory_cache.py index 73a0e0b250..0dad890e2c 100644 --- a/lightllm/server/embed_cache/impl/naive_memory_cache.py +++ b/lightllm/server/embed_cache/impl/naive_memory_cache.py @@ -213,6 +213,8 @@ def alloc(self, md5sum_list: list[str], token_num_list: list[int]) -> Optional[l "token_id": rec.token_id, "start_index_in_embed_cache": rec.mem_block.start, "token_num": rec.token_num, + "data_ready": rec.data, + "embed_ready": rec.embed, } ) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 8126d76446..c9822ff618 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -124,72 +124,84 @@ def __init__( self.latest_success_infer_time_mark.set_value(int(time.time())) return - async def _alloc_resource(self, items, md5sums, token_nums, datas): - - while True: - records = obtain(self.cache_client.root.alloc(md5sums, token_nums)) - - if records is None: - await asyncio.sleep(0.1) - continue - - if isinstance(records, str) and "error" in records: - logger.error(str(records) + "and try to set --embed_cache_storage_size bigger") - raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger") - - uid_list = [] - for item, rec in zip(items, records): - item: Union[ImageItem, AudioItem] = item - item.uuid = rec["id"] - item.token_id = rec["token_id"] - item.token_num = rec["token_num"] - item.start_index_in_embed_cache = rec["start_index_in_embed_cache"] - - uid_list.append(rec["id"]) - - ready_flags = obtain(self.cache_client.root.get_items_data(uid_list)) - update_data_ids = [] - - for uid, ready, data in zip(uid_list, ready_flags, datas): - if not ready: - create_shm(get_shm_name_data(uid), data) - update_data_ids.append(uid) + def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str, **kwargs): + if self.args.detail_log: + cost_ms = (time.time() - start_time) * 1000.0 + extras = " ".join(f"{k}:{v}" for k, v in kwargs.items()) + suffix = f" {extras}" if extras else "" + logger.debug(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}") + return - if update_data_ids: - self.cache_client.root.set_items_data(update_data_ids) + async def _alloc_resource(self, items, md5sums, token_nums, datas): + if len(items) == 0: return - async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams): - # 只有 P 和 NORMAL 节点需要真的管理多模态资源 - if self.pd_mode.is_P_or_NORMAL(): + for _ in range(2000): # 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity,从而造成死锁的问题。 # 如果不加任何锁,假如请求1和请求2都有6张图片,而cache_capacity为10, # 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图,将会资源竞争产生死锁。 async with self._resource_lock: - items, md5sums, tokens_nums, datas = [], [], [], [] - for img in multimodal_params.images: - self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params) - data = img.read() - # must after init_imageitem_extral_params - token_num = self.tokenizer.get_image_token_length(img) - md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params))) - md5sums.append(md5sum) - img.md5 = md5sum - tokens_nums.append(token_num) - datas.append(data) - items.append(img) - for audio in multimodal_params.audios: - self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params) - data = audio.read() - token_num = self.tokenizer.get_audio_token_length(audio) - md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(audio.extra_params))) - md5sums.append(md5sum) - audio.md5 = md5sum - tokens_nums.append(token_num) - datas.append(data) - items.append(audio) - - await self._alloc_resource(items, md5sums, tokens_nums, datas) + records = obtain(self.cache_client.root.alloc(md5sums, token_nums)) + if records is not None: + break + await asyncio.sleep(0.005) + + # 长时间无法申请到足够资源的时候,则开始进行阻塞式尝试,防止其他请求一起申请相关资源。 + if records is None: + async with self._resource_lock: + while records is None: + records = obtain(self.cache_client.root.alloc(md5sums, token_nums)) + if records is not None: + break + await asyncio.sleep(0.1) + + if isinstance(records, str) and "error" in records: + logger.error(str(records) + "and try to set --embed_cache_storage_size bigger") + raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger") + + update_data_ids = [] + for item, rec, data in zip(items, records, datas): + item: Union[ImageItem, AudioItem] = item + item.uuid = rec["id"] + item.token_id = rec["token_id"] + item.token_num = rec["token_num"] + item.start_index_in_embed_cache = rec["start_index_in_embed_cache"] + + if not rec["data_ready"]: + create_shm(get_shm_name_data(rec["id"]), data) + update_data_ids.append(rec["id"]) + + if update_data_ids: + self.cache_client.root.set_items_data(update_data_ids) + return + + async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams): + # 只有 P 和 NORMAL 节点需要真的管理多模态资源 + if self.pd_mode.is_P_or_NORMAL(): + items, md5sums, tokens_nums, datas = [], [], [], [] + for img in multimodal_params.images: + self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params) + data = img.read() + # must after init_imageitem_extral_params + token_num = self.tokenizer.get_image_token_length(img) + md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params))) + md5sums.append(md5sum) + img.md5 = md5sum + tokens_nums.append(token_num) + datas.append(data) + items.append(img) + for audio in multimodal_params.audios: + self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params) + data = audio.read() + token_num = self.tokenizer.get_audio_token_length(audio) + md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(audio.extra_params))) + md5sums.append(md5sum) + audio.md5 = md5sum + tokens_nums.append(token_num) + datas.append(data) + items.append(audio) + + await self._alloc_resource(items, md5sums, tokens_nums, datas) return async def _release_multimodal_resources(self, multimodal_params: MultimodalParams): @@ -289,6 +301,15 @@ async def generate( start_time = time.time() request_headers = request.headers if request is not None else {} group_request_id = self.alloc_req_id(sampling_params, is_health_req) + audio_count = len(multimodal_params.audios) if multimodal_params is not None else 0 + image_count = len(multimodal_params.images) if multimodal_params is not None else 0 + self._log_stage_timing( + group_request_id, + start_time, + "received", + audio_count=audio_count, + image_count=image_count, + ) try: original_multimodal_params = None @@ -297,11 +318,21 @@ async def generate( if self.pd_mode.is_P_or_NORMAL(): await multimodal_params.verify_and_preload(request) + self._log_stage_timing( + group_request_id, + start_time, + "verify_and_preload_done", + ) # 记录请求到达的相关信息 await self._log_req_header(request_headers, group_request_id) # encode prompt_ids = await self._encode(prompt, multimodal_params, sampling_params) + self._log_stage_timing( + group_request_id, + start_time, + "encode_done", + ) prompt_tokens = len(prompt_ids) # 监控 @@ -310,6 +341,11 @@ async def generate( self.metric_client.histogram_observe("lightllm_request_input_length", prompt_tokens) self.metric_client.histogram_observe("lightllm_request_max_new_tokens", sampling_params.max_new_tokens) prompt_ids = await self._check_and_repair_length(prompt_ids, sampling_params) + self._log_stage_timing( + group_request_id, + start_time, + "check_and_repair_length_done", + ) if nixl_pd_upload_websocket is not None and not is_health_req and self.pd_mode.is_NP(): # 在 nixl pd 模式下的 p 节点, 为了更好的兼容多模态的推理流程,np 节点需要先上报其 encode 好的 prompt ids 信息,然后 @@ -357,6 +393,11 @@ async def generate( chunked_prefill_size=self.args.chunked_prefill_size, ) req_objs.append(req_obj) + self._log_stage_timing( + group_request_id, + start_time, + "shm_req_init_done", + ) logger.debug( f"alloc shm_req for req_id {group_request_id}, " @@ -370,6 +411,11 @@ async def generate( await self.transfer_to_next_module_or_node( prompt, sampling_params, original_multimodal_params, req_status.group_req_objs ) + self._log_stage_timing( + group_request_id, + start_time, + "request_forwarded", + ) results_generator = self._wait_to_token_package( start_time, diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index 05dd479411..6210628751 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -1,14 +1,18 @@ """Multimodal parameters for text generation.""" +import asyncio import os import librosa import base64 +import numpy as np from typing import List from io import BytesIO from PIL import Image from fastapi import Request +from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data from lightllm.utils.multimodal_utils import fetch_resource from lightllm.utils.log_utils import init_logger + logger = init_logger(__name__) @@ -44,11 +48,19 @@ async def preload(self, request: Request): raise ValueError(f"cannot read audio which type is {self._type}!") # check if valid audio bytes - audio_values, _ = librosa.load(BytesIO(audio_data), sr=16000) + audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=16000) + audio_values = np.asarray(audio_values, dtype=np.float32) + from lightllm.models.whisper.defaults import MIN_AUDIO_LEN - self.audio_length = max(audio_values.shape[0], MIN_AUDIO_LEN) # 如果音频过短,会被pad到480的长度 - self._preload_data = audio_data + if audio_values.shape[0] < MIN_AUDIO_LEN: + audio_values = np.pad( + audio_values, (0, MIN_AUDIO_LEN - audio_values.shape[0]), mode="constant", constant_values=0.0 + ) + logger.warning(f"audio length is too short, pad to {MIN_AUDIO_LEN}") + + self.audio_length = int(audio_values.shape[0]) + self._preload_data = audio_values.tobytes() return except Exception as e: @@ -79,6 +91,14 @@ def to_origin_dict(self): ret["data"] = self._data return ret + def load_audio_from_shm_payload(self) -> np.ndarray: + audio_data = read_shm(get_shm_name_data(self.uuid)) + audio_array = np.frombuffer(audio_data, dtype=np.float32) + if audio_array.shape[0] != self.audio_length: + logger.error(f"audio length is not match, {audio_array.shape[0]} != {self.audio_length}") + assert audio_array.shape[0] == self.audio_length + return audio_array + class ImageItem: def __init__(self, **kwargs): @@ -170,10 +190,11 @@ def __init__( return async def verify_and_preload(self, request: Request): - for image in self.images: - await image.preload(request) - for audio in self.audios: - await audio.preload(request) + tasks = [image.preload(request) for image in self.images] + tasks += [audio.preload(request) for audio in self.audios] + + if tasks: + await asyncio.gather(*tasks) return def to_dict(self): diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py index 0d2705fab2..f5e0b8df9a 100644 --- a/lightllm/server/router/manager.py +++ b/lightllm/server/router/manager.py @@ -316,8 +316,7 @@ async def _add_batch(self, batch: Batch): # 添加新请求 reqs = [r.to_router_rpc_obj() for r in batch.reqs] while not self.shm_reqs_io_buffer.is_empty(): - await asyncio.sleep(0.02) - + await asyncio.sleep(0.001) self.shm_reqs_io_buffer.write_obj(reqs) self.shm_reqs_io_buffer.set_ready() logger.debug(f"Prefill Batch: {batch.simple_log()} \n") @@ -326,8 +325,7 @@ async def _add_batch(self, batch: Batch): async def _aborted_reqs(self, aborted_reqs: List[Req]): cmds = [AbortedReqCmd(req_id=r.request_id) for r in aborted_reqs] while not self.shm_reqs_io_buffer.is_empty(): - await asyncio.sleep(0.02) - + await asyncio.sleep(0.001) self.shm_reqs_io_buffer.write_obj(cmds) self.shm_reqs_io_buffer.set_ready() return @@ -335,8 +333,7 @@ async def _aborted_reqs(self, aborted_reqs: List[Req]): async def _stop_str_matched_reqs(self, stop_str_matched_reqs: List[Req]): cmds = [StopStrMatchedReqCmd(req_id=r.request_id) for r in stop_str_matched_reqs] while not self.shm_reqs_io_buffer.is_empty(): - await asyncio.sleep(0.02) - + await asyncio.sleep(0.001) self.shm_reqs_io_buffer.write_obj(cmds) self.shm_reqs_io_buffer.set_ready() return diff --git a/lightllm/utils/multimodal_utils.py b/lightllm/utils/multimodal_utils.py index cca01e126c..4b49ea8891 100644 --- a/lightllm/utils/multimodal_utils.py +++ b/lightllm/utils/multimodal_utils.py @@ -5,6 +5,7 @@ from PIL import Image from io import BytesIO from fastapi import Request +from functools import lru_cache from lightllm.utils.log_utils import init_logger logger = init_logger(__name__) @@ -35,23 +36,30 @@ def image2base64(img_str: str): return base64.b64encode(buffer.getvalue()).decode("utf-8") +@lru_cache(maxsize=256) +def _get_xhttp_client(proxy=None): + kvargs = _httpx_async_client_proxy_kwargs(proxy) + kvargs["limits"] = httpx.Limits(max_connections=10000, max_keepalive_connections=20) + return httpx.AsyncClient(**kvargs) + + async def fetch_resource(url, request: Request, timeout, proxy=None): logger.info(f"Begin to download resource from url: {url}") start_time = time.time() - async with httpx.AsyncClient(**_httpx_async_client_proxy_kwargs(proxy)) as client: - async with client.stream("GET", url, timeout=timeout) as response: - response.raise_for_status() - ans_bytes = [] - async for chunk in response.aiter_bytes(chunk_size=1024 * 1024): - if request is not None and await request.is_disconnected(): - await response.aclose() - raise Exception("Request disconnected. User cancelled download.") - ans_bytes.append(chunk) - # 接收的数据不能大于128M - if len(ans_bytes) > 128: - raise Exception(f"url {url} recv data is too big") - - content = b"".join(ans_bytes) + client = _get_xhttp_client(proxy) + async with client.stream("GET", url, timeout=timeout) as response: + response.raise_for_status() + ans_bytes = [] + async for chunk in response.aiter_bytes(chunk_size=1024 * 1024): + if request is not None and await request.is_disconnected(): + await response.aclose() + raise Exception("Request disconnected. User cancelled download.") + ans_bytes.append(chunk) + # 接收的数据不能大于128M + if len(ans_bytes) > 128: + raise Exception(f"url {url} recv data is too big") + + content = b"".join(ans_bytes) end_time = time.time() cost_time = end_time - start_time logger.info(f"Download url {url} resource cost time: {cost_time} seconds")