Skip to content

[NPU] Improve the performance of the embedding for NPU #1036

@TianHao324

Description

@TianHao324

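Currently, the embedding operator is supported on the NPU and has passed the accuracy tests. However, its runtime performance is far worse than that of the Hugging Face and torch.compile baselines, so further optimization is needed. In the benchmark below (Ascend910B4, B=32, T=512, D=768, float32), the Liger forward pass takes about 43 ms versus roughly 0.08–0.24 ms for the baselines, and the backward pass takes about 62–64 ms versus roughly 1.6–4.3 ms.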

********** Benchmark Data **********
[
  {
    "kernel_name": "embedding",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "embedding dimension",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768,
      65536,
      131072
    ],
    "y_values_50": [
      43.11292266845703,
      42.90717315673828,
      43.2876091003418,
      43.323081970214844,
      42.62879943847656,
      43.2666015625,
      43.26323699951172,
      43.450679779052734
    ],
    "y_values_20": [
      43.11175537109375,
      42.90643310546875,
      43.28743362426758,
      43.32281494140625,
      42.62324523925781,
      43.2645263671875,
      43.26128387451172,
      43.43910217285156
    ],
    "y_values_80": [
      43.11408615112305,
      42.90790939331055,
      43.287784576416016,
      43.32334518432617,
      42.63435745239258,
      43.2686767578125,
      43.265193939208984,
      43.462257385253906
    ],
    "timestamp": "2026-01-21 03:48:04",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"B\": 32, \"T\": 512, \"D\": 768, \"dtype\": \"torch.float32\"}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "embedding",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "embedding dimension",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768,
      65536,
      131072
    ],
    "y_values_50": [
      0.08064000308513641,
      0.09166000038385391,
      0.11357000470161438,
      0.1482200026512146,
      0.18525999784469604,
      0.21186000108718872,
      0.2272000014781952,
      0.23625999689102173
    ],
    "y_values_20": [
      0.08020000159740448,
      0.09121999889612198,
      0.1130559965968132,
      0.14762000739574432,
      0.18479999899864197,
      0.21121999621391296,
      0.22665999829769135,
      0.2358199954032898
    ],
    "y_values_80": [
      0.08157600462436676,
      0.09262000024318695,
      0.11438000202178955,
      0.14905999600887299,
      0.18602000176906586,
      0.21264000236988068,
      0.22774000465869904,
      0.2370000034570694
    ],
    "timestamp": "2026-01-21 03:48:17",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"B\": 32, \"T\": 512, \"D\": 768, \"dtype\": \"torch.float32\"}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "embedding",
    "kernel_provider": "torch_compile",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "embedding dimension",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768,
      65536,
      131072
    ],
    "y_values_50": [
      0.1696000099182129,
      0.20161999762058258,
      0.20436999201774597,
      0.20262999832630157,
      0.20093999803066254,
      0.21164000034332275,
      0.23170000314712524,
      0.23863999545574188
    ],
    "y_values_20": [
      0.16475999355316162,
      0.1967879980802536,
      0.19750800728797913,
      0.19844000041484833,
      0.19786399602890015,
      0.21121999621391296,
      0.2281000018119812,
      0.23797999322414398
    ],
    "y_values_80": [
      0.17654000222682953,
      0.20855599641799927,
      0.2127159982919693,
      0.20847199857234955,
      0.20453999936580658,
      0.21206000447273254,
      0.2365799993276596,
      0.23921999335289001
    ],
    "timestamp": "2026-01-21 03:48:32",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"B\": 32, \"T\": 512, \"D\": 768, \"dtype\": \"torch.float32\"}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "embedding",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "embedding dimension",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768,
      65536,
      131072
    ],
    "y_values_50": [
      61.94499969482422,
      62.33565902709961,
      61.86201858520508,
      62.008819580078125,
      62.232479095458984,
      62.473838806152344,
      63.21195983886719,
      64.28494262695312
    ],
    "y_values_20": [
      61.94499969482422,
      62.33565902709961,
      61.86201858520508,
      62.008819580078125,
      62.232479095458984,
      62.473838806152344,
      63.21195983886719,
      64.28494262695312
    ],
    "y_values_80": [
      61.94499969482422,
      62.33565902709961,
      61.86201858520508,
      62.008819580078125,
      62.232479095458984,
      62.473838806152344,
      63.21195983886719,
      64.28494262695312
    ],
    "timestamp": "2026-01-21 03:48:50",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"B\": 32, \"T\": 512, \"D\": 768, \"dtype\": \"torch.float32\"}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "embedding",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "embedding dimension",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768,
      65536,
      131072
    ],
    "y_values_50": [
      1.5649499893188477,
      1.5831799507141113,
      1.6187798976898193,
      1.6978700160980225,
      1.8488800525665283,
      2.2740001678466797,
      2.995260000228882,
      4.282050132751465
    ],
    "y_values_20": [
      1.5638200044631958,
      1.581808090209961,
      1.6171720027923584,
      1.6966240406036377,
      1.8473479747772217,
      2.271728038787842,
      2.993760108947754,
      4.2774481773376465
    ],
    "y_values_80": [
      1.5658999681472778,
      1.585103988647461,
      1.6203559637069702,
      1.6992720365524292,
      1.8494240045547485,
      2.2756519317626953,
      2.9975199699401855,
      4.283827781677246
    ],
    "timestamp": "2026-01-21 03:49:03",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"B\": 32, \"T\": 512, \"D\": 768, \"dtype\": \"torch.float32\"}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "embedding",
    "kernel_provider": "torch_compile",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "embedding dimension",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768,
      65536,
      131072
    ],
    "y_values_50": [
      1.5654499530792236,
      1.579859972000122,
      1.6195299625396729,
      1.7007999420166016,
      1.8484400510787964,
      2.296339988708496,
      3.0023000240325928,
      4.276410102844238
    ],
    "y_values_20": [
      1.5641800165176392,
      1.5789759159088135,
      1.6186840534210205,
      1.6993600130081177,
      1.8475240468978882,
      2.295095920562744,
      3.000540018081665,
      4.273176193237305
    ],
    "y_values_80": [
      1.5664000511169434,
      1.5810760259628296,
      1.6209640502929688,
      1.7026759386062622,
      1.8497200012207031,
      2.298271894454956,
      3.0036399364471436,
      4.278992176055908
    ],

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions