diff --git a/tests/chat_template/test_chat_template.py b/tests/chat_template/test_chat_template.py
index bf702560e..6ed85fc01 100644
--- a/tests/chat_template/test_chat_template.py
+++ b/tests/chat_template/test_chat_template.py
@@ -1,11 +1,17 @@
 from datetime import datetime
 import os
+import json
 import parametrize
 from unittest import TestCase
 from transformers import AutoTokenizer
+import torch
+from packaging.version import Version
+from transformers import __version__ as transformers_version
+import unittest
 
 from xtuner.v1.data_proto.templates import CHAT_TEMPLATE_MAP 
 from xtuner.v1.data_proto.messages import ChatMessages
+from xtuner.v1.data_proto.messages.qwen35_chat import Qwen35ChatMessages, qwen35_tokenize_fn_slowspeed
 
 
 QWEN3_PATH = os.environ["QWEN3_PATH"]
@@ -222,6 +228,45 @@ def test_deepseek_v3_template(self, template_type,thinking, tokenizer):
         input_ids = _messages.tokenize(tokenizer, chat_template)['input_ids']
         
         self.assertTrue((input_ids == input_ids_ref))
+    
+
+    @unittest.skipIf(
+        Version(transformers_version) < Version("5.2.0"),
+        f"transformers >= 5.2.0 is required, but got {transformers_version}"
+    )
+    def test_qwen35vl_template(self):
+        """Compare Qwen3.5-VL chat-template tokenization with the slow reference and the HF template."""
+        QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"]
+        chat_template = CHAT_TEMPLATE_MAP["qwen3.5-vl"]
+        tokenizer = AutoTokenizer.from_pretrained(QWEN35_VL_PATH, trust_remote_code=True)
+        
+        jsonl_path = 'tests/resource/qwen35_tokenize_data.jsonl'
+        # Read explicitly as UTF-8 so the result does not depend on the platform default encoding.
+        with open(jsonl_path, 'r', encoding='utf-8') as f:
+            all_data = [json.loads(line) for line in f]
+        
+        for j, data in enumerate(all_data):
+            if j in [13,14]: # video samples are known not to match the HF output
+                continue
+            gt_token_ids, gt_labels = qwen35_tokenize_fn_slowspeed(tokenizer, data['messages'], tools=data.get('tools'), add_vision_id=True)
+            _messages = Qwen35ChatMessages(messages=data["messages"], tools=data.get("tools"))
+            tokenized = _messages.tokenize(tokenizer, chat_template, add_vision_id=True)
+            decode_str = tokenizer.decode(tokenized['input_ids'], skip_special_tokens=False)
 
+            if j!=15 and j!=16:
+                self.assertEqual(tokenized['input_ids'], gt_token_ids)
+                self.assertEqual(tokenized['labels'], gt_labels)
 
-        
\ No newline at end of file
+                enable_thinking = any("reasoning_content" in msg for msg in data['messages'])
+                hf_text = tokenizer.apply_chat_template(data['messages'],   
+                                                tools=data.get('tools'),       
+                                                add_vision_id=True,   
+                                                tokenize=False,
+                                                enable_thinking=enable_thinking,
+                                                add_generation_prompt=False)
+                self.assertEqual(decode_str, hf_text)
+            else:  # samples 15 and 16: only the rendered video segment of the decoded text is checked
+                if j==15:
+                    self.assertIn('Video 1: <|vision_start|><|video_pad|><|vision_end|><|vision_start|><|video_pad|><|vision_end|><|vision_start|><|video_pad|><|vision_end|><0.0-10.0 seconds>Describe the video in detail. [NO_REASONING]<|im_end|>', decode_str)
+                else:
+                    self.assertIn('Video 1: <0.0 seconds><|vision_start|><|video_pad|><|vision_end|><1.0 seconds><|vision_start|><|video_pad|><|vision_end|><2.0 seconds><|vision_start|><|video_pad|><|vision_end|><0.0-10.0 seconds>Describe the video in detail. [NO_REASONING]<|im_end|>', decode_str)
diff --git a/tests/datasets/test_qwen35_vl_tokenize_fn.py b/tests/datasets/test_qwen35_vl_tokenize_fn.py
new file mode 100644
index 000000000..3f8cbf529
--- /dev/null
+++ b/tests/datasets/test_qwen35_vl_tokenize_fn.py
@@ -0,0 +1,338 @@
+import os
+from unittest import TestCase
+from xtuner.v1.datasets import Qwen3VLTokenizeFnConfig, PretrainTokenizeFunction
+from transformers import AutoTokenizer, AutoProcessor,Qwen3VLProcessor
+import json
+import torch
+import parametrize
+from xtuner.v1.utils.test_utils import add_video_root
+from packaging.version import Version
+from transformers import __version__ as transformers_version
+import unittest
+from xtuner.v1.data_proto.messages.qwen35_chat import qwen35_tokenize_fn_slowspeed
+
+VIDEO_ROOT = os.environ["VIDEO_ROOT"]
+
+
+@unittest.skipIf(
+    Version(transformers_version) < Version("5.2.0"),
+    f"transformers >= 5.2.0 is required, but got {transformers_version}"
+)
+class TestMLLMTokenizeFn(TestCase):
+    def setUp(self):
+        """Load the tokenizer, a default tokenize function and the HF processor before each test."""
+        model_path = os.environ["QWEN3_5_MOE_PATH"]
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+        # Shared fixture configuration: vision ids disabled, rand_video_max_frames set to 14.
+        default_config = Qwen3VLTokenizeFnConfig(processor_path=model_path, chat_template="qwen3.5-vl",
+                                                 rand_video_max_frames=14, add_vision_id=False)
+        self.tokenize_fn = default_config.build(self.tokenizer)
+        self.processor = AutoProcessor.from_pretrained(model_path)
+
+    def test_qwen35vl_text(self):
+        """The first 12 fixture samples must match the slow reference tokenizer and the HF chat template."""
+        QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"]
+        tokenize_fn = Qwen3VLTokenizeFnConfig(processor_path=QWEN35_VL_PATH, chat_template="qwen3.5-vl", add_vision_id=True).build(self.tokenizer)
+
+        data_path = 'tests/resource/qwen35_tokenize_data.jsonl'
+        # Read explicitly as UTF-8 so the result does not depend on the platform default encoding.
+        with open(data_path, 'r', encoding='utf-8') as f:
+            all_data = [json.loads(line) for line in f]
+        
+        for j, data in enumerate(all_data):
+            if j>=12:  # only the first 12 samples are compared in this test
+                break
+            gt_token_ids, gt_labels = qwen35_tokenize_fn_slowspeed(self.tokenizer, data['messages'], tools=data.get('tools'), add_vision_id=True)
+            ret = tokenize_fn(data)
+            input_ids_xtuner = ret['input_ids']
+            labels_xtuner = ret['labels']
+            self.assertEqual(input_ids_xtuner, gt_token_ids)
+            self.assertEqual(labels_xtuner, gt_labels)
+
+            enable_thinking = any("reasoning_content" in msg for msg in data['messages'])
+            decode_str = self.tokenizer.decode(input_ids_xtuner, skip_special_tokens=False)
+            hf_text = self.tokenizer.apply_chat_template(data['messages'],   
+                                               tools=data.get('tools'),       
+                                               add_vision_id=True,   
+                                               tokenize=False,
+                                               enable_thinking=enable_thinking,
+                                               add_generation_prompt=False)
+            self.assertEqual(decode_str, hf_text)
+
+    @parametrize.parametrize("add_vision_id", [(True,), (False,)])
+    def test_qwen35_vl_sft_single_image(self, add_vision_id):
+        """Single-image SFT samples must yield the same ids, pixel values and grid as the HF processor."""
+        QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"]
+        tokenize_fn = Qwen3VLTokenizeFnConfig(processor_path=QWEN35_VL_PATH, chat_template="qwen3.5-vl",
+                                              add_vision_id=add_vision_id).build(self.tokenizer)
+        data_path = 'tests/resource/mllm_sft_single_image_example_data_new.jsonl'
+        total_step = 50
+        with open(data_path, encoding='utf-8') as f:  # explicit UTF-8, independent of the platform default
+            for i, line in enumerate(f):
+                if i >=total_step:
+                    break
+                raw_data = json.loads(line)
+                ret = tokenize_fn(raw_data, media_root='tests/')
+                input_ids_xtuner = ret['input_ids']
+                pixel_values_xtuner: torch.Tensor = ret['pixel_values']
+                image_grid_thw_xtuner: torch.Tensor = ret['image_grid_thw']
+
+                # convert the sample in place to the HF/OpenAI message format
+                messages = raw_data['messages']
+                messages[0]['content'][0]['type'] = 'image'
+                messages[0]['content'][0]['path'] = 'tests/' + messages[0]['content'][0]['image']['url']
+                del messages[0]['content'][0]['image']
+
+                for msg in messages:
+                    if not isinstance(msg['content'], list):
+                        msg['content'] = [{"type": "text", "text": msg['content']}]
+
+                ret = self.processor.apply_chat_template(messages,
+                                                         add_generation_prompt=False,
+                                                         tokenize=True,
+                                                         add_vision_id=add_vision_id,
+                                                         return_dict=True)
+                input_ids_hf = ret['input_ids'][0]
+                pixel_values_hf = ret['pixel_values']
+                image_grid_thw_hf = ret['image_grid_thw']
+                self.assertEqual(input_ids_xtuner, input_ids_hf)
+                self.assertTrue(torch.allclose(pixel_values_xtuner, pixel_values_hf))
+                self.assertTrue(torch.allclose(image_grid_thw_xtuner, image_grid_thw_hf))
+
+    @parametrize.parametrize("add_vision_id", [(True,), (False,)])
+    def test_qwen3_vl_sft_multi_image(self, add_vision_id):
+        """Selected multi-image SFT samples must yield the same ids, pixel values and grid as the HF processor."""
+        QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"]
+        tokenize_fn = Qwen3VLTokenizeFnConfig(processor_path=QWEN35_VL_PATH,
+                                              chat_template="qwen3.5-vl",
+                                              add_vision_id=add_vision_id).build(self.tokenizer)
+        data_path = 'tests/resource/mllm_sft_multi_image_example_data_new.jsonl'
+        total_index = [0, 1, 2, 3, 4, 10]
+        with open(data_path, encoding='utf-8') as f:  # the fixture contains non-ASCII text
+            for i, line in enumerate(f):
+                if i not in total_index:
+                    continue
+                raw_data = json.loads(line)
+                ret = tokenize_fn(raw_data, media_root='tests/')
+                input_ids_xtuner = ret['input_ids']
+                pixel_values_xtuner: torch.Tensor = ret['pixel_values']
+                image_grid_thw_xtuner: torch.Tensor = ret['image_grid_thw']
+
+                # convert the sample in place to the HF/OpenAI message format
+                messages = raw_data['messages']
+                if i != 10:  # these samples carry two images at the start of the first message
+                    messages[0]['content'][0]['type'] = 'image'
+                    messages[0]['content'][0]['path'] = 'tests/' + messages[0]['content'][0]['image']['url']
+                    messages[0]['content'][1]['type'] = 'image'
+                    messages[0]['content'][1]['path'] = 'tests/' + messages[0]['content'][1]['image']['url']
+                    del messages[0]['content'][0]['image']
+                    del messages[0]['content'][1]['image']
+                else:  # sample 10 carries one image in message 0 and one in message 4
+                    messages[0]['content'][0]['type'] = 'image'
+                    messages[0]['content'][0]['path'] = 'tests/' + messages[0]['content'][0]['image']['url']
+                    del messages[0]['content'][0]['image']
+
+                    messages[4]['content'][0]['type'] = 'image'
+                    messages[4]['content'][0]['path'] = 'tests/' + messages[4]['content'][0]['image']['url']
+                    del messages[4]['content'][0]['image']
+
+                for msg in messages:
+                    if not isinstance(msg['content'], list):
+                        msg['content'] = [{"type": "text", "text": msg['content']}]
+
+                ret = self.processor.apply_chat_template(messages, add_generation_prompt=False, tokenize=True,
+                                                         return_dict=True, add_vision_id=add_vision_id)
+                input_ids_hf = ret['input_ids'][0]
+                pixel_values_hf = ret['pixel_values']
+                image_grid_thw_hf = ret['image_grid_thw']
+
+                self.assertEqual(input_ids_xtuner, input_ids_hf)
+                self.assertTrue(torch.allclose(pixel_values_xtuner, pixel_values_hf))
+                self.assertTrue(torch.allclose(image_grid_thw_xtuner, image_grid_thw_hf))
+
+    def test_calc_frame_info(self):
+        """calc_frame_info must return the expected frame counts, origin fps and timestamps per sample."""
+        self.tokenize_fn.state = "cache"
+        data_path = 'tests/resource/mllm_video_frame_test_data_new.jsonl'
+        # Read explicitly as UTF-8 so the result does not depend on the platform default encoding.
+        with open(data_path, encoding='utf-8') as f:
+            for i, line in enumerate(f):
+                raw_data = json.loads(line)
+                # The return value is discarded; only calc_frame_info's output is checked below.
+                self.tokenize_fn(raw_data)
+                frames_indices_list, origin_fps_list, timestamps_list = self.tokenize_fn.calc_frame_info(raw_data)
+                # Each entry is either a frame count (int) or a sized collection of frame indices.
+                num_frames_list = [frames_indices if isinstance(frames_indices, int) else len(frames_indices)
+                                   for frames_indices in frames_indices_list]
+                if i == 0:
+                    # case: without origin_fps, frames are sampled randomly based on the preset rand_video_max_frames
+                    self.assertEqual((len(origin_fps_list), len(timestamps_list)), (0, 0))
+                    self.assertLessEqual(self.tokenize_fn.video_processor.min_frames, num_frames_list[0])
+                    self.assertLessEqual(num_frames_list[0], self.tokenize_fn.rand_video_max_frames)
+                    self.assertLessEqual(self.tokenize_fn.video_processor.min_frames, num_frames_list[1])
+                    self.assertLessEqual(num_frames_list[1], self.tokenize_fn.rand_video_max_frames)
+                elif i == 1:
+                    # case: with origin_fps, timestamps are computed from origin_fps
+                    self.assertEqual(num_frames_list, [20, 4])
+                    self.assertEqual(origin_fps_list, [10, 8])
+                    self.assertEqual(timestamps_list,
+                                     [[0.25, 1.3, 2.35, 3.35, 4.45, 5.45, 6.55, 7.55, 8.600000000000001, 9.65],
+                                      [0.25, 1.125]])
+                elif i == 2:
+                    # case: origin_fps is 1 and the length is less than 4
+                    self.assertEqual(num_frames_list, [20, 4])
+                    self.assertEqual(origin_fps_list, [10, 1])
+                    self.assertEqual(timestamps_list,
+                                     [[0.25, 1.3, 2.35, 3.35, 4.45, 5.45, 6.55, 7.55, 8.600000000000001, 9.65],
+                                      [0.0, 0.0]])
+                elif i == 3:
+                    # case: processed_fps present; one video divisible by fps, the other not and longer than rand_video_max_frames
+                    self.assertEqual(num_frames_list, [10, 14])
+                    self.assertEqual(origin_fps_list, [20, 10])
+                    self.assertEqual(timestamps_list, [[0.25, 1.35, 2.45, 3.55, 4.65],
+                                                       [0.3, 1.3, 2.4000000000000004, 3.5, 4.6, 5.7, 6.7]])
+                elif i == 4:
+                    # case: processed_fps present; one video divisible by fps, the other not and shorter than rand_video_max_frames
+                    self.assertEqual(num_frames_list, [10, 12])
+                    self.assertEqual(origin_fps_list, [20, 10])
+                    self.assertEqual(timestamps_list, [[0.25, 1.35, 2.45, 3.55, 4.65],
+                                                       [0.1, 0.5, 0.9, 1.2999999999999998, 1.7000000000000002, 2.1]])
+                elif i == 5:
+                    # case: frames_timestamp present; one divisible by fps, the other not (NOTE(review): 14 frames expected, so presumably the "longer than rand_video_max_frames" case - confirm)
+                    self.assertEqual(num_frames_list, [4, 14])
+                    self.assertEqual(origin_fps_list, [20, 10])
+                    self.assertEqual(timestamps_list, [[0.25, 1.5],
+                                                       [0.1, 0.5, 1.1, 1.5, 1.9, 2.5, 2.9]])
+                elif i == 6:
+                    # case: frames_timestamp present; one video divisible by fps, the other not and shorter than rand_video_max_frames
+                    self.assertEqual(num_frames_list, [4, 12])
+                    self.assertEqual(origin_fps_list, [20, 10])
+                    self.assertEqual(timestamps_list, [[0.25, 1.5],
+                                                       [0.1, 0.5, 0.9, 1.2999999999999998, 1.7000000000000002, 2.1]])
+                elif i == 7:
+                    # case: single video
+                    self.assertEqual(num_frames_list, [4])
+                    self.assertEqual(origin_fps_list, [20])
+                    self.assertEqual(timestamps_list, [[0.25, 1.5]])
+
+    @parametrize.parametrize("add_vision_id", [(True,), (False,)])
+    def test_qwen3_vl_sft_video(self, add_vision_id):
+        """Compare video SFT tokenization with the HF processor; samples 8 and 9 are checked against fixed sizes."""
+        QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"]
+        tokenize_fn = Qwen3VLTokenizeFnConfig(processor_path=QWEN35_VL_PATH, rand_video_max_frames=14,
+                                              chat_template="qwen3.5-vl",
+                                              add_vision_id=add_vision_id).build(
+            self.tokenizer)
+        data_path = 'tests/resource/mllm_sft_video_example_data_new.jsonl'
+        hf_data_path = 'tests/resource/mllm_sft_video_hf_example_data_new.jsonl'
+        # Line i of the HF fixture is used as the HF-format counterpart of line i of the xtuner fixture.
+        with open(hf_data_path, encoding='utf-8') as f:
+            hf_raw_datas = [json.loads(line) for line in f]
+
+        total_index = [1, 4, 5, 6, 7, 8, 9]
+        with open(data_path, encoding='utf-8') as f:
+            for i, line in enumerate(f):
+                if i not in total_index:
+                    continue
+                raw_data = json.loads(line)
+                hf_raw_data = hf_raw_datas[i]
+
+                if i in [7]:
+                    # transformers cannot sample when the input is a folder of frames; it reads everything present
+                    do_sample_frames = False
+                    tokenize_fn.video_processor.fps = 3
+                    tokenize_fn.rand_video_max_frames = 24  # set above the sampled length so that no sampling happens
+                else:
+                    do_sample_frames = True
+                    tokenize_fn.video_processor.fps = 2
+                    tokenize_fn.rand_video_max_frames = 14
+
+                ret = tokenize_fn(raw_data, media_root=VIDEO_ROOT)
+                input_ids_xtuner = ret['input_ids']
+                pixel_values_xtuner: torch.Tensor = ret['pixel_values']
+                image_grid_thw_xtuner: torch.Tensor = ret['image_grid_thw']
+
+                # convert to the HF/OpenAI message format
+                messages = hf_raw_data['messages']
+                add_video_root(messages, VIDEO_ROOT)
+
+                if i not in [8, 9]:
+                    ret = self.processor.apply_chat_template(messages, add_generation_prompt=False, tokenize=True,
+                                                             do_sample_frames=do_sample_frames,
+                                                             return_dict=True, add_vision_id=add_vision_id,
+                                                             return_tensors="pt")
+                    input_ids_hf = ret['input_ids'][0]
+                    pixel_values_hf = ret['pixel_values_videos']
+                    image_grid_thw_hf = ret['video_grid_thw']
+
+                text = tokenize_fn.tokenizer.decode(input_ids_xtuner)
+
+                if i == 1:
+                    # the decoded text must not contain any 'seconds>' marker
+                    self.assertTrue('seconds>' not in text)
+                else:
+                    if i == 8:
+                        # uniform sampling when the frame count divides evenly
+                        self.assertEqual(pixel_values_xtuner.size(), (45760, 1536))
+                        self.assertEqual(text.count('seconds>'), 13)
+                    elif i == 9:
+                        # uniform sampling when it does not divide evenly and exceeds the maximum frame count
+                        self.assertEqual(pixel_values_xtuner.size(), (24640, 1536))
+                        self.assertEqual(text.count('seconds>'), 7)
+                    else:
+                        if i == 7:
+                            self.assertEqual(len(input_ids_xtuner), len(input_ids_hf))
+                        else:
+                            self.assertEqual(input_ids_xtuner, input_ids_hf.tolist())
+                        self.assertTrue('seconds>' in text)
+                        self.assertTrue(torch.allclose(pixel_values_xtuner, pixel_values_hf))
+                        self.assertTrue(torch.allclose(image_grid_thw_xtuner, image_grid_thw_hf))
+
+    @parametrize.parametrize("add_vision_id", [(True,), (False,)])
+    def test_qwen3_vl_pretrain_image(self, add_vision_id):
+        """Pretrain image samples must decode to the HF template text, with no image context token in the labels."""
+        QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"]
+        tokenize_fn = Qwen3VLTokenizeFnConfig(processor_path=QWEN35_VL_PATH,
+                                              chat_template="qwen3.5-vl",
+                                              add_vision_id=add_vision_id).build(self.tokenizer)
+        data_path = 'tests/resource/mllm_pretrain_image_example_data_new.jsonl'
+        total_step = 60
+        with open(data_path, encoding='utf-8') as f:
+            for i, line in enumerate(f):
+                if i >= total_step:
+                    break
+                raw_data = json.loads(line)
+                ret = tokenize_fn(raw_data, media_root='tests/')
+                input_ids_xtuner = ret['input_ids']
+                labels_xtuner = torch.tensor(ret['labels'])
+                input_str = tokenize_fn.tokenizer.decode(input_ids_xtuner, skip_special_tokens=False)
+                input_str = input_str.replace('<|image_pad|>', '')
+                input_xtuner_str = input_str.replace('<|vision_start|><|vision_end|>', '<|vision_start|><|image_pad|><|vision_end|>')
+                # Relabel the pretrain message as a user turn for the HF template; the user header is stripped below.
+                raw_data['messages'][0]['role'] = 'user'
+                hf_text = self.tokenizer.apply_chat_template(raw_data['messages'],
+                                               add_vision_id=add_vision_id,
+                                               tokenize=False,
+                                               enable_thinking=False,
+                                               add_generation_prompt=False)
+                hf_text = hf_text.replace('<|im_start|>user\n', '')
+                hf_text = hf_text[:-1]  # remove \n
+                self.assertEqual(input_xtuner_str, hf_text)
+                self.assertTrue((labels_xtuner == tokenize_fn.img_context_token_id).sum() == 0)
+    
+    @parametrize.parametrize("add_vision_id", [(True,), (False,)])
+    def test_qwen3_vl_pretrain_video(self, add_vision_id):
+        """Labels of pretrain video samples must not contain the video context token."""
+        model_path = os.environ["QWEN3_5_MOE_PATH"]
+        config = Qwen3VLTokenizeFnConfig(processor_path=model_path, chat_template="qwen3.5-vl",
+                                         add_vision_id=add_vision_id)
+        tokenize_fn = config.build(self.tokenizer)
+        max_samples = 60  # only the first 60 lines of the fixture are checked
+        with open('tests/resource/mllm_pretrain_video_example_data_new.jsonl', encoding='utf-8') as f:
+            for index, line in enumerate(f):
+                if index >= max_samples:
+                    break
+                sample = json.loads(line)
+                labels = torch.tensor(tokenize_fn(sample, media_root=VIDEO_ROOT)['labels'])
+                video_token_hits = (labels == tokenize_fn.video_context_token_id).sum()
+                self.assertTrue(video_token_hits == 0)
diff --git a/tests/resource/mllm_pretrain_image_example_data_new.jsonl b/tests/resource/mllm_pretrain_image_example_data_new.jsonl
new file mode 100644
index 000000000..ebb3512a6
--- /dev/null
+++ b/tests/resource/mllm_pretrain_image_example_data_new.jsonl
@@ -0,0 +1,12 @@
+{"id": 1, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}},{"type":"text", "text": "图片中的狗是什么颜色?"}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}},{"type": "text", "text": "图中有几只猫?"}]}]}
+{"id": 2, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "这两张图片都有包括动物相关的内容,都包括了非常温馨的画面，太棒了！！！"}]}]}
+{"id": 3, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "当狗凝视窗外的时候，它看起来是在渴望着什么东西。 这也许暗示了它想要到外面去或者与某人或某物进行互动。"}]}]}
+{"id": 4, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "这两张图片都有包括动物相关的内容,都包括了非常温馨的画面，太棒了！！！"}]}]}
+{"id": 5, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "这两张图片都有包括动物相关的内容,都包括了非常温馨的画面，太棒了！！！"}]}]}
+{"id": 6, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "这两张图片都有包括动物相关的内容,都包括了非常温馨的画面，太棒了！！！"}]}]}
+{"id": 7, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What color is the dog in the picture?"}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "How many cats are in the picture?"}]}]}
+{"id": 8, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "Both of these pictures include animal-related content and very warm scenes, awesome!!!"}]}]}
+{"id": 9, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "When the dog gazes out the window, it seems to be longing for something. This may imply that it wants to go outside or interact with someone or something."}]}]}
+{"id": 10, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "Both of these pictures include animal-related content and very warm scenes, awesome!!!"}]}]}
+{"id": 11, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "Both of these pictures include animal-related content and very warm scenes, awesome!!!"}]}]}
+{"id": 12, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "Both of these pictures include animal-related content and very warm scenes, awesome!!!"}]}]}
diff --git a/tests/resource/mllm_pretrain_video_example_data_new.jsonl b/tests/resource/mllm_pretrain_video_example_data_new.jsonl
new file mode 100644
index 000000000..15d42a52a
--- /dev/null
+++ b/tests/resource/mllm_pretrain_video_example_data_new.jsonl
@@ -0,0 +1,5 @@
+{"id": 1, "messages": [{"role": "pretrain", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182,"origin_fps": 30.0}}, {"type": "text", "text": "请描述下视频内容？一男一女在打网球。请简要解释下网球，网球是一项运动，运动员使用球拍将球击打过网进入对方场地。目标是通过让球落入对方场地且对方无法回击来得分。网球可以单人对战（单打）或双人组队对战（双打）。"}]}]}
+{"id": 2, "messages": [{"role": "pretrain", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182,"origin_fps": 30.0}}, {"type": "text", "text": "视频中在做什么？打网球"}]}]}
+{"id": 3, "messages": [{"role": "pretrain", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182,"origin_fps": 30.0}}, {"type": "text", "text": "Please describe the video content? A man and a woman are playing tennis. Please briefly explain tennis. Tennis is a sport where players use rackets to hit the ball over the net into the opponent's court. The goal is to score points by making the ball land in the opponent's court and the opponent fails to return it. Tennis can be played as singles or doubles."}]}]}
+{"id": 4, "messages": [{"role": "pretrain", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182,"origin_fps": 30.0}}, {"type": "text", "text": "What is happening in the video? Playing tennis"}]}]}
+{"id": 5, "messages": [{"role": "pretrain", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182,"origin_fps": 30.0}}, {"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182,"origin_fps": 30.0}},{"type": "text", "text": "What is happening in the video? Playing tennis"}]}]}
\ No newline at end of file
diff --git a/tests/resource/mllm_sft_multi_image_example_data_new.jsonl b/tests/resource/mllm_sft_multi_image_example_data_new.jsonl
new file mode 100644
index 000000000..1e17b88c1
--- /dev/null
+++ b/tests/resource/mllm_sft_multi_image_example_data_new.jsonl
@@ -0,0 +1,11 @@
+{"id": 1, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "两张图片有啥相同之处？"}]}, {"role": "assistant", "content": "两幅图片中都存在动物。"}, {"role": "user", "content": "都有些什么动物？"}, {"role": "assistant", "content": "第一幅图片中有一只狗，第二幅图片中有两只猫。"}]}
+{"id": 2, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "两张图片有啥相同之处？"}]}, {"role": "assistant", "content": "两幅图片中都存在动物。"}, {"role": "user", "content": "都有些什么动物？"}, {"role": "assistant", "content": "第一幅图片中有两只猫，第二幅图片中有一只狗。"}]}
+{"id": 3, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "请描述下第二幅图片中的狗是什么颜色？"}]}, {"role": "assistant", "content": "图片中的狗是棕色的。"}]}
+{"id": 4, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "请描述下第一幅图片中有几只猫？"}]}, {"role": "assistant", "content": "图片中有2只猫。"}, {"role": "user", "content": "第一幅图中还有啥东西？"}, {"role": "assistant", "content": "第一幅图片中还有2个电视遥控器"}, {"role": "user", "content": "两只猫在做什么？"}, {"role": "assistant", "content": "它们悠闲的躺在沙发上"}, {"role": "user", "content": "请描述下第一幅图片？"}, {"role": "assistant", "content": "图片中有两只猫，悠闲的躺在沙发上，旁边还有2个电视遥控器。"}]}
+{"id": 5, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "请描述下第二幅图片中这只狗有什么类型的项圈？"}]}, {"role": "assistant", "content": "这只狗有一条红色的项圈。"}]}
+{"id": 6, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "What are the similarities between the two images?"}]}, {"role": "assistant", "content": "Both images contain animals."}, {"role": "user", "content": "What animals are there?"}, {"role": "assistant", "content": "The first image contains a dog, and the second image contains two cats."}]}
+{"id": 7, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What are the similarities between the two images?"}]}, {"role": "assistant", "content": "Both images contain animals."}, {"role": "user", "content": "What animals are there?"}, {"role": "assistant", "content": "The first image contains two cats, and the second image contains a dog."}]}
+{"id": 8, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "Can you describe the color of the dog in the second image?"}]}, {"role": "assistant", "content": "The dog in the image is brown."}]}
+{"id": 9, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "How many cats are in the first image?"}]}, {"role": "assistant", "content": "There are 2 cats in the image."}, {"role": "user", "content": "What else is in the first image?"}, {"role": "assistant", "content": "There are also 2 TV remotes in the first image."}, {"role": "user", "content": "What are the two cats doing?"}, {"role": "assistant", "content": "They are leisurely lying on the sofa."}, {"role": "user", "content": "Can you describe the first image?"}, {"role": "assistant", "content": "The image shows two cats leisurely lying on the sofa, with 2 TV remotes next to them."}]}
+{"id": 10, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "Can you describe the type of collar the dog in the second image has?"}]}, {"role": "assistant", "content": "The dog has a red collar."}]}
+{"id": 11, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "How many cats are in the first image?"}]}, {"role": "assistant", "content": "There are 2 cats in the image."}, {"role": "user", "content": "What else is in the first image?"}, {"role": "assistant", "content": "There are also 2 TV remotes in the first image."}, {"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What are the two cats doing?"}]}, {"role": "assistant", "content": "They are leisurely lying on the sofa."}, {"role": "user", "content": "Can you describe the first image?"}, {"role": "assistant", "content": "The image shows two cats leisurely lying on the sofa, with 2 TV remotes next to them."}]}
diff --git a/tests/resource/mllm_sft_single_image_example_data_new.jsonl b/tests/resource/mllm_sft_single_image_example_data_new.jsonl
new file mode 100644
index 000000000..7837da245
--- /dev/null
+++ b/tests/resource/mllm_sft_single_image_example_data_new.jsonl
@@ -0,0 +1,22 @@
+{"id": 1, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "图片中的狗是什么颜色？"}]}, {"role": "assistant", "content": "图片中的狗是棕色的。"}, {"role": "user", "content": "这只狗有什么类型的项圈？"}, {"role": "assistant", "content": "这只狗有一条红色的项圈。"}, {"role": "user", "content": "那只狗在窗后的位置是什么？"}, {"role": "assistant", "content": "那只狗正坐在窗户后面。"}, {"role": "user", "content": "那只狗望着窗外时，它的表情是怎样的？"}, {"role": "assistant", "content": "当狗凝视窗外的时候，它看起来是在渴望着什么东西。 这也许暗示了它想要到外面去或者与某人或某物进行互动。"}, {"role": "user", "content": "你能描述一下狗的毛吗？"}, {"role": "assistant", "content": "这只狗有蓬松、毛茸茸的棕色皮毛，使它看起来既可爱又温顺。"}]}
+{"id": 2, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "图片中的狗是什么颜色？"}]}, {"role": "assistant", "content": "图片中的狗是棕色的。"}]}
+{"id": 3, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "这只狗有什么类型的项圈？"}]}, {"role": "assistant", "content": "这只狗有一条红色的项圈。"}]}
+{"id": 4, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "那只狗在窗后的位置是什么？"}]}, {"role": "assistant", "content": "那只狗正坐在窗户后面。"}]}
+{"id": 5, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "那只狗望着窗外时，它的表情是怎样的？"}]}, {"role": "assistant", "content": "当狗凝视窗外的时候，它看起来是在渴望着什么东西。 这也许暗示了它想要到外面去或者与某人或某物进行互动。"}]}
+{"id": 6, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "你能描述一下狗的毛吗？"}]}, {"role": "assistant", "content": "这只狗有蓬松、毛茸茸的棕色皮毛，使它看起来既可爱又温顺。"}]}
+{"id": 7, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What color is the dog in the picture?"}]}, {"role": "assistant", "content": "The dog in the picture is brown."}, {"role": "user", "content": "What type of collar does this dog have?"}, {"role": "assistant", "content": "This dog has a red collar."}, {"role": "user", "content": "What is the position of the dog behind the window?"}, {"role": "assistant", "content": "The dog is sitting behind the window."}, {"role": "user", "content": "What is the dog's expression when it looks out the window?"}, {"role": "assistant", "content": "When the dog gazes out the window, it seems to be longing for something. This might suggest that it wants to go outside or interact with someone or something."}, {"role": "user", "content": "Can you describe the dog's fur?"}, {"role": "assistant", "content": "The dog has fluffy, furry brown fur, which makes it look both cute and gentle."}]}
+{"id": 8, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What color is the dog in the picture?"}]}, {"role": "assistant", "content": "The dog in the picture is brown."}]}
+{"id": 9, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What type of collar does this dog have?"}]}, {"role": "assistant", "content": "This dog has a red collar."}]}
+{"id": 10, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What is the position of the dog behind the window?"}]}, {"role": "assistant", "content": "The dog is sitting behind the window."}]}
+{"id": 11, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What is the dog's expression when it looks out the window?"}]}, {"role": "assistant", "content": "When the dog gazes out the window, it seems to be longing for something. This might suggest that it wants to go outside or interact with someone or something."}]}
+{"id": 12, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "Can you describe the dog's fur?"}]}, {"role": "assistant", "content": "The dog has fluffy, furry brown fur, which makes it look both cute and gentle."}]}
+{"id": 13, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "图片中有几只猫？"}]}, {"role": "assistant", "content": "图片中有2只猫。"}, {"role": "user", "content": "图中还有啥东西？"}, {"role": "assistant", "content": "图片中还有2个电视遥控器"}, {"role": "user", "content": "两只猫在做什么？"}, {"role": "assistant", "content": "它们悠闲的躺在沙发上"}, {"role": "user", "content": "请描述下这个图片？"}, {"role": "assistant", "content": "图片中有两只猫，悠闲的躺在沙发上，旁边还有2个电视遥控器。"}]}
+{"id": 14, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "图片中有几只猫？"}]}, {"role": "assistant", "content": "图片中有2只猫。"}]}
+{"id": 15, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "图中除了猫还有啥东西？"}]}, {"role": "assistant", "content": "图片中还有2个电视遥控器"}]}
+{"id": 16, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "两只猫在做什么？"}]}, {"role": "assistant", "content": "它们悠闲的躺在沙发上"}]}
+{"id": 17, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "请描述下这个图片？"}]}, {"role": "assistant", "content": "图片中有两只猫，悠闲的躺在沙发上，旁边还有2个电视遥控器。"}]}
+{"id": 18, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "How many cats are in the picture?"}]}, {"role": "assistant", "content": "There are 2 cats in the picture."}, {"role": "user", "content": "What else is in the picture?"}, {"role": "assistant", "content": "There are also 2 TV remotes in the picture."}, {"role": "user", "content": "What are the two cats doing?"}, {"role": "assistant", "content": "They are leisurely lying on the sofa."}, {"role": "user", "content": "Can you describe the picture?"}, {"role": "assistant", "content": "The picture shows two cats leisurely lying on the sofa, with 2 TV remotes next to them."}]}
+{"id": 19, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "How many cats are in the picture?"}]}, {"role": "assistant", "content": "There are 2 cats in the picture."}]}
+{"id": 20, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "What else is in the picture besides the cats?"}]}, {"role": "assistant", "content": "There are also 2 TV remotes in the picture."}]}
+{"id": 21, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "What are the two cats doing?"}]}, {"role": "assistant", "content": "They are leisurely lying on the sofa."}]}
+{"id": 22, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "Can you describe the picture?"}]}, {"role": "assistant", "content": "The picture shows two cats leisurely lying on the sofa, with 2 TV remotes next to them."}]}
diff --git a/tests/resource/mllm_sft_video_example_data_new.jsonl b/tests/resource/mllm_sft_video_example_data_new.jsonl
new file mode 100644
index 000000000..023eb7a80
--- /dev/null
+++ b/tests/resource/mllm_sft_video_example_data_new.jsonl
@@ -0,0 +1,10 @@
+{"id": 1, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720]}}, {"type": "text", "text": "请描述下视频内容？"}]}, {"role": "assistant", "content": "一男一女在打网球"}, {"role": "user", "content": "请简要解释下网球"}, {"role": "assistant", "content": "网球是一项运动，运动员使用球拍将球击打过网进入对方场地。目标是通过让球落入对方场地且对方无法回击来得分。网球可以单人对战（单打）或双人组队对战（双打）。"}]}
+{"id": 2, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720]}}, {"type": "text", "text": "视频中在做什么？"}]}, {"role": "assistant", "content": "打网球"}]}
+{"id": 3, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720]}}, {"type": "text", "text": "Can you describe the video content?"}]}, {"role": "assistant", "content": "A man and a woman are playing tennis."}, {"role": "user", "content": "Can you briefly explain tennis?"}, {"role": "assistant", "content": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. It can be played individually (singles) or in pairs (doubles)."}]}
+{"id": 4, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720]}}, {"type": "text", "text": "What is happening in the video?"}]}, {"role": "assistant", "content": "Playing tennis."}]}
+{"id": 5, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0}}, {"type": "text", "text": "Can you describe the video content?"}]}, {"role": "assistant", "content": "A man and a woman are playing tennis."}, {"role": "user", "content": "Can you briefly explain tennis?"}, {"role": "assistant", "content": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. It can be played individually (singles) or in pairs (doubles)."}]}
+{"id": 6, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0}}, {"type": "text", "text": "What is happening in the video?"}]}, {"role": "assistant", "content": "Playing tennis."}]}
+{"id": 7, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0}}, {"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0}}, {"type": "text", "text": "两个视频中都在做什么？"}]}, {"role": "assistant", "content": "打网球"}]}
+{"id": 8, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis_frames_4fps/", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0, "processed_video_length": 23, "processed_fps": 4}}, {"type": "video", "video": {"url": "tennis_frames_2fps/", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0, "processed_video_length": 13, "processed_fps": 2}}, {"type": "text", "text": "两个视频中都在做什么？"}]}, {"role": "assistant", "content": "打网球"}]}
+{"id": 9, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis_frames_4fps/", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0, "processed_video_length": 23, "processed_fps": 4}}, {"type": "video", "video": {"url": "tennis_frames_2fps/", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0, "processed_video_length": 13, "processed_fps": 2}}, {"type": "text", "text": "两个视频中都在做什么？"}]}, {"role": "assistant", "content": "打网球"}]}
+{"id": 10, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis_frames_15fps/", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0, "processed_video_length": 91, "processed_fps": 15}}, {"type": "text", "text": "视频中在做什么？"}]}, {"role": "assistant", "content": "打网球"}]}
diff --git a/tests/resource/mllm_sft_video_hf_example_data_new.jsonl b/tests/resource/mllm_sft_video_hf_example_data_new.jsonl
new file mode 100644
index 000000000..146928d1d
--- /dev/null
+++ b/tests/resource/mllm_sft_video_hf_example_data_new.jsonl
@@ -0,0 +1,10 @@
+{"id": 1, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "请描述下视频内容？"}]}, {"role": "assistant", "content": [{"type": "text","text":"一男一女在打网球"}]},{"role": "user", "content": [{"type": "text","text": "请简要解释下网球"}]},{"role": "assistant", "content": [{"type": "text","text": "网球是一项运动，运动员使用球拍将球击打过网进入对方场地。目标是通过让球落入对方场地且对方无法回击来得分。网球可以单人对战（单打）或双人组队对战（双打）。"}]}]}
+{"id": 2, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "视频中在做什么？"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]}
+{"id": 3, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "Can you describe the video content?"}]}, {"role": "assistant", "content": [{"type": "text","text": "A man and a woman are playing tennis."}]}, {"role": "user", "content": [{"type": "text","text": "Can you briefly explain tennis?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. It can be played individually (singles) or in pairs (doubles)."}]}]}
+{"id": 4, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "What is happening in the video?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Playing tennis."}]}]}
+{"id": 5, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "Can you describe the video content?"}]}, {"role": "assistant", "content": [{"type": "text","text": "A man and a woman are playing tennis."}]}, {"role": "user", "content": [{"type": "text","text": "Can you briefly explain tennis?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. It can be played individually (singles) or in pairs (doubles)."}]}]}
+{"id": 6, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "What is happening in the video?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Playing tennis."}]}]}
+{"id": 7, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "video", "path": "tennis.mp4"},{"type": "text", "text": "两个视频中都在做什么？"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]}
+{"id": 8, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis_frames_4fps/"}, {"type": "video", "path": "tennis_frames_2fps/"},{"type": "text", "text": "两个视频中都在做什么？"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]}
+{"id": 9, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis_frames_4fps/"}, {"type": "video", "path": "tennis_frames_2fps/"},{"type": "text", "text": "两个视频中都在做什么？"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]}
+{"id": 10, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis_frames_15fps/"},{"type": "text", "text": "视频中在做什么？"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]}
\ No newline at end of file
diff --git a/tests/resource/mllm_video_frame_test_data_new.jsonl b/tests/resource/mllm_video_frame_test_data_new.jsonl
new file mode 100644
index 000000000..7c566ff90
--- /dev/null
+++ b/tests/resource/mllm_video_frame_test_data_new.jsonl
@@ -0,0 +1,8 @@
+{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720]}}, {"type": "text", "text": "两个视频中都在做什么?"},{"type": "video", "video": {"url": "yyy.mp4", "image_wh": [460, 340]}}]}, {"role": "assistant", "content": "打网球"}]}
+{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":100, "origin_fps":10}}, {"type": "text", "text": "两个视频中都在做什么?"}, {"type": "video", "video": {"url": "yyy.mp4", "image_wh": [460, 340], "origin_video_length":12, "origin_fps":8}}]}, {"role": "assistant", "content": "打网球"}]}
+{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":100, "origin_fps":10}}, {"type": "text", "text": "两个视频中都在做什么?"}, {"type": "video", "video": {"url": "yyy.mp4", "image_wh": [460, 340], "origin_video_length":1, "origin_fps":1}}]}, {"role": "assistant", "content": "打网球"}]}
+{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":100, "origin_fps":20,"processed_video_length":50, "processed_fps":10}}, {"type": "text", "text": "两个视频中都在做什么?"}, {"type": "video", "video": {"url": "yyy.mp4", "image_wh": [460, 340], "processed_video_length":36, "processed_fps":5, "origin_video_length":72, "origin_fps":10}}]}, {"role": "assistant", "content": "打网球"}]}
+{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":100, "origin_fps":20,"processed_video_length":50, "processed_fps":10}}, {"type": "text", "text": "两个视频中都在做什么?"}, {"type": "video", "video": {"url": "yyy.mp4", "image_wh": [460, 340], "processed_video_length":12, "processed_fps":5, "origin_video_length":24, "origin_fps":10}}]}, {"role": "assistant", "content": "打网球"}]}
+{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":40, "origin_fps":20,"processed_video_length":8, "processed_fps":4, "frames_timestamp":[0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75]}}, {"type": "text", "text": "两个视频中都在做什么?"}, {"type": "video", "video": {"url": "yyy.mp4", "image_wh": [460, 340], "processed_video_length":16, "processed_fps":5, "origin_video_length":32, "origin_fps":10,"frames_timestamp":[0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.2, 2.4, 2.6, 2.8, 3.0]}}]}, {"role": "assistant", "content": "打网球"}]}
+{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":40, "origin_fps":20,"processed_video_length":8, "processed_fps":4, "frames_timestamp":[0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75]}}, {"type": "text", "text": "两个视频中都在做什么?"}, {"type": "video", "video": {"url": "yyy.mp4", "image_wh": [460, 340], "processed_video_length":12, "processed_fps":5, "origin_video_length":24, "origin_fps":10,"frames_timestamp":[0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.2]}}]}, {"role": "assistant", "content": "打网球"}]}
+{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":40, "origin_fps":20,"processed_video_length":8, "processed_fps":4, "frames_timestamp":[0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75]}}, {"type": "text", "text": "视频中都在做什么?"}]}, {"role": "assistant", "content": "打网球"}]}
diff --git a/tests/resource/qwen35_tokenize_data.jsonl b/tests/resource/qwen35_tokenize_data.jsonl
new file mode 100644
index 000000000..b367e5fa3
--- /dev/null
+++ b/tests/resource/qwen35_tokenize_data.jsonl
@@ -0,0 +1,17 @@
+{"id":1,"messages": [{"role": "system", "content": "这是单轮无think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道"}]}
+{"id":2,"messages": [{"role": "system", "content": "这是单轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道","reasoning_content": "这是 reasoning_content 内容"}]}
+{"id":3,"messages": [{"role": "system", "content": "这是单轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "\n我需要先调用一些工具才能知道","reasoning_content": "\n这是 reasoning_content 内容\n"}]}
+{"id":4,"messages": [{"role": "system", "content": "这是多轮无think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道", "loss": false},{"role": "user", "content": "这是第二个问题"},{"role": "assistant", "content": "好的，我知道这是第二个问题"}]}
+{"id":5,"messages": [{"role": "system", "content": "这是多轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道"},{"role": "user", "content": "这是第二个问题"},{"role": "assistant", "content": "好的，我知道这是第二个问题", "reasoning_content": "这是 reasoning_content 内容"}]}
+{"id":6,"messages": [{"role": "system", "content": "这是多轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 1"},{"role": "user", "content": "这是第二个问题"},{"role": "assistant", "content": "好的，我知道这是第二个问题"},{"role": "user", "content": "这是第三个问题"},{"role": "assistant", "content": "好的，我知道这是第三个问题", "reasoning_content": "这是 reasoning_content 内容 2"}]}
+{"id":7,"messages": [{"role": "system", "content": "这是单轮无think+toolcall例子"},{"role": "user", "content": "北京今天的天气如何？"},{"role": "assistant", "content": "我需要先调用一些工具才能知道", "tool_calls":[{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments":  {"location": "Boston"}}}]},{"role": "tool","content": "35"},{"role": "assistant", "content": "基于我的观察，今天北京的天气是35度。"}],"tools": [{"type":"function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}},{"type": "function", "function": {"name":"get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters":{"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]}
+{"id":8,"messages": [{"role": "system", "content": "这是单轮有think+toolcall例子"}, {"role": "user", "content": "北京今天的天气如何？"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察，今天北京的天气是35度。", "reasoning_content": "这是 reasoning_content 内容"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]}
+{"id":9,"messages": [{"role": "system", "content": "这是单轮有think+toolcall例子"}, {"role": "user", "content": "北京今天的天气如何？"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察，今天北京的天气是35度。","reasoning_content": "这是最后一个 reasoning_content 内容"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]}
+{"id":10,"messages": [{"role": "system", "content": "这是多轮无think+toolcall例子"}, {"role": "user", "content": "北京今天的天气如何？"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察，今天北京的天气是35度。", "loss": false}, {"role": "user", "content": "这是第二个问题。上海的天气如何"}, {"role": "assistant", "content": "好的，我知道这是第二个问题。我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察，今天上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]}
+{"id":11,"messages": [{"role": "system", "content": "这是多轮有think+toolcall例子。只有一个用户 user 输入。只有一次真 user 输入 表示整个对话过程中只有 user message。此时中间的所有 think 过程都会保留"}, {"role": "user", "content": "北京和上海今天的天气如何？"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "我现在知道北京的天气了，我需要继续知道上海的天气", "reasoning_content": "这是 reasoning_content 内容 2", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察，今天北京的天气是35度，上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]}
+{"id":12,"messages": [{"role": "system", "content": "这是多轮有think+toolcall例子。有多个用户 user 输入。一旦再次来了一个新的真 user 输入，则之前的 think 内容会全部丢掉，因为相当于是一次新的回话"}, {"role": "user", "content": "北京今天天气如何？"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 1", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察，今天北京的天气是35度。"}, {"role": "user", "content": "这是第二个问题。上海的天气如何？"}, {"role": "assistant", "content": "现在是第二个问题了，我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 2", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察，今天上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]}
+{"id":13,"messages": [{"role": "system", "content": "你是一个专业的图像分析助手，能够理解和分析多张图片。"}, {"role": "user", "content": [{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "text", "text": "请描述这两张图片的内容，它们有什么相同点和不同点？"}]}, {"role": "assistant", "content": "我需要仔细对比两张图片的主体、背景、光线等要素。", "reasoning_content": "第一张图片和第二张图片的主体都是同一只猫，背景都是室内环境，光线也相似。它们的相同点是都展示了这只猫在窗台上休息的场景。不同点是第一张图片中猫的姿势是侧卧，而第二张图片中猫的姿势是仰卧。"}, {"role": "user", "content": [{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "text", "text": "这张新图片和之前的图片相比，有什么新的元素出现？"}]}, {"role": "assistant", "content": "与前两张图片相比，这张新图片中出现了不同的构图角度和新的视觉元素。"}, {"role": "user", "content": [{"type": "text", "text": "综合以上三张图片，你认为它们想表达什么主题？"}]}, {"role": "assistant", "content": "需要从整体角度总结三张图片的共同叙事逻辑和情感表达。", "reasoning_content": "这三张图片共同表达了一个主题：猫在室内环境中的不同状态和情感。第一张图片展示了猫的安静和放松，第二张图片展示了猫的舒适和满足，而第三张图片则通过不同的构图和视觉元素，传达了猫在这个环境中的多样性和丰富性。整体上，这些图片共同描绘了猫在室内生活中的多样化表现，表达了对猫的喜爱和对其生活状态的关注。"}]}
+{"id":14,"messages": [{"role": "system", "content": "你是一个专业的视频分析助手，能够理解和分析视频内容。"}, {"role": "user", "content": [{"type": "video", "video": {"url":"https://example.com/video/demo.mp4"}}, {"type": "text", "text": "请描述这个视频的主要内容，并分析其中的关键事件。"}]}, {"role": "assistant", "content": "让我仔细观察这个视频的每一帧内容。", "reasoning_content": "视频开始时展示了一个城市街道的场景，有行人和车辆在移动。随后镜头切换到一家咖啡店的内部，可以看到顾客在排队点单。接着视频展示了咖啡制作的过程，包括研磨咖啡豆、萃取咖啡液和打奶泡。最后视频以一杯精美的拉花咖啡特写结束。整个视频的节奏流畅，画面清晰。"}]}
+{"id":15,"messages": [{"role": "system", "content": "你是一个专业的视频分析助手，能够理解和分析多个视频内容。"}, {"role": "user", "content": [{"type": "video", "video": {"url":"https://example.com/video/cooking_tutorial.mp4"}}, {"type": "video", "video": {"url":"https://example.com/video/cooking_result.mp4"}}, {"type": "text", "text": "请分析这两个视频，第一个视频是烹饪教程，第二个是最终成品。它们之间有什么联系？"}]}, {"role": "assistant", "content": "我需要仔细对比这两个视频的内容和逻辑关系。", "reasoning_content": "首先看第一个烹饪教程视频：视频展示了准备食材的过程，包括切菜、腌制肉类等步骤。然后是烹饪过程，展示了翻炒、调味等操作。最后视频展示了装盘。第二个成品视频展示了最终菜品的摆盘和特写镜头。两个视频的联系在于：第一个视频是制作过程，第二个视频是制作完成后的成品展示。它们共同构成了一个完整的从制作到呈现的叙事链条。"}, {"role": "user", "content": [{"type": "video", "video": {"url":"https://example.com/video/failed_attempt.mp4"}}, {"type": "text", "text": "这里还有一个失败尝试的视频，和前两个相比有什么问题？"}]}, {"role": "assistant", "content": "让我对比分析这个失败案例与之前的成功案例。", "reasoning_content": "通过对比可以看出几个关键问题：首先，火候控制不当，视频中可以看到食材有些焦糊。其次，调味顺序有问题，盐放得太早导致食材出水过多。第三，翻炒的频率不够，导致受热不均匀。相比之下，第一个成功视频中火候掌握得当，调味时机准确，翻炒动作连贯。这些细节差异最终导致了截然不同的结果。"}, {"role": "user", "content": [{"type": "text", "text": "基于这三个视频，总结一下成功烹饪这道菜的关键要点。"}]}, {"role": "assistant", "content": "需要从成功和失败的对比中提炼出关键要点。", "reasoning_content": "综合三个视频的分析，成功烹饪这道菜的关键要点包括：第一，火候控制是核心，需要保持中火避免焦糊；第二，调味顺序很重要，盐应在出锅前加入；第三，翻炒要频繁均匀，确保食材受热一致；第四，食材预处理要到位，切块的均匀度影响受热；第五，要有耐心，每个步骤都不能急于求成。失败视频恰恰反证了这些要点的重要性。"}]}
+{"id":16,"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "demo", "origin_video_length": 196, "origin_fps": 29.97002997002997, "processed_video_length": 26, "processed_fps": 4.0, "image_wh": [202, 360], "num_frames":[1,1,1], "conversation_timestamps": [0.0, 10.0]}}, {"type": "text", "text": "Describe the video in detail. [NO_REASONING]"}]}, {"role": "assistant", "content": "A young boy is sitting in the passenger seat of a car, wearing a white and blue striped polo shirt with a 'UTSA' logo. He is holding a pair of sunglasses and gradually puts them on his face. The background outside the car window shows a fence, trees, and a clear sky."}]}
+{"id":17,"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "demo", "origin_video_length": 196, "origin_fps": 29.97002997002997, "processed_video_length": 26, "processed_fps": 4.0, "image_wh": [202, 360], "timestamps":[0,1,2], "conversation_timestamps": [0.0, 10.0]}}, {"type": "text", "text": "Describe the video in detail. [NO_REASONING]"}]}, {"role": "assistant", "content": "A young boy is sitting in the passenger seat of a car, wearing a white and blue striped polo shirt with a 'UTSA' logo. He is holding a pair of sunglasses and gradually puts them on his face. The background outside the car window shows a fence, trees, and a clear sky."}]}
\ No newline at end of file
diff --git a/xtuner/v1/data_proto/messages/__init__.py b/xtuner/v1/data_proto/messages/__init__.py
index b0e841f70..9b95ad829 100644
--- a/xtuner/v1/data_proto/messages/__init__.py
+++ b/xtuner/v1/data_proto/messages/__init__.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .base import BaseMessages
 from .chat import ChatMessages
+from .qwen35_chat import Qwen35ChatMessages
 
 
-__all__ = ["BaseMessages", "ChatMessages"]
+__all__ = ["BaseMessages", "ChatMessages", "Qwen35ChatMessages"]
diff --git a/xtuner/v1/data_proto/messages/chat.py b/xtuner/v1/data_proto/messages/chat.py
index fcc4adc64..55e1dd1f8 100644
--- a/xtuner/v1/data_proto/messages/chat.py
+++ b/xtuner/v1/data_proto/messages/chat.py
@@ -227,7 +227,7 @@ def get_prompt(self, chat_template: ChatTemplate) -> str:
                 prompt += chat_template.sep
         return prompt
 
-    def tokenize(self, tokenizer: PreTrainedTokenizer, chat_template: ChatTemplate) -> Dict:
+    def tokenize(self, tokenizer: PreTrainedTokenizer, chat_template: ChatTemplate, **kwargs) -> Dict:
         input_ids = tokenizer.encode("", add_special_tokens=False)
         labels = [IGNORE_INDEX for _ in input_ids]
 
diff --git a/xtuner/v1/data_proto/messages/qwen35_chat.py b/xtuner/v1/data_proto/messages/qwen35_chat.py
new file mode 100644
index 000000000..dcb09d5d5
--- /dev/null
+++ b/xtuner/v1/data_proto/messages/qwen35_chat.py
@@ -0,0 +1,421 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import json
+from typing import Dict, List, Optional
+
+from pydantic import BaseModel, ConfigDict
+
+from transformers import PreTrainedTokenizer
+from xtuner.v1.data_proto.templates import HybridChatTemplate
+
+
+def get_offset_mapping(tokenizer, text: str):  # Rebuild per-token (start, end) char spans in `text` by decoding tokens incrementally.
+    encoding = tokenizer(text, add_special_tokens=False)
+    input_ids = encoding["input_ids"]
+    offset_mapping = []  # ends up with one (start, end) tuple per entry of input_ids
+    pos = 0  # search cursor into `text`: end of the last matched span
+    pending_ids = []  # type: ignore
+    max_pending = 8  # cap on how many tokens may accumulate while waiting for a match
+
+    def _flush_pending(start, end):  # give every pending token the same (start, end) span and advance pos to end
+        nonlocal pending_ids, pos
+        offset_mapping.extend([(start, end)] * len(pending_ids))
+        pos = end
+        pending_ids = []
+
+    def _flush_pending_as_empty():  # give every pending token a zero-width (pos, pos) span; pos is unchanged
+        nonlocal pending_ids
+        offset_mapping.extend([(pos, pos)] * len(pending_ids))
+        pending_ids = []
+
+    for token_id in input_ids:
+        pending_ids.append(token_id)
+        decoded = tokenizer.decode(pending_ids, skip_special_tokens=False)
+        if not decoded:  # nothing decoded yet; keep accumulating tokens
+            continue
+        idx = text.find(decoded, pos)  # only search at or after the previous match
+        if idx != -1:
+            end = idx + len(decoded)
+            _flush_pending(idx, end)
+        elif "\ufffd" not in decoded or len(pending_ids) >= max_pending:  # U+FFFD presumably marks a partially decoded character, so keep waiting unless the cap is hit
+            _flush_pending_as_empty()
+
+    if pending_ids:  # leftovers that never produced a match
+        _flush_pending_as_empty()
+    return input_ids, offset_mapping
+
+
+def render_content(content, do_vision_count, image_count, video_count, add_vision_id=False):  # Flatten str-or-list message content into text with vision placeholders; returns (text, image_count, video_count).
+    if isinstance(content, str):
+        return content, image_count, video_count
+    result = ""
+    for item in content:
+        if "image" in item or "image_url" in item or item.get("type") == "image":
+            if do_vision_count:  # counters advance only when the caller asks for counting
+                image_count += 1
+            if add_vision_id:
+                result += f"Picture {image_count}: "
+            result += "<|vision_start|><|image_pad|><|vision_end|>"
+        elif "video" in item or item.get("type") == "video":
+            if do_vision_count:
+                video_count += 1
+            if add_vision_id:
+                result += f"Video {video_count}: "
+
+            video_content = item.get("video", {})
+            assert isinstance(video_content, dict), f"video_content must be a dict, but got {type(video_content)}"
+            timestamps = video_content.get("timestamps", [])
+            if len(timestamps) > 0:  # one timestamp-prefixed placeholder per entry
+                video_placeholder = ""
+                for timestamp in timestamps:
+                    video_placeholder += f"<{timestamp:.1f} seconds><|vision_start|><|video_pad|><|vision_end|>"
+                result += video_placeholder
+            else:
+                # Each video may have n frames, and each frame may occupy m tokens
+                assert "num_frames" in video_content, "num_frames must be in video_content"
+                num_frames = video_content["num_frames"]
+                for _ in range(len(num_frames)):  # one placeholder per entry of num_frames; the entry values are not read here
+                    result += "<|vision_start|><|video_pad|><|vision_end|>"
+            conversation_timestamp = video_content.get("conversation_timestamps", [])
+            if len(conversation_timestamp) > 0:  # indices 0 and 1 are read, so a non-empty value needs at least two entries
+                start_time = conversation_timestamp[0]
+                end_time = conversation_timestamp[1]
+                timestamps = f"<{start_time:.1f}-{end_time:.1f} seconds>"
+                result += timestamps
+
+        elif "text" in item:  # items matching none of the branches contribute nothing
+            result += item["text"]
+    return result, image_count, video_count
+
+
+# Qwen3.5 tool system prompt (XML format, which differs from Qwen3)
+_QWEN35_TOOL_SYSTEM = "# Tools\n\nYou have access to the following functions:\n\n<tools>"
+_QWEN35_TOOL_INSTRUCTIONS = (
+    "\n</tools>\n\n"
+    "If you choose to call a function ONLY reply in the following format with NO suffix:\n\n"
+    "<tool_call>\n"
+    "<function=example_function_name>\n"
+    "<parameter=example_parameter_1>\n"
+    "value_1\n"
+    "</parameter>\n"
+    "<parameter=example_parameter_2>\n"
+    "This is the value for the second parameter\n"
+    "that can span\n"
+    "multiple lines\n"
+    "</parameter>\n"
+    "</function>\n"
+    "</tool_call>\n\n"
+    "<IMPORTANT>\n"
+    "Reminder:\n"
+    "- Function calls MUST follow the specified format: an inner <function=...></function> "
+    "block must be nested within <tool_call></tool_call> XML tags\n"
+    "- Required parameters MUST be specified\n"
+    "- You may provide optional reasoning for your function call in natural language BEFORE "
+    "the function call, but NOT after\n"
+    "- If there is no function call available, answer the question like normal with your "
+    "current knowledge and do not tell the user about function calls\n"
+    "</IMPORTANT>"
+)
+
+
+def _render_tool_call_args(arguments: dict) -> str:
+    """Render a tool_call arguments dict in the Qwen3.5 XML parameter format."""
+    parts = ""
+    for k, v in arguments.items():
+        parts += f"<parameter={k}>\n"
+        if isinstance(v, (dict, list)):  # containers are serialized as JSON; everything else goes through str()
+            parts += json.dumps(v, ensure_ascii=False)
+        else:
+            parts += str(v)
+        parts += "\n</parameter>\n"
+    return parts
+
+
+def qwen35_tokenize_fn_fastspeed(  # Render messages as Qwen3.5 chat text in one pass while tracking a per-character loss mask.
+    messages,
+    tokenizer=None,  # only required when return_labels=True
+    tools=None,
+    add_generation_prompt=False,
+    add_vision_id=False,
+    return_labels=True,  # False: return (text, loss_mask); True: return (input_ids, labels)
+):
+    enable_thinking = any("reasoning_content" in msg for msg in messages)  # thinking mode is inferred from the data itself
+
+    image_count = 0
+    video_count = 0
+    result = ""
+    loss_mask: list[bool] = []  # one flag per character of result
+
+    def _render(content, do_vision_count: bool) -> str:
+        nonlocal image_count, video_count
+        out, image_count, video_count = render_content(
+            content, do_vision_count, image_count, video_count, add_vision_id
+        )
+        return out
+
+    def _append(text: str, is_loss: bool) -> None:
+        nonlocal result
+        result += text
+        loss_mask.extend([is_loss] * len(text))
+
+    # ── system / tools block ──────────────────────────────────────────────
+    if tools:
+        _append("<|im_start|>system\n", False)
+        _append(_QWEN35_TOOL_SYSTEM, False)
+        for tool in tools:
+            _append("\n" + json.dumps(tool, ensure_ascii=False), False)
+        _append(_QWEN35_TOOL_INSTRUCTIONS, False)
+        if messages[0]["role"] == "system":
+            sys_content = _render(messages[0]["content"], False).strip()
+            if sys_content:
+                _append("\n\n" + sys_content, False)
+        _append("<|im_end|>\n", False)
+    else:
+        if messages[0]["role"] == "system":
+            sys_content = _render(messages[0]["content"], False).strip()
+            _append(f"<|im_start|>system\n{sys_content}<|im_end|>\n", False)
+
+    # ── compute last_query_index ──────────────────────────────────────────
+    multi_step_tool = True
+    last_query_index = len(messages) - 1
+    for i in range(len(messages) - 1, -1, -1):  # scan backwards for the last user message that is not a wrapped <tool_response>
+        msg = messages[i]
+        if multi_step_tool and msg["role"] == "user":
+            content_str = _render(msg["content"], False).strip()
+            if not (content_str.startswith("<tool_response>") and content_str.endswith("</tool_response>")):
+                multi_step_tool = False
+                last_query_index = i
+
+    # ── main loop ─────────────────────────────────────────────────────────
+    for idx, message in enumerate(messages):
+        is_first = idx == 0
+        is_last = idx == len(messages) - 1
+        content = _render(message["content"], True).strip()
+        role = message["role"]
+
+        if role == "user" or (role == "system" and not is_first):  # a leading system message was already emitted above
+            _append(f"<|im_start|>{role}\n{content}<|im_end|>\n", False)
+
+        elif role == "assistant":
+            reasoning_content = ""
+            if isinstance(message.get("reasoning_content"), str):
+                reasoning_content = message["reasoning_content"]
+            else:
+                if "</think>" in content:  # no explicit field: split inline <think>...</think> out of the content
+                    reasoning_content = content.split("</think>")[0].rstrip("\n").split("<think>")[-1].lstrip("\n")
+                    content = content.split("</think>")[-1].lstrip("\n")
+            # The Qwen3.5 template applies |trim to reasoning_content
+            reasoning_content = reasoning_content.strip()
+
+            is_loss = message.get("loss", True)
+
+            _append(f"<|im_start|>{role}\n", False)
+
+            if idx > last_query_index:
+                # Turns after the last query: render the <think> block and compute loss
+                _append("<think>\n", False)
+                if reasoning_content:
+                    # With reasoning: the gen prompt ends with <think>\n, so content_tokens start at the reasoning
+                    _append(reasoning_content + "\n", is_loss)
+                    _append("</think>\n\n", is_loss)
+                elif enable_thinking:
+                    # enable_thinking=True but no reasoning: the gen prompt ends with <think>\n
+                    # content_tokens start at </think>, so </think>\n\n counts toward loss
+                    _append("\n", False)  # the \n of the empty body (merges with <think>\n into one \n\n token; no loss)
+                    _append("</think>\n\n", is_loss)
+                else:
+                    # enable_thinking=False: the gen prompt ends with the full <think>\n\n</think>\n\n
+                    # content_tokens hold only the actual reply, so </think>\n\n does not count toward loss
+                    _append("\n", False)
+                    _append("</think>\n\n", False)
+                body_is_loss = is_loss
+            else:
+                # Historical turns:
+                # - enable_thinking=False: the gen prompt contains the full <think>\n\n</think>\n\n,
+                #   content_tokens hold only the reply, which can be found in total_ids → use is_loss
+                # - enable_thinking=True: content_tokens start with </think>, but historical turns
+                #   in total_ids have no <think> block → NOT FOUND → no loss
+                body_is_loss = is_loss if not enable_thinking else False
+                _append(content, body_is_loss)
+
+            if idx > last_query_index:
+                _append(content, body_is_loss)
+
+            # tool_calls (Qwen3.5 XML format)
+            if message.get("tool_calls"):
+                for tc_idx, tool_call in enumerate(message["tool_calls"]):
+                    tc = tool_call.get("function", tool_call)  # accept both the nested {"function": {...}} form and a flat dict
+                    tc_name = tc["name"]
+                    tc_args = tc.get("arguments", {})
+
+                    if tc_idx == 0:
+                        if content.strip():
+                            _append("\n\n", body_is_loss)
+                        _append(f"<tool_call>\n<function={tc_name}>\n", body_is_loss)
+                    else:
+                        _append(f"\n<tool_call>\n<function={tc_name}>\n", body_is_loss)
+
+                    if isinstance(tc_args, dict):  # non-dict arguments are not rendered
+                        _append(_render_tool_call_args(tc_args), body_is_loss)
+                    _append("</function>\n</tool_call>", body_is_loss)
+
+            _append("<|im_end|>\n", body_is_loss)
+
+        elif role == "tool":
+            prev_role = messages[idx - 1]["role"] if idx > 0 else None
+            if is_first or prev_role != "tool":  # consecutive tool messages share one user turn
+                _append("<|im_start|>user", False)
+            _append("\n<tool_response>\n", False)
+            _append(content, False)
+            _append("\n</tool_response>", False)
+            next_role = messages[idx + 1]["role"] if not is_last else None
+            if is_last or next_role != "tool":
+                _append("<|im_end|>\n", False)
+
+    if add_generation_prompt:
+        _append("<|im_start|>assistant\n", False)
+        if not enable_thinking:
+            _append("<think>\n\n</think>\n\n", False)
+        else:
+            _append("<think>\n", False)
+
+    # ── labels not requested ──────────────────────────────────────────────
+    if not return_labels:
+        return result, loss_mask
+
+    # ── labels requested ──────────────────────────────────────────────────
+    assert tokenizer is not None, "return_labels=True 时必须传入 tokenizer"
+
+    try:
+        encoded = tokenizer(
+            result,
+            return_offsets_mapping=True,
+            add_special_tokens=False,
+        )
+        input_ids = encoded["input_ids"]
+        offset_mapping = encoded["offset_mapping"]
+    except Exception:  # presumably a tokenizer that cannot return offsets; fall back to decode-based reconstruction
+        input_ids, offset_mapping = get_offset_mapping(tokenizer, result)
+
+    labels = []
+    for token_id, (start, end) in zip(input_ids, offset_mapping):
+        if start == end:  # zero-width span: never supervised
+            labels.append(-100)
+        elif any(loss_mask[i] for i in range(start, end)):  # supervised if any covered character is marked for loss
+            labels.append(token_id)
+        else:
+            labels.append(-100)
+
+    return input_ids, labels
+
+
+def qwen35_tokenize_fn_slowspeed(tokenizer, messages: List[Dict[str, str]], tools=None, add_vision_id=False, **kwargs):
+    """
+    Most robust tokenize variant: absolute token-level alignment (an upgraded "salt-and-pepper" algorithm).
+    Logic:
+    1. Render and encode the whole conversation into total_ids, the single source of truth.
+    2. For each assistant message, render the truncated history to extract the token sequence it "should look like".
+    3. Walk through total_ids and match those token sequences exactly.
+    4. This avoids character-offset misalignment, template rewrites of history, and special-token alignment issues.
+    """
+
+    enable_thinking = any("reasoning_content" in msg for msg in messages)
+
+    full_text = tokenizer.apply_chat_template(
+        messages, tokenize=False, tools=tools, add_vision_id=add_vision_id, enable_thinking=enable_thinking, **kwargs
+    )
+    total_ids = tokenizer.encode(full_text, add_special_tokens=False)
+    labels = [-100] * len(total_ids)
+    # Start position for searching total_ids, so that earlier turns are never matched again
+    curr_ptr = 0
+    for i, msg in enumerate(messages):
+        if msg["role"] == "assistant" and msg.get("loss", True):
+            # 1. Render the "prefix": everything before the current message (with the generation prompt)
+            prompt_text = tokenizer.apply_chat_template(
+                messages[:i],
+                tokenize=False,
+                add_generation_prompt=True,
+                add_vision_id=add_vision_id,
+                enable_thinking=enable_thinking,
+                tools=tools if i == 0 else None,
+                **kwargs,
+            )
+            # 2. Render the "truncated" text: the history up to and including the current message
+            # NOTE(review): the original comment described appending a rare marker to the current message
+            # to capture how its terminator (e.g. <|im_end|>) is tokenized; no marker is added in the code below.
+            temp_msgs = [m.copy() for m in messages[: i + 1]]
+            # Render the real content
+            m_text = tokenizer.apply_chat_template(
+                temp_msgs,
+                tokenize=False,
+                add_vision_id=add_vision_id,
+                enable_thinking=enable_thinking,
+                tools=tools if i == 0 else None,
+                **kwargs,
+            )
+            # Convert to token sequences
+            p_ids = tokenizer.encode(prompt_text, add_special_tokens=False)
+            m_ids = tokenizer.encode(m_text, add_special_tokens=False)
+            # 3. Extract the tokens of the current message alone (reasoning, content, tool_calls and the trailing im_end)
+            # Note: because of how the tokenizer works, the prefix of m_ids may not exactly equal p_ids,
+            # so in principle the split point would have to be located from features of p_ids.
+            # For robustness, the split is reasoned about from the end of m_ids instead.
+            # m_ids is taken to be p_ids + current_content_ids,
+            # so the difference is taken directly by length:
+            content_tokens = m_ids[len(p_ids) :]
+            if not content_tokens:
+                continue
+            # 4. Search the full total_ids for this content_tokens sequence
+            found = False
+            # Search forward starting from curr_ptr
+            for s_ptr in range(curr_ptr, len(total_ids) - len(content_tokens) + 1):
+                if total_ids[s_ptr : s_ptr + len(content_tokens)] == content_tokens:
+                    # Match found
+                    labels[s_ptr : s_ptr + len(content_tokens)] = content_tokens
+                    curr_ptr = s_ptr + len(content_tokens)
+                    found = True
+                    break
+            if not found:
+                # No match means the template altered this historical message in the full rendering (e.g. dropped its thinking)
+                # This is tolerated for non-final turns: history is not required to match, since loss is usually computed on the last turn only
+                # But if the last message still cannot be matched, something is seriously wrong
+                if i == len(messages) - 1:
+                    raise ValueError("严重错误：最后一条 Assistant 消息无法在全量 Token 中对齐。")
+    return total_ids, labels
+
+
+# This class uses entirely new logic, so it does not inherit from BaseChatMessages; the earlier ChatMessages logic is to be removed later
+class Qwen35ChatMessages(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    messages: List[dict]  # not validated for now
+    tools: Optional[List[Dict]] = None
+
+    def tokenize(  # Returns {"input_ids": ..., "labels": ...}; extra **kwargs are accepted but not used in this body.
+        self, tokenizer: PreTrainedTokenizer, chat_template: HybridChatTemplate, add_vision_id=False, **kwargs
+    ) -> Dict:
+        is_pretrain = False
+        if len(self.messages) == 1 and self.messages[0]["role"] == "pretrain":
+            is_pretrain = True
+
+        if is_pretrain:  # single "pretrain" message: encode the rendered content without chat markup; labels copy every token
+            text, _, _ = render_content(
+                self.messages[0]["content"],
+                do_vision_count=True,
+                image_count=0,
+                video_count=0,
+                add_vision_id=add_vision_id,
+            )
+            token_ids = tokenizer.encode(text, add_special_tokens=False)
+            label_ids = copy.deepcopy(token_ids)
+        else:
+            # replace system message (NOTE: this mutates self.messages in place)
+            if chat_template.default_system is not None:
+                if self.messages[0]["role"] == "system":
+                    self.messages[0]["content"] = chat_template.default_system
+                else:
+                    self.messages.insert(0, {"role": "system", "content": chat_template.default_system})
+
+            token_ids, label_ids = qwen35_tokenize_fn_fastspeed(
+                self.messages, tokenizer, self.tools, add_vision_id=add_vision_id, return_labels=True
+            )
+        return {"input_ids": token_ids, "labels": label_ids}
diff --git a/xtuner/v1/data_proto/templates/__init__.py b/xtuner/v1/data_proto/templates/__init__.py
index 65fbf1b46..7d7a7a78d 100644
--- a/xtuner/v1/data_proto/templates/__init__.py
+++ b/xtuner/v1/data_proto/templates/__init__.py
@@ -8,6 +8,12 @@
 current_date = datetime.now().strftime("%Y-%m-%d")
 
 CHAT_TEMPLATE_MAP = {
+    "qwen3.5-vl": HybridChatTemplate(
+        image_start_token="<|vision_start|>",
+        image_end_token="<|vision_end|>",
+        image_context_token="<|image_pad|>",
+        video_context_token="<|video_pad|>",
+    ),
     "intern-s1": HybridChatTemplate(
         system="<|im_start|>system\n{system}<|im_end|>\n",
         user="<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n",
diff --git a/xtuner/v1/data_proto/templates/hybrid.py b/xtuner/v1/data_proto/templates/hybrid.py
index 0ec2ddfcf..a83b553d5 100644
--- a/xtuner/v1/data_proto/templates/hybrid.py
+++ b/xtuner/v1/data_proto/templates/hybrid.py
@@ -12,11 +12,11 @@ class HybridChatTemplate(BaseModel):
     model_config = ConfigDict(extra="forbid")
 
     # Normal Chat
-    system: str  # System message format, role
+    system: str | None = None  # System message format, role
     developer: str | None = None  # Developer message format, role
-    user: str  # User message format, role
-    assistant: str  # Assistant message format, role
-    stop_words: List[str]  # List of stop words
+    user: str | None = None  # User message format, role
+    assistant: str | None = None  # Assistant message format, role
+    stop_words: List[str] | None = None  # List of stop words
     sep: str = "\n"
     thinking: str | None = None  # Thinking message format, not role
     default_system: Optional[str] = None
@@ -61,6 +61,7 @@ def mm_token_maps(self) -> Dict[str, int]:
 
     def decorate_system(self, text: str) -> str:
         """Decorate text with the `system` template."""
+        assert self.system is not None, "system template is not defined."
         return self.system.format(system=text)
 
     def decorate_developer(self, text: str) -> str:
@@ -71,6 +72,7 @@ def decorate_developer(self, text: str) -> str:
 
     def decorate_assistant(self, text: str) -> str:
         """Decorate text with the `assistant` template."""
+        assert self.assistant is not None, "assistant template is not defined."
         return self.assistant.format(assistant=text)
 
     def decorate_thinking(self, text: str) -> str:
@@ -81,6 +83,7 @@ def decorate_thinking(self, text: str) -> str:
 
     def decorate_user(self, text: str) -> str:
         """Decorate text with the `user` template."""
+        assert self.user is not None, "user template is not defined."
         return self.user.format(user=text)
 
     def decorate_files(self, text: str) -> str:
diff --git a/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py b/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py
index c4ec171b4..d21fbbe19 100644
--- a/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py
+++ b/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py
@@ -31,10 +31,11 @@ def collect_image_video_paths_and_extra(messages: list[dict]):
             content = msg["content"]
             if isinstance(content, list):
                 for c in content:
-                    if c["type"] == "image_url":
-                        image_paths.append(c["image_url"]["url"])
-                        if "image_wh" in c["image_url"]:
-                            image_wh = c["image_url"]["image_wh"]
+                    if c["type"] in ("image_url", "image"):
+                        key = "image_url" if "image_url" in c else "image"
+                        image_paths.append(c[key]["url"])
+                        if "image_wh" in c[key]:
+                            image_wh = c[key]["image_wh"]
                             if isinstance(image_wh[0], (list, tuple)):
                                 assert len(image_wh) == 1, (
                                     f"Only one image size is supported for each image. but got {image_wh}"
@@ -42,10 +43,10 @@ def collect_image_video_paths_and_extra(messages: list[dict]):
                                 image_wh = image_wh[0]
                             image_wh_list.append(image_wh)
                             assert len(image_wh) == 2, f"image_wh should be [width, height], but got {image_wh}"
-                    if c["type"] == "video_url":
-                        video_paths.append(c["video_url"]["url"])
-
-                        video_wh = c["video_url"].get("image_wh")
+                    if c["type"] in ("video_url", "video"):
+                        key = "video_url" if "video_url" in c else "video"
+                        video_paths.append(c[key]["url"])
+                        video_wh = c[key].get("image_wh")
                         if video_wh is not None:
                             if isinstance(video_wh[0], (list, tuple)):
                                 assert len(video_wh) == 1, (
@@ -56,16 +57,17 @@ def collect_image_video_paths_and_extra(messages: list[dict]):
                             assert len(video_wh) == 2, f"video_wh should be [width, height], but got {video_wh}"
 
                         video_extra_dict = {}
-                        if "origin_video_length" in c["video_url"]:
-                            video_extra_dict["origin_video_length"] = c["video_url"]["origin_video_length"]
-                        if "origin_fps" in c["video_url"]:
-                            video_extra_dict["origin_fps"] = c["video_url"]["origin_fps"]
-                        if "processed_video_length" in c["video_url"]:
-                            video_extra_dict["processed_video_length"] = c["video_url"]["processed_video_length"]
-                        if "processed_fps" in c["video_url"]:
-                            video_extra_dict["processed_fps"] = c["video_url"]["processed_fps"]
-                        if "frames_timestamp" in c["video_url"]:
-                            video_extra_dict["frames_timestamp"] = c["video_url"]["frames_timestamp"]
+
+                        if "origin_video_length" in c[key]:
+                            video_extra_dict["origin_video_length"] = c[key]["origin_video_length"]
+                        if "origin_fps" in c[key]:
+                            video_extra_dict["origin_fps"] = c[key]["origin_fps"]
+                        if "processed_video_length" in c[key]:
+                            video_extra_dict["processed_video_length"] = c[key]["processed_video_length"]
+                        if "processed_fps" in c[key]:
+                            video_extra_dict["processed_fps"] = c[key]["processed_fps"]
+                        if "frames_timestamp" in c[key]:
+                            video_extra_dict["frames_timestamp"] = c[key]["frames_timestamp"]
                         if len(video_extra_dict) > 0:
                             video_extra_info_list.append(video_extra_dict)
 
diff --git a/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py b/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py
index 0781b5b42..d9571b4a1 100644
--- a/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py
+++ b/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py
@@ -6,7 +6,7 @@
 import os
 from itertools import chain
 from types import SimpleNamespace
-from typing import Literal, Optional, Union
+from typing import Dict, Literal, Optional, Union
 
 import numpy as np
 import torch
@@ -17,7 +17,7 @@
 import transformers
 from transformers import AutoProcessor, PreTrainedTokenizer
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
-from xtuner.v1.data_proto.messages import ChatMessages
+from xtuner.v1.data_proto.messages import ChatMessages, Qwen35ChatMessages
 from xtuner.v1.data_proto.templates import CHAT_TEMPLATE_MAP, HybridChatTemplate
 from xtuner.v1.utils import get_logger
 
@@ -203,12 +203,81 @@ def replace_video_token(
     assert current_image_idx == n_image, f"VIDEO ERROR: total_image_idx: {current_image_idx} != {n_image}"
 
 
+def replace_video_timestamps_and_num_frame(
+    messages: list[Dict], num_image_token_list: list[list[int]], timestamps_list: list[list[float]]
+):  # annotates the video items of `messages` in place; returns None
+    if len(timestamps_list) > 0:
+        assert len(timestamps_list) == len(num_image_token_list), (
+            "timestamps should have the same length as num_image_token_list"
+        )
+    video_cnt = 0  # running index of the current video across all visited messages
+    for msg in messages:
+        if msg["role"] == "user" or msg["role"] == "pretrain":  # other roles are left untouched
+            content = msg["content"]
+            if isinstance(content, list):
+                for item in content:
+                    if "video" in item:  # NOTE(review): a "video_url" key is not handled here; confirm intended
+                        video_content = item["video"]
+                        if len(timestamps_list) > 0:
+                            timestamps = timestamps_list[video_cnt]
+                            video_content["timestamps"] = timestamps
+                        if len(num_image_token_list) > 0:
+                            num_frames = num_image_token_list[video_cnt]  # list[int] per the annotation, not a count
+                            video_content["num_frames"] = num_frames
+                        video_cnt += 1
+
+
+def replace_qwen35_media_token(
+    tokenized: dict, context_token_id: int, num_media_token_list: list[int] | list[list[int]], total_media_count: int
+):  # expands media placeholders in `tokenized` (updated in place) and returns it
+    input_ids = tokenized["input_ids"]
+    labels = tokenized.get("labels")  # optional; None when the "labels" key is absent
+
+    if isinstance(num_media_token_list[0], list):  # NOTE(review): [0] fails on an empty list
+        # nested (video) case: flatten list[list[int]] into one flat list
+        num_media_token_list = [item for sublist in num_media_token_list for item in sublist]  # type: ignore
+
+    context_token_count = input_ids.count(context_token_id)  # one placeholder is expected per list entry
+    assert context_token_count == len(num_media_token_list), (
+        f"context_token_count and num_media_token_list length must be the same, "
+        f"but got {context_token_count} and {len(num_media_token_list)}"
+    )
+    new_input_ids: list[int] = []
+    new_labels: list[int] | None = [] if labels is not None else None
+    media_idx = 0  # index of the next entry of num_media_token_list to consume
+    for i, tid in enumerate(input_ids):
+        if tid == context_token_id:
+            n = int(num_media_token_list[media_idx])  # type: ignore
+            new_input_ids.extend([context_token_id] * n)
+            if new_labels is not None and labels is not None:
+                lbl = labels[i]  # the placeholder's own label is repeated n times
+                new_labels.extend([lbl] * n)
+            media_idx += 1
+        else:
+            new_input_ids.append(tid)
+            if new_labels is not None and labels is not None:
+                new_labels.append(labels[i])
+
+    if new_labels is not None:
+        assert len(new_input_ids) == len(new_labels), "new_input_ids and new_labels length must be the same"
+    assert new_input_ids.count(context_token_id) == total_media_count, (
+        f"new_input_ids and total_media_count must be the same, "
+        f"but got {new_input_ids.count(context_token_id)} and {total_media_count}"
+    )
+
+    tokenized["input_ids"] = new_input_ids
+    if new_labels is not None:
+        tokenized["labels"] = new_labels
+    return tokenized
+
+
 class Qwen3VLTokenizeFunction(BaseMLLMTokenizeFunction):
     def __init__(
         self,
         tokenizer: PreTrainedTokenizer,
         processor_path: str,
         anno_name: str,
+        chat_template: str = "qwen3-vl",  # qwen3.5-vl or qwen3-vl
         min_pixels: int | None = None,  # Max image pixels (H*W) for image
         max_pixels: int | None = None,  # Min image pixels (H*W) for image
         video_min_frames: int | None = None,  # Min frames per video
@@ -234,7 +303,6 @@ def __init__(
         add_eos_token: bool = True,  # for mllm pretrain
         add_bos_token: bool = False,  # for mllm pretrain
         trim_memory_interval: int = 1,
-        chat_template: str = "qwen3-vl",
     ):
         self.oss_loader = None
         self.debug = debug
@@ -295,7 +363,7 @@ def __init__(
             f"video_max_frames: {self.video_processor.max_frames}, fps: {self.video_processor.fps}, "
             f"rand_video_max_frames: {self.rand_video_max_frames}"
         )
-
+        self.chat_template_name = chat_template
         self.chat_template = copy.deepcopy(CHAT_TEMPLATE_MAP[chat_template])
         if system_message is not None:
             self.chat_template.default_system = system_message
@@ -340,6 +408,17 @@ def __init__(
             trim_memory_interval=trim_memory_interval,
         )
 
+    def calc_num_tokens_pure_text_get_item(self, data_item) -> CacheItem:  # token count of a text-only sample
+        if self.chat_template_name == "qwen3.5-vl":  # pick the message class matching the template name
+            messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
+        else:
+            messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
+        tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id)
+        input_ids = tokenized["input_ids"]
+        labels = tokenized["labels"]
+        input_ids, _ = self._truncated_input_and_labels(input_ids, labels)  # presumably length truncation; confirm
+        return {"num_tokens": len(input_ids), "num_img_tokens": [0]}  # no image tokens are reported
+
     def _truncated_data_item(
         self, input_ids: list[int], labels: list[int] | None = None, position_ids: torch.Tensor | None = None
     ):
@@ -360,14 +439,18 @@ def _truncated_data_item(
         return input_ids, labels, position_ids
 
     def pure_text_get_item(self, data_item: dict) -> QwenVL3DataItem:
-        messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
-
         is_pretrain = False
-        if len(messages.messages) == 1 and messages.messages[0].role == "pretrain":
-            is_pretrain = True
+        if self.chat_template_name == "qwen3.5-vl":
+            messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
+            if len(data_item["messages"]) == 1 and data_item["messages"][0]["role"] == "pretrain":
+                is_pretrain = True
+        else:
+            messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
+            if len(messages.messages) == 1 and messages.messages[0].role == "pretrain":
+                is_pretrain = True
         assert is_pretrain is False, "Text pretrain data should not be processed by this function"
 
-        tokenized = messages.tokenize(self.tokenizer, self.chat_template)
+        tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id)
         input_ids = tokenized["input_ids"]
         labels: list[int] = tokenized["labels"]
 
@@ -413,14 +496,24 @@ def calc_num_tokens_multi_modal_get_item(self, data_item: dict) -> CacheItem:
             print(f"ERROR of {self._image_wh_list}: {e}, data_name: {self.data_name}")
             return {"num_tokens": 0, "num_img_tokens": [0]}  # type: ignore
 
-        messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
-        replace_image_token(messages, self.chat_template, sum_media_grid_thw, add_vision_id=self.add_vision_id)
-        tokenized = messages.tokenize(self.tokenizer, self.chat_template)
+        is_pretrain = False
+        if self.chat_template_name == "qwen3.5-vl":
+            messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
+            if len(data_item["messages"]) == 1 and data_item["messages"][0]["role"] == "pretrain":
+                is_pretrain = True
+            tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id)
+            replace_qwen35_media_token(
+                tokenized, self.img_context_token_id, sum_media_grid_thw, sum_media_grid_thw.sum()
+            )
+        else:
+            messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
+            replace_image_token(messages, self.chat_template, sum_media_grid_thw, add_vision_id=self.add_vision_id)
+            if len(messages.messages) == 1 and messages.messages[0].role == "pretrain":
+                is_pretrain = True
+
+            tokenized = messages.tokenize(self.tokenizer, self.chat_template)
         input_ids = tokenized["input_ids"]
 
-        is_pretrain = False
-        if len(messages.messages) == 1 and messages.messages[0].role == "pretrain":
-            is_pretrain = True
         if is_pretrain:
             if self.add_bos_token:
                 input_ids = [self.bos_token_id] + input_ids
@@ -458,16 +551,24 @@ def multi_modal_get_item(self, data_item: dict, media_root: str = "") -> QwenVL3
         visual_processed = self.image_processor.preprocess(image_data_list, return_tensors="pt")
         image_tensor = visual_processed["pixel_values"]
         grid_thw = visual_processed["image_grid_thw"]  # b,3
-        grid_thw_merged = [merged_thw.prod() // self.merge_length for merged_thw in grid_thw]  # type: ignore
-        messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
-        replace_image_token(messages, self.chat_template, grid_thw_merged, add_vision_id=self.add_vision_id)  # type: ignore
-        tokenized = messages.tokenize(self.tokenizer, self.chat_template)
+        grid_thw_merged = [merged_thw.prod().item() // self.merge_length for merged_thw in grid_thw]  # type: ignore
+
+        is_pretrain = False
+        if self.chat_template_name == "qwen3.5-vl":
+            messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
+            if len(data_item["messages"]) == 1 and data_item["messages"][0]["role"] == "pretrain":
+                is_pretrain = True
+            tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id)
+            replace_qwen35_media_token(tokenized, self.img_context_token_id, grid_thw_merged, sum(grid_thw_merged))
+        else:
+            messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
+            if len(messages.messages) == 1 and messages.messages[0].role == "pretrain":
+                is_pretrain = True
+            replace_image_token(messages, self.chat_template, grid_thw_merged, add_vision_id=self.add_vision_id)  # type: ignore
+            tokenized = messages.tokenize(self.tokenizer, self.chat_template)
         input_ids = tokenized["input_ids"]
         labels = tokenized["labels"]
 
-        is_pretrain = False
-        if len(messages.messages) == 1 and messages.messages[0].role == "pretrain":
-            is_pretrain = True
         if is_pretrain:
             if self.add_bos_token:
                 input_ids = [self.bos_token_id] + input_ids
@@ -494,7 +595,7 @@ def multi_modal_get_item(self, data_item: dict, media_root: str = "") -> QwenVL3
 
         # 如果图片被截断，则该数据要丢弃
         num_image_tokens_1 = (torch.tensor(input_ids) == self.img_context_token_id).sum()
-        num_image_tokens_2 = torch.stack(grid_thw_merged, dim=0).sum()
+        num_image_tokens_2 = sum(grid_thw_merged)
         # assert 会被捕获，该数据会丢弃
         assert num_image_tokens_1 == num_image_tokens_2, (
             f"num_image_tokens of input_ids {num_image_tokens_1} != num_image_tokens of media_grid_thw {num_image_tokens_2}, "
@@ -691,28 +792,32 @@ def calc_num_tokens_video_get_item(self, data_item: dict) -> CacheItem:
             num_image_token_list.append([frame_seqlen] * grid_t)
             total_sum_media_grid_thw += sum_media_grid_thw
 
-        messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
-        replace_video_token(
-            messages,
-            self.chat_template,
-            num_image_token_list,
-            timestamps_list=timestamps_list,
-            add_vision_id=self.add_vision_id,
-        )
-        tokenized = messages.tokenize(self.tokenizer, self.chat_template)
+        is_pretrain = False
+        if self.chat_template_name == "qwen3.5-vl":
+            replace_video_timestamps_and_num_frame(data_item["messages"], num_image_token_list, timestamps_list)
+            messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
+            if len(data_item["messages"]) == 1 and data_item["messages"][0]["role"] == "pretrain":
+                is_pretrain = True
+            tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id)
+            replace_qwen35_media_token(
+                tokenized, self.video_context_token_id, num_image_token_list, total_sum_media_grid_thw
+            )
+        else:
+            messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
+            if len(messages.messages) == 1 and messages.messages[0].role == "pretrain":
+                is_pretrain = True
+            replace_video_token(
+                messages, self.chat_template, num_image_token_list, timestamps_list, add_vision_id=self.add_vision_id
+            )
+            tokenized = messages.tokenize(self.tokenizer, self.chat_template)
         input_ids = tokenized["input_ids"]
 
-        is_pretrain = False
-        if len(messages.messages) == 1 and messages.messages[0].role == "pretrain":
-            is_pretrain = True
         if is_pretrain:
             if self.add_bos_token:
                 input_ids = [self.bos_token_id] + input_ids
             if self.add_eos_token:
                 input_ids = input_ids + [self.eos_token_id]
-
         input_ids, _, _ = self._truncated_data_item(input_ids)
-
         # 如果图片被截断，则该数据丢弃
         num_image_tokens_1 = (torch.tensor(input_ids) == self.video_context_token_id).sum()
         num_image_tokens_2 = total_sum_media_grid_thw
@@ -824,21 +929,28 @@ def video_get_item(self, data_item: dict, media_root: str = "") -> QwenVL3DataIt
             num_imgs_list.append(num_frames)
             total_sum_media_grid_thw += sum_media_grid_thw
 
-        messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
-        replace_video_token(
-            messages,
-            self.chat_template,
-            num_image_tokens_list,
-            timestamps_list=timestamps_list,
-            add_vision_id=self.add_vision_id,
-        )
-        tokenized = messages.tokenize(self.tokenizer, self.chat_template)
+        is_pretrain = False
+        if self.chat_template_name == "qwen3.5-vl":
+            replace_video_timestamps_and_num_frame(data_item["messages"], num_image_tokens_list, timestamps_list)
+            messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
+            if len(data_item["messages"]) == 1 and data_item["messages"][0]["role"] == "pretrain":
+                is_pretrain = True
+            tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id)
+            replace_qwen35_media_token(
+                tokenized, self.video_context_token_id, num_image_tokens_list, total_sum_media_grid_thw
+            )
+        else:
+            messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools"))
+            if len(messages.messages) == 1 and messages.messages[0].role == "pretrain":
+                is_pretrain = True
+            replace_video_token(
+                messages, self.chat_template, num_image_tokens_list, timestamps_list, add_vision_id=self.add_vision_id
+            )
+            tokenized = messages.tokenize(self.tokenizer, self.chat_template)
+
         input_ids = tokenized["input_ids"]
         labels = tokenized["labels"]
 
-        is_pretrain = False
-        if len(messages.messages) == 1 and messages.messages[0].role == "pretrain":
-            is_pretrain = True
         if is_pretrain:
             if self.add_bos_token:
                 input_ids = [self.bos_token_id] + input_ids
@@ -908,7 +1020,7 @@ class Qwen3VLTokenizeFnConfig(BaseMLLMTokenizeFnConfig):
     add_vision_id: bool = True
 
     trim_memory_interval: int = 1
-    chat_template: Literal["qwen3-vl", "qwen3-vl-rl"] = "qwen3-vl"
+    chat_template: Literal["qwen3-vl", "qwen3-vl-rl", "qwen3.5-vl"] = "qwen3-vl"
 
     def build(
         self, tokenizer, tokenizer_hash: str | None = None, anno_name: str = "", **kwargs
diff --git a/xtuner/v1/datasets/sft_tokenize_fn/openai.py b/xtuner/v1/datasets/sft_tokenize_fn/openai.py
index 1c1bbfba4..5ce9c8cfd 100644
--- a/xtuner/v1/datasets/sft_tokenize_fn/openai.py
+++ b/xtuner/v1/datasets/sft_tokenize_fn/openai.py
@@ -7,7 +7,7 @@
 from pydantic import BaseModel, ConfigDict
 
 from transformers import PreTrainedTokenizer
-from xtuner.v1.data_proto.messages import ChatMessages
+from xtuner.v1.data_proto.messages import ChatMessages, Qwen35ChatMessages
 from xtuner.v1.data_proto.templates import CHAT_TEMPLATE_MAP
 from xtuner.v1.datasets.data_item import CacheItem, DataItem
 from xtuner.v1.utils import get_logger
@@ -30,6 +30,7 @@ def __init__(
         assert chat_template in CHAT_TEMPLATE_MAP, (
             f"chat_template {chat_template} not found in {CHAT_TEMPLATE_MAP.keys()}"
         )
+        self.chat_template_name = chat_template
         self.chat_template = CHAT_TEMPLATE_MAP[chat_template]
         self._hash = hash
         self._tokenizer_hash = tokenizer_hash
@@ -43,7 +44,11 @@ def __call__(self, item: dict | list, **kwargs) -> DataItem | CacheItem:
             tools = item["tools"]
         if isinstance(item, dict) and "messages" in item:
             item = item["messages"]
-        messages = ChatMessages(messages=item, tools=tools)
+
+        if self.chat_template_name == "qwen3.5-vl":
+            messages = Qwen35ChatMessages(messages=item, tools=tools)
+        else:
+            messages = ChatMessages(messages=item, tools=tools)
         tokenized = messages.tokenize(self.tokenizer, self.chat_template)
 
         input_ids = tokenized["input_ids"]