
Commit c1f0cd3

Merge branch 'main' into modular-z

2 parents: 0bb53ba + 54fa074

17 files changed: +1293 additions, −69 deletions


docs/source/en/_toctree.yml

Lines changed: 1 addition & 1 deletion

@@ -651,7 +651,7 @@
     - local: api/pipelines/wuerstchen
       title: Wuerstchen
     - local: api/pipelines/z_image
-      title: Z-Image
+      title: Z-Image
     title: Image
   - sections:
     - local: api/pipelines/allegro

docs/source/en/api/pipelines/z_image.md

Lines changed: 34 additions & 1 deletion

@@ -26,8 +26,41 @@ specific language governing permissions and limitations under the License.

 Z-Image-Turbo is a distilled version of Z-Image that matches or exceeds leading competitors with only 8 NFEs (Number of Function Evaluations). It offers sub-second inference latency on enterprise-grade H800 GPUs and fits comfortably within 16G VRAM consumer devices. It excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.

+## Image-to-image
+
+Use [`ZImageImg2ImgPipeline`] to transform an existing image based on a text prompt.
+
+```python
+import torch
+from diffusers import ZImageImg2ImgPipeline
+from diffusers.utils import load_image
+
+pipe = ZImageImg2ImgPipeline.from_pretrained("Tongyi-MAI/Z-Image-Turbo", torch_dtype=torch.bfloat16)
+pipe.to("cuda")
+
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+init_image = load_image(url).resize((1024, 1024))
+
+prompt = "A fantasy landscape with mountains and a river, detailed, vibrant colors"
+image = pipe(
+    prompt,
+    image=init_image,
+    strength=0.6,
+    num_inference_steps=9,
+    guidance_scale=0.0,
+    generator=torch.Generator("cuda").manual_seed(42),
+).images[0]
+image.save("zimage_img2img.png")
+```
+
 ## ZImagePipeline

 [[autodoc]] ZImagePipeline
   - all
-  - __call__
+  - __call__
+
+## ZImageImg2ImgPipeline
+
+[[autodoc]] ZImageImg2ImgPipeline
+  - all
+  - __call__
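
For reference, a minimal text-to-image sketch with `ZImagePipeline`, mirroring the Turbo settings of the image-to-image example added above; the prompt, seed, and exact step count are illustrative, not part of this commit:

```python
import torch
from diffusers import ZImagePipeline

# Text-to-image sketch using the same Turbo-style settings as the img2img
# example above (few steps, guidance_scale=0.0). Prompt and seed are illustrative.
pipe = ZImagePipeline.from_pretrained("Tongyi-MAI/Z-Image-Turbo", torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = pipe(
    "A photorealistic portrait of a red panda wearing a scarf, soft window light",
    num_inference_steps=9,
    guidance_scale=0.0,
    generator=torch.Generator("cuda").manual_seed(42),
).images[0]
image.save("zimage_t2i.png")
```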

docs/source/en/quantization/modelopt.md

Lines changed: 3 additions & 3 deletions

@@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. -->

 # NVIDIA ModelOpt

-[NVIDIA-ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a unified library of state-of-the-art model optimization techniques like quantization, pruning, distillation, speculative decoding, etc. It compresses deep learning models for downstream deployment frameworks like TensorRT-LLM or TensorRT to optimize inference speed.
+[NVIDIA-ModelOpt](https://github.com/NVIDIA/Model-Optimizer) is a unified library of state-of-the-art model optimization techniques like quantization, pruning, distillation, speculative decoding, etc. It compresses deep learning models for downstream deployment frameworks like TensorRT-LLM or TensorRT to optimize inference speed.

 Before you begin, make sure you have nvidia_modelopt installed.

@@ -57,7 +57,7 @@ image.save("output.png")
 >
 > The quantization methods in NVIDIA-ModelOpt are designed to reduce the memory footprint of model weights using various QAT (Quantization-Aware Training) and PTQ (Post-Training Quantization) techniques while maintaining model performance. However, the actual performance gain during inference depends on the deployment framework (e.g., TRT-LLM, TensorRT) and the specific hardware configuration.
 >
-> More details can be found [here](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples).
+> More details can be found [here](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples).

 ## NVIDIAModelOptConfig

@@ -86,7 +86,7 @@ The quantization methods supported are as follows:
 | **NVFP4** | `nvfp4 weight only`, `nvfp4 block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize` | `channel_quantize = -1 is only supported for now` |

-Refer to the [official modelopt documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/) for a better understanding of the available quantization methods and the exhaustive list of configuration options available.
+Refer to the [official modelopt documentation](https://nvidia.github.io/Model-Optimizer/) for a better understanding of the available quantization methods and the exhaustive list of configuration options available.

 ## Serializing and Deserializing quantized models
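
A minimal sketch of how the NVFP4 parameters from the table above might be combined with `NVIDIAModelOptConfig`; the model id, subfolder, block size value, and the exact keyword spelling are assumptions for illustration, not taken from this diff:

```python
import torch
from diffusers import FluxTransformer2DModel, NVIDIAModelOptConfig

# Sketch only: quant_type / channel_quantize / block_quantize follow the NVFP4
# row in the table above; the concrete values, model id, and subfolder are
# assumptions for illustration.
quant_config = NVIDIAModelOptConfig(
    quant_type="NVFP4",
    channel_quantize=-1,  # per the table, only -1 is supported for now
    block_quantize=16,
)

transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
```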

scripts/convert_hunyuan_video1_5_to_diffusers.py

Lines changed: 27 additions & 2 deletions

@@ -69,6 +69,11 @@
         "target_size": 960,
         "task_type": "i2v",
     },
+    "480p_i2v_step_distilled": {
+        "target_size": 640,
+        "task_type": "i2v",
+        "use_meanflow": True,
+    },
 }

 SCHEDULER_CONFIGS = {
@@ -93,6 +98,9 @@
     "720p_i2v_distilled": {
         "shift": 7.0,
     },
+    "480p_i2v_step_distilled": {
+        "shift": 7.0,
+    },
 }

 GUIDANCE_CONFIGS = {
@@ -117,6 +125,9 @@
     "720p_i2v_distilled": {
         "guidance_scale": 1.0,
     },
+    "480p_i2v_step_distilled": {
+        "guidance_scale": 1.0,
+    },
 }


@@ -126,7 +137,7 @@ def swap_scale_shift(weight):
     return new_weight


-def convert_hyvideo15_transformer_to_diffusers(original_state_dict):
+def convert_hyvideo15_transformer_to_diffusers(original_state_dict, config=None):
     """
     Convert HunyuanVideo 1.5 original checkpoint to Diffusers format.
     """
@@ -142,6 +153,20 @@ def convert_hyvideo15_transformer_to_diffusers(original_state_dict):
     )
     converted_state_dict["time_embed.timestep_embedder.linear_2.bias"] = original_state_dict.pop("time_in.mlp.2.bias")

+    if config.use_meanflow:
+        converted_state_dict["time_embed.timestep_embedder_r.linear_1.weight"] = original_state_dict.pop(
+            "time_r_in.mlp.0.weight"
+        )
+        converted_state_dict["time_embed.timestep_embedder_r.linear_1.bias"] = original_state_dict.pop(
+            "time_r_in.mlp.0.bias"
+        )
+        converted_state_dict["time_embed.timestep_embedder_r.linear_2.weight"] = original_state_dict.pop(
+            "time_r_in.mlp.2.weight"
+        )
+        converted_state_dict["time_embed.timestep_embedder_r.linear_2.bias"] = original_state_dict.pop(
+            "time_r_in.mlp.2.bias"
+        )
+
     # 2. context_embedder.time_text_embed.timestep_embedder <- txt_in.t_embedder
     converted_state_dict["context_embedder.time_text_embed.timestep_embedder.linear_1.weight"] = (
         original_state_dict.pop("txt_in.t_embedder.mlp.0.weight")
@@ -627,7 +652,7 @@ def convert_transformer(args):
     config = TRANSFORMER_CONFIGS[args.transformer_type]
     with init_empty_weights():
         transformer = HunyuanVideo15Transformer3DModel(**config)
-    state_dict = convert_hyvideo15_transformer_to_diffusers(original_state_dict)
+    state_dict = convert_hyvideo15_transformer_to_diffusers(original_state_dict, config=transformer.config)
     transformer.load_state_dict(state_dict, strict=True, assign=True)

     return transformer
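
For reference, the meanflow-specific key renames introduced above, collected into a single mapping (illustration only; the script itself performs these renames with the explicit `pop` calls shown in the diff):

```python
# Original checkpoint key -> diffusers key, applied only when
# config.use_meanflow is set (summary of the renames above).
MEANFLOW_KEY_MAP = {
    "time_r_in.mlp.0.weight": "time_embed.timestep_embedder_r.linear_1.weight",
    "time_r_in.mlp.0.bias": "time_embed.timestep_embedder_r.linear_1.bias",
    "time_r_in.mlp.2.weight": "time_embed.timestep_embedder_r.linear_2.weight",
    "time_r_in.mlp.2.bias": "time_embed.timestep_embedder_r.linear_2.bias",
}
```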

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -664,6 +664,7 @@
             "WuerstchenCombinedPipeline",
             "WuerstchenDecoderPipeline",
             "WuerstchenPriorPipeline",
+            "ZImageImg2ImgPipeline",
             "ZImagePipeline",
         ]
     )
@@ -1364,6 +1365,7 @@
             WuerstchenCombinedPipeline,
             WuerstchenDecoderPipeline,
             WuerstchenPriorPipeline,
+            ZImageImg2ImgPipeline,
             ZImagePipeline,
         )

src/diffusers/models/transformers/transformer_hunyuan_video15.py

Lines changed: 18 additions & 3 deletions

@@ -184,19 +184,32 @@ class HunyuanVideo15TimeEmbedding(nn.Module):
             The dimension of the output embedding.
     """

-    def __init__(self, embedding_dim: int):
+    def __init__(self, embedding_dim: int, use_meanflow: bool = False):
         super().__init__()

         self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
         self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)

+        self.use_meanflow = use_meanflow
+        self.time_proj_r = None
+        self.timestep_embedder_r = None
+        if use_meanflow:
+            self.time_proj_r = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+            self.timestep_embedder_r = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
     def forward(
         self,
         timestep: torch.Tensor,
+        timestep_r: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         timesteps_proj = self.time_proj(timestep)
         timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=timestep.dtype))

+        if timestep_r is not None:
+            timesteps_proj_r = self.time_proj_r(timestep_r)
+            timesteps_emb_r = self.timestep_embedder_r(timesteps_proj_r.to(dtype=timestep.dtype))
+            timesteps_emb = timesteps_emb + timesteps_emb_r
+
         return timesteps_emb


@@ -567,6 +580,7 @@ def __init__(
         # YiYi Notes: config based on target_size_config https://github.com/yiyixuxu/hy15/blob/main/hyvideo/pipelines/hunyuan_video_pipeline.py#L205
         target_size: int = 640,  # did not name sample_size since it is in pixel spaces
         task_type: str = "i2v",
+        use_meanflow: bool = False,
     ) -> None:
         super().__init__()

@@ -582,7 +596,7 @@ def __init__(
         )
         self.context_embedder_2 = HunyuanVideo15ByT5TextProjection(text_embed_2_dim, 2048, inner_dim)

-        self.time_embed = HunyuanVideo15TimeEmbedding(inner_dim)
+        self.time_embed = HunyuanVideo15TimeEmbedding(inner_dim, use_meanflow=use_meanflow)

         self.cond_type_embed = nn.Embedding(3, inner_dim)

@@ -612,6 +626,7 @@ def forward(
         timestep: torch.LongTensor,
         encoder_hidden_states: torch.Tensor,
         encoder_attention_mask: torch.Tensor,
+        timestep_r: Optional[torch.LongTensor] = None,
         encoder_hidden_states_2: Optional[torch.Tensor] = None,
         encoder_attention_mask_2: Optional[torch.Tensor] = None,
         image_embeds: Optional[torch.Tensor] = None,
@@ -643,7 +658,7 @@ def forward(
         image_rotary_emb = self.rope(hidden_states)

         # 2. Conditional embeddings
-        temb = self.time_embed(timestep)
+        temb = self.time_embed(timestep, timestep_r=timestep_r)

         hidden_states = self.x_embedder(hidden_states)
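
A standalone sketch of the meanflow time embedding added above: two independent sinusoidal projections and MLPs whose outputs are summed. The embedding dimension here is illustrative, not read from the checkpoint:

```python
import torch
from diffusers.models.embeddings import TimestepEmbedding, Timesteps

embedding_dim = 3072  # illustrative; the real value comes from the transformer config

# Embedders for the current timestep t and the meanflow reference timestep r,
# mirroring the modules constructed in HunyuanVideo15TimeEmbedding above.
time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
time_proj_r = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
timestep_embedder_r = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)

t = torch.tensor([999.0])
t_r = torch.tensor([980.0])  # the next timestep in the schedule

# When timestep_r is provided, the two embeddings are simply added.
emb = timestep_embedder(time_proj(t)) + timestep_embedder_r(time_proj_r(t_r))
print(emb.shape)  # torch.Size([1, 3072])
```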

src/diffusers/models/transformers/transformer_prx.py

Lines changed: 30 additions & 5 deletions

@@ -16,7 +16,6 @@

 import torch
 from torch import nn
-from torch.nn.functional import fold, unfold

 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import logging
@@ -532,7 +531,19 @@ def img2seq(img: torch.Tensor, patch_size: int) -> torch.Tensor:
         Flattened patch sequence of shape `(B, L, C * patch_size * patch_size)`, where `L = (H // patch_size) * (W
         // patch_size)` is the number of patches.
     """
-    return unfold(img, kernel_size=patch_size, stride=patch_size).transpose(1, 2)
+    b, c, h, w = img.shape
+    p = patch_size
+
+    # Reshape to (B, C, H//p, p, W//p, p) separating grid and patch dimensions
+    img = img.reshape(b, c, h // p, p, w // p, p)
+
+    # Permute to (B, H//p, W//p, C, p, p) using einsum
+    # n=batch, c=channels, h=grid_height, p=patch_height, w=grid_width, q=patch_width
+    img = torch.einsum("nchpwq->nhwcpq", img)
+
+    # Flatten to (B, L, C * p * p)
+    img = img.reshape(b, -1, c * p * p)
+    return img


 def seq2img(seq: torch.Tensor, patch_size: int, shape: torch.Tensor) -> torch.Tensor:
@@ -554,12 +565,26 @@ def seq2img(seq: torch.Tensor, patch_size: int, shape: torch.Tensor) -> torch.Tensor:
         Reconstructed image tensor of shape `(B, C, H, W)`.
     """
     if isinstance(shape, tuple):
-        shape = shape[-2:]
+        h, w = shape[-2:]
     elif isinstance(shape, torch.Tensor):
-        shape = (int(shape[0]), int(shape[1]))
+        h, w = (int(shape[0]), int(shape[1]))
     else:
         raise NotImplementedError(f"shape type {type(shape)} not supported")
-    return fold(seq.transpose(1, 2), shape, kernel_size=patch_size, stride=patch_size)
+
+    b, l, d = seq.shape
+    p = patch_size
+    c = d // (p * p)
+
+    # Reshape back to grid structure: (B, H//p, W//p, C, p, p)
+    seq = seq.reshape(b, h // p, w // p, c, p, p)
+
+    # Permute back to image layout: (B, C, H//p, p, W//p, p)
+    # n=batch, h=grid_height, w=grid_width, c=channels, p=patch_height, q=patch_width
+    seq = torch.einsum("nhwcpq->nchpwq", seq)

+    # Final reshape to (B, C, H, W)
+    seq = seq.reshape(b, c, h, w)
+    return seq


 class PRXTransformer2DModel(ModelMixin, ConfigMixin, AttentionMixin):
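
A quick equivalence check for the reshape/einsum rewrite above against the removed `unfold`/`fold` path; this assumes the module-level helpers `img2seq` and `seq2img` are importable from `transformer_prx` as shown in the diff:

```python
import torch
from torch.nn.functional import unfold

from diffusers.models.transformers.transformer_prx import img2seq, seq2img

img = torch.randn(2, 4, 64, 64)
p = 8

# New reshape/einsum path vs. the previous unfold-based patchification.
seq_new = img2seq(img, p)
seq_old = unfold(img, kernel_size=p, stride=p).transpose(1, 2)
assert torch.allclose(seq_new, seq_old)

# Round trip recovers the original image.
assert torch.allclose(seq2img(seq_new, p, img.shape), img)
```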

src/diffusers/pipelines/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -404,7 +404,7 @@
         "Kandinsky5T2IPipeline",
         "Kandinsky5I2IPipeline",
     ]
-    _import_structure["z_image"] = ["ZImagePipeline"]
+    _import_structure["z_image"] = ["ZImageImg2ImgPipeline", "ZImagePipeline"]
     _import_structure["skyreels_v2"] = [
         "SkyReelsV2DiffusionForcingPipeline",
         "SkyReelsV2DiffusionForcingImageToVideoPipeline",
@@ -841,7 +841,7 @@
            WuerstchenDecoderPipeline,
            WuerstchenPriorPipeline,
        )
-        from .z_image import ZImagePipeline
+        from .z_image import ZImageImg2ImgPipeline, ZImagePipeline

    try:
        if not is_onnx_available():

src/diffusers/pipelines/auto_pipeline.py

Lines changed: 3 additions & 0 deletions

@@ -119,6 +119,7 @@
 )
 from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline
 from .wuerstchen import WuerstchenCombinedPipeline, WuerstchenDecoderPipeline
+from .z_image import ZImageImg2ImgPipeline, ZImagePipeline


 AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
@@ -162,6 +163,7 @@
         ("cogview4-control", CogView4ControlPipeline),
         ("qwenimage", QwenImagePipeline),
         ("qwenimage-controlnet", QwenImageControlNetPipeline),
+        ("z-image", ZImagePipeline),
     ]
 )

@@ -189,6 +191,7 @@
         ("qwenimage", QwenImageImg2ImgPipeline),
         ("qwenimage-edit", QwenImageEditPipeline),
         ("qwenimage-edit-plus", QwenImageEditPlusPipeline),
+        ("z-image", ZImageImg2ImgPipeline),
     ]
 )
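
With the `z-image` entries registered above, the auto pipelines can resolve Z-Image checkpoints. A minimal sketch; the model id is taken from the Z-Image docs earlier in this commit, and the generation settings are illustrative:

```python
import torch
from diffusers import AutoPipelineForImage2Image, AutoPipelineForText2Image
from diffusers.utils import load_image

# Text-to-image resolves to ZImagePipeline via the new "z-image" mapping entry.
pipe_t2i = AutoPipelineForText2Image.from_pretrained(
    "Tongyi-MAI/Z-Image-Turbo", torch_dtype=torch.bfloat16
).to("cuda")

# Image-to-image resolves to ZImageImg2ImgPipeline.
pipe_i2i = AutoPipelineForImage2Image.from_pretrained(
    "Tongyi-MAI/Z-Image-Turbo", torch_dtype=torch.bfloat16
).to("cuda")

init_image = load_image(
    "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
).resize((1024, 1024))

image = pipe_i2i(
    "A fantasy landscape with mountains and a river",
    image=init_image,
    strength=0.6,
    num_inference_steps=9,
    guidance_scale=0.0,
).images[0]
```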

src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py

Lines changed: 10 additions & 0 deletions

@@ -852,6 +852,15 @@ def __call__(
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)

+                if self.transformer.config.use_meanflow:
+                    if i == len(timesteps) - 1:
+                        timestep_r = torch.tensor([0.0], device=device)
+                    else:
+                        timestep_r = timesteps[i + 1]
+                    timestep_r = timestep_r.expand(latents.shape[0]).to(latents.dtype)
+                else:
+                    timestep_r = None
+
                 # Step 1: Collect model inputs needed for the guidance method
                 # conditional inputs should always be first element in the tuple
                 guider_inputs = {
@@ -893,6 +902,7 @@ def __call__(
                     hidden_states=latent_model_input,
                     image_embeds=image_embeds,
                     timestep=timestep,
+                    timestep_r=timestep_r,
                     attention_kwargs=self.attention_kwargs,
                     return_dict=False,
                     **cond_kwargs,
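
A toy illustration of the timestep pairing added above: with meanflow enabled, each denoising step is paired with the next timestep in the schedule, and the final step is paired with 0. The schedule values here are made up for illustration:

```python
import torch

# Toy schedule; real values come from the pipeline's scheduler.
timesteps = torch.tensor([1000.0, 750.0, 500.0, 250.0])

for i, t in enumerate(timesteps):
    if i == len(timesteps) - 1:
        timestep_r = torch.tensor([0.0])
    else:
        timestep_r = timesteps[i + 1]
    print(int(t.item()), int(timestep_r.item()))
# 1000 750
# 750 500
# 500 250
# 250 0
```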

0 commit comments
