Commit 2fc5789

Merge branch 'main' into ltx-2-transformer

2 parents dd81242 + 88ffb00

12 files changed: 878 additions & 86 deletions

docs/source/en/training/distributed_inference.md

Lines changed: 2 additions & 2 deletions
@@ -263,8 +263,8 @@ def main():
     world_size = dist.get_world_size()
 
     pipeline = DiffusionPipeline.from_pretrained(
-        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, device_map=device
-    )
+        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
+    ).to(device)
     pipeline.transformer.set_attention_backend("_native_cudnn")
 
     cp_config = ContextParallelConfig(ring_degree=world_size)
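The doc change swaps `device_map=device` at load time for an explicit `.to(device)` after loading. A minimal sketch of the updated pattern, assuming the torch.distributed setup used elsewhere in distributed_inference.md (the process-group and rank handling here are illustrative, not part of the diff):

    import torch
    import torch.distributed as dist

    from diffusers import ContextParallelConfig, DiffusionPipeline

    def main():
        dist.init_process_group("nccl")
        rank = dist.get_rank()
        device = torch.device("cuda", rank % torch.cuda.device_count())
        world_size = dist.get_world_size()

        # Load first, then move the whole pipeline to the local device.
        pipeline = DiffusionPipeline.from_pretrained(
            "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
        ).to(device)
        pipeline.transformer.set_attention_backend("_native_cudnn")

        # Context-parallel setup continues as in the surrounding example.
        cp_config = ContextParallelConfig(ring_degree=world_size)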

examples/community/pipeline_z_image_differential_img2img.py

Lines changed: 844 additions & 0 deletions
Large diffs are not rendered by default.

setup.py

Lines changed: 1 addition & 1 deletion
@@ -274,7 +274,7 @@ def run(self):
 
 setup(
     name="diffusers",
-    version="0.36.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.37.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     description="State-of-the-art diffusion in PyTorch and JAX.",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
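Only the dev version number changes here. The formats named in the comment can be checked with a short pattern; this validator is a hypothetical illustration, not part of setup.py:

    import re

    # Matches x.y.z, x.y.z.dev0, or x.y.z.rc1 (dots only, no dashes),
    # per the comment in setup.py above. Hypothetical helper.
    VERSION_PATTERN = re.compile(r"^\d+\.\d+\.\d+(?:\.dev0|\.rc\d+)?$")

    assert VERSION_PATTERN.match("0.37.0.dev0")      # this commit's version
    assert VERSION_PATTERN.match("0.37.0.rc1")       # release candidate
    assert VERSION_PATTERN.match("0.37.0")           # final release
    assert not VERSION_PATTERN.match("0.37.0-dev0")  # dashes are rejected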

src/diffusers/loaders/single_file_utils.py

Lines changed: 9 additions & 2 deletions
@@ -226,7 +226,8 @@
     "cosmos-2.0-v2w-14B": {"pretrained_model_name_or_path": "nvidia/Cosmos-Predict2-14B-Video2World"},
     "z-image-turbo": {"pretrained_model_name_or_path": "Tongyi-MAI/Z-Image-Turbo"},
     "z-image-turbo-controlnet": {"pretrained_model_name_or_path": "hlky/Z-Image-Turbo-Fun-Controlnet-Union"},
-    "z-image-turbo-controlnet-2.x": {"pretrained_model_name_or_path": "hlky/Z-Image-Turbo-Fun-Controlnet-Union-2.1"},
+    "z-image-turbo-controlnet-2.0": {"pretrained_model_name_or_path": "hlky/Z-Image-Turbo-Fun-Controlnet-Union-2.0"},
+    "z-image-turbo-controlnet-2.1": {"pretrained_model_name_or_path": "hlky/Z-Image-Turbo-Fun-Controlnet-Union-2.1"},
 }
 
 # Use to configure model sample size when original config is provided
@@ -784,7 +785,13 @@ def infer_diffusers_model_type(checkpoint):
         raise ValueError(f"Unexpected x_embedder shape: {x_embedder_shape} when loading Cosmos 2.0 model.")
 
     elif CHECKPOINT_KEY_NAMES["z-image-turbo-controlnet-2.x"] in checkpoint:
-        model_type = "z-image-turbo-controlnet-2.x"
+        before_proj_weight = checkpoint.get("control_noise_refiner.0.before_proj.weight", None)
+        if before_proj_weight is None:
+            model_type = "z-image-turbo-controlnet-2.0"
+        elif before_proj_weight is not None and torch.all(before_proj_weight == 0.0):
+            model_type = "z-image-turbo-controlnet-2.0"
+        else:
+            model_type = "z-image-turbo-controlnet-2.1"
 
     elif CHECKPOINT_KEY_NAMES["z-image-turbo-controlnet"] in checkpoint:
         model_type = "z-image-turbo-controlnet"
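The new branch tells the two ControlNet-Union checkpoints apart by inspecting `control_noise_refiner.0.before_proj.weight`: absent or all-zero means a 2.0 checkpoint, a trained non-zero weight means 2.1. A condensed sketch of the same rule against a toy state dict (the tensors here are illustrative):

    import torch

    def classify_z_image_controlnet(checkpoint: dict) -> str:
        weight = checkpoint.get("control_noise_refiner.0.before_proj.weight")
        # Missing or all-zero before_proj weight -> 2.0; otherwise -> 2.1.
        if weight is None or torch.all(weight == 0.0):
            return "z-image-turbo-controlnet-2.0"
        return "z-image-turbo-controlnet-2.1"

    print(classify_z_image_controlnet({}))
    # -> z-image-turbo-controlnet-2.0
    print(classify_z_image_controlnet(
        {"control_noise_refiner.0.before_proj.weight": torch.zeros(4, 4)}
    ))
    # -> z-image-turbo-controlnet-2.0
    print(classify_z_image_controlnet(
        {"control_noise_refiner.0.before_proj.weight": torch.randn(4, 4)}
    ))
    # -> z-image-turbo-controlnet-2.1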

src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage.py

Lines changed: 2 additions & 23 deletions
@@ -27,7 +27,7 @@
 from ..activations import get_activation
 from ..modeling_outputs import AutoencoderKLOutput
 from ..modeling_utils import ModelMixin
-from .vae import DecoderOutput, DiagonalGaussianDistribution
+from .vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -410,7 +410,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return h
 
 
-class AutoencoderKLHunyuanImage(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+class AutoencoderKLHunyuanImage(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalModelMixin):
     r"""
     A VAE model for 2D images with spatial tiling support.
 
@@ -486,27 +486,6 @@ def enable_tiling(
         self.tile_overlap_factor = tile_overlap_factor or self.tile_overlap_factor
         self.tile_latent_min_size = self.tile_sample_min_size // self.config.spatial_compression_ratio
 
-    def disable_tiling(self) -> None:
-        r"""
-        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
-        decoding in one step.
-        """
-        self.use_tiling = False
-
-    def enable_slicing(self) -> None:
-        r"""
-        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
-        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
-        """
-        self.use_slicing = True
-
-    def disable_slicing(self) -> None:
-        r"""
-        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
-        decoding in one step.
-        """
-        self.use_slicing = False
-
     def _encode(self, x: torch.Tensor):
 
         batch_size, num_channels, height, width = x.shape

src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py

Lines changed: 2 additions & 23 deletions
@@ -26,7 +26,7 @@
 from ..activations import get_activation
 from ..modeling_outputs import AutoencoderKLOutput
 from ..modeling_utils import ModelMixin
-from .vae import DecoderOutput, DiagonalGaussianDistribution
+from .vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -584,7 +584,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-class AutoencoderKLHunyuanImageRefiner(ModelMixin, ConfigMixin):
+class AutoencoderKLHunyuanImageRefiner(ModelMixin, AutoencoderMixin, ConfigMixin):
     r"""
     A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos. Used for
     HunyuanImage-2.1 Refiner.
@@ -685,27 +685,6 @@ def enable_tiling(
         self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
         self.tile_overlap_factor = tile_overlap_factor or self.tile_overlap_factor
 
-    def disable_tiling(self) -> None:
-        r"""
-        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
-        decoding in one step.
-        """
-        self.use_tiling = False
-
-    def enable_slicing(self) -> None:
-        r"""
-        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
-        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
-        """
-        self.use_slicing = True
-
-    def disable_slicing(self) -> None:
-        r"""
-        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
-        decoding in one step.
-        """
-        self.use_slicing = False
-
     def _encode(self, x: torch.Tensor) -> torch.Tensor:
         _, _, _, height, width = x.shape

src/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.py

Lines changed: 2 additions & 23 deletions
@@ -26,7 +26,7 @@
 from ..activations import get_activation
 from ..modeling_outputs import AutoencoderKLOutput
 from ..modeling_utils import ModelMixin
-from .vae import DecoderOutput, DiagonalGaussianDistribution
+from .vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -625,7 +625,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-class AutoencoderKLHunyuanVideo15(ModelMixin, ConfigMixin):
+class AutoencoderKLHunyuanVideo15(ModelMixin, AutoencoderMixin, ConfigMixin):
     r"""
     A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos. Used for
     HunyuanVideo-1.5.
@@ -723,27 +723,6 @@ def enable_tiling(
         self.tile_latent_min_width = tile_latent_min_width or self.tile_latent_min_width
         self.tile_overlap_factor = tile_overlap_factor or self.tile_overlap_factor
 
-    def disable_tiling(self) -> None:
-        r"""
-        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
-        decoding in one step.
-        """
-        self.use_tiling = False
-
-    def enable_slicing(self) -> None:
-        r"""
-        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
-        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
-        """
-        self.use_slicing = True
-
-    def disable_slicing(self) -> None:
-        r"""
-        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
-        decoding in one step.
-        """
-        self.use_slicing = False
-
     def _encode(self, x: torch.Tensor) -> torch.Tensor:
         _, _, _, height, width = x.shape
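All three autoencoders in this commit (`AutoencoderKLHunyuanImage`, `AutoencoderKLHunyuanImageRefiner`, `AutoencoderKLHunyuanVideo15`) drop their copy-pasted tiling/slicing toggles and inherit them from `AutoencoderMixin` in `vae.py` instead. A minimal sketch of what such a mixin provides, inferred from the removed methods (the actual implementation in `vae.py` may carry more):

    class AutoencoderMixin:
        """Shared toggles for tiled and sliced VAE decoding, as inferred
        from the methods removed in the three diffs above."""

        def disable_tiling(self) -> None:
            # Go back to decoding in a single pass.
            self.use_tiling = False

        def enable_slicing(self) -> None:
            # Decode the input batch slice by slice to save memory and
            # allow larger batch sizes.
            self.use_slicing = True

        def disable_slicing(self) -> None:
            # Go back to decoding the whole batch in one step.
            self.use_slicing = False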

src/diffusers/models/transformers/transformer_wan.py

Lines changed: 9 additions & 5 deletions
@@ -134,7 +134,8 @@ def apply_rotary_emb(
                dropout_p=0.0,
                is_causal=False,
                backend=self._attention_backend,
-                parallel_config=self._parallel_config,
+                # Reference: https://github.com/huggingface/diffusers/pull/12909
+                parallel_config=None,
            )
            hidden_states_img = hidden_states_img.flatten(2, 3)
            hidden_states_img = hidden_states_img.type_as(query)
@@ -147,7 +148,8 @@ def apply_rotary_emb(
            dropout_p=0.0,
            is_causal=False,
            backend=self._attention_backend,
-            parallel_config=self._parallel_config,
+            # Reference: https://github.com/huggingface/diffusers/pull/12909
+            parallel_config=(self._parallel_config if encoder_hidden_states is None else None),
        )
        hidden_states = hidden_states.flatten(2, 3)
        hidden_states = hidden_states.type_as(query)
@@ -552,9 +554,11 @@ class WanTransformer3DModel(
         "blocks.0": {
             "hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
         },
-        "blocks.*": {
-            "encoder_hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
-        },
+        # Reference: https://github.com/huggingface/diffusers/pull/12909
+        # We need to disable the splitting of encoder_hidden_states because the image_encoder
+        # (Wan 2.1 I2V) consistently generates 257 tokens for image_embed. This causes the shape
+        # of encoder_hidden_states—whose token count is always 769 (512 + 257) after concatenation
+        # —to be indivisible by the number of devices in the CP.
         "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3),
         "": {
             "timestep": ContextParallelInput(split_dim=1, expected_dims=2, split_output=False),

src/diffusers/models/transformers/transformer_wan_animate.py

Lines changed: 4 additions & 2 deletions
@@ -609,7 +609,8 @@ def apply_rotary_emb(
                dropout_p=0.0,
                is_causal=False,
                backend=self._attention_backend,
-                parallel_config=self._parallel_config,
+                # Reference: https://github.com/huggingface/diffusers/pull/12909
+                parallel_config=None,
            )
            hidden_states_img = hidden_states_img.flatten(2, 3)
            hidden_states_img = hidden_states_img.type_as(query)
@@ -622,7 +623,8 @@ def apply_rotary_emb(
            dropout_p=0.0,
            is_causal=False,
            backend=self._attention_backend,
-            parallel_config=self._parallel_config,
+            # Reference: https://github.com/huggingface/diffusers/pull/12909
+            parallel_config=(self._parallel_config if encoder_hidden_states is None else None),
        )
        hidden_states = hidden_states.flatten(2, 3)
        hidden_states = hidden_states.type_as(query)

src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ def retrieve_latents(
 
         >>> model_id = "nvidia/Cosmos-Predict2.5-2B"
         >>> pipe = Cosmos2_5_PredictBasePipeline.from_pretrained(
-        ...     model_id, revision="diffusers/base/pre-trianed", torch_dtype=torch.bfloat16
+        ...     model_id, revision="diffusers/base/post-trained", torch_dtype=torch.bfloat16
         ... )
         >>> pipe = pipe.to("cuda")
 
