Skip to content

Commit 237d318

Browse files
yiyixuxu and sayakpaul
authored
Apply suggestions from code review
Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
1 parent 54f008e commit 237d318

3 files changed

Lines changed: 4 additions & 10 deletions

File tree

src/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -215,8 +215,6 @@ class HunyuanVideo15Downsample(nn.Module):
215215
def __init__(self, in_channels: int, out_channels: int, add_temporal_downsample: bool = True):
216216
super().__init__()
217217
factor = 2 * 2 * 2 if add_temporal_downsample else 1 * 2 * 2
218-
assert out_channels % factor == 0
219-
# self.conv = Conv3d(in_channels, out_channels // factor, kernel_size=3, stride=1, padding=1)
220218
self.conv = HunyuanVideo15CausalConv3d(in_channels, out_channels // factor, kernel_size=3)
221219

222220
self.add_temporal_downsample = add_temporal_downsample
@@ -531,7 +529,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
531529

532530
hidden_states = self.mid_block(hidden_states)
533531

534-
# short_cut = rearrange(hidden_states, "b (c r) f h w -> b c r f h w", r=self.group_size).mean(dim=2)
535532
batch_size, _, frame, height, width = hidden_states.shape
536533
short_cut = hidden_states.view(batch_size, -1, self.group_size, frame, height, width).mean(dim=2)
537534

@@ -546,7 +543,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
546543

547544
class HunyuanVideo15Decoder3D(nn.Module):
548545
r"""
549-
Causal decoder for 3D video-like data used for HunyuanImage-2.1 Refiner.
546+
Causal decoder for 3D video-like data used for HunyuanImage-1.5 Refiner.
550547
"""
551548

552549
def __init__(

src/diffusers/models/transformers/transformer_hunyuan_video15.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -184,10 +184,7 @@ class HunyuanVideo15TimeEmbedding(nn.Module):
184184
The dimension of the output embedding.
185185
"""
186186

187-
def __init__(
188-
self,
189-
embedding_dim: int,
190-
):
187+
def __init__(self, embedding_dim: int):
191188
super().__init__()
192189

193190
self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
@@ -362,7 +359,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
362359
rope_sizes = [num_frames // self.patch_size_t, height // self.patch_size, width // self.patch_size]
363360

364361
axes_grids = []
365-
for i in range(3):
362+
for i in range(len(rope_sizes)):
366363
# Note: The following line diverges from original behaviour. We create the grid on the device, whereas
367364
# original implementation creates it on CPU and then moves it to device. This results in numerical
368365
# differences in layerwise debugging outputs, but visually it is the same.

src/diffusers/pipelines/hunyuan_video1_5/image_processor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def generate_crop_size_list(base_size=256, patch_size=16, max_ratio=4.0):
3434
return crop_size_list
3535

3636

37-
# copied fromhttps://github.com/Tencent-Hunyuan/HunyuanVideo-1.5/blob/main/hyvideo/utils/data_utils.py#L38
37+
# copied from https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5/blob/main/hyvideo/utils/data_utils.py#L38
3838
def get_closest_ratio(height: float, width: float, ratios: list, buckets: list):
3939
"""
4040
Get the closest ratio in the buckets.

0 commit comments

Comments (0)