
Commit 9354355

Formatting
Signed-off-by: Kunjan Patel <kunjan@ucla.edu>
1 parent db66db1 commit 9354355

5 files changed: 98 additions & 76 deletions

Note: this is a formatting pass, so several hunks below differ only in whitespace; those appear as paired -/+ lines with identical visible text.


src/maxdiffusion/checkpointing/checkpointing_utils.py

Lines changed: 4 additions & 8 deletions
@@ -17,11 +17,11 @@
 
 """Create an Orbax CheckpointManager with specified (Async or not) Checkpointer."""
 
-from typing import Optional, Tuple
+from typing import Optional, Tuple
 import jax
 import numpy as np
 import os
-from jaxtyping import PyTree
+from jaxtyping import PyTree
 import orbax.checkpoint
 from maxdiffusion import max_logging
 from etils import epath
@@ -137,7 +137,7 @@ def load_params_from_path(
     unboxed_abstract_params,
     checkpoint_item: str,
     step: Optional[int] = None,
-    checkpoint_item_config: Optional[str] = None
+    checkpoint_item_config: Optional[str] = None,
 ):
   ckptr = ocp.PyTreeCheckpointer()
 
@@ -153,11 +153,7 @@ def load_params_from_path(
 
   restore_args = ocp.checkpoint_utils.construct_restore_args(unboxed_abstract_params)
   restored = ckptr.restore(
-      ckpt_path,
-      item={"params": unboxed_abstract_params},
-      transforms={},
-      restore_args={
-          "params": restore_args}
+      ckpt_path, item={"params": unboxed_abstract_params}, transforms={}, restore_args={"params": restore_args}
   )
   return restored["params"]
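For context, the collapsed `restore(...)` call is Orbax's PyTree restore pattern: build restore args from an abstract params tree, then restore a `{"params": ...}` item. A minimal runnable sketch of that pattern, with an invented checkpoint path and params tree:

import tempfile

import jax
import jax.numpy as jnp
import orbax.checkpoint as ocp

# Hypothetical checkpoint location and a tiny params tree, for illustration.
ckpt_path = tempfile.mkdtemp() + "/example_ckpt"
params = {"dense": {"kernel": jnp.ones((4, 8))}}

ckptr = ocp.PyTreeCheckpointer()
ckptr.save(ckpt_path, {"params": params})  # write something to restore

# An abstract tree carries structure/shape/dtype only; restore_args are
# derived from it, as in load_params_from_path above.
abstract_params = jax.tree_util.tree_map(ocp.utils.to_shape_dtype_struct, params)
restore_args = ocp.checkpoint_utils.construct_restore_args(abstract_params)
restored = ckptr.restore(
    ckpt_path, item={"params": abstract_params}, transforms={}, restore_args={"params": restore_args}
)
assert restored["params"]["dense"]["kernel"].shape == (4, 8)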

src/maxdiffusion/checkpointing/wan_checkpointer.py

Lines changed: 77 additions & 59 deletions
@@ -19,7 +19,7 @@
 
 import jax
 import numpy as np
-from maxdiffusion.checkpointing.checkpointing_utils import (create_orbax_checkpoint_manager, load_params_from_path)
+from maxdiffusion.checkpointing.checkpointing_utils import (create_orbax_checkpoint_manager)
 from ..pipelines.wan.wan_pipeline import WanPipeline
 from .. import max_logging, max_utils
 import orbax.checkpoint as ocp
@@ -57,18 +57,16 @@ def load_wan_configs_from_orbax(self, step):
       return None
     max_logging.log(f"Loading WAN checkpoint from step {step}")
     metadatas = self.checkpoint_manager.item_metadata(step)
-
+
     transformer_metadata = metadatas.wan_state
-    abstract_tree_structure_params = jax.tree_util.tree_map(
-        ocp.utils.to_shape_dtype_struct, transformer_metadata
-    )
+    abstract_tree_structure_params = jax.tree_util.tree_map(ocp.utils.to_shape_dtype_struct, transformer_metadata)
     params_restore = ocp.args.PyTreeRestore(
         restore_args=jax.tree.map(
             lambda _: ocp.RestoreArgs(restore_type=np.ndarray),
             abstract_tree_structure_params,
         )
     )
-
+
     max_logging.log("Restoring WAN checkpoint")
     restored_checkpoint = self.checkpoint_manager.restore(
         directory=epath.Path(self.config.checkpoint_dir),
@@ -77,7 +75,7 @@ def load_wan_configs_from_orbax(self, step):
             wan_state=params_restore,
             # wan_state=params_restore_util_way,
             wan_config=ocp.args.JsonRestore(),
-        ),
+        ),
     )
     return restored_checkpoint
 
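The hunk above restores a composite checkpoint: a `PyTreeRestore` for the transformer state (forced back to host `np.ndarray`s) alongside a `JsonRestore` for the config. A minimal sketch of the same round trip, with an invented directory, step, and contents; only the item names mirror the diff:

import tempfile

import jax
import numpy as np
import orbax.checkpoint as ocp

ckpt_dir = tempfile.mkdtemp()  # hypothetical checkpoint root
mngr = ocp.CheckpointManager(ckpt_dir)

state = {"w": np.ones((2, 3), dtype=np.float32)}
config = {"num_layers": 2}

# Save weights as a PyTree and the config as JSON under one step.
mngr.save(0, args=ocp.args.Composite(
    wan_state=ocp.args.PyTreeSave(state),
    wan_config=ocp.args.JsonSave(config),
))
mngr.wait_until_finished()

# restore_type=np.ndarray pulls weights back as host numpy arrays rather
# than device arrays, matching the RestoreArgs in the hunk above.
restored = mngr.restore(0, args=ocp.args.Composite(
    wan_state=ocp.args.PyTreeRestore(
        restore_args=jax.tree.map(lambda _: ocp.RestoreArgs(restore_type=np.ndarray), state)
    ),
    wan_config=ocp.args.JsonRestore(),
))
assert restored.wan_config == {"num_layers": 2}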

@@ -96,14 +94,16 @@ def load_checkpoint(self, step=None):
       pipeline = self.load_diffusers_checkpoint()
 
     return pipeline
-
+
   def save_checkpoint(self, train_step, pipeline: WanPipeline, train_states: dict):
     """Saves the training state and model configurations."""
+
     def config_to_json(model_or_config):
       return json.loads(model_or_config.to_json_string())
+
     max_logging.log(f"Saving checkpoint for step {train_step}")
     items = {
-      "wan_config": ocp.args.JsonSave(config_to_json(pipeline.transformer)),
+        "wan_config": ocp.args.JsonSave(config_to_json(pipeline.transformer)),
     }
 
     items["wan_state"] = ocp.args.PyTreeSave(train_states)
@@ -112,54 +112,72 @@ def config_to_json(model_or_config):
     self.checkpoint_manager.save(train_step, args=ocp.args.Composite(**items))
     max_logging.log(f"Checkpoint for step {train_step} saved.")
 
-  def save_checkpoint_orig(self, train_step, pipeline: WanPipeline, train_states: dict):
-    """Saves the training state and model configurations."""
-    def config_to_json(model_or_config):
-      """
-      only save the config that is needed and can be serialized to JSON.
-      """
-      if not hasattr(model_or_config, "config"):
-        return None
-      source_config = dict(model_or_config.config)
-
-      # 1. configs that can be serialized to JSON
-      SAFE_KEYS = [
-          '_class_name', '_diffusers_version', 'model_type', 'patch_size',
-          'num_attention_heads', 'attention_head_dim', 'in_channels',
-          'out_channels', 'text_dim', 'freq_dim', 'ffn_dim', 'num_layers',
-          'cross_attn_norm', 'qk_norm', 'eps', 'image_dim',
-          'added_kv_proj_dim', 'rope_max_seq_len', 'pos_embed_seq_len',
-          'flash_min_seq_length', 'flash_block_sizes', 'attention',
-          '_use_default_values'
-      ]
-
-      # 2. save the config that are in the SAFE_KEYS list
-      clean_config = {}
-      for key in SAFE_KEYS:
-        if key in source_config:
-          clean_config[key] = source_config[key]
-
-      # 3. deal with special data type and precision
-      if 'dtype' in source_config and hasattr(source_config['dtype'], 'name'):
-        clean_config['dtype'] = source_config['dtype'].name  # e.g 'bfloat16'
-
-      if 'weights_dtype' in source_config and hasattr(source_config['weights_dtype'], 'name'):
-        clean_config['weights_dtype'] = source_config['weights_dtype'].name
-
-      if 'precision' in source_config and isinstance(source_config['precision'], Precision):
-        clean_config['precision'] = source_config['precision'].name  # e.g. 'HIGHEST'
-
-      return clean_config
-
-    items_to_save = {
-        "transformer_config": ocp.args.JsonSave(config_to_json(pipeline.transformer)),
-    }
-
-    items_to_save["transformer_states"] = ocp.args.PyTreeSave(train_states)
-
-    # Create CompositeArgs for Orbax
-    save_args = ocp.args.Composite(**items_to_save)
 
-    # Save the checkpoint
-    self.checkpoint_manager.save(train_step, args=save_args)
-    max_logging.log(f"Checkpoint for step {train_step} saved.")
+  def save_checkpoint_orig(self, train_step, pipeline: WanPipeline, train_states: dict):
+    """Saves the training state and model configurations."""
+
+    def config_to_json(model_or_config):
+      """
+      only save the config that is needed and can be serialized to JSON.
+      """
+      if not hasattr(model_or_config, "config"):
+        return None
+      source_config = dict(model_or_config.config)
+
+      # 1. configs that can be serialized to JSON
+      SAFE_KEYS = [
+          "_class_name",
+          "_diffusers_version",
+          "model_type",
+          "patch_size",
+          "num_attention_heads",
+          "attention_head_dim",
+          "in_channels",
+          "out_channels",
+          "text_dim",
+          "freq_dim",
+          "ffn_dim",
+          "num_layers",
+          "cross_attn_norm",
+          "qk_norm",
+          "eps",
+          "image_dim",
+          "added_kv_proj_dim",
+          "rope_max_seq_len",
+          "pos_embed_seq_len",
+          "flash_min_seq_length",
+          "flash_block_sizes",
+          "attention",
+          "_use_default_values",
+      ]
+
+      # 2. save the config that are in the SAFE_KEYS list
+      clean_config = {}
+      for key in SAFE_KEYS:
+        if key in source_config:
+          clean_config[key] = source_config[key]
+
+      # 3. deal with special data type and precision
+      if "dtype" in source_config and hasattr(source_config["dtype"], "name"):
+        clean_config["dtype"] = source_config["dtype"].name  # e.g 'bfloat16'
+
+      if "weights_dtype" in source_config and hasattr(source_config["weights_dtype"], "name"):
+        clean_config["weights_dtype"] = source_config["weights_dtype"].name
+
+      if "precision" in source_config and isinstance(source_config["precision"], Precision):
+        clean_config["precision"] = source_config["precision"].name  # e.g. 'HIGHEST'
+
+      return clean_config
+
+    items_to_save = {
+        "transformer_config": ocp.args.JsonSave(config_to_json(pipeline.transformer)),
+    }
+
+    items_to_save["transformer_states"] = ocp.args.PyTreeSave(train_states)
+
+    # Create CompositeArgs for Orbax
+    save_args = ocp.args.Composite(**items_to_save)
+
+    # Save the checkpoint
+    self.checkpoint_manager.save(train_step, args=save_args)
+    max_logging.log(f"Checkpoint for step {train_step} saved.")
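The reformatted `save_checkpoint_orig` keeps only whitelisted, JSON-serializable config entries and converts enum-like values to their names before saving. A condensed sketch of that sanitization, with an invented config (`Precision` is assumed to be `jax.lax.Precision`, and the whitelist is abbreviated):

import json

import jax.numpy as jnp
import numpy as np
from jax.lax import Precision

SAFE_KEYS = ["num_layers", "patch_size"]  # abbreviated whitelist, for illustration

def sanitize_config(source_config: dict) -> dict:
  # Keep only whitelisted keys, then replace enum-like values with names.
  clean = {k: source_config[k] for k in SAFE_KEYS if k in source_config}
  if "dtype" in source_config and hasattr(source_config["dtype"], "name"):
    clean["dtype"] = source_config["dtype"].name  # e.g. "bfloat16"
  if "precision" in source_config and isinstance(source_config["precision"], Precision):
    clean["precision"] = source_config["precision"].name  # e.g. "HIGHEST"
  return clean

cfg = {"num_layers": 2, "dtype": np.dtype(jnp.bfloat16), "precision": Precision.HIGHEST}
print(json.dumps(sanitize_config(cfg)))  # every value is now JSON-serializable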

src/maxdiffusion/generate_wan.py

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@
 def run(config, pipeline=None, filename_prefix=""):
   print("seed: ", config.seed)
   from maxdiffusion.checkpointing.wan_checkpointer import WanCheckpointer
+
   checkpoint_loader = WanCheckpointer(config, "WAN_CHECKPOINT")
   pipeline = checkpoint_loader.load_checkpoint()
   if pipeline is None:
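For orientation, the surrounding code follows a load-or-fallback pattern. A sketch under stated assumptions: only the loader construction and the `if pipeline is None:` check are visible in this diff, and the fallback call is an assumption modeled on `load_diffusers_checkpoint` from the checkpointer above:

from maxdiffusion.checkpointing.wan_checkpointer import WanCheckpointer

def load_pipeline(config):
  # Try the local Orbax checkpoint first.
  checkpoint_loader = WanCheckpointer(config, "WAN_CHECKPOINT")
  pipeline = checkpoint_loader.load_checkpoint()
  if pipeline is None:
    # Hypothetical fallback: load published diffusers weights instead.
    pipeline = checkpoint_loader.load_diffusers_checkpoint()
  return pipeline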

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 14 additions & 6 deletions
@@ -66,7 +66,9 @@ def _add_sharding_rule(vs: nnx.VariableState, logical_axis_rules) -> nnx.Variabl
 
 
 # For some reason, jitting this function increases the memory significantly, so instead manually move weights to device.
-def create_sharded_logical_transformer(devices_array: np.array, mesh: Mesh, rngs: nnx.Rngs, config: HyperParameters, restored_checkpoint=None):
+def create_sharded_logical_transformer(
+    devices_array: np.array, mesh: Mesh, rngs: nnx.Rngs, config: HyperParameters, restored_checkpoint=None
+):
 
   def create_model(rngs: nnx.Rngs, wan_config: dict):
     wan_transformer = WanModel(**wan_config, rngs=rngs)
@@ -110,7 +112,7 @@ def create_model(rngs: nnx.Rngs, wan_config: dict):
   )
   params = jax.tree_util.tree_map(lambda x: x.astype(config.weights_dtype), params)
   for path, val in flax.traverse_util.flatten_dict(params).items():
-    if restored_checkpoint:
+    if restored_checkpoint:
       path = path[:-1]
     sharding = logical_state_sharding[path].value
     state[path].value = device_put_replicated(val, sharding)
@@ -303,9 +305,13 @@ def quantize_transformer(cls, config: HyperParameters, model: WanModel, pipeline
     return quantized_model
 
   @classmethod
-  def load_transformer(cls, devices_array: np.array, mesh: Mesh, rngs: nnx.Rngs, config: HyperParameters, restored_checkpoint=None):
+  def load_transformer(
+      cls, devices_array: np.array, mesh: Mesh, rngs: nnx.Rngs, config: HyperParameters, restored_checkpoint=None
+  ):
     with mesh:
-      wan_transformer = create_sharded_logical_transformer(devices_array=devices_array, mesh=mesh, rngs=rngs, config=config, restored_checkpoint=restored_checkpoint)
+      wan_transformer = create_sharded_logical_transformer(
+          devices_array=devices_array, mesh=mesh, rngs=rngs, config=config, restored_checkpoint=restored_checkpoint
+      )
     return wan_transformer
 
   @classmethod
@@ -331,7 +337,9 @@ def from_checkpoint(cls, config: HyperParameters, restored_checkpoint=None, vae_
     if not vae_only:
       if load_transformer:
         with mesh:
-          transformer = cls.load_transformer(devices_array=devices_array, mesh=mesh, rngs=rngs, config=config, restored_checkpoint=restored_checkpoint)
+          transformer = cls.load_transformer(
+              devices_array=devices_array, mesh=mesh, rngs=rngs, config=config, restored_checkpoint=restored_checkpoint
+          )
 
     text_encoder = cls.load_text_encoder(config=config)
     tokenizer = cls.load_tokenizer(config=config)
@@ -353,7 +361,7 @@ def from_checkpoint(cls, config: HyperParameters, restored_checkpoint=None, vae_
         mesh=mesh,
         config=config,
     )
-
+
   @classmethod
   def from_pretrained(cls, config: HyperParameters, vae_only=False, load_transformer=True):
     devices_array = max_utils.create_device_mesh(config)
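The rewrapped signatures above all feed the manual device-placement path: per the comment, the state-creation function is deliberately not jitted, and weights are cast and placed explicitly instead. A minimal sketch of explicit placement with an explicit sharding, assuming a single "data" mesh axis and an invented params tree (the repo's `device_put_replicated` helper is not shown in this diff; plain `jax.device_put` stands in for it):

import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec

# Build a 1-D mesh over all available devices.
devices_array = np.array(jax.devices())
mesh = Mesh(devices_array, axis_names=("data",))

params = {"proj": {"kernel": jnp.ones((8, 16))}}  # hypothetical weights

# An empty PartitionSpec replicates each leaf across the mesh; placing
# weights this way avoids staging the whole creation function through jit.
replicated = NamedSharding(mesh, PartitionSpec())
params = jax.tree_util.tree_map(lambda x: jax.device_put(x, replicated), params)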

src/maxdiffusion/trainers/wan_trainer.py

Lines changed: 2 additions & 3 deletions
@@ -149,8 +149,7 @@ def start_training(self):
 
     pipeline = self.load_checkpoint()
     # Generate a sample before training to compare against generated sample after training.
-    # UNCOMMENT
-    # pretrained_video_path = generate_sample(self.config, pipeline, filename_prefix="pre-training-")
+    pretrained_video_path = generate_sample(self.config, pipeline, filename_prefix="pre-training-")
 
     # save some memory.
     del pipeline.vae
@@ -168,7 +167,7 @@ def start_training(self):
     pipeline = self.training_loop(pipeline, optimizer, learning_rate_scheduler, train_data_iterator)
 
     posttrained_video_path = generate_sample(self.config, pipeline, filename_prefix="post-training-")
-    # print_ssim(pretrained_video_path, posttrained_video_path)
+    print_ssim(pretrained_video_path, posttrained_video_path)
 
   def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data_iterator):
     mesh = pipeline.mesh
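With the pre-training sample re-enabled, `print_ssim` can compare the pre- and post-training videos. Its implementation is not shown in this diff; a minimal sketch of a frame-wise SSIM comparison using scikit-image (an assumption, not the repository's code):

import numpy as np
from skimage.metrics import structural_similarity

def mean_video_ssim(video_a: np.ndarray, video_b: np.ndarray) -> float:
  """Average SSIM over frames; inputs are (frames, H, W, C) uint8 arrays."""
  scores = [
      structural_similarity(fa, fb, channel_axis=-1)
      for fa, fb in zip(video_a, video_b)
  ]
  return float(np.mean(scores))

# Example with synthetic frames: identical videos score 1.0.
a = np.random.randint(0, 255, (4, 64, 64, 3), dtype=np.uint8)
print(mean_video_ssim(a, a))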
