
Commit b90584c

wan training for single frame + bug fixes.
1 parent a60d235 commit b90584c

5 files changed

Lines changed: 52 additions & 53 deletions

File tree

src/maxdiffusion/checkpointing/wan_checkpointer.py
src/maxdiffusion/generate_wan.py
src/maxdiffusion/maxdiffusion_utils.py
src/maxdiffusion/pipelines/wan/wan_pipeline.py
src/maxdiffusion/trainers/wan_trainer.py

src/maxdiffusion/checkpointing/wan_checkpointer.py

Lines changed: 2 additions & 7 deletions
@@ -15,6 +15,7 @@
 """

 from abc import ABC
+import jax
 from flax import nnx
 from maxdiffusion.checkpointing.checkpointing_utils import (create_orbax_checkpoint_manager)
 from ..pipelines.wan.wan_pipeline import WanPipeline

@@ -35,20 +36,14 @@ def __init__(self, config, checkpoint_type):
         dataset_type=config.dataset_type
     )

-  # @nnx.jit
   def _create_optimizer(self, model, config, learning_rate):
     learning_rate_scheduler = max_utils.create_learning_rate_schedule(
         learning_rate, config.learning_rate_schedule_steps, config.warmup_steps_fraction, config.max_train_steps
     )
     tx = max_utils.create_optimizer(config, learning_rate_scheduler)
-    # tx = nnx.Optimizer(model, tx)
-
-    # _, state, rest_of_state = nnx.split((model, tx), ...)
-    # nnx.update((model, tx), state, rest_of_state)
-
-
     return nnx.Optimizer(model, tx), learning_rate_scheduler

+
   def load_wan_configs_from_orbax(self, step):
     max_logging.log("Restoring stable diffusion configs")
     if step is None:
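Note: `_create_optimizer` now builds the optax transform and wraps it in `nnx.Optimizer` in one step; the commented-out split/update experiments are dropped. A minimal sketch of the same wrapping, using a toy module in place of the WAN transformer:

import optax
from flax import nnx

# Toy stand-in for the WAN transformer; any nnx.Module works the same way.
model = nnx.Linear(4, 4, rngs=nnx.Rngs(0))

# Wrap an optax gradient transform so optimizer state is tracked as nnx
# state alongside the module, mirroring the return value above.
tx = optax.adamw(learning_rate=1e-5)
optimizer = nnx.Optimizer(model, tx)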

src/maxdiffusion/generate_wan.py

Lines changed: 10 additions & 7 deletions
@@ -29,10 +29,13 @@ def run(config):
   slg_layers = config.slg_layers
   slg_start = config.slg_start
   slg_end = config.slg_end
-
+
+  prompt = [config.prompt] * jax.device_count()
+  negative_prompt = [config.negative_prompt] * jax.device_count()
+
   videos = pipeline(
-      prompt=config.prompt,
-      negative_prompt=config.negative_prompt,
+      prompt=prompt,
+      negative_prompt=negative_prompt,
       height=config.height,
       width=config.width,
       num_frames=config.num_frames,
@@ -45,12 +48,12 @@ def run(config):

   print("compile time: ", (time.perf_counter() - s0))
   for i in range(len(videos)):
-    export_to_video(videos[i], f"wan_output_{config.seed}_{i}.mp4", fps=16)
+    export_to_video(videos[i], f"wan_output_{config.seed}_{i}.mp4", fps=config.fps)
   s0 = time.perf_counter()
   with jax.profiler.trace("/tmp/trace/"):
     videos = pipeline(
-        prompt=config.prompt,
-        negative_prompt=config.negative_prompt,
+        prompt=prompt,
+        negative_prompt=negative_prompt,
         height=config.height,
         width=config.width,
         num_frames=config.num_frames,
@@ -62,7 +65,7 @@ def run(config):
     )
   print("generation time: ", (time.perf_counter() - s0))
   for i in range(len(videos)):
-    export_to_video(videos[i], f"wan_output_{config.seed}_{i}.mp4", fps=16)
+    export_to_video(videos[i], f"wan_output_{config.seed}_{i}.mp4", fps=config.fps)


 def main(argv: Sequence[str]) -> None:
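Note: the prompts are now replicated once per device so the batch dimension lines up with the data-parallel device count, and the frame rate comes from config rather than a hardcoded 16. A small illustration of the replication (the prompt text is made up):

import jax

# One prompt per device: with the batch sharded across devices, the batch
# dimension must be divisible by jax.device_count().
prompt = ["a cat surfing a wave"] * jax.device_count()
negative_prompt = [""] * jax.device_count()
assert len(prompt) % jax.device_count() == 0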

src/maxdiffusion/maxdiffusion_utils.py

Lines changed: 26 additions & 0 deletions
@@ -286,6 +286,32 @@ def get_dummy_flux_inputs(config, pipeline, batch_size):

   return (latents, timesteps, latents_ids, guidance_vec, t5_hidden_states, t5_ids, clip_hidden_states)

+def get_dummy_wan_inputs(config, pipeline, batch_size):
+  latents = pipeline.prepare_latents(
+      batch_size,
+      vae_scale_factor_temporal=pipeline.vae_scale_factor_temporal,
+      vae_scale_factor_spatial=pipeline.vae_scale_factor_spatial,
+      height=config.height,
+      width=config.width,
+      num_frames=config.num_frames,
+      num_channels_latents=pipeline.transformer.config.in_channels
+  )
+  bsz = latents.shape[0]
+  prompt_embeds = jax.random.normal(jax.random.key(config.seed), (batch_size, 512, 4096))
+  timesteps = jnp.array([0] * bsz, dtype=jnp.int32)
+  return (latents, prompt_embeds, timesteps)
+
+def calculate_wan_tflops(config, pipeline, batch_size, rngs, train):
+  """
+  Calculates wan tflops.
+  batch_size should be per_device_batch_size * jax.local_device_count() or attention's shard_map won't
+  cache the compilation when flash is enabled.
+  """
+  (latents, prompt_embeds, timesteps) = get_dummy_wan_inputs(config, pipeline, batch_size)
+  return max_utils.calculate_model_tflops(
+      pipeline.transformer,
+
+  )

 def calculate_flux_tflops(config, pipeline, batch_size, rngs, train):
   """

src/maxdiffusion/pipelines/wan/wan_pipeline.py

Lines changed: 5 additions & 7 deletions
@@ -58,7 +58,7 @@ def _add_sharding_rule(vs: nnx.VariableState, logical_axis_rules) -> nnx.VariableState:
   return vs


-partial(nnx.jit, static_argnums=(3,))
+# For some reason, jitting this function increases the memory significantly, so instead manually move weights to device.
 def create_sharded_logical_transformer(devices_array: np.array, mesh: Mesh, rngs: nnx.Rngs, config: HyperParameters):

   def create_model(rngs: nnx.Rngs, wan_config: dict):
@@ -106,16 +106,15 @@ def create_model(rngs: nnx.Rngs, wan_config: dict):
   wan_transformer = nnx.merge(graphdef, state, rest_of_state)
   return wan_transformer

-
-partial(nnx.jit, static_argnums=(1,))
+@nnx.jit(static_argnums=(1,), donate_argnums=(0,))
 def create_sharded_logical_model(model, logical_axis_rules):
   graphdef, state, rest_of_state = nnx.split(model, nnx.Param, ...)
   p_add_sharding_rule = partial(_add_sharding_rule, logical_axis_rules=logical_axis_rules)
   state = jax.tree.map(p_add_sharding_rule, state, is_leaf=lambda x: isinstance(x, nnx.VariableState))
   pspecs = nnx.get_partition_spec(state)
   sharded_state = jax.lax.with_sharding_constraint(state, pspecs)
-  wan_transformer = nnx.merge(graphdef, sharded_state, rest_of_state)
-  return wan_transformer
+  model = nnx.merge(graphdef, sharded_state, rest_of_state)
+  return model


 class WanPipeline:

@@ -473,9 +472,8 @@ def transformer_forward_pass(
       encoder_hidden_states=prompt_embeds,
       is_uncond=is_uncond,
       slg_mask=slg_mask
-  )[0]
+  )

-#@partial(jax.jit, static_argnums=(6, 7, 8))
 def run_inference(
     graphdef,
     sharded_state,
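Note: `create_sharded_logical_model` is now jitted with `donate_argnums=(0,)`, so the unsharded weights are donated to the resharded output instead of briefly existing twice, while `create_sharded_logical_transformer` stays unjitted because jitting it increased memory. A standalone sketch of the donation pattern with plain `jax.jit` (mesh and shapes are illustrative):

import functools
import numpy as np
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

mesh = Mesh(np.array(jax.devices()), ("data",))

# Donating argument 0 lets XLA reuse the input buffers for the resharded
# output, avoiding a transient second copy of the weights.
@functools.partial(jax.jit, donate_argnums=(0,),
                   out_shardings=NamedSharding(mesh, P("data")))
def reshard(weights):
  return weights

weights = jnp.zeros((jax.device_count(), 1024))
weights = reshard(weights)  # original buffer is donated and invalidated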

src/maxdiffusion/trainers/wan_trainer.py

Lines changed: 9 additions & 32 deletions
@@ -23,7 +23,7 @@
 import jax.tree_util as jtu
 from flax import nnx
 from ..schedulers import FlaxEulerDiscreteScheduler
-from .. import max_utils, max_logging, train_utils
+from .. import max_utils, max_logging, train_utils, maxdiffusion_utils
 from ..checkpointing.wan_checkpointer import (
     WanCheckpointer,
     WAN_CHECKPOINT

@@ -64,36 +64,15 @@ def load_dataset(self, pipeline):
     # prompt embeds shape: (1, 512, 4096)
     # For now, we will pass the same latents over and over
     # TODO - create a dataset
-    prompt_embeds = jax.random.normal(jax.random.key(self.config.seed), (self.global_batch_size, 512, 4096))
-    latents = pipeline.prepare_latents(
-        self.global_batch_size,
-        vae_scale_factor_temporal=pipeline.vae_scale_factor_temporal,
-        vae_scale_factor_spatial=pipeline.vae_scale_factor_spatial,
-        height=self.config.height,
-        width=self.config.width,
-        num_frames=self.config.num_frames,
-        num_channels_latents=pipeline.transformer.config.in_channels
-    )
-    return (latents, prompt_embeds)
+    return maxdiffusion_utils.get_dummy_wan_inputs(self.config, pipeline, self.global_batch_size)

   def start_training(self):

     pipeline = self.load_checkpoint()
-    mesh = pipeline.mesh
-
-    optimizer, learning_rate_scheduler = self._create_optimizer(pipeline.transformer, self.config, self.config.learning_rate)
-
-    # @nnx.jit
-    # def create_transformer_state(transformer):
-    #   optimizer = self._create_optimizer(transformer, self.config, self.config.learning_rate)
-    #   breakpoint()
-    #   _, state = nnx.split((transformer, optimizer))
-
-    # with mesh:
-    #   create_transformer_state(pipeline.transformer)
-
-    #graphdef, state = nnx.plit((pipeline.transformer, optimizer))
+    del pipeline.vae
     dummy_inputs = self.load_dataset(pipeline)
+    mesh = pipeline.mesh
+    optimizer, learning_rate_scheduler = self._create_optimizer(pipeline.transformer, self.config, 1e-5)
     dummy_inputs = tuple([jtu.tree_map_with_path(functools.partial(_form_global_array, global_mesh=mesh), input) for input in dummy_inputs])
     self.training_loop(pipeline, optimizer, learning_rate_scheduler, dummy_inputs)

@@ -116,7 +95,7 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, data):
     state = state.to_pure_dict()
     p_train_step = jax.jit(
         train_step,
-        donate_argnums=(1,),
+        donate_argnums=(0,),
     )
     rng = jax.random.key(self.config.seed)
     start_step = 0

@@ -137,7 +116,7 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, data):
       if self.config.enable_profiler and step == first_profiling_step:
         max_utils.activate_profiler(self.config)
       with jax.profiler.StepTraceAnnotation("train", step_num=step), pipeline.mesh:
-        state, train_metric, rng = p_train_step(graphdef, state, data, rng)
+        state, train_metric, rng = p_train_step(state, graphdef, data, rng)

       new_time = datetime.datetime.now()

@@ -151,15 +130,13 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, data):
       train_utils.write_metrics(writer, local_metrics_file, running_gcs_metrics, train_metric, step, self.config)
       last_step_completion = new_time

-def train_step(graphdef, state, data, rng):
+def train_step(state, graphdef, data, rng):
   return step_optimizer(graphdef, state, data, rng)

 def step_optimizer(graphdef, state, data, rng):
   _, new_rng = jax.random.split(rng)
   def loss_fn(model):
-    latents, prompt_embeds = data
-    bsz = latents.shape[0]
-    timesteps = jnp.array([0] * bsz, dtype=jnp.int32)
+    latents, prompt_embeds, timesteps = data

     noise = jax.random.normal(
         key=new_rng,
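Note: `train_step` now takes `state` first so it lines up with `donate_argnums=(0,)` in the `jax.jit` call above, letting each step's output state reuse the donated input buffers. A self-contained sketch of the pattern (the flat-array "state" is illustrative, not the trainer's real pytree):

import jax
import jax.numpy as jnp

# The donated argument must sit at positional index 0 to match
# donate_argnums=(0,).
def train_step(state, data):
  grads = jax.grad(lambda w: jnp.mean((w * data) ** 2))(state)
  return state - 1e-5 * grads  # same shape/dtype, so the buffer is reused

p_train_step = jax.jit(train_step, donate_argnums=(0,))

state = jnp.ones((1024,))
data = jnp.ones((1024,))
for _ in range(3):
  state = p_train_step(state, data)  # previous state buffer is invalidated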
