Skip to content

Commit 1ae4aba

Browse files
committed
Add ring attention kernel
Signed-off-by: Kunjan patel <kunjanp@google.com>
1 parent c5d9018 commit 1ae4aba

8 files changed

Lines changed: 5321 additions & 6 deletions

File tree

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ jit_initializers: True
5353
# Set true to load weights from pytorch
5454
from_pt: True
5555
split_head_dim: True
56-
attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te
56+
attention: 'ring_flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring_flash
5757

5858
flash_block_sizes: {}
5959
# Use on v6e
@@ -143,8 +143,8 @@ data_sharding: [['data', 'fsdp', 'tensor']]
143143
# By default, product of the DCN axes should equal number of slices
144144
# and product of the ICI axes should equal number of devices per slice.
145145
dcn_data_parallelism: 1 # recommended DCN axis to be auto-sharded
146-
dcn_fsdp_parallelism: -1
147-
dcn_tensor_parallelism: 1
146+
dcn_fsdp_parallelism: 4
147+
dcn_tensor_parallelism: 2
148148
ici_data_parallelism: 1
149149
ici_fsdp_parallelism: -1 # recommended ICI axis to be auto-sharded
150150
ici_tensor_parallelism: 1
@@ -217,11 +217,12 @@ adam_eps: 1.e-8 # A small constant applied to denominator outside of the square
217217
adam_weight_decay: 0 # AdamW Weight decay
218218
max_grad_norm: 1.0
219219

220-
enable_profiler: False
220+
enable_profiler: True
221221
# Skip first n steps for profiling, to omit things like compilation and to give
222222
# the iteration time a chance to stabilize.
223223
skip_first_n_steps_for_profiler: 5
224224
profiler_steps: 10
225+
tensorboard_dir: /home/kunjanp_google_com/wan-21-md/maxdiffusion/.trace/flash-tp
225226

226227
# Generation parameters
227228
prompt: "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."

src/maxdiffusion/models/attention_flax.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,8 @@ def _tpu_ring_flash_attention_v1(
166166
dtype: jnp.dtype = jnp.float32,
167167
) -> jax.Array:
168168
"""TPU Ring Flash Attention with correct padding, transposition, and sharding."""
169-
from ringattention import ringattention
169+
# from ringattention import ringattention
170+
from maxdiffusion.models.ringattention.ringattention_pallas_tpu import ring_flash_attention_tpu as ringattention
170171
from einops import rearrange
171172

172173
max_block_size = 1024 if dtype == jnp.bfloat16 else 512
@@ -255,7 +256,7 @@ def _tpu_ring_flash_attention(
255256
usp_degree: Optional[int] = 1,
256257
) -> jax.Array:
257258
"""TPU Ring/USP Flash Attention with correct padding, transposition, and sharding."""
258-
from ringattention import ringattention
259+
from maxdiffusion.models.ringattention.ringattention_pallas_tpu import ring_flash_attention_tpu as ringattention
259260

260261
max_block_size = 1024 if dtype == jnp.bfloat16 else 512
261262
blockwise_kwargs = {

0 commit comments

Comments (0)