AI-Hypercomputer
diff --git a/src/maxdiffusion/checkpointing/checkpointing_utils.py b/src/maxdiffusion/checkpointing/checkpointing_utils.py
Lines changed: 7 additions & 2 deletions
diff --git a/src/maxdiffusion/configs/ltx_video.yml b/src/maxdiffusion/configs/ltx_video.yml
Lines changed: 1 addition & 0 deletions
diff --git a/src/maxdiffusion/generate_ltx_video.py b/src/maxdiffusion/generate_ltx_video.py
Lines changed: 171 additions & 14 deletions
diff --git a/src/maxdiffusion/max_utils.py b/src/maxdiffusion/max_utils.py
Lines changed: 5 additions & 1 deletion
diff --git a/src/maxdiffusion/models/ltx_video/main.py b/src/maxdiffusion/models/ltx_video/main.py
Lines changed: 13 additions & 31 deletions
diff --git a/src/maxdiffusion/models/ltx_video/transformers/attention.py b/src/maxdiffusion/models/ltx_video/transformers/attention.py
Lines changed: 20 additions & 6 deletions
@@ -213,8 +213,13 @@ def load_state_if_possible(
     max_logging.log(f"restoring from this run's directory latest step {latest_step}")
     try:
       if not enable_single_replica_ckpt_restoring:
-        item = {checkpoint_item: orbax.checkpoint.args.PyTreeRestore(item=abstract_unboxed_pre_state)}
-        return checkpoint_manager.restore(latest_step, args=orbax.checkpoint.args.Composite(**item))
+        # item = {checkpoint_item: orbax.checkpoint.args.PyTreeRestore(item=abstract_unboxed_pre_state)}
+        # return checkpoint_manager.restore(latest_step, args=orbax.checkpoint.args.Composite(**item))  #currently changed to this
+        if checkpoint_item == " ":
+          return checkpoint_manager.restore(latest_step, args=ocp.args.StandardRestore(abstract_unboxed_pre_state))
+        else:
+          item = {checkpoint_item: orbax.checkpoint.args.PyTreeRestore(item=abstract_unboxed_pre_state)}
+          return checkpoint_manager.restore(latest_step, args=orbax.checkpoint.args.Composite(**item))  #currently changed to this
 
       def map_to_pspec(data):
         pspec = data.sharding.spec
 
@@ -48,3 +48,4 @@ per_device_batch_size: 1
 compile_topology_num_slices: -1 
 quantization_local_shard_count: -1
 jit_initializers: True 
+enable_single_replica_ckpt_restoring: False
@@ -1,64 +1,211 @@
+from json import encoder
 from absl import app
 from typing import Sequence
 import jax
+from flax import linen as nn
 import json
+from flax.linen import partitioning as nn_partitioning
 from maxdiffusion.models.ltx_video.transformers.transformer3d import Transformer3DModel
 import os
 import functools
 import jax.numpy as jnp
-from maxdiffusion import FlaxAutoencoderKL, pyconfig, max_logging
+from maxdiffusion import pyconfig
 from maxdiffusion.max_utils import (
     create_device_mesh,
     setup_initial_state,
+    get_memory_allocations,
 )
-from jax.sharding import Mesh, PositionalSharding, PartitionSpec as P
+from jax.sharding import Mesh, PartitionSpec as P
+import orbax.checkpoint as ocp
 
 
-def validate_transformer_inputs(prompt_embeds, fractional_coords, latents, noise_cond):
+def validate_transformer_inputs(prompt_embeds, fractional_coords, latents, noise_cond, segment_ids, encoder_attention_segment_ids):
+  """Print the shape and dtype of every transformer input, one line per input.
+
+  Each argument only needs to expose `.shape` and `.dtype`; nothing is checked
+  or returned, the values are just written to stdout for inspection.
+  """
   print("prompts_embeds.shape: ", prompt_embeds.shape, prompt_embeds.dtype)
   print("fractional_coords.shape: ", fractional_coords.shape, fractional_coords.dtype)
   print("latents.shape: ", latents.shape, latents.dtype)
   print("noise_cond.shape: ", noise_cond.shape, noise_cond.dtype)
+  print("segment_ids.shape: ", segment_ids.shape, segment_ids.dtype)
+  print("encoder_attention_segment_ids.shape: ", encoder_attention_segment_ids.shape, encoder_attention_segment_ids.dtype)
+
+
+def loop_body(
+    step,
+    args,
+    transformer,
+    fractional_cords,
+    prompt_embeds,
+    segment_ids,
+    encoder_attention_segment_ids
+):
+  """Run one transformer forward pass; shaped to be used as a `jax.lax.fori_loop` body.
+
+  Args:
+    step: loop index supplied by the loop driver (unused here).
+    args: carry tuple `(latents, state, noise_cond)`.
+    transformer: module whose `apply` is called with `state.params`.
+    fractional_cords: passed to `apply` as `indices_grid`.
+    prompt_embeds: passed to `apply` as `encoder_hidden_states`.
+    segment_ids: passed through to `apply`.
+    encoder_attention_segment_ids: passed through to `apply`.
+
+  Returns:
+    The next carry `(noise_pred, state, noise_cond)`.
+  """
+  latents, state, noise_cond = args
+  noise_pred = transformer.apply(
+      {"params": state.params},
+      hidden_states=latents,
+      indices_grid=fractional_cords,
+      encoder_hidden_states=prompt_embeds,
+      timestep=noise_cond,
+      segment_ids=segment_ids,
+      encoder_attention_segment_ids=encoder_attention_segment_ids
+  )
+  # TODO: update `latents` from `noise_pred` once a scheduler is wired in. Until
+  # then the raw prediction takes the latents' slot in the carry.
+  return noise_pred, state, noise_cond
+
+
+
+def run_inference(
+    states, transformer, config, mesh, latents, fractional_cords, prompt_embeds, timestep, segment_ids, encoder_attention_segment_ids
+):
+  """Drive `loop_body` for a single iteration and return the first carry element.
+
+  `states["transformer"]` supplies the state placed in the loop carry. The loop
+  runs inside the `mesh` context with `config.logical_axis_rules` active.
+  Because `loop_body` currently puts its prediction in the latents' slot, the
+  value returned here is that prediction.
+  """
+  initial_carry = (latents, states["transformer"], timestep)
+  step_fn = functools.partial(
+      loop_body,
+      transformer=transformer,
+      fractional_cords=fractional_cords,
+      prompt_embeds=prompt_embeds,
+      segment_ids=segment_ids,
+      encoder_attention_segment_ids=encoder_attention_segment_ids,
+  )
+  ## TODO: add vae decode step
+  ## TODO: add loop
+  with mesh, nn_partitioning.axis_rules(config.logical_axis_rules):
+    final_carry = jax.lax.fori_loop(0, 1, step_fn, initial_carry)
+  return final_carry[0]
+  
 
 def run(config):
+  """Build the LTX-Video transformer, set up its state, and run one jitted forward pass on dummy inputs.
+
+  The function creates the device mesh, reads the model JSON config, constructs
+  `Transformer3DModel`, obtains the transformer state through
+  `setup_initial_state` (given an Orbax `CheckpointManager`), feeds all-ones
+  inputs through `run_inference` under `jax.jit`, and prints the result.
+
+  Args:
+    config: run configuration; `mesh_axes` and `data_sharding` are read here,
+      and the object is forwarded to `create_device_mesh`,
+      `setup_initial_state` and `run_inference`.
+  """
   key = jax.random.PRNGKey(0)
 
   devices_array = create_device_mesh(config) 
   mesh = Mesh(devices_array, config.mesh_axes)
 
-  batch_size, text_tokens, num_tokens, features = 4, 256, 2048, 128
   base_dir = os.path.dirname(__file__)
 
   ##load in model config
   config_path = os.path.join(base_dir, "models/ltx_video/xora_v1.2-13B-balanced-128.json")
   with open(config_path, "r") as f:
     model_config = json.load(f)
+  # Read the checkpoint location before "ckpt_path" is deleted from model_config below.
+  relative_ckpt_path = model_config["ckpt_path"]
 
+  # These JSON keys are removed so they are not forwarded to
+  # Transformer3DModel(**model_config); in_channels is saved first because it
+  # is still needed for init_weights and the dummy latents.
+  ignored_keys = ["_class_name", "_diffusers_version", "_name_or_path", "causal_temporal_positioning", "in_channels", "ckpt_path"]
+  in_channels = model_config["in_channels"]
+  for name in ignored_keys:
+    if name in model_config:
+      del model_config[name]
 
-  transformer = Transformer3DModel(**model_config, dtype=jnp.bfloat16, gradient_checkpointing="matmul_without_batch")
-  transformer_param_shapes = transformer.init_weights(key, batch_size, text_tokens, num_tokens, features, eval_only = False) 
+ 
+  transformer = Transformer3DModel(**model_config, dtype=jnp.float32, gradient_checkpointing="matmul_without_batch", sharding_mesh=mesh)
+  transformer_param_shapes = transformer.init_weights(in_channels, model_config['caption_channels'], eval_only = True) #use this to test!
 
-  key, split_key = jax.random.split(key)
   weights_init_fn = functools.partial(
       transformer.init_weights, 
-      split_key, 
-      batch_size, 
-      text_tokens, 
-      num_tokens, 
-      features, 
-      eval_only = False
+      in_channels, 
+      model_config['caption_channels'],
+      eval_only = True
   )
 
-  transformer_state, transformer_state_shardings = setup_initial_state(  
+  # NOTE(review): os.path.abspath resolves against the current working
+  # directory, not base_dir - confirm ckpt_path is meant to be CWD-relative.
+  absolute_ckpt_path = os.path.abspath(relative_ckpt_path)
+
+  checkpoint_manager = ocp.CheckpointManager(absolute_ckpt_path)
+  # checkpoint_item=" " is the sentinel this change adds to
+  # load_state_if_possible / setup_initial_state: the restored state is used
+  # as-is instead of being indexed by an item name.
+  transformer_state, transformer_state_shardings = setup_initial_state(
     model=transformer,
     tx=None,
     config=config,
     mesh=mesh,
     weights_init_fn=weights_init_fn,
+    checkpoint_manager=checkpoint_manager,
+    checkpoint_item=" ",
     model_params=None,
     training=False,
   )
+
+  
+  
+  
+  transformer_state = jax.device_put(transformer_state, transformer_state_shardings)
+  get_memory_allocations()
+
+  # Keyed by model name so run_inference can look up states["transformer"].
+  states = {}
+  state_shardings = {}
+
+  state_shardings["transformer"] = transformer_state_shardings
+  states["transformer"] = transformer_state
+
+  #create dummy inputs:
+  example_inputs = {}
+  batch_size, num_tokens = 4, 256
+  input_shapes = {
+    "latents": (batch_size, num_tokens, in_channels),
+    "fractional_coords": (batch_size, 3, num_tokens),
+    "prompt_embeds": (batch_size, 128, model_config["caption_channels"]),
+    "timestep": (batch_size, 256),   #TODO: add in the segment id stuff
+    "segment_ids": (batch_size, 256),
+    "encoder_attention_segment_ids": (batch_size, 128),
+  }
+  # NOTE(review): neither "attention_mask" nor "encoder_attention_mask" is a
+  # key of input_shapes, so the jnp.bool branch is never taken and every
+  # dummy input is float32.
+  for name, shape in input_shapes.items():
+    example_inputs[name] = jnp.ones(
+      shape, dtype=jnp.float32 if name not in ["attention_mask", "encoder_attention_mask"] else jnp.bool
+    )
+
+  # All dummy inputs are placed with the same NamedSharding built from config.data_sharding.
+  data_sharding = jax.sharding.NamedSharding(mesh, P(*config.data_sharding))
+  latents = jax.device_put(example_inputs["latents"], data_sharding)
+  prompt_embeds = jax.device_put(example_inputs["prompt_embeds"], data_sharding)
+  fractional_coords = jax.device_put(example_inputs["fractional_coords"], data_sharding)
+  noise_cond = jax.device_put(example_inputs["timestep"], data_sharding)
+  segment_ids = jax.device_put(example_inputs["segment_ids"], data_sharding)
+  encoder_attention_segment_ids = jax.device_put(example_inputs["encoder_attention_segment_ids"], data_sharding)
+
+  validate_transformer_inputs(prompt_embeds, fractional_coords, latents, noise_cond, segment_ids, encoder_attention_segment_ids)
+  # Everything except `states` is bound via functools.partial, so the jitted
+  # callable takes the state dict as its only positional argument, matching
+  # the one-element in_shardings tuple.
+  p_run_inference = jax.jit(
+      functools.partial(
+          run_inference,
+          transformer=transformer,
+          config=config,
+          mesh=mesh,
+          latents=latents,
+          fractional_cords=fractional_coords,
+          prompt_embeds=prompt_embeds,
+          timestep = noise_cond,
+          segment_ids=segment_ids,
+          encoder_attention_segment_ids=encoder_attention_segment_ids
+      ),
+      in_shardings=(state_shardings,),
+      out_shardings=None,
+  )
+  noise_pred = p_run_inference(states).block_until_ready()
+  print(noise_pred)  #(4, 256, 128)
+
+
+
+
+
+
+  
+
+
+
+
+
+
+
+
 
+
+
+
+
+  
+
+  
+
+  
+
+
+
+
+
+
+
+
 
 
 def main(argv: Sequence[str]) -> None:
@@ -71,3 +218,13 @@ def main(argv: Sequence[str]) -> None:
 
 
 
+
+# setup_initial_state can optionally load the state from a checkpoint.
+
+
+
+
+
+
+
+# End-to-end steps from the LTX repo: pipeline_ltx_video.py
@@ -402,7 +402,11 @@ def setup_initial_state(
           config.enable_single_replica_ckpt_restoring,
       )
       if state:
-        state = state[checkpoint_item]
+        ###!Edited
+        if checkpoint_item == " ":
+          state = state
+        else:
+          state = state[checkpoint_item]
     if not state:
       max_logging.log(f"Could not find the item in orbax, creating state...")
       init_train_state_partial = functools.partial(
 
@@ -1,40 +1,22 @@
+
+import argparse
+import json
+from typing import Any, Dict, Optional
 import os
 import jax
 import jax.numpy as jnp
-import json
+import jax.lib.xla_extension
+import flax
+from flax.training import train_state
+import torch
+import optax
+import orbax.checkpoint as ocp
+from safetensors.torch import load_file
 
+from maxdiffusion.models.ltx_video.transformers_pytorch.transformer_pt import Transformer3DModel_PT
 
-from models.transformers.transformer3d import Transformer3DModel
-
-# Load JSON config
+# Load the model hyperparameters from the JSON file that sits next to this script.
 base_dir = os.path.dirname(__file__)
 config_path = os.path.join(base_dir, "xora_v1.2-13B-balanced-128.json")
 with open(config_path, "r") as f:
     model_config = json.load(f)
-
-key = jax.random.PRNGKey(0)
-model = Transformer3DModel(**model_config, dtype=jnp.bfloat16, gradient_checkpointing="matmul_without_batch")
-
-batch_size, text_tokens, num_tokens, features = 4, 256, 2048, 128
-prompt_embeds = jax.random.normal(key, shape=(batch_size, text_tokens, features), dtype=jnp.bfloat16)
-fractional_coords = jax.random.normal(key, shape=(batch_size, 3, num_tokens), dtype=jnp.bfloat16)
-latents = jax.random.normal(key, shape=(batch_size, num_tokens, features), dtype=jnp.bfloat16)
-noise_cond = jax.random.normal(key, shape=(batch_size, 1), dtype=jnp.bfloat16)
-
-model_params = model.init(
-    hidden_states=latents,
-    indices_grid=fractional_coords,
-    encoder_hidden_states=prompt_embeds,
-    timestep=noise_cond,
-    rngs={"params": key}
-)
-
-output = model.apply(
-    model_params,
-    hidden_states=latents,
-    indices_grid=fractional_coords,
-    encoder_hidden_states=prompt_embeds,
-    timestep=noise_cond,
-)
-
-print("done!")
+# Build the model from the loaded JSON config. Transformer3DModel_PT comes from
+# transformers_pytorch.transformer_pt - presumably the PyTorch implementation; confirm.
+transformer = Transformer3DModel_PT.from_config(model_config)
@@ -1,8 +1,8 @@
 from functools import partial
+import functools
 import math
 from typing import Any, Dict, Optional, Tuple
 from enum import Enum, auto
-
 import jax
 import jax.nn as jnn
 import jax.numpy as jnp
@@ -604,7 +604,8 @@ def __call__(
             block_sizes = self.default_block_sizes(q, k, dtype)
 
         scale_factor = 1 / math.sqrt(q.shape[-1])
-
+        
+    
         def partial_flash_attention(q, k, v, q_segment_ids, kv_segment_ids):
             s = (
                 # flash attention expects segment ids to be float32
@@ -630,14 +631,27 @@ def partial_flash_attention(q, k, v, q_segment_ids, kv_segment_ids):
                 raise ValueError(f"Expected mask with 2 dims, got {q_segment_ids.ndim}.")
             # Based on: ("activation_kv_batch", "activation_kv_heads", "activation_length", "activation_kv_head_dim")
             # Computation of the spec based on the logical constraints can be found in logical_axes_to_spec.py.
+            # qkvo_sharding_spec = jax.sharding.PartitionSpec(
+            #     ("data", "fsdp", "fsdp_transpose", "expert"),
+            #     ("tensor", "tensor_transpose", "sequence", "tensor_sequence"),
+            #     None,
+            #     None,
+            # )
+            # qkvo_sharding_spec = jax.sharding.PartitionSpec(
+            #     ("data", "fsdp", "fsdp_transpose", "expert"),
+            #     ("tensor", "tensor_transpose", "sequence", "tensor_sequence"),
+            #     None,
+            #     None,
+            # )
             qkvo_sharding_spec = jax.sharding.PartitionSpec(
-                ("data", "fsdp", "fsdp_transpose", "expert"),
-                ("tensor", "tensor_transpose", "sequence", "tensor_sequence"),
+                None,
+                None,
                 None,
                 None,
             )
-            # Based on: ("activation_kv_batch", "activation_length")
-            qkv_segment_ids_spec = jax.sharding.PartitionSpec(("data", "fsdp", "fsdp_transpose", "expert"), "sequence")
+            #Based on: ("activation_kv_batch", "activation_length")
+            # qkv_segment_ids_spec = jax.sharding.PartitionSpec(("data", "fsdp", "fsdp_transpose", "expert"), "sequence")
+            qkv_segment_ids_spec = jax.sharding.PartitionSpec(None, None)
             wrapped_flash_attention = shard_map(
                 partial_flash_attention,
                 mesh=sharding_mesh,