
Commit 661c4ac

Merge branch 'main' into sanbao/bugs
2 parents 55b309c + 955bd86

15 files changed: 722 additions & 160 deletions

.github/workflows/UploadDockerImages.yml

Lines changed: 3 additions & 0 deletions

@@ -32,6 +32,9 @@ jobs:
       - name: build maxdiffusion jax ai image
         run: |
           bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxdiffusion_jax_stable_stack MODE=jax_ai_image PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxdiffusion_jax_stable_stack BASEIMAGE=us-docker.pkg.dev/cloud-tpu-images/jax-ai-image/tpu:latest
+      - name: build maxdiffusion w/ nightly jax ai image
+        run: |
+          bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxdiffusion_jax_stable_stack_nightly MODE=jax_ai_image PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxdiffusion_jax_stable_stack BASEIMAGE=us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/tpu/jax_nightly:latest
       - name: build maxdiffusion jax nightly image
         run: |
           bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxdiffusion_jax_nightly MODE=nightly PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxdiffusion_jax_nightly

README.md

Lines changed: 6 additions & 4 deletions

@@ -17,6 +17,7 @@
 [![Unit Tests](https://github.com/google/maxtext/actions/workflows/UnitTests.yml/badge.svg)](https://github.com/google/maxdiffusion/actions/workflows/UnitTests.yml)
 
 # What's new?
+- **`2025/8/14`**: LTX-Video img2vid generation is now supported.
 - **`2025/7/29`**: LTX-Video text2vid generation is now supported.
 - **`2025/04/17`**: Flux Finetuning.
 - **`2025/02/12`**: Flux LoRA for inference.
@@ -42,7 +43,7 @@ MaxDiffusion supports
 * Load Multiple LoRA (SDXL inference).
 * ControlNet inference (Stable Diffusion 1.4 & SDXL).
 * Dreambooth training support for Stable Diffusion 1.x,2.x.
-* LTX-Video text2vid (inference).
+* LTX-Video text2vid, img2vid (inference).
 
 
 # Table of Contents
@@ -177,13 +178,14 @@ To generate images, run the following command:
 ## LTX-Video
 - In the folder src/maxdiffusion/models/ltx_video/utils, run:
 ```bash
-python convert_torch_weights_to_jax.py --ckpt_path [LOCAL DIRECTORY FOR WEIGHTS] --transformer_config_path ../xora_v1.2-13B-balanced-128.json
+python convert_torch_weights_to_jax.py --ckpt_path [LOCAL DIRECTORY FOR WEIGHTS] --transformer_config_path ../ltxv-13B.json
 ```
 - In the repo folder, run:
 ```bash
-python src/maxdiffusion/generate_ltx_video.py src/maxdiffusion/configs/ltx_video.yml output_dir="[SAME DIRECTORY]" config_path="src/maxdiffusion/models/ltx_video/xora_v1.2-13B-balanced-128.json"
+python src/maxdiffusion/generate_ltx_video.py src/maxdiffusion/configs/ltx_video.yml output_dir="[SAME DIRECTORY]" config_path="src/maxdiffusion/models/ltx_video/ltxv-13B.json"
 ```
-- Other generation parameters can be set in ltx_video.yml file.
+- Img2video generation:
+  Add the conditioning image path as conditioning_media_paths in the form ["IMAGE_PATH"], along with the other generation parameters, in the ltx_video.yml file, then follow the same instructions as above.
 ## Flux
 
 First make sure you have permissions to access the Flux repos in Huggingface.

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 6 additions & 0 deletions

@@ -40,6 +40,9 @@ weights_dtype: 'bfloat16'
 # This sets the layer's dtype in the model. Ex: nn.Dense(dtype=activations_dtype)
 activations_dtype: 'bfloat16'
 
+# Replicates the VAE across devices instead of using the model's sharding annotations.
+replicate_vae: False
+
 # matmul and conv precision from https://jax.readthedocs.io/en/latest/jax.lax.html#jax.lax.Precision
 # Options are "DEFAULT", "HIGH", "HIGHEST"
 # fp32 activations and fp32 weights with HIGHEST will provide the best precision
@@ -291,3 +294,6 @@ use_qwix_quantization: False # Whether to use qwix for quantization. If set to T
 # Quantization calibration method used for weights and activations. Supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
 quantization_calibration_method: "absmax"
 
+# Run evaluation every eval_every steps. -1 means don't evaluate.
+eval_every: -1
+eval_data_dir: ""
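The `eval_every` cadence added above (evaluate on a fixed step interval, with `-1` disabling evaluation) reduces to a simple step check. A minimal sketch of that semantics; the `should_eval` helper is hypothetical, not part of the config or the training code:

```python
def should_eval(step: int, eval_every: int) -> bool:
  # -1 (or any non-positive value) disables evaluation, matching the
  # "-1 means don't eval" comment in base_wan_14b.yml above.
  if eval_every <= 0:
    return False
  return step > 0 and step % eval_every == 0


print([s for s in range(1, 10) if should_eval(s, 3)])   # [3, 6, 9]
print([s for s in range(1, 10) if should_eval(s, -1)])  # []
```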

src/maxdiffusion/configs/ltx_video.yml

Lines changed: 3 additions & 1 deletion

@@ -22,7 +22,7 @@ sampler: "from_checkpoint"
 
 # Generation parameters
 pipeline_type: multi-scale
-prompt: "A man in a dimly lit room talks on a vintage telephone, hangs up, and looks down with a sad expression. He holds the black rotary phone to his right ear with his right hand, his left hand holding a rocks glass with amber liquid. He wears a brown suit jacket over a white shirt, and a gold ring on his left ring finger. His short hair is neatly combed, and he has light skin with visible wrinkles around his eyes. The camera remains stationary, focused on his face and upper body. The room is dark, lit only by a warm light source off-screen to the left, casting shadows on the wall behind him. The scene appears to be from a movie. "
+prompt: "A man in a dimly lit room talks on a vintage telephone, hangs up, and looks down with a sad expression. He holds the black rotary phone to his right ear with his right hand, his left hand holding a rocks glass with amber liquid. He wears a brown suit jacket over a white shirt, and a gold ring on his left ring finger. His short hair is neatly combed, and he has light skin with visible wrinkles around his eyes. The camera remains stationary, focused on his face and upper body. The room is dark, lit only by a warm light source off-screen to the left, casting shadows on the wall behind him. The scene appears to be from a movie."
 #negative_prompt: "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
 height: 512
 width: 512
@@ -35,6 +35,8 @@ stg_mode: "attention_values"
 decode_timestep: 0.05
 decode_noise_scale: 0.025
 seed: 10
+conditioning_media_paths: None #["IMAGE_PATH"]
+conditioning_start_frames: [0]
 
 
 first_pass:
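The `conditioning_media_paths: None #["IMAGE_PATH"]` default above only activates img2vid when it is an actual list of paths; `generate_ltx_video.py` in this commit checks this with an `isinstance(..., List)` test. A standalone sketch of that decision (the `resolve_conditioning_paths` name is hypothetical):

```python
from typing import List, Optional


def resolve_conditioning_paths(value) -> Optional[List[str]]:
  # Mirrors the commit's check in run():
  #   config.conditioning_media_paths if isinstance(..., List) else None
  # Anything that is not a real list (None, or the string "None" from YAML)
  # leaves img2vid conditioning disabled.
  return value if isinstance(value, list) else None


print(resolve_conditioning_paths(None))            # None
print(resolve_conditioning_paths("None"))          # None
print(resolve_conditioning_paths(["IMAGE_PATH"]))  # ['IMAGE_PATH']
```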

src/maxdiffusion/generate_ltx_video.py

Lines changed: 112 additions & 2 deletions

@@ -16,15 +16,19 @@
 
 import numpy as np
 from absl import app
-from typing import Sequence
+from typing import Sequence, List, Optional, Union
 from maxdiffusion.pipelines.ltx_video.ltx_video_pipeline import LTXVideoPipeline
-from maxdiffusion.pipelines.ltx_video.ltx_video_pipeline import LTXMultiScalePipeline
+from maxdiffusion.pipelines.ltx_video.ltx_video_pipeline import LTXMultiScalePipeline, ConditioningItem
+import maxdiffusion.pipelines.ltx_video.crf_compressor as crf_compressor
 from maxdiffusion import pyconfig, max_logging
+import torchvision.transforms.functional as TVF
 import imageio
 from datetime import datetime
 import os
 import time
 from pathlib import Path
+from PIL import Image
+import torch
 
 
 def calculate_padding(
@@ -44,6 +48,79 @@ def calculate_padding(
   return padding
 
 
+def load_image_to_tensor_with_resize_and_crop(
+    image_input: Union[str, Image.Image],
+    target_height: int = 512,
+    target_width: int = 768,
+    just_crop: bool = False,
+) -> torch.Tensor:
+  """Load and process an image into a tensor.
+
+  Args:
+      image_input: Either a file path (str) or a PIL Image object
+      target_height: Desired height of output tensor
+      target_width: Desired width of output tensor
+      just_crop: If True, only crop the image to the target size without resizing
+  """
+  if isinstance(image_input, str):
+    image = Image.open(image_input).convert("RGB")
+  elif isinstance(image_input, Image.Image):
+    image = image_input
+  else:
+    raise ValueError("image_input must be either a file path or a PIL Image object")
+
+  input_width, input_height = image.size
+  aspect_ratio_target = target_width / target_height
+  aspect_ratio_frame = input_width / input_height
+  if aspect_ratio_frame > aspect_ratio_target:
+    new_width = int(input_height * aspect_ratio_target)
+    new_height = input_height
+    x_start = (input_width - new_width) // 2
+    y_start = 0
+  else:
+    new_width = input_width
+    new_height = int(input_width / aspect_ratio_target)
+    x_start = 0
+    y_start = (input_height - new_height) // 2
+
+  image = image.crop((x_start, y_start, x_start + new_width, y_start + new_height))
+  if not just_crop:
+    image = image.resize((target_width, target_height))
+
+  frame_tensor = TVF.to_tensor(image)  # PIL -> tensor (C, H, W), [0,1]
+  frame_tensor = TVF.gaussian_blur(frame_tensor, kernel_size=3, sigma=1.0)
+  frame_tensor_hwc = frame_tensor.permute(1, 2, 0)  # (C, H, W) -> (H, W, C)
+  frame_tensor_hwc = crf_compressor.compress(frame_tensor_hwc)
+  frame_tensor = frame_tensor_hwc.permute(2, 0, 1) * 255.0  # (H, W, C) -> (C, H, W)
+  frame_tensor = (frame_tensor / 127.5) - 1.0
+  # Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
+  return frame_tensor.unsqueeze(0).unsqueeze(2)
+
+
+def prepare_conditioning(
+    conditioning_media_paths: List[str],
+    conditioning_strengths: List[float],
+    conditioning_start_frames: List[int],
+    height: int,
+    width: int,
+    padding: tuple[int, int, int, int],
+) -> Optional[List[ConditioningItem]]:
+  """Prepare conditioning items based on input media paths and their parameters."""
+  conditioning_items = []
+  for path, strength, start_frame in zip(conditioning_media_paths, conditioning_strengths, conditioning_start_frames):
+    num_input_frames = 1
+    media_tensor = load_media_file(
+        media_path=path,
+        height=height,
+        width=width,
+        max_frames=num_input_frames,
+        padding=padding,
+        just_crop=True,
+    )
+    conditioning_items.append(ConditioningItem(media_tensor, start_frame, strength))
+  return conditioning_items
+
+
 def convert_prompt_to_filename(text: str, max_len: int = 20) -> str:
   # Remove non-letters and convert to lowercase
   clean_text = "".join(char.lower() for char in text if char.isalpha() or char.isspace())
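The aspect-ratio branch in `load_image_to_tensor_with_resize_and_crop` above is plain center-crop arithmetic, so it can be checked in isolation. A sketch of the same math with no image libraries; `center_crop_box` is a hypothetical helper, not part of the commit:

```python
def center_crop_box(input_width, input_height, target_width, target_height):
  # Same arithmetic as load_image_to_tensor_with_resize_and_crop: trim the
  # dimension whose aspect ratio exceeds the target, centering the crop box.
  aspect_target = target_width / target_height
  aspect_frame = input_width / input_height
  if aspect_frame > aspect_target:
    # Frame is too wide: keep full height, trim width symmetrically.
    new_width = int(input_height * aspect_target)
    new_height = input_height
    x_start = (input_width - new_width) // 2
    y_start = 0
  else:
    # Frame is too tall: keep full width, trim height symmetrically.
    new_width = input_width
    new_height = int(input_width / aspect_target)
    x_start = 0
    y_start = (input_height - new_height) // 2
  return (x_start, y_start, x_start + new_width, y_start + new_height)


# A 1920x1080 source cropped for the default 768x512 (3:2) target trims width.
print(center_crop_box(1920, 1080, 768, 512))  # (150, 0, 1770, 1080)
```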
@@ -68,6 +145,19 @@ def convert_prompt_to_filename(text: str, max_len: int = 20) -> str:
   return "-".join(result)
 
 
+def load_media_file(
+    media_path: str,
+    height: int,
+    width: int,
+    max_frames: int,
+    padding: tuple[int, int, int, int],
+    just_crop: bool = False,
+) -> torch.Tensor:
+  media_tensor = load_image_to_tensor_with_resize_and_crop(media_path, height, width, just_crop=just_crop)
+  media_tensor = torch.nn.functional.pad(media_tensor, padding)
+  return media_tensor
+
+
 def get_unique_filename(
     base: str,
     ext: str,
@@ -97,6 +187,25 @@ def run(config):
   pipeline = LTXVideoPipeline.from_pretrained(config, enhance_prompt=enhance_prompt)
   if config.pipeline_type == "multi-scale":
     pipeline = LTXMultiScalePipeline(pipeline)
+  conditioning_media_paths = config.conditioning_media_paths if isinstance(config.conditioning_media_paths, List) else None
+  conditioning_start_frames = config.conditioning_start_frames
+  conditioning_strengths = None
+  if conditioning_media_paths:
+    if not conditioning_strengths:
+      conditioning_strengths = [1.0] * len(conditioning_media_paths)
+  conditioning_items = (
+      prepare_conditioning(
+          conditioning_media_paths=conditioning_media_paths,
+          conditioning_strengths=conditioning_strengths,
+          conditioning_start_frames=conditioning_start_frames,
+          height=config.height,
+          width=config.width,
+          padding=padding,
+      )
+      if conditioning_media_paths
+      else None
+  )
+
   s0 = time.perf_counter()
   images = pipeline(
       height=height_padded,
@@ -106,6 +215,7 @@ def run(config):
       output_type="pt",
       config=config,
       enhance_prompt=enhance_prompt,
+      conditioning_items=conditioning_items,
       seed=config.seed,
   )
   max_logging.log(f"Compile time: {time.perf_counter() - s0:.1f}s.")
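Before building `ConditioningItem`s, `run()` above defaults `conditioning_strengths` to 1.0 per conditioning path. That defaulting step in isolation (a sketch; `default_strengths` is a hypothetical helper, and tuples stand in for the real `ConditioningItem`):

```python
def default_strengths(paths, strengths=None):
  # Mirrors run(): if paths are given but no strengths, use 1.0 per path.
  if paths and not strengths:
    strengths = [1.0] * len(paths)
  return strengths


paths = ["IMAGE_PATH"]
strengths = default_strengths(paths)
start_frames = [0]
# prepare_conditioning zips these into ConditioningItem(media, start, strength).
items = list(zip(paths, start_frames, strengths))
print(items)  # [('IMAGE_PATH', 0, 1.0)]
```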

src/maxdiffusion/input_pipeline/_tfds_data_processing.py

Lines changed: 63 additions & 64 deletions

@@ -19,7 +19,7 @@
 import tensorflow.experimental.numpy as tnp
 from datasets import load_dataset, load_from_disk
 import jax
-from maxdiffusion import multihost_dataloading
+from maxdiffusion import multihost_dataloading, max_logging
 
 AUTOTUNE = tf.data.AUTOTUNE
 
@@ -78,92 +78,91 @@ def make_tf_iterator(
   train_iter = multihost_dataloading.MultiHostDataLoadIterator(train_ds, mesh)
   return train_iter
 
-
-def make_cached_tfrecord_iterator(
-    config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description, prepare_sample_fn
-):
-  """
-  New iterator for TFRecords that contain the full 4 pre-computed latents and embeddings:
-  latents, input_ids, prompt_embeds, and text_embeds.
-  """
-
-  def _parse_tfrecord_fn(example):
-    return tf.io.parse_single_example(example, feature_description)
-
-  # This pipeline reads the sharded files and applies the parsing and preparation.
-  filenames = tf.io.gfile.glob(os.path.join(config.train_data_dir, "*"))
-
-  train_ds = (
-      tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
-      .shard(num_shards=dataloading_host_count, index=dataloading_host_index)
-      .map(_parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
-      .map(prepare_sample_fn, num_parallel_calls=AUTOTUNE)
-      .shuffle(global_batch_size * 10)
-      .batch(global_batch_size // dataloading_host_count, drop_remainder=True)
-      .repeat(-1)
-      .prefetch(AUTOTUNE)
-  )
-
-  # This wraps the tf.data.Dataset for use in the multi-host JAX environment.
-  train_iter = multihost_dataloading.MultiHostDataLoadIterator(train_ds, mesh)
-  return train_iter
-
-
 # TODO - https://github.com/google/array_record/blob/main/beam/examples/example_gcs_conversion.py
-def make_tfrecord_iterator(
-    config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description, prepare_sample_fn
+def _make_tfrecord_iterator(
+    config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description_fn, prepare_sample_fn, dataset_path, is_training: bool
 ):
-  """Iterator for TFRecord format. For Laion dataset,
-  check out preparation script
-  maxdiffusion/pedagogical_examples/to_tfrecords.py
-  """
   # set load_tfrecord_cached to True in config to use pre-processed tfrecord dataset.
   # pedagogical_examples/dataset_tf_cache_to_tfrecord.py to convert tf preprocessed dataset to tfrecord.
   # Dataset cache in github runner test doesn't contain all the features since its shared, Use the default tfrecord iterator.
+  # If is_training is True, load the training dataset; otherwise load the evaluation dataset.
 
   # checks that the dataset path is valid. In case of gcs, the existance of the dir is not checked.
   is_dataset_dir_valid = "gs://" in config.dataset_save_location or os.path.isdir(config.dataset_save_location)
 
-  if (
-      config.cache_latents_text_encoder_outputs
-      and is_dataset_dir_valid
-      and "load_tfrecord_cached" in config.get_keys()
-      and config.load_tfrecord_cached
-  ):
-    return make_cached_tfrecord_iterator(
-        config,
-        dataloading_host_index,
-        dataloading_host_count,
-        mesh,
-        global_batch_size,
-        feature_description,
-        prepare_sample_fn,
-    )
+  # Determine whether to use the "cached" dataset, which requires externally
+  # provided parsing functions, or the default one with its internal parsing logic.
+  make_cached_tfrecord_iterator = (
+      config.cache_latents_text_encoder_outputs
+      and is_dataset_dir_valid
+      and "load_tfrecord_cached" in config.get_keys()
+      and config.load_tfrecord_cached
+  )
 
   feature_description = {
       "moments": tf.io.FixedLenFeature([], tf.string),
       "clip_embeddings": tf.io.FixedLenFeature([], tf.string),
   }
 
+  used_feature_description = feature_description_fn if make_cached_tfrecord_iterator else feature_description
+
   def _parse_tfrecord_fn(example):
-    return tf.io.parse_single_example(example, feature_description)
+    return tf.io.parse_single_example(example, used_feature_description)
 
   def prepare_sample(features):
     moments = tf.io.parse_tensor(tnp.asarray(features["moments"]), out_type=tf.float32)
     clip_embeddings = tf.io.parse_tensor(tnp.asarray(features["clip_embeddings"]), out_type=tf.float32)
     return {"pixel_values": moments, "input_ids": clip_embeddings}
 
-  filenames = tf.io.gfile.glob(os.path.join(config.train_data_dir, "*"))
-  train_ds = (
-      tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
-      .shard(num_shards=dataloading_host_count, index=dataloading_host_index)
-      .map(_parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
-      .map(prepare_sample, num_parallel_calls=AUTOTUNE)
-      .shuffle(global_batch_size * 10)
+  filenames = tf.io.gfile.glob(os.path.join(dataset_path, "*"))
+  ds = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
+
+  # --- PADDING LOGIC FOR EVALUATION ---
+  if not is_training:
+    num_eval_samples = 0
+    for _ in ds:
+      num_eval_samples += 1
+
+    remainder = num_eval_samples % global_batch_size
+    if remainder != 0:
+      num_to_pad = global_batch_size - remainder
+      # Create a dataset of padding samples from the beginning
+      padding_ds = ds.take(num_to_pad)
+      # Add the padding samples to the end
+      ds = ds.concatenate(padding_ds)
+      max_logging.log(f"Padded evaluation dataset with {num_to_pad} samples.")
+
+  used_prepare_sample = prepare_sample_fn if make_cached_tfrecord_iterator else prepare_sample
+  ds = (
+      ds.shard(num_shards=dataloading_host_count, index=dataloading_host_index)
+      .map(_parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
+      .map(used_prepare_sample, num_parallel_calls=AUTOTUNE)
+  )
+  if is_training:
+    ds = (
+      ds.shuffle(global_batch_size * 10)
       .batch(global_batch_size // dataloading_host_count, drop_remainder=True)
       .repeat(-1)
       .prefetch(AUTOTUNE)
-  )
+    )
+  # For Evaluation
+  else:
+    ds = (
+      ds.batch(global_batch_size // dataloading_host_count, drop_remainder=False)
+      .prefetch(AUTOTUNE)
+    )
 
-  train_iter = multihost_dataloading.MultiHostDataLoadIterator(train_ds, mesh)
-  return train_iter
+  iter = multihost_dataloading.MultiHostDataLoadIterator(ds, mesh)
+  return iter
+
+
+def make_tfrecord_iterator(
+    config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description, prepare_sample_fn, is_training
+):
+  """Iterator for TFRecord format. For Laion dataset,
+  check out preparation script
+  maxdiffusion/pedagogical_examples/to_tfrecords.py
+  """
+  # Currently only the TFRecord format supports evaluation. is_training selects the
+  # dataset path so existing call sites are unaffected.
+  # TODO: refactor to support evaluation on all dataset formats.
+  dataset_path = config.train_data_dir if is_training else config.eval_data_dir
+  return _make_tfrecord_iterator(config, dataloading_host_index, dataloading_host_count, mesh, global_batch_size, feature_description, prepare_sample_fn, dataset_path, is_training)
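The evaluation-padding branch above tops the dataset up to a multiple of the global batch size by repeating samples taken from the beginning. The same logic on a plain Python list, independent of tf.data (the `pad_to_batch_multiple` helper is a sketch, not code from the commit):

```python
def pad_to_batch_multiple(samples, global_batch_size):
  # Mirrors the eval branch: if the sample count is not a multiple of the
  # global batch size, append copies taken from the start of the dataset,
  # analogous to ds.concatenate(ds.take(num_to_pad)).
  remainder = len(samples) % global_batch_size
  if remainder != 0:
    num_to_pad = global_batch_size - remainder
    samples = samples + samples[:num_to_pad]
  return samples


padded = pad_to_batch_multiple(list(range(10)), 4)
print(len(padded))   # 12
print(padded[-2:])   # [0, 1]
```

With `drop_remainder=False` this padding keeps every original eval sample while still letting the final batch fill up; the repeated samples are the cost of a uniform batch shape.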
