
Commit dd8df22

Merge pull request #3550 from AI-Hypercomputer:aireen/fix_mm_sft
PiperOrigin-RevId: 893252738
2 parents: d370f95 + d104c42

5 files changed: 13 additions & 14 deletions

File tree

src/maxtext/input_pipeline/hf_data_processing.py
src/maxtext/input_pipeline/input_pipeline_utils.py
src/maxtext/layers/moe.py
src/maxtext/multimodal/processor_gemma3.py
src/maxtext/utils/maxtext_utils.py

src/maxtext/input_pipeline/hf_data_processing.py

Lines changed: 2 additions & 2 deletions
@@ -146,15 +146,15 @@ def vision_sft_preprocessing_pipeline(
           query_column=text_columns[0],
           response_column=text_columns[1],
           max_target_length=config.max_target_length,
-          unk_id=pad_id,
+          pad_id=pad_id,
       )
   )
   # TODO(aireenmei, hengtaoguo): support packing
   operations.append(
       input_pipeline_utils.PadOrTrimToMaxLength(
           config.max_target_length,
           pad_id,
-          model_name=config.model_name,
+          config=config,
           max_num_images_per_example=config.max_num_images_per_example,
       )
   )
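The two renamed keyword arguments track the updated signatures in input_pipeline_utils.py (next file). A minimal sketch of the resulting pipeline construction after this commit, assuming config, text_columns, pad_id, and operations are in scope as in the surrounding function:

  operations.append(
      input_pipeline_utils.SFTPromptMaskingVision(
          query_column=text_columns[0],
          response_column=text_columns[1],
          max_target_length=config.max_target_length,
          pad_id=pad_id,  # was unk_id before this commit
      )
  )
  operations.append(
      input_pipeline_utils.PadOrTrimToMaxLength(
          config.max_target_length,
          pad_id,
          config=config,  # the full config replaces the bare model_name string
          max_num_images_per_example=config.max_num_images_per_example,
      )
  )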

src/maxtext/input_pipeline/input_pipeline_utils.py

Lines changed: 7 additions & 7 deletions
@@ -313,15 +313,15 @@ def map(self, element):
 class SFTPromptMaskingVision(grain.MapTransform):
   """SFT prompt masking for multimodal"""
 
-  def __init__(self, query_column, response_column, max_target_length, unk_id):
+  def __init__(self, query_column, response_column, max_target_length, pad_id):
     self.query_column = query_column
     self.response_column = response_column
     self.max_target_length = max_target_length
-    self.unk_id = unk_id
+    self.pad_id = pad_id
 
   def map(self, element):
     inputs = np.concatenate((element[self.query_column], element[self.response_column]))
-    targets = np.concatenate((np.asarray([self.unk_id] * len(element[self.query_column])), element[self.response_column]))
+    targets = np.concatenate((np.asarray([self.pad_id] * len(element[self.query_column])), element[self.response_column]))
     return {
         "inputs": np.asarray(inputs[: self.max_target_length], dtype=np.int32),
         "targets": np.asarray(targets[: self.max_target_length], dtype=np.int32),
@@ -559,13 +559,13 @@ def __init__(
       self,
       max_length: int,
       pad_id: int = 0,
-      model_name: str | None = None,
+      config=None,
       add_true_length: bool = False,
       max_num_images_per_example: int = -1,
   ):
     self.max_length = max_length
     self.pad_id = pad_id
-    self.model_name = model_name
+    self.config = config
     self.add_true_length = add_true_length
     self.max_num_images_per_example = max_num_images_per_example
 
@@ -614,7 +614,7 @@ def _pad_image_and_mask(self, preprocessed_image: mm_utils.PreprocessorOutput) -
       raise ValueError("Input preprocessed_image must have pixel_values to pad images.")
 
     # Determine the maximum number of images/masks allowed.
-    image_offsets = mm_processor.get_image_offsets(self.model_name, preprocessed_image)
+    image_offsets = mm_processor.get_image_offsets(self.config, preprocessed_image)
     single_image_offset = image_offsets // preprocessed_image.pixel_values.shape[0]
 
     # Reserve space for at least one text token.
@@ -680,7 +680,7 @@ def map(
 
     for key, _ in element.items():
       if key == "images":
-        if self.model_name is None:
+        if self.config.model_name is None:
           raise ValueError("model_name must be provided when padding images")
 
         element["images"] = self._pad_image_and_mask(element["images"])

src/maxtext/layers/moe.py

Lines changed: 2 additions & 2 deletions
@@ -598,8 +598,8 @@ def apply_ffn_activation(self, layer_w0, layer_w1):
     """Applies FFN activation function."""
     with jax.named_scope("ffn_act"):
       if self.config.decoder_block == ctypes.DecoderBlockType.GPT_OSS:
-        layer_w0 = jnp.clip(layer_w0, a_min=None, a_max=self.config.mlp_activations_limit)
-        layer_w1 = jnp.clip(layer_w1, a_min=-self.config.mlp_activations_limit, a_max=self.config.mlp_activations_limit)
+        layer_w0 = jnp.clip(layer_w0, min=None, max=self.config.mlp_activations_limit)
+        layer_w1 = jnp.clip(layer_w1, min=-self.config.mlp_activations_limit, max=self.config.mlp_activations_limit)
       layer_act = self.activation_fn(layer_w0 * 1.702)
       glu = jnp.multiply(layer_w0, layer_act)
       intermediate_layer = jnp.multiply(glu, (layer_w1 + 1))
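This rename tracks jnp.clip's current keyword names: newer JAX releases dropped the a_min/a_max aliases in favor of min/max. A standalone sketch of the two clipping patterns, with 7.0 standing in for config.mlp_activations_limit:

  import jax.numpy as jnp

  x = jnp.array([-10.0, 0.5, 10.0])
  w0 = jnp.clip(x, min=None, max=7.0)  # upper bound only, as for layer_w0
  w1 = jnp.clip(x, min=-7.0, max=7.0)  # symmetric bound, as for layer_w1
  print(w0)  # [-10.    0.5   7. ]
  print(w1)  # [-7.   0.5  7. ]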

src/maxtext/multimodal/processor_gemma3.py

Lines changed: 1 addition & 2 deletions
@@ -77,10 +77,9 @@ def preprocess_mm_data_gemma3(images):
     images_out.append(img)
 
   processor_output = Gemma3PreprocessorOutput(
-      num_images=len(images),
+      num_images=len(images_in),
       pixel_values=np.stack(images_out, axis=0).astype(np.float32),  # (N, H, W, C)
   )
-  processor_output.num_images = len(images)
   return processor_output
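The count now comes from the list of images the loop actually consumed, and the redundant post-construction reassignment is gone. A minimal sketch of the invariant being preserved, with dummy arrays standing in for decoded image data:

  import numpy as np

  images_in = [np.zeros((4, 4, 3)), np.zeros((4, 4, 3))]  # stand-ins for decoded images
  images_out = [img.astype(np.float32) for img in images_in]
  pixel_values = np.stack(images_out, axis=0)  # (N, H, W, C)
  # num_images must agree with the leading axis of pixel_values.
  assert len(images_in) == pixel_values.shape[0]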

src/maxtext/utils/maxtext_utils.py

Lines changed: 1 addition & 1 deletion
@@ -827,7 +827,7 @@ def calculate_tflops_training_per_device(config, log=True):
   if config.use_multimodal:
     # Add vision layers TFLOPs for multimodal models
     mm_total_tflops, mm_learnable_weight_tflops, mm_attention_tflops = calculate_vision_encoder_tflops(config)
-    if log:
+    if log and mm_total_tflops > 0:
       print(
           f"{config.model_name} vision layers per train step:\n",
           f"Total TFLOPs: {mm_total_tflops:.2f} \n",
