
Commit f216197

gagika authored and Google-ML-Automation committed
Add support for L2 loss in feature distillation.
PiperOrigin-RevId: 898201648
1 parent: 3ac667a

5 files changed: 29 additions & 7 deletions


src/maxtext/configs/base.yml

Lines changed: 2 additions & 0 deletions
@@ -1194,6 +1194,8 @@ distill_temperature: 1.0
 # distill_beta is used for cosine similarity loss between intermediate activations of out_proj in teacher/student models.
 # 0.0 value disables this feature.
 distill_beta: 0.0
+# distill_feature_loss_type is the type of loss to use for feature distillation ("cosine" or "l2").
+distill_feature_loss_type: "cosine"
 distill_layer_indices: None
 
 ##### Elastic training parameters
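
For context, enabling the new loss from a config only takes the two distillation knobs above. A minimal illustrative override in the config file's own YAML format (the 1.0 weight is an arbitrary example value, not a recommendation):

# Feature distillation with L2 loss; distill_beta must be non-zero, or the feature loss is disabled.
distill_beta: 1.0
distill_feature_loss_type: "l2"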

src/maxtext/configs/types.py

Lines changed: 3 additions & 0 deletions
@@ -1155,6 +1155,9 @@ class Distillation(BaseModel):
   distill_alpha: float = Field(0.5, description="Weight for the distillation loss component.")
   distill_temperature: float = Field(1.0, description="Temperature for distillation softening.")
   distill_beta: float = Field(0.0, description="Weight for the feature loss component. Use 0.0 to disable")
+  distill_feature_loss_type: Literal["cosine", "l2"] = Field(
+      "cosine", description="The type of loss to use for feature distillation ('cosine' or 'l2')."
+  )
   distill_layer_indices: None | list = Field(None, description="Feature indices for feature loss.")
 
   # --- Distillation freezing filter --
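
Because the field is typed as Literal["cosine", "l2"], pydantic rejects any other value at config-validation time rather than at training time. A minimal sketch with a toy stand-in model (not the actual MaxText Distillation class, which has many more fields):

from typing import Literal
from pydantic import BaseModel, Field, ValidationError

class ToyDistillation(BaseModel):
  # Mirrors the field added in the diff above.
  distill_feature_loss_type: Literal["cosine", "l2"] = Field(
      "cosine", description="The type of loss to use for feature distillation ('cosine' or 'l2')."
  )

print(ToyDistillation().distill_feature_loss_type)  # "cosine" (default)
print(ToyDistillation(distill_feature_loss_type="l2").distill_feature_loss_type)  # "l2"
try:
  ToyDistillation(distill_feature_loss_type="huber")
except ValidationError:
  print("'huber' is rejected at validation time")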

src/maxtext/trainers/post_train/distillation/distillation_utils.py

Lines changed: 14 additions & 4 deletions
@@ -23,7 +23,7 @@
 from array_record.python import array_record_module
 
 import abc
-from typing import Any, Iterator, Optional, List, Callable
+from typing import Any, Iterator, Optional, List, Callable, Literal
 
 import flax
 from flax import nnx
@@ -262,6 +262,7 @@ def __init__(
       beta_feature: float = 0.0,
       layer_indices: Optional[List[int]] = None,
       feature_loss_fn: Callable[[jax.Array, jax.Array], jax.Array] | None = None,
+      feature_loss_type: Literal["cosine", "l2"] = "cosine",
       cosine_distance_axis: int | tuple[int, ...] = -1,
       vocab_size: int = 0,
   ):
@@ -275,6 +276,8 @@ def __init__(
       alpha: Weight to balance distillation loss and task loss (0.0 to 1.0).
       beta_feature: Weight to balance feature loss (0.0 to 1.0). 0.0 disables feature loss.
       layer_indices: Layer indices to apply feature loss.
+      feature_loss_type: The type of feature loss to use if `feature_loss_fn` is None.
+        Can be "cosine" (default) or "l2".
       feature_loss_fn: A function that takes two jax.Arrays (student_map,
         teacher_map) and returns a scalar loss. Defaults to Cosine Distance.
       cosine_distance_axis: The axis to use for cosine distance computation if
@@ -295,9 +298,16 @@ def __init__(
 
     self.feature_loss_fn = feature_loss_fn
     if feature_loss_fn is None:
-      self.feature_loss_fn = lambda student_features, teacher_features: jnp.mean(
-          optax.cosine_distance(student_features, teacher_features, axis=cosine_distance_axis)
-      )
+      if feature_loss_type == "cosine":
+        self.feature_loss_fn = lambda student_features, teacher_features: jnp.mean(
+            optax.cosine_distance(student_features, teacher_features, axis=cosine_distance_axis)
+        )
+      elif feature_loss_type == "l2":
+        self.feature_loss_fn = lambda student_features, teacher_features: jnp.mean(
+            optax.l2_loss(student_features, teacher_features)
+        )
+      else:
+        raise ValueError(f"Unsupported feature_loss_type: {feature_loss_type!r}")
 
   def compute_loss(
       self,
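
The practical difference between the two default loss functions: optax.cosine_distance compares only the direction of the feature vectors along the chosen axis (it is invariant to positive rescaling), while optax.l2_loss penalizes element-wise magnitude differences as 0.5 * (student - teacher)**2. A standalone sketch on toy activations, mirroring the optax calls in the hunk above (the shapes and names are illustrative, not MaxText's):

import jax
import jax.numpy as jnp
import optax

key_s, key_t = jax.random.split(jax.random.PRNGKey(0))
# Stand-ins for intermediate out_proj feature maps: [batch, seq, features].
student = jax.random.normal(key_s, (2, 4, 8))
teacher = jax.random.normal(key_t, (2, 4, 8))

# feature_loss_type="cosine": mean cosine distance over the feature axis.
cosine = jnp.mean(optax.cosine_distance(student, teacher, axis=-1))
# feature_loss_type="l2": mean of 0.5 * (student - teacher)**2 over all elements.
l2 = jnp.mean(optax.l2_loss(student, teacher))

# Rescaling the student features leaves the cosine loss unchanged but not the L2 loss.
print(jnp.allclose(jnp.mean(optax.cosine_distance(2.0 * student, teacher, axis=-1)), cosine))  # True
print(jnp.allclose(jnp.mean(optax.l2_loss(2.0 * student, teacher)), l2))  # False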

src/maxtext/trainers/post_train/distillation/train_distill.py

Lines changed: 1 addition & 0 deletions
@@ -507,6 +507,7 @@ def build_training_components(
       alpha=student_config.distill_alpha,
       beta_feature=student_config.distill_beta,
       layer_indices=student_config.distill_layer_indices,
+      feature_loss_type=student_config.distill_feature_loss_type,
       vocab_size=student_config.vocab_size,
   )

tests/post_training/unit/train_distill_test.py

Lines changed: 9 additions & 3 deletions
@@ -24,6 +24,7 @@
 import shutil
 import tempfile
 import unittest
+from typing import Literal
 from unittest import mock
 import jax
 import jax.numpy as jnp
@@ -372,12 +373,14 @@ def test_optimizer_factory(self):
     train_distill.get_distillation_optimizer(config, max_train_steps=100)
 
   def test_monitored_strategy(self):
-    self._test_monitored_strategy(False)
+    self._test_monitored_strategy(sft_mode=False, feature_loss_type="cosine")
+    self._test_monitored_strategy(sft_mode=False, feature_loss_type="l2")
 
   def test_monitored_strategy_sft(self):
-    self._test_monitored_strategy(True)
+    self._test_monitored_strategy(sft_mode=True, feature_loss_type="cosine")
+    self._test_monitored_strategy(sft_mode=True, feature_loss_type="l2")
 
-  def _test_monitored_strategy(self, sft_mode: bool):
+  def _test_monitored_strategy(self, *, sft_mode: bool, feature_loss_type: Literal["cosine", "l2"] = "cosine"):
     """Verifies the strategy calculates metrics and returns the correct tuple."""
     strategy = distillation_utils.CombinedDistillationStrategy(
         student_forward_fn=lambda m, **k: None,
@@ -386,6 +389,7 @@ def _test_monitored_strategy(self, sft_mode: bool):
         temperature=1.0,
         alpha=0.5,
         beta_feature=1.0,
+        feature_loss_type=feature_loss_type,
         layer_indices=None,
     )
 
@@ -1012,6 +1016,7 @@ def test_main_offline_mode_skips_teacher_loading(
     mock_student_cfg.distill_alpha = 0.5
     mock_student_cfg.distill_beta = 0.0
     mock_student_cfg.distill_layer_indices = None
+    mock_student_cfg.distill_feature_loss_type = "cosine"
     mock_student_cfg.use_sft = False
     mock_student_cfg.enable_dropout = False
 
@@ -1091,6 +1096,7 @@ def test_main_online_mode_loads_teacher(
     mock_student_cfg.distill_alpha = 0.5
     mock_student_cfg.distill_beta = 0.0
     mock_student_cfg.distill_layer_indices = None
+    mock_student_cfg.distill_feature_loss_type = "cosine"
     mock_student_cfg.use_sft = False
     mock_student_cfg.enable_dropout = False
