
Commit 9f151e7

Freezing all but changed and norm layer student weights
1 parent 7f479f4 · commit 9f151e7

2 files changed: 38 additions & 5 deletions

src/maxtext/configs/types.py

Lines changed: 6 additions & 0 deletions

@@ -1126,6 +1126,12 @@ class Distillation(BaseModel):
   distill_beta: float = Field(0.0, description="Weight for the feature loss component. Use 0.0 to disable")
   distill_layer_indices: None | list = Field(None, description="Feature indices for feature loss.")
 
+  # --- Distillation freezing filter ---
+  student_params_to_update: None | list = Field(
+      None,
+      description="A list of model param name templates to finetune in the student model. The other parameters will be frozen if this attribute is non-empty.",
+  )
+
 
 class TrainingLoop(BaseModel):
   """Configuration for the main training loop, evaluation, and reproducibility."""

src/maxtext/trainers/post_train/distillation/train_distill.py

Lines changed: 32 additions & 5 deletions

@@ -34,7 +34,7 @@
 """
 
 import inspect
-from typing import Sequence, Callable
+from typing import Sequence, Callable, Any
 from absl import app
 from flax import nnx
 from flax.linen import partitioning as nn_partitioning
@@ -199,7 +199,15 @@ class MaxTextDistillationTrainer(peft_trainer.PeftTrainer):
   (positions, segment_ids) are passed to the model.
   """
 
-  def __init__(self, model, strategy: distillation_utils.DistillationStrategy, optimizer, training_config, **kwargs):
+  def __init__(
+      self,
+      model,
+      strategy: distillation_utils.DistillationStrategy,
+      optimizer,
+      training_config,
+      student_freeze_param_filter: Callable[[Any], bool] | None = None,
+      **kwargs,
+  ):
     # We pass a dummy optimizer to the base PeftTrainer temporarily to prevent PeftTrainer from eagerly
     # allocating massive optimizer states for the entire ModelBundle (including the frozen teacher) before
     # redefining the trainer optimizer here.
@@ -211,8 +219,20 @@ def __init__(self, model, strategy: distillation_utils.DistillationStrategy, opt
     # override optimizer to only use student_model.
     if training_config.gradient_accumulation_steps is not None and training_config.gradient_accumulation_steps > 1:
       optimizer = optax.MultiSteps(optimizer, training_config.gradient_accumulation_steps)
-    wrt = nnx.LoRAParam if self._lora_enabled else nnx.Param
-    self.optimizer = nnx.Optimizer(model.student_model, optimizer, wrt=wrt)
+
+    base_wrt = nnx.LoRAParam if getattr(self, "_lora_enabled", False) else nnx.Param
+    if student_freeze_param_filter:
+
+      def wrt_filter(path, x):
+        if not isinstance(x, base_wrt):
+          return False
+        return not student_freeze_param_filter(path)
+
+      self.wrt_filter = wrt_filter
+    else:
+      self.wrt_filter = base_wrt
+
+    self.optimizer = nnx.Optimizer(model.student_model, optimizer, wrt=self.wrt_filter)
 
     # Detect if Tunix expects _train_step to return grad_norm by inspecting the source
     self._tunix_expects_grad_norm = False
@@ -282,7 +302,7 @@ def loss_wrapper(student, teacher, batch):
       # we only compute gradients for the student.
       grad_fn = nnx.value_and_grad(
           loss_wrapper,
-          argnums=0,
+          argnums=nnx.DiffState(0, self.wrt_filter),
          has_aux=True,
       )
 
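To make the filter semantics concrete, here is a small self-contained sketch, not taken from this repository, of how a (path, value) predicate restricts both the state an nnx.Optimizer allocates and the parameters that nnx.value_and_grad differentiates via nnx.DiffState. The Tiny module, the "norm" substring, and all values are hypothetical; it assumes a recent flax.nnx API in which nnx.Optimizer accepts a wrt filter, as this commit relies on.

import jax.numpy as jnp
import optax
from flax import nnx


class Tiny(nnx.Module):
  """Toy student stand-in: one projection kernel and one layer norm."""

  def __init__(self, rngs: nnx.Rngs):
    self.proj = nnx.Linear(4, 4, rngs=rngs)
    self.norm = nnx.LayerNorm(4, rngs=rngs)

  def __call__(self, x):
    return self.norm(self.proj(x))


def trainable(path, value) -> bool:
  # Trainable only if it is an nnx.Param AND its path mentions "norm";
  # the projection kernel therefore stays frozen.
  return isinstance(value, nnx.Param) and any("norm" in str(p) for p in path)


model = Tiny(nnx.Rngs(0))
x = jnp.ones((2, 4))

# Optimizer state is only allocated for params selected by the filter.
opt = nnx.Optimizer(model, optax.adam(1e-3), wrt=trainable)


def loss_fn(m, batch):
  return jnp.mean(m(batch) ** 2)


# DiffState(0, trainable): argument 0 (the model) is differentiated, but only
# the subset of its state matching the filter receives gradients.
grad_fn = nnx.value_and_grad(loss_fn, argnums=nnx.DiffState(0, trainable))
loss, grads = grad_fn(model, x)

The commit composes its filter the other way around: it takes a freeze predicate over the param path and inverts it inside wrt_filter, but the effect on the optimizer and on DiffState is the same kind of restriction.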
@@ -564,6 +584,12 @@ def train_distill(
   _log_config_details(student_config, "Student")
   student_model = get_maxtext_model(student_config, mesh)
 
+  student_params_to_update = getattr(student_config, "student_params_to_update", [])
+
+  def student_freeze_param_fn(path) -> bool:
+    path_str = "/".join(str(p) for p in path)
+    return not any(template in path_str for template in student_params_to_update)
+
   if is_offline:
     max_logging.log("Offline Distillation: Skipping Teacher Model loading.")
     teacher_model = None
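
As a quick, purely illustrative check of the substring matching above (the parameter paths are hypothetical, merely shaped like nnx param paths; they are not taken from MaxText):

student_params_to_update = ["norm"]


def student_freeze_param_fn(path) -> bool:
  path_str = "/".join(str(p) for p in path)
  return not any(template in path_str for template in student_params_to_update)


# Frozen: no path component contains "norm".
print(student_freeze_param_fn(("decoder", "layers", 0, "self_attention", "query", "kernel")))  # True
# Trainable: the layer-norm scale matches the "norm" template.
print(student_freeze_param_fn(("decoder", "layers", 0, "pre_self_attention_norm", "scale")))  # False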
@@ -582,6 +608,7 @@
       strategy=strategy,
       optimizer=optimizer,
       training_config=train_config,
+      student_freeze_param_filter=student_freeze_param_fn if student_params_to_update else None,
   )
   trainer.is_managed_externally = True
   trainer._has_aux = True  # pylint: disable=protected-access