Skip to content

Commit 5023bd5

Browse files
authored
Merge pull request #149 from AI-Hypercomputer/ajkv/multi-host-v7-training
Added multi-host training for v7
2 parents fdb73b4 + e97f4fb commit 5023bd5

5 files changed

Lines changed: 59 additions & 50 deletions

File tree

Dockerfile

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,26 @@ WORKDIR /app
77
# This tells Python to look in /app for the 'recml' package
88
ENV PYTHONPATH="${PYTHONPATH}:/app"
99

10-
# Install system tools if needed (e.g., git)
11-
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
10+
# This prevents the "MessageFactory" crash when using Protobuf
11+
ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
1212

13-
# Install the latest jax-tpu-embedding wheel
14-
COPY jax_tpu_embedding-0.1.0.dev20260121-cp312-cp312-manylinux_2_31_x86_64.whl ./
15-
RUN pip install ./jax_tpu_embedding-0.1.0.dev20260121-cp312-cp312-manylinux_2_31_x86_64.whl
13+
# This prevents the "Unable to register cuFFT/cuBLAS" log spam and initialization errors
14+
ENV CUDA_VISIBLE_DEVICES=-1
1615

17-
# Copy requirements.txt to current directory
18-
COPY requirements.txt ./
16+
# Install system tools
17+
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
1918

20-
# Install dependencies
19+
# Install standard requirements
20+
COPY requirements.txt ./
2121
RUN pip install --upgrade pip
2222
RUN pip install -r ./requirements.txt
2323

2424
# Force install the specific protobuf version
25-
RUN pip install "protobuf>=6.31.1" --no-deps
25+
RUN pip install "protobuf>=6.31.1"
26+
27+
# Install the latest jax-tpu-embedding wheel
28+
COPY jax_tpu_embedding-0.1.0.dev20260121-cp312-cp312-manylinux_2_31_x86_64.whl ./
29+
RUN pip install ./jax_tpu_embedding-0.1.0.dev20260121-cp312-cp312-manylinux_2_31_x86_64.whl
2630

2731
# Copy the current directory contents into the container
2832
COPY . /app

recml/core/training/jax_trainer.py

Lines changed: 41 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,7 @@ def __init__(
398398
continuous_eval_timeout: int = 30,
399399
rng_seed: int = core.DEFAULT_RNG_SEED,
400400
rng_impl: str | None = None,
401+
enable_checkpointing: bool = True,
401402
):
402403
"""Initializes the instance.
403404
@@ -436,6 +437,7 @@ def __init__(
436437
rng_impl: The implementation of the PRNG key. By default this is set to
437438
None which means that the default implementation (generally
438439
partitionable threefry) will be used.
440+
enable_checkpointing: Whether to enable checkpointing. Defaults to True.
439441
"""
440442

441443
if not isinstance(steps_per_loop, int) or steps_per_loop < 1:
@@ -453,6 +455,7 @@ def __init__(
453455
self._max_checkpoints_to_keep = max_checkpoints_to_keep
454456
self._rng_impl = rng_impl
455457
self._rng_seed = rng_seed
458+
self._enable_checkpointing = enable_checkpointing
456459

457460
@functools.cached_property
458461
def checkpoint_manager(self) -> ocp.CheckpointManager:
@@ -467,14 +470,19 @@ def checkpoint_manager(self) -> ocp.CheckpointManager:
467470
save_on_steps.append(self._train_steps - 1)
468471

469472
save_on_steps = set(save_on_steps)
470-
471-
return ocp.CheckpointManager(
472-
directory=os.path.join(self._model_dir, core.CHECKPOINT_DIR),
473-
options=ocp.CheckpointManagerOptions(
474-
should_save_fn=lambda step, _: step in save_on_steps,
475-
max_to_keep=self._max_checkpoints_to_keep,
476-
),
477-
)
473+
474+
if self._enable_checkpointing:
475+
476+
return ocp.CheckpointManager(
477+
directory=os.path.join(self._model_dir, core.CHECKPOINT_DIR),
478+
options=ocp.CheckpointManagerOptions(
479+
should_save_fn=lambda step, _: step in save_on_steps,
480+
max_to_keep=self._max_checkpoints_to_keep,
481+
),
482+
)
483+
else:
484+
485+
return None
478486

479487
@functools.cached_property
480488
def train_summary_writer(self) -> metrics_tools.AsyncMultiWriter:
@@ -510,6 +518,9 @@ def _maybe_save_checkpoint(
510518
metrics: Mapping[str, Any] | None = None,
511519
):
512520
"""Saves a checkpoint and returns a bool indicating whether it was saved."""
521+
if not self._enable_checkpointing:
522+
return
523+
513524
items = {core.STATE_CHECKPOINT_KEY: ocp.args.StandardSave(state)}
514525
with self.report_progress.timed("checkpointing"):
515526
self.checkpoint_manager.save(
@@ -564,7 +575,7 @@ def _train_n_steps(
564575
state, metrics_update = train_step(inputs, state)
565576
metrics_accum.accumulate(metrics_update, step)
566577
self.report_progress(step)
567-
if step != start_step + num_steps - 1:
578+
if (step != start_step + num_steps - 1) and self._enable_checkpointing:
568579
self._maybe_save_checkpoint(step, state)
569580

570581
metrics = metrics_accum.compute_and_log_scalars(start_step + num_steps - 1)
@@ -651,6 +662,7 @@ def _eval_step(
651662

652663
if (
653664
check_for_checkpoints
665+
and self._enable_checkpointing
654666
and self.checkpoint_manager.latest_step() is not None
655667
):
656668
step_to_resume_from = self.checkpoint_manager.latest_step()
@@ -674,7 +686,7 @@ def _eval_step(
674686
def train(self, task: JaxTask) -> core.Logs:
675687
"""Trains the model."""
676688
train_iter, _, state, train_step, _, step = self.process_task(
677-
task, training=True, check_for_checkpoints=True
689+
task, training=True, check_for_checkpoints=False
678690
)
679691

680692
logging.info(
@@ -698,25 +710,27 @@ def train(self, task: JaxTask) -> core.Logs:
698710
f" {_format_output(train_metrics)}"
699711
)
700712
metrics[core.TRAIN_LOG_DIRNAME] = train_metrics
701-
702-
self._maybe_save_checkpoint(curr_step, state, metrics=metrics)
713+
if self._enable_checkpointing:
714+
self._maybe_save_checkpoint(curr_step, state, metrics=metrics)
703715
step = curr_step + 1
704716

705-
self.checkpoint_manager.wait_until_finished()
717+
if self._enable_checkpointing:
718+
self.checkpoint_manager.wait_until_finished()
706719

707720
if jax.process_index() == 0:
708721
self._write_marker_file()
709722
task.export_model(state, self._model_dir)
710723

711-
self.checkpoint_manager.close()
712-
del self.checkpoint_manager
724+
if self._enable_checkpointing:
725+
self.checkpoint_manager.close()
726+
del self.checkpoint_manager
713727

714728
return metrics
715729

716730
def evaluate(self, task: JaxTask) -> core.Logs:
717731
"""Evaluates the model."""
718732
_, eval_iters, state, _, eval_step, step = self.process_task(
719-
task, training=False, check_for_checkpoints=True
733+
task, training=False, check_for_checkpoints=False
720734
)
721735
eval_summary_writers = self._create_eval_summary_writers(eval_iters)
722736

@@ -749,7 +763,7 @@ def evaluate(self, task: JaxTask) -> core.Logs:
749763
def train_and_evaluate(self, task: JaxTask) -> core.Logs:
750764
"""Trains and evaluates the model."""
751765
train_iter, eval_iters, state, train_step, eval_step, step = (
752-
self.process_task(task, training=True, check_for_checkpoints=True)
766+
self.process_task(task, training=True, check_for_checkpoints=False)
753767
)
754768
eval_summary_writers = self._create_eval_summary_writers(eval_iters)
755769

@@ -794,18 +808,20 @@ def train_and_evaluate(self, task: JaxTask) -> core.Logs:
794808
f" {_format_output(eval_metrics)}"
795809
)
796810
metrics[_val_logdir(key)] = eval_metrics
797-
798-
self._maybe_save_checkpoint(curr_step, state, metrics=metrics)
811+
if self._enable_checkpointing:
812+
self._maybe_save_checkpoint(curr_step, state, metrics=metrics)
799813
step = curr_step + 1
800814

801-
self.checkpoint_manager.wait_until_finished()
815+
if self._enable_checkpointing:
816+
self.checkpoint_manager.wait_until_finished()
802817

803818
if jax.process_index() == 0:
804819
self._write_marker_file()
805820
task.export_model(state, self._model_dir)
806821

807-
self.checkpoint_manager.close()
808-
del self.checkpoint_manager
822+
if self._enable_checkpointing:
823+
self.checkpoint_manager.close()
824+
del self.checkpoint_manager
809825

810826
return metrics
811827

@@ -833,7 +849,8 @@ def timeout_fn() -> bool:
833849
timeout_fn=timeout_fn,
834850
):
835851
try:
836-
state = self._maybe_restore_checkpoint(state, step)
852+
if self._enable_checkpointing:
853+
state = self._maybe_restore_checkpoint(state, step)
837854
logging.info(f"eval | step: {step: 6d} | {steps_msg}")
838855
with self.report_progress.timed("eval"):
839856
for key, eval_iter in eval_iters.items():
@@ -930,3 +947,4 @@ def _format_output(output: Any, indent: int = 4, width: int = 80) -> str:
930947
return formatted
931948
lines = [" " * indent + line for line in lines]
932949
return "\n" + "\n".join(lines)
950+

recml/examples/dlrm_experiment.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,5 +398,6 @@ def experiment() -> fdl.Config[recml.Experiment]:
398398
train_steps=1_000,
399399
steps_per_eval=100,
400400
steps_per_loop=100,
401+
enable_checkpointing=False
401402
)
402403
return fdl.Config(recml.Experiment, task=task, trainer=trainer)

recml/examples/dlrm_experiment_test.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
# limitations under the License.
1414
"""Tests for the DLRM experiment."""
1515

16+
import jax
17+
from absl import logging
1618
import sys
1719
import os
1820
# Add the RecML folder to the system path
@@ -43,6 +45,7 @@ def test_dlrm_experiment(self):
4345
experiment.trainer.train_steps = 12
4446
experiment.trainer.steps_per_loop = 4
4547
experiment.trainer.steps_per_eval = 4
48+
experiment.trainer.enable_checkpointing = False
4649

4750
for cfg in selectors.select(experiment, dlrm_experiment.SparseFeature):
4851
cfg.vocab_size = 200
@@ -53,4 +56,4 @@ def test_dlrm_experiment(self):
5356

5457

5558
if __name__ == "__main__":
56-
absltest.main()
59+
absltest.main()

requirements.txt

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ etils==1.12.2
1818
fiddle==0.3.0
1919
filelock==3.18.0
2020
flatbuffers==25.2.10
21-
flax==0.12.2
2221
fsspec==2025.3.2
2322
gast==0.6.0
2423
google-pasta==0.2.0
@@ -33,8 +32,6 @@ immutabledict==4.2.1
3332
importlib-resources==6.5.2
3433
iniconfig==2.1.0
3534
isort==6.0.1
36-
jax==0.8.2
37-
jaxlib==0.8.2
3835
jaxtyping==0.3.1
3936
Jinja2==3.1.6
4037
kagglehub==0.3.11
@@ -59,19 +56,6 @@ nest-asyncio==1.6.0
5956
networkx==3.4.2
6057
nodeenv==1.9.1
6158
numpy==2.1.3
62-
nvidia-cublas-cu12==12.4.5.8
63-
nvidia-cuda-cupti-cu12==12.4.127
64-
nvidia-cuda-nvrtc-cu12==12.4.127
65-
nvidia-cuda-runtime-cu12==12.4.127
66-
nvidia-cudnn-cu12==9.1.0.70
67-
nvidia-cufft-cu12==11.2.1.3
68-
nvidia-curand-cu12==10.3.5.147
69-
nvidia-cusolver-cu12==11.6.1.9
70-
nvidia-cusparse-cu12==12.3.1.170
71-
nvidia-cusparselt-cu12==0.6.2
72-
nvidia-nccl-cu12==2.21.5
73-
nvidia-nvjitlink-cu12==12.4.127
74-
nvidia-nvtx-cu12==12.4.127
7559
opt-einsum==3.4.0
7660
optax==0.2.4
7761
optree==0.15.0
@@ -82,7 +66,6 @@ pluggy==1.5.0
8266
portpicker==1.6.0
8367
pre-commit==4.2.0
8468
promise==2.3
85-
# protobuf==6.33.4
8669
psutil==7.0.0
8770
pyarrow==19.0.1
8871
Pygments==2.19.1

0 commit comments

Comments (0)