
Commit 655c103

Merge pull request #3556 from CIeNET-International:charlesli/gcs_benchmarks_move

PiperOrigin-RevId: 893778063
Parents: 5905690 + ebd77bb

8 files changed: 20 additions & 46 deletions


RESTRUCTURE.md (0 additions & 3 deletions)

```diff
@@ -299,9 +299,6 @@ comments, or questions by creating a new
 ├── dev/
 │   ├── code_style.sh
 │   └── unit_test_and_lint.sh
-├── gcs_benchmarks/
-│   ├── standalone_checkpointer.py
-│   └── standalone_dataloader.py
 ├── orchestration/
 │   ├── gpu_multi_process_run.sh
 │   ├── multihost_job.py
```

docs/conf.py (2 additions & 0 deletions)

```diff
@@ -119,6 +119,8 @@
     os.path.join("run_maxtext", "run_maxtext_via_multihost_job.md"),
     os.path.join("run_maxtext", "run_maxtext_via_multihost_runner.md"),
     os.path.join("reference", "core_concepts", "llm_calculator.ipynb"),
+    os.path.join("reference", "api.rst"),
+    os.path.join("reference", "api_generated", "MaxText*.rst"),
     os.path.join("reference", "api_generated", "modules.rst"),
     os.path.join("reference", "api_generated", "dependencies.github_deps.rst"),
     os.path.join("reference", "api_generated", "dependencies.github_deps.install_pre_train_deps.rst"),
```
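The two new entries extend Sphinx's `exclude_patterns`, which are matched as glob patterns relative to the docs source directory, so `MaxText*.rst` covers every generated module page with that prefix. A small stdlib sketch of that matching behavior (using `fnmatch`, which implements the same `*` wildcard semantics; the candidate file names are hypothetical):

```python
import fnmatch
import os

# The two patterns this commit adds to exclude_patterns in docs/conf.py.
patterns = [
    os.path.join("reference", "api.rst"),
    os.path.join("reference", "api_generated", "MaxText*.rst"),
]

# Hypothetical files Sphinx might encounter while collecting sources.
candidates = [
    os.path.join("reference", "api.rst"),
    os.path.join("reference", "api_generated", "MaxText.layers.rst"),
    os.path.join("reference", "api_generated", "modules.rst"),
]

# A file is excluded if any pattern matches it.
excluded = [f for f in candidates if any(fnmatch.fnmatch(f, p) for p in patterns)]
print(excluded)  # the first two candidates match; modules.rst needs its own entry
```

This is why `modules.rst` keeps a separate entry below the new ones: it does not start with `MaxText`.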

docs/guides/data_input_pipeline/data_input_grain.md (3 additions & 3 deletions)

````diff
@@ -34,10 +34,10 @@ Grain ensures determinism in data input pipelines by saving the pipeline's state
 
 1. Grain currently supports three data formats: [ArrayRecord](https://github.com/google/array_record) (random access), [Parquet](https://arrow.apache.org/docs/python/parquet.html) (partial random access through row groups) and [TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) (sequential access). Only the ArrayRecord format supports the global shuffle mentioned above. For converting a dataset into ArrayRecord, see [Apache Beam Integration for ArrayRecord](https://github.com/google/array_record/tree/main/beam). Additionally, other random-access data sources can be supported via a custom [data source](https://google-grain.readthedocs.io/en/latest/data_sources/protocol.html) class.
    - **Community Resource**: The MaxText community has created an [ArrayRecord Documentation](https://array-record.readthedocs.io/). Note: we appreciate the contribution from the community, but as of now it has not been verified by the MaxText or ArrayRecord developers.
-2. If the dataset is hosted on a Cloud Storage bucket, the `gs://` path can be provided directly. However, for the best performance, it is recommended to read the bucket through [Cloud Storage FUSE](https://cloud.google.com/storage/docs/gcs-fuse). This significantly improves performance for the ArrayRecord format, as metadata caching speeds up random access. The installation of Cloud Storage FUSE is included in [setup.sh](https://github.com/google/maxtext/blob/main/src/dependencies/scripts/setup.sh). The user then needs to mount the Cloud Storage bucket to a local path on each worker, using the script [setup_gcsfuse.sh](https://github.com/AI-Hypercomputer/maxtext/blob/4e44e065cc6379e76f9f1ac4785f81c05cafb58f/src/dependencies/scripts/setup_gcsfuse.sh). The script configures some parameters for the mount.
+2. If the dataset is hosted on a Cloud Storage bucket, the `gs://` path can be provided directly. However, for the best performance, it is recommended to read the bucket through [Cloud Storage FUSE](https://cloud.google.com/storage/docs/gcs-fuse). This significantly improves performance for the ArrayRecord format, as metadata caching speeds up random access. The installation of Cloud Storage FUSE is included in [setup.sh](https://github.com/google/maxtext/blob/main/src/dependencies/scripts/setup.sh). The user then needs to mount the Cloud Storage bucket to a local path on each worker, using the script [setup_gcsfuse.sh](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/dependencies/scripts/setup_gcsfuse.sh). The script configures some parameters for the mount.
 
 ```sh
-bash tools/setup/setup_gcsfuse.sh \
+bash src/dependencies/scripts/setup_gcsfuse.sh \
 DATASET_GCS_BUCKET=${BUCKET_NAME?} \
 MOUNT_PATH=${MOUNT_PATH?} \
 [FILE_PATH=${MOUNT_PATH?}/my_dataset]
@@ -47,7 +47,7 @@ Note that `FILE_PATH` is optional; when provided, the script runs `ls -R` for pr
 
 1. Set `dataset_type=grain`, `grain_file_type={arrayrecord|parquet|tfrecord}`, and `grain_train_files` in `src/maxtext/configs/base.yml` or through command-line arguments to match the file pattern on the mounted local path.
 
-2. Tune `grain_worker_count` for performance. This parameter controls the number of child processes used by Grain (more details in [behind_the_scenes](https://google-grain.readthedocs.io/en/latest/behind_the_scenes.html)). If you use a large number of workers, check your gcsfuse config in [setup_gcsfuse.sh](https://github.com/AI-Hypercomputer/maxtext/blob/4e44e065cc6379e76f9f1ac4785f81c05cafb58f/src/dependencies/scripts/setup_gcsfuse.sh) to avoid gcsfuse throttling.
+2. Tune `grain_worker_count` for performance. This parameter controls the number of child processes used by Grain (more details in [behind_the_scenes](https://google-grain.readthedocs.io/en/latest/behind_the_scenes.html)). If you use a large number of workers, check your gcsfuse config in [setup_gcsfuse.sh](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/dependencies/scripts/setup_gcsfuse.sh) to avoid gcsfuse throttling.
 
 3. ArrayRecord only: for multi-source blending, you can specify multiple data sources with their respective weights, using a semicolon (;) to separate sources and a comma (,) to separate weights. The weights are automatically normalized to sum to 1.0. For example:
 
````
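The multi-source blending note in the hunk above says raw weights are normalized to sum to 1.0. A minimal sketch of that normalization step (plain Python, not MaxText's actual parser; the raw weight values are hypothetical):

```python
def normalize_weights(weights):
  """Scale raw blending weights so they sum to 1.0."""
  total = sum(weights)
  return [w / total for w in weights]

# Hypothetical two-source blend with raw weights 3 and 1:
raw = [3.0, 1.0]
print(normalize_weights(raw))  # -> [0.75, 0.25]
```

Because of this normalization, only the ratio between the weights matters, not their absolute scale.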
docs/tutorials/pretraining.md (1 addition & 1 deletion)

````diff
@@ -87,7 +87,7 @@ eval metrics after step: 9, loss=9.420, total_weights=75264.0
 
 Grain is a library for reading data for training and evaluating JAX models. It is the recommended input pipeline for determinism and resilience! It supports data formats like ArrayRecord and Parquet. You can check [Grain pipeline](../guides/data_input_pipeline/data_input_grain.md) for more details.
 
-**Data preparation**: You need to download data to a Cloud Storage bucket and read it via Cloud Storage FUSE with [setup_gcsfuse.sh](https://github.com/AI-Hypercomputer/maxtext/blob/4e44e065cc6379e76f9f1ac4785f81c05cafb58f/src/dependencies/scripts/setup_gcsfuse.sh).
+**Data preparation**: You need to download data to a Cloud Storage bucket and read it via Cloud Storage FUSE with [setup_gcsfuse.sh](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/dependencies/scripts/setup_gcsfuse.sh).
 
 - For example, we can mount the bucket `gs://maxtext-dataset` on the local path `/tmp/gcsfuse` before training:
 ```bash
````

tools/gcs_benchmarks/standalone_checkpointer.py renamed to src/maxtext/utils/standalone_checkpointer.py (9 additions & 23 deletions)

```diff
@@ -1,4 +1,4 @@
-# Copyright 2023–2025 Google LLC
+# Copyright 2023–2026 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -53,13 +53,9 @@ def checkpoint_loop(config, state=None):
   """
   model = from_config(config)
   mesh = model.mesh
-  init_rng, checkpoint_manager, _, tx = train_utils.create_training_tools(
-      config, model, mesh
-  )
+  init_rng, checkpoint_manager, _, tx = train_utils.create_training_tools(config, model, mesh)
 
-  unboxed_abstract_state, _, _ = maxtext_utils.get_abstract_state(
-      model, tx, config, init_rng, mesh, is_training=True
-  )
+  unboxed_abstract_state, _, _ = maxtext_utils.get_abstract_state(model, tx, config, init_rng, mesh, is_training=True)
   # A barrier to sync all hosts before starting to restore checkpoint
   jax.experimental.multihost_utils.sync_global_devices("Barrier before load")
   checkpoint_load_start = datetime.datetime.now()
@@ -82,30 +78,24 @@ def checkpoint_loop(config, state=None):
   if state is not None:  # Checkpoint was available for restore
     if jax.process_index() == 0:
       max_logging.log(
-          "STANDALONE CHECKPOINTER : Checkpoint restored in :"
-          f" {checkpoint_load_end - checkpoint_load_start}"
+          "STANDALONE CHECKPOINTER : Checkpoint restored in :" f" {checkpoint_load_end - checkpoint_load_start}"
       )
   else:  # Checkpoint was unavailable, state needs to be initialized
-    state, _, _, _ = maxtext_utils.setup_training_state(
-        model, None, tx, config, init_rng, mesh, checkpoint_manager
-    )
+    state, _, _, _ = maxtext_utils.setup_training_state(model, None, tx, config, init_rng, mesh, checkpoint_manager)
     state = add_entropy_to_checkpoint(state)
 
   start_step = get_first_step(state)  # this is the start_step for training
   for step in np.arange(start_step, config.steps):
     if checkpoint_manager is not None:
       start_time = datetime.datetime.now()
       # A barrier to sync all hosts before starting to save checkpoint
-      jax.experimental.multihost_utils.sync_global_devices(
-          "Barrier before save"
-      )
+      jax.experimental.multihost_utils.sync_global_devices("Barrier before save")
       if checkpointing.save_checkpoint(checkpoint_manager, int(step), state):
         checkpoint_manager.wait_until_finished()
       end_time = datetime.datetime.now()
       if jax.process_index() == 0:
         max_logging.log(
-            "STANDALONE CHECKPOINTER : Checkpoint saved in"
-            f" {end_time - start_time} ,step {step}, on host 0"
+            "STANDALONE CHECKPOINTER : Checkpoint saved in" f" {end_time - start_time} ,step {step}, on host 0"
         )
 
   return state
@@ -123,12 +113,8 @@ def add_entropy_to_checkpoint(state):
     state: Returns state with entropy added to the optimizer state.
   """
   opt_0 = state.opt_state[0]
-  opt_0 = opt_0._replace(
-      mu=jax.tree_util.tree_map(lambda k: jnp.cos(1000 * k), state.params)
-  )
-  opt_0 = opt_0._replace(
-      nu=jax.tree_util.tree_map(lambda k: jnp.sin(1000 * k), state.params)
-  )
+  opt_0 = opt_0._replace(mu=jax.tree_util.tree_map(lambda k: jnp.cos(1000 * k), state.params))
+  opt_0 = opt_0._replace(nu=jax.tree_util.tree_map(lambda k: jnp.sin(1000 * k), state.params))
   new_opt = [opt_0] + list(state.opt_state[1:])
   state = state.replace(opt_state=new_opt)
   return state
```
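The `add_entropy_to_checkpoint` helper in the diff above fills the optimizer's `mu`/`nu` moments with deterministic cos/sin transforms of the parameters, so saved checkpoints carry non-trivial but reproducible data. A stdlib-only sketch of the same pattern (the `tree_map` here is a toy stand-in for `jax.tree_util.tree_map`, not the JAX API; the parameter tree is hypothetical):

```python
import math

def tree_map(fn, tree):
  """Toy tree_map over nested dicts/lists of floats."""
  if isinstance(tree, dict):
    return {k: tree_map(fn, v) for k, v in tree.items()}
  if isinstance(tree, list):
    return [tree_map(fn, v) for v in tree]
  return fn(tree)

# Derive "entropy" for the optimizer moments from the params themselves,
# mirroring the cos/sin transforms in add_entropy_to_checkpoint.
params = {"dense": {"kernel": [0.1, 0.2], "bias": [0.0]}}
mu = tree_map(lambda k: math.cos(1000 * k), params)
nu = tree_map(lambda k: math.sin(1000 * k), params)
```

Because the transforms are pure functions of the parameters, every host computes identical moments, which keeps the benchmark's checkpoint contents deterministic across restarts.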

tools/gcs_benchmarks/standalone_dataloader.py renamed to src/maxtext/utils/standalone_dataloader.py (1 addition & 1 deletion)

```diff
@@ -1,4 +1,4 @@
-# Copyright 2023–2025 Google LLC
+# Copyright 2023–2026 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
```

tests/integration/standalone_dl_ckpt_test.py (4 additions & 2 deletions)

```diff
@@ -15,8 +15,8 @@
 """ Tests for the standalone_checkpointer.py """
 import unittest
 import pytest
-from tools.gcs_benchmarks.standalone_checkpointer import main as sckpt_main
-from tools.gcs_benchmarks.standalone_dataloader import main as sdl_main
+from maxtext.utils.standalone_checkpointer import main as sckpt_main
+from maxtext.utils.standalone_dataloader import main as sdl_main
 from maxtext.utils.globals import MAXTEXT_ASSETS_ROOT
 from maxtext.common.gcloud_stub import is_decoupled
 
@@ -50,6 +50,7 @@ def _get_random_test_name(self, test_name):
 
   @pytest.mark.integration_test
   @pytest.mark.tpu_only
+  @pytest.mark.scheduled_only
   def test_standalone_dataloader(self):
     random_run_name = self._get_random_test_name("standalone_dataloader")
     sdl_main(
@@ -68,6 +69,7 @@ def test_standalone_dataloader(self):
 
   @pytest.mark.integration_test
   @pytest.mark.tpu_only
+  @pytest.mark.scheduled_only
   def test_standalone_checkpointer(self):
     random_run_name = self._get_random_test_name("standalone_checkpointer")
     # checkpoint at 50
```
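The new `@pytest.mark.scheduled_only` marker is a custom one, so it would typically be registered in the project's pytest configuration to avoid unknown-mark warnings. A hypothetical `conftest.py` sketch (marker names come from the test above; the descriptions and registration location are assumptions, not part of this commit):

```python
# Hypothetical conftest.py fragment registering the custom markers used in
# standalone_dl_ckpt_test.py so pytest does not warn about unknown marks.
CUSTOM_MARKERS = {
    "integration_test": "long-running integration test",
    "tpu_only": "requires TPU hardware",
    "scheduled_only": "runs only in scheduled CI, skipped on presubmit",
}

def pytest_configure(config):
  # addinivalue_line("markers", ...) is pytest's documented registration hook.
  for name, description in CUSTOM_MARKERS.items():
    config.addinivalue_line("markers", f"{name}: {description}")
```

A scheduled CI job could then select or skip these tests with `pytest -m scheduled_only` or `pytest -m "not scheduled_only"`.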

tools/gcs_benchmarks/__init__.py (0 additions & 13 deletions)

This file was deleted.
