Commit 0a5e4e2

Improve environment variable setup in MaxText docs.

1 parent 46dbaf5 commit 0a5e4e2

13 files changed: 322 additions & 155 deletions

docs/conf.py

Lines changed: 2 additions & 0 deletions

@@ -162,6 +162,8 @@
     r"https://cla\.developers\.google\.com/.*",
     # Ignore GitHub commit history links which frequently trigger rate limiting (429)
     r"https://github\.com/jax-ml/jax/commits/.*",
+    # Ignore Hugging Face settings links which require login
+    r"https://huggingface\.co/settings/tokens",
 ]

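These patterns feed Sphinx's `linkcheck_ignore` option, a list of regular expressions matched against each outgoing URL; a URL that matches any pattern is skipped by the `linkcheck` builder. A minimal sketch of how the new entry behaves, using plain `re` outside Sphinx:

```python
import re

# The linkcheck_ignore patterns from docs/conf.py after this commit.
linkcheck_ignore = [
    r"https://cla\.developers\.google\.com/.*",
    r"https://github\.com/jax-ml/jax/commits/.*",
    r"https://huggingface\.co/settings/tokens",
]

def is_ignored(url: str) -> bool:
    # linkcheck skips a URL when any pattern matches at its start.
    return any(re.match(pattern, url) for pattern in linkcheck_ignore)

print(is_ignored("https://huggingface.co/settings/tokens"))  # True
print(is_ignored("https://huggingface.co/docs"))             # False
```

Note the Hugging Face pattern is unanchored at the end, so it also matches URLs with query strings or fragments appended.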
docs/guides/monitoring_and_debugging/megascale_hang_playbook.md

Lines changed: 1 addition & 1 deletion

@@ -108,4 +108,4 @@ After [creating an HLO Dump](https://openxla.org/xla/hlo_dumps), you can share i
 gcloud storage cp -r /tmp/xla_dump gs://<bucket_location>
 ```
 
-When sharing the HLO dump, you will need to give Google permission to access the GCS bucket. A Google user can then download the HLO graph using `gsutil`.
+When sharing the HLO dump, you will need to give Google permission to access the GCS bucket. A Google user can then download the HLO graph using `gcloud storage`.

docs/run_maxtext/run_maxtext_single_host_gpu.md

Lines changed: 12 additions & 5 deletions

@@ -135,15 +135,22 @@ https://github.com/AI-Hypercomputer/maxtext/tree/main/src/maxtext/configs/gpu/a3
 echo "Running 1vm.sh"
 
 # Example command to invoke this script via XPK
-# python3 xpk/xpk.py workload create --cluster ${CLUSTER_NAME?} \
-# --workload ${WORKLOAD_NAME?} --docker-image=gcr.io/supercomputer-testing/${LOCAL_IMAGE_NAME?} \
+# python3 xpk/xpk.py workload create --cluster ${GKE_CLUSTER?} \
+# --workload ${RUN_NAME?} --docker-image=gcr.io/supercomputer-testing/${LOCAL_IMAGE_NAME?} \
 # --device-type ${DEVICE_TYPE?} --num-slices 1 \
 # --command "bash src/maxtext/configs/gpu/a3/llama_2_7b/1vm.sh"
 
 # Stop execution if any command exits with error
 set -e
 
-export OUTPUT_PATH="provide an output path"
+# Use a GCS bucket you own to store logs and checkpoints. Ideally in the same
+# region as your GPUs to minimize latency and costs.
+# You can list your buckets and their locations in the
+# [Cloud Console](https://console.cloud.google.com/storage/browser).
+export BASE_OUTPUT_DIRECTORY=<gcs bucket path> # e.g., gs://my-bucket/maxtext-runs
+
+# An arbitrary string to identify this specific run.
+# Note: Kubernetes requires workload names to be valid DNS labels (lowercase, no underscores or periods).
 export RUN_NAME="llama-2-1vm-$(date +%Y-%m-%d-%H-%M)"
 
 # Set environment variables
@@ -152,7 +159,7 @@ for ARGUMENT in "$@"; do
 export "$KEY"="$VALUE"
 done
 
-export XLA_FLAGS="--xla_dump_to=${OUTPUT_PATH?}/${RUN_NAME?}/HLO_dumps/
+export XLA_FLAGS="--xla_dump_to=${BASE_OUTPUT_DIRECTORY?}/${RUN_NAME?}/HLO_dumps/
 --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
 --xla_gpu_enable_command_buffer='' --xla_gpu_enable_highest_priority_async_stream=true
 --xla_gpu_all_reduce_combine_threshold_bytes=134217728 --xla_gpu_all_gather_combine_threshold_bytes=134217728
@@ -165,6 +172,6 @@ export XLA_FLAGS="--xla_dump_to=${OUTPUT_PATH?}/${RUN_NAME?}/HLO_dumps/
 
 # 1 node, DATA_DP=1, ICI_FSDP=8
 python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/gpu/models/llama2_7b.yml run_name=${RUN_NAME?} dcn_data_parallelism=1 \
-ici_fsdp_parallelism=8 base_output_directory=${OUTPUT_PATH?} attention=cudnn_flash_te scan_layers=False \
+ici_fsdp_parallelism=8 base_output_directory=${BASE_OUTPUT_DIRECTORY?} attention=cudnn_flash_te scan_layers=False \
 use_iota_embed=True hardware=gpu
 ```
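The DNS-label note this commit adds to `RUN_NAME` can be checked programmatically. A small sketch (the regex follows the RFC 1123 label format that Kubernetes enforces for workload names; the helper name is ours, not part of MaxText or XPK):

```python
import re
from datetime import datetime

# RFC 1123 DNS label as enforced by Kubernetes: 1-63 characters,
# lowercase alphanumerics and '-', starting and ending with an alphanumeric.
DNS_LABEL = re.compile(r"^[a-z0-9]([a-z0-9-]{0,61}[a-z0-9])?$")

def is_valid_workload_name(name: str) -> bool:
    return bool(DNS_LABEL.match(name))

# Mirrors the script's pattern: llama-2-1vm-$(date +%Y-%m-%d-%H-%M)
run_name = "llama-2-1vm-" + datetime.now().strftime("%Y-%m-%d-%H-%M")
print(is_valid_workload_name(run_name))     # True
print(is_valid_workload_name("My_Run.v2"))  # False: uppercase, '_', '.'
```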

docs/run_maxtext/run_maxtext_via_pathways.md

Lines changed: 42 additions & 21 deletions

@@ -43,20 +43,41 @@ The following commands use placeholder variables. Before running them, set these
 
 ```bash
 # -- Google Cloud Configuration --
-export PROJECT="your-gcp-project-id"
-export ZONE="your-gcp-zone"
-export CLUSTER="your-gke-cluster-name"
+# Your GCP project ID. Find it on the [Cloud Console Dashboard](https://console.cloud.google.com/home/dashboard).
+export PROJECT_ID=<GCP project ID>
+
+# The GCP location (listed as "Location" in the UI) and name of your
+# TPU-enabled GKE cluster. Both can be found on the
+# [Cloud Console](https://console.cloud.google.com/kubernetes/list).
+export ZONE=<GCP location> # e.g., 'us-central1'
+export GKE_CLUSTER=<cluster name>
 
 # -- Workload Configuration --
-export WORKLOAD_NAME="maxtext-job-$(date +%Y%m%d-%H%M%S)"
+# An arbitrary string to identify this specific run.
+# Note: Kubernetes requires workload names to be valid DNS labels (lowercase, no underscores or periods).
+export RUN_NAME="maxtext-run-$(date +%Y%m%d-%H%M%S)"
+
+# For a full list of MaxText-supported TPU types, see: `src/maxtext/utils/accelerator_to_spec_map.py`. To see the TPU type
+# of your cluster:
+
+# 1. Connect to the cluster (required for kubectl commands later):
+# gcloud container clusters get-credentials ${GKE_CLUSTER?} --location ${ZONE?} --project ${PROJECT_ID?}
+
+# 2. Find your TPU type (e.g., 'v5p-128') by checking the accelerator labels on your nodes:
+# kubectl get nodes -l cloud.google.com/gke-tpu-accelerator -o jsonpath='{.items[*].metadata.labels.cloud\.google\.com/gke-tpu-accelerator}' | tr ' ' '\n' | sort -u
 export TPU_TYPE="v5p-8" # Or your desired TPU type, e.g., v5e-4
-export WORKLOAD_NODEPOOL_COUNT=1 # Number of TPU slices for your job
+export NUM_SLICES=1 # Number of TPU slices for your job
 
 # -- MaxText & Storage Configuration --
-export BUCKET_NAME="your-gcs-bucket-name"
-export RUN_NAME="maxtext-run-1"
+# Use a GCS bucket you own to store logs and checkpoints. Ideally in the same
+# region as your TPUs to minimize latency and costs.
+# You can list your buckets and their locations in the
+# [Cloud Console](https://console.cloud.google.com/storage/browser).
+export BASE_OUTPUT_DIRECTORY=<gcs bucket path> # e.g., gs://my-bucket/maxtext-runs
+
 # The Docker image you pushed in the prerequisite step
-export DOCKER_IMAGE="gcr.io/${PROJECT?}/${CLOUD_IMAGE_NAME}"
+export CLOUD_IMAGE_NAME=<image name>
+export DOCKER_IMAGE="gcr.io/${PROJECT_ID?}/${CLOUD_IMAGE_NAME?}"
 ```
 
 ## 3. Running a batch workload
@@ -69,15 +90,15 @@ Use the `xpk workload create-pathways` command to start the job.
 
 ```bash
 xpk workload create-pathways \
-  --workload=${WORKLOAD_NAME?} \
-  --cluster=${CLUSTER?} \
-  --num-slices=${WORKLOAD_NODEPOOL_COUNT?} \
+  --workload=${RUN_NAME?} \
+  --cluster=${GKE_CLUSTER?} \
+  --num-slices=${NUM_SLICES?} \
   --tpu-type=${TPU_TYPE?} \
-  --project=${PROJECT?} \
+  --project=${PROJECT_ID?} \
   --zone=${ZONE?} \
   --docker-image=${DOCKER_IMAGE?} \
   --command="python3 -m maxtext.trainers.pre_train.train \
-    base_output_directory=gs://${BUCKET_NAME?} \
+    base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
     per_device_batch_size=1 \
     enable_checkpointing=false \
     dataset_type=synthetic \
@@ -90,7 +111,7 @@ xpk workload create-pathways \
 You can check the status of your running workloads with the `xpk workload list` command.
 
 ```bash
-xpk workload list --cluster=${CLUSTER?} --project=${PROJECT?} --zone=${ZONE?}
+xpk workload list --cluster=${GKE_CLUSTER?} --project=${PROJECT_ID?} --zone=${ZONE?}
 ```
 
 ## 4. Running a headless (interactive) workload
@@ -104,12 +125,12 @@ This command reserves the TPUs and starts the Pathways head service on the clust
 ```bash
 xpk workload create-pathways \
   --headless \
-  --workload=${WORKLOAD_NAME?} \
-  --num-slices=${WORKLOAD_NODEPOOL_COUNT?} \
+  --workload=${RUN_NAME?} \
+  --num-slices=${NUM_SLICES?} \
   --tpu-type=${TPU_TYPE?} \
-  --project=${PROJECT?} \
+  --project=${PROJECT_ID?} \
   --zone=${ZONE?} \
-  --cluster=${CLUSTER?}
+  --cluster=${GKE_CLUSTER?}
 ```
 
 ### Step 2: Connect to the cluster via port forwarding
@@ -120,7 +141,7 @@ This command forwards local port 29000 to the controller pod in the cluster. It
 
 ```bash
 kubectl port-forward \
-  "$(kubectl get pods -o name | grep ${WORKLOAD_NAME?}-pathways-head)" \
+  "$(kubectl get pods -o name | grep ${RUN_NAME?}-pathways-head)" \
   29000:29000 &> /dev/null &
 ```
 
@@ -135,7 +156,7 @@ export JAX_BACKEND_TARGET=grpc://127.0.0.1:29000
 
 # Run the training script
 python3 -m maxtext.trainers.pre_train.train \
-  base_output_directory=gs://${BUCKET_NAME?} \
+  base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
   per_device_batch_size=1 \
   enable_checkpointing=false \
   dataset_type=synthetic \
@@ -153,7 +174,7 @@ The output streams directly to your terminal, just as if you were running on a l
 - Ensure you have successfully pushed the image to your project's Artifact Registry.
 - Check that your GKE cluster has permissions to pull from the registry.
 - **`kubectl port-forward` fails**:
-  - Confirm that the pod from Step 1 is running (`kubectl get pods`). The name should match `${WORKLOAD_NAME?}-pathways-head-0`.
+  - Confirm that the pod from Step 1 is running (`kubectl get pods`). The name should match `${RUN_NAME?}-pathways-head-0`.
   - Ensure you are authenticated with `kubectl` and have the correct context set for your GKE cluster.
 - Make sure you import `pathwaysutils` package and call `pathwaysutils.initialize()` in your script when running the workload.
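Several of the variables this commit renames (`BASE_OUTPUT_DIRECTORY` in particular) take a `gs://` bucket path. If you script around these docs, a quick sanity check on the value can catch typos before a job is submitted; a sketch, where the helper is ours and not a MaxText or XPK API:

```python
from urllib.parse import urlparse

def split_gcs_path(path: str):
    # Splits 'gs://my-bucket/maxtext-runs' into ('my-bucket', 'maxtext-runs'),
    # rejecting anything that is not a gs:// URL with a bucket component.
    parsed = urlparse(path)
    if parsed.scheme != "gs" or not parsed.netloc:
        raise ValueError(f"expected a gs:// bucket path, got {path!r}")
    return parsed.netloc, parsed.path.lstrip("/")

print(split_gcs_path("gs://my-bucket/maxtext-runs"))
# ('my-bucket', 'maxtext-runs')
```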

docs/run_maxtext/run_maxtext_via_xpk.md

Lines changed: 55 additions & 30 deletions

@@ -115,51 +115,76 @@ This guide focuses on submitting workloads to an existing cluster. Cluster creat
 
 1. **Set your configuration**
 
-   ```
-   export PROJECT_ID="your-gcp-project-id"
-   export ZONE="your-gcp-zone" # e.g., us-central1-a
-   export CLUSTER_NAME="your-existing-cluster-name"
-   export BASE_OUTPUT_DIR="gs://your-output-bucket/"
+   Set up the following environment variables to configure your training run. Replace
+   placeholders with your actual values.
+
+   ```bash
+   # -- Google Cloud Configuration --
+   # Your GCP project ID. Find it on the [Cloud Console Dashboard](https://console.cloud.google.com/home/dashboard).
+   # If you've already set it in your local config, you can retrieve it via:
+   # gcloud config get-value project
+   export PROJECT_ID=<GCP project ID>
+
+   # The GCP location (listed as "Location" in the UI) and name of your
+   # TPU-enabled (or GPU-enabled) GKE cluster. Both can be found on the
+   # [Cloud Console](https://console.cloud.google.com/kubernetes/list).
+   export ZONE=<GCP location> # e.g., 'us-central1' or 'us-central1-a'
+   export GKE_CLUSTER=<cluster name>
+
+   # -- Workload Configuration --
+   # An arbitrary string to identify this specific run.
+   # Note: Kubernetes requires workload names to be valid DNS labels (lowercase, no underscores or periods).
+   export RUN_NAME="maxtext-run-$(date +%Y%m%d-%H%M%S)"
+
+   # Number of TPU slices (for TPU clusters) or number of nodes (for GPU clusters)
+   export NUM_SLICES=1
+
+   # -- MaxText & Storage Configuration --
+   # Use a GCS bucket you own to store logs and checkpoints. Ideally in the same
+   # region as your TPUs to minimize latency and costs.
+   # You can list your buckets and their locations in the
+   # [Cloud Console](https://console.cloud.google.com/storage/browser).
+   export BASE_OUTPUT_DIRECTORY=<gcs bucket path> # e.g., gs://my-bucket/maxtext-runs
    export DATASET_PATH="gs://your-dataset-bucket/"
   ```
 
 2. **Configure gcloud CLI**
 
-   ```
+   ```bash
   gcloud config set project ${PROJECT_ID?}
   gcloud config set compute/zone ${ZONE?}
   ```
 
 ### A Note on multi-slice and multi-node runs
 
-The examples below run on a single TPU slice (`--num-slices=1`) or a small number of GPU nodes (`--num-nodes=2`). To scale your job to a larger, multi-host configuration, you simply increase these values.
+The examples below run on a single TPU slice (`--num-slices=1`) or a small number of GPU nodes (`--num-nodes=2`). To scale your job to a larger, multi-host configuration, you simply increase the `NUM_SLICES` value.
 
-For instance, to run a job across **four TPU slices**, you would change `--num-slices=1` to `--num-slices=4`. This tells XPK to allocate four `v5litepod-256` slices and orchestrate the training job across all of them as a single workload. Similarly, for GPUs, you would increase the `--num-nodes` value.
+For instance, to run a job across **four TPU slices**, you would change `export NUM_SLICES=1` to `export NUM_SLICES=4`. This tells XPK to allocate four `v5litepod-256` slices and orchestrate the training job across all of them as a single workload. Similarly, for GPUs, you would increase the same `NUM_SLICES` value.
 
 3. **Create the workload (run the job)**
 
   - **On your TPU cluster:**
 
-     ```
-     xpk workload create\
-       --cluster ${CLUSTER_NAME?}\
-       --workload ${USER}-tpu-job\
-       --base-docker-image maxtext_base_image\
-       --tpu-type v5litepod-256\
-       --num-slices 1\
-       --command "python3 -m maxtext.trainers.pre_train.train run_name=${USER}-tpu-job base_output_directory=${BASE_OUTPUT_DIR?} dataset_path=${DATASET_PATH?} steps=100"
+     ```bash
+     xpk workload create \
+       --cluster ${GKE_CLUSTER?} \
+       --workload ${RUN_NAME?} \
+       --base-docker-image maxtext_base_image \
+       --tpu-type v5litepod-256 \
+       --num-slices ${NUM_SLICES?} \
+       --command "python3 -m maxtext.trainers.pre_train.train run_name=${RUN_NAME?} base_output_directory=${BASE_OUTPUT_DIRECTORY?} dataset_path=${DATASET_PATH?} steps=100"
     ```
 
   - **On your GPU cluster:**
 
-     ```
-     xpk workload create\
-       --cluster ${CLUSTER_NAME?}\
-       --workload ${USER}-gpu-job\
-       --base-docker-image maxtext_base_image\
-       --device-type h100-80gb-8\
-       --num-nodes 2\
-       --command "python3 -m maxtext.trainers.pre_train.train run_name=${USER}-gpu-job base_output_directory=${BASE_OUTPUT_DIR?} dataset_path=${DATASET_PATH?} steps=100"
+     ```bash
+     xpk workload create \
+       --cluster ${GKE_CLUSTER?} \
+       --workload ${RUN_NAME?} \
+       --base-docker-image maxtext_base_image \
+       --device-type h100-80gb-8 \
+       --num-nodes ${NUM_SLICES?} \
+       --command "python3 -m maxtext.trainers.pre_train.train run_name=${RUN_NAME?} base_output_directory=${BASE_OUTPUT_DIRECTORY?} dataset_path=${DATASET_PATH?} steps=100"
     ```
 
 ______________________________________________________________________
@@ -172,20 +197,20 @@ ______________________________________________________________________
 
 2. Go to **Workloads**.
 
-3. Find your workload (e.g., `${USER}-tpu-job`) and click on it.
+3. Find your workload (e.g., `${RUN_NAME?}`) and click on it.
 
 4. Select the **Logs** tab to view the container logs.
 
 - **List your jobs:**
 
-  ```
-  xpk workload list --cluster ${CLUSTER_NAME?}
+  ```bash
+  xpk workload list --cluster ${GKE_CLUSTER?}
   ```
 
-- **Analyze output:** Checkpoints and other artifacts will be saved to the Google Cloud Storage bucket you specified in `BASE_OUTPUT_DIR`.
+- **Analyze output:** Checkpoints and other artifacts will be saved to the Google Cloud Storage bucket you specified in `BASE_OUTPUT_DIRECTORY`.
 
 - **Delete a job:**
 
-  ```
-  xpk workload delete --cluster ${CLUSTER_NAME?} --workload <your-workload-name>
+  ```bash
+  xpk workload delete --cluster ${GKE_CLUSTER?} --workload ${RUN_NAME?}
   ```
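As a rough aid when scaling `NUM_SLICES`: the numeric suffix of a TPU type (e.g. `v5litepod-256`) names the slice size, so the total accelerator count grows linearly with the slice count. A sketch; the helper and the chips-vs-cores reading of the suffix are our assumptions (the suffix counts cores rather than chips on some generations), so check the Cloud TPU docs for yours:

```python
def total_accelerators(tpu_type: str, num_slices: int) -> int:
    # Assumes the numeric suffix of the TPU type (e.g. 'v5litepod-256')
    # is the per-slice accelerator count.
    per_slice = int(tpu_type.rsplit("-", 1)[1])
    return per_slice * num_slices

print(total_accelerators("v5litepod-256", 4))  # 1024
```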

docs/tutorials/posttraining/full_finetuning.md

Lines changed: 19 additions & 6 deletions

@@ -34,13 +34,26 @@ Login to Hugging Face. Provide your access token when prompted:
 hf auth login
 ```
 
-```sh
+Set up the following environment variables to configure your training run. Replace
+placeholders with your actual values.
+
+```bash
 # -- Model configuration --
-export MODEL=<model name> # e.g., 'llama3.1-8b-Instruct'
+# The MaxText model name. See `ModelName` in `src/maxtext/configs/types.py` for a
+# full list of supported models.
+export MODEL=<MaxText Model> # e.g., 'llama3.1-8b-Instruct'
 
 # -- MaxText configuration --
-export BASE_OUTPUT_DIRECTORY=<output directory to store run logs> # e.g., gs://my-bucket/my-output-directory
-export RUN_NAME=<name for this run> # e.g., $(date +%Y-%m-%d-%H-%M-%S)
+# Use a GCS bucket you own to store logs and checkpoints. Ideally in the same
+# region as your TPUs to minimize latency and costs.
+# You can list your buckets and their locations in the
+# [Cloud Console](https://console.cloud.google.com/storage/browser).
+export BASE_OUTPUT_DIRECTORY=<gcs bucket path> # e.g., gs://my-bucket/maxtext-runs
+
+# An arbitrary string to identify this specific run.
+# We recommend including the model, user, and timestamp.
+# Note: Kubernetes requires workload names to be valid DNS labels (lowercase, no underscores or periods).
+export RUN_NAME=<Name for this run>
 ```
 
 ## Hugging Face checkpoint to Maxtext checkpoint
@@ -77,10 +90,10 @@ Run these steps once per project prior to any local development or cluster exper
 MaxText assumes these GCS buckets are created in the same project and that it has permissions to read and write from them.
 
 ```sh
-export PROJECT=<Google Cloud Project ID>
+export PROJECT_ID=<Google Cloud Project ID>
 export DATASET_GCS_BUCKET=<GCS for dataset> # e.g., gs://my-bucket/my-dataset
 
-bash tools/data_generation/download_dataset.sh ${PROJECT?} ${DATASET_GCS_BUCKET?}
+bash tools/data_generation/download_dataset.sh ${PROJECT_ID?} ${DATASET_GCS_BUCKET?}
 ```
 
 The above will download the c4 dataset to the GCS BUCKET.
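The `RUN_NAME` recommendation this commit adds (include model, user, and timestamp, while staying a valid DNS label) can be automated. A sketch; `make_run_name` is a hypothetical helper, not part of MaxText:

```python
import getpass
import re
from datetime import datetime

def make_run_name(model: str) -> str:
    # Combine model, user, and timestamp, then normalize to a valid
    # Kubernetes DNS label: lowercase, with '.', '_' etc. mapped to '-',
    # capped at 63 characters.
    raw = f"{model}-{getpass.getuser()}-{datetime.now():%Y-%m-%d-%H-%M-%S}"
    name = re.sub(r"[^a-z0-9-]+", "-", raw.lower()).strip("-")
    return name[:63].rstrip("-")

print(make_run_name("llama3.1-8b-Instruct"))
# e.g. llama3-1-8b-instruct-<user>-2025-01-01-12-00-00
```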
