Add model ckpt conversion and AQT scripts for JetStream MaxText Serving (#23)

JoeZijunZhou · web-flow · commit 22458763af4f · 2024-03-29T18:34:12.000-07:00
* add model ckpt conversion scripts for JetStream MaxText Serving

* Add description and device requirements

* address nit

* verification
diff --git a/README.md b/README.md
@@ -6,6 +6,11 @@ JetStream is a fast library for LLM inference and serving on TPUs.
 
 ## Getting Started
 
+### Setup
+```
+pip install -r requirements.txt
+```
+
 ### Run local server & Testing
 
 Use the following commands to run a server locally:
@@ -14,10 +19,10 @@ Use the following commands to run a server locally:
 python -m jetstream.core.implementations.mock.server
 
 # Test local mock server
-python -m jetstream.core.tools.requester
+python -m jetstream.tools.requester
 
 # Load test local mock server
-python -m jetstream.core.tools.load_tester
+python -m jetstream.tools.load_tester
 
 ```
 
diff --git a/benchmarks/eval_accuracy.py b/benchmarks/eval_accuracy.py
@@ -1,3 +1,17 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import nltk
 import evaluate
diff --git a/jetstream/core/utils/async_multifuture.py b/jetstream/core/utils/async_multifuture.py
@@ -1,3 +1,17 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import asyncio
 from concurrent import futures
 import threading
diff --git a/jetstream/tests/engine/test_token_utils.py b/jetstream/tests/engine/test_token_utils.py
@@ -1,3 +1,17 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import unittest
 from typing import List
diff --git a/jetstream/tools/load_tester.py b/jetstream/tools/load_tester.py
diff --git a/jetstream/tools/maxtext/model_ckpt_conversion.sh b/jetstream/tools/maxtext/model_ckpt_conversion.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script will do the following:
+# - Create GCS buckets to store model artifacts for the JetStream MaxText inference demo.
+# - Convert the downloaded checkpoints to MaxText compatible checkpoints.
+# - Convert the MaxText compatible checkpoints to unscanned checkpoints for inference.
+# Device requirements:
+# - Both checkpoint conversions only require a CPU (JAX is run with JAX_PLATFORMS=cpu).
+set -ex
+
+idx=$(date +%Y-%m-%d-%H-%M)
+# Usage: model_ckpt_conversion.sh MODEL MODEL_VARIATION CKPT_SOURCE_PATH -- all three arguments are required, so fail fast (before any bucket is touched) when one is missing.
+export MODEL="${1:?usage: model_ckpt_conversion.sh MODEL MODEL_VARIATION CKPT_SOURCE_PATH}"
+export MODEL_VARIATION="${2:?usage: model_ckpt_conversion.sh MODEL MODEL_VARIATION CKPT_SOURCE_PATH}"
+CKPT_SOURCE_PATH="${3:?usage: model_ckpt_conversion.sh MODEL MODEL_VARIATION CKPT_SOURCE_PATH}"
+export MODEL_NAME=${MODEL}-${MODEL_VARIATION}
+
+# The downloaded checkpoints are copied to the GCS path $CHKPT_BUCKET further below.
+# Use separate GCS paths for the open source model weights ($CHKPT_BUCKET) and the MaxText compatible weights ($MODEL_BUCKET); point both at a GCS bucket that you created.
+export CHKPT_BUCKET=gs://${USER}-maxtext/chkpt/${MODEL}/${MODEL_VARIATION}
+export MODEL_BUCKET=gs://${USER}-maxtext
+
+# Point `BASE_OUTPUT_DIRECTORY` to a GCS bucket that you created, this bucket will store all the files generated by MaxText during a run.
+export BASE_OUTPUT_DIRECTORY=gs://${USER}-runner-maxtext-logs
+
+# Point `DATASET_PATH` to the GCS bucket where you have your training data
+export DATASET_PATH=gs://${USER}-maxtext-dataset
+
+export BUCKET_LOCATION=US
+
+# Create three GCS buckets for the demo; `|| true` lets the script continue under `set -e` when a create command fails (presumably because the bucket already exists).
+gcloud storage buckets create "${MODEL_BUCKET}" --location="${BUCKET_LOCATION}" || true
+gcloud storage buckets create "${BASE_OUTPUT_DIRECTORY}" --location="${BUCKET_LOCATION}" || true
+gcloud storage buckets create "${DATASET_PATH}" --location="${BUCKET_LOCATION}" || true
+
+# Copy the downloaded checkpoints to `CHKPT_BUCKET`.
+# Gemma example: gsutil -m cp -r 7b ${CHKPT_BUCKET}
+# Llama2 example: gsutil -m cp -r llama-2-7b ${CHKPT_BUCKET}
+sudo gsutil -m cp -r "${CKPT_SOURCE_PATH}" "${CHKPT_BUCKET}"
+
+# Convert model checkpoints to MaxText compatible checkpoints.
+if [ "$MODEL" == "gemma" ]; then
+    CONVERT_CKPT_SCRIPT="convert_gemma_chkpt.py"
+    JAX_PLATFORMS=cpu python "MaxText/${CONVERT_CKPT_SCRIPT}" \
+    --base_model_path "${CHKPT_BUCKET}" \
+    --maxtext_model_path "${MODEL_BUCKET}/${MODEL}/${MODEL_VARIATION}/${idx}" \
+    --model_size "${MODEL_VARIATION}"
+else
+    # We install torch CPU because the checkpoint conversion script MaxText/llama_or_mistral_ckpt.py does not need a TPU/GPU
+    pip install torch --index-url https://download.pytorch.org/whl/cpu
+    CONVERT_CKPT_SCRIPT="llama_or_mistral_ckpt.py"
+    JAX_PLATFORMS=cpu python "MaxText/${CONVERT_CKPT_SCRIPT}" \
+    --base-model-path "${CHKPT_BUCKET}" \
+    --maxtext-model-path "${MODEL_BUCKET}/${MODEL}/${MODEL_VARIATION}/${idx}" \
+    --model-size "${MODEL_VARIATION}"
+fi
+echo "Written MaxText compatible checkpoint to ${MODEL_BUCKET}/${MODEL}/${MODEL_VARIATION}/${idx}"
+
+# We define `CONVERTED_CHECKPOINT` to refer to the checkpoint subdirectory.
+export CONVERTED_CHECKPOINT=${MODEL_BUCKET}/${MODEL}/${MODEL_VARIATION}/${idx}/0/items
+
+# Convert MaxText compatible checkpoints to unscanned checkpoints.
+# Note that the `CONVERTED_CHECKPOINT` is in a `scanned` format which is great for training but for efficient decoding performance we want the checkpoint in an `unscanned` format.
+export RUN_NAME=${MODEL_NAME}_unscanned_chkpt_${idx}
+
+JAX_PLATFORMS=cpu python MaxText/generate_param_only_checkpoint.py \
+MaxText/configs/base.yml \
+base_output_directory="${BASE_OUTPUT_DIRECTORY}" \
+load_parameters_path="${CONVERTED_CHECKPOINT}" \
+run_name="${RUN_NAME}" \
+model_name="${MODEL_NAME}" \
+force_unroll=true
+echo "Written MaxText unscanned checkpoint to ${BASE_OUTPUT_DIRECTORY}/${RUN_NAME}/checkpoints"
+
+# We will use the unscanned checkpoints by passing `UNSCANNED_CKPT_PATH` into `LOAD_PARAMETERS_PATH` in the following sections (the export is only visible to the caller when this script is sourced).
+export UNSCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/${RUN_NAME}/checkpoints/0/items
diff --git a/jetstream/tools/maxtext/model_ckpt_finetune_with_aqt.sh b/jetstream/tools/maxtext/model_ckpt_finetune_with_aqt.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script will do the following:
+# - Fine-tune the MaxText compatible checkpoint (converted from the original checkpoints) with AQT.
+# - Convert the AQT-fine-tuned checkpoints to unscanned checkpoints for inference.
+# TPU device requirements:
+# - For llama2-7b, it requires at least a v5e-8 TPU VM.
+# - For llama2-13B/70b, it requires a v4-128 TPU VM.
+set -ex
+
+idx=$(date +%Y-%m-%d-%H-%M)
+# Usage: model_ckpt_finetune_with_aqt.sh MODEL MODEL_VARIATION GCS_USER_PROJECT [CONVERTED_CKPT_IDX] -- the first three arguments are required, the fourth is optional.
+export MODEL="${1:?usage: model_ckpt_finetune_with_aqt.sh MODEL MODEL_VARIATION GCS_USER_PROJECT [CONVERTED_CKPT_IDX]}"
+export MODEL_VARIATION="${2:?usage: model_ckpt_finetune_with_aqt.sh MODEL MODEL_VARIATION GCS_USER_PROJECT [CONVERTED_CKPT_IDX]}"
+export MODEL_NAME=${MODEL}-${MODEL_VARIATION}
+
+# After downloading checkpoints, copy them to the GCS bucket at $CHKPT_BUCKET.
+# Please use separate GCS paths for uploading open source model weights ($CHKPT_BUCKET) and MaxText compatible weights ($MODEL_BUCKET).
+# Point these variables to a GCS bucket that you created.
+export CHKPT_BUCKET=gs://${USER}-maxtext/chkpt/${MODEL}/${MODEL_VARIATION}
+export MODEL_BUCKET=gs://${USER}-maxtext
+
+# Point `BASE_OUTPUT_DIRECTORY` to a GCS bucket that you created, this bucket will store all the files generated by MaxText during a run.
+export BASE_OUTPUT_DIRECTORY=gs://${USER}-runner-maxtext-logs
+
+# Point `DATASET_PATH` to the GCS bucket where you have your training data
+export DATASET_PATH=gs://${USER}-maxtext-dataset
+
+# Prepare C4 dataset for fine tuning: https://github.com/allenai/allennlp/discussions/5056 (the third argument is handed to `gsutil -u`, i.e. the project billed for the request).
+sudo gsutil -u "${3:?usage: model_ckpt_finetune_with_aqt.sh MODEL MODEL_VARIATION GCS_USER_PROJECT [CONVERTED_CKPT_IDX]}" -m cp 'gs://allennlp-tensorflow-datasets/c4/en/3.0.1/*' "${DATASET_PATH}/c4/en/3.0.1/"
+
+# `CONVERTED_CHECKPOINT` refers to the converted checkpoint subdirectory. Pass the timestamp directory it was written under as the fourth argument; without it the path falls back to this run's fresh timestamp, which will not match a checkpoint converted earlier.
+export CONVERTED_CHECKPOINT=${MODEL_BUCKET}/${MODEL}/${MODEL_VARIATION}/${4:-${idx}}/0/items
+
+# Fine tune the converted model checkpoints with AQT.
+export RUN_NAME=finetune_aqt_${idx}
+
+python3 MaxText/train.py \
+MaxText/configs/base.yml \
+run_name="${RUN_NAME}" \
+base_output_directory="${BASE_OUTPUT_DIRECTORY}" \
+dataset_path="${DATASET_PATH}" \
+steps=501 \
+enable_checkpointing=True \
+load_parameters_path="${CONVERTED_CHECKPOINT}" \
+model_name="${MODEL_NAME}" \
+per_device_batch_size=1 \
+quantization=int8 \
+checkpoint_period=100
+
+# We will convert the `AQT_CKPT` to unscanned checkpoint in the next step. NOTE(review): this selects the step-100 checkpoint although training runs with steps=501 and checkpoint_period=100 -- confirm that is intended.
+export AQT_CKPT=${BASE_OUTPUT_DIRECTORY}/${RUN_NAME}/checkpoints/100/items
+
+# Convert MaxText compatible AQT-fine-tuned checkpoints to unscanned checkpoints.
+# Note that the `AQT_CKPT` is in a `scanned` format which is great for training but for efficient decoding performance we want the checkpoint in an `unscanned` format.
+export RUN_NAME=${MODEL_NAME}_unscanned_chkpt_${idx}
+
+JAX_PLATFORMS=cpu python MaxText/generate_param_only_checkpoint.py \
+MaxText/configs/base.yml \
+base_output_directory="${BASE_OUTPUT_DIRECTORY}" \
+load_parameters_path="${AQT_CKPT}" \
+run_name="${RUN_NAME}" \
+model_name="${MODEL_NAME}" \
+force_unroll=true
+echo "Written MaxText unscanned checkpoint to ${BASE_OUTPUT_DIRECTORY}/${RUN_NAME}/checkpoints"
+
+# We will use the unscanned checkpoints by passing `UNSCANNED_CKPT_PATH` into `LOAD_PARAMETERS_PATH` in the following sections (the export is only visible to the caller when this script is sourced).
+export UNSCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/${RUN_NAME}/checkpoints/0/items
diff --git a/jetstream/tools/requester.py b/jetstream/tools/requester.py