Commit 9469054

Update maxtext user guide (#56)
1 parent: 0cbe8ba

2 files changed: 7 additions & 4 deletions


README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -16,7 +16,7 @@ Currently, there are two reference engine implementations available -- one for J
 ### Jax
 
 - Git: https://github.com/google/maxtext
-- README: https://github.com/google/JetStream/blob/main/jetstream/max_text/README.md
+- README: https://github.com/google/JetStream/blob/main/docs/online-inference-with-maxtext-engine.md
 
 ### Pytorch
```

jetstream/max_text/README.md renamed to docs/online-inference-with-maxtext-engine.md

Lines changed: 6 additions & 3 deletions

````diff
@@ -126,7 +126,7 @@ export ICI_AUTOREGRESSIVE_PARALLELISM=-1
 export ICI_TENSOR_PARALLELISM=1
 export SCAN_LAYERS=false
 export WEIGHT_DTYPE=bfloat16
-export PER_DEVICE_BATCH_SIZE=6
+export PER_DEVICE_BATCH_SIZE=4
 ```
 
 #### Create Llama2-13b environment variables for server flags
````
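For reference, the post-commit state of the environment-variable block touched by this hunk reads as follows. This reproduces only the lines visible in the diff context; any exports outside the hunk are not shown, and which model section this block belongs to is not stated in the hunk itself.

```shell
# Post-commit state of the hunk above (context lines reproduced as-is).
export ICI_AUTOREGRESSIVE_PARALLELISM=-1
export ICI_TENSOR_PARALLELISM=1
export SCAN_LAYERS=false
export WEIGHT_DTYPE=bfloat16
export PER_DEVICE_BATCH_SIZE=4
```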
````diff
@@ -191,7 +191,10 @@ Note: these flags are from [MaxText config](https://github.com/google/maxtext/bl
 
 ```bash
 cd ~
-python JetStream/jetstream/tools/requester.py
+# For Gemma model
+python JetStream/jetstream/tools/requester.py --tokenizer maxtext/assets/tokenizer.gemma
+# For Llama2 model
+python JetStream/jetstream/tools/requester.py --tokenizer maxtext/assets/tokenizer.llama2
 ```
 
 The output will be similar to the following:
````
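The lines added by this commit run the same requester tool with a per-model tokenizer asset. A minimal sketch of that selection as a shell helper (`tokenizer_for` is a hypothetical name, not part of the commit; the asset paths are the ones from the diff):

```shell
# Hypothetical helper: map a model family to the tokenizer asset
# used by the requester commands added in this commit.
tokenizer_for() {
  case "$1" in
    gemma)  echo "maxtext/assets/tokenizer.gemma" ;;
    llama2) echo "maxtext/assets/tokenizer.llama2" ;;
    *)      return 1 ;;
  esac
}

# Build the command line the same way the new doc lines do:
echo "python JetStream/jetstream/tools/requester.py --tokenizer $(tokenizer_for gemma)"
```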
```diff
@@ -250,7 +253,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 # run benchmark with the downloaded dataset and the tokenizer in maxtext
 # You can control the qps by setting `--request-rate`, the default value is inf.
 python JetStream/benchmarks/benchmark_serving.py \
---tokenizer /home/$USER/maxtext/assets/tokenizer.gemma \
+--tokenizer maxtext/assets/tokenizer.gemma \
 --num-prompts 1000 \
 --dataset sharegpt \
 --dataset-path ~/ShareGPT_V3_unfiltered_cleaned_split.json \
```
