
Commit 909361b

Accuracy eval mlperf (#76)

* align accuracy eval with mlperf llama2 setup.
* modify readme for accuracy evaluation.
* check in openorca calibration dataset.
1 parent dabded4 · commit 909361b

3 files changed: 41 additions & 11 deletions

File tree:

- benchmarks/README.md
- benchmarks/benchmark_serving.py
- benchmarks/open_orca_gpt4_tokenized_llama.calibration_1000.pkl

benchmarks/README.md

Lines changed: 23 additions & 3 deletions
````diff
@@ -7,7 +7,7 @@ cd ~/JetStream/benchmarks
 pip install -r requirements.in
 ```
 
-## Benchmark
+## Benchmark with shareGPT
 
 ### Prepare DataSet
 
@@ -61,11 +61,31 @@ python benchmark_serving.py \
 
 ```
 
+## Benchmark with openorca dataset (openorca is used by MLPerf inference for LLaMA2 models)
+```
+python JetStream/benchmarks/benchmark_serving.py \
+--tokenizer ~/maxtext/assets/tokenizer.llama2 \
+--warmup-first true \
+--save-result \
+--save-request-outputs \
+--request-outputs-file-path outputs.json \
+--num-prompts 1000 \
+--max-output-length 1024 \
+--dataset openorca
+
+```
+
 ## Standalone Evaluation Run
 
 If you used `--save-request-outputs`, you can separately evaluate against the generated outputs.
 
 ```
-python eval_accuracy.py
+python eval_accuracy.py outputs.json
 
-```
+```
+
+With openorca dataset and llama2-chat models (used by MLPerf), here are the reference accuracy numbers:
+```
+llama2-7b-chat {'rouge1': 42.0706, 'rouge2': 19.8021, 'rougeL': 26.8474, 'rougeLsum': 39.5952, 'gen_len': 1146679, 'gen_num': 998}
+llama2-70b-chat {'rouge1': 44.4312, 'rouge2': 22.0352, 'rougeL': 28.6162}
+```
````

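The reference numbers above are ROUGE F-measures, the metric MLPerf uses for the LLaMA2 task. `eval_accuracy.py` itself is not shown in this diff; purely as an illustration of the metric, here is a minimal sketch using the `rouge_score` package, where `rouge_summary`, `predictions`, and `references` are placeholder names rather than identifiers from the repo, and the actual script may use a different ROUGE backend, so exact numbers can differ:

```python
# Minimal sketch: average ROUGE F-measures over (prediction, reference)
# pairs, scaled to 0-100 to match the table above. Assumes
# `pip install rouge-score`; this is NOT the actual eval_accuracy.py.
from rouge_score import rouge_scorer


def rouge_summary(predictions: list[str], references: list[str]) -> dict[str, float]:
  metrics = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
  scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
  totals = {m: 0.0 for m in metrics}
  for pred, ref in zip(predictions, references):
    scores = scorer.score(ref, pred)  # signature is score(target, prediction)
    for m in metrics:
      totals[m] += scores[m].fmeasure
  n = max(len(predictions), 1)
  return {m: round(100.0 * totals[m] / n, 4) for m in metrics}
```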
benchmarks/benchmark_serving.py

Lines changed: 18 additions & 8 deletions
```diff
@@ -66,13 +66,17 @@
 import random
 import time
 from typing import Any, AsyncGenerator, Optional
+import os
+
 
 import grpc
 from jetstream.core.proto import jetstream_pb2
 from jetstream.core.proto import jetstream_pb2_grpc
 from jetstream.engine.token_utils import load_vocab
 import numpy as np
 from tqdm.asyncio import tqdm  # pytype: disable=pyi-error
+import pandas
+
 from eval_accuracy import eval_accuracy
 
 
@@ -163,14 +167,20 @@ def load_sharegpt_dataset(
   return dataset
 
 
-def load_openorca_dataset(dataset_path: str) -> list[tuple[Any, Any]]:
-  # Load the dataset.
-  with open(dataset_path, "r", encoding="utf-8") as f:
-    dataset = json.load(f)
+def load_openorca_dataset_pkl():
+  # read pickle file
+  samples = pandas.read_pickle(
+      os.path.join(
+          os.path.dirname(os.path.relpath(__file__)),
+          "open_orca_gpt4_tokenized_llama.calibration_1000.pkl",
+      )
+  )
 
-  # Tokenize the prompts and completions.
-  prompts = dataset["prompts"]
-  outputs = dataset["results"]
+  prompts = []
+  outputs = []
+  for _, row in samples.iterrows():
+    prompts.append(row["input"])
+    outputs.append(row["output"])
 
   return [(prompt, output) for prompt, output in zip(prompts, outputs)]
 
@@ -542,7 +552,7 @@ def main(args: argparse.Namespace):
     )  # e.g. [("AB", 2, "AB", 3)]
   else:
     if args.dataset == "openorca":
-      dataset = load_openorca_dataset(args.dataset_path)
+      dataset = load_openorca_dataset_pkl()
     elif args.dataset == "sharegpt":
       dataset = load_sharegpt_dataset(
           args.dataset_path,
```
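Since the new loader takes no arguments and resolves the pickle path relative to `benchmark_serving.py`, `--dataset openorca` no longer uses `--dataset-path`. A quick, hypothetical smoke test for the loader (not part of this commit; assumes you run it from `JetStream/benchmarks` in an environment where the script's gRPC and JetStream imports resolve):

```python
# Hypothetical smoke test: the new loader should return (prompt, output)
# string pairs read from the checked-in calibration pickle.
from benchmark_serving import load_openorca_dataset_pkl

pairs = load_openorca_dataset_pkl()
print(f"loaded {len(pairs)} samples")  # the filename suggests 1000 rows
prompt, output = pairs[0]
assert isinstance(prompt, str) and isinstance(output, str)
```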
benchmarks/open_orca_gpt4_tokenized_llama.calibration_1000.pkl

3.54 MB · Binary file not shown.
