
Commit 909361b

Accuracy eval mlperf (#76)

* align accuracy eval with mlperf llama2 setup.
* modify readme for accuracy evaluation.
* check in openorca calibration dataset.
1 parent dabded4 · commit 909361b

3 files changed: 41 additions & 11 deletions

File tree:

- benchmarks/README.md
- benchmarks/benchmark_serving.py
- benchmarks/open_orca_gpt4_tokenized_llama.calibration_1000.pkl

benchmarks/README.md

Lines changed: 23 additions & 3 deletions
````diff
@@ -7,7 +7,7 @@ cd ~/JetStream/benchmarks
 pip install -r requirements.in
 ```
 
-## Benchmark
+## Benchmark with shareGPT
 
 ### Prepare DataSet
 
@@ -61,11 +61,31 @@ python benchmark_serving.py \
 
 ```
 
+## Benchmark with openorca dataset (openorca is used by MLPerf inference for LLaMA2 models)
+```
+python JetStream/benchmarks/benchmark_serving.py \
+--tokenizer ~/maxtext/assets/tokenizer.llama2 \
+--warmup-first true \
+--save-result \
+--save-request-outputs \
+--request-outputs-file-path outputs.json \
+--num-prompts 1000 \
+--max-output-length 1024 \
+--dataset openorca
+
+```
+
 ## Standalone Evaluation Run
 
 If you used `--save-request-outputs`, you can separately evaluate against the generated outputs.
 
 ```
-python eval_accuracy.py
+python eval_accuracy.py outputs.json
 
-```
+```
+
+With openorca dataset and llama2-chat models (used by MLPerf), here are the reference accuracy numbers:
+```
+llama2-7b-chat {'rouge1': 42.0706, 'rouge2': 19.8021, 'rougeL': 26.8474, 'rougeLsum': 39.5952, 'gen_len': 1146679, 'gen_num': 998}
+llama2-70b-chat {'rouge1': 44.4312, 'rouge2': 22.0352, 'rougeL': 28.6162}
+```
````

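The reference numbers above are ROUGE F-measures, the metric MLPerf uses for the LLaMA2 task. `eval_accuracy.py` itself is not shown in this diff; purely as an illustration of the metric, here is a minimal sketch using the `rouge_score` package, where `rouge_summary`, `predictions`, and `references` are placeholder names rather than identifiers from the repo, and the actual script may use a different ROUGE backend, so exact numbers can differ:

```python
# Minimal sketch: average ROUGE F-measures over (prediction, reference)
# pairs, scaled to 0-100 to match the table above. Assumes
# `pip install rouge-score`; this is NOT the actual eval_accuracy.py.
from rouge_score import rouge_scorer


def rouge_summary(predictions: list[str], references: list[str]) -> dict[str, float]:
  metrics = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
  scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
  totals = {m: 0.0 for m in metrics}
  for pred, ref in zip(predictions, references):
    scores = scorer.score(ref, pred)  # signature is score(target, prediction)
    for m in metrics:
      totals[m] += scores[m].fmeasure
  n = max(len(predictions), 1)
  return {m: round(100.0 * totals[m] / n, 4) for m in metrics}
```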
benchmarks/benchmark_serving.py

Lines changed: 18 additions & 8 deletions
```diff
@@ -66,13 +66,17 @@
 import random
 import time
 from typing import Any, AsyncGenerator, Optional
+import os
+
 
 import grpc
 from jetstream.core.proto import jetstream_pb2
 from jetstream.core.proto import jetstream_pb2_grpc
 from jetstream.engine.token_utils import load_vocab
 import numpy as np
 from tqdm.asyncio import tqdm  # pytype: disable=pyi-error
+import pandas
+
 from eval_accuracy import eval_accuracy
 
 
@@ -163,14 +167,20 @@ def load_sharegpt_dataset(
   return dataset
 
 
-def load_openorca_dataset(dataset_path: str) -> list[tuple[Any, Any]]:
-  # Load the dataset.
-  with open(dataset_path, "r", encoding="utf-8") as f:
-    dataset = json.load(f)
+def load_openorca_dataset_pkl():
+  # read pickle file
+  samples = pandas.read_pickle(
+      os.path.join(
+          os.path.dirname(os.path.relpath(__file__)),
+          "open_orca_gpt4_tokenized_llama.calibration_1000.pkl",
+      )
+  )
 
-  # Tokenize the prompts and completions.
-  prompts = dataset["prompts"]
-  outputs = dataset["results"]
+  prompts = []
+  outputs = []
+  for _, row in samples.iterrows():
+    prompts.append(row["input"])
+    outputs.append(row["output"])
 
   return [(prompt, output) for prompt, output in zip(prompts, outputs)]
 
@@ -542,7 +552,7 @@ def main(args: argparse.Namespace):
     )  # e.g. [("AB", 2, "AB", 3)]
   else:
     if args.dataset == "openorca":
-      dataset = load_openorca_dataset(args.dataset_path)
+      dataset = load_openorca_dataset_pkl()
     elif args.dataset == "sharegpt":
       dataset = load_sharegpt_dataset(
           args.dataset_path,
```
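Since the new loader takes no arguments and resolves the pickle path relative to `benchmark_serving.py`, `--dataset openorca` no longer uses `--dataset-path`. A quick, hypothetical smoke test for the loader (not part of this commit; assumes you run it from `JetStream/benchmarks` in an environment where the script's gRPC and JetStream imports resolve):

```python
# Hypothetical smoke test: the new loader should return (prompt, output)
# string pairs read from the checked-in calibration pickle.
from benchmark_serving import load_openorca_dataset_pkl

pairs = load_openorca_dataset_pkl()
print(f"loaded {len(pairs)} samples")  # the filename suggests 1000 rows
prompt, output = pairs[0]
assert isinstance(prompt, str) and isinstance(output, str)
```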
benchmarks/open_orca_gpt4_tokenized_llama.calibration_1000.pkl

3.54 MB · Binary file not shown.
