|
66 | 66 | import random |
67 | 67 | import time |
68 | 68 | from typing import Any, AsyncGenerator, Optional |
| 69 | +import os |
| 70 | + |
69 | 71 |
|
70 | 72 | import grpc |
71 | 73 | from jetstream.core.proto import jetstream_pb2 |
72 | 74 | from jetstream.core.proto import jetstream_pb2_grpc |
73 | 75 | from jetstream.engine.token_utils import load_vocab |
74 | 76 | import numpy as np |
75 | 77 | from tqdm.asyncio import tqdm # pytype: disable=pyi-error |
| 78 | +import pandas |
| 79 | + |
76 | 80 | from eval_accuracy import eval_accuracy |
77 | 81 |
|
78 | 82 |
|
@@ -163,14 +167,20 @@ def load_sharegpt_dataset( |
163 | 167 | return dataset |
164 | 168 |
|
165 | 169 |
|
166 | | -def load_openorca_dataset(dataset_path: str) -> list[tuple[Any, Any]]: |
167 | | - # Load the dataset. |
168 | | - with open(dataset_path, "r", encoding="utf-8") as f: |
169 | | - dataset = json.load(f) |
| 170 | +def load_openorca_dataset_pkl(): |
| 171 | + # read pickle file |
| 172 | + samples = pandas.read_pickle( |
| 173 | + os.path.join( |
| 174 | + os.path.dirname(os.path.relpath(__file__)), |
| 175 | + "open_orca_gpt4_tokenized_llama.calibration_1000.pkl", |
| 176 | + ) |
| 177 | + ) |
170 | 178 |
|
171 | | - # Tokenize the prompts and completions. |
172 | | - prompts = dataset["prompts"] |
173 | | - outputs = dataset["results"] |
| 179 | + prompts = [] |
| 180 | + outputs = [] |
| 181 | + for _, row in samples.iterrows(): |
| 182 | + prompts.append(row["input"]) |
| 183 | + outputs.append(row["output"]) |
174 | 184 |
|
175 | 185 | return [(prompt, output) for prompt, output in zip(prompts, outputs)] |
176 | 186 |
|
@@ -542,7 +552,7 @@ def main(args: argparse.Namespace): |
542 | 552 | ) # e.g. [("AB", 2, "AB", 3)] |
543 | 553 | else: |
544 | 554 | if args.dataset == "openorca": |
545 | | - dataset = load_openorca_dataset(args.dataset_path) |
| 555 | + dataset = load_openorca_dataset_pkl() |
546 | 556 | elif args.dataset == "sharegpt": |
547 | 557 | dataset = load_sharegpt_dataset( |
548 | 558 | args.dataset_path, |
|
0 commit comments