
Commit e861926

rework application to use vLLM
1 parent 6063a98 commit e861926

2 files changed: 64 additions & 71 deletions

Dockerfile: 11 additions & 18 deletions
@@ -1,22 +1,15 @@
-FROM nvcr.io/nvidia/cuda:12.9.0-runtime-ubuntu24.04
+FROM pytorch/pytorch:2.9.0-cuda13.0-cudnn9-devel
 
-RUN apt-get update \
-    && apt-get install -y --no-install-recommends \
-        git \
-        python3 \
-        python3-pip \
-        python3-venv \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
+# Install vLLM and dependencies
+RUN pip install vllm accelerate
+# Optional: install bitsandbytes if needed
+RUN pip install bitsandbytes
 
-ENV VIRTUAL_ENV=/opt/env
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+# Set working directory
+WORKDIR /app
 
-RUN python3 -m venv $VIRTUAL_ENV
+# Copy your script and input file
+COPY . /app
 
-RUN pip install --no-cache-dir --upgrade pip
-
-COPY app.py requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-
-# CMD ["python", "app.py"]
+# Entrypoint
+CMD ["python", "app.py"]
app.py: 53 additions & 53 deletions

@@ -1,62 +1,62 @@
 import argparse
-import json
 import os
-import transformers
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-model_name = 'tiiuae/falcon-7b-instruct'
-
-tokenizer = AutoTokenizer.from_pretrained(
-    model_name,
-)
-
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map='auto',
-    low_cpu_mem_usage=False,
-)
+import json
+import time
+from vllm import LLM, SamplingParams
 
-generator = transformers.pipeline(
-    'text-generation',
-    model=model,
-    tokenizer=tokenizer,
-    device_map='auto',
-)
+access_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+task_index = os.getenv("CE_TASK_INDEX")
+#model_name = 'tiiuae/falcon-7b-instruct'
+model_name = os.getenv("MODEL_NAME", "ibm-granite/granite-4.0-h-small")
 
 parser = argparse.ArgumentParser()
 parser.add_argument('input')
 args = parser.parse_args()
 
-with open(args.input) as cmdFile:
-    for line in cmdFile:
-        inPath, outPath = line.split(';', 1)
-
-        with open(inPath) as inFile:
-            inputData = json.load(inFile)
-        inputData['quantitativeMeasures'] = []
-
-        recipe = ' '.join(inputData['directions'])
-        prompts = [
-            f'User: extract temperature and duration values for each step of the following recipe. Use the following format for each sentence of the recipe: temperature=..., duration=....\nRecipe:\n{recipe}\n\nAssistant:',
-            f'User: from the following recipe, list temperature and time like: temperature=..., duration=...\n{recipe}\n\nAssistant:',
-            f'User: summarize temperature and time values for this recipe, where applicable in the following format: step1: temperature=..., time=...; step2: etc.\n{recipe}\n\nAssistant:'
-        ]
-
-
-        for idx, prompt in enumerate(prompts):
-            outputs = generator(
-                prompt,
-                do_sample=False,
-                max_new_tokens=200,
-                return_full_text=False,
-                truncation=True
-            )
-            output_text = outputs[0]['generated_text']
-
-            print(f'output = {output_text}')
-
-            inputData['quantitativeMeasures'].append(output_text)
+def run_conversation():
+    start_download_time = time.perf_counter()
+
+    llm = LLM(model=model_name, quantization="fp8")  # or "int8" if supported
+    end_download_time = time.perf_counter()
+    print(f"Model initialization completed in {end_download_time - start_download_time:.2f} seconds")
+
+    prompts = []
+    with open(args.input, 'r', encoding='utf-8') as file:
+        for inPath in file:
+            with open(inPath.rstrip()) as inFile:
+                inputData = json.load(inFile)
+            inputData['quantitativeMeasures'] = []
+
+            recipe = ' '.join(inputData['directions'])
+            prompt = f'User: extract temperature and duration values for each step of the following recipe. Use the following format for each sentence of the recipe: temperature=..., duration=....\nRecipe:\n{recipe}\n\nAssistant:'
+            prompts.append(prompt)
+
+    sampling_params = SamplingParams(max_tokens=300)
+
+    start_time = time.perf_counter()
+    outputs = llm.generate(prompts, sampling_params)
+    end_time = time.perf_counter()
+
+    print(f"Batch inferencing completed in {end_time - start_time:.2f} seconds")
+
+    start_time_output = time.perf_counter()
+    # iterate through the outputs of each prompt
+    results = []
+    for i, output in enumerate(outputs):
+        result = {
+            "input": prompts[i],
+            "output": output.outputs[0].text
+        }
+        results.append(result)
+
+    # write output to a single file for the whole batch
+    output_path = "/output/inferencing_vllm_task-%s.jsonl" % task_index
+    with open(output_path, 'w', encoding='utf-8') as out_file:
+        out_file.write(json.dumps(results) + "\n")
+    end_time_output = time.perf_counter()
+
+    print(f"Writing outputs completed in {end_time_output - start_time_output:.2f} seconds")
+
+if __name__ == "__main__":
+    run_conversation()
 
-        os.makedirs(os.path.dirname(outPath), exist_ok=True)
-        with open(outPath, 'w') as outFile:
-            json.dump(inputData, outFile)
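
The rework replaces per-prompt calls through a transformers pipeline with one batched llm.generate() over all prompts, and writes a single result file per task (the CE_TASK_INDEX variable suggests the script runs as an indexed task of a batch job, e.g. on IBM Cloud Code Engine). Below is a minimal sketch of the vLLM offline-inference API that the new app.py relies on; the small placeholder model name is an assumption for illustration, not part of this commit:

from vllm import LLM, SamplingParams

# Load any Hugging Face model id; "facebook/opt-125m" is only a placeholder.
llm = LLM(model="facebook/opt-125m")
params = SamplingParams(max_tokens=64)

# generate() takes a list of prompts and batches them internally.
outputs = llm.generate(["User: say hello.\n\nAssistant:"], params)
for out in outputs:
    # each RequestOutput pairs the prompt with its completions
    print(out.prompt, "->", out.outputs[0].text)
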
