
Commit e861926

rework application to use vLLM
1 parent 6063a98 commit e861926

2 files changed: 64 additions & 71 deletions

Dockerfile: 11 additions & 18 deletions
@@ -1,22 +1,15 @@
-FROM nvcr.io/nvidia/cuda:12.9.0-runtime-ubuntu24.04
+FROM pytorch/pytorch:2.9.0-cuda13.0-cudnn9-devel
 
-RUN apt-get update \
-    && apt-get install -y --no-install-recommends \
-        git \
-        python3 \
-        python3-pip \
-        python3-venv \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
+# Install vLLM and dependencies
+RUN pip install vllm accelerate
+# Optional: install bitsandbytes if needed
+RUN pip install bitsandbytes
 
-ENV VIRTUAL_ENV=/opt/env
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+# Set working directory
+WORKDIR /app
 
-RUN python3 -m venv $VIRTUAL_ENV
+# Copy your script and input file
+COPY . /app
 
-RUN pip install --no-cache-dir --upgrade pip
-
-COPY app.py requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-
-# CMD ["python", "app.py"]
+# Entrypoint
+CMD ["python", "app.py"]
app.py: 53 additions & 53 deletions

@@ -1,62 +1,62 @@
 import argparse
-import json
 import os
-import transformers
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-model_name = 'tiiuae/falcon-7b-instruct'
-
-tokenizer = AutoTokenizer.from_pretrained(
-    model_name,
-)
-
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map='auto',
-    low_cpu_mem_usage=False,
-)
+import json
+import time
+from vllm import LLM, SamplingParams
 
-generator = transformers.pipeline(
-    'text-generation',
-    model=model,
-    tokenizer=tokenizer,
-    device_map='auto',
-)
+access_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+task_index = os.getenv("CE_TASK_INDEX")
+#model_name = 'tiiuae/falcon-7b-instruct'
+model_name = os.getenv("MODEL_NAME", "ibm-granite/granite-4.0-h-small")
 
 parser = argparse.ArgumentParser()
 parser.add_argument('input')
 args = parser.parse_args()
 
-with open(args.input) as cmdFile:
-    for line in cmdFile:
-        inPath, outPath = line.split(';', 1)
-
-        with open(inPath) as inFile:
-            inputData = json.load(inFile)
-        inputData['quantitativeMeasures'] = []
-
-        recipe = ' '.join(inputData['directions'])
-        prompts = [
-            f'User: extract temperature and duration values for each step of the following recipe. Use the following format for each sentence of the recipe: temperature=..., duration=....\nRecipe:\n{recipe}\n\nAssistant:',
-            f'User: from the following recipe, list temperature and time like: temperature=..., duration=...\n{recipe}\n\nAssistant:',
-            f'User: summarize temperature and time values for this recipe, where applicable in the following format: step1: temperature=..., time=...; step2: etc.\n{recipe}\n\nAssistant:'
-        ]
-
-
-        for idx, prompt in enumerate(prompts):
-            outputs = generator(
-                prompt,
-                do_sample=False,
-                max_new_tokens=200,
-                return_full_text=False,
-                truncation=True
-            )
-            output_text = outputs[0]['generated_text']
-
-            print(f'output = {output_text}')
-
-            inputData['quantitativeMeasures'].append(output_text)
+def run_conversation():
+    start_download_time = time.perf_counter()
+
+    llm = LLM(model=model_name, quantization="fp8")  # or "int8" if supported
+    end_download_time = time.perf_counter()
+    print(f"Model initialization completed in {end_download_time - start_download_time:.2f} seconds")
+
+    prompts = []
+    with open(args.input, 'r', encoding='utf-8') as file:
+        for inPath in file:
+            with open(inPath.rstrip()) as inFile:
+                inputData = json.load(inFile)
+            inputData['quantitativeMeasures'] = []
+
+            recipe = ' '.join(inputData['directions'])
+            prompt = f'User: extract temperature and duration values for each step of the following recipe. Use the following format for each sentence of the recipe: temperature=..., duration=....\nRecipe:\n{recipe}\n\nAssistant:'
+            prompts.append(prompt)
+
+    sampling_params = SamplingParams(max_tokens=300)
+
+    start_time = time.perf_counter()
+    outputs = llm.generate(prompts, sampling_params)
+    end_time = time.perf_counter()
+
+    print(f"Batch inferencing completed in {end_time - start_time:.2f} seconds")
+
+    start_time_output = time.perf_counter()
+    # iterate through the outputs of each prompt
+    results = []
+    for i, output in enumerate(outputs):
+        result = {
+            "input": prompts[i],
+            "output": output.outputs[0].text
+        }
+        results.append(result)
+
+    # write output to a single file for the whole batch
+    output_path = "/output/inferencing_vllm_task-%s.jsonl" % task_index
+    with open(output_path, 'w', encoding='utf-8') as out_file:
+        out_file.write(json.dumps(results) + "\n")
+    end_time_output = time.perf_counter()
+
+    print(f"Writing outputs completed in {end_time_output - start_time_output:.2f} seconds")
+
+if __name__ == "__main__":
+    run_conversation()
 
-        os.makedirs(os.path.dirname(outPath), exist_ok=True)
-        with open(outPath, 'w') as outFile:
-            json.dump(inputData, outFile)
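
The rework replaces per-prompt calls through a transformers pipeline with one batched llm.generate() over all prompts, and writes a single result file per task (the CE_TASK_INDEX variable suggests the script runs as an indexed task of a batch job, e.g. on IBM Cloud Code Engine). Below is a minimal sketch of the vLLM offline-inference API that the new app.py relies on; the small placeholder model name is an assumption for illustration, not part of this commit:

from vllm import LLM, SamplingParams

# Load any Hugging Face model id; "facebook/opt-125m" is only a placeholder.
llm = LLM(model="facebook/opt-125m")
params = SamplingParams(max_tokens=64)

# generate() takes a list of prompts and batches them internally.
outputs = llm.generate(["User: say hello.\n\nAssistant:"], params)
for out in outputs:
    # each RequestOutput pairs the prompt with its completions
    print(out.prompt, "->", out.outputs[0].text)
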
