# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
import logging
import time
import threading

from cozeloop import set_log_level, new_client

class LLMRunner:
    """Wraps a cozeloop client and reports simulated LLM calls as trace spans."""

    def __init__(self, client):
        # cozeloop client used to start trace spans.
        self.client = client

    def llm_call(self):
        """Simulate a single LLM call and record it as a `model` span.

        Starts a span named "llmCall", fills in the standard LLM tags
        (input/output, provider, model name, token counts, first-response
        timestamp) and always finishes the span, even if tagging raises.
        """
        input_data = 'test input'
        span = self.client.start_span("llmCall", "model")
        try:
            # A real integration would invoke the model here, e.g.:
            # output = ChatOpenAI().invoke(input=input_data)
            # Mock the response and its latency instead.
            time.sleep(1)
            output = "I'm a robot. I don't have a specific name. You can give me one."
            input_token = 232
            output_token = 1211

            # set tag key: `input`
            span.set_input(input_data)
            # set tag key: `output`
            span.set_output(output)
            # set tag key: `model_provider`, e.g. openai
            span.set_model_provider("openai")
            # set tag key: `start_time_first_resp` — timestamp of the first
            # packet returned from the LLM, unit: microseconds. When it is set,
            # a `latency_first_resp` tag (first-packet latency relative to the
            # span's start time) is added automatically.
            span.set_start_time_first_resp(int(time.time() * 1000000))
            # set tag key: `input_tokens`; automatically summed with
            # `output_tokens` to produce the `tokens` tag.
            span.set_input_tokens(input_token)
            # set tag key: `output_tokens`; automatically summed with
            # `input_tokens` to produce the `tokens` tag.
            span.set_output_tokens(output_token)
            # set tag key: `model_name`, e.g. gpt-4-1106-preview
            span.set_model_name("gpt-4-1106-preview")
        finally:
            # Always close the span so it is reported even when tagging fails.
            # (The original `except Exception as e: raise e` was a no-op
            # re-raise and has been removed; exceptions still propagate.)
            span.finish()


# Enable verbose SDK logging, then build the shared client and runner used
# by the benchmark threads below.
set_log_level(logging.DEBUG)
# NOTE(review): presumably new_client() picks up cozeloop credentials/config
# from the environment — confirm against the SDK docs.
client = new_client()
llm_runner = LLMRunner(client)

def worker(interval, stop_event):
    """Fire one `llm_runner.llm_call` every `interval` seconds until stopped.

    interval: target seconds between launches (1 / qps).
    stop_event: threading.Event checked before each launch.

    Each call runs in its own thread so a slow call does not delay the next
    tick; the spawned threads are intentionally not joined.
    """
    while not stop_event.is_set():
        # Use a monotonic clock for pacing: time.time() is wall-clock and can
        # jump backwards/forwards (NTP, DST), which would distort the sleep.
        tick_start = time.monotonic()

        threading.Thread(target=llm_runner.llm_call).start()

        elapsed = time.monotonic() - tick_start
        if elapsed < interval:
            time.sleep(interval - elapsed)

def benchmark(qps, duration):
    """Drive `worker` at the requested load for a fixed window.

    qps: target requests per second.
    duration: how long to run the benchmark, in seconds.
    """
    pacing_interval = 1.0 / qps
    shutdown = threading.Event()

    pacer = threading.Thread(target=worker, args=(pacing_interval, shutdown))
    pacer.start()

    # Let the load run for the requested window, then stop and wait for the
    # pacing thread to exit.
    time.sleep(duration)
    shutdown.set()
    pacer.join()

def test_trace_benchmark():
    """Benchmark entry point: target 1000 QPS for 20 seconds.

    NOTE(review): at qps=1000 the 1 ms pacing interval is below typical OS
    sleep granularity, so the achieved rate will likely be lower — confirm
    whether 1000 is the intended target.
    """
    benchmark(qps=1000, duration=20)