# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

import json
import time
from typing import List

import cozeloop
from cozeloop import Message
from cozeloop.entities.prompt import Role
from cozeloop.spec.tracespec import CALL_OPTIONS, ModelCallOption, ModelMessage, ModelInput


def convert_model_input(messages: List[Message]) -> ModelInput:
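    """
    Convert prompt messages into the ModelInput structure expected by span.set_input.

    Each Message is mapped to a ModelMessage, keeping the role (as a string) and
    substituting an empty string when the content is missing.
    """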
    model_messages = []
    for message in messages:
        model_messages.append(ModelMessage(
            role=str(message.role),
            content=message.content if message.content is not None else ""
        ))

    return ModelInput(
        messages=model_messages
    )


class LLMRunner:
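    """Wraps a cozeloop client and reports a `model` span for each (mocked) LLM call."""
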
    def __init__(self, client):
        self.client = client

    def llm_call(self, input_data):
        """
        Simulate an LLM call and set the relevant span tags.
        """
        span = self.client.start_span("llmCall", "model")
        try:
            # A real LLM call would go here, e.g.:
            # output = ChatOpenAI().invoke(input=input_data)

            # mock response
            time.sleep(1)
            output = "I'm a robot. I don't have a specific name. You can give me one."
            input_token = 232
            output_token = 1211
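            # In a real call, `output` and the token counts above would come from the
            # provider's response (e.g., the usage statistics returned by the model API).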

            # set tag key: `input`
            span.set_input(convert_model_input(input_data))
            # set tag key: `output`
            span.set_output(output)
            # set tag key: `model_provider`, e.g., openai
            span.set_model_provider("openai")
            # set tag key: `start_time_first_resp`
            # Timestamp of the first packet returned by the LLM, unit: microseconds.
            # When `start_time_first_resp` is set, a `latency_first_resp` tag is added automatically,
            # calculated against the span's start time, i.e., the first-packet latency.
            span.set_start_time_first_resp(int(time.time() * 1000000))
            # set tag key: `input_tokens`. The number of input tokens.
            # When `input_tokens` is set, it is automatically summed with `output_tokens` to produce the `tokens` tag.
            span.set_input_tokens(input_token)
            # set tag key: `output_tokens`. The number of output tokens.
            # When `output_tokens` is set, it is automatically summed with `input_tokens` to produce the `tokens` tag.
            span.set_output_tokens(output_token)
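            # With the mock values above, the derived `tokens` tag is 232 + 1211 = 1443.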
            # set tag key: `model_name`, e.g., gpt-4-1106-preview
            span.set_model_name("gpt-4-1106-preview")
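            # set the structured model call options tag (tag key given by the imported CALL_OPTIONS constant),
            # i.e. the sampling parameters used for this request.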
            span.set_tags({CALL_OPTIONS: ModelCallOption(
                temperature=0.5,
                top_p=0.5,
                top_k=10,
                presence_penalty=0.5,
                frequency_penalty=0.5,
                max_tokens=1024,
            )})

            return None
        finally:
            span.finish()


# If you want to use Jinja templates in prompts, you can refer to the following example.
if __name__ == '__main__':
    # 1. Create a prompt on the platform
    # You can create a Prompt on the platform's Prompt development page (set Prompt Key to 'prompt_hub_demo'),
    # add the following messages to the template, and submit a version.
    # System: You are a helpful bot, the conversation topic is {{var1}}.
    # Placeholder: placeholder1
    # User: My question is {{var2}}
    # Placeholder: placeholder2

    # Set the following environment variables first.
    # COZELOOP_WORKSPACE_ID=your workspace id
    # COZELOOP_API_TOKEN=your token
    # 2. Create a new loop client
    client = cozeloop.new_client(
        # Set whether to report a trace span when getting or formatting a prompt.
        # The default value is False.
        prompt_trace=True)

    # 3. Create a new root span
    root_span = client.start_span("root_span", "main_span")

    # 4. Get the prompt
    # If no specific version is specified, the latest version of the corresponding prompt will be obtained.
    prompt = client.get_prompt(prompt_key="prompt_hub_demo", version="0.0.1")
    if prompt is not None:
        # Get messages of the prompt
        if prompt.prompt_template is not None:
            messages = prompt.prompt_template.messages
            print(
                f"prompt messages: {json.dumps([message.model_dump(exclude_none=True) for message in messages], ensure_ascii=False)}")
        # Get llm config of the prompt
        if prompt.llm_config is not None:
            llm_config = prompt.llm_config
            print(f"prompt llm_config: {llm_config.model_dump_json(exclude_none=True)}")

    # 5. Format messages of the prompt
    formatted_messages = client.prompt_format(prompt, {
        "var_string": "hi",
        "var_int": 5,
        "var_bool": True,
        "var_float": 1.0,
        "var_object": {
            "name": "John",
            "age": 30,
            "hobbies": ["reading", "coding"],
            "address": {
                "city": "Beijing",
                "street": "123 Main",
            },
        },
        "var_array_string": ["hello", "nihao"],
        "var_array_boolean": [True, False, True],
        "var_array_int": [1, 2, 3, 4],
        "var_array_float": [1.0, 2.0],
        "var_array_object": [{"key": "123"}, {"value": 100}],
        # Placeholder variable values should be of type Message or List[Message]
        "placeholder1": [Message(role=Role.USER, content="Hello!"),
                         Message(role=Role.ASSISTANT, content="Hello!")]
        # Variables in the prompt template that are not given a value here are treated as empty.
    })
    print(
        f"formatted_messages: {json.dumps([message.model_dump(exclude_none=True) for message in formatted_messages], ensure_ascii=False)}")

    # 6. LLM call
    llm_runner = LLMRunner(client)
    llm_runner.llm_call(formatted_messages)

    root_span.finish()
    # 7. (optional) flush or close
    # -- force flush, report all traces in the queue
    # Warning! In general, this method does not need to be called, as spans are automatically reported in batches.
    # Note that flush blocks until the report completes, and calling it often may cause frequent reporting,
    # affecting performance.
    client.flush()
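    # -- Alternatively, close the client on process exit; close also flushes pending spans
    # before releasing resources (method name may vary by SDK version, so verify before use).
    # client.close()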