posthog-python/posthog/ai/gemini/gemini_converter.py at c981d91825d126ccbca7cb69a10a589d96b19c8e · PostHog/posthog-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
"""
Gemini-specific conversion utilities.

This module handles the conversion of Gemini API responses and inputs
into standardized formats for PostHog tracking.
"""

from typing import Any, Dict, List, Optional, TypedDict, Union

from posthog.ai.types import (
    FormattedContentItem,
    FormattedMessage,
    TokenUsage,
)
from posthog.ai.utils import serialize_raw_usage


class GeminiPart(TypedDict, total=False):
    """Represents a part in a Gemini message."""

    text: str


class GeminiMessage(TypedDict, total=False):
    """Represents a Gemini message with various possible fields."""

    role: str
    parts: List[Union[GeminiPart, Dict[str, Any]]]
    content: Union[str, List[Any]]
    text: str


def _format_parts_as_content_blocks(parts: List[Any]) -> List[FormattedContentItem]:
    """
    Format Gemini parts array into structured content blocks.

    Preserves structure for multimodal content (text + images) instead of
    concatenating everything into a string.

    Args:
        parts: List of parts that may contain text, inline_data, etc.

    Returns:
        List of formatted content blocks
    """
    content_blocks: List[FormattedContentItem] = []

    for part in parts:
        # Handle dict with text field
        if isinstance(part, dict) and "text" in part:
            content_blocks.append({"type": "text", "text": part["text"]})

        # Handle string parts
        elif isinstance(part, str):
            content_blocks.append({"type": "text", "text": part})

        # Handle dict with inline_data (images, documents, etc.)
        elif isinstance(part, dict) and "inline_data" in part:
            inline_data = part["inline_data"]
            mime_type = inline_data.get("mime_type", "")
            content_type = "image" if mime_type.startswith("image/") else "document"

            content_blocks.append(
                {
                    "type": content_type,
                    "inline_data": inline_data,
                }
            )

        # Handle object with text attribute
        elif hasattr(part, "text"):
            text_value = getattr(part, "text", "")
            if text_value:
                content_blocks.append({"type": "text", "text": text_value})

        # Handle object with inline_data attribute
        elif hasattr(part, "inline_data"):
            inline_data = part.inline_data
            # Convert to dict if needed
            if hasattr(inline_data, "mime_type") and hasattr(inline_data, "data"):
                # Determine type based on mime_type
                mime_type = inline_data.mime_type
                content_type = "image" if mime_type.startswith("image/") else "document"

                content_blocks.append(
                    {
                        "type": content_type,
                        "inline_data": {
                            "mime_type": mime_type,
                            "data": inline_data.data,
                        },
                    }
                )
            else:
                content_blocks.append(
                    {
                        "type": "image",
                        "inline_data": inline_data,
                    }
                )

    return content_blocks


def _format_dict_message(item: Dict[str, Any]) -> FormattedMessage:
    """
    Format a dictionary message into standardized format.

    Args:
        item: Dictionary containing message data

    Returns:
        Formatted message with role and content
    """

    # Handle dict format with parts array (Gemini-specific format)
    if "parts" in item and isinstance(item["parts"], list):
        content_blocks = _format_parts_as_content_blocks(item["parts"])
        return {"role": item.get("role", "user"), "content": content_blocks}

    # Handle dict with content field
    if "content" in item:
        content = item["content"]

        if isinstance(content, list):
            # If content is a list, format it as content blocks
            content_blocks = _format_parts_as_content_blocks(content)
            return {"role": item.get("role", "user"), "content": content_blocks}

        elif not isinstance(content, str):
            content = str(content)

        return {"role": item.get("role", "user"), "content": content}

    # Handle dict with text field
    if "text" in item:
        return {"role": item.get("role", "user"), "content": item["text"]}

    # Fallback to string representation
    return {"role": "user", "content": str(item)}


def _format_object_message(item: Any) -> FormattedMessage:
    """
    Format an object (with attributes) into standardized format.

    Args:
        item: Object that may have text or parts attributes

    Returns:
        Formatted message with role and content
    """

    # Handle object with parts attribute
    if hasattr(item, "parts") and hasattr(item.parts, "__iter__"):
        content_blocks = _format_parts_as_content_blocks(list(item.parts))
        role = getattr(item, "role", "user") if hasattr(item, "role") else "user"

        # Ensure role is a string
        if not isinstance(role, str):
            role = "user"

        return {"role": role, "content": content_blocks}

    # Handle object with text attribute
    if hasattr(item, "text"):
        role = getattr(item, "role", "user") if hasattr(item, "role") else "user"

        # Ensure role is a string
        if not isinstance(role, str):
            role = "user"

        return {"role": role, "content": item.text}

    # Handle object with content attribute
    if hasattr(item, "content"):
        role = getattr(item, "role", "user") if hasattr(item, "role") else "user"

        # Ensure role is a string
        if not isinstance(role, str):
            role = "user"

        content = item.content

        if isinstance(content, list):
            content_blocks = _format_parts_as_content_blocks(content)
            return {"role": role, "content": content_blocks}

        elif not isinstance(content, str):
            content = str(content)
        return {"role": role, "content": content}

    # Fallback to string representation
    return {"role": "user", "content": str(item)}


def format_gemini_response(response: Any) -> List[FormattedMessage]:
    """
    Format a Gemini response into standardized message format.

    Args:
        response: The response object from Gemini API

    Returns:
        List of formatted messages with role and content
    """

    output: List[FormattedMessage] = []

    if response is None:
        return output

    if hasattr(response, "candidates") and response.candidates:
        for candidate in response.candidates:
            if hasattr(candidate, "content") and candidate.content:
                content: List[FormattedContentItem] = []

                if hasattr(candidate.content, "parts") and candidate.content.parts:
                    for part in candidate.content.parts:
                        if hasattr(part, "text") and part.text:
                            content.append(
                                {
                                    "type": "text",
                                    "text": part.text,
                                }
                            )

                        elif hasattr(part, "function_call") and part.function_call:
                            function_call = part.function_call
                            content.append(
                                {
                                    "type": "function",
                                    "function": {
                                        "name": function_call.name,
                                        "arguments": function_call.args,
                                    },
                                }
                            )

                        elif hasattr(part, "inline_data") and part.inline_data:
                            # Handle audio/media inline data
                            import base64

                            inline_data = part.inline_data
                            mime_type = getattr(inline_data, "mime_type", "audio/pcm")
                            raw_data = getattr(inline_data, "data", b"")

                            # Encode binary data as base64 string for JSON serialization
                            if isinstance(raw_data, bytes):
                                data = base64.b64encode(raw_data).decode("utf-8")
                            else:
                                # Already a string (base64)
                                data = raw_data

                            content.append(
                                {
                                    "type": "audio",
                                    "mime_type": mime_type,
                                    "data": data,
                                }
                            )

                if content:
                    output.append(
                        {
                            "role": "assistant",
                            "content": content,
                        }
                    )

            elif hasattr(candidate, "text") and candidate.text:
                output.append(
                    {
                        "role": "assistant",
                        "content": [{"type": "text", "text": candidate.text}],
                    }
                )

    elif hasattr(response, "text") and response.text:
        output.append(
            {
                "role": "assistant",
                "content": [{"type": "text", "text": response.text}],
            }
        )

    return output


def extract_gemini_stop_reason(response: Any) -> Optional[str]:
    """Extract stop reason from Gemini response."""
    if response and hasattr(response, "candidates") and response.candidates:
        candidate = response.candidates[0]
        finish_reason = getattr(candidate, "finish_reason", None)
        if finish_reason is not None:
            # Gemini uses enum values — convert to string name
            if hasattr(finish_reason, "name"):
                return finish_reason.name
            return str(finish_reason)
    return None


def extract_gemini_stop_reason_from_chunk(chunk: Any) -> Optional[str]:
    """Extract stop reason from a Gemini streaming chunk."""
    return extract_gemini_stop_reason(chunk)


def extract_gemini_system_instruction(config: Any) -> Optional[str]:
    """
    Extract system instruction from Gemini config parameter.

    Args:
        config: Config object or dict that may contain system instruction

    Returns:
        System instruction string if present, None otherwise
    """
    if config is None:
        return None

    # Handle different config formats
    if hasattr(config, "system_instruction"):
        return config.system_instruction
    elif isinstance(config, dict) and "system_instruction" in config:
        return config["system_instruction"]
    elif isinstance(config, dict) and "systemInstruction" in config:
        return config["systemInstruction"]

    return None


def extract_gemini_tools(kwargs: Dict[str, Any]) -> Optional[Any]:
    """
    Extract tool definitions from Gemini API kwargs.

    Args:
        kwargs: Keyword arguments passed to Gemini API

    Returns:
        Tool definitions if present, None otherwise
    """

    if "config" in kwargs and hasattr(kwargs["config"], "tools"):
        return kwargs["config"].tools

    return None


def format_gemini_input_with_system(
    contents: Any, config: Any = None
) -> List[FormattedMessage]:
    """
    Format Gemini input contents into standardized message format, including system instruction handling.

    Args:
        contents: Input contents in various possible formats
        config: Config object or dict that may contain system instruction

    Returns:
        List of formatted messages with role and content fields, with system message prepended if needed
    """
    formatted_messages = format_gemini_input(contents)

    # Check if system instruction is provided in config parameter
    system_instruction = extract_gemini_system_instruction(config)

    if system_instruction is not None:
        has_system = any(msg.get("role") == "system" for msg in formatted_messages)
        if not has_system:
            from posthog.ai.types import FormattedMessage

            system_message: FormattedMessage = {
                "role": "system",
                "content": system_instruction,
            }
            formatted_messages = [system_message] + list(formatted_messages)

    return formatted_messages


def format_gemini_input(contents: Any) -> List[FormattedMessage]:
    """
    Format Gemini input contents into standardized message format for PostHog tracking.

    This function handles various input formats:
    - String inputs
    - List of strings, dicts, or objects
    - Single dict or object
    - Gemini-specific format with parts array

    Args:
        contents: Input contents in various possible formats

    Returns:
        List of formatted messages with role and content fields
    """

    # Handle string input
    if isinstance(contents, str):
        return [{"role": "user", "content": contents}]

    # Handle list input
    if isinstance(contents, list):
        formatted: List[FormattedMessage] = []

        for item in contents:
            if isinstance(item, str):
                formatted.append({"role": "user", "content": item})

            elif isinstance(item, dict):
                formatted.append(_format_dict_message(item))

            else:
                formatted.append(_format_object_message(item))

        return formatted

    # Handle single dict input
    if isinstance(contents, dict):
        return [_format_dict_message(contents)]

    # Handle single object input
    return [_format_object_message(contents)]


def extract_gemini_web_search_count(response: Any) -> int:
    """
    Extract web search count from Gemini response.

    Gemini bills per request that uses grounding, not per query.
    Returns 1 if grounding_metadata is present with actual search data, 0 otherwise.

    Args:
        response: The response from Gemini API

    Returns:
        1 if web search/grounding was used, 0 otherwise
    """

    # Check for grounding_metadata in candidates
    if hasattr(response, "candidates"):
        for candidate in response.candidates:
            if (
                hasattr(candidate, "grounding_metadata")
                and candidate.grounding_metadata
            ):
                grounding_metadata = candidate.grounding_metadata

                # Check if web_search_queries exists and is non-empty
                if hasattr(grounding_metadata, "web_search_queries"):
                    queries = grounding_metadata.web_search_queries

                    if queries is not None and len(queries) > 0:
                        return 1

                # Check if grounding_chunks exists and is non-empty
                if hasattr(grounding_metadata, "grounding_chunks"):
                    chunks = grounding_metadata.grounding_chunks

                    if chunks is not None and len(chunks) > 0:
                        return 1

            # Also check for google_search or grounding in function call names
            if hasattr(candidate, "content") and candidate.content:
                if hasattr(candidate.content, "parts") and candidate.content.parts:
                    for part in candidate.content.parts:
                        if hasattr(part, "function_call") and part.function_call:
                            function_name = getattr(
                                part.function_call, "name", ""
                            ).lower()

                            if (
                                "google_search" in function_name
                                or "grounding" in function_name
                            ):
                                return 1

    return 0


def _extract_usage_from_metadata(metadata: Any) -> TokenUsage:
    """
    Common logic to extract usage from Gemini metadata.
    Used by both streaming and non-streaming paths.

    Args:
        metadata: usage_metadata from Gemini response or chunk

    Returns:
        TokenUsage with standardized usage
    """
    usage = TokenUsage(
        input_tokens=getattr(metadata, "prompt_token_count", 0),
        output_tokens=getattr(metadata, "candidates_token_count", 0),
    )

    # Add cache tokens if present (don't add if 0)
    if hasattr(metadata, "cached_content_token_count"):
        cache_tokens = metadata.cached_content_token_count
        if cache_tokens and cache_tokens > 0:
            usage["cache_read_input_tokens"] = cache_tokens

    # Add reasoning tokens if present (don't add if 0)
    if hasattr(metadata, "thoughts_token_count"):
        reasoning_tokens = metadata.thoughts_token_count
        if reasoning_tokens and reasoning_tokens > 0:
            usage["reasoning_tokens"] = reasoning_tokens

    # Capture raw usage metadata for backend processing
    # Serialize to dict here in the converter (not in utils)
    serialized = serialize_raw_usage(metadata)
    if serialized:
        usage["raw_usage"] = serialized

    return usage


def extract_gemini_usage_from_response(response: Any) -> TokenUsage:
    """
    Extract usage statistics from a full Gemini response (non-streaming).

    Args:
        response: The complete response from Gemini API

    Returns:
        TokenUsage with standardized usage statistics
    """
    if not hasattr(response, "usage_metadata") or not response.usage_metadata:
        return TokenUsage(input_tokens=0, output_tokens=0)

    usage = _extract_usage_from_metadata(response.usage_metadata)

    # Add web search count if present
    web_search_count = extract_gemini_web_search_count(response)
    if web_search_count > 0:
        usage["web_search_count"] = web_search_count

    return usage


def extract_gemini_usage_from_chunk(chunk: Any) -> TokenUsage:
    """
    Extract usage statistics from a Gemini streaming chunk.

    Args:
        chunk: Streaming chunk from Gemini API

    Returns:
        TokenUsage with standardized usage statistics
    """

    usage: TokenUsage = TokenUsage()

    # Extract web search count from the chunk before checking for usage_metadata
    # Web search indicators can appear on any chunk, not just those with usage data
    web_search_count = extract_gemini_web_search_count(chunk)
    if web_search_count > 0:
        usage["web_search_count"] = web_search_count

    if not hasattr(chunk, "usage_metadata") or not chunk.usage_metadata:
        return usage

    usage_from_metadata = _extract_usage_from_metadata(chunk.usage_metadata)

    # Merge the usage from metadata with any web search count we found
    usage.update(usage_from_metadata)

    return usage


def extract_gemini_content_from_chunk(chunk: Any) -> Optional[Dict[str, Any]]:
    """
    Extract content (text or function call) from a Gemini streaming chunk.

    Args:
        chunk: Streaming chunk from Gemini API

    Returns:
        Content block dictionary if present, None otherwise
    """

    # Check for text content
    if hasattr(chunk, "text") and chunk.text:
        return {"type": "text", "text": chunk.text}

    # Check for function calls in candidates
    if hasattr(chunk, "candidates") and chunk.candidates:
        for candidate in chunk.candidates:
            if hasattr(candidate, "content") and candidate.content:
                if hasattr(candidate.content, "parts") and candidate.content.parts:
                    for part in candidate.content.parts:
                        # Check for function_call part
                        if hasattr(part, "function_call") and part.function_call:
                            function_call = part.function_call
                            return {
                                "type": "function",
                                "function": {
                                    "name": function_call.name,
                                    "arguments": function_call.args,
                                },
                            }
                        # Also check for text in parts
                        elif hasattr(part, "text") and part.text:
                            return {"type": "text", "text": part.text}

    return None


def format_gemini_streaming_output(
    accumulated_content: Union[str, List[Any]],
) -> List[FormattedMessage]:
    """
    Format the final output from Gemini streaming.

    Args:
        accumulated_content: Accumulated content from streaming (string, list of strings, or list of content blocks)

    Returns:
        List of formatted messages
    """

    # Handle legacy string input (backward compatibility)
    if isinstance(accumulated_content, str):
        return [
            {
                "role": "assistant",
                "content": [{"type": "text", "text": accumulated_content}],
            }
        ]

    # Handle list input
    if isinstance(accumulated_content, list):
        content: List[FormattedContentItem] = []
        text_parts = []

        for item in accumulated_content:
            if isinstance(item, str):
                # Legacy support: accumulate strings
                text_parts.append(item)
            elif isinstance(item, dict):
                # New format: content blocks
                if item.get("type") == "text":
                    text_parts.append(item.get("text", ""))
                elif item.get("type") == "function":
                    # If we have accumulated text, add it first
                    if text_parts:
                        content.append(
                            {
                                "type": "text",
                                "text": "".join(text_parts),
                            }
                        )
                        text_parts = []

                    # Add the function call
                    content.append(
                        {
                            "type": "function",
                            "function": item.get("function", {}),
                        }
                    )

        # Add any remaining text
        if text_parts:
            content.append(
                {
                    "type": "text",
                    "text": "".join(text_parts),
                }
            )

        # If we have content, return it
        if content:
            return [{"role": "assistant", "content": content}]

    # Fallback for empty or unexpected input
    return [{"role": "assistant", "content": [{"type": "text", "text": ""}]}]