# Source: openlayer-python/src/openlayer/lib/integrations/async_openai_tracer.py at 1bd1c19ea1c029e28c7ed33e1bc922f30c247e66 · openlayer-ai/openlayer-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
"""Module with methods used to trace async OpenAI/Azure OpenAI LLMs."""

import json
import logging
import time
from functools import wraps
from typing import Any, AsyncIterator, Optional, Union, TYPE_CHECKING

try:
    import openai

    HAVE_OPENAI = True
except ImportError:
    HAVE_OPENAI = False

if TYPE_CHECKING:
    import openai

from .openai_tracer import (
    get_model_parameters,
    create_trace_args,
    add_to_trace,
    parse_non_streaming_output_data,
    parse_structured_output_data,
    # Import Responses API helper functions
    extract_responses_chunk_data,
    extract_responses_inputs,
    parse_responses_output_data,
    extract_responses_usage,
    get_responses_model_parameters,
)

logger = logging.getLogger(__name__)


def trace_async_openai(
    client: Union["openai.AsyncOpenAI", "openai.AsyncAzureOpenAI"],
) -> Union["openai.AsyncOpenAI", "openai.AsyncAzureOpenAI"]:
    """Instrument an async OpenAI / Azure OpenAI client so its calls are traced.

    The client is modified in place and then returned. Three entry points are
    wrapped:

    - ``client.chat.completions.create`` (always);
    - ``client.chat.completions.parse`` (only when that attribute exists);
    - ``client.responses.create`` (only when the client has a ``responses``
      attribute; otherwise a debug message is logged).

    Each wrapper removes an optional ``inference_id`` keyword argument from the
    call, inspects the ``stream`` keyword argument, and delegates to the
    matching streaming or non-streaming handler of this module. Those handlers
    build the trace step from values such as the end time, latency, token
    counts, model, model parameters, raw output, inputs and metadata (for
    instance the time to first token when streaming).

    Parameters
    ----------
    client : Union[openai.AsyncOpenAI, openai.AsyncAzureOpenAI]
        The async client whose methods should be wrapped.

    Returns
    -------
    Union[openai.AsyncOpenAI, openai.AsyncAzureOpenAI]
        The same client object, with the wrapped methods installed.

    Raises
    ------
    ImportError
        If the ``openai`` package could not be imported.
    """
    if not HAVE_OPENAI:
        raise ImportError("OpenAI library is not installed. Please install it with: pip install openai")

    azure_client = isinstance(client, openai.AsyncAzureOpenAI)

    # ---- Chat Completions: create ----
    original_chat_create = client.chat.completions.create

    @wraps(original_chat_create)
    async def _chat_create_with_tracing(*args, **kwargs):
        # ``inference_id`` is a tracer-only argument; strip it before delegating.
        user_inference_id = kwargs.pop("inference_id", None)

        if not kwargs.get("stream", False):
            return await handle_async_non_streaming_create(
                *args,
                **kwargs,
                create_func=original_chat_create,
                inference_id=user_inference_id,
                is_azure_openai=azure_client,
                api_type="chat_completions",
            )
        # Streaming: the handler is an async generator, returned un-awaited.
        return handle_async_streaming_create(
            *args,
            **kwargs,
            create_func=original_chat_create,
            inference_id=user_inference_id,
            is_azure_openai=azure_client,
            api_type="chat_completions",
        )

    client.chat.completions.create = _chat_create_with_tracing

    # ---- Chat Completions: parse (optional) ----
    if hasattr(client.chat.completions, "parse"):
        original_parse = client.chat.completions.parse

        @wraps(original_parse)
        async def _parse_with_tracing(*args, **kwargs):
            user_inference_id = kwargs.pop("inference_id", None)

            if not kwargs.get("stream", False):
                return await handle_async_non_streaming_parse(
                    *args,
                    **kwargs,
                    parse_func=original_parse,
                    inference_id=user_inference_id,
                    is_azure_openai=azure_client,
                )
            return handle_async_streaming_parse(
                *args,
                **kwargs,
                parse_func=original_parse,
                inference_id=user_inference_id,
                is_azure_openai=azure_client,
            )

        client.chat.completions.parse = _parse_with_tracing

    # ---- Responses API (optional) ----
    if not hasattr(client, "responses"):
        logger.debug("Responses API not available in this AsyncOpenAI client version")
        return client

    original_responses_create = client.responses.create

    @wraps(original_responses_create)
    async def _responses_create_with_tracing(*args, **kwargs):
        user_inference_id = kwargs.pop("inference_id", None)

        if not kwargs.get("stream", False):
            return await handle_async_responses_non_streaming_create(
                *args,
                **kwargs,
                create_func=original_responses_create,
                inference_id=user_inference_id,
                is_azure_openai=azure_client,
            )
        return handle_async_responses_streaming_create(
            *args,
            **kwargs,
            create_func=original_responses_create,
            inference_id=user_inference_id,
            is_azure_openai=azure_client,
        )

    client.responses.create = _responses_create_with_tracing
    return client


async def handle_async_streaming_create(
    create_func: callable,
    *args,
    is_azure_openai: bool = False,
    inference_id: Optional[str] = None,
    api_type: str = "chat_completions",
    **kwargs,
) -> AsyncIterator[Any]:
    """Handles the create method when streaming is enabled.

    Awaits ``create_func`` to obtain the chunk stream, re-yields every chunk to
    the consumer unchanged, and accumulates the streamed text (or function/tool
    call) so that a single step can be added to the trace once iteration ends.

    Timing starts *before* the request is issued, so both the latency and the
    ``timeToFirstToken`` metadata include the request round trip, matching the
    way the non-streaming handler measures latency.

    Parameters
    ----------
    create_func : callable
        The create method to handle.
    is_azure_openai : bool, optional
        Whether the client is an Azure OpenAI client, by default False
    inference_id : Optional[str], optional
        A user-generated inference id, by default None
    api_type : str, optional
        Accepted for signature compatibility; not read by this function.
    *args, **kwargs
        Forwarded to ``create_func``. ``kwargs["messages"]`` and
        ``kwargs.get("model")`` are also read when building the trace.

    Returns
    -------
    AsyncIterator[Any]
        A generator that yields the chunks of the completion.

    Notes
    -----
    - An exception raised by ``create_func`` itself propagates to the consumer.
    - An exception raised while iterating the stream is logged and suppressed.
    - Failures while building or submitting the trace are logged, never raised.
    - The reported token count is the number of chunks received (left as
      ``None`` when fewer than two chunks arrive); ``prompt_tokens`` is
      reported as 0.
    """
    # Take the timestamp before sending the request: measuring from after the
    # await would exclude the request round trip from latency and TTFT.
    start_time = time.time()
    chunks = await create_func(*args, **kwargs)

    # State accumulated across chunks
    collected_output_data = []
    collected_function_call = {
        "name": "",
        "arguments": "",
    }
    raw_outputs = []
    end_time = None
    first_token_time = None
    num_of_completion_tokens = None
    latency = None
    try:
        i = 0
        async for chunk in chunks:
            raw_outputs.append(chunk.model_dump())
            if i == 0:
                first_token_time = time.time()
            if i > 0:
                # Chunk count is used as a stand-in for the completion token count.
                num_of_completion_tokens = i + 1
            i += 1

            # Chunks without choices carry no delta: pass them through untouched.
            choices = getattr(chunk, "choices", None)
            if not choices:
                yield chunk
                continue

            delta = chunk.choices[0].delta

            # Text content takes precedence; otherwise accumulate the function
            # call or the first tool call piece by piece.
            if delta.content:
                collected_output_data.append(delta.content)
            elif delta.function_call:
                if delta.function_call.name:
                    collected_function_call["name"] += delta.function_call.name
                if delta.function_call.arguments:
                    collected_function_call["arguments"] += delta.function_call.arguments
            elif delta.tool_calls:
                if delta.tool_calls[0].function.name:
                    collected_function_call["name"] += delta.tool_calls[0].function.name
                if delta.tool_calls[0].function.arguments:
                    collected_function_call["arguments"] += delta.tool_calls[0].function.arguments

            yield chunk

        end_time = time.time()
        latency = (end_time - start_time) * 1000
    # pylint: disable=broad-except
    except Exception as e:
        # NOTE(review): stream errors are logged and suppressed, so the consumer
        # sees the stream simply end -- confirm this is the intended contract.
        logger.error("Failed yield chunk. %s", e)
    finally:
        # Try to add step to the trace
        try:
            collected_output_data = [message for message in collected_output_data if message is not None]
            if collected_output_data:
                output_data = "".join(collected_output_data)
            else:
                if collected_function_call["arguments"]:
                    try:
                        collected_function_call["arguments"] = json.loads(
                            collected_function_call["arguments"]
                        )
                    except json.JSONDecodeError:
                        # Keep the raw string when the arguments are not valid JSON
                        pass
                output_data = collected_function_call

            trace_args = create_trace_args(
                end_time=end_time,
                inputs={"prompt": kwargs["messages"]},
                output=output_data,
                latency=latency,
                tokens=num_of_completion_tokens,
                prompt_tokens=0,
                completion_tokens=num_of_completion_tokens,
                model=kwargs.get("model"),
                model_parameters=get_model_parameters(kwargs),
                raw_output=raw_outputs,
                id=inference_id,
                metadata={"timeToFirstToken": ((first_token_time - start_time) * 1000 if first_token_time else None)},
            )
            add_to_trace(
                **trace_args,
                is_azure_openai=is_azure_openai,
            )

        # pylint: disable=broad-except
        except Exception as e:
            logger.error(
                "Failed to trace the create chat completion request with Openlayer. %s",
                e,
            )


async def handle_async_non_streaming_create(
    create_func: callable,
    *args,
    is_azure_openai: bool = False,
    inference_id: Optional[str] = None,
    api_type: str = "chat_completions",
    **kwargs,
) -> Union["openai.types.chat.chat_completion.ChatCompletion", Any]:
    """Await a non-streaming chat completion and record it as a trace step.

    The request is timed around the ``create_func`` call. Afterwards the
    response is parsed and submitted to the trace; any error raised during that
    tracing work is logged and never propagated, so the caller always receives
    the response. Errors raised by ``create_func`` itself do propagate.

    Parameters
    ----------
    create_func : callable
        The create method to handle.
    is_azure_openai : bool, optional
        Whether the client is an Azure OpenAI client, by default False
    inference_id : Optional[str], optional
        A user-generated inference id, by default None
    api_type : str, optional
        Accepted for signature compatibility; not read by this function.

    Returns
    -------
    openai.types.chat.chat_completion.ChatCompletion
        The untouched response returned by ``create_func``.
    """
    requested_at = time.time()
    completion = await create_func(*args, **kwargs)
    finished_at = time.time()

    # Tracing is best effort: nothing in here may break the caller.
    try:
        parsed_output = parse_non_streaming_output_data(completion)
        prompt_inputs = {"prompt": kwargs["messages"]}
        elapsed_ms = (finished_at - requested_at) * 1000
        usage = completion.usage

        step_fields = create_trace_args(
            end_time=finished_at,
            inputs=prompt_inputs,
            output=parsed_output,
            latency=elapsed_ms,
            tokens=usage.total_tokens,
            prompt_tokens=usage.prompt_tokens,
            completion_tokens=usage.completion_tokens,
            model=completion.model,
            model_parameters=get_model_parameters(kwargs),
            raw_output=completion.model_dump(),
            id=inference_id,
        )
        add_to_trace(is_azure_openai=is_azure_openai, **step_fields)
    # pylint: disable=broad-except
    except Exception as exc:
        logger.error("Failed to trace the create chat completion request with Openlayer. %s", exc)

    return completion


# -------------------------------- Async Responses API Handlers -------------------------------- #


async def handle_async_responses_streaming_create(
    create_func: callable,
    *args,
    is_azure_openai: bool = False,
    inference_id: Optional[str] = None,
    **kwargs,
) -> AsyncIterator[Any]:
    """Handles the Responses API create method when streaming is enabled (async version).

    Awaits ``create_func`` to obtain the event stream, re-yields every event to
    the consumer unchanged, and accumulates the streamed content (or function
    call) so that a single step can be added to the trace once iteration ends.

    Timing starts *before* the request is issued, so both the latency and the
    ``timeToFirstToken`` metadata include the request round trip, matching the
    way the non-streaming Responses handler measures latency.

    Parameters
    ----------
    create_func : callable
        The Responses API create method to handle.
    is_azure_openai : bool, optional
        Whether the client is an Azure OpenAI client, by default False
    inference_id : Optional[str], optional
        A user-generated inference id, by default None
    *args, **kwargs
        Forwarded to ``create_func``. ``kwargs`` is also used to derive the
        traced inputs, model name and model parameters.

    Returns
    -------
    AsyncIterator[Any]
        An async generator that yields the chunks of the response stream.

    Notes
    -----
    - An exception raised by ``create_func`` itself propagates to the consumer.
    - An exception raised while iterating the stream is logged and suppressed.
    - Failures while building or submitting the trace are logged, never raised.
    - The reported token count is the number of chunks received (left as
      ``None`` when fewer than two chunks arrive); ``prompt_tokens`` is
      reported as 0.
    """
    # Take the timestamp before sending the request: measuring from after the
    # await would exclude the request round trip from latency and TTFT.
    start_time = time.time()
    chunks = await create_func(*args, **kwargs)

    # State accumulated across chunks
    collected_output_data = []
    collected_function_call = {
        "name": "",
        "arguments": "",
    }
    raw_outputs = []
    end_time = None
    first_token_time = None
    num_of_completion_tokens = None
    latency = None

    try:
        i = 0
        async for chunk in chunks:
            # Fall back to the string form for events that cannot be dumped.
            raw_outputs.append(chunk.model_dump() if hasattr(chunk, "model_dump") else str(chunk))
            if i == 0:
                first_token_time = time.time()
            if i > 0:
                # Chunk count is used as a stand-in for the completion token count.
                num_of_completion_tokens = i + 1
            i += 1

            # Handle different types of ResponseStreamEvent
            chunk_data = extract_responses_chunk_data(chunk)

            # Text content takes precedence over function-call fragments.
            if chunk_data.get("content"):
                collected_output_data.append(chunk_data["content"])
            elif chunk_data.get("function_call"):
                func_call = chunk_data["function_call"]
                if func_call.get("name"):
                    collected_function_call["name"] += func_call["name"]
                if func_call.get("arguments"):
                    collected_function_call["arguments"] += func_call["arguments"]

            yield chunk

        end_time = time.time()
        latency = (end_time - start_time) * 1000
    # pylint: disable=broad-except
    except Exception as e:
        # NOTE(review): stream errors are logged and suppressed, so the consumer
        # sees the stream simply end -- confirm this is the intended contract.
        logger.error("Failed yield chunk. %s", e)
    finally:
        # Try to add step to the trace
        try:
            collected_output_data = [message for message in collected_output_data if message is not None]
            if collected_output_data:
                output_data = "".join(collected_output_data)
            else:
                if collected_function_call["arguments"]:
                    try:
                        collected_function_call["arguments"] = json.loads(collected_function_call["arguments"])
                    except json.JSONDecodeError:
                        # Keep as string if not valid JSON
                        pass
                output_data = collected_function_call

            trace_args = create_trace_args(
                end_time=end_time,
                inputs=extract_responses_inputs(kwargs),
                output=output_data,
                latency=latency,
                tokens=num_of_completion_tokens,
                prompt_tokens=0,
                completion_tokens=num_of_completion_tokens,
                model=kwargs.get("model", "unknown"),
                model_parameters=get_responses_model_parameters(kwargs),
                raw_output=raw_outputs,
                id=inference_id,
                metadata={
                    "timeToFirstToken": ((first_token_time - start_time) * 1000 if first_token_time else None),
                    "api_type": "responses",
                },
            )
            add_to_trace(
                **trace_args,
                is_azure_openai=is_azure_openai,
                api_type="responses",
            )

        # pylint: disable=broad-except
        except Exception as e:
            logger.error(
                "Failed to trace the Responses API request with Openlayer. %s",
                e,
            )


async def handle_async_responses_non_streaming_create(
    create_func: callable,
    *args,
    is_azure_openai: bool = False,
    inference_id: Optional[str] = None,
    **kwargs,
) -> Any:
    """Await a non-streaming Responses API call and record it as a trace step.

    The request is timed around the ``create_func`` call. Afterwards the
    response is parsed and submitted to the trace with ``api_type="responses"``;
    any error raised during that tracing work is logged and never propagated,
    so the caller always receives the response. Errors raised by
    ``create_func`` itself do propagate.

    Parameters
    ----------
    create_func : callable
        The Responses API create method to handle.
    is_azure_openai : bool, optional
        Whether the client is an Azure OpenAI client, by default False
    inference_id : Optional[str], optional
        A user-generated inference id, by default None

    Returns
    -------
    Any
        The untouched response returned by ``create_func``.
    """
    requested_at = time.time()
    response = await create_func(*args, **kwargs)
    finished_at = time.time()

    # Tracing is best effort: nothing in here may break the caller.
    try:
        parsed_output = parse_responses_output_data(response)
        usage = extract_responses_usage(response)

        traced_inputs = extract_responses_inputs(kwargs)
        elapsed_ms = (finished_at - requested_at) * 1000
        total_tokens = usage.get("total_tokens", 0)
        input_tokens = usage.get("prompt_tokens", 0)
        output_tokens = usage.get("completion_tokens", 0)
        # Prefer the model reported by the response over the requested one.
        model_name = getattr(response, "model", kwargs.get("model", "unknown"))
        model_params = get_responses_model_parameters(kwargs)
        if hasattr(response, "model_dump"):
            raw = response.model_dump()
        else:
            raw = str(response)

        step_fields = create_trace_args(
            end_time=finished_at,
            inputs=traced_inputs,
            output=parsed_output,
            latency=elapsed_ms,
            tokens=total_tokens,
            prompt_tokens=input_tokens,
            completion_tokens=output_tokens,
            model=model_name,
            model_parameters=model_params,
            raw_output=raw,
            id=inference_id,
            metadata={"api_type": "responses"},
        )
        add_to_trace(is_azure_openai=is_azure_openai, api_type="responses", **step_fields)
    # pylint: disable=broad-except
    except Exception as exc:
        logger.error("Failed to trace the Responses API request with Openlayer. %s", exc)

    return response


async def handle_async_streaming_parse(
    parse_func: callable,
    *args,
    is_azure_openai: bool = False,
    inference_id: Optional[str] = None,
    **kwargs,
) -> AsyncIterator[Any]:
    """Handles the parse method when streaming is enabled.

    Awaits ``parse_func`` to obtain the chunk stream, re-yields every chunk to
    the consumer unchanged, and accumulates the streamed text (or function/tool
    call) so that a single step can be added to the trace once iteration ends.

    Timing starts *before* the request is issued, so both the latency and the
    ``timeToFirstToken`` metadata include the request round trip, matching the
    way the non-streaming parse handler measures latency.

    Parameters
    ----------
    parse_func : callable
        The parse method to handle.
    is_azure_openai : bool, optional
        Whether the client is an Azure OpenAI client, by default False
    inference_id : Optional[str], optional
        A user-generated inference id, by default None
    *args, **kwargs
        Forwarded to ``parse_func``. ``kwargs["messages"]``,
        ``kwargs.get("model")`` and ``kwargs.get("response_format")`` are also
        read when building the trace.

    Returns
    -------
    AsyncIterator[Any]
        A generator that yields the chunks of the completion.

    Notes
    -----
    - An exception raised by ``parse_func`` itself propagates to the consumer.
    - An exception raised while iterating the stream is logged and suppressed.
    - Failures while building or submitting the trace are logged, never raised.
    - The reported token count is the number of chunks received (left as
      ``None`` when fewer than two chunks arrive); ``prompt_tokens`` is
      reported as 0.
    """
    # Take the timestamp before sending the request: measuring from after the
    # await would exclude the request round trip from latency and TTFT.
    start_time = time.time()
    chunks = await parse_func(*args, **kwargs)

    # State accumulated across chunks
    collected_output_data = []
    collected_function_call = {
        "name": "",
        "arguments": "",
    }
    raw_outputs = []
    end_time = None
    first_token_time = None
    num_of_completion_tokens = None
    latency = None
    try:
        i = 0
        async for chunk in chunks:
            raw_outputs.append(chunk.model_dump())
            if i == 0:
                first_token_time = time.time()
            if i > 0:
                # Chunk count is used as a stand-in for the completion token count.
                num_of_completion_tokens = i + 1
            i += 1

            # Skip chunks with empty choices (e.g., Azure OpenAI heartbeat chunks)
            choices = getattr(chunk, "choices", None)
            if not choices:
                yield chunk
                continue

            delta = chunk.choices[0].delta

            # Text content takes precedence; otherwise accumulate the function
            # call or the first tool call piece by piece.
            if delta.content:
                collected_output_data.append(delta.content)
            elif delta.function_call:
                if delta.function_call.name:
                    collected_function_call["name"] += delta.function_call.name
                if delta.function_call.arguments:
                    collected_function_call["arguments"] += delta.function_call.arguments
            elif delta.tool_calls:
                if delta.tool_calls[0].function.name:
                    collected_function_call["name"] += delta.tool_calls[0].function.name
                if delta.tool_calls[0].function.arguments:
                    collected_function_call["arguments"] += delta.tool_calls[0].function.arguments

            yield chunk

        end_time = time.time()
        latency = (end_time - start_time) * 1000
    # pylint: disable=broad-except
    except Exception as e:
        # NOTE(review): stream errors are logged and suppressed, so the consumer
        # sees the stream simply end -- confirm this is the intended contract.
        logger.error("Failed yield chunk. %s", e)
    finally:
        # Try to add step to the trace
        try:
            collected_output_data = [
                message for message in collected_output_data if message is not None
            ]
            if collected_output_data:
                output_data = "".join(collected_output_data)
            else:
                if collected_function_call["arguments"]:
                    try:
                        collected_function_call["arguments"] = json.loads(
                            collected_function_call["arguments"]
                        )
                    except json.JSONDecodeError:
                        # Keep the raw string when the arguments are not valid JSON
                        pass
                output_data = collected_function_call

            trace_args = create_trace_args(
                end_time=end_time,
                inputs={"prompt": kwargs["messages"]},
                output=output_data,
                latency=latency,
                tokens=num_of_completion_tokens,
                prompt_tokens=0,
                completion_tokens=num_of_completion_tokens,
                model=kwargs.get("model"),
                model_parameters=get_model_parameters(kwargs),
                raw_output=raw_outputs,
                id=inference_id,
                metadata={
                    "timeToFirstToken": (
                        (first_token_time - start_time) * 1000
                        if first_token_time
                        else None
                    ),
                    "method": "parse",
                    "response_format": kwargs.get("response_format"),
                },
            )
            add_to_trace(
                **trace_args,
                is_azure_openai=is_azure_openai,
            )

        # pylint: disable=broad-except
        except Exception as e:
            logger.error(
                "Failed to trace the parse chat completion request with Openlayer. %s",
                e,
            )


async def handle_async_non_streaming_parse(
    parse_func: callable,
    *args,
    is_azure_openai: bool = False,
    inference_id: Optional[str] = None,
    **kwargs,
) -> Any:
    """Await a non-streaming ``parse`` call and record it as a trace step.

    The request is timed around the ``parse_func`` call. Afterwards the
    structured response is parsed and submitted to the trace, tagged with
    ``method="parse"`` and the requested ``response_format``; any error raised
    during that tracing work is logged and never propagated, so the caller
    always receives the response. Errors raised by ``parse_func`` itself do
    propagate.

    Parameters
    ----------
    parse_func : callable
        The parse method to handle.
    is_azure_openai : bool, optional
        Whether the client is an Azure OpenAI client, by default False
    inference_id : Optional[str], optional
        A user-generated inference id, by default None

    Returns
    -------
    Any
        The untouched response returned by ``parse_func``.
    """
    requested_at = time.time()
    parsed_completion = await parse_func(*args, **kwargs)
    finished_at = time.time()

    # Tracing is best effort: nothing in here may break the caller.
    try:
        structured_output = parse_structured_output_data(parsed_completion)
        prompt_inputs = {"prompt": kwargs["messages"]}
        elapsed_ms = (finished_at - requested_at) * 1000
        usage = parsed_completion.usage

        step_fields = create_trace_args(
            end_time=finished_at,
            inputs=prompt_inputs,
            output=structured_output,
            latency=elapsed_ms,
            tokens=usage.total_tokens,
            prompt_tokens=usage.prompt_tokens,
            completion_tokens=usage.completion_tokens,
            model=parsed_completion.model,
            model_parameters=get_model_parameters(kwargs),
            raw_output=parsed_completion.model_dump(),
            id=inference_id,
            metadata={
                "method": "parse",
                "response_format": kwargs.get("response_format"),
            },
        )
        add_to_trace(is_azure_openai=is_azure_openai, **step_fields)
    # pylint: disable=broad-except
    except Exception as exc:
        logger.error(
            "Failed to trace the parse chat completion request with Openlayer. %s", exc
        )

    return parsed_completion