From 7ce51e93023f66f3e343e379fc1930ddba335e9b Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Wed, 9 Jul 2025 02:26:12 +0000 Subject: [PATCH 01/20] chore(internal): bump pinned h11 dep --- requirements-dev.lock | 4 ++-- requirements.lock | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements-dev.lock b/requirements-dev.lock index a84b5f4..7999ff4 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -48,9 +48,9 @@ filelock==3.12.4 frozenlist==1.6.2 # via aiohttp # via aiosignal -h11==0.14.0 +h11==0.16.0 # via httpcore -httpcore==1.0.2 +httpcore==1.0.9 # via httpx httpx==0.28.1 # via codex-sdk diff --git a/requirements.lock b/requirements.lock index a0807d8..bde9133 100644 --- a/requirements.lock +++ b/requirements.lock @@ -36,9 +36,9 @@ exceptiongroup==1.2.2 frozenlist==1.6.2 # via aiohttp # via aiosignal -h11==0.14.0 +h11==0.16.0 # via httpcore -httpcore==1.0.2 +httpcore==1.0.9 # via httpx httpx==0.28.1 # via codex-sdk From 5cba94956fff8ca4de99426a20e5c67f0ce6a2ac Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Wed, 9 Jul 2025 02:45:16 +0000 Subject: [PATCH 02/20] chore(package): mark python 3.13 as supported --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 16e362c..e3d7c1f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Operating System :: OS Independent", "Operating System :: POSIX", "Operating System :: MacOS", From b374589baf01ca1236cf0823305e6bca037cf12b Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 10 Jul 2025 02:40:45 +0000 Subject: [PATCH 03/20] fix(parsing): correctly handle nested discriminated unions --- src/codex/_models.py | 13 ++++++++----- tests/test_models.py | 45 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/src/codex/_models.py b/src/codex/_models.py index 4f21498..528d568 100644 --- a/src/codex/_models.py +++ b/src/codex/_models.py @@ -2,9 +2,10 @@ import os import inspect -from typing import TYPE_CHECKING, Any, Type, Union, Generic, TypeVar, Callable, cast +from typing import TYPE_CHECKING, Any, Type, Union, Generic, TypeVar, Callable, Optional, cast from datetime import date, datetime from typing_extensions import ( + List, Unpack, Literal, ClassVar, @@ -366,7 +367,7 @@ def _construct_field(value: object, field: FieldInfo, key: str) -> object: if type_ is None: raise RuntimeError(f"Unexpected field type is None for {key}") - return construct_type(value=value, type_=type_) + return construct_type(value=value, type_=type_, metadata=getattr(field, "metadata", None)) def is_basemodel(type_: type) -> bool: @@ -420,7 +421,7 @@ def construct_type_unchecked(*, value: object, type_: type[_T]) -> _T: return cast(_T, construct_type(value=value, type_=type_)) -def construct_type(*, value: object, type_: object) -> object: +def construct_type(*, value: object, type_: object, metadata: Optional[List[Any]] = None) -> object: """Loose coercion to the expected type with construction of nested values. If the given value does not match the expected type then it is returned as-is. 
@@ -438,8 +439,10 @@ def construct_type(*, value: object, type_: object) -> object: type_ = type_.__value__ # type: ignore[unreachable] # unwrap `Annotated[T, ...]` -> `T` - if is_annotated_type(type_): - meta: tuple[Any, ...] = get_args(type_)[1:] + if metadata is not None: + meta: tuple[Any, ...] = tuple(metadata) + elif is_annotated_type(type_): + meta = get_args(type_)[1:] type_ = extract_type_arg(type_, 0) else: meta = tuple() diff --git a/tests/test_models.py b/tests/test_models.py index c96609c..3452a61 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -889,3 +889,48 @@ class ModelB(BaseModel): ) assert isinstance(m, ModelB) + + +def test_nested_discriminated_union() -> None: + class InnerType1(BaseModel): + type: Literal["type_1"] + + class InnerModel(BaseModel): + inner_value: str + + class InnerType2(BaseModel): + type: Literal["type_2"] + some_inner_model: InnerModel + + class Type1(BaseModel): + base_type: Literal["base_type_1"] + value: Annotated[ + Union[ + InnerType1, + InnerType2, + ], + PropertyInfo(discriminator="type"), + ] + + class Type2(BaseModel): + base_type: Literal["base_type_2"] + + T = Annotated[ + Union[ + Type1, + Type2, + ], + PropertyInfo(discriminator="base_type"), + ] + + model = construct_type( + type_=T, + value={ + "base_type": "base_type_1", + "value": { + "type": "type_2", + }, + }, + ) + assert isinstance(model, Type1) + assert isinstance(model.value, InnerType2) From d05336d89f5a49b09d7b1f85e7cb3ed74035157a Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Fri, 11 Jul 2025 02:58:34 +0000 Subject: [PATCH 04/20] chore(readme): fix version rendering on pypi --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a5a805..71cf0a0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # Codex SDK -[![PyPI version]()](https://pypi.org/project/codex-sdk/) + +[![PyPI version](https://img.shields.io/pypi/v/codex-sdk.svg?label=pypi%20(stable))](https://pypi.org/project/codex-sdk/) This library is not meant to be used directly. Refer to https://pypi.org/project/cleanlab-codex/ instead. 
From 4732aaeb03872abffb4e13df6dd1994711bd4268 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Sat, 12 Jul 2025 02:05:53 +0000 Subject: [PATCH 05/20] fix(client): don't send Content-Type header on GET requests --- pyproject.toml | 2 +- src/codex/_base_client.py | 11 +++++++++-- tests/test_client.py | 4 ++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e3d7c1f..964b48a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ Homepage = "https://github.com/cleanlab/codex-python" Repository = "https://github.com/cleanlab/codex-python" [project.optional-dependencies] -aiohttp = ["aiohttp", "httpx_aiohttp>=0.1.6"] +aiohttp = ["aiohttp", "httpx_aiohttp>=0.1.8"] [tool.rye] managed = true diff --git a/src/codex/_base_client.py b/src/codex/_base_client.py index 1eca89e..6da89f6 100644 --- a/src/codex/_base_client.py +++ b/src/codex/_base_client.py @@ -529,6 +529,15 @@ def _build_request( # work around https://github.com/encode/httpx/discussions/2880 kwargs["extensions"] = {"sni_hostname": prepared_url.host.replace("_", "-")} + is_body_allowed = options.method.lower() != "get" + + if is_body_allowed: + kwargs["json"] = json_data if is_given(json_data) else None + kwargs["files"] = files + else: + headers.pop("Content-Type", None) + kwargs.pop("data", None) + # TODO: report this error to httpx return self._client.build_request( # pyright: ignore[reportUnknownMemberType] headers=headers, @@ -540,8 +549,6 @@ def _build_request( # so that passing a `TypedDict` doesn't cause an error. # https://github.com/microsoft/pyright/issues/3526#event-6715453066 params=self.qs.stringify(cast(Mapping[str, Any], params)) if params else None, - json=json_data if is_given(json_data) else None, - files=files, **kwargs, ) diff --git a/tests/test_client.py b/tests/test_client.py index c012f4d..2474915 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -427,7 +427,7 @@ def test_request_extra_query(self) -> None: def test_multipart_repeating_array(self, client: Codex) -> None: request = client._build_request( FinalRequestOptions.construct( - method="get", + method="post", url="/foo", headers={"Content-Type": "multipart/form-data; boundary=6b7ba517decee4a450543ea6ae821c82"}, json_data={"array": ["foo", "bar"]}, @@ -1211,7 +1211,7 @@ def test_request_extra_query(self) -> None: def test_multipart_repeating_array(self, async_client: AsyncCodex) -> None: request = async_client._build_request( FinalRequestOptions.construct( - method="get", + method="post", url="/foo", headers={"Content-Type": "multipart/form-data; boundary=6b7ba517decee4a450543ea6ae821c82"}, json_data={"array": ["foo", "bar"]}, From b956ce083ef3c507a7649577724f337a562c427a Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Mon, 14 Jul 2025 18:17:47 +0000 Subject: [PATCH 06/20] feat(api): api update --- .stats.yml | 2 +- src/codex/types/project_validate_params.py | 102 ++++---- src/codex/types/project_validate_response.py | 4 +- .../query_log_list_by_group_response.py | 239 +++++++++++++++++- .../query_log_list_groups_response.py | 231 ++++++++++++++++- .../types/projects/query_log_list_response.py | 231 ++++++++++++++++- .../projects/query_log_retrieve_response.py | 231 ++++++++++++++++- ...remediation_list_resolved_logs_response.py | 237 ++++++++++++++++- .../projects/test_remediations.py | 20 +- tests/api_resources/test_projects.py | 20 +- 10 files changed, 
1238 insertions(+), 79 deletions(-) diff --git a/.stats.yml b/.stats.yml index 889336e..20ee827 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ configured_endpoints: 55 -openapi_spec_hash: 922886934580d0b2addcb6e26ada0e09 +openapi_spec_hash: b3a1a58600b52a20671bef2b25f5dbc4 config_hash: 8f6e5c3b064cbb77569a6bf654954a56 diff --git a/src/codex/types/project_validate_params.py b/src/codex/types/project_validate_params.py index 0862cbc..7b85d06 100644 --- a/src/codex/types/project_validate_params.py +++ b/src/codex/types/project_validate_params.py @@ -33,23 +33,23 @@ "MessageChatCompletionDeveloperMessageParamContentUnionMember1", "MessageChatCompletionSystemMessageParam", "MessageChatCompletionSystemMessageParamContentUnionMember1", - "MessageChatCompletionUserMessageParam", - "MessageChatCompletionUserMessageParamContentUnionMember1", - "MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartTextParam", - "MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartImageParam", - "MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartImageParamImageURL", - "MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartInputAudioParam", - "MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", - "MessageChatCompletionUserMessageParamContentUnionMember1File", - "MessageChatCompletionUserMessageParamContentUnionMember1FileFile", - "MessageChatCompletionAssistantMessageParam", - "MessageChatCompletionAssistantMessageParamAudio", - "MessageChatCompletionAssistantMessageParamContentUnionMember1", - "MessageChatCompletionAssistantMessageParamContentUnionMember1ChatCompletionContentPartTextParam", - "MessageChatCompletionAssistantMessageParamContentUnionMember1ChatCompletionContentPartRefusalParam", - "MessageChatCompletionAssistantMessageParamFunctionCall", - "MessageChatCompletionAssistantMessageParamToolCall", - "MessageChatCompletionAssistantMessageParamToolCallFunction", + "MessageChatCompletionUserMessageParamInput", + "MessageChatCompletionUserMessageParamInputContentUnionMember1", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParam", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParamImageURL", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParam", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", + "MessageChatCompletionUserMessageParamInputContentUnionMember1File", + "MessageChatCompletionUserMessageParamInputContentUnionMember1FileFile", + "MessageChatCompletionAssistantMessageParamInput", + "MessageChatCompletionAssistantMessageParamInputAudio", + "MessageChatCompletionAssistantMessageParamInputContentUnionMember1", + "MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartRefusalParam", + "MessageChatCompletionAssistantMessageParamInputFunctionCall", + "MessageChatCompletionAssistantMessageParamInputToolCall", + "MessageChatCompletionAssistantMessageParamInputToolCallFunction", "MessageChatCompletionToolMessageParam", "MessageChatCompletionToolMessageParamContentUnionMember1", 
"MessageChatCompletionFunctionMessageParam", @@ -468,7 +468,7 @@ class MessageChatCompletionSystemMessageParam(TypedDict, total=False): name: str -class MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartTextParam( +class MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam( TypedDict, total=False ): text: Required[str] @@ -476,7 +476,7 @@ class MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionCont type: Required[Literal["text"]] -class MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartImageParamImageURL( +class MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParamImageURL( TypedDict, total=False ): url: Required[str] @@ -484,17 +484,17 @@ class MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionCont detail: Literal["auto", "low", "high"] -class MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartImageParam( +class MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParam( TypedDict, total=False ): image_url: Required[ - MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartImageParamImageURL + MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParamImageURL ] type: Required[Literal["image_url"]] -class MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio( +class MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio( TypedDict, total=False ): data: Required[str] @@ -502,17 +502,17 @@ class MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionCont format: Required[Literal["wav", "mp3"]] -class MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartInputAudioParam( +class MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParam( TypedDict, total=False ): input_audio: Required[ - MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio + MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio ] type: Required[Literal["input_audio"]] -class MessageChatCompletionUserMessageParamContentUnionMember1FileFile(TypedDict, total=False): +class MessageChatCompletionUserMessageParamInputContentUnionMember1FileFile(TypedDict, total=False): file_data: str file_id: str @@ -520,33 +520,33 @@ class MessageChatCompletionUserMessageParamContentUnionMember1FileFile(TypedDict filename: str -class MessageChatCompletionUserMessageParamContentUnionMember1File(TypedDict, total=False): - file: Required[MessageChatCompletionUserMessageParamContentUnionMember1FileFile] +class MessageChatCompletionUserMessageParamInputContentUnionMember1File(TypedDict, total=False): + file: Required[MessageChatCompletionUserMessageParamInputContentUnionMember1FileFile] type: Required[Literal["file"]] -MessageChatCompletionUserMessageParamContentUnionMember1: TypeAlias = Union[ - MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartTextParam, - MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartImageParam, - MessageChatCompletionUserMessageParamContentUnionMember1ChatCompletionContentPartInputAudioParam, - 
MessageChatCompletionUserMessageParamContentUnionMember1File, +MessageChatCompletionUserMessageParamInputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam, + MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParam, + MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParam, + MessageChatCompletionUserMessageParamInputContentUnionMember1File, ] -class MessageChatCompletionUserMessageParam(TypedDict, total=False): - content: Required[Union[str, Iterable[MessageChatCompletionUserMessageParamContentUnionMember1]]] +class MessageChatCompletionUserMessageParamInput(TypedDict, total=False): + content: Required[Union[str, Iterable[MessageChatCompletionUserMessageParamInputContentUnionMember1]]] role: Required[Literal["user"]] name: str -class MessageChatCompletionAssistantMessageParamAudio(TypedDict, total=False): +class MessageChatCompletionAssistantMessageParamInputAudio(TypedDict, total=False): id: Required[str] -class MessageChatCompletionAssistantMessageParamContentUnionMember1ChatCompletionContentPartTextParam( +class MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam( TypedDict, total=False ): text: Required[str] @@ -554,7 +554,7 @@ class MessageChatCompletionAssistantMessageParamContentUnionMember1ChatCompletio type: Required[Literal["text"]] -class MessageChatCompletionAssistantMessageParamContentUnionMember1ChatCompletionContentPartRefusalParam( +class MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartRefusalParam( TypedDict, total=False ): refusal: Required[str] @@ -562,46 +562,46 @@ class MessageChatCompletionAssistantMessageParamContentUnionMember1ChatCompletio type: Required[Literal["refusal"]] -MessageChatCompletionAssistantMessageParamContentUnionMember1: TypeAlias = Union[ - MessageChatCompletionAssistantMessageParamContentUnionMember1ChatCompletionContentPartTextParam, - MessageChatCompletionAssistantMessageParamContentUnionMember1ChatCompletionContentPartRefusalParam, +MessageChatCompletionAssistantMessageParamInputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam, + MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartRefusalParam, ] -class MessageChatCompletionAssistantMessageParamFunctionCall(TypedDict, total=False): +class MessageChatCompletionAssistantMessageParamInputFunctionCall(TypedDict, total=False): arguments: Required[str] name: Required[str] -class MessageChatCompletionAssistantMessageParamToolCallFunction(TypedDict, total=False): +class MessageChatCompletionAssistantMessageParamInputToolCallFunction(TypedDict, total=False): arguments: Required[str] name: Required[str] -class MessageChatCompletionAssistantMessageParamToolCall(TypedDict, total=False): +class MessageChatCompletionAssistantMessageParamInputToolCall(TypedDict, total=False): id: Required[str] - function: Required[MessageChatCompletionAssistantMessageParamToolCallFunction] + function: Required[MessageChatCompletionAssistantMessageParamInputToolCallFunction] type: Required[Literal["function"]] -class MessageChatCompletionAssistantMessageParam(TypedDict, total=False): +class MessageChatCompletionAssistantMessageParamInput(TypedDict, total=False): role: Required[Literal["assistant"]] - audio: 
Optional[MessageChatCompletionAssistantMessageParamAudio] + audio: Optional[MessageChatCompletionAssistantMessageParamInputAudio] - content: Union[str, Iterable[MessageChatCompletionAssistantMessageParamContentUnionMember1], None] + content: Union[str, Iterable[MessageChatCompletionAssistantMessageParamInputContentUnionMember1], None] - function_call: Optional[MessageChatCompletionAssistantMessageParamFunctionCall] + function_call: Optional[MessageChatCompletionAssistantMessageParamInputFunctionCall] name: str refusal: Optional[str] - tool_calls: Iterable[MessageChatCompletionAssistantMessageParamToolCall] + tool_calls: Iterable[MessageChatCompletionAssistantMessageParamInputToolCall] class MessageChatCompletionToolMessageParamContentUnionMember1(TypedDict, total=False): @@ -629,8 +629,8 @@ class MessageChatCompletionFunctionMessageParam(TypedDict, total=False): Message: TypeAlias = Union[ MessageChatCompletionDeveloperMessageParam, MessageChatCompletionSystemMessageParam, - MessageChatCompletionUserMessageParam, - MessageChatCompletionAssistantMessageParam, + MessageChatCompletionUserMessageParamInput, + MessageChatCompletionAssistantMessageParamInput, MessageChatCompletionToolMessageParam, MessageChatCompletionFunctionMessageParam, ] diff --git a/src/codex/types/project_validate_response.py b/src/codex/types/project_validate_response.py index 3b06db2..4488311 100644 --- a/src/codex/types/project_validate_response.py +++ b/src/codex/types/project_validate_response.py @@ -48,8 +48,8 @@ class ProjectValidateResponse(BaseModel): expert_answer: Optional[str] = None """ - Alternate SME-provided answer from Codex if the response was flagged as bad and - an answer was found in the Codex Project, or None otherwise. + Alternate SME-provided answer from Codex if a relevant answer was found in the + Codex Project, or None otherwise. 
""" is_bad_response: bool diff --git a/src/codex/types/projects/query_log_list_by_group_response.py b/src/codex/types/projects/query_log_list_by_group_response.py index 1685073..ccd2d5e 100644 --- a/src/codex/types/projects/query_log_list_by_group_response.py +++ b/src/codex/types/projects/query_log_list_by_group_response.py @@ -2,7 +2,7 @@ from typing import Dict, List, Union, Optional from datetime import datetime -from typing_extensions import Literal +from typing_extensions import Literal, TypeAlias from ..._models import BaseModel @@ -16,6 +16,31 @@ "QueryLogsByGroupQueryLogFormattedNonGuardrailEvalScores", "QueryLogsByGroupQueryLogContext", "QueryLogsByGroupQueryLogDeterministicGuardrailsResults", + "QueryLogsByGroupQueryLogMessage", + "QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam", + "QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1", + "QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParam", + "QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParamContentUnionMember1", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutput", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1File", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", + "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutput", + "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio", + "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1", + "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam", + "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall", + "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCall", + "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction", + "QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParam", + "QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParamContentUnionMember1", + "QueryLogsByGroupQueryLogMessageChatCompletionFunctionMessageParam", ] @@ -68,6 +93,200 @@ class QueryLogsByGroupQueryLogDeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam(BaseModel): + content: Union[str, 
List[QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1]] + + role: Literal["developer"] + + name: Optional[str] = None + + +class QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParam(BaseModel): + content: Union[str, List[QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParamContentUnionMember1]] + + role: Literal["system"] + + name: Optional[str] = None + + +class QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam( + BaseModel +): + text: str + + type: Literal["text"] + + +class QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL( + BaseModel +): + url: str + + detail: Optional[Literal["auto", "low", "high"]] = None + + +class QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam( + BaseModel +): + image_url: QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL + + type: Literal["image_url"] + + +class QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio( + BaseModel +): + data: str + + format: Literal["wav", "mp3"] + + +class QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam( + BaseModel +): + input_audio: QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio + + type: Literal["input_audio"] + + +class QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile(BaseModel): + file_data: Optional[str] = None + + file_id: Optional[str] = None + + filename: Optional[str] = None + + +class QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1File(BaseModel): + file: QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile + + type: Literal["file"] + + +QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1: TypeAlias = Union[ + QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam, + QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam, + QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1File, +] + + +class QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutput(BaseModel): + content: Union[str, List[QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1]] + + role: Literal["user"] + + name: Optional[str] = None + + +class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): + id: str + + +class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam( + BaseModel +): + text: str + + type: Literal["text"] + + +class 
QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( + BaseModel +): + refusal: str + + type: Literal["refusal"] + + +QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ + QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, +] + + +class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): + arguments: str + + name: str + + +class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): + arguments: str + + name: str + + +class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): + id: str + + function: QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction + + type: Literal["function"] + + +class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutput(BaseModel): + role: Literal["assistant"] + + audio: Optional[QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio] = None + + content: Union[ + str, List[QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None + ] = None + + function_call: Optional[QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall] = None + + name: Optional[str] = None + + refusal: Optional[str] = None + + tool_calls: Optional[List[QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCall]] = None + + +class QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParam(BaseModel): + content: Union[str, List[QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParamContentUnionMember1]] + + role: Literal["tool"] + + tool_call_id: str + + +class QueryLogsByGroupQueryLogMessageChatCompletionFunctionMessageParam(BaseModel): + content: Optional[str] = None + + name: str + + role: Literal["function"] + + +QueryLogsByGroupQueryLogMessage: TypeAlias = Union[ + QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam, + QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParam, + QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutput, + QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutput, + QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParam, + QueryLogsByGroupQueryLogMessageChatCompletionFunctionMessageParam, +] + + class QueryLogsByGroupQueryLog(BaseModel): id: str @@ -85,10 +304,14 @@ class QueryLogsByGroupQueryLog(BaseModel): formatted_guardrail_eval_scores: Optional[Dict[str, QueryLogsByGroupQueryLogFormattedGuardrailEvalScores]] = None + formatted_messages: Optional[str] = None + formatted_non_guardrail_eval_scores: Optional[ Dict[str, QueryLogsByGroupQueryLogFormattedNonGuardrailEvalScores] ] = None + formatted_original_question: Optional[str] = None + is_bad_response: bool needs_review: bool @@ -140,6 +363,20 @@ class QueryLogsByGroupQueryLog(BaseModel): guardrailed: Optional[bool] = None """If true, the response was guardrailed""" + messages: Optional[List[QueryLogsByGroupQueryLogMessage]] = None + """Optional message history to provide 
conversation context for the query. + + Used to rewrite query into a self-contained version of itself. If not provided, + the query will be treated as self-contained. + """ + + original_question: Optional[str] = None + """The original question that was asked before any rewriting or processing. + + For all non-conversational RAG, original_question should be the same as the + final question seen in Codex. + """ + primary_eval_issue: Optional[str] = None """Primary issue identified in evaluation""" diff --git a/src/codex/types/projects/query_log_list_groups_response.py b/src/codex/types/projects/query_log_list_groups_response.py index 5d9222b..9adb422 100644 --- a/src/codex/types/projects/query_log_list_groups_response.py +++ b/src/codex/types/projects/query_log_list_groups_response.py @@ -2,7 +2,7 @@ from typing import Dict, List, Union, Optional from datetime import datetime -from typing_extensions import Literal +from typing_extensions import Literal, TypeAlias from ..._models import BaseModel @@ -14,6 +14,31 @@ "FormattedNonGuardrailEvalScores", "Context", "DeterministicGuardrailsResults", + "Message", + "MessageChatCompletionDeveloperMessageParam", + "MessageChatCompletionDeveloperMessageParamContentUnionMember1", + "MessageChatCompletionSystemMessageParam", + "MessageChatCompletionSystemMessageParamContentUnionMember1", + "MessageChatCompletionUserMessageParamOutput", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1File", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", + "MessageChatCompletionAssistantMessageParamOutput", + "MessageChatCompletionAssistantMessageParamOutputAudio", + "MessageChatCompletionAssistantMessageParamOutputContentUnionMember1", + "MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam", + "MessageChatCompletionAssistantMessageParamOutputFunctionCall", + "MessageChatCompletionAssistantMessageParamOutputToolCall", + "MessageChatCompletionAssistantMessageParamOutputToolCallFunction", + "MessageChatCompletionToolMessageParam", + "MessageChatCompletionToolMessageParamContentUnionMember1", + "MessageChatCompletionFunctionMessageParam", ] @@ -66,6 +91,192 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class MessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionDeveloperMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionDeveloperMessageParamContentUnionMember1]] + + role: Literal["developer"] + + name: Optional[str] = None + + +class MessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionSystemMessageParam(BaseModel): + content: Union[str, 
List[MessageChatCompletionSystemMessageParamContentUnionMember1]] + + role: Literal["system"] + + name: Optional[str] = None + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL( + BaseModel +): + url: str + + detail: Optional[Literal["auto", "low", "high"]] = None + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam(BaseModel): + image_url: MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL + + type: Literal["image_url"] + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio( + BaseModel +): + data: str + + format: Literal["wav", "mp3"] + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam(BaseModel): + input_audio: ( + MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio + ) + + type: Literal["input_audio"] + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile(BaseModel): + file_data: Optional[str] = None + + file_id: Optional[str] = None + + filename: Optional[str] = None + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1File(BaseModel): + file: MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile + + type: Literal["file"] + + +MessageChatCompletionUserMessageParamOutputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam, + MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam, + MessageChatCompletionUserMessageParamOutputContentUnionMember1File, +] + + +class MessageChatCompletionUserMessageParamOutput(BaseModel): + content: Union[str, List[MessageChatCompletionUserMessageParamOutputContentUnionMember1]] + + role: Literal["user"] + + name: Optional[str] = None + + +class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): + id: str + + +class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( + BaseModel +): + refusal: str + + type: Literal["refusal"] + + +MessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, +] + + +class MessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): + arguments: str + + name: str + + +class MessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): + arguments: str + + name: str + + +class MessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): + id: str + + function: MessageChatCompletionAssistantMessageParamOutputToolCallFunction + + type: Literal["function"] + + +class 
MessageChatCompletionAssistantMessageParamOutput(BaseModel): + role: Literal["assistant"] + + audio: Optional[MessageChatCompletionAssistantMessageParamOutputAudio] = None + + content: Union[str, List[MessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None] = None + + function_call: Optional[MessageChatCompletionAssistantMessageParamOutputFunctionCall] = None + + name: Optional[str] = None + + refusal: Optional[str] = None + + tool_calls: Optional[List[MessageChatCompletionAssistantMessageParamOutputToolCall]] = None + + +class MessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionToolMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionToolMessageParamContentUnionMember1]] + + role: Literal["tool"] + + tool_call_id: str + + +class MessageChatCompletionFunctionMessageParam(BaseModel): + content: Optional[str] = None + + name: str + + role: Literal["function"] + + +Message: TypeAlias = Union[ + MessageChatCompletionDeveloperMessageParam, + MessageChatCompletionSystemMessageParam, + MessageChatCompletionUserMessageParamOutput, + MessageChatCompletionAssistantMessageParamOutput, + MessageChatCompletionToolMessageParam, + MessageChatCompletionFunctionMessageParam, +] + + class QueryLogListGroupsResponse(BaseModel): id: str @@ -83,8 +294,12 @@ class QueryLogListGroupsResponse(BaseModel): formatted_guardrail_eval_scores: Optional[Dict[str, FormattedGuardrailEvalScores]] = None + formatted_messages: Optional[str] = None + formatted_non_guardrail_eval_scores: Optional[Dict[str, FormattedNonGuardrailEvalScores]] = None + formatted_original_question: Optional[str] = None + is_bad_response: bool needs_review: bool @@ -138,6 +353,20 @@ class QueryLogListGroupsResponse(BaseModel): guardrailed: Optional[bool] = None """If true, the response was guardrailed""" + messages: Optional[List[Message]] = None + """Optional message history to provide conversation context for the query. + + Used to rewrite query into a self-contained version of itself. If not provided, + the query will be treated as self-contained. + """ + + original_question: Optional[str] = None + """The original question that was asked before any rewriting or processing. + + For all non-conversational RAG, original_question should be the same as the + final question seen in Codex. 
+ """ + primary_eval_issue: Optional[str] = None """Primary issue identified in evaluation""" diff --git a/src/codex/types/projects/query_log_list_response.py b/src/codex/types/projects/query_log_list_response.py index ccdeb03..f6fbba1 100644 --- a/src/codex/types/projects/query_log_list_response.py +++ b/src/codex/types/projects/query_log_list_response.py @@ -2,7 +2,7 @@ from typing import Dict, List, Union, Optional from datetime import datetime -from typing_extensions import Literal +from typing_extensions import Literal, TypeAlias from ..._models import BaseModel @@ -14,6 +14,31 @@ "FormattedNonGuardrailEvalScores", "Context", "DeterministicGuardrailsResults", + "Message", + "MessageChatCompletionDeveloperMessageParam", + "MessageChatCompletionDeveloperMessageParamContentUnionMember1", + "MessageChatCompletionSystemMessageParam", + "MessageChatCompletionSystemMessageParamContentUnionMember1", + "MessageChatCompletionUserMessageParamOutput", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1File", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", + "MessageChatCompletionAssistantMessageParamOutput", + "MessageChatCompletionAssistantMessageParamOutputAudio", + "MessageChatCompletionAssistantMessageParamOutputContentUnionMember1", + "MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam", + "MessageChatCompletionAssistantMessageParamOutputFunctionCall", + "MessageChatCompletionAssistantMessageParamOutputToolCall", + "MessageChatCompletionAssistantMessageParamOutputToolCallFunction", + "MessageChatCompletionToolMessageParam", + "MessageChatCompletionToolMessageParamContentUnionMember1", + "MessageChatCompletionFunctionMessageParam", ] @@ -66,6 +91,192 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class MessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionDeveloperMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionDeveloperMessageParamContentUnionMember1]] + + role: Literal["developer"] + + name: Optional[str] = None + + +class MessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionSystemMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionSystemMessageParamContentUnionMember1]] + + role: Literal["system"] + + name: Optional[str] = None + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL( + BaseModel +): + url: str + + 
detail: Optional[Literal["auto", "low", "high"]] = None + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam(BaseModel): + image_url: MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL + + type: Literal["image_url"] + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio( + BaseModel +): + data: str + + format: Literal["wav", "mp3"] + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam(BaseModel): + input_audio: ( + MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio + ) + + type: Literal["input_audio"] + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile(BaseModel): + file_data: Optional[str] = None + + file_id: Optional[str] = None + + filename: Optional[str] = None + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1File(BaseModel): + file: MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile + + type: Literal["file"] + + +MessageChatCompletionUserMessageParamOutputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam, + MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam, + MessageChatCompletionUserMessageParamOutputContentUnionMember1File, +] + + +class MessageChatCompletionUserMessageParamOutput(BaseModel): + content: Union[str, List[MessageChatCompletionUserMessageParamOutputContentUnionMember1]] + + role: Literal["user"] + + name: Optional[str] = None + + +class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): + id: str + + +class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( + BaseModel +): + refusal: str + + type: Literal["refusal"] + + +MessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, +] + + +class MessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): + arguments: str + + name: str + + +class MessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): + arguments: str + + name: str + + +class MessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): + id: str + + function: MessageChatCompletionAssistantMessageParamOutputToolCallFunction + + type: Literal["function"] + + +class MessageChatCompletionAssistantMessageParamOutput(BaseModel): + role: Literal["assistant"] + + audio: Optional[MessageChatCompletionAssistantMessageParamOutputAudio] = None + + content: Union[str, List[MessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None] = None + + function_call: Optional[MessageChatCompletionAssistantMessageParamOutputFunctionCall] = None + + name: Optional[str] = None + + refusal: Optional[str] = None + + tool_calls: 
Optional[List[MessageChatCompletionAssistantMessageParamOutputToolCall]] = None + + +class MessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionToolMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionToolMessageParamContentUnionMember1]] + + role: Literal["tool"] + + tool_call_id: str + + +class MessageChatCompletionFunctionMessageParam(BaseModel): + content: Optional[str] = None + + name: str + + role: Literal["function"] + + +Message: TypeAlias = Union[ + MessageChatCompletionDeveloperMessageParam, + MessageChatCompletionSystemMessageParam, + MessageChatCompletionUserMessageParamOutput, + MessageChatCompletionAssistantMessageParamOutput, + MessageChatCompletionToolMessageParam, + MessageChatCompletionFunctionMessageParam, +] + + class QueryLogListResponse(BaseModel): id: str @@ -83,8 +294,12 @@ class QueryLogListResponse(BaseModel): formatted_guardrail_eval_scores: Optional[Dict[str, FormattedGuardrailEvalScores]] = None + formatted_messages: Optional[str] = None + formatted_non_guardrail_eval_scores: Optional[Dict[str, FormattedNonGuardrailEvalScores]] = None + formatted_original_question: Optional[str] = None + is_bad_response: bool project_id: str @@ -132,6 +347,20 @@ class QueryLogListResponse(BaseModel): guardrailed: Optional[bool] = None """If true, the response was guardrailed""" + messages: Optional[List[Message]] = None + """Optional message history to provide conversation context for the query. + + Used to rewrite query into a self-contained version of itself. If not provided, + the query will be treated as self-contained. + """ + + original_question: Optional[str] = None + """The original question that was asked before any rewriting or processing. + + For all non-conversational RAG, original_question should be the same as the + final question seen in Codex. 
+ """ + primary_eval_issue: Optional[str] = None """Primary issue identified in evaluation""" diff --git a/src/codex/types/projects/query_log_retrieve_response.py b/src/codex/types/projects/query_log_retrieve_response.py index 380bacb..784009c 100644 --- a/src/codex/types/projects/query_log_retrieve_response.py +++ b/src/codex/types/projects/query_log_retrieve_response.py @@ -2,7 +2,7 @@ from typing import Dict, List, Union, Optional from datetime import datetime -from typing_extensions import Literal +from typing_extensions import Literal, TypeAlias from ..._models import BaseModel @@ -14,6 +14,31 @@ "FormattedNonGuardrailEvalScores", "Context", "DeterministicGuardrailsResults", + "Message", + "MessageChatCompletionDeveloperMessageParam", + "MessageChatCompletionDeveloperMessageParamContentUnionMember1", + "MessageChatCompletionSystemMessageParam", + "MessageChatCompletionSystemMessageParamContentUnionMember1", + "MessageChatCompletionUserMessageParamOutput", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1File", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", + "MessageChatCompletionAssistantMessageParamOutput", + "MessageChatCompletionAssistantMessageParamOutputAudio", + "MessageChatCompletionAssistantMessageParamOutputContentUnionMember1", + "MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam", + "MessageChatCompletionAssistantMessageParamOutputFunctionCall", + "MessageChatCompletionAssistantMessageParamOutputToolCall", + "MessageChatCompletionAssistantMessageParamOutputToolCallFunction", + "MessageChatCompletionToolMessageParam", + "MessageChatCompletionToolMessageParamContentUnionMember1", + "MessageChatCompletionFunctionMessageParam", ] @@ -66,6 +91,192 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class MessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionDeveloperMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionDeveloperMessageParamContentUnionMember1]] + + role: Literal["developer"] + + name: Optional[str] = None + + +class MessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionSystemMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionSystemMessageParamContentUnionMember1]] + + role: Literal["system"] + + name: Optional[str] = None + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL( + BaseModel +): + 
url: str + + detail: Optional[Literal["auto", "low", "high"]] = None + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam(BaseModel): + image_url: MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL + + type: Literal["image_url"] + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio( + BaseModel +): + data: str + + format: Literal["wav", "mp3"] + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam(BaseModel): + input_audio: ( + MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio + ) + + type: Literal["input_audio"] + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile(BaseModel): + file_data: Optional[str] = None + + file_id: Optional[str] = None + + filename: Optional[str] = None + + +class MessageChatCompletionUserMessageParamOutputContentUnionMember1File(BaseModel): + file: MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile + + type: Literal["file"] + + +MessageChatCompletionUserMessageParamOutputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam, + MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam, + MessageChatCompletionUserMessageParamOutputContentUnionMember1File, +] + + +class MessageChatCompletionUserMessageParamOutput(BaseModel): + content: Union[str, List[MessageChatCompletionUserMessageParamOutputContentUnionMember1]] + + role: Literal["user"] + + name: Optional[str] = None + + +class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): + id: str + + +class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( + BaseModel +): + refusal: str + + type: Literal["refusal"] + + +MessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, +] + + +class MessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): + arguments: str + + name: str + + +class MessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): + arguments: str + + name: str + + +class MessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): + id: str + + function: MessageChatCompletionAssistantMessageParamOutputToolCallFunction + + type: Literal["function"] + + +class MessageChatCompletionAssistantMessageParamOutput(BaseModel): + role: Literal["assistant"] + + audio: Optional[MessageChatCompletionAssistantMessageParamOutputAudio] = None + + content: Union[str, List[MessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None] = None + + function_call: Optional[MessageChatCompletionAssistantMessageParamOutputFunctionCall] = None + + name: Optional[str] = None + + refusal: Optional[str] = None + + tool_calls: 
Optional[List[MessageChatCompletionAssistantMessageParamOutputToolCall]] = None + + +class MessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class MessageChatCompletionToolMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionToolMessageParamContentUnionMember1]] + + role: Literal["tool"] + + tool_call_id: str + + +class MessageChatCompletionFunctionMessageParam(BaseModel): + content: Optional[str] = None + + name: str + + role: Literal["function"] + + +Message: TypeAlias = Union[ + MessageChatCompletionDeveloperMessageParam, + MessageChatCompletionSystemMessageParam, + MessageChatCompletionUserMessageParamOutput, + MessageChatCompletionAssistantMessageParamOutput, + MessageChatCompletionToolMessageParam, + MessageChatCompletionFunctionMessageParam, +] + + class QueryLogRetrieveResponse(BaseModel): id: str @@ -83,8 +294,12 @@ class QueryLogRetrieveResponse(BaseModel): formatted_guardrail_eval_scores: Optional[Dict[str, FormattedGuardrailEvalScores]] = None + formatted_messages: Optional[str] = None + formatted_non_guardrail_eval_scores: Optional[Dict[str, FormattedNonGuardrailEvalScores]] = None + formatted_original_question: Optional[str] = None + is_bad_response: bool needs_review: bool @@ -136,6 +351,20 @@ class QueryLogRetrieveResponse(BaseModel): guardrailed: Optional[bool] = None """If true, the response was guardrailed""" + messages: Optional[List[Message]] = None + """Optional message history to provide conversation context for the query. + + Used to rewrite query into a self-contained version of itself. If not provided, + the query will be treated as self-contained. + """ + + original_question: Optional[str] = None + """The original question that was asked before any rewriting or processing. + + For all non-conversational RAG, original_question should be the same as the + final question seen in Codex. 
+ """ + primary_eval_issue: Optional[str] = None """Primary issue identified in evaluation""" diff --git a/src/codex/types/projects/remediation_list_resolved_logs_response.py b/src/codex/types/projects/remediation_list_resolved_logs_response.py index 876e7ce..1e0154c 100644 --- a/src/codex/types/projects/remediation_list_resolved_logs_response.py +++ b/src/codex/types/projects/remediation_list_resolved_logs_response.py @@ -2,7 +2,7 @@ from typing import Dict, List, Union, Optional from datetime import datetime -from typing_extensions import Literal +from typing_extensions import Literal, TypeAlias from ..._models import BaseModel @@ -15,6 +15,31 @@ "QueryLogFormattedNonGuardrailEvalScores", "QueryLogContext", "QueryLogDeterministicGuardrailsResults", + "QueryLogMessage", + "QueryLogMessageChatCompletionDeveloperMessageParam", + "QueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1", + "QueryLogMessageChatCompletionSystemMessageParam", + "QueryLogMessageChatCompletionSystemMessageParamContentUnionMember1", + "QueryLogMessageChatCompletionUserMessageParamOutput", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1File", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", + "QueryLogMessageChatCompletionAssistantMessageParamOutput", + "QueryLogMessageChatCompletionAssistantMessageParamOutputAudio", + "QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1", + "QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam", + "QueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall", + "QueryLogMessageChatCompletionAssistantMessageParamOutputToolCall", + "QueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction", + "QueryLogMessageChatCompletionToolMessageParam", + "QueryLogMessageChatCompletionToolMessageParamContentUnionMember1", + "QueryLogMessageChatCompletionFunctionMessageParam", ] @@ -67,6 +92,198 @@ class QueryLogDeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class QueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class QueryLogMessageChatCompletionDeveloperMessageParam(BaseModel): + content: Union[str, List[QueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1]] + + role: Literal["developer"] + + name: Optional[str] = None + + +class QueryLogMessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class QueryLogMessageChatCompletionSystemMessageParam(BaseModel): + content: Union[str, List[QueryLogMessageChatCompletionSystemMessageParamContentUnionMember1]] + + role: 
Literal["system"] + + name: Optional[str] = None + + +class QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam( + BaseModel +): + text: str + + type: Literal["text"] + + +class QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL( + BaseModel +): + url: str + + detail: Optional[Literal["auto", "low", "high"]] = None + + +class QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam( + BaseModel +): + image_url: QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL + + type: Literal["image_url"] + + +class QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio( + BaseModel +): + data: str + + format: Literal["wav", "mp3"] + + +class QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam( + BaseModel +): + input_audio: QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio + + type: Literal["input_audio"] + + +class QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile(BaseModel): + file_data: Optional[str] = None + + file_id: Optional[str] = None + + filename: Optional[str] = None + + +class QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1File(BaseModel): + file: QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile + + type: Literal["file"] + + +QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1: TypeAlias = Union[ + QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam, + QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam, + QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1File, +] + + +class QueryLogMessageChatCompletionUserMessageParamOutput(BaseModel): + content: Union[str, List[QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1]] + + role: Literal["user"] + + name: Optional[str] = None + + +class QueryLogMessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): + id: str + + +class QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam( + BaseModel +): + text: str + + type: Literal["text"] + + +class QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( + BaseModel +): + refusal: str + + type: Literal["refusal"] + + +QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ + QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, +] + + +class QueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): + arguments: str + + name: str + + +class QueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): + arguments: str + + name: str + + +class QueryLogMessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): + id: str + + function: 
QueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction + + type: Literal["function"] + + +class QueryLogMessageChatCompletionAssistantMessageParamOutput(BaseModel): + role: Literal["assistant"] + + audio: Optional[QueryLogMessageChatCompletionAssistantMessageParamOutputAudio] = None + + content: Union[str, List[QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None] = None + + function_call: Optional[QueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall] = None + + name: Optional[str] = None + + refusal: Optional[str] = None + + tool_calls: Optional[List[QueryLogMessageChatCompletionAssistantMessageParamOutputToolCall]] = None + + +class QueryLogMessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): + text: str + + type: Literal["text"] + + +class QueryLogMessageChatCompletionToolMessageParam(BaseModel): + content: Union[str, List[QueryLogMessageChatCompletionToolMessageParamContentUnionMember1]] + + role: Literal["tool"] + + tool_call_id: str + + +class QueryLogMessageChatCompletionFunctionMessageParam(BaseModel): + content: Optional[str] = None + + name: str + + role: Literal["function"] + + +QueryLogMessage: TypeAlias = Union[ + QueryLogMessageChatCompletionDeveloperMessageParam, + QueryLogMessageChatCompletionSystemMessageParam, + QueryLogMessageChatCompletionUserMessageParamOutput, + QueryLogMessageChatCompletionAssistantMessageParamOutput, + QueryLogMessageChatCompletionToolMessageParam, + QueryLogMessageChatCompletionFunctionMessageParam, +] + + class QueryLog(BaseModel): id: str @@ -84,8 +301,12 @@ class QueryLog(BaseModel): formatted_guardrail_eval_scores: Optional[Dict[str, QueryLogFormattedGuardrailEvalScores]] = None + formatted_messages: Optional[str] = None + formatted_non_guardrail_eval_scores: Optional[Dict[str, QueryLogFormattedNonGuardrailEvalScores]] = None + formatted_original_question: Optional[str] = None + is_bad_response: bool project_id: str @@ -133,6 +354,20 @@ class QueryLog(BaseModel): guardrailed: Optional[bool] = None """If true, the response was guardrailed""" + messages: Optional[List[QueryLogMessage]] = None + """Optional message history to provide conversation context for the query. + + Used to rewrite query into a self-contained version of itself. If not provided, + the query will be treated as self-contained. + """ + + original_question: Optional[str] = None + """The original question that was asked before any rewriting or processing. + + For all non-conversational RAG, original_question should be the same as the + final question seen in Codex. 
+ """ + primary_eval_issue: Optional[str] = None """Primary issue identified in evaluation""" diff --git a/tests/api_resources/projects/test_remediations.py b/tests/api_resources/projects/test_remediations.py index 947850f..5866dbe 100644 --- a/tests/api_resources/projects/test_remediations.py +++ b/tests/api_resources/projects/test_remediations.py @@ -35,7 +35,7 @@ class TestRemediations: def test_method_create(self, client: Codex) -> None: remediation = client.projects.remediations.create( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - question="question", + question="x", ) assert_matches_type(RemediationCreateResponse, remediation, path=["response"]) @@ -44,7 +44,7 @@ def test_method_create(self, client: Codex) -> None: def test_method_create_with_all_params(self, client: Codex) -> None: remediation = client.projects.remediations.create( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - question="question", + question="x", answer="answer", draft_answer="draft_answer", ) @@ -55,7 +55,7 @@ def test_method_create_with_all_params(self, client: Codex) -> None: def test_raw_response_create(self, client: Codex) -> None: response = client.projects.remediations.with_raw_response.create( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - question="question", + question="x", ) assert response.is_closed is True @@ -68,7 +68,7 @@ def test_raw_response_create(self, client: Codex) -> None: def test_streaming_response_create(self, client: Codex) -> None: with client.projects.remediations.with_streaming_response.create( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - question="question", + question="x", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -84,7 +84,7 @@ def test_path_params_create(self, client: Codex) -> None: with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"): client.projects.remediations.with_raw_response.create( project_id="", - question="question", + question="x", ) @pytest.mark.skip() @@ -636,7 +636,7 @@ class TestAsyncRemediations: async def test_method_create(self, async_client: AsyncCodex) -> None: remediation = await async_client.projects.remediations.create( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - question="question", + question="x", ) assert_matches_type(RemediationCreateResponse, remediation, path=["response"]) @@ -645,7 +645,7 @@ async def test_method_create(self, async_client: AsyncCodex) -> None: async def test_method_create_with_all_params(self, async_client: AsyncCodex) -> None: remediation = await async_client.projects.remediations.create( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - question="question", + question="x", answer="answer", draft_answer="draft_answer", ) @@ -656,7 +656,7 @@ async def test_method_create_with_all_params(self, async_client: AsyncCodex) -> async def test_raw_response_create(self, async_client: AsyncCodex) -> None: response = await async_client.projects.remediations.with_raw_response.create( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - question="question", + question="x", ) assert response.is_closed is True @@ -669,7 +669,7 @@ async def test_raw_response_create(self, async_client: AsyncCodex) -> None: async def test_streaming_response_create(self, async_client: AsyncCodex) -> None: async with async_client.projects.remediations.with_streaming_response.create( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - question="question", + question="x", ) as response: assert not 
response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -685,7 +685,7 @@ async def test_path_params_create(self, async_client: AsyncCodex) -> None: with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"): await async_client.projects.remediations.with_raw_response.create( project_id="", - question="question", + question="x", ) @pytest.mark.skip() diff --git a/tests/api_resources/test_projects.py b/tests/api_resources/test_projects.py index 9ecffa0..4507741 100644 --- a/tests/api_resources/test_projects.py +++ b/tests/api_resources/test_projects.py @@ -632,7 +632,7 @@ def test_method_validate(self, client: Codex) -> None: project = client.projects.validate( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", context="context", - query="query", + query="x", response="string", ) assert_matches_type(ProjectValidateResponse, project, path=["response"]) @@ -643,7 +643,7 @@ def test_method_validate_with_all_params(self, client: Codex) -> None: project = client.projects.validate( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", context="context", - query="query", + query="x", response="string", use_llm_matching=True, constrain_outputs=["string"], @@ -685,7 +685,7 @@ def test_raw_response_validate(self, client: Codex) -> None: response = client.projects.with_raw_response.validate( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", context="context", - query="query", + query="x", response="string", ) @@ -700,7 +700,7 @@ def test_streaming_response_validate(self, client: Codex) -> None: with client.projects.with_streaming_response.validate( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", context="context", - query="query", + query="x", response="string", ) as response: assert not response.is_closed @@ -718,7 +718,7 @@ def test_path_params_validate(self, client: Codex) -> None: client.projects.with_raw_response.validate( project_id="", context="context", - query="query", + query="x", response="string", ) @@ -1334,7 +1334,7 @@ async def test_method_validate(self, async_client: AsyncCodex) -> None: project = await async_client.projects.validate( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", context="context", - query="query", + query="x", response="string", ) assert_matches_type(ProjectValidateResponse, project, path=["response"]) @@ -1345,7 +1345,7 @@ async def test_method_validate_with_all_params(self, async_client: AsyncCodex) - project = await async_client.projects.validate( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", context="context", - query="query", + query="x", response="string", use_llm_matching=True, constrain_outputs=["string"], @@ -1387,7 +1387,7 @@ async def test_raw_response_validate(self, async_client: AsyncCodex) -> None: response = await async_client.projects.with_raw_response.validate( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", context="context", - query="query", + query="x", response="string", ) @@ -1402,7 +1402,7 @@ async def test_streaming_response_validate(self, async_client: AsyncCodex) -> No async with async_client.projects.with_streaming_response.validate( project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", context="context", - query="query", + query="x", response="string", ) as response: assert not response.is_closed @@ -1420,6 +1420,6 @@ async def test_path_params_validate(self, async_client: AsyncCodex) -> None: await async_client.projects.with_raw_response.validate( project_id="", context="context", - query="query", + query="x", response="string", ) From 
f55f4b768f8c1d00bdf61e56b0a7227c8424c5b6 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Tue, 15 Jul 2025 22:17:46 +0000 Subject: [PATCH 07/20] feat(api): api update --- .stats.yml | 2 +- api.md | 2 +- src/codex/resources/projects/evals.py | 38 ++++++++++++++++--- src/codex/types/projects/__init__.py | 1 + src/codex/types/projects/eval_list_params.py | 16 ++++++++ .../types/projects/eval_list_response.py | 11 ++++-- tests/api_resources/projects/test_evals.py | 38 +++++++++++++++---- 7 files changed, 89 insertions(+), 19 deletions(-) create mode 100644 src/codex/types/projects/eval_list_params.py diff --git a/.stats.yml b/.stats.yml index 20ee827..b16c056 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ configured_endpoints: 55 -openapi_spec_hash: b3a1a58600b52a20671bef2b25f5dbc4 +openapi_spec_hash: 1e86d5a7384400f4c3ddfb824fb31d84 config_hash: 8f6e5c3b064cbb77569a6bf654954a56 diff --git a/api.md b/api.md index 0c02791..22281ae 100644 --- a/api.md +++ b/api.md @@ -192,7 +192,7 @@ Methods: - client.projects.evals.create(project_id, \*\*params) -> ProjectReturnSchema - client.projects.evals.update(path_eval_key, \*, project_id, \*\*params) -> ProjectReturnSchema -- client.projects.evals.list(project_id) -> EvalListResponse +- client.projects.evals.list(project_id, \*\*params) -> EvalListResponse - client.projects.evals.delete(eval_key, \*, project_id) -> ProjectReturnSchema ## QueryLogs diff --git a/src/codex/resources/projects/evals.py b/src/codex/resources/projects/evals.py index 1fc9589..9de41b7 100644 --- a/src/codex/resources/projects/evals.py +++ b/src/codex/resources/projects/evals.py @@ -18,7 +18,7 @@ async_to_streamed_response_wrapper, ) from ..._base_client import make_request_options -from ...types.projects import eval_create_params, eval_update_params +from ...types.projects import eval_list_params, eval_create_params, eval_update_params from ...types.project_return_schema import ProjectReturnSchema from ...types.projects.eval_list_response import EvalListResponse @@ -324,6 +324,9 @@ def list( self, project_id: str, *, + guardrails_only: bool | NotGiven = NOT_GIVEN, + limit: Optional[int] | NotGiven = NOT_GIVEN, + offset: int | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -332,7 +335,7 @@ def list( timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, ) -> EvalListResponse: """ - Get the evaluations config for a project. + Get the evaluations config for a project with optional pagination. 
Args: extra_headers: Send extra headers @@ -348,7 +351,18 @@ def list( return self._get( f"/api/projects/{project_id}/evals", options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout, + query=maybe_transform( + { + "guardrails_only": guardrails_only, + "limit": limit, + "offset": offset, + }, + eval_list_params.EvalListParams, + ), ), cast_to=EvalListResponse, ) @@ -689,6 +703,9 @@ async def list( self, project_id: str, *, + guardrails_only: bool | NotGiven = NOT_GIVEN, + limit: Optional[int] | NotGiven = NOT_GIVEN, + offset: int | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -697,7 +714,7 @@ async def list( timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, ) -> EvalListResponse: """ - Get the evaluations config for a project. + Get the evaluations config for a project with optional pagination. Args: extra_headers: Send extra headers @@ -713,7 +730,18 @@ async def list( return await self._get( f"/api/projects/{project_id}/evals", options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout, + query=await async_maybe_transform( + { + "guardrails_only": guardrails_only, + "limit": limit, + "offset": offset, + }, + eval_list_params.EvalListParams, + ), ), cast_to=EvalListResponse, ) diff --git a/src/codex/types/projects/__init__.py b/src/codex/types/projects/__init__.py index 4f75470..cb2989f 100644 --- a/src/codex/types/projects/__init__.py +++ b/src/codex/types/projects/__init__.py @@ -2,6 +2,7 @@ from __future__ import annotations +from .eval_list_params import EvalListParams as EvalListParams from .access_key_schema import AccessKeySchema as AccessKeySchema from .eval_create_params import EvalCreateParams as EvalCreateParams from .eval_list_response import EvalListResponse as EvalListResponse diff --git a/src/codex/types/projects/eval_list_params.py b/src/codex/types/projects/eval_list_params.py new file mode 100644 index 0000000..b0f2fb6 --- /dev/null +++ b/src/codex/types/projects/eval_list_params.py @@ -0,0 +1,16 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Optional +from typing_extensions import TypedDict + +__all__ = ["EvalListParams"] + + +class EvalListParams(TypedDict, total=False): + guardrails_only: bool + + limit: Optional[int] + + offset: int diff --git a/src/codex/types/projects/eval_list_response.py b/src/codex/types/projects/eval_list_response.py index 48859b8..eb2cb9a 100644 --- a/src/codex/types/projects/eval_list_response.py +++ b/src/codex/types/projects/eval_list_response.py @@ -1,14 +1,14 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
from typing import List, Optional -from typing_extensions import Literal, TypeAlias +from typing_extensions import Literal from ..._models import BaseModel -__all__ = ["EvalListResponse", "EvalListResponseItem"] +__all__ = ["EvalListResponse", "Eval"] -class EvalListResponseItem(BaseModel): +class Eval(BaseModel): criteria: str """ The evaluation criteria text that describes what aspect is being evaluated and @@ -69,4 +69,7 @@ class EvalListResponseItem(BaseModel): """Whether the evaluation fails when score is above or below the threshold""" -EvalListResponse: TypeAlias = List[EvalListResponseItem] +class EvalListResponse(BaseModel): + evals: List[Eval] + + total_count: int diff --git a/tests/api_resources/projects/test_evals.py b/tests/api_resources/projects/test_evals.py index 22b8380..f36de27 100644 --- a/tests/api_resources/projects/test_evals.py +++ b/tests/api_resources/projects/test_evals.py @@ -259,7 +259,18 @@ def test_path_params_update_overload_2(self, client: Codex) -> None: @parametrize def test_method_list(self, client: Codex) -> None: eval = client.projects.evals.list( - "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + ) + assert_matches_type(EvalListResponse, eval, path=["response"]) + + @pytest.mark.skip() + @parametrize + def test_method_list_with_all_params(self, client: Codex) -> None: + eval = client.projects.evals.list( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + guardrails_only=True, + limit=1, + offset=0, ) assert_matches_type(EvalListResponse, eval, path=["response"]) @@ -267,7 +278,7 @@ def test_method_list(self, client: Codex) -> None: @parametrize def test_raw_response_list(self, client: Codex) -> None: response = client.projects.evals.with_raw_response.list( - "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", ) assert response.is_closed is True @@ -279,7 +290,7 @@ def test_raw_response_list(self, client: Codex) -> None: @parametrize def test_streaming_response_list(self, client: Codex) -> None: with client.projects.evals.with_streaming_response.list( - "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -294,7 +305,7 @@ def test_streaming_response_list(self, client: Codex) -> None: def test_path_params_list(self, client: Codex) -> None: with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"): client.projects.evals.with_raw_response.list( - "", + project_id="", ) @pytest.mark.skip() @@ -596,7 +607,18 @@ async def test_path_params_update_overload_2(self, async_client: AsyncCodex) -> @parametrize async def test_method_list(self, async_client: AsyncCodex) -> None: eval = await async_client.projects.evals.list( - "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + ) + assert_matches_type(EvalListResponse, eval, path=["response"]) + + @pytest.mark.skip() + @parametrize + async def test_method_list_with_all_params(self, async_client: AsyncCodex) -> None: + eval = await async_client.projects.evals.list( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + guardrails_only=True, + limit=1, + offset=0, ) assert_matches_type(EvalListResponse, eval, path=["response"]) @@ -604,7 +626,7 @@ async def test_method_list(self, async_client: AsyncCodex) -> None: @parametrize async def test_raw_response_list(self, async_client: 
AsyncCodex) -> None: response = await async_client.projects.evals.with_raw_response.list( - "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", ) assert response.is_closed is True @@ -616,7 +638,7 @@ async def test_raw_response_list(self, async_client: AsyncCodex) -> None: @parametrize async def test_streaming_response_list(self, async_client: AsyncCodex) -> None: async with async_client.projects.evals.with_streaming_response.list( - "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -631,7 +653,7 @@ async def test_streaming_response_list(self, async_client: AsyncCodex) -> None: async def test_path_params_list(self, async_client: AsyncCodex) -> None: with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"): await async_client.projects.evals.with_raw_response.list( - "", + project_id="", ) @pytest.mark.skip() From 575d1901319984fea901ce216323a5259e17f98c Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Wed, 16 Jul 2025 17:17:47 +0000 Subject: [PATCH 08/20] feat(api): api update --- .stats.yml | 2 +- .../types/projects/query_log_list_by_group_response.py | 6 +++--- src/codex/types/projects/query_log_list_groups_response.py | 6 +++--- src/codex/types/projects/query_log_list_response.py | 6 +++--- src/codex/types/projects/query_log_retrieve_response.py | 6 +++--- .../projects/remediation_list_resolved_logs_response.py | 6 +++--- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.stats.yml b/.stats.yml index b16c056..138e8d5 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ configured_endpoints: 55 -openapi_spec_hash: 1e86d5a7384400f4c3ddfb824fb31d84 +openapi_spec_hash: fd2542df68972f34edeb819c58600791 config_hash: 8f6e5c3b064cbb77569a6bf654954a56 diff --git a/src/codex/types/projects/query_log_list_by_group_response.py b/src/codex/types/projects/query_log_list_by_group_response.py index ccd2d5e..0ff5909 100644 --- a/src/codex/types/projects/query_log_list_by_group_response.py +++ b/src/codex/types/projects/query_log_list_by_group_response.py @@ -364,10 +364,10 @@ class QueryLogsByGroupQueryLog(BaseModel): """If true, the response was guardrailed""" messages: Optional[List[QueryLogsByGroupQueryLogMessage]] = None - """Optional message history to provide conversation context for the query. + """Message history to provide conversation context for the query. - Used to rewrite query into a self-contained version of itself. If not provided, - the query will be treated as self-contained. + Used for TrustworthyRAG and to rewrite query into a self-contained version of + itself. """ original_question: Optional[str] = None diff --git a/src/codex/types/projects/query_log_list_groups_response.py b/src/codex/types/projects/query_log_list_groups_response.py index 9adb422..495fc56 100644 --- a/src/codex/types/projects/query_log_list_groups_response.py +++ b/src/codex/types/projects/query_log_list_groups_response.py @@ -354,10 +354,10 @@ class QueryLogListGroupsResponse(BaseModel): """If true, the response was guardrailed""" messages: Optional[List[Message]] = None - """Optional message history to provide conversation context for the query. + """Message history to provide conversation context for the query. - Used to rewrite query into a self-contained version of itself. 
If not provided, - the query will be treated as self-contained. + Used for TrustworthyRAG and to rewrite query into a self-contained version of + itself. """ original_question: Optional[str] = None diff --git a/src/codex/types/projects/query_log_list_response.py b/src/codex/types/projects/query_log_list_response.py index f6fbba1..72a8cab 100644 --- a/src/codex/types/projects/query_log_list_response.py +++ b/src/codex/types/projects/query_log_list_response.py @@ -348,10 +348,10 @@ class QueryLogListResponse(BaseModel): """If true, the response was guardrailed""" messages: Optional[List[Message]] = None - """Optional message history to provide conversation context for the query. + """Message history to provide conversation context for the query. - Used to rewrite query into a self-contained version of itself. If not provided, - the query will be treated as self-contained. + Used for TrustworthyRAG and to rewrite query into a self-contained version of + itself. """ original_question: Optional[str] = None diff --git a/src/codex/types/projects/query_log_retrieve_response.py b/src/codex/types/projects/query_log_retrieve_response.py index 784009c..4324269 100644 --- a/src/codex/types/projects/query_log_retrieve_response.py +++ b/src/codex/types/projects/query_log_retrieve_response.py @@ -352,10 +352,10 @@ class QueryLogRetrieveResponse(BaseModel): """If true, the response was guardrailed""" messages: Optional[List[Message]] = None - """Optional message history to provide conversation context for the query. + """Message history to provide conversation context for the query. - Used to rewrite query into a self-contained version of itself. If not provided, - the query will be treated as self-contained. + Used for TrustworthyRAG and to rewrite query into a self-contained version of + itself. """ original_question: Optional[str] = None diff --git a/src/codex/types/projects/remediation_list_resolved_logs_response.py b/src/codex/types/projects/remediation_list_resolved_logs_response.py index 1e0154c..cebfaf4 100644 --- a/src/codex/types/projects/remediation_list_resolved_logs_response.py +++ b/src/codex/types/projects/remediation_list_resolved_logs_response.py @@ -355,10 +355,10 @@ class QueryLog(BaseModel): """If true, the response was guardrailed""" messages: Optional[List[QueryLogMessage]] = None - """Optional message history to provide conversation context for the query. + """Message history to provide conversation context for the query. - Used to rewrite query into a self-contained version of itself. If not provided, - the query will be treated as self-contained. + Used for TrustworthyRAG and to rewrite query into a self-contained version of + itself. """ original_question: Optional[str] = None From 1cdf391742b196d5a723307e8c202a69e00b371d Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Tue, 22 Jul 2025 02:05:22 +0000 Subject: [PATCH 09/20] fix(parsing): ignore empty metadata --- src/codex/_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codex/_models.py b/src/codex/_models.py index 528d568..ffcbf67 100644 --- a/src/codex/_models.py +++ b/src/codex/_models.py @@ -439,7 +439,7 @@ def construct_type(*, value: object, type_: object, metadata: Optional[List[Any] type_ = type_.__value__ # type: ignore[unreachable] # unwrap `Annotated[T, ...]` -> `T` - if metadata is not None: + if metadata is not None and len(metadata) > 0: meta: tuple[Any, ...] 
= tuple(metadata) elif is_annotated_type(type_): meta = get_args(type_)[1:] From 0a33c4710d4890d17ddd973ba4a2ed183e45e4c7 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Tue, 22 Jul 2025 22:17:30 +0000 Subject: [PATCH 10/20] feat(api): api update --- .stats.yml | 4 +- api.md | 1 - src/codex/resources/projects/projects.py | 102 --------------- src/codex/types/__init__.py | 1 - .../types/project_increment_queries_params.py | 11 -- tests/api_resources/test_projects.py | 118 ------------------ 6 files changed, 2 insertions(+), 235 deletions(-) delete mode 100644 src/codex/types/project_increment_queries_params.py diff --git a/.stats.yml b/.stats.yml index 138e8d5..c31fbb0 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ -configured_endpoints: 55 -openapi_spec_hash: fd2542df68972f34edeb819c58600791 +configured_endpoints: 54 +openapi_spec_hash: 168bdf5a611596d39812ce7259416529 config_hash: 8f6e5c3b064cbb77569a6bf654954a56 diff --git a/api.md b/api.md index 22281ae..1646d0b 100644 --- a/api.md +++ b/api.md @@ -153,7 +153,6 @@ Methods: - client.projects.list(\*\*params) -> ProjectListResponse - client.projects.delete(project_id) -> None - client.projects.export(project_id) -> object -- client.projects.increment_queries(project_id, \*\*params) -> object - client.projects.invite_sme(project_id, \*\*params) -> ProjectInviteSmeResponse - client.projects.retrieve_analytics(project_id, \*\*params) -> ProjectRetrieveAnalyticsResponse - client.projects.validate(project_id, \*\*params) -> ProjectValidateResponse diff --git a/src/codex/resources/projects/projects.py b/src/codex/resources/projects/projects.py index dc01b11..3a109ed 100644 --- a/src/codex/resources/projects/projects.py +++ b/src/codex/resources/projects/projects.py @@ -2,7 +2,6 @@ from __future__ import annotations -import typing_extensions from typing import Dict, List, Iterable, Optional from typing_extensions import Literal @@ -22,7 +21,6 @@ project_update_params, project_validate_params, project_invite_sme_params, - project_increment_queries_params, project_retrieve_analytics_params, ) from ..._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven @@ -349,45 +347,6 @@ def export( cast_to=object, ) - @typing_extensions.deprecated("deprecated") - def increment_queries( - self, - project_id: str, - *, - count: int | NotGiven = NOT_GIVEN, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> object: - """ - Increment the queries metric for a project. 
- - Args: - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - if not project_id: - raise ValueError(f"Expected a non-empty value for `project_id` but received {project_id!r}") - return self._post( - f"/api/projects/{project_id}/increment_queries", - options=make_request_options( - extra_headers=extra_headers, - extra_query=extra_query, - extra_body=extra_body, - timeout=timeout, - query=maybe_transform({"count": count}, project_increment_queries_params.ProjectIncrementQueriesParams), - ), - cast_to=object, - ) - def invite_sme( self, project_id: str, @@ -956,47 +915,6 @@ async def export( cast_to=object, ) - @typing_extensions.deprecated("deprecated") - async def increment_queries( - self, - project_id: str, - *, - count: int | NotGiven = NOT_GIVEN, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> object: - """ - Increment the queries metric for a project. - - Args: - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - if not project_id: - raise ValueError(f"Expected a non-empty value for `project_id` but received {project_id!r}") - return await self._post( - f"/api/projects/{project_id}/increment_queries", - options=make_request_options( - extra_headers=extra_headers, - extra_query=extra_query, - extra_body=extra_body, - timeout=timeout, - query=await async_maybe_transform( - {"count": count}, project_increment_queries_params.ProjectIncrementQueriesParams - ), - ), - cast_to=object, - ) - async def invite_sme( self, project_id: str, @@ -1308,11 +1226,6 @@ def __init__(self, projects: ProjectsResource) -> None: self.export = to_raw_response_wrapper( projects.export, ) - self.increment_queries = ( # pyright: ignore[reportDeprecated] - to_raw_response_wrapper( - projects.increment_queries # pyright: ignore[reportDeprecated], - ) - ) self.invite_sme = to_raw_response_wrapper( projects.invite_sme, ) @@ -1362,11 +1275,6 @@ def __init__(self, projects: AsyncProjectsResource) -> None: self.export = async_to_raw_response_wrapper( projects.export, ) - self.increment_queries = ( # pyright: ignore[reportDeprecated] - async_to_raw_response_wrapper( - projects.increment_queries # pyright: ignore[reportDeprecated], - ) - ) self.invite_sme = async_to_raw_response_wrapper( projects.invite_sme, ) @@ -1416,11 +1324,6 @@ def __init__(self, projects: ProjectsResource) -> None: self.export = to_streamed_response_wrapper( projects.export, ) - self.increment_queries = ( # pyright: ignore[reportDeprecated] - to_streamed_response_wrapper( - projects.increment_queries # pyright: ignore[reportDeprecated], - ) - ) self.invite_sme = to_streamed_response_wrapper( projects.invite_sme, ) @@ -1470,11 +1373,6 @@ def __init__(self, projects: AsyncProjectsResource) -> None: self.export = async_to_streamed_response_wrapper( projects.export, ) - self.increment_queries = ( # 
pyright: ignore[reportDeprecated] - async_to_streamed_response_wrapper( - projects.increment_queries # pyright: ignore[reportDeprecated], - ) - ) self.invite_sme = async_to_streamed_response_wrapper( projects.invite_sme, ) diff --git a/src/codex/types/__init__.py b/src/codex/types/__init__.py index 70713a3..daa1635 100644 --- a/src/codex/types/__init__.py +++ b/src/codex/types/__init__.py @@ -19,7 +19,6 @@ from .organization_schema_public import OrganizationSchemaPublic as OrganizationSchemaPublic from .project_invite_sme_response import ProjectInviteSmeResponse as ProjectInviteSmeResponse from .user_activate_account_params import UserActivateAccountParams as UserActivateAccountParams -from .project_increment_queries_params import ProjectIncrementQueriesParams as ProjectIncrementQueriesParams from .project_retrieve_analytics_params import ProjectRetrieveAnalyticsParams as ProjectRetrieveAnalyticsParams from .organization_list_members_response import OrganizationListMembersResponse as OrganizationListMembersResponse from .project_retrieve_analytics_response import ProjectRetrieveAnalyticsResponse as ProjectRetrieveAnalyticsResponse diff --git a/src/codex/types/project_increment_queries_params.py b/src/codex/types/project_increment_queries_params.py deleted file mode 100644 index f6043a7..0000000 --- a/src/codex/types/project_increment_queries_params.py +++ /dev/null @@ -1,11 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing_extensions import TypedDict - -__all__ = ["ProjectIncrementQueriesParams"] - - -class ProjectIncrementQueriesParams(TypedDict, total=False): - count: int diff --git a/tests/api_resources/test_projects.py b/tests/api_resources/test_projects.py index 4507741..ce4b7f5 100644 --- a/tests/api_resources/test_projects.py +++ b/tests/api_resources/test_projects.py @@ -18,8 +18,6 @@ ) from tests.utils import assert_matches_type -# pyright: reportDeprecated=false - base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") @@ -462,64 +460,6 @@ def test_path_params_export(self, client: Codex) -> None: "", ) - @pytest.mark.skip() - @parametrize - def test_method_increment_queries(self, client: Codex) -> None: - with pytest.warns(DeprecationWarning): - project = client.projects.increment_queries( - project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - ) - - assert_matches_type(object, project, path=["response"]) - - @pytest.mark.skip() - @parametrize - def test_method_increment_queries_with_all_params(self, client: Codex) -> None: - with pytest.warns(DeprecationWarning): - project = client.projects.increment_queries( - project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - count=0, - ) - - assert_matches_type(object, project, path=["response"]) - - @pytest.mark.skip() - @parametrize - def test_raw_response_increment_queries(self, client: Codex) -> None: - with pytest.warns(DeprecationWarning): - response = client.projects.with_raw_response.increment_queries( - project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - project = response.parse() - assert_matches_type(object, project, path=["response"]) - - @pytest.mark.skip() - @parametrize - def test_streaming_response_increment_queries(self, client: Codex) -> None: - with pytest.warns(DeprecationWarning): - with client.projects.with_streaming_response.increment_queries( - 
project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - project = response.parse() - assert_matches_type(object, project, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @pytest.mark.skip() - @parametrize - def test_path_params_increment_queries(self, client: Codex) -> None: - with pytest.warns(DeprecationWarning): - with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"): - client.projects.with_raw_response.increment_queries( - project_id="", - ) - @pytest.mark.skip() @parametrize def test_method_invite_sme(self, client: Codex) -> None: @@ -1164,64 +1104,6 @@ async def test_path_params_export(self, async_client: AsyncCodex) -> None: "", ) - @pytest.mark.skip() - @parametrize - async def test_method_increment_queries(self, async_client: AsyncCodex) -> None: - with pytest.warns(DeprecationWarning): - project = await async_client.projects.increment_queries( - project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - ) - - assert_matches_type(object, project, path=["response"]) - - @pytest.mark.skip() - @parametrize - async def test_method_increment_queries_with_all_params(self, async_client: AsyncCodex) -> None: - with pytest.warns(DeprecationWarning): - project = await async_client.projects.increment_queries( - project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - count=0, - ) - - assert_matches_type(object, project, path=["response"]) - - @pytest.mark.skip() - @parametrize - async def test_raw_response_increment_queries(self, async_client: AsyncCodex) -> None: - with pytest.warns(DeprecationWarning): - response = await async_client.projects.with_raw_response.increment_queries( - project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - project = await response.parse() - assert_matches_type(object, project, path=["response"]) - - @pytest.mark.skip() - @parametrize - async def test_streaming_response_increment_queries(self, async_client: AsyncCodex) -> None: - with pytest.warns(DeprecationWarning): - async with async_client.projects.with_streaming_response.increment_queries( - project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - project = await response.parse() - assert_matches_type(object, project, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @pytest.mark.skip() - @parametrize - async def test_path_params_increment_queries(self, async_client: AsyncCodex) -> None: - with pytest.warns(DeprecationWarning): - with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"): - await async_client.projects.with_raw_response.increment_queries( - project_id="", - ) - @pytest.mark.skip() @parametrize async def test_method_invite_sme(self, async_client: AsyncCodex) -> None: From 3c74ca0f1a913bed65cc4c6580dda25a07a90b74 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Wed, 23 Jul 2025 02:06:52 +0000 Subject: [PATCH 11/20] fix(parsing): parse extra field types --- src/codex/_models.py | 25 +++++++++++++++++++++++-- tests/test_models.py | 29 ++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/src/codex/_models.py 
b/src/codex/_models.py index ffcbf67..b8387ce 100644 --- a/src/codex/_models.py +++ b/src/codex/_models.py @@ -208,14 +208,18 @@ def construct( # pyright: ignore[reportIncompatibleMethodOverride] else: fields_values[name] = field_get_default(field) + extra_field_type = _get_extra_fields_type(__cls) + _extra = {} for key, value in values.items(): if key not in model_fields: + parsed = construct_type(value=value, type_=extra_field_type) if extra_field_type is not None else value + if PYDANTIC_V2: - _extra[key] = value + _extra[key] = parsed else: _fields_set.add(key) - fields_values[key] = value + fields_values[key] = parsed object.__setattr__(m, "__dict__", fields_values) @@ -370,6 +374,23 @@ def _construct_field(value: object, field: FieldInfo, key: str) -> object: return construct_type(value=value, type_=type_, metadata=getattr(field, "metadata", None)) +def _get_extra_fields_type(cls: type[pydantic.BaseModel]) -> type | None: + if not PYDANTIC_V2: + # TODO + return None + + schema = cls.__pydantic_core_schema__ + if schema["type"] == "model": + fields = schema["schema"] + if fields["type"] == "model-fields": + extras = fields.get("extras_schema") + if extras and "cls" in extras: + # mypy can't narrow the type + return extras["cls"] # type: ignore[no-any-return] + + return None + + def is_basemodel(type_: type) -> bool: """Returns whether or not the given type is either a `BaseModel` or a union of `BaseModel`""" if is_union(type_): diff --git a/tests/test_models.py b/tests/test_models.py index 3452a61..a989702 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,5 +1,5 @@ import json -from typing import Any, Dict, List, Union, Optional, cast +from typing import TYPE_CHECKING, Any, Dict, List, Union, Optional, cast from datetime import datetime, timezone from typing_extensions import Literal, Annotated, TypeAliasType @@ -934,3 +934,30 @@ class Type2(BaseModel): ) assert isinstance(model, Type1) assert isinstance(model.value, InnerType2) + + +@pytest.mark.skipif(not PYDANTIC_V2, reason="this is only supported in pydantic v2 for now") +def test_extra_properties() -> None: + class Item(BaseModel): + prop: int + + class Model(BaseModel): + __pydantic_extra__: Dict[str, Item] = Field(init=False) # pyright: ignore[reportIncompatibleVariableOverride] + + other: str + + if TYPE_CHECKING: + + def __getattr__(self, attr: str) -> Item: ... 
+ + model = construct_type( + type_=Model, + value={ + "a": {"prop": 1}, + "other": "foo", + }, + ) + assert isinstance(model, Model) + assert model.a.prop == 1 + assert isinstance(model.a, Item) + assert model.other == "foo" From 7e7caf9a3ad214c5df3686122e4f26b850dcb8b0 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Wed, 23 Jul 2025 20:17:58 +0000 Subject: [PATCH 12/20] feat(api): api update --- .stats.yml | 2 +- src/codex/resources/projects/query_logs.py | 4 ++-- src/codex/types/projects/query_log_list_groups_params.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.stats.yml b/.stats.yml index c31fbb0..4cb0619 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ configured_endpoints: 54 -openapi_spec_hash: 168bdf5a611596d39812ce7259416529 +openapi_spec_hash: 1ef62145e2247a442b75c87b23267e2d config_hash: 8f6e5c3b064cbb77569a6bf654954a56 diff --git a/src/codex/resources/projects/query_logs.py b/src/codex/resources/projects/query_logs.py index 9ccecbe..6fa490e 100644 --- a/src/codex/resources/projects/query_logs.py +++ b/src/codex/resources/projects/query_logs.py @@ -286,7 +286,7 @@ def list_groups( List[Literal["hallucination", "search_failure", "unhelpful", "difficult_query", "ungrounded"]] ] | NotGiven = NOT_GIVEN, - sort: Optional[Literal["created_at", "primary_eval_issue_score", "total_count", "custom_rank"]] + sort: Optional[Literal["created_at", "primary_eval_issue_score", "total_count", "custom_rank", "impact_score"]] | NotGiven = NOT_GIVEN, was_cache_hit: Optional[bool] | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. @@ -646,7 +646,7 @@ def list_groups( List[Literal["hallucination", "search_failure", "unhelpful", "difficult_query", "ungrounded"]] ] | NotGiven = NOT_GIVEN, - sort: Optional[Literal["created_at", "primary_eval_issue_score", "total_count", "custom_rank"]] + sort: Optional[Literal["created_at", "primary_eval_issue_score", "total_count", "custom_rank", "impact_score"]] | NotGiven = NOT_GIVEN, was_cache_hit: Optional[bool] | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. 
diff --git a/src/codex/types/projects/query_log_list_groups_params.py b/src/codex/types/projects/query_log_list_groups_params.py index cd82d9a..f75ee29 100644 --- a/src/codex/types/projects/query_log_list_groups_params.py +++ b/src/codex/types/projects/query_log_list_groups_params.py @@ -44,7 +44,7 @@ class QueryLogListGroupsParams(TypedDict, total=False): ] """Filter logs that have ANY of these primary evaluation issues (OR operation)""" - sort: Optional[Literal["created_at", "primary_eval_issue_score", "total_count", "custom_rank"]] + sort: Optional[Literal["created_at", "primary_eval_issue_score", "total_count", "custom_rank", "impact_score"]] was_cache_hit: Optional[bool] """Filter by cache hit status""" From 6992031e6aa610031f24d818040050b0fc185c34 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 24 Jul 2025 18:17:59 +0000 Subject: [PATCH 13/20] feat(api): api update --- .stats.yml | 2 +- src/codex/types/project_create_params.py | 2 ++ src/codex/types/project_list_response.py | 2 ++ src/codex/types/project_retrieve_response.py | 2 ++ src/codex/types/project_return_schema.py | 2 ++ src/codex/types/project_update_params.py | 2 ++ tests/api_resources/test_projects.py | 4 ++++ 7 files changed, 15 insertions(+), 1 deletion(-) diff --git a/.stats.yml b/.stats.yml index 4cb0619..de1764d 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ configured_endpoints: 54 -openapi_spec_hash: 1ef62145e2247a442b75c87b23267e2d +openapi_spec_hash: 1c6e7d5ed06d72868a57e64381bc473c config_hash: 8f6e5c3b064cbb77569a6bf654954a56 diff --git a/src/codex/types/project_create_params.py b/src/codex/types/project_create_params.py index a836978..c75023f 100644 --- a/src/codex/types/project_create_params.py +++ b/src/codex/types/project_create_params.py @@ -333,4 +333,6 @@ class Config(TypedDict, total=False): query_use_llm_matching: bool + tlm_evals_model: str + upper_llm_match_distance_threshold: float diff --git a/src/codex/types/project_list_response.py b/src/codex/types/project_list_response.py index c39bf08..4ac3849 100644 --- a/src/codex/types/project_list_response.py +++ b/src/codex/types/project_list_response.py @@ -323,6 +323,8 @@ class ProjectConfig(BaseModel): query_use_llm_matching: Optional[bool] = None + tlm_evals_model: Optional[str] = None + upper_llm_match_distance_threshold: Optional[float] = None diff --git a/src/codex/types/project_retrieve_response.py b/src/codex/types/project_retrieve_response.py index 7d1f8ed..6e87d65 100644 --- a/src/codex/types/project_retrieve_response.py +++ b/src/codex/types/project_retrieve_response.py @@ -322,6 +322,8 @@ class Config(BaseModel): query_use_llm_matching: Optional[bool] = None + tlm_evals_model: Optional[str] = None + upper_llm_match_distance_threshold: Optional[float] = None diff --git a/src/codex/types/project_return_schema.py b/src/codex/types/project_return_schema.py index 170d799..bb087cd 100644 --- a/src/codex/types/project_return_schema.py +++ b/src/codex/types/project_return_schema.py @@ -322,6 +322,8 @@ class Config(BaseModel): query_use_llm_matching: Optional[bool] = None + tlm_evals_model: Optional[str] = None + upper_llm_match_distance_threshold: Optional[float] = None diff --git a/src/codex/types/project_update_params.py b/src/codex/types/project_update_params.py index 3e24441..c550b43 100644 --- a/src/codex/types/project_update_params.py +++ b/src/codex/types/project_update_params.py @@ -331,4 +331,6 @@ class Config(TypedDict, total=False): query_use_llm_matching: bool 
+ tlm_evals_model: str + upper_llm_match_distance_threshold: float diff --git a/tests/api_resources/test_projects.py b/tests/api_resources/test_projects.py index ce4b7f5..9312ca0 100644 --- a/tests/api_resources/test_projects.py +++ b/tests/api_resources/test_projects.py @@ -118,6 +118,7 @@ def test_method_create_with_all_params(self, client: Codex) -> None: "lower_llm_match_distance_threshold": 0, "max_distance": 0, "query_use_llm_matching": True, + "tlm_evals_model": "tlm_evals_model", "upper_llm_match_distance_threshold": 0, }, name="name", @@ -293,6 +294,7 @@ def test_method_update_with_all_params(self, client: Codex) -> None: "lower_llm_match_distance_threshold": 0, "max_distance": 0, "query_use_llm_matching": True, + "tlm_evals_model": "tlm_evals_model", "upper_llm_match_distance_threshold": 0, }, description="description", @@ -762,6 +764,7 @@ async def test_method_create_with_all_params(self, async_client: AsyncCodex) -> "lower_llm_match_distance_threshold": 0, "max_distance": 0, "query_use_llm_matching": True, + "tlm_evals_model": "tlm_evals_model", "upper_llm_match_distance_threshold": 0, }, name="name", @@ -937,6 +940,7 @@ async def test_method_update_with_all_params(self, async_client: AsyncCodex) -> "lower_llm_match_distance_threshold": 0, "max_distance": 0, "query_use_llm_matching": True, + "tlm_evals_model": "tlm_evals_model", "upper_llm_match_distance_threshold": 0, }, description="description", From 00df8ec35d44e5bdc6e68661a92d9d21905222c7 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Fri, 25 Jul 2025 03:08:09 +0000 Subject: [PATCH 14/20] chore(project): add settings file for vscode --- .gitignore | 1 - .vscode/settings.json | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json diff --git a/.gitignore b/.gitignore index 8779740..95ceb18 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ .prism.log -.vscode _dev __pycache__ diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..5b01030 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.analysis.importFormat": "relative", +} From 2ee809593ddb15c4de776a2048883287ec5c0cdb Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Fri, 25 Jul 2025 21:18:01 +0000 Subject: [PATCH 15/20] feat(api): api update --- .stats.yml | 2 +- src/codex/types/project_validate_params.py | 172 ++++++++--------- .../query_log_list_by_group_response.py | 176 +++++++++--------- .../query_log_list_groups_response.py | 168 ++++++++--------- .../types/projects/query_log_list_response.py | 168 ++++++++--------- .../projects/query_log_retrieve_response.py | 168 ++++++++--------- ...remediation_list_resolved_logs_response.py | 172 ++++++++--------- tests/api_resources/test_projects.py | 36 +++- 8 files changed, 547 insertions(+), 515 deletions(-) diff --git a/.stats.yml b/.stats.yml index de1764d..9fb5140 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ configured_endpoints: 54 -openapi_spec_hash: 1c6e7d5ed06d72868a57e64381bc473c +openapi_spec_hash: f7b67b502828e6d0ca3944d40d00d89b config_hash: 8f6e5c3b064cbb77569a6bf654954a56 diff --git a/src/codex/types/project_validate_params.py b/src/codex/types/project_validate_params.py index 7b85d06..081dd2a 100644 --- a/src/codex/types/project_validate_params.py +++ b/src/codex/types/project_validate_params.py @@ -29,19 +29,6 @@ 
"ResponseChatCompletionUsageCompletionTokensDetails", "ResponseChatCompletionUsagePromptTokensDetails", "Message", - "MessageChatCompletionDeveloperMessageParam", - "MessageChatCompletionDeveloperMessageParamContentUnionMember1", - "MessageChatCompletionSystemMessageParam", - "MessageChatCompletionSystemMessageParamContentUnionMember1", - "MessageChatCompletionUserMessageParamInput", - "MessageChatCompletionUserMessageParamInputContentUnionMember1", - "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam", - "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParam", - "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParamImageURL", - "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParam", - "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", - "MessageChatCompletionUserMessageParamInputContentUnionMember1File", - "MessageChatCompletionUserMessageParamInputContentUnionMember1FileFile", "MessageChatCompletionAssistantMessageParamInput", "MessageChatCompletionAssistantMessageParamInputAudio", "MessageChatCompletionAssistantMessageParamInputContentUnionMember1", @@ -52,7 +39,20 @@ "MessageChatCompletionAssistantMessageParamInputToolCallFunction", "MessageChatCompletionToolMessageParam", "MessageChatCompletionToolMessageParamContentUnionMember1", + "MessageChatCompletionUserMessageParamInput", + "MessageChatCompletionUserMessageParamInputContentUnionMember1", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParam", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParamImageURL", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParam", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", + "MessageChatCompletionUserMessageParamInputContentUnionMember1File", + "MessageChatCompletionUserMessageParamInputContentUnionMember1FileFile", + "MessageChatCompletionSystemMessageParam", + "MessageChatCompletionSystemMessageParamContentUnionMember1", "MessageChatCompletionFunctionMessageParam", + "MessageChatCompletionDeveloperMessageParam", + "MessageChatCompletionDeveloperMessageParamContentUnionMember1", "Options", ] @@ -440,32 +440,80 @@ class ResponseChatCompletionTyped(TypedDict, total=False): Response: TypeAlias = Union[str, ResponseChatCompletion] -class MessageChatCompletionDeveloperMessageParamContentUnionMember1(TypedDict, total=False): +class MessageChatCompletionAssistantMessageParamInputAudio(TypedDict, total=False): + id: Required[str] + + +class MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam( + TypedDict, total=False +): text: Required[str] type: Required[Literal["text"]] -class MessageChatCompletionDeveloperMessageParam(TypedDict, total=False): - content: Required[Union[str, Iterable[MessageChatCompletionDeveloperMessageParamContentUnionMember1]]] +class MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartRefusalParam( + TypedDict, total=False +): + refusal: Required[str] + + type: Required[Literal["refusal"]] - role: Required[Literal["developer"]] + 
+MessageChatCompletionAssistantMessageParamInputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam, + MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartRefusalParam, +] + + +class MessageChatCompletionAssistantMessageParamInputFunctionCall(TypedDict, total=False): + arguments: Required[str] + + name: Required[str] + + +class MessageChatCompletionAssistantMessageParamInputToolCallFunction(TypedDict, total=False): + arguments: Required[str] + + name: Required[str] + + +class MessageChatCompletionAssistantMessageParamInputToolCall(TypedDict, total=False): + id: Required[str] + + function: Required[MessageChatCompletionAssistantMessageParamInputToolCallFunction] + + type: Required[Literal["function"]] + + +class MessageChatCompletionAssistantMessageParamInput(TypedDict, total=False): + role: Required[Literal["assistant"]] + + audio: Optional[MessageChatCompletionAssistantMessageParamInputAudio] + + content: Union[str, Iterable[MessageChatCompletionAssistantMessageParamInputContentUnionMember1], None] + + function_call: Optional[MessageChatCompletionAssistantMessageParamInputFunctionCall] name: str + refusal: Optional[str] -class MessageChatCompletionSystemMessageParamContentUnionMember1(TypedDict, total=False): + tool_calls: Iterable[MessageChatCompletionAssistantMessageParamInputToolCall] + + +class MessageChatCompletionToolMessageParamContentUnionMember1(TypedDict, total=False): text: Required[str] type: Required[Literal["text"]] -class MessageChatCompletionSystemMessageParam(TypedDict, total=False): - content: Required[Union[str, Iterable[MessageChatCompletionSystemMessageParamContentUnionMember1]]] +class MessageChatCompletionToolMessageParam(TypedDict, total=False): + content: Required[Union[str, Iterable[MessageChatCompletionToolMessageParamContentUnionMember1]]] - role: Required[Literal["system"]] + role: Required[Literal["tool"]] - name: str + tool_call_id: Required[str] class MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam( @@ -542,97 +590,49 @@ class MessageChatCompletionUserMessageParamInput(TypedDict, total=False): name: str -class MessageChatCompletionAssistantMessageParamInputAudio(TypedDict, total=False): - id: Required[str] - - -class MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam( - TypedDict, total=False -): +class MessageChatCompletionSystemMessageParamContentUnionMember1(TypedDict, total=False): text: Required[str] type: Required[Literal["text"]] -class MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartRefusalParam( - TypedDict, total=False -): - refusal: Required[str] - - type: Required[Literal["refusal"]] - - -MessageChatCompletionAssistantMessageParamInputContentUnionMember1: TypeAlias = Union[ - MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam, - MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartRefusalParam, -] - +class MessageChatCompletionSystemMessageParam(TypedDict, total=False): + content: Required[Union[str, Iterable[MessageChatCompletionSystemMessageParamContentUnionMember1]]] -class MessageChatCompletionAssistantMessageParamInputFunctionCall(TypedDict, total=False): - arguments: Required[str] + role: Required[Literal["system"]] - name: Required[str] + name: str -class 
MessageChatCompletionAssistantMessageParamInputToolCallFunction(TypedDict, total=False): - arguments: Required[str] +class MessageChatCompletionFunctionMessageParam(TypedDict, total=False): + content: Required[Optional[str]] name: Required[str] - -class MessageChatCompletionAssistantMessageParamInputToolCall(TypedDict, total=False): - id: Required[str] - - function: Required[MessageChatCompletionAssistantMessageParamInputToolCallFunction] - - type: Required[Literal["function"]] - - -class MessageChatCompletionAssistantMessageParamInput(TypedDict, total=False): - role: Required[Literal["assistant"]] - - audio: Optional[MessageChatCompletionAssistantMessageParamInputAudio] - - content: Union[str, Iterable[MessageChatCompletionAssistantMessageParamInputContentUnionMember1], None] - - function_call: Optional[MessageChatCompletionAssistantMessageParamInputFunctionCall] - - name: str - - refusal: Optional[str] - - tool_calls: Iterable[MessageChatCompletionAssistantMessageParamInputToolCall] + role: Required[Literal["function"]] -class MessageChatCompletionToolMessageParamContentUnionMember1(TypedDict, total=False): +class MessageChatCompletionDeveloperMessageParamContentUnionMember1(TypedDict, total=False): text: Required[str] type: Required[Literal["text"]] -class MessageChatCompletionToolMessageParam(TypedDict, total=False): - content: Required[Union[str, Iterable[MessageChatCompletionToolMessageParamContentUnionMember1]]] - - role: Required[Literal["tool"]] - - tool_call_id: Required[str] - - -class MessageChatCompletionFunctionMessageParam(TypedDict, total=False): - content: Required[Optional[str]] +class MessageChatCompletionDeveloperMessageParam(TypedDict, total=False): + content: Required[Union[str, Iterable[MessageChatCompletionDeveloperMessageParamContentUnionMember1]]] - name: Required[str] + role: Required[Literal["developer"]] - role: Required[Literal["function"]] + name: str Message: TypeAlias = Union[ - MessageChatCompletionDeveloperMessageParam, - MessageChatCompletionSystemMessageParam, - MessageChatCompletionUserMessageParamInput, MessageChatCompletionAssistantMessageParamInput, MessageChatCompletionToolMessageParam, + MessageChatCompletionUserMessageParamInput, + MessageChatCompletionSystemMessageParam, MessageChatCompletionFunctionMessageParam, + MessageChatCompletionDeveloperMessageParam, ] diff --git a/src/codex/types/projects/query_log_list_by_group_response.py b/src/codex/types/projects/query_log_list_by_group_response.py index 0ff5909..ae49b95 100644 --- a/src/codex/types/projects/query_log_list_by_group_response.py +++ b/src/codex/types/projects/query_log_list_by_group_response.py @@ -17,19 +17,6 @@ "QueryLogsByGroupQueryLogContext", "QueryLogsByGroupQueryLogDeterministicGuardrailsResults", "QueryLogsByGroupQueryLogMessage", - "QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam", - "QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1", - "QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParam", - "QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParamContentUnionMember1", - "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutput", - "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1", - "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", - "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", - 
"QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", - "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", - "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", - "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1File", - "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutput", "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio", "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1", @@ -40,7 +27,20 @@ "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction", "QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParam", "QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParamContentUnionMember1", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutput", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1File", + "QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", + "QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParam", + "QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParamContentUnionMember1", "QueryLogsByGroupQueryLogMessageChatCompletionFunctionMessageParam", + "QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam", + "QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1", ] @@ -93,32 +93,82 @@ class QueryLogsByGroupQueryLogDeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None -class QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): +class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): + id: str + + +class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam( + BaseModel +): text: str type: Literal["text"] -class QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam(BaseModel): - content: Union[str, List[QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1]] +class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( + BaseModel +): + refusal: str - role: Literal["developer"] + type: Literal["refusal"] + + 
+QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ + QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, +] + + +class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): + arguments: str + + name: str + + +class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): + arguments: str + + name: str + + +class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): + id: str + + function: QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction + + type: Literal["function"] + + +class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutput(BaseModel): + role: Literal["assistant"] + + audio: Optional[QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio] = None + + content: Union[ + str, List[QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None + ] = None + + function_call: Optional[QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall] = None name: Optional[str] = None + refusal: Optional[str] = None -class QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): + tool_calls: Optional[List[QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCall]] = None + + +class QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParam(BaseModel): - content: Union[str, List[QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParamContentUnionMember1]] +class QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParam(BaseModel): + content: Union[str, List[QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParamContentUnionMember1]] - role: Literal["system"] + role: Literal["tool"] - name: Optional[str] = None + tool_call_id: str class QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam( @@ -191,99 +241,49 @@ class QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutput(BaseMo name: Optional[str] = None -class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): - id: str - - -class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam( - BaseModel -): +class QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( - BaseModel -): - refusal: str - - type: Literal["refusal"] - - -QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ - QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, - QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, -] - +class 
QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParam(BaseModel): + content: Union[str, List[QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParamContentUnionMember1]] -class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): - arguments: str + role: Literal["system"] - name: str + name: Optional[str] = None -class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): - arguments: str +class QueryLogsByGroupQueryLogMessageChatCompletionFunctionMessageParam(BaseModel): + content: Optional[str] = None name: str - -class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): - id: str - - function: QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction - - type: Literal["function"] - - -class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutput(BaseModel): - role: Literal["assistant"] - - audio: Optional[QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio] = None - - content: Union[ - str, List[QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None - ] = None - - function_call: Optional[QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall] = None - - name: Optional[str] = None - - refusal: Optional[str] = None - - tool_calls: Optional[List[QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputToolCall]] = None + role: Literal["function"] -class QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): +class QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParam(BaseModel): - content: Union[str, List[QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParamContentUnionMember1]] - - role: Literal["tool"] - - tool_call_id: str - - -class QueryLogsByGroupQueryLogMessageChatCompletionFunctionMessageParam(BaseModel): - content: Optional[str] = None +class QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam(BaseModel): + content: Union[str, List[QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1]] - name: str + role: Literal["developer"] - role: Literal["function"] + name: Optional[str] = None QueryLogsByGroupQueryLogMessage: TypeAlias = Union[ - QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam, - QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParam, - QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutput, QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutput, QueryLogsByGroupQueryLogMessageChatCompletionToolMessageParam, + QueryLogsByGroupQueryLogMessageChatCompletionUserMessageParamOutput, + QueryLogsByGroupQueryLogMessageChatCompletionSystemMessageParam, QueryLogsByGroupQueryLogMessageChatCompletionFunctionMessageParam, + QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam, ] diff --git a/src/codex/types/projects/query_log_list_groups_response.py b/src/codex/types/projects/query_log_list_groups_response.py index 495fc56..cc3b208 100644 --- a/src/codex/types/projects/query_log_list_groups_response.py +++ b/src/codex/types/projects/query_log_list_groups_response.py @@ -15,19 +15,6 @@ "Context", "DeterministicGuardrailsResults", "Message", - 
"MessageChatCompletionDeveloperMessageParam", - "MessageChatCompletionDeveloperMessageParamContentUnionMember1", - "MessageChatCompletionSystemMessageParam", - "MessageChatCompletionSystemMessageParamContentUnionMember1", - "MessageChatCompletionUserMessageParamOutput", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1File", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", "MessageChatCompletionAssistantMessageParamOutput", "MessageChatCompletionAssistantMessageParamOutputAudio", "MessageChatCompletionAssistantMessageParamOutputContentUnionMember1", @@ -38,7 +25,20 @@ "MessageChatCompletionAssistantMessageParamOutputToolCallFunction", "MessageChatCompletionToolMessageParam", "MessageChatCompletionToolMessageParamContentUnionMember1", + "MessageChatCompletionUserMessageParamOutput", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1File", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", + "MessageChatCompletionSystemMessageParam", + "MessageChatCompletionSystemMessageParamContentUnionMember1", "MessageChatCompletionFunctionMessageParam", + "MessageChatCompletionDeveloperMessageParam", + "MessageChatCompletionDeveloperMessageParamContentUnionMember1", ] @@ -91,32 +91,78 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None -class MessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): +class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): + id: str + + +class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): text: str type: Literal["text"] -class MessageChatCompletionDeveloperMessageParam(BaseModel): - content: Union[str, List[MessageChatCompletionDeveloperMessageParamContentUnionMember1]] +class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( + BaseModel +): + refusal: str - role: Literal["developer"] + type: Literal["refusal"] + + +MessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + 
MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, +] + + +class MessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): + arguments: str + + name: str + + +class MessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): + arguments: str + + name: str + + +class MessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): + id: str + + function: MessageChatCompletionAssistantMessageParamOutputToolCallFunction + + type: Literal["function"] + + +class MessageChatCompletionAssistantMessageParamOutput(BaseModel): + role: Literal["assistant"] + + audio: Optional[MessageChatCompletionAssistantMessageParamOutputAudio] = None + + content: Union[str, List[MessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None] = None + + function_call: Optional[MessageChatCompletionAssistantMessageParamOutputFunctionCall] = None name: Optional[str] = None + refusal: Optional[str] = None -class MessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): + tool_calls: Optional[List[MessageChatCompletionAssistantMessageParamOutputToolCall]] = None + + +class MessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class MessageChatCompletionSystemMessageParam(BaseModel): - content: Union[str, List[MessageChatCompletionSystemMessageParamContentUnionMember1]] +class MessageChatCompletionToolMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionToolMessageParamContentUnionMember1]] - role: Literal["system"] + role: Literal["tool"] - name: Optional[str] = None + tool_call_id: str class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): @@ -185,95 +231,49 @@ class MessageChatCompletionUserMessageParamOutput(BaseModel): name: Optional[str] = None -class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): - id: str - - -class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): +class MessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( - BaseModel -): - refusal: str - - type: Literal["refusal"] - - -MessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ - MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, - MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, -] - +class MessageChatCompletionSystemMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionSystemMessageParamContentUnionMember1]] -class MessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): - arguments: str + role: Literal["system"] - name: str + name: Optional[str] = None -class MessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): - arguments: str +class MessageChatCompletionFunctionMessageParam(BaseModel): + content: Optional[str] = None name: str - -class MessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): - id: str - - function: MessageChatCompletionAssistantMessageParamOutputToolCallFunction - - type: Literal["function"] - - -class MessageChatCompletionAssistantMessageParamOutput(BaseModel): - role: Literal["assistant"] - - audio: 
Optional[MessageChatCompletionAssistantMessageParamOutputAudio] = None - - content: Union[str, List[MessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None] = None - - function_call: Optional[MessageChatCompletionAssistantMessageParamOutputFunctionCall] = None - - name: Optional[str] = None - - refusal: Optional[str] = None - - tool_calls: Optional[List[MessageChatCompletionAssistantMessageParamOutputToolCall]] = None + role: Literal["function"] -class MessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): +class MessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class MessageChatCompletionToolMessageParam(BaseModel): - content: Union[str, List[MessageChatCompletionToolMessageParamContentUnionMember1]] - - role: Literal["tool"] - - tool_call_id: str - - -class MessageChatCompletionFunctionMessageParam(BaseModel): - content: Optional[str] = None +class MessageChatCompletionDeveloperMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionDeveloperMessageParamContentUnionMember1]] - name: str + role: Literal["developer"] - role: Literal["function"] + name: Optional[str] = None Message: TypeAlias = Union[ - MessageChatCompletionDeveloperMessageParam, - MessageChatCompletionSystemMessageParam, - MessageChatCompletionUserMessageParamOutput, MessageChatCompletionAssistantMessageParamOutput, MessageChatCompletionToolMessageParam, + MessageChatCompletionUserMessageParamOutput, + MessageChatCompletionSystemMessageParam, MessageChatCompletionFunctionMessageParam, + MessageChatCompletionDeveloperMessageParam, ] diff --git a/src/codex/types/projects/query_log_list_response.py b/src/codex/types/projects/query_log_list_response.py index 72a8cab..0778898 100644 --- a/src/codex/types/projects/query_log_list_response.py +++ b/src/codex/types/projects/query_log_list_response.py @@ -15,19 +15,6 @@ "Context", "DeterministicGuardrailsResults", "Message", - "MessageChatCompletionDeveloperMessageParam", - "MessageChatCompletionDeveloperMessageParamContentUnionMember1", - "MessageChatCompletionSystemMessageParam", - "MessageChatCompletionSystemMessageParamContentUnionMember1", - "MessageChatCompletionUserMessageParamOutput", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1File", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", "MessageChatCompletionAssistantMessageParamOutput", "MessageChatCompletionAssistantMessageParamOutputAudio", "MessageChatCompletionAssistantMessageParamOutputContentUnionMember1", @@ -38,7 +25,20 @@ "MessageChatCompletionAssistantMessageParamOutputToolCallFunction", "MessageChatCompletionToolMessageParam", "MessageChatCompletionToolMessageParamContentUnionMember1", + "MessageChatCompletionUserMessageParamOutput", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1", + 
"MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1File", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", + "MessageChatCompletionSystemMessageParam", + "MessageChatCompletionSystemMessageParamContentUnionMember1", "MessageChatCompletionFunctionMessageParam", + "MessageChatCompletionDeveloperMessageParam", + "MessageChatCompletionDeveloperMessageParamContentUnionMember1", ] @@ -91,32 +91,78 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None -class MessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): +class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): + id: str + + +class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): text: str type: Literal["text"] -class MessageChatCompletionDeveloperMessageParam(BaseModel): - content: Union[str, List[MessageChatCompletionDeveloperMessageParamContentUnionMember1]] +class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( + BaseModel +): + refusal: str - role: Literal["developer"] + type: Literal["refusal"] + + +MessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, +] + + +class MessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): + arguments: str + + name: str + + +class MessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): + arguments: str + + name: str + + +class MessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): + id: str + + function: MessageChatCompletionAssistantMessageParamOutputToolCallFunction + + type: Literal["function"] + + +class MessageChatCompletionAssistantMessageParamOutput(BaseModel): + role: Literal["assistant"] + + audio: Optional[MessageChatCompletionAssistantMessageParamOutputAudio] = None + + content: Union[str, List[MessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None] = None + + function_call: Optional[MessageChatCompletionAssistantMessageParamOutputFunctionCall] = None name: Optional[str] = None + refusal: Optional[str] = None -class MessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): + tool_calls: Optional[List[MessageChatCompletionAssistantMessageParamOutputToolCall]] = None + + +class MessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class MessageChatCompletionSystemMessageParam(BaseModel): - content: Union[str, List[MessageChatCompletionSystemMessageParamContentUnionMember1]] +class MessageChatCompletionToolMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionToolMessageParamContentUnionMember1]] - role: Literal["system"] + role: Literal["tool"] 
- name: Optional[str] = None + tool_call_id: str class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): @@ -185,95 +231,49 @@ class MessageChatCompletionUserMessageParamOutput(BaseModel): name: Optional[str] = None -class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): - id: str - - -class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): +class MessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( - BaseModel -): - refusal: str - - type: Literal["refusal"] - - -MessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ - MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, - MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, -] - +class MessageChatCompletionSystemMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionSystemMessageParamContentUnionMember1]] -class MessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): - arguments: str + role: Literal["system"] - name: str + name: Optional[str] = None -class MessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): - arguments: str +class MessageChatCompletionFunctionMessageParam(BaseModel): + content: Optional[str] = None name: str - -class MessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): - id: str - - function: MessageChatCompletionAssistantMessageParamOutputToolCallFunction - - type: Literal["function"] - - -class MessageChatCompletionAssistantMessageParamOutput(BaseModel): - role: Literal["assistant"] - - audio: Optional[MessageChatCompletionAssistantMessageParamOutputAudio] = None - - content: Union[str, List[MessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None] = None - - function_call: Optional[MessageChatCompletionAssistantMessageParamOutputFunctionCall] = None - - name: Optional[str] = None - - refusal: Optional[str] = None - - tool_calls: Optional[List[MessageChatCompletionAssistantMessageParamOutputToolCall]] = None + role: Literal["function"] -class MessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): +class MessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class MessageChatCompletionToolMessageParam(BaseModel): - content: Union[str, List[MessageChatCompletionToolMessageParamContentUnionMember1]] - - role: Literal["tool"] - - tool_call_id: str - - -class MessageChatCompletionFunctionMessageParam(BaseModel): - content: Optional[str] = None +class MessageChatCompletionDeveloperMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionDeveloperMessageParamContentUnionMember1]] - name: str + role: Literal["developer"] - role: Literal["function"] + name: Optional[str] = None Message: TypeAlias = Union[ - MessageChatCompletionDeveloperMessageParam, - MessageChatCompletionSystemMessageParam, - MessageChatCompletionUserMessageParamOutput, MessageChatCompletionAssistantMessageParamOutput, MessageChatCompletionToolMessageParam, + MessageChatCompletionUserMessageParamOutput, + MessageChatCompletionSystemMessageParam, MessageChatCompletionFunctionMessageParam, + MessageChatCompletionDeveloperMessageParam, ] diff --git 
a/src/codex/types/projects/query_log_retrieve_response.py b/src/codex/types/projects/query_log_retrieve_response.py index 4324269..2751ef2 100644 --- a/src/codex/types/projects/query_log_retrieve_response.py +++ b/src/codex/types/projects/query_log_retrieve_response.py @@ -15,19 +15,6 @@ "Context", "DeterministicGuardrailsResults", "Message", - "MessageChatCompletionDeveloperMessageParam", - "MessageChatCompletionDeveloperMessageParamContentUnionMember1", - "MessageChatCompletionSystemMessageParam", - "MessageChatCompletionSystemMessageParamContentUnionMember1", - "MessageChatCompletionUserMessageParamOutput", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1File", - "MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", "MessageChatCompletionAssistantMessageParamOutput", "MessageChatCompletionAssistantMessageParamOutputAudio", "MessageChatCompletionAssistantMessageParamOutputContentUnionMember1", @@ -38,7 +25,20 @@ "MessageChatCompletionAssistantMessageParamOutputToolCallFunction", "MessageChatCompletionToolMessageParam", "MessageChatCompletionToolMessageParamContentUnionMember1", + "MessageChatCompletionUserMessageParamOutput", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1File", + "MessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", + "MessageChatCompletionSystemMessageParam", + "MessageChatCompletionSystemMessageParamContentUnionMember1", "MessageChatCompletionFunctionMessageParam", + "MessageChatCompletionDeveloperMessageParam", + "MessageChatCompletionDeveloperMessageParamContentUnionMember1", ] @@ -91,32 +91,78 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None -class MessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): +class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): + id: str + + +class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): text: str type: Literal["text"] -class MessageChatCompletionDeveloperMessageParam(BaseModel): - content: Union[str, List[MessageChatCompletionDeveloperMessageParamContentUnionMember1]] +class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( + BaseModel +): + refusal: str - role: 
Literal["developer"] + type: Literal["refusal"] + + +MessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, +] + + +class MessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): + arguments: str + + name: str + + +class MessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): + arguments: str + + name: str + + +class MessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): + id: str + + function: MessageChatCompletionAssistantMessageParamOutputToolCallFunction + + type: Literal["function"] + + +class MessageChatCompletionAssistantMessageParamOutput(BaseModel): + role: Literal["assistant"] + + audio: Optional[MessageChatCompletionAssistantMessageParamOutputAudio] = None + + content: Union[str, List[MessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None] = None + + function_call: Optional[MessageChatCompletionAssistantMessageParamOutputFunctionCall] = None name: Optional[str] = None + refusal: Optional[str] = None -class MessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): + tool_calls: Optional[List[MessageChatCompletionAssistantMessageParamOutputToolCall]] = None + + +class MessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class MessageChatCompletionSystemMessageParam(BaseModel): - content: Union[str, List[MessageChatCompletionSystemMessageParamContentUnionMember1]] +class MessageChatCompletionToolMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionToolMessageParamContentUnionMember1]] - role: Literal["system"] + role: Literal["tool"] - name: Optional[str] = None + tool_call_id: str class MessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): @@ -185,95 +231,49 @@ class MessageChatCompletionUserMessageParamOutput(BaseModel): name: Optional[str] = None -class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): - id: str - - -class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam(BaseModel): +class MessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( - BaseModel -): - refusal: str - - type: Literal["refusal"] - - -MessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ - MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, - MessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, -] - +class MessageChatCompletionSystemMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionSystemMessageParamContentUnionMember1]] -class MessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): - arguments: str + role: Literal["system"] - name: str + name: Optional[str] = None -class MessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): - arguments: str +class MessageChatCompletionFunctionMessageParam(BaseModel): + content: Optional[str] = None name: str - -class MessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): - id: str - - 
function: MessageChatCompletionAssistantMessageParamOutputToolCallFunction - - type: Literal["function"] - - -class MessageChatCompletionAssistantMessageParamOutput(BaseModel): - role: Literal["assistant"] - - audio: Optional[MessageChatCompletionAssistantMessageParamOutputAudio] = None - - content: Union[str, List[MessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None] = None - - function_call: Optional[MessageChatCompletionAssistantMessageParamOutputFunctionCall] = None - - name: Optional[str] = None - - refusal: Optional[str] = None - - tool_calls: Optional[List[MessageChatCompletionAssistantMessageParamOutputToolCall]] = None + role: Literal["function"] -class MessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): +class MessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class MessageChatCompletionToolMessageParam(BaseModel): - content: Union[str, List[MessageChatCompletionToolMessageParamContentUnionMember1]] - - role: Literal["tool"] - - tool_call_id: str - - -class MessageChatCompletionFunctionMessageParam(BaseModel): - content: Optional[str] = None +class MessageChatCompletionDeveloperMessageParam(BaseModel): + content: Union[str, List[MessageChatCompletionDeveloperMessageParamContentUnionMember1]] - name: str + role: Literal["developer"] - role: Literal["function"] + name: Optional[str] = None Message: TypeAlias = Union[ - MessageChatCompletionDeveloperMessageParam, - MessageChatCompletionSystemMessageParam, - MessageChatCompletionUserMessageParamOutput, MessageChatCompletionAssistantMessageParamOutput, MessageChatCompletionToolMessageParam, + MessageChatCompletionUserMessageParamOutput, + MessageChatCompletionSystemMessageParam, MessageChatCompletionFunctionMessageParam, + MessageChatCompletionDeveloperMessageParam, ] diff --git a/src/codex/types/projects/remediation_list_resolved_logs_response.py b/src/codex/types/projects/remediation_list_resolved_logs_response.py index cebfaf4..d56f9a4 100644 --- a/src/codex/types/projects/remediation_list_resolved_logs_response.py +++ b/src/codex/types/projects/remediation_list_resolved_logs_response.py @@ -16,19 +16,6 @@ "QueryLogContext", "QueryLogDeterministicGuardrailsResults", "QueryLogMessage", - "QueryLogMessageChatCompletionDeveloperMessageParam", - "QueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1", - "QueryLogMessageChatCompletionSystemMessageParam", - "QueryLogMessageChatCompletionSystemMessageParamContentUnionMember1", - "QueryLogMessageChatCompletionUserMessageParamOutput", - "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1", - "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", - "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", - "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", - "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", - "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", - "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1File", - "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", "QueryLogMessageChatCompletionAssistantMessageParamOutput", "QueryLogMessageChatCompletionAssistantMessageParamOutputAudio", 
"QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1", @@ -39,7 +26,20 @@ "QueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction", "QueryLogMessageChatCompletionToolMessageParam", "QueryLogMessageChatCompletionToolMessageParamContentUnionMember1", + "QueryLogMessageChatCompletionUserMessageParamOutput", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParam", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartImageParamImageURL", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParam", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1File", + "QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1FileFile", + "QueryLogMessageChatCompletionSystemMessageParam", + "QueryLogMessageChatCompletionSystemMessageParamContentUnionMember1", "QueryLogMessageChatCompletionFunctionMessageParam", + "QueryLogMessageChatCompletionDeveloperMessageParam", + "QueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1", ] @@ -92,32 +92,80 @@ class QueryLogDeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None -class QueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): +class QueryLogMessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): + id: str + + +class QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam( + BaseModel +): text: str type: Literal["text"] -class QueryLogMessageChatCompletionDeveloperMessageParam(BaseModel): - content: Union[str, List[QueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1]] +class QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( + BaseModel +): + refusal: str + + type: Literal["refusal"] - role: Literal["developer"] + +QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ + QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, + QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, +] + + +class QueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): + arguments: str + + name: str + + +class QueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): + arguments: str + + name: str + + +class QueryLogMessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): + id: str + + function: QueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction + + type: Literal["function"] + + +class QueryLogMessageChatCompletionAssistantMessageParamOutput(BaseModel): + role: Literal["assistant"] + + audio: Optional[QueryLogMessageChatCompletionAssistantMessageParamOutputAudio] = None + + content: Union[str, List[QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None] = None + + function_call: Optional[QueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall] = None name: 
Optional[str] = None + refusal: Optional[str] = None -class QueryLogMessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): + tool_calls: Optional[List[QueryLogMessageChatCompletionAssistantMessageParamOutputToolCall]] = None + + +class QueryLogMessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class QueryLogMessageChatCompletionSystemMessageParam(BaseModel): - content: Union[str, List[QueryLogMessageChatCompletionSystemMessageParamContentUnionMember1]] +class QueryLogMessageChatCompletionToolMessageParam(BaseModel): + content: Union[str, List[QueryLogMessageChatCompletionToolMessageParamContentUnionMember1]] - role: Literal["system"] + role: Literal["tool"] - name: Optional[str] = None + tool_call_id: str class QueryLogMessageChatCompletionUserMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam( @@ -190,97 +238,49 @@ class QueryLogMessageChatCompletionUserMessageParamOutput(BaseModel): name: Optional[str] = None -class QueryLogMessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): - id: str - - -class QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam( - BaseModel -): +class QueryLogMessageChatCompletionSystemMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam( - BaseModel -): - refusal: str - - type: Literal["refusal"] - - -QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1: TypeAlias = Union[ - QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartTextParam, - QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1ChatCompletionContentPartRefusalParam, -] - +class QueryLogMessageChatCompletionSystemMessageParam(BaseModel): + content: Union[str, List[QueryLogMessageChatCompletionSystemMessageParamContentUnionMember1]] -class QueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall(BaseModel): - arguments: str + role: Literal["system"] - name: str + name: Optional[str] = None -class QueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction(BaseModel): - arguments: str +class QueryLogMessageChatCompletionFunctionMessageParam(BaseModel): + content: Optional[str] = None name: str - -class QueryLogMessageChatCompletionAssistantMessageParamOutputToolCall(BaseModel): - id: str - - function: QueryLogMessageChatCompletionAssistantMessageParamOutputToolCallFunction - - type: Literal["function"] - - -class QueryLogMessageChatCompletionAssistantMessageParamOutput(BaseModel): - role: Literal["assistant"] - - audio: Optional[QueryLogMessageChatCompletionAssistantMessageParamOutputAudio] = None - - content: Union[str, List[QueryLogMessageChatCompletionAssistantMessageParamOutputContentUnionMember1], None] = None - - function_call: Optional[QueryLogMessageChatCompletionAssistantMessageParamOutputFunctionCall] = None - - name: Optional[str] = None - - refusal: Optional[str] = None - - tool_calls: Optional[List[QueryLogMessageChatCompletionAssistantMessageParamOutputToolCall]] = None + role: Literal["function"] -class QueryLogMessageChatCompletionToolMessageParamContentUnionMember1(BaseModel): +class QueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1(BaseModel): text: str type: Literal["text"] -class QueryLogMessageChatCompletionToolMessageParam(BaseModel): - content: 
Union[str, List[QueryLogMessageChatCompletionToolMessageParamContentUnionMember1]] - - role: Literal["tool"] - - tool_call_id: str - - -class QueryLogMessageChatCompletionFunctionMessageParam(BaseModel): - content: Optional[str] = None +class QueryLogMessageChatCompletionDeveloperMessageParam(BaseModel): + content: Union[str, List[QueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1]] - name: str + role: Literal["developer"] - role: Literal["function"] + name: Optional[str] = None QueryLogMessage: TypeAlias = Union[ - QueryLogMessageChatCompletionDeveloperMessageParam, - QueryLogMessageChatCompletionSystemMessageParam, - QueryLogMessageChatCompletionUserMessageParamOutput, QueryLogMessageChatCompletionAssistantMessageParamOutput, QueryLogMessageChatCompletionToolMessageParam, + QueryLogMessageChatCompletionUserMessageParamOutput, + QueryLogMessageChatCompletionSystemMessageParam, QueryLogMessageChatCompletionFunctionMessageParam, + QueryLogMessageChatCompletionDeveloperMessageParam, ] diff --git a/tests/api_resources/test_projects.py b/tests/api_resources/test_projects.py index 9312ca0..ae3f4f0 100644 --- a/tests/api_resources/test_projects.py +++ b/tests/api_resources/test_projects.py @@ -594,9 +594,25 @@ def test_method_validate_with_all_params(self, client: Codex) -> None: eval_scores={"foo": 0}, messages=[ { + "role": "assistant", + "audio": {"id": "id"}, "content": "string", - "role": "developer", + "function_call": { + "arguments": "arguments", + "name": "name", + }, "name": "name", + "refusal": "refusal", + "tool_calls": [ + { + "id": "id", + "function": { + "arguments": "arguments", + "name": "name", + }, + "type": "function", + } + ], } ], options={ @@ -1240,9 +1256,25 @@ async def test_method_validate_with_all_params(self, async_client: AsyncCodex) - eval_scores={"foo": 0}, messages=[ { + "role": "assistant", + "audio": {"id": "id"}, "content": "string", - "role": "developer", + "function_call": { + "arguments": "arguments", + "name": "name", + }, "name": "name", + "refusal": "refusal", + "tool_calls": [ + { + "id": "id", + "function": { + "arguments": "arguments", + "name": "name", + }, + "type": "function", + } + ], } ], options={ From 3039fdde263eb1a0da9b733958dcdcf653a4509b Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Fri, 25 Jul 2025 23:18:00 +0000 Subject: [PATCH 16/20] codegen metadata --- .stats.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.stats.yml b/.stats.yml index 9fb5140..19c8465 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ configured_endpoints: 54 -openapi_spec_hash: f7b67b502828e6d0ca3944d40d00d89b +openapi_spec_hash: 57e29e33aec4bbc20171ec3128594e75 config_hash: 8f6e5c3b064cbb77569a6bf654954a56 From 1a06cfc7c19943ac468b2ec9f2787215363cf77e Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Mon, 28 Jul 2025 17:03:15 +0000 Subject: [PATCH 17/20] feat(api): api update --- .stats.yml | 2 +- src/codex/resources/projects/projects.py | 74 +++++----- src/codex/resources/tlm.py | 128 +++++++++--------- src/codex/types/project_validate_params.py | 58 +++++--- .../query_log_list_by_group_response.py | 46 +++++++ .../query_log_list_groups_response.py | 46 +++++++ .../types/projects/query_log_list_response.py | 46 +++++++ .../projects/query_log_retrieve_response.py | 46 +++++++ ...remediation_list_resolved_logs_response.py | 46 +++++++ src/codex/types/tlm_prompt_params.py | 34 ++--- 
src/codex/types/tlm_score_params.py | 34 ++--- tests/api_resources/test_projects.py | 24 ++++ tests/api_resources/test_tlm.py | 4 + 13 files changed, 443 insertions(+), 145 deletions(-) diff --git a/.stats.yml b/.stats.yml index 19c8465..3fdd5d0 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ configured_endpoints: 54 -openapi_spec_hash: 57e29e33aec4bbc20171ec3128594e75 +openapi_spec_hash: 49989625bf633c5fdb3e11140f788f2d config_hash: 8f6e5c3b064cbb77569a6bf654954a56 diff --git a/src/codex/resources/projects/projects.py b/src/codex/resources/projects/projects.py index 3a109ed..f82bcd0 100644 --- a/src/codex/resources/projects/projects.py +++ b/src/codex/resources/projects/projects.py @@ -460,6 +460,7 @@ def validate( quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN, rewritten_question: Optional[str] | NotGiven = NOT_GIVEN, task: Optional[str] | NotGiven = NOT_GIVEN, + tools: Optional[Iterable[project_validate_params.Tool]] | NotGiven = NOT_GIVEN, x_client_library_version: str | NotGiven = NOT_GIVEN, x_integration_type: str | NotGiven = NOT_GIVEN, x_source: str | NotGiven = NOT_GIVEN, @@ -504,17 +505,16 @@ def validate( The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -550,12 +550,11 @@ def validate( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. 
+ The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -574,6 +573,8 @@ def validate( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. + prompt: The prompt to use for the TLM call. If not provided, the prompt will be generated from the messages. @@ -582,6 +583,9 @@ def validate( rewritten_question: The re-written query if it was provided by the client to Codex from a user to be used instead of the original query. + tools: Tools to use for the LLM call. If not provided, it is assumed no tools were + provided to the LLM. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -620,6 +624,7 @@ def validate( "quality_preset": quality_preset, "rewritten_question": rewritten_question, "task": task, + "tools": tools, }, project_validate_params.ProjectValidateParams, ), @@ -1028,6 +1033,7 @@ async def validate( quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN, rewritten_question: Optional[str] | NotGiven = NOT_GIVEN, task: Optional[str] | NotGiven = NOT_GIVEN, + tools: Optional[Iterable[project_validate_params.Tool]] | NotGiven = NOT_GIVEN, x_client_library_version: str | NotGiven = NOT_GIVEN, x_integration_type: str | NotGiven = NOT_GIVEN, x_source: str | NotGiven = NOT_GIVEN, @@ -1072,17 +1078,16 @@ async def validate( The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. 
+ - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -1118,12 +1123,11 @@ async def validate( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -1142,6 +1146,8 @@ async def validate( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. + prompt: The prompt to use for the TLM call. If not provided, the prompt will be generated from the messages. @@ -1150,6 +1156,9 @@ async def validate( rewritten_question: The re-written query if it was provided by the client to Codex from a user to be used instead of the original query. + tools: Tools to use for the LLM call. If not provided, it is assumed no tools were + provided to the LLM. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -1188,6 +1197,7 @@ async def validate( "quality_preset": quality_preset, "rewritten_question": rewritten_question, "task": task, + "tools": tools, }, project_validate_params.ProjectValidateParams, ), diff --git a/src/codex/resources/tlm.py b/src/codex/resources/tlm.py index 12ff6c0..c6064ed 100644 --- a/src/codex/resources/tlm.py +++ b/src/codex/resources/tlm.py @@ -79,17 +79,16 @@ def prompt( The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. 
This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -125,12 +124,11 @@ def prompt( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -149,6 +147,8 @@ def prompt( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. extra_headers: Send extra headers @@ -217,17 +217,16 @@ def score( The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. 
This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -263,12 +262,11 @@ def score( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -287,6 +285,8 @@ def score( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. 
extra_headers: Send extra headers @@ -371,17 +371,16 @@ async def prompt( The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -417,12 +416,11 @@ async def prompt( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -441,6 +439,8 @@ async def prompt( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. 
+ use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. extra_headers: Send extra headers @@ -509,17 +509,16 @@ async def score( The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -555,12 +554,11 @@ async def score( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. 
Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -579,6 +577,8 @@ async def score( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. extra_headers: Send extra headers diff --git a/src/codex/types/project_validate_params.py b/src/codex/types/project_validate_params.py index 081dd2a..6231367 100644 --- a/src/codex/types/project_validate_params.py +++ b/src/codex/types/project_validate_params.py @@ -54,6 +54,8 @@ "MessageChatCompletionDeveloperMessageParam", "MessageChatCompletionDeveloperMessageParamContentUnionMember1", "Options", + "Tool", + "ToolFunction", ] @@ -106,17 +108,16 @@ class ProjectValidateParams(TypedDict, total=False): The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -152,12 +153,11 @@ class ProjectValidateParams(TypedDict, total=False): strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. 
Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -175,6 +175,8 @@ class ProjectValidateParams(TypedDict, total=False): The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. """ prompt: Optional[str] @@ -194,6 +196,12 @@ class ProjectValidateParams(TypedDict, total=False): task: Optional[str] + tools: Optional[Iterable[Tool]] + """Tools to use for the LLM call. + + If not provided, it is assumed no tools were provided to the LLM. + """ + x_client_library_version: Annotated[str, PropertyInfo(alias="x-client-library-version")] x_integration_type: Annotated[str, PropertyInfo(alias="x-integration-type")] @@ -649,8 +657,26 @@ class Options(TypedDict, total=False): num_consistency_samples: int + num_self_reflections: int + reasoning_effort: str similarity_measure: str use_self_reflection: bool + + +class ToolFunction(TypedDict, total=False): + name: Required[str] + + description: str + + parameters: object + + strict: Optional[bool] + + +class Tool(TypedDict, total=False): + function: Required[ToolFunction] + + type: Required[Literal["function"]] diff --git a/src/codex/types/projects/query_log_list_by_group_response.py b/src/codex/types/projects/query_log_list_by_group_response.py index ae49b95..b3c774b 100644 --- a/src/codex/types/projects/query_log_list_by_group_response.py +++ b/src/codex/types/projects/query_log_list_by_group_response.py @@ -16,6 +16,8 @@ "QueryLogsByGroupQueryLogFormattedNonGuardrailEvalScores", "QueryLogsByGroupQueryLogContext", "QueryLogsByGroupQueryLogDeterministicGuardrailsResults", + "QueryLogsByGroupQueryLogEvaluatedResponseToolCall", + "QueryLogsByGroupQueryLogEvaluatedResponseToolCallFunction", "QueryLogsByGroupQueryLogMessage", "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutput", "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio", @@ -41,6 +43,8 @@ "QueryLogsByGroupQueryLogMessageChatCompletionFunctionMessageParam", "QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam", "QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1", + "QueryLogsByGroupQueryLogTool", + "QueryLogsByGroupQueryLogToolFunction", ] @@ -93,6 +97,20 @@ class QueryLogsByGroupQueryLogDeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class QueryLogsByGroupQueryLogEvaluatedResponseToolCallFunction(BaseModel): + arguments: str + + name: str + + +class QueryLogsByGroupQueryLogEvaluatedResponseToolCall(BaseModel): + id: str + + 
function: QueryLogsByGroupQueryLogEvaluatedResponseToolCallFunction + + type: Literal["function"] + + class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -287,6 +305,22 @@ class QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam(BaseMod ] +class QueryLogsByGroupQueryLogToolFunction(BaseModel): + name: str + + description: Optional[str] = None + + parameters: Optional[object] = None + + strict: Optional[bool] = None + + +class QueryLogsByGroupQueryLogTool(BaseModel): + function: QueryLogsByGroupQueryLogToolFunction + + type: Literal["function"] + + class QueryLogsByGroupQueryLog(BaseModel): id: str @@ -357,6 +391,12 @@ class QueryLogsByGroupQueryLog(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" + evaluated_response_tool_calls: Optional[List[QueryLogsByGroupQueryLogEvaluatedResponseToolCall]] = None + """Tool calls from the evaluated response, if any. + + Used to log tool calls in the query log. + """ + guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -383,6 +423,12 @@ class QueryLogsByGroupQueryLog(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + tools: Optional[List[QueryLogsByGroupQueryLogTool]] = None + """Tools to use for the LLM call. + + If not provided, it is assumed no tools were provided to the LLM. + """ + class QueryLogsByGroup(BaseModel): query_logs: List[QueryLogsByGroupQueryLog] diff --git a/src/codex/types/projects/query_log_list_groups_response.py b/src/codex/types/projects/query_log_list_groups_response.py index cc3b208..6ed4d14 100644 --- a/src/codex/types/projects/query_log_list_groups_response.py +++ b/src/codex/types/projects/query_log_list_groups_response.py @@ -14,6 +14,8 @@ "FormattedNonGuardrailEvalScores", "Context", "DeterministicGuardrailsResults", + "EvaluatedResponseToolCall", + "EvaluatedResponseToolCallFunction", "Message", "MessageChatCompletionAssistantMessageParamOutput", "MessageChatCompletionAssistantMessageParamOutputAudio", @@ -39,6 +41,8 @@ "MessageChatCompletionFunctionMessageParam", "MessageChatCompletionDeveloperMessageParam", "MessageChatCompletionDeveloperMessageParamContentUnionMember1", + "Tool", + "ToolFunction", ] @@ -91,6 +95,20 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class EvaluatedResponseToolCallFunction(BaseModel): + arguments: str + + name: str + + +class EvaluatedResponseToolCall(BaseModel): + id: str + + function: EvaluatedResponseToolCallFunction + + type: Literal["function"] + + class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -277,6 +295,22 @@ class MessageChatCompletionDeveloperMessageParam(BaseModel): ] +class ToolFunction(BaseModel): + name: str + + description: Optional[str] = None + + parameters: Optional[object] = None + + strict: Optional[bool] = None + + +class Tool(BaseModel): + function: ToolFunction + + type: Literal["function"] + + class QueryLogListGroupsResponse(BaseModel): id: str @@ -347,6 +381,12 @@ class QueryLogListGroupsResponse(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" + evaluated_response_tool_calls: Optional[List[EvaluatedResponseToolCall]] = None + """Tool calls from the evaluated response, if any. + + Used to log tool calls in the query log. 
+ """ + guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -372,3 +412,9 @@ class QueryLogListGroupsResponse(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + + tools: Optional[List[Tool]] = None + """Tools to use for the LLM call. + + If not provided, it is assumed no tools were provided to the LLM. + """ diff --git a/src/codex/types/projects/query_log_list_response.py b/src/codex/types/projects/query_log_list_response.py index 0778898..c6737b2 100644 --- a/src/codex/types/projects/query_log_list_response.py +++ b/src/codex/types/projects/query_log_list_response.py @@ -14,6 +14,8 @@ "FormattedNonGuardrailEvalScores", "Context", "DeterministicGuardrailsResults", + "EvaluatedResponseToolCall", + "EvaluatedResponseToolCallFunction", "Message", "MessageChatCompletionAssistantMessageParamOutput", "MessageChatCompletionAssistantMessageParamOutputAudio", @@ -39,6 +41,8 @@ "MessageChatCompletionFunctionMessageParam", "MessageChatCompletionDeveloperMessageParam", "MessageChatCompletionDeveloperMessageParamContentUnionMember1", + "Tool", + "ToolFunction", ] @@ -91,6 +95,20 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class EvaluatedResponseToolCallFunction(BaseModel): + arguments: str + + name: str + + +class EvaluatedResponseToolCall(BaseModel): + id: str + + function: EvaluatedResponseToolCallFunction + + type: Literal["function"] + + class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -277,6 +295,22 @@ class MessageChatCompletionDeveloperMessageParam(BaseModel): ] +class ToolFunction(BaseModel): + name: str + + description: Optional[str] = None + + parameters: Optional[object] = None + + strict: Optional[bool] = None + + +class Tool(BaseModel): + function: ToolFunction + + type: Literal["function"] + + class QueryLogListResponse(BaseModel): id: str @@ -341,6 +375,12 @@ class QueryLogListResponse(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" + evaluated_response_tool_calls: Optional[List[EvaluatedResponseToolCall]] = None + """Tool calls from the evaluated response, if any. + + Used to log tool calls in the query log. + """ + guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -366,3 +406,9 @@ class QueryLogListResponse(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + + tools: Optional[List[Tool]] = None + """Tools to use for the LLM call. + + If not provided, it is assumed no tools were provided to the LLM. 
+ """ diff --git a/src/codex/types/projects/query_log_retrieve_response.py b/src/codex/types/projects/query_log_retrieve_response.py index 2751ef2..8fd8662 100644 --- a/src/codex/types/projects/query_log_retrieve_response.py +++ b/src/codex/types/projects/query_log_retrieve_response.py @@ -14,6 +14,8 @@ "FormattedNonGuardrailEvalScores", "Context", "DeterministicGuardrailsResults", + "EvaluatedResponseToolCall", + "EvaluatedResponseToolCallFunction", "Message", "MessageChatCompletionAssistantMessageParamOutput", "MessageChatCompletionAssistantMessageParamOutputAudio", @@ -39,6 +41,8 @@ "MessageChatCompletionFunctionMessageParam", "MessageChatCompletionDeveloperMessageParam", "MessageChatCompletionDeveloperMessageParamContentUnionMember1", + "Tool", + "ToolFunction", ] @@ -91,6 +95,20 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class EvaluatedResponseToolCallFunction(BaseModel): + arguments: str + + name: str + + +class EvaluatedResponseToolCall(BaseModel): + id: str + + function: EvaluatedResponseToolCallFunction + + type: Literal["function"] + + class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -277,6 +295,22 @@ class MessageChatCompletionDeveloperMessageParam(BaseModel): ] +class ToolFunction(BaseModel): + name: str + + description: Optional[str] = None + + parameters: Optional[object] = None + + strict: Optional[bool] = None + + +class Tool(BaseModel): + function: ToolFunction + + type: Literal["function"] + + class QueryLogRetrieveResponse(BaseModel): id: str @@ -345,6 +379,12 @@ class QueryLogRetrieveResponse(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" + evaluated_response_tool_calls: Optional[List[EvaluatedResponseToolCall]] = None + """Tool calls from the evaluated response, if any. + + Used to log tool calls in the query log. + """ + guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -370,3 +410,9 @@ class QueryLogRetrieveResponse(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + + tools: Optional[List[Tool]] = None + """Tools to use for the LLM call. + + If not provided, it is assumed no tools were provided to the LLM. 
+ """ diff --git a/src/codex/types/projects/remediation_list_resolved_logs_response.py b/src/codex/types/projects/remediation_list_resolved_logs_response.py index d56f9a4..567a086 100644 --- a/src/codex/types/projects/remediation_list_resolved_logs_response.py +++ b/src/codex/types/projects/remediation_list_resolved_logs_response.py @@ -15,6 +15,8 @@ "QueryLogFormattedNonGuardrailEvalScores", "QueryLogContext", "QueryLogDeterministicGuardrailsResults", + "QueryLogEvaluatedResponseToolCall", + "QueryLogEvaluatedResponseToolCallFunction", "QueryLogMessage", "QueryLogMessageChatCompletionAssistantMessageParamOutput", "QueryLogMessageChatCompletionAssistantMessageParamOutputAudio", @@ -40,6 +42,8 @@ "QueryLogMessageChatCompletionFunctionMessageParam", "QueryLogMessageChatCompletionDeveloperMessageParam", "QueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1", + "QueryLogTool", + "QueryLogToolFunction", ] @@ -92,6 +96,20 @@ class QueryLogDeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class QueryLogEvaluatedResponseToolCallFunction(BaseModel): + arguments: str + + name: str + + +class QueryLogEvaluatedResponseToolCall(BaseModel): + id: str + + function: QueryLogEvaluatedResponseToolCallFunction + + type: Literal["function"] + + class QueryLogMessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -284,6 +302,22 @@ class QueryLogMessageChatCompletionDeveloperMessageParam(BaseModel): ] +class QueryLogToolFunction(BaseModel): + name: str + + description: Optional[str] = None + + parameters: Optional[object] = None + + strict: Optional[bool] = None + + +class QueryLogTool(BaseModel): + function: QueryLogToolFunction + + type: Literal["function"] + + class QueryLog(BaseModel): id: str @@ -348,6 +382,12 @@ class QueryLog(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" + evaluated_response_tool_calls: Optional[List[QueryLogEvaluatedResponseToolCall]] = None + """Tool calls from the evaluated response, if any. + + Used to log tool calls in the query log. + """ + guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -374,6 +414,12 @@ class QueryLog(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + tools: Optional[List[QueryLogTool]] = None + """Tools to use for the LLM call. + + If not provided, it is assumed no tools were provided to the LLM. + """ + class RemediationListResolvedLogsResponse(BaseModel): query_logs: List[QueryLog] diff --git a/src/codex/types/tlm_prompt_params.py b/src/codex/types/tlm_prompt_params.py index 3c04bfc..8749c5a 100644 --- a/src/codex/types/tlm_prompt_params.py +++ b/src/codex/types/tlm_prompt_params.py @@ -30,17 +30,16 @@ class TlmPromptParams(TypedDict, total=False): The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. 
When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -76,12 +75,11 @@ class TlmPromptParams(TypedDict, total=False): strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -99,6 +97,8 @@ class TlmPromptParams(TypedDict, total=False): The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. 
""" quality_preset: Literal["best", "high", "medium", "low", "base"] @@ -120,6 +120,8 @@ class Options(TypedDict, total=False): num_consistency_samples: int + num_self_reflections: int + reasoning_effort: str similarity_measure: str diff --git a/src/codex/types/tlm_score_params.py b/src/codex/types/tlm_score_params.py index 95bcc4c..4a0a32a 100644 --- a/src/codex/types/tlm_score_params.py +++ b/src/codex/types/tlm_score_params.py @@ -32,17 +32,16 @@ class TlmScoreParams(TypedDict, total=False): The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -78,12 +77,11 @@ class TlmScoreParams(TypedDict, total=False): strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. 
- similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -101,6 +99,8 @@ class TlmScoreParams(TypedDict, total=False): The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. """ quality_preset: Literal["best", "high", "medium", "low", "base"] @@ -122,6 +122,8 @@ class Options(TypedDict, total=False): num_consistency_samples: int + num_self_reflections: int + reasoning_effort: str similarity_measure: str diff --git a/tests/api_resources/test_projects.py b/tests/api_resources/test_projects.py index ae3f4f0..7884db0 100644 --- a/tests/api_resources/test_projects.py +++ b/tests/api_resources/test_projects.py @@ -622,6 +622,7 @@ def test_method_validate_with_all_params(self, client: Codex) -> None: "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, + "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -630,6 +631,17 @@ def test_method_validate_with_all_params(self, client: Codex) -> None: quality_preset="best", rewritten_question="rewritten_question", task="task", + tools=[ + { + "function": { + "name": "name", + "description": "description", + "parameters": {}, + "strict": True, + }, + "type": "function", + } + ], x_client_library_version="x-client-library-version", x_integration_type="x-integration-type", x_source="x-source", @@ -1284,6 +1296,7 @@ async def test_method_validate_with_all_params(self, async_client: AsyncCodex) - "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, + "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -1292,6 +1305,17 @@ async def test_method_validate_with_all_params(self, async_client: AsyncCodex) - quality_preset="best", rewritten_question="rewritten_question", task="task", + tools=[ + { + "function": { + "name": "name", + "description": "description", + "parameters": {}, + "strict": True, + }, + "type": "function", + } + ], x_client_library_version="x-client-library-version", x_integration_type="x-integration-type", x_source="x-source", diff --git a/tests/api_resources/test_tlm.py b/tests/api_resources/test_tlm.py index 41376a4..da0a9ad 100644 --- a/tests/api_resources/test_tlm.py +++ b/tests/api_resources/test_tlm.py @@ -38,6 +38,7 @@ def test_method_prompt_with_all_params(self, client: Codex) -> None: "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, + "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -96,6 +97,7 @@ def test_method_score_with_all_params(self, client: Codex) -> None: "model": "model", "num_candidate_responses": 0, 
"num_consistency_samples": 0, + "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -160,6 +162,7 @@ async def test_method_prompt_with_all_params(self, async_client: AsyncCodex) -> "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, + "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -218,6 +221,7 @@ async def test_method_score_with_all_params(self, async_client: AsyncCodex) -> N "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, + "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, From 6b52a985af9df1b6618d0685fafee2bae7e98566 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Mon, 28 Jul 2025 17:04:05 +0000 Subject: [PATCH 18/20] feat(api): remove deprecated endpoint increment_queries --- .stats.yml | 4 +- src/codex/resources/projects/projects.py | 74 +++++----- src/codex/resources/tlm.py | 128 +++++++++--------- src/codex/types/project_validate_params.py | 58 +++----- .../query_log_list_by_group_response.py | 46 ------- .../query_log_list_groups_response.py | 46 ------- .../types/projects/query_log_list_response.py | 46 ------- .../projects/query_log_retrieve_response.py | 46 ------- ...remediation_list_resolved_logs_response.py | 46 ------- src/codex/types/tlm_prompt_params.py | 34 +++-- src/codex/types/tlm_score_params.py | 34 +++-- tests/api_resources/test_projects.py | 24 ---- tests/api_resources/test_tlm.py | 4 - 13 files changed, 146 insertions(+), 444 deletions(-) diff --git a/.stats.yml b/.stats.yml index 3fdd5d0..031dedf 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ configured_endpoints: 54 -openapi_spec_hash: 49989625bf633c5fdb3e11140f788f2d -config_hash: 8f6e5c3b064cbb77569a6bf654954a56 +openapi_spec_hash: 57e29e33aec4bbc20171ec3128594e75 +config_hash: 930284cfa37f835d949c8a1b124f4807 diff --git a/src/codex/resources/projects/projects.py b/src/codex/resources/projects/projects.py index f82bcd0..3a109ed 100644 --- a/src/codex/resources/projects/projects.py +++ b/src/codex/resources/projects/projects.py @@ -460,7 +460,6 @@ def validate( quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN, rewritten_question: Optional[str] | NotGiven = NOT_GIVEN, task: Optional[str] | NotGiven = NOT_GIVEN, - tools: Optional[Iterable[project_validate_params.Tool]] | NotGiven = NOT_GIVEN, x_client_library_version: str | NotGiven = NOT_GIVEN, x_integration_type: str | NotGiven = NOT_GIVEN, x_source: str | NotGiven = NOT_GIVEN, @@ -505,16 +504,17 @@ def validate( The default values corresponding to each quality preset are: - - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"none"`. - - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, - `reasoning_effort` = `"none"`. + - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, + `use_self_reflection` = True. 
This preset improves LLM responses. + - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, + `use_self_reflection` = True. + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, + `use_self_reflection` = True. + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, + `use_self_reflection` = False. When using `get_trustworthiness_score()` on + "base" preset, a faster self-reflection is employed. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -550,11 +550,12 @@ def validate( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -573,8 +574,6 @@ def validate( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. - use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. - prompt: The prompt to use for the TLM call. If not provided, the prompt will be generated from the messages. @@ -583,9 +582,6 @@ def validate( rewritten_question: The re-written query if it was provided by the client to Codex from a user to be used instead of the original query. - tools: Tools to use for the LLM call. If not provided, it is assumed no tools were - provided to the LLM. 
- extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -624,7 +620,6 @@ def validate( "quality_preset": quality_preset, "rewritten_question": rewritten_question, "task": task, - "tools": tools, }, project_validate_params.ProjectValidateParams, ), @@ -1033,7 +1028,6 @@ async def validate( quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN, rewritten_question: Optional[str] | NotGiven = NOT_GIVEN, task: Optional[str] | NotGiven = NOT_GIVEN, - tools: Optional[Iterable[project_validate_params.Tool]] | NotGiven = NOT_GIVEN, x_client_library_version: str | NotGiven = NOT_GIVEN, x_integration_type: str | NotGiven = NOT_GIVEN, x_source: str | NotGiven = NOT_GIVEN, @@ -1078,16 +1072,17 @@ async def validate( The default values corresponding to each quality preset are: - - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"none"`. - - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, - `reasoning_effort` = `"none"`. + - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, + `use_self_reflection` = True. + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, + `use_self_reflection` = True. + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, + `use_self_reflection` = False. When using `get_trustworthiness_score()` on + "base" preset, a faster self-reflection is employed. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -1123,11 +1118,12 @@ async def validate( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. 
- similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -1146,8 +1142,6 @@ async def validate( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. - use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. - prompt: The prompt to use for the TLM call. If not provided, the prompt will be generated from the messages. @@ -1156,9 +1150,6 @@ async def validate( rewritten_question: The re-written query if it was provided by the client to Codex from a user to be used instead of the original query. - tools: Tools to use for the LLM call. If not provided, it is assumed no tools were - provided to the LLM. - extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -1197,7 +1188,6 @@ async def validate( "quality_preset": quality_preset, "rewritten_question": rewritten_question, "task": task, - "tools": tools, }, project_validate_params.ProjectValidateParams, ), diff --git a/src/codex/resources/tlm.py b/src/codex/resources/tlm.py index c6064ed..12ff6c0 100644 --- a/src/codex/resources/tlm.py +++ b/src/codex/resources/tlm.py @@ -79,16 +79,17 @@ def prompt( The default values corresponding to each quality preset are: - - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"none"`. - - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, - `reasoning_effort` = `"none"`. + - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, + `use_self_reflection` = True. + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, + `use_self_reflection` = True. + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, + `use_self_reflection` = False. When using `get_trustworthiness_score()` on + "base" preset, a faster self-reflection is employed. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -124,11 +125,12 @@ def prompt( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. 
- num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -147,8 +149,6 @@ def prompt( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. - use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. - quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. extra_headers: Send extra headers @@ -217,16 +217,17 @@ def score( The default values corresponding to each quality preset are: - - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"none"`. - - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, - `reasoning_effort` = `"none"`. + - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, + `use_self_reflection` = True. + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, + `use_self_reflection` = True. + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, + `use_self_reflection` = False. When using `get_trustworthiness_score()` on + "base" preset, a faster self-reflection is employed. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -262,11 +263,12 @@ def score( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. 
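A usage illustration for the prompt endpoint documented above, as a sketch under stated assumptions rather than code from this patch: the import path and the prompt keyword are assumed from the package layout and tests, and the client is assumed to read its credentials from the environment.

from codex import Codex  # assumed public import, matching the src/codex package layout

client = Codex()  # assumed to read its API key from the environment

# Hypothetical prompt call; the `prompt` keyword is an assumption, while `options` and
# `quality_preset` mirror the parameters documented above.
result = client.tlm.prompt(
    prompt="Summarize our refund policy in one sentence.",
    quality_preset="low",
    options={"max_tokens": 128},
)
print(result)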
TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -285,8 +287,6 @@ def score( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. - use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. - quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. extra_headers: Send extra headers @@ -371,16 +371,17 @@ async def prompt( The default values corresponding to each quality preset are: - - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"none"`. - - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, - `reasoning_effort` = `"none"`. + - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, + `use_self_reflection` = True. + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, + `use_self_reflection` = True. + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, + `use_self_reflection` = False. When using `get_trustworthiness_score()` on + "base" preset, a faster self-reflection is employed. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. 
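A matching sketch for the scoring endpoint described here; the prompt/response keyword names are assumptions (they are not shown in this hunk), while options and quality presets mirror the documented parameters.

from codex import Codex  # assumed public import

client = Codex()  # assumed to read credentials from the environment

# Hypothetical scoring call for an existing prompt/response pair.
score = client.tlm.score(
    prompt="What is the capital of France?",      # assumed keyword
    response="The capital of France is Paris.",   # assumed keyword
    options={"num_consistency_samples": 4, "similarity_measure": "semantic"},
)
print(score)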
You can set custom values for these @@ -416,11 +417,12 @@ async def prompt( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -439,8 +441,6 @@ async def prompt( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. - use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. - quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. extra_headers: Send extra headers @@ -509,16 +509,17 @@ async def score( The default values corresponding to each quality preset are: - - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"none"`. - - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, - `reasoning_effort` = `"none"`. + - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, + `use_self_reflection` = True. + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, + `use_self_reflection` = True. + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, + `use_self_reflection` = False. 
When using `get_trustworthiness_score()` on + "base" preset, a faster self-reflection is employed. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -554,11 +555,12 @@ async def score( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -577,8 +579,6 @@ async def score( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. - use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. - quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. extra_headers: Send extra headers diff --git a/src/codex/types/project_validate_params.py b/src/codex/types/project_validate_params.py index 6231367..081dd2a 100644 --- a/src/codex/types/project_validate_params.py +++ b/src/codex/types/project_validate_params.py @@ -54,8 +54,6 @@ "MessageChatCompletionDeveloperMessageParam", "MessageChatCompletionDeveloperMessageParamContentUnionMember1", "Options", - "Tool", - "ToolFunction", ] @@ -108,16 +106,17 @@ class ProjectValidateParams(TypedDict, total=False): The default values corresponding to each quality preset are: - - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"none"`. - - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, - `reasoning_effort` = `"none"`. 
+ - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, + `use_self_reflection` = True. + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, + `use_self_reflection` = True. + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, + `use_self_reflection` = False. When using `get_trustworthiness_score()` on + "base" preset, a faster self-reflection is employed. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -153,11 +152,12 @@ class ProjectValidateParams(TypedDict, total=False): strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -175,8 +175,6 @@ class ProjectValidateParams(TypedDict, total=False): The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. - - use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. """ prompt: Optional[str] @@ -196,12 +194,6 @@ class ProjectValidateParams(TypedDict, total=False): task: Optional[str] - tools: Optional[Iterable[Tool]] - """Tools to use for the LLM call. - - If not provided, it is assumed no tools were provided to the LLM. 
- """ - x_client_library_version: Annotated[str, PropertyInfo(alias="x-client-library-version")] x_integration_type: Annotated[str, PropertyInfo(alias="x-integration-type")] @@ -657,26 +649,8 @@ class Options(TypedDict, total=False): num_consistency_samples: int - num_self_reflections: int - reasoning_effort: str similarity_measure: str use_self_reflection: bool - - -class ToolFunction(TypedDict, total=False): - name: Required[str] - - description: str - - parameters: object - - strict: Optional[bool] - - -class Tool(TypedDict, total=False): - function: Required[ToolFunction] - - type: Required[Literal["function"]] diff --git a/src/codex/types/projects/query_log_list_by_group_response.py b/src/codex/types/projects/query_log_list_by_group_response.py index b3c774b..ae49b95 100644 --- a/src/codex/types/projects/query_log_list_by_group_response.py +++ b/src/codex/types/projects/query_log_list_by_group_response.py @@ -16,8 +16,6 @@ "QueryLogsByGroupQueryLogFormattedNonGuardrailEvalScores", "QueryLogsByGroupQueryLogContext", "QueryLogsByGroupQueryLogDeterministicGuardrailsResults", - "QueryLogsByGroupQueryLogEvaluatedResponseToolCall", - "QueryLogsByGroupQueryLogEvaluatedResponseToolCallFunction", "QueryLogsByGroupQueryLogMessage", "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutput", "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio", @@ -43,8 +41,6 @@ "QueryLogsByGroupQueryLogMessageChatCompletionFunctionMessageParam", "QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam", "QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1", - "QueryLogsByGroupQueryLogTool", - "QueryLogsByGroupQueryLogToolFunction", ] @@ -97,20 +93,6 @@ class QueryLogsByGroupQueryLogDeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None -class QueryLogsByGroupQueryLogEvaluatedResponseToolCallFunction(BaseModel): - arguments: str - - name: str - - -class QueryLogsByGroupQueryLogEvaluatedResponseToolCall(BaseModel): - id: str - - function: QueryLogsByGroupQueryLogEvaluatedResponseToolCallFunction - - type: Literal["function"] - - class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -305,22 +287,6 @@ class QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam(BaseMod ] -class QueryLogsByGroupQueryLogToolFunction(BaseModel): - name: str - - description: Optional[str] = None - - parameters: Optional[object] = None - - strict: Optional[bool] = None - - -class QueryLogsByGroupQueryLogTool(BaseModel): - function: QueryLogsByGroupQueryLogToolFunction - - type: Literal["function"] - - class QueryLogsByGroupQueryLog(BaseModel): id: str @@ -391,12 +357,6 @@ class QueryLogsByGroupQueryLog(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" - evaluated_response_tool_calls: Optional[List[QueryLogsByGroupQueryLogEvaluatedResponseToolCall]] = None - """Tool calls from the evaluated response, if any. - - Used to log tool calls in the query log. - """ - guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -423,12 +383,6 @@ class QueryLogsByGroupQueryLog(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" - tools: Optional[List[QueryLogsByGroupQueryLogTool]] = None - """Tools to use for the LLM call. - - If not provided, it is assumed no tools were provided to the LLM. 
- """ - class QueryLogsByGroup(BaseModel): query_logs: List[QueryLogsByGroupQueryLog] diff --git a/src/codex/types/projects/query_log_list_groups_response.py b/src/codex/types/projects/query_log_list_groups_response.py index 6ed4d14..cc3b208 100644 --- a/src/codex/types/projects/query_log_list_groups_response.py +++ b/src/codex/types/projects/query_log_list_groups_response.py @@ -14,8 +14,6 @@ "FormattedNonGuardrailEvalScores", "Context", "DeterministicGuardrailsResults", - "EvaluatedResponseToolCall", - "EvaluatedResponseToolCallFunction", "Message", "MessageChatCompletionAssistantMessageParamOutput", "MessageChatCompletionAssistantMessageParamOutputAudio", @@ -41,8 +39,6 @@ "MessageChatCompletionFunctionMessageParam", "MessageChatCompletionDeveloperMessageParam", "MessageChatCompletionDeveloperMessageParamContentUnionMember1", - "Tool", - "ToolFunction", ] @@ -95,20 +91,6 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None -class EvaluatedResponseToolCallFunction(BaseModel): - arguments: str - - name: str - - -class EvaluatedResponseToolCall(BaseModel): - id: str - - function: EvaluatedResponseToolCallFunction - - type: Literal["function"] - - class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -295,22 +277,6 @@ class MessageChatCompletionDeveloperMessageParam(BaseModel): ] -class ToolFunction(BaseModel): - name: str - - description: Optional[str] = None - - parameters: Optional[object] = None - - strict: Optional[bool] = None - - -class Tool(BaseModel): - function: ToolFunction - - type: Literal["function"] - - class QueryLogListGroupsResponse(BaseModel): id: str @@ -381,12 +347,6 @@ class QueryLogListGroupsResponse(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" - evaluated_response_tool_calls: Optional[List[EvaluatedResponseToolCall]] = None - """Tool calls from the evaluated response, if any. - - Used to log tool calls in the query log. - """ - guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -412,9 +372,3 @@ class QueryLogListGroupsResponse(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" - - tools: Optional[List[Tool]] = None - """Tools to use for the LLM call. - - If not provided, it is assumed no tools were provided to the LLM. 
- """ diff --git a/src/codex/types/projects/query_log_list_response.py b/src/codex/types/projects/query_log_list_response.py index c6737b2..0778898 100644 --- a/src/codex/types/projects/query_log_list_response.py +++ b/src/codex/types/projects/query_log_list_response.py @@ -14,8 +14,6 @@ "FormattedNonGuardrailEvalScores", "Context", "DeterministicGuardrailsResults", - "EvaluatedResponseToolCall", - "EvaluatedResponseToolCallFunction", "Message", "MessageChatCompletionAssistantMessageParamOutput", "MessageChatCompletionAssistantMessageParamOutputAudio", @@ -41,8 +39,6 @@ "MessageChatCompletionFunctionMessageParam", "MessageChatCompletionDeveloperMessageParam", "MessageChatCompletionDeveloperMessageParamContentUnionMember1", - "Tool", - "ToolFunction", ] @@ -95,20 +91,6 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None -class EvaluatedResponseToolCallFunction(BaseModel): - arguments: str - - name: str - - -class EvaluatedResponseToolCall(BaseModel): - id: str - - function: EvaluatedResponseToolCallFunction - - type: Literal["function"] - - class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -295,22 +277,6 @@ class MessageChatCompletionDeveloperMessageParam(BaseModel): ] -class ToolFunction(BaseModel): - name: str - - description: Optional[str] = None - - parameters: Optional[object] = None - - strict: Optional[bool] = None - - -class Tool(BaseModel): - function: ToolFunction - - type: Literal["function"] - - class QueryLogListResponse(BaseModel): id: str @@ -375,12 +341,6 @@ class QueryLogListResponse(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" - evaluated_response_tool_calls: Optional[List[EvaluatedResponseToolCall]] = None - """Tool calls from the evaluated response, if any. - - Used to log tool calls in the query log. - """ - guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -406,9 +366,3 @@ class QueryLogListResponse(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" - - tools: Optional[List[Tool]] = None - """Tools to use for the LLM call. - - If not provided, it is assumed no tools were provided to the LLM. 
- """ diff --git a/src/codex/types/projects/query_log_retrieve_response.py b/src/codex/types/projects/query_log_retrieve_response.py index 8fd8662..2751ef2 100644 --- a/src/codex/types/projects/query_log_retrieve_response.py +++ b/src/codex/types/projects/query_log_retrieve_response.py @@ -14,8 +14,6 @@ "FormattedNonGuardrailEvalScores", "Context", "DeterministicGuardrailsResults", - "EvaluatedResponseToolCall", - "EvaluatedResponseToolCallFunction", "Message", "MessageChatCompletionAssistantMessageParamOutput", "MessageChatCompletionAssistantMessageParamOutputAudio", @@ -41,8 +39,6 @@ "MessageChatCompletionFunctionMessageParam", "MessageChatCompletionDeveloperMessageParam", "MessageChatCompletionDeveloperMessageParamContentUnionMember1", - "Tool", - "ToolFunction", ] @@ -95,20 +91,6 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None -class EvaluatedResponseToolCallFunction(BaseModel): - arguments: str - - name: str - - -class EvaluatedResponseToolCall(BaseModel): - id: str - - function: EvaluatedResponseToolCallFunction - - type: Literal["function"] - - class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -295,22 +277,6 @@ class MessageChatCompletionDeveloperMessageParam(BaseModel): ] -class ToolFunction(BaseModel): - name: str - - description: Optional[str] = None - - parameters: Optional[object] = None - - strict: Optional[bool] = None - - -class Tool(BaseModel): - function: ToolFunction - - type: Literal["function"] - - class QueryLogRetrieveResponse(BaseModel): id: str @@ -379,12 +345,6 @@ class QueryLogRetrieveResponse(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" - evaluated_response_tool_calls: Optional[List[EvaluatedResponseToolCall]] = None - """Tool calls from the evaluated response, if any. - - Used to log tool calls in the query log. - """ - guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -410,9 +370,3 @@ class QueryLogRetrieveResponse(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" - - tools: Optional[List[Tool]] = None - """Tools to use for the LLM call. - - If not provided, it is assumed no tools were provided to the LLM. 
- """ diff --git a/src/codex/types/projects/remediation_list_resolved_logs_response.py b/src/codex/types/projects/remediation_list_resolved_logs_response.py index 567a086..d56f9a4 100644 --- a/src/codex/types/projects/remediation_list_resolved_logs_response.py +++ b/src/codex/types/projects/remediation_list_resolved_logs_response.py @@ -15,8 +15,6 @@ "QueryLogFormattedNonGuardrailEvalScores", "QueryLogContext", "QueryLogDeterministicGuardrailsResults", - "QueryLogEvaluatedResponseToolCall", - "QueryLogEvaluatedResponseToolCallFunction", "QueryLogMessage", "QueryLogMessageChatCompletionAssistantMessageParamOutput", "QueryLogMessageChatCompletionAssistantMessageParamOutputAudio", @@ -42,8 +40,6 @@ "QueryLogMessageChatCompletionFunctionMessageParam", "QueryLogMessageChatCompletionDeveloperMessageParam", "QueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1", - "QueryLogTool", - "QueryLogToolFunction", ] @@ -96,20 +92,6 @@ class QueryLogDeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None -class QueryLogEvaluatedResponseToolCallFunction(BaseModel): - arguments: str - - name: str - - -class QueryLogEvaluatedResponseToolCall(BaseModel): - id: str - - function: QueryLogEvaluatedResponseToolCallFunction - - type: Literal["function"] - - class QueryLogMessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -302,22 +284,6 @@ class QueryLogMessageChatCompletionDeveloperMessageParam(BaseModel): ] -class QueryLogToolFunction(BaseModel): - name: str - - description: Optional[str] = None - - parameters: Optional[object] = None - - strict: Optional[bool] = None - - -class QueryLogTool(BaseModel): - function: QueryLogToolFunction - - type: Literal["function"] - - class QueryLog(BaseModel): id: str @@ -382,12 +348,6 @@ class QueryLog(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" - evaluated_response_tool_calls: Optional[List[QueryLogEvaluatedResponseToolCall]] = None - """Tool calls from the evaluated response, if any. - - Used to log tool calls in the query log. - """ - guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -414,12 +374,6 @@ class QueryLog(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" - tools: Optional[List[QueryLogTool]] = None - """Tools to use for the LLM call. - - If not provided, it is assumed no tools were provided to the LLM. - """ - class RemediationListResolvedLogsResponse(BaseModel): query_logs: List[QueryLog] diff --git a/src/codex/types/tlm_prompt_params.py b/src/codex/types/tlm_prompt_params.py index 8749c5a..3c04bfc 100644 --- a/src/codex/types/tlm_prompt_params.py +++ b/src/codex/types/tlm_prompt_params.py @@ -30,16 +30,17 @@ class TlmPromptParams(TypedDict, total=False): The default values corresponding to each quality preset are: - - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"none"`. - - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, - `reasoning_effort` = `"none"`. 
+ - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, + `use_self_reflection` = True. + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, + `use_self_reflection` = True. + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, + `use_self_reflection` = False. When using `get_trustworthiness_score()` on + "base" preset, a faster self-reflection is employed. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -75,11 +76,12 @@ class TlmPromptParams(TypedDict, total=False): strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -97,8 +99,6 @@ class TlmPromptParams(TypedDict, total=False): The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. - - use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. 
""" quality_preset: Literal["best", "high", "medium", "low", "base"] @@ -120,8 +120,6 @@ class Options(TypedDict, total=False): num_consistency_samples: int - num_self_reflections: int - reasoning_effort: str similarity_measure: str diff --git a/src/codex/types/tlm_score_params.py b/src/codex/types/tlm_score_params.py index 4a0a32a..95bcc4c 100644 --- a/src/codex/types/tlm_score_params.py +++ b/src/codex/types/tlm_score_params.py @@ -32,16 +32,17 @@ class TlmScoreParams(TypedDict, total=False): The default values corresponding to each quality preset are: - - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"high"`. - - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, - `reasoning_effort` = `"none"`. - - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, - `reasoning_effort` = `"none"`. + - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, + `use_self_reflection` = True. + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, + `use_self_reflection` = True. + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, + `use_self_reflection` = False. When using `get_trustworthiness_score()` on + "base" preset, a faster self-reflection is employed. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -77,11 +78,12 @@ class TlmScoreParams(TypedDict, total=False): strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. 
- similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -99,8 +101,6 @@ class TlmScoreParams(TypedDict, total=False): The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. - - use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. """ quality_preset: Literal["best", "high", "medium", "low", "base"] @@ -122,8 +122,6 @@ class Options(TypedDict, total=False): num_consistency_samples: int - num_self_reflections: int - reasoning_effort: str similarity_measure: str diff --git a/tests/api_resources/test_projects.py b/tests/api_resources/test_projects.py index 7884db0..ae3f4f0 100644 --- a/tests/api_resources/test_projects.py +++ b/tests/api_resources/test_projects.py @@ -622,7 +622,6 @@ def test_method_validate_with_all_params(self, client: Codex) -> None: "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, - "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -631,17 +630,6 @@ def test_method_validate_with_all_params(self, client: Codex) -> None: quality_preset="best", rewritten_question="rewritten_question", task="task", - tools=[ - { - "function": { - "name": "name", - "description": "description", - "parameters": {}, - "strict": True, - }, - "type": "function", - } - ], x_client_library_version="x-client-library-version", x_integration_type="x-integration-type", x_source="x-source", @@ -1296,7 +1284,6 @@ async def test_method_validate_with_all_params(self, async_client: AsyncCodex) - "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, - "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -1305,17 +1292,6 @@ async def test_method_validate_with_all_params(self, async_client: AsyncCodex) - quality_preset="best", rewritten_question="rewritten_question", task="task", - tools=[ - { - "function": { - "name": "name", - "description": "description", - "parameters": {}, - "strict": True, - }, - "type": "function", - } - ], x_client_library_version="x-client-library-version", x_integration_type="x-integration-type", x_source="x-source", diff --git a/tests/api_resources/test_tlm.py b/tests/api_resources/test_tlm.py index da0a9ad..41376a4 100644 --- a/tests/api_resources/test_tlm.py +++ b/tests/api_resources/test_tlm.py @@ -38,7 +38,6 @@ def test_method_prompt_with_all_params(self, client: Codex) -> None: "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, - "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -97,7 +96,6 @@ def test_method_score_with_all_params(self, client: Codex) -> None: "model": "model", "num_candidate_responses": 0, 
"num_consistency_samples": 0, - "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -162,7 +160,6 @@ async def test_method_prompt_with_all_params(self, async_client: AsyncCodex) -> "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, - "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -221,7 +218,6 @@ async def test_method_score_with_all_params(self, async_client: AsyncCodex) -> N "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, - "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, From ce6d89f3c885765b21c6ba43b1b7b9a1ebf8a61e Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Mon, 28 Jul 2025 17:18:00 +0000 Subject: [PATCH 19/20] feat(api): api update --- .stats.yml | 2 +- src/codex/resources/projects/projects.py | 74 +++++----- src/codex/resources/tlm.py | 128 +++++++++--------- src/codex/types/project_validate_params.py | 58 +++++--- .../query_log_list_by_group_response.py | 46 +++++++ .../query_log_list_groups_response.py | 46 +++++++ .../types/projects/query_log_list_response.py | 46 +++++++ .../projects/query_log_retrieve_response.py | 46 +++++++ ...remediation_list_resolved_logs_response.py | 46 +++++++ src/codex/types/tlm_prompt_params.py | 34 ++--- src/codex/types/tlm_score_params.py | 34 ++--- tests/api_resources/test_projects.py | 24 ++++ tests/api_resources/test_tlm.py | 4 + 13 files changed, 443 insertions(+), 145 deletions(-) diff --git a/.stats.yml b/.stats.yml index 031dedf..4f2aa48 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ configured_endpoints: 54 -openapi_spec_hash: 57e29e33aec4bbc20171ec3128594e75 +openapi_spec_hash: 49989625bf633c5fdb3e11140f788f2d config_hash: 930284cfa37f835d949c8a1b124f4807 diff --git a/src/codex/resources/projects/projects.py b/src/codex/resources/projects/projects.py index 3a109ed..f82bcd0 100644 --- a/src/codex/resources/projects/projects.py +++ b/src/codex/resources/projects/projects.py @@ -460,6 +460,7 @@ def validate( quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN, rewritten_question: Optional[str] | NotGiven = NOT_GIVEN, task: Optional[str] | NotGiven = NOT_GIVEN, + tools: Optional[Iterable[project_validate_params.Tool]] | NotGiven = NOT_GIVEN, x_client_library_version: str | NotGiven = NOT_GIVEN, x_integration_type: str | NotGiven = NOT_GIVEN, x_source: str | NotGiven = NOT_GIVEN, @@ -504,17 +505,16 @@ def validate( The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. 
+ - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -550,12 +550,11 @@ def validate( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -574,6 +573,8 @@ def validate( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. + prompt: The prompt to use for the TLM call. If not provided, the prompt will be generated from the messages. @@ -582,6 +583,9 @@ def validate( rewritten_question: The re-written query if it was provided by the client to Codex from a user to be used instead of the original query. + tools: Tools to use for the LLM call. If not provided, it is assumed no tools were + provided to the LLM. 
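A sketch of how the re-introduced tools argument might be passed along with options; the validate() arguments not shown in this patch (the project identifier, the query/response being validated, and so on) are passed through as **required rather than guessed.

# Hypothetical wrapper; `client` is assumed to be a Codex instance and `tools` a list shaped
# like the test fixtures elsewhere in this series.
def validate_with_tools(client, tools, **required):
    return client.projects.validate(
        **required,                               # required arguments are not shown in this diff
        quality_preset="medium",
        options={"num_consistency_samples": 2},   # invented override on top of the preset defaults
        tools=tools,
    )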
+ extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -620,6 +624,7 @@ def validate( "quality_preset": quality_preset, "rewritten_question": rewritten_question, "task": task, + "tools": tools, }, project_validate_params.ProjectValidateParams, ), @@ -1028,6 +1033,7 @@ async def validate( quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN, rewritten_question: Optional[str] | NotGiven = NOT_GIVEN, task: Optional[str] | NotGiven = NOT_GIVEN, + tools: Optional[Iterable[project_validate_params.Tool]] | NotGiven = NOT_GIVEN, x_client_library_version: str | NotGiven = NOT_GIVEN, x_integration_type: str | NotGiven = NOT_GIVEN, x_source: str | NotGiven = NOT_GIVEN, @@ -1072,17 +1078,16 @@ async def validate( The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -1118,12 +1123,11 @@ async def validate( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. 
- similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -1142,6 +1146,8 @@ async def validate( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. + prompt: The prompt to use for the TLM call. If not provided, the prompt will be generated from the messages. @@ -1150,6 +1156,9 @@ async def validate( rewritten_question: The re-written query if it was provided by the client to Codex from a user to be used instead of the original query. + tools: Tools to use for the LLM call. If not provided, it is assumed no tools were + provided to the LLM. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -1188,6 +1197,7 @@ async def validate( "quality_preset": quality_preset, "rewritten_question": rewritten_question, "task": task, + "tools": tools, }, project_validate_params.ProjectValidateParams, ), diff --git a/src/codex/resources/tlm.py b/src/codex/resources/tlm.py index 12ff6c0..c6064ed 100644 --- a/src/codex/resources/tlm.py +++ b/src/codex/resources/tlm.py @@ -79,17 +79,16 @@ def prompt( The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -125,12 +124,11 @@ def prompt( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. 
- use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -149,6 +147,8 @@ def prompt( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. extra_headers: Send extra headers @@ -217,17 +217,16 @@ def score( The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -263,12 +262,11 @@ def score( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. 
TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -287,6 +285,8 @@ def score( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. extra_headers: Send extra headers @@ -371,17 +371,16 @@ async def prompt( The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. 
You can set custom values for these @@ -417,12 +416,11 @@ async def prompt( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -441,6 +439,8 @@ async def prompt( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. extra_headers: Send extra headers @@ -509,17 +509,16 @@ async def score( The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. 
+ - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -555,12 +554,11 @@ async def score( strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -579,6 +577,8 @@ async def score( - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. extra_headers: Send extra headers diff --git a/src/codex/types/project_validate_params.py b/src/codex/types/project_validate_params.py index 081dd2a..6231367 100644 --- a/src/codex/types/project_validate_params.py +++ b/src/codex/types/project_validate_params.py @@ -54,6 +54,8 @@ "MessageChatCompletionDeveloperMessageParam", "MessageChatCompletionDeveloperMessageParamContentUnionMember1", "Options", + "Tool", + "ToolFunction", ] @@ -106,17 +108,16 @@ class ProjectValidateParams(TypedDict, total=False): The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. 
- - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -152,12 +153,11 @@ class ProjectValidateParams(TypedDict, total=False): strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -175,6 +175,8 @@ class ProjectValidateParams(TypedDict, total=False): The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. """ prompt: Optional[str] @@ -194,6 +196,12 @@ class ProjectValidateParams(TypedDict, total=False): task: Optional[str] + tools: Optional[Iterable[Tool]] + """Tools to use for the LLM call. + + If not provided, it is assumed no tools were provided to the LLM. 
+ """ + x_client_library_version: Annotated[str, PropertyInfo(alias="x-client-library-version")] x_integration_type: Annotated[str, PropertyInfo(alias="x-integration-type")] @@ -649,8 +657,26 @@ class Options(TypedDict, total=False): num_consistency_samples: int + num_self_reflections: int + reasoning_effort: str similarity_measure: str use_self_reflection: bool + + +class ToolFunction(TypedDict, total=False): + name: Required[str] + + description: str + + parameters: object + + strict: Optional[bool] + + +class Tool(TypedDict, total=False): + function: Required[ToolFunction] + + type: Required[Literal["function"]] diff --git a/src/codex/types/projects/query_log_list_by_group_response.py b/src/codex/types/projects/query_log_list_by_group_response.py index ae49b95..b3c774b 100644 --- a/src/codex/types/projects/query_log_list_by_group_response.py +++ b/src/codex/types/projects/query_log_list_by_group_response.py @@ -16,6 +16,8 @@ "QueryLogsByGroupQueryLogFormattedNonGuardrailEvalScores", "QueryLogsByGroupQueryLogContext", "QueryLogsByGroupQueryLogDeterministicGuardrailsResults", + "QueryLogsByGroupQueryLogEvaluatedResponseToolCall", + "QueryLogsByGroupQueryLogEvaluatedResponseToolCallFunction", "QueryLogsByGroupQueryLogMessage", "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutput", "QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio", @@ -41,6 +43,8 @@ "QueryLogsByGroupQueryLogMessageChatCompletionFunctionMessageParam", "QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam", "QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1", + "QueryLogsByGroupQueryLogTool", + "QueryLogsByGroupQueryLogToolFunction", ] @@ -93,6 +97,20 @@ class QueryLogsByGroupQueryLogDeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class QueryLogsByGroupQueryLogEvaluatedResponseToolCallFunction(BaseModel): + arguments: str + + name: str + + +class QueryLogsByGroupQueryLogEvaluatedResponseToolCall(BaseModel): + id: str + + function: QueryLogsByGroupQueryLogEvaluatedResponseToolCallFunction + + type: Literal["function"] + + class QueryLogsByGroupQueryLogMessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -287,6 +305,22 @@ class QueryLogsByGroupQueryLogMessageChatCompletionDeveloperMessageParam(BaseMod ] +class QueryLogsByGroupQueryLogToolFunction(BaseModel): + name: str + + description: Optional[str] = None + + parameters: Optional[object] = None + + strict: Optional[bool] = None + + +class QueryLogsByGroupQueryLogTool(BaseModel): + function: QueryLogsByGroupQueryLogToolFunction + + type: Literal["function"] + + class QueryLogsByGroupQueryLog(BaseModel): id: str @@ -357,6 +391,12 @@ class QueryLogsByGroupQueryLog(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" + evaluated_response_tool_calls: Optional[List[QueryLogsByGroupQueryLogEvaluatedResponseToolCall]] = None + """Tool calls from the evaluated response, if any. + + Used to log tool calls in the query log. + """ + guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -383,6 +423,12 @@ class QueryLogsByGroupQueryLog(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + tools: Optional[List[QueryLogsByGroupQueryLogTool]] = None + """Tools to use for the LLM call. + + If not provided, it is assumed no tools were provided to the LLM. 
+ """ + class QueryLogsByGroup(BaseModel): query_logs: List[QueryLogsByGroupQueryLog] diff --git a/src/codex/types/projects/query_log_list_groups_response.py b/src/codex/types/projects/query_log_list_groups_response.py index cc3b208..6ed4d14 100644 --- a/src/codex/types/projects/query_log_list_groups_response.py +++ b/src/codex/types/projects/query_log_list_groups_response.py @@ -14,6 +14,8 @@ "FormattedNonGuardrailEvalScores", "Context", "DeterministicGuardrailsResults", + "EvaluatedResponseToolCall", + "EvaluatedResponseToolCallFunction", "Message", "MessageChatCompletionAssistantMessageParamOutput", "MessageChatCompletionAssistantMessageParamOutputAudio", @@ -39,6 +41,8 @@ "MessageChatCompletionFunctionMessageParam", "MessageChatCompletionDeveloperMessageParam", "MessageChatCompletionDeveloperMessageParamContentUnionMember1", + "Tool", + "ToolFunction", ] @@ -91,6 +95,20 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class EvaluatedResponseToolCallFunction(BaseModel): + arguments: str + + name: str + + +class EvaluatedResponseToolCall(BaseModel): + id: str + + function: EvaluatedResponseToolCallFunction + + type: Literal["function"] + + class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -277,6 +295,22 @@ class MessageChatCompletionDeveloperMessageParam(BaseModel): ] +class ToolFunction(BaseModel): + name: str + + description: Optional[str] = None + + parameters: Optional[object] = None + + strict: Optional[bool] = None + + +class Tool(BaseModel): + function: ToolFunction + + type: Literal["function"] + + class QueryLogListGroupsResponse(BaseModel): id: str @@ -347,6 +381,12 @@ class QueryLogListGroupsResponse(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" + evaluated_response_tool_calls: Optional[List[EvaluatedResponseToolCall]] = None + """Tool calls from the evaluated response, if any. + + Used to log tool calls in the query log. + """ + guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -372,3 +412,9 @@ class QueryLogListGroupsResponse(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + + tools: Optional[List[Tool]] = None + """Tools to use for the LLM call. + + If not provided, it is assumed no tools were provided to the LLM. 
+ """ diff --git a/src/codex/types/projects/query_log_list_response.py b/src/codex/types/projects/query_log_list_response.py index 0778898..c6737b2 100644 --- a/src/codex/types/projects/query_log_list_response.py +++ b/src/codex/types/projects/query_log_list_response.py @@ -14,6 +14,8 @@ "FormattedNonGuardrailEvalScores", "Context", "DeterministicGuardrailsResults", + "EvaluatedResponseToolCall", + "EvaluatedResponseToolCallFunction", "Message", "MessageChatCompletionAssistantMessageParamOutput", "MessageChatCompletionAssistantMessageParamOutputAudio", @@ -39,6 +41,8 @@ "MessageChatCompletionFunctionMessageParam", "MessageChatCompletionDeveloperMessageParam", "MessageChatCompletionDeveloperMessageParamContentUnionMember1", + "Tool", + "ToolFunction", ] @@ -91,6 +95,20 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class EvaluatedResponseToolCallFunction(BaseModel): + arguments: str + + name: str + + +class EvaluatedResponseToolCall(BaseModel): + id: str + + function: EvaluatedResponseToolCallFunction + + type: Literal["function"] + + class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -277,6 +295,22 @@ class MessageChatCompletionDeveloperMessageParam(BaseModel): ] +class ToolFunction(BaseModel): + name: str + + description: Optional[str] = None + + parameters: Optional[object] = None + + strict: Optional[bool] = None + + +class Tool(BaseModel): + function: ToolFunction + + type: Literal["function"] + + class QueryLogListResponse(BaseModel): id: str @@ -341,6 +375,12 @@ class QueryLogListResponse(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" + evaluated_response_tool_calls: Optional[List[EvaluatedResponseToolCall]] = None + """Tool calls from the evaluated response, if any. + + Used to log tool calls in the query log. + """ + guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -366,3 +406,9 @@ class QueryLogListResponse(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + + tools: Optional[List[Tool]] = None + """Tools to use for the LLM call. + + If not provided, it is assumed no tools were provided to the LLM. 
+ """ diff --git a/src/codex/types/projects/query_log_retrieve_response.py b/src/codex/types/projects/query_log_retrieve_response.py index 2751ef2..8fd8662 100644 --- a/src/codex/types/projects/query_log_retrieve_response.py +++ b/src/codex/types/projects/query_log_retrieve_response.py @@ -14,6 +14,8 @@ "FormattedNonGuardrailEvalScores", "Context", "DeterministicGuardrailsResults", + "EvaluatedResponseToolCall", + "EvaluatedResponseToolCallFunction", "Message", "MessageChatCompletionAssistantMessageParamOutput", "MessageChatCompletionAssistantMessageParamOutputAudio", @@ -39,6 +41,8 @@ "MessageChatCompletionFunctionMessageParam", "MessageChatCompletionDeveloperMessageParam", "MessageChatCompletionDeveloperMessageParamContentUnionMember1", + "Tool", + "ToolFunction", ] @@ -91,6 +95,20 @@ class DeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class EvaluatedResponseToolCallFunction(BaseModel): + arguments: str + + name: str + + +class EvaluatedResponseToolCall(BaseModel): + id: str + + function: EvaluatedResponseToolCallFunction + + type: Literal["function"] + + class MessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -277,6 +295,22 @@ class MessageChatCompletionDeveloperMessageParam(BaseModel): ] +class ToolFunction(BaseModel): + name: str + + description: Optional[str] = None + + parameters: Optional[object] = None + + strict: Optional[bool] = None + + +class Tool(BaseModel): + function: ToolFunction + + type: Literal["function"] + + class QueryLogRetrieveResponse(BaseModel): id: str @@ -345,6 +379,12 @@ class QueryLogRetrieveResponse(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" + evaluated_response_tool_calls: Optional[List[EvaluatedResponseToolCall]] = None + """Tool calls from the evaluated response, if any. + + Used to log tool calls in the query log. + """ + guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -370,3 +410,9 @@ class QueryLogRetrieveResponse(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + + tools: Optional[List[Tool]] = None + """Tools to use for the LLM call. + + If not provided, it is assumed no tools were provided to the LLM. 
+ """ diff --git a/src/codex/types/projects/remediation_list_resolved_logs_response.py b/src/codex/types/projects/remediation_list_resolved_logs_response.py index d56f9a4..567a086 100644 --- a/src/codex/types/projects/remediation_list_resolved_logs_response.py +++ b/src/codex/types/projects/remediation_list_resolved_logs_response.py @@ -15,6 +15,8 @@ "QueryLogFormattedNonGuardrailEvalScores", "QueryLogContext", "QueryLogDeterministicGuardrailsResults", + "QueryLogEvaluatedResponseToolCall", + "QueryLogEvaluatedResponseToolCallFunction", "QueryLogMessage", "QueryLogMessageChatCompletionAssistantMessageParamOutput", "QueryLogMessageChatCompletionAssistantMessageParamOutputAudio", @@ -40,6 +42,8 @@ "QueryLogMessageChatCompletionFunctionMessageParam", "QueryLogMessageChatCompletionDeveloperMessageParam", "QueryLogMessageChatCompletionDeveloperMessageParamContentUnionMember1", + "QueryLogTool", + "QueryLogToolFunction", ] @@ -92,6 +96,20 @@ class QueryLogDeterministicGuardrailsResults(BaseModel): matches: Optional[List[str]] = None +class QueryLogEvaluatedResponseToolCallFunction(BaseModel): + arguments: str + + name: str + + +class QueryLogEvaluatedResponseToolCall(BaseModel): + id: str + + function: QueryLogEvaluatedResponseToolCallFunction + + type: Literal["function"] + + class QueryLogMessageChatCompletionAssistantMessageParamOutputAudio(BaseModel): id: str @@ -284,6 +302,22 @@ class QueryLogMessageChatCompletionDeveloperMessageParam(BaseModel): ] +class QueryLogToolFunction(BaseModel): + name: str + + description: Optional[str] = None + + parameters: Optional[object] = None + + strict: Optional[bool] = None + + +class QueryLogTool(BaseModel): + function: QueryLogToolFunction + + type: Literal["function"] + + class QueryLog(BaseModel): id: str @@ -348,6 +382,12 @@ class QueryLog(BaseModel): evaluated_response: Optional[str] = None """The response being evaluated from the RAG system (before any remediation)""" + evaluated_response_tool_calls: Optional[List[QueryLogEvaluatedResponseToolCall]] = None + """Tool calls from the evaluated response, if any. + + Used to log tool calls in the query log. + """ + guardrail_evals: Optional[List[str]] = None """Evals that should trigger guardrail""" @@ -374,6 +414,12 @@ class QueryLog(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + tools: Optional[List[QueryLogTool]] = None + """Tools to use for the LLM call. + + If not provided, it is assumed no tools were provided to the LLM. + """ + class RemediationListResolvedLogsResponse(BaseModel): query_logs: List[QueryLog] diff --git a/src/codex/types/tlm_prompt_params.py b/src/codex/types/tlm_prompt_params.py index 3c04bfc..8749c5a 100644 --- a/src/codex/types/tlm_prompt_params.py +++ b/src/codex/types/tlm_prompt_params.py @@ -30,17 +30,16 @@ class TlmPromptParams(TypedDict, total=False): The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. 
When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -76,12 +75,11 @@ class TlmPromptParams(TypedDict, total=False): strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -99,6 +97,8 @@ class TlmPromptParams(TypedDict, total=False): The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. 
""" quality_preset: Literal["best", "high", "medium", "low", "base"] @@ -120,6 +120,8 @@ class Options(TypedDict, total=False): num_consistency_samples: int + num_self_reflections: int + reasoning_effort: str similarity_measure: str diff --git a/src/codex/types/tlm_score_params.py b/src/codex/types/tlm_score_params.py index 95bcc4c..4a0a32a 100644 --- a/src/codex/types/tlm_score_params.py +++ b/src/codex/types/tlm_score_params.py @@ -32,17 +32,16 @@ class TlmScoreParams(TypedDict, total=False): The default values corresponding to each quality preset are: - - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, - `use_self_reflection` = True. This preset improves LLM responses. - - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, - `use_self_reflection` = True. - - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, - `use_self_reflection` = True. - - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, - `use_self_reflection` = False. When using `get_trustworthiness_score()` on - "base" preset, a faster self-reflection is employed. + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. You can set custom values for these @@ -78,12 +77,11 @@ class TlmScoreParams(TypedDict, total=False): strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. - Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. - Reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches responses that are noticeably incorrect/bad upon further analysis. + num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. 
- similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. Supported similarity measures include - "semantic" (based on natural language inference), "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), @@ -101,6 +99,8 @@ class TlmScoreParams(TypedDict, total=False): The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + + use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead. """ quality_preset: Literal["best", "high", "medium", "low", "base"] @@ -122,6 +122,8 @@ class Options(TypedDict, total=False): num_consistency_samples: int + num_self_reflections: int + reasoning_effort: str similarity_measure: str diff --git a/tests/api_resources/test_projects.py b/tests/api_resources/test_projects.py index ae3f4f0..7884db0 100644 --- a/tests/api_resources/test_projects.py +++ b/tests/api_resources/test_projects.py @@ -622,6 +622,7 @@ def test_method_validate_with_all_params(self, client: Codex) -> None: "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, + "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -630,6 +631,17 @@ def test_method_validate_with_all_params(self, client: Codex) -> None: quality_preset="best", rewritten_question="rewritten_question", task="task", + tools=[ + { + "function": { + "name": "name", + "description": "description", + "parameters": {}, + "strict": True, + }, + "type": "function", + } + ], x_client_library_version="x-client-library-version", x_integration_type="x-integration-type", x_source="x-source", @@ -1284,6 +1296,7 @@ async def test_method_validate_with_all_params(self, async_client: AsyncCodex) - "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, + "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -1292,6 +1305,17 @@ async def test_method_validate_with_all_params(self, async_client: AsyncCodex) - quality_preset="best", rewritten_question="rewritten_question", task="task", + tools=[ + { + "function": { + "name": "name", + "description": "description", + "parameters": {}, + "strict": True, + }, + "type": "function", + } + ], x_client_library_version="x-client-library-version", x_integration_type="x-integration-type", x_source="x-source", diff --git a/tests/api_resources/test_tlm.py b/tests/api_resources/test_tlm.py index 41376a4..da0a9ad 100644 --- a/tests/api_resources/test_tlm.py +++ b/tests/api_resources/test_tlm.py @@ -38,6 +38,7 @@ def test_method_prompt_with_all_params(self, client: Codex) -> None: "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, + "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -96,6 +97,7 @@ def test_method_score_with_all_params(self, client: Codex) -> None: "model": "model", "num_candidate_responses": 0, 
"num_consistency_samples": 0, + "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -160,6 +162,7 @@ async def test_method_prompt_with_all_params(self, async_client: AsyncCodex) -> "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, + "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, @@ -218,6 +221,7 @@ async def test_method_score_with_all_params(self, async_client: AsyncCodex) -> N "model": "model", "num_candidate_responses": 0, "num_consistency_samples": 0, + "num_self_reflections": 0, "reasoning_effort": "reasoning_effort", "similarity_measure": "similarity_measure", "use_self_reflection": True, From 8b2ae15a6976beeb10f3cbdf7ef9f9adb95b238c Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Mon, 28 Jul 2025 17:18:19 +0000 Subject: [PATCH 20/20] release: 0.1.0-alpha.24 --- .release-please-manifest.json | 2 +- CHANGELOG.md | 33 +++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- src/codex/_version.py | 2 +- 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 1c0bb88..380b6f9 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.1.0-alpha.23" + ".": "0.1.0-alpha.24" } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 131d328..e4f0a42 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,38 @@ # Changelog +## 0.1.0-alpha.24 (2025-07-28) + +Full Changelog: [v0.1.0-alpha.23...v0.1.0-alpha.24](https://github.com/cleanlab/codex-python/compare/v0.1.0-alpha.23...v0.1.0-alpha.24) + +### Features + +* **api:** api update ([ce6d89f](https://github.com/cleanlab/codex-python/commit/ce6d89f3c885765b21c6ba43b1b7b9a1ebf8a61e)) +* **api:** api update ([1a06cfc](https://github.com/cleanlab/codex-python/commit/1a06cfc7c19943ac468b2ec9f2787215363cf77e)) +* **api:** api update ([2ee8095](https://github.com/cleanlab/codex-python/commit/2ee809593ddb15c4de776a2048883287ec5c0cdb)) +* **api:** api update ([6992031](https://github.com/cleanlab/codex-python/commit/6992031e6aa610031f24d818040050b0fc185c34)) +* **api:** api update ([7e7caf9](https://github.com/cleanlab/codex-python/commit/7e7caf9a3ad214c5df3686122e4f26b850dcb8b0)) +* **api:** api update ([0a33c47](https://github.com/cleanlab/codex-python/commit/0a33c4710d4890d17ddd973ba4a2ed183e45e4c7)) +* **api:** api update ([575d190](https://github.com/cleanlab/codex-python/commit/575d1901319984fea901ce216323a5259e17f98c)) +* **api:** api update ([f55f4b7](https://github.com/cleanlab/codex-python/commit/f55f4b768f8c1d00bdf61e56b0a7227c8424c5b6)) +* **api:** api update ([b956ce0](https://github.com/cleanlab/codex-python/commit/b956ce083ef3c507a7649577724f337a562c427a)) +* **api:** remove deprecated endpoint increment_queries ([6b52a98](https://github.com/cleanlab/codex-python/commit/6b52a985af9df1b6618d0685fafee2bae7e98566)) + + +### Bug Fixes + +* **client:** don't send Content-Type header on GET requests ([4732aae](https://github.com/cleanlab/codex-python/commit/4732aaeb03872abffb4e13df6dd1994711bd4268)) +* **parsing:** correctly handle nested discriminated unions ([b374589](https://github.com/cleanlab/codex-python/commit/b374589baf01ca1236cf0823305e6bca037cf12b)) +* **parsing:** ignore empty metadata 
([1cdf391](https://github.com/cleanlab/codex-python/commit/1cdf391742b196d5a723307e8c202a69e00b371d)) +* **parsing:** parse extra field types ([3c74ca0](https://github.com/cleanlab/codex-python/commit/3c74ca0f1a913bed65cc4c6580dda25a07a90b74)) + + +### Chores + +* **internal:** bump pinned h11 dep ([7ce51e9](https://github.com/cleanlab/codex-python/commit/7ce51e93023f66f3e343e379fc1930ddba335e9b)) +* **package:** mark python 3.13 as supported ([5cba949](https://github.com/cleanlab/codex-python/commit/5cba94956fff8ca4de99426a20e5c67f0ce6a2ac)) +* **project:** add settings file for vscode ([00df8ec](https://github.com/cleanlab/codex-python/commit/00df8ec35d44e5bdc6e68661a92d9d21905222c7)) +* **readme:** fix version rendering on pypi ([d05336d](https://github.com/cleanlab/codex-python/commit/d05336d89f5a49b09d7b1f85e7cb3ed74035157a)) + ## 0.1.0-alpha.23 (2025-07-07) Full Changelog: [v0.1.0-alpha.22...v0.1.0-alpha.23](https://github.com/cleanlab/codex-python/compare/v0.1.0-alpha.22...v0.1.0-alpha.23) diff --git a/pyproject.toml b/pyproject.toml index 964b48a..a061100 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "codex-sdk" -version = "0.1.0-alpha.23" +version = "0.1.0-alpha.24" description = "Internal SDK used within cleanlab-codex package. Refer to https://pypi.org/project/cleanlab-codex/ instead." dynamic = ["readme"] license = "MIT" diff --git a/src/codex/_version.py b/src/codex/_version.py index 18f2dcb..e020cb9 100644 --- a/src/codex/_version.py +++ b/src/codex/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. __title__ = "codex" -__version__ = "0.1.0-alpha.23" # x-release-please-version +__version__ = "0.1.0-alpha.24" # x-release-please-version
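
Taken together, the docstring updates in this patch replace the boolean `use_self_reflection` flag with an integer `num_self_reflections` count, re-describe the quality presets in terms of `num_consistency_samples`, `num_self_reflections`, and `reasoning_effort`, and change the documented default `similarity_measure` to "discrepancy". The sketch below mirrors the updated preset table and test fixtures; the concrete values are illustrative, and the call sites that would consume this mapping (for example `client.tlm.score(...)`) are assumed rather than taken verbatim from this changeset.

    # Minimal sketch of the updated TLM `options` mapping, following the "high"
    # preset documented above: 4 consistency samples, 3 self-reflections, and
    # "high" reasoning effort.
    options = {
        "model": "gpt-4.1-mini",              # documented default base model
        "num_consistency_samples": 4,
        "num_self_reflections": 3,            # replaces the deprecated use_self_reflection flag
        "reasoning_effort": "high",
        "similarity_measure": "discrepancy",  # new documented default
    }

This mapping is what the `options=` keyword on `tlm.prompt`, `tlm.score`, and `projects.validate` accepts after this change; `use_self_reflection` remains in the `Options` TypedDict but is documented as deprecated in favor of `num_self_reflections`.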
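
The new `tools` parameter uses the familiar function-tool shape: each entry is `{"type": "function", "function": {...}}`, where the inner object carries `name`, an optional `description`, JSON-schema `parameters`, and `strict`, and any tool calls in the evaluated response are surfaced on query logs as `evaluated_response_tool_calls`. Below is a hedged sketch of building such a list for `projects.validate(..., tools=...)`; the `get_weather` tool and its schema are hypothetical placeholders, not something defined in this diff.

    from typing import Any, Dict, List

    # Hypothetical function tool, shaped like the Tool / ToolFunction TypedDicts
    # added in project_validate_params.py.
    tools: List[Dict[str, Any]] = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",          # hypothetical tool name
                "description": "Look up the current weather for a city.",
                "parameters": {                 # JSON schema for the tool's arguments
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
                "strict": True,
            },
        }
    ]

    # Passed as `tools=tools` to projects.validate(); when omitted, the API
    # assumes no tools were provided to the LLM, per the new field docstring.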