From 69477646d7c8eff2bae01199949e4037771ba460 Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
<142633134+stainless-app[bot]@users.noreply.github.com>
Date: Wed, 7 May 2025 18:32:14 +0000
Subject: [PATCH 1/7] feat(api): api update

---
.stats.yml | 2 +-
api.md | 2 +-
src/codex/resources/projects/projects.py | 17 ++++++++--
src/codex/types/__init__.py | 1 +
.../types/project_increment_queries_params.py | 11 ++++++
tests/api_resources/test_projects.py | 34 ++++++++++++++-----
6 files changed, 55 insertions(+), 12 deletions(-)
create mode 100644 src/codex/types/project_increment_queries_params.py
diff --git a/.stats.yml b/.stats.yml
index 7a2f56a6..4a4a129e 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,3 +1,3 @@
configured_endpoints: 43
-openapi_spec_hash: b7beefbd38b4fcdd191cdb81a18a023b
+openapi_spec_hash: 51dd9bdb04307116617d3eefe3237755
config_hash: 5e459b33c53ffa6e554087a779bdb790
diff --git a/api.md b/api.md
index f240632f..3bd2cf4f 100644
--- a/api.md
+++ b/api.md
@@ -153,7 +153,7 @@ Methods:
- client.projects.list(\*\*params) -> ProjectListResponse
- client.projects.delete(project_id) -> None
- client.projects.export(project_id) -> object
-- client.projects.increment_queries(project_id) -> object
+- client.projects.increment_queries(project_id, \*\*params) -> object
- client.projects.retrieve_analytics(project_id, \*\*params) -> ProjectRetrieveAnalyticsResponse
## AccessKeys
diff --git a/src/codex/resources/projects/projects.py b/src/codex/resources/projects/projects.py
index bd50a684..7c676b6d 100644
--- a/src/codex/resources/projects/projects.py
+++ b/src/codex/resources/projects/projects.py
@@ -11,6 +11,7 @@
project_list_params,
project_create_params,
project_update_params,
+ project_increment_queries_params,
project_retrieve_analytics_params,
)
from .entries import (
@@ -331,6 +332,7 @@ def increment_queries(
self,
project_id: str,
*,
+ count: int | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
extra_headers: Headers | None = None,
@@ -355,7 +357,11 @@ def increment_queries(
return self._post(
f"/api/projects/{project_id}/increment_queries",
options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+ extra_headers=extra_headers,
+ extra_query=extra_query,
+ extra_body=extra_body,
+ timeout=timeout,
+ query=maybe_transform({"count": count}, project_increment_queries_params.ProjectIncrementQueriesParams),
),
cast_to=object,
)
@@ -685,6 +691,7 @@ async def increment_queries(
self,
project_id: str,
*,
+ count: int | NotGiven = NOT_GIVEN,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
extra_headers: Headers | None = None,
@@ -709,7 +716,13 @@ async def increment_queries(
return await self._post(
f"/api/projects/{project_id}/increment_queries",
options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+ extra_headers=extra_headers,
+ extra_query=extra_query,
+ extra_body=extra_body,
+ timeout=timeout,
+ query=await async_maybe_transform(
+ {"count": count}, project_increment_queries_params.ProjectIncrementQueriesParams
+ ),
),
cast_to=object,
)
diff --git a/src/codex/types/__init__.py b/src/codex/types/__init__.py
index 53d1ab6b..7f18b9c1 100644
--- a/src/codex/types/__init__.py
+++ b/src/codex/types/__init__.py
@@ -15,6 +15,7 @@
from .project_retrieve_response import ProjectRetrieveResponse as ProjectRetrieveResponse
from .organization_schema_public import OrganizationSchemaPublic as OrganizationSchemaPublic
from .user_activate_account_params import UserActivateAccountParams as UserActivateAccountParams
+from .project_increment_queries_params import ProjectIncrementQueriesParams as ProjectIncrementQueriesParams
from .project_retrieve_analytics_params import ProjectRetrieveAnalyticsParams as ProjectRetrieveAnalyticsParams
from .organization_list_members_response import OrganizationListMembersResponse as OrganizationListMembersResponse
from .project_retrieve_analytics_response import ProjectRetrieveAnalyticsResponse as ProjectRetrieveAnalyticsResponse
diff --git a/src/codex/types/project_increment_queries_params.py b/src/codex/types/project_increment_queries_params.py
new file mode 100644
index 00000000..f6043a76
--- /dev/null
+++ b/src/codex/types/project_increment_queries_params.py
@@ -0,0 +1,11 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing_extensions import TypedDict
+
+__all__ = ["ProjectIncrementQueriesParams"]
+
+
+class ProjectIncrementQueriesParams(TypedDict, total=False):
+ count: int
diff --git a/tests/api_resources/test_projects.py b/tests/api_resources/test_projects.py
index 772a7b29..40f40eb7 100644
--- a/tests/api_resources/test_projects.py
+++ b/tests/api_resources/test_projects.py
@@ -322,7 +322,16 @@ def test_path_params_export(self, client: Codex) -> None:
@parametrize
def test_method_increment_queries(self, client: Codex) -> None:
project = client.projects.increment_queries(
- "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ )
+ assert_matches_type(object, project, path=["response"])
+
+ @pytest.mark.skip()
+ @parametrize
+ def test_method_increment_queries_with_all_params(self, client: Codex) -> None:
+ project = client.projects.increment_queries(
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ count=0,
)
assert_matches_type(object, project, path=["response"])
@@ -330,7 +339,7 @@ def test_method_increment_queries(self, client: Codex) -> None:
@parametrize
def test_raw_response_increment_queries(self, client: Codex) -> None:
response = client.projects.with_raw_response.increment_queries(
- "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
)
assert response.is_closed is True
@@ -342,7 +351,7 @@ def test_raw_response_increment_queries(self, client: Codex) -> None:
@parametrize
def test_streaming_response_increment_queries(self, client: Codex) -> None:
with client.projects.with_streaming_response.increment_queries(
- "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
) as response:
assert not response.is_closed
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
@@ -357,7 +366,7 @@ def test_streaming_response_increment_queries(self, client: Codex) -> None:
def test_path_params_increment_queries(self, client: Codex) -> None:
with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"):
client.projects.with_raw_response.increment_queries(
- "",
+ project_id="",
)
@pytest.mark.skip()
@@ -716,7 +725,16 @@ async def test_path_params_export(self, async_client: AsyncCodex) -> None:
@parametrize
async def test_method_increment_queries(self, async_client: AsyncCodex) -> None:
project = await async_client.projects.increment_queries(
- "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ )
+ assert_matches_type(object, project, path=["response"])
+
+ @pytest.mark.skip()
+ @parametrize
+ async def test_method_increment_queries_with_all_params(self, async_client: AsyncCodex) -> None:
+ project = await async_client.projects.increment_queries(
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ count=0,
)
assert_matches_type(object, project, path=["response"])
@@ -724,7 +742,7 @@ async def test_method_increment_queries(self, async_client: AsyncCodex) -> None:
@parametrize
async def test_raw_response_increment_queries(self, async_client: AsyncCodex) -> None:
response = await async_client.projects.with_raw_response.increment_queries(
- "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
)
assert response.is_closed is True
@@ -736,7 +754,7 @@ async def test_raw_response_increment_queries(self, async_client: AsyncCodex) ->
@parametrize
async def test_streaming_response_increment_queries(self, async_client: AsyncCodex) -> None:
async with async_client.projects.with_streaming_response.increment_queries(
- "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
) as response:
assert not response.is_closed
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
@@ -751,7 +769,7 @@ async def test_streaming_response_increment_queries(self, async_client: AsyncCod
async def test_path_params_increment_queries(self, async_client: AsyncCodex) -> None:
with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"):
await async_client.projects.with_raw_response.increment_queries(
- "",
+ project_id="",
)
@pytest.mark.skip()
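
A minimal usage sketch for the updated `increment_queries` method, assuming the client reads its credentials from the environment; the project ID and count values below are illustrative only, and `count` may be omitted to keep the previous no-argument call shape:

from codex import Codex

client = Codex()  # credentials assumed to come from the environment

# `count` is the new optional query parameter added by this patch
client.projects.increment_queries(
    project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",  # example UUID reused from the tests
    count=3,  # illustrative value
)
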
From a52c74a22fb720f10265021d057f34874f73846b Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
<142633134+stainless-app[bot]@users.noreply.github.com>
Date: Fri, 25 Apr 2025 19:16:21 +0000
Subject: [PATCH 2/7] feat(api): api update

---
.stats.yml | 2 +-
src/codex/types/users/user_schema.py | 2 ++
src/codex/types/users/user_schema_public.py | 2 +-
3 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/.stats.yml b/.stats.yml
index 4a4a129e..4fd111cc 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,3 +1,3 @@
configured_endpoints: 43
-openapi_spec_hash: 51dd9bdb04307116617d3eefe3237755
+openapi_spec_hash: 6d2d01f4951c677a47cffe973084413e
config_hash: 5e459b33c53ffa6e554087a779bdb790
diff --git a/src/codex/types/users/user_schema.py b/src/codex/types/users/user_schema.py
index b1665f21..d22c9563 100644
--- a/src/codex/types/users/user_schema.py
+++ b/src/codex/types/users/user_schema.py
@@ -23,6 +23,8 @@ class UserSchema(BaseModel):
email: str
+ email_verified: bool
+
updated_at: datetime
user_provided_company_name: Optional[str] = None
diff --git a/src/codex/types/users/user_schema_public.py b/src/codex/types/users/user_schema_public.py
index 181113b0..d5e1d9bf 100644
--- a/src/codex/types/users/user_schema_public.py
+++ b/src/codex/types/users/user_schema_public.py
@@ -14,7 +14,7 @@ class UserSchemaPublic(BaseModel):
email: str
- email_verified: Optional[bool] = None
+ email_verified: bool
first_name: Optional[str] = None
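
A short sketch of what the tightened typing means for callers; the helper below is hypothetical and only illustrates that `email_verified` is now a required bool rather than Optional[bool]:

from codex.types.users.user_schema_public import UserSchemaPublic

def require_verified(user: UserSchemaPublic) -> None:
    # email_verified is now always a bool, so no None check is needed
    if not user.email_verified:
        raise ValueError(f"{user.email} has not verified their email")
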
From 61fdb7aaaa6c2533ebcfdfe3c0aff31474e75d51 Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
<142633134+stainless-app[bot]@users.noreply.github.com>
Date: Tue, 29 Apr 2025 14:17:04 +0000
Subject: [PATCH 3/7] feat(api): api update

---
.stats.yml | 2 +-
src/codex/resources/projects/entries.py | 12 ++++++
.../types/projects/entry_query_params.py | 39 ++++++++++++++++++-
tests/api_resources/projects/test_entries.py | 12 ++++++
4 files changed, 62 insertions(+), 3 deletions(-)
diff --git a/.stats.yml b/.stats.yml
index 4fd111cc..a1b247c4 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,3 +1,3 @@
configured_endpoints: 43
-openapi_spec_hash: 6d2d01f4951c677a47cffe973084413e
+openapi_spec_hash: 3873591605b529e6ae298fc7f04d4ba1
config_hash: 5e459b33c53ffa6e554087a779bdb790
diff --git a/src/codex/resources/projects/entries.py b/src/codex/resources/projects/entries.py
index a9e690b9..346dd353 100644
--- a/src/codex/resources/projects/entries.py
+++ b/src/codex/resources/projects/entries.py
@@ -319,6 +319,7 @@ def query(
question: str,
use_llm_matching: bool | NotGiven = NOT_GIVEN,
client_metadata: Optional[object] | NotGiven = NOT_GIVEN,
+ query_metadata: Optional[entry_query_params.QueryMetadata] | NotGiven = NOT_GIVEN,
x_client_library_version: str | NotGiven = NOT_GIVEN,
x_integration_type: str | NotGiven = NOT_GIVEN,
x_source: str | NotGiven = NOT_GIVEN,
@@ -334,6 +335,10 @@ def query(
Query Entries Route
Args:
+ client_metadata: Deprecated: Use query_metadata instead
+
+ query_metadata: Optional logging data that can be provided by the client.
+
extra_headers: Send extra headers
extra_query: Add additional query parameters to the request
@@ -361,6 +366,7 @@ def query(
{
"question": question,
"client_metadata": client_metadata,
+ "query_metadata": query_metadata,
},
entry_query_params.EntryQueryParams,
),
@@ -708,6 +714,7 @@ async def query(
question: str,
use_llm_matching: bool | NotGiven = NOT_GIVEN,
client_metadata: Optional[object] | NotGiven = NOT_GIVEN,
+ query_metadata: Optional[entry_query_params.QueryMetadata] | NotGiven = NOT_GIVEN,
x_client_library_version: str | NotGiven = NOT_GIVEN,
x_integration_type: str | NotGiven = NOT_GIVEN,
x_source: str | NotGiven = NOT_GIVEN,
@@ -723,6 +730,10 @@ async def query(
Query Entries Route
Args:
+ client_metadata: Deprecated: Use query_metadata instead
+
+ query_metadata: Optional logging data that can be provided by the client.
+
extra_headers: Send extra headers
extra_query: Add additional query parameters to the request
@@ -750,6 +761,7 @@ async def query(
{
"question": question,
"client_metadata": client_metadata,
+ "query_metadata": query_metadata,
},
entry_query_params.EntryQueryParams,
),
diff --git a/src/codex/types/projects/entry_query_params.py b/src/codex/types/projects/entry_query_params.py
index d58b7bfa..1edabbed 100644
--- a/src/codex/types/projects/entry_query_params.py
+++ b/src/codex/types/projects/entry_query_params.py
@@ -2,12 +2,12 @@
from __future__ import annotations
-from typing import Optional
+from typing import Dict, List, Union, Iterable, Optional
from typing_extensions import Required, Annotated, TypedDict
from ..._utils import PropertyInfo
-__all__ = ["EntryQueryParams"]
+__all__ = ["EntryQueryParams", "QueryMetadata", "QueryMetadataContextUnionMember3"]
class EntryQueryParams(TypedDict, total=False):
@@ -16,6 +16,10 @@ class EntryQueryParams(TypedDict, total=False):
use_llm_matching: bool
client_metadata: Optional[object]
+ """Deprecated: Use query_metadata instead"""
+
+ query_metadata: Optional[QueryMetadata]
+ """Optional logging data that can be provided by the client."""
x_client_library_version: Annotated[str, PropertyInfo(alias="x-client-library-version")]
@@ -24,3 +28,34 @@ class EntryQueryParams(TypedDict, total=False):
x_source: Annotated[str, PropertyInfo(alias="x-source")]
x_stainless_package_version: Annotated[str, PropertyInfo(alias="x-stainless-package-version")]
+
+
+class QueryMetadataContextUnionMember3(TypedDict, total=False):
+ content: Required[str]
+ """The actual content/text of the document."""
+
+ id: Optional[str]
+ """Unique identifier for the document. Useful for tracking documents"""
+
+ source: Optional[str]
+ """Source or origin of the document. Useful for citations."""
+
+ tags: Optional[List[str]]
+ """Tags or categories for the document. Useful for filtering"""
+
+ title: Optional[str]
+ """Title or heading of the document. Useful for display and context."""
+
+
+class QueryMetadata(TypedDict, total=False):
+ context: Union[str, List[str], Iterable[object], Iterable[QueryMetadataContextUnionMember3], None]
+ """RAG context used for the query"""
+
+ custom_metadata: Optional[object]
+ """Arbitrary metadata supplied by the user/system"""
+
+ eval_scores: Optional[Dict[str, float]]
+ """Evaluation scores for the original response"""
+
+ evaluated_response: Optional[str]
+ """The response being evaluated from the RAG system(before any remediation)"""
diff --git a/tests/api_resources/projects/test_entries.py b/tests/api_resources/projects/test_entries.py
index 31a5e408..73a45ad4 100644
--- a/tests/api_resources/projects/test_entries.py
+++ b/tests/api_resources/projects/test_entries.py
@@ -396,6 +396,12 @@ def test_method_query_with_all_params(self, client: Codex) -> None:
question="question",
use_llm_matching=True,
client_metadata={},
+ query_metadata={
+ "context": "string",
+ "custom_metadata": {},
+ "eval_scores": {"foo": 0},
+ "evaluated_response": "evaluated_response",
+ },
x_client_library_version="x-client-library-version",
x_integration_type="x-integration-type",
x_source="x-source",
@@ -871,6 +877,12 @@ async def test_method_query_with_all_params(self, async_client: AsyncCodex) -> N
question="question",
use_llm_matching=True,
client_metadata={},
+ query_metadata={
+ "context": "string",
+ "custom_metadata": {},
+ "eval_scores": {"foo": 0},
+ "evaluated_response": "evaluated_response",
+ },
x_client_library_version="x-client-library-version",
x_integration_type="x-integration-type",
x_source="x-source",
From 8e01ccdc8e341d010e8989f30f4fd887effa1871 Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
<142633134+stainless-app[bot]@users.noreply.github.com>
Date: Mon, 5 May 2025 18:16:50 +0000
Subject: [PATCH 4/7] feat(api): api update

---
.stats.yml | 2 +-
src/codex/resources/tlm.py | 272 ++++++++++++++-------------
src/codex/types/tlm_prompt_params.py | 67 +++----
src/codex/types/tlm_score_params.py | 67 +++----
4 files changed, 215 insertions(+), 193 deletions(-)
diff --git a/.stats.yml b/.stats.yml
index a1b247c4..bcd9e36c 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,3 +1,3 @@
configured_endpoints: 43
-openapi_spec_hash: 3873591605b529e6ae298fc7f04d4ba1
+openapi_spec_hash: f0d588a39e2040ff516a5cff26c4ab58
config_hash: 5e459b33c53ffa6e554087a779bdb790
diff --git a/src/codex/resources/tlm.py b/src/codex/resources/tlm.py
index 78f97e2e..5e4cd7e1 100644
--- a/src/codex/resources/tlm.py
+++ b/src/codex/resources/tlm.py
@@ -97,59 +97,63 @@ def prompt(
You can set custom values for these arguments regardless of the quality preset
specified.
- Args: model ({"gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "o1-preview",
- "gpt-3.5-turbo-16k", "gpt-4", "gpt-4.5-preview", "claude-3.7-sonnet",
+ Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
+ "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
+ "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
"claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
"claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
"gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
- faster models yield faster/cheaper results). - Models still in beta: "o1",
- "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-3.7-sonnet",
- "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite",
- "nova-pro". - Recommended models for accuracy: "gpt-4o", "o3-mini", "o1",
- "claude-3.7-sonnet". - Recommended models for low latency/costs: "nova-micro",
- "gpt-4o-mini".
+ faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
+ "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
+ "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
+ "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
+ for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
+ "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
+ "gpt-4.1-nano", "nova-micro".
max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
- Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes.
+ Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
If you experience token/rate limit errors while using TLM, try lowering this number.
For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
- num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM.
- TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
- Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens).
- This parameter must be between 1 and 20.
- When it is 1, TLM simply returns a standard LLM response and does not attempt to auto-improve it.
+ num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+ `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+ This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+ Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs.
+ When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
- num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM response consistency.
- Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher costs/runtimes.
- This consistency helps quantify the epistemic uncertainty associated with
+ num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring.
+ Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs.
+ Measuring consistency helps quantify the epistemic uncertainty associated with
strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
- TLM internally measures consistency via the degree of contradiction between sampled responses that the model considers equally plausible.
+ TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
- use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it
- generated and self-evaluate this response.
- Setting this False disables self-reflection and may worsen trustworthiness scores, but will reduce costs/runtimes.
- Self-reflection helps quantify aleatoric uncertainty associated with challenging prompts
- and catches answers that are obviously incorrect/bad.
+ use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
+ Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
+ Reflection helps quantify aleatoric uncertainty associated with challenging prompts
+ and catches responses that are noticeably incorrect/bad upon further analysis.
- similarity_measure ({"semantic", "string", "embedding", "embedding_large"}, default = "semantic"): how the trustworthiness scoring algorithm measures
- similarity between sampled responses considered by the model in the consistency assessment.
- Supported similarity measures include "semantic" (based on natural language inference), "string" (based on character/word overlap),
- "embedding" (based on embedding similarity), and "embedding_large" (based on embedding similarity with a larger embedding model).
- Set this to "string" to improve latency/costs.
+ similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
+ trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
+ Supported similarity measures include: "semantic" (based on natural language inference),
+ "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
+ "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
+ and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
- reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the LLM can reason (number of thinking tokens)
- when considering alternative possible responses and double-checking responses.
- Higher efforts here may produce better TLM trustworthiness scores and LLM responses. Reduce this value to improve latency/costs.
+ reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+ when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+ Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs.
log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
- custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria.
+ custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria beyond the built-in trustworthiness scoring.
The expected input format is a list of dictionaries, where each dictionary has the following keys:
- name: Name of the evaluation criteria.
- criteria: Instructions specifying the evaluation criteria.
+ quality_preset: The quality preset to use for the TLM or Trustworthy RAG API.
+
extra_headers: Send extra headers
extra_query: Add additional query parameters to the request
@@ -234,59 +238,63 @@ def score(
You can set custom values for these arguments regardless of the quality preset
specified.
- Args: model ({"gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "o1-preview",
- "gpt-3.5-turbo-16k", "gpt-4", "gpt-4.5-preview", "claude-3.7-sonnet",
+ Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
+ "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
+ "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
"claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
"claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
"gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
- faster models yield faster/cheaper results). - Models still in beta: "o1",
- "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-3.7-sonnet",
- "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite",
- "nova-pro". - Recommended models for accuracy: "gpt-4o", "o3-mini", "o1",
- "claude-3.7-sonnet". - Recommended models for low latency/costs: "nova-micro",
- "gpt-4o-mini".
+ faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
+ "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
+ "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
+ "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
+ for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
+ "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
+ "gpt-4.1-nano", "nova-micro".
max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
- Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes.
+ Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
If you experience token/rate limit errors while using TLM, try lowering this number.
For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
- num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM.
- TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
- Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens).
- This parameter must be between 1 and 20.
- When it is 1, TLM simply returns a standard LLM response and does not attempt to auto-improve it.
+ num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+ `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+ This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+ Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs.
+ When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
- num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM response consistency.
- Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher costs/runtimes.
- This consistency helps quantify the epistemic uncertainty associated with
+ num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring.
+ Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs.
+ Measuring consistency helps quantify the epistemic uncertainty associated with
strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
- TLM internally measures consistency via the degree of contradiction between sampled responses that the model considers equally plausible.
+ TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
- use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it
- generated and self-evaluate this response.
- Setting this False disables self-reflection and may worsen trustworthiness scores, but will reduce costs/runtimes.
- Self-reflection helps quantify aleatoric uncertainty associated with challenging prompts
- and catches answers that are obviously incorrect/bad.
+ use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
+ Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
+ Reflection helps quantify aleatoric uncertainty associated with challenging prompts
+ and catches responses that are noticeably incorrect/bad upon further analysis.
- similarity_measure ({"semantic", "string", "embedding", "embedding_large"}, default = "semantic"): how the trustworthiness scoring algorithm measures
- similarity between sampled responses considered by the model in the consistency assessment.
- Supported similarity measures include "semantic" (based on natural language inference), "string" (based on character/word overlap),
- "embedding" (based on embedding similarity), and "embedding_large" (based on embedding similarity with a larger embedding model).
- Set this to "string" to improve latency/costs.
+ similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
+ trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
+ Supported similarity measures include: "semantic" (based on natural language inference),
+ "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
+ "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
+ and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
- reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the LLM can reason (number of thinking tokens)
- when considering alternative possible responses and double-checking responses.
- Higher efforts here may produce better TLM trustworthiness scores and LLM responses. Reduce this value to improve latency/costs.
+ reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+ when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+ Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs.
log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
- custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria.
+ custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria beyond the built-in trustworthiness scoring.
The expected input format is a list of dictionaries, where each dictionary has the following keys:
- name: Name of the evaluation criteria.
- criteria: Instructions specifying the evaluation criteria.
+ quality_preset: The quality preset to use for the TLM or Trustworthy RAG API.
+
extra_headers: Send extra headers
extra_query: Add additional query parameters to the request
@@ -387,59 +395,63 @@ async def prompt(
You can set custom values for these arguments regardless of the quality preset
specified.
- Args: model ({"gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "o1-preview",
- "gpt-3.5-turbo-16k", "gpt-4", "gpt-4.5-preview", "claude-3.7-sonnet",
+ Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
+ "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
+ "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
"claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
"claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
"gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
- faster models yield faster/cheaper results). - Models still in beta: "o1",
- "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-3.7-sonnet",
- "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite",
- "nova-pro". - Recommended models for accuracy: "gpt-4o", "o3-mini", "o1",
- "claude-3.7-sonnet". - Recommended models for low latency/costs: "nova-micro",
- "gpt-4o-mini".
+ faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
+ "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
+ "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
+ "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
+ for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
+ "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
+ "gpt-4.1-nano", "nova-micro".
max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
- Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes.
+ Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
If you experience token/rate limit errors while using TLM, try lowering this number.
For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
- num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM.
- TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
- Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens).
- This parameter must be between 1 and 20.
- When it is 1, TLM simply returns a standard LLM response and does not attempt to auto-improve it.
+ num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+ `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+ This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+ Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs.
+ When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
- num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM response consistency.
- Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher costs/runtimes.
- This consistency helps quantify the epistemic uncertainty associated with
+ num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring.
+ Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs.
+ Measuring consistency helps quantify the epistemic uncertainty associated with
strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
- TLM internally measures consistency via the degree of contradiction between sampled responses that the model considers equally plausible.
+ TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
- use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it
- generated and self-evaluate this response.
- Setting this False disables self-reflection and may worsen trustworthiness scores, but will reduce costs/runtimes.
- Self-reflection helps quantify aleatoric uncertainty associated with challenging prompts
- and catches answers that are obviously incorrect/bad.
+ use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
+ Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
+ Reflection helps quantify aleatoric uncertainty associated with challenging prompts
+ and catches responses that are noticeably incorrect/bad upon further analysis.
- similarity_measure ({"semantic", "string", "embedding", "embedding_large"}, default = "semantic"): how the trustworthiness scoring algorithm measures
- similarity between sampled responses considered by the model in the consistency assessment.
- Supported similarity measures include "semantic" (based on natural language inference), "string" (based on character/word overlap),
- "embedding" (based on embedding similarity), and "embedding_large" (based on embedding similarity with a larger embedding model).
- Set this to "string" to improve latency/costs.
+ similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
+ trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
+ Supported similarity measures include: "semantic" (based on natural language inference),
+ "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
+ "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
+ and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
- reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the LLM can reason (number of thinking tokens)
- when considering alternative possible responses and double-checking responses.
- Higher efforts here may produce better TLM trustworthiness scores and LLM responses. Reduce this value to improve latency/costs.
+ reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+ when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+ Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs.
log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
- custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria.
+ custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria beyond the built-in trustworthiness scoring.
The expected input format is a list of dictionaries, where each dictionary has the following keys:
- name: Name of the evaluation criteria.
- criteria: Instructions specifying the evaluation criteria.
+ quality_preset: The quality preset to use for the TLM or Trustworthy RAG API.
+
extra_headers: Send extra headers
extra_query: Add additional query parameters to the request
@@ -524,59 +536,63 @@ async def score(
You can set custom values for these arguments regardless of the quality preset
specified.
- Args: model ({"gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "o1-preview",
- "gpt-3.5-turbo-16k", "gpt-4", "gpt-4.5-preview", "claude-3.7-sonnet",
+ Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
+ "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
+ "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
"claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
"claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
"gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
- faster models yield faster/cheaper results). - Models still in beta: "o1",
- "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-3.7-sonnet",
- "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite",
- "nova-pro". - Recommended models for accuracy: "gpt-4o", "o3-mini", "o1",
- "claude-3.7-sonnet". - Recommended models for low latency/costs: "nova-micro",
- "gpt-4o-mini".
+ faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
+ "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
+ "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
+ "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
+ for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
+ "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
+ "gpt-4.1-nano", "nova-micro".
max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
- Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes.
+ Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
If you experience token/rate limit errors while using TLM, try lowering this number.
For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
- num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM.
- TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
- Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens).
- This parameter must be between 1 and 20.
- When it is 1, TLM simply returns a standard LLM response and does not attempt to auto-improve it.
+ num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+ `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+ This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+ Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs.
+ When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
- num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM response consistency.
- Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher costs/runtimes.
- This consistency helps quantify the epistemic uncertainty associated with
+ num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring.
+ Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs.
+ Measuring consistency helps quantify the epistemic uncertainty associated with
strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
- TLM internally measures consistency via the degree of contradiction between sampled responses that the model considers equally plausible.
+ TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
- use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it
- generated and self-evaluate this response.
- Setting this False disables self-reflection and may worsen trustworthiness scores, but will reduce costs/runtimes.
- Self-reflection helps quantify aleatoric uncertainty associated with challenging prompts
- and catches answers that are obviously incorrect/bad.
+ use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
+ Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
+ Reflection helps quantify aleatoric uncertainty associated with challenging prompts
+ and catches responses that are noticeably incorrect/bad upon further analysis.
- similarity_measure ({"semantic", "string", "embedding", "embedding_large"}, default = "semantic"): how the trustworthiness scoring algorithm measures
- similarity between sampled responses considered by the model in the consistency assessment.
- Supported similarity measures include "semantic" (based on natural language inference), "string" (based on character/word overlap),
- "embedding" (based on embedding similarity), and "embedding_large" (based on embedding similarity with a larger embedding model).
- Set this to "string" to improve latency/costs.
+ similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
+ trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
+ Supported similarity measures include: "semantic" (based on natural language inference),
+ "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
+ "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
+ and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
- reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the LLM can reason (number of thinking tokens)
- when considering alternative possible responses and double-checking responses.
- Higher efforts here may produce better TLM trustworthiness scores and LLM responses. Reduce this value to improve latency/costs.
+ reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+ when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+ Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs.
log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
- custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria.
+ custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria beyond the built-in trustworthiness scoring.
The expected input format is a list of dictionaries, where each dictionary has the following keys:
- name: Name of the evaluation criteria.
- criteria: Instructions specifying the evaluation criteria.
+ quality_preset: The quality preset to use for the TLM or Trustworthy RAG API.
+
extra_headers: Send extra headers
extra_query: Add additional query parameters to the request
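
A sketch showing how the newly documented option values can be passed to the TLM resource; the `options` keyword and its field names are assumed to match the params schema being documented in this patch, the model and similarity measure are picked from the newly added values, and the prompt and preset are illustrative:

from codex import Codex

client = Codex()  # credentials assumed to come from the environment

result = client.tlm.prompt(
    prompt="Summarize the refund policy in one sentence.",
    quality_preset="medium",
    options={
        "model": "gpt-4.1-nano",              # newly documented low-latency model
        "similarity_measure": "discrepancy",  # newly documented similarity measure
        "reasoning_effort": "low",
        "log": ["explanation"],
    },
)
print(result)
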
diff --git a/src/codex/types/tlm_prompt_params.py b/src/codex/types/tlm_prompt_params.py
index 860f1a77..94536055 100644
--- a/src/codex/types/tlm_prompt_params.py
+++ b/src/codex/types/tlm_prompt_params.py
@@ -48,61 +48,64 @@ class TlmPromptParams(TypedDict, total=False):
You can set custom values for these arguments regardless of the quality preset
specified.
- Args: model ({"gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "o1-preview",
- "gpt-3.5-turbo-16k", "gpt-4", "gpt-4.5-preview", "claude-3.7-sonnet",
+ Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
+ "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
+ "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
"claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
"claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
"gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
- faster models yield faster/cheaper results). - Models still in beta: "o1",
- "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-3.7-sonnet",
- "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite",
- "nova-pro". - Recommended models for accuracy: "gpt-4o", "o3-mini", "o1",
- "claude-3.7-sonnet". - Recommended models for low latency/costs: "nova-micro",
- "gpt-4o-mini".
+ faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
+ "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
+ "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
+ "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
+ for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
+ "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
+ "gpt-4.1-nano", "nova-micro".
max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
- Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes.
+ Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
If you experience token/rate limit errors while using TLM, try lowering this number.
For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
- num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM.
- TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
- Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens).
- This parameter must be between 1 and 20.
- When it is 1, TLM simply returns a standard LLM response and does not attempt to auto-improve it.
+ num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+ `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+ This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+ Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs.
+ When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
- num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM response consistency.
- Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher costs/runtimes.
- This consistency helps quantify the epistemic uncertainty associated with
+ num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring.
+ Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs.
+ Measuring consistency helps quantify the epistemic uncertainty associated with
strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
- TLM internally measures consistency via the degree of contradiction between sampled responses that the model considers equally plausible.
+ TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
- use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it
- generated and self-evaluate this response.
- Setting this False disables self-reflection and may worsen trustworthiness scores, but will reduce costs/runtimes.
- Self-reflection helps quantify aleatoric uncertainty associated with challenging prompts
- and catches answers that are obviously incorrect/bad.
+ use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
+ Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
+ Reflection helps quantify aleatoric uncertainty associated with challenging prompts
+ and catches responses that are noticeably incorrect/bad upon further analysis.
- similarity_measure ({"semantic", "string", "embedding", "embedding_large"}, default = "semantic"): how the trustworthiness scoring algorithm measures
- similarity between sampled responses considered by the model in the consistency assessment.
- Supported similarity measures include "semantic" (based on natural language inference), "string" (based on character/word overlap),
- "embedding" (based on embedding similarity), and "embedding_large" (based on embedding similarity with a larger embedding model).
- Set this to "string" to improve latency/costs.
+ similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
+ trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
+ Supported similarity measures include: "semantic" (based on natural language inference),
+ "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
+ "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
+ and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
- reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the LLM can reason (number of thinking tokens)
- when considering alternative possible responses and double-checking responses.
- Higher efforts here may produce better TLM trustworthiness scores and LLM responses. Reduce this value to improve latency/costs.
+ reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+ when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+ Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs.
log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
- custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria.
+ custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring.
The expected input format is a list of dictionaries, where each dictionary has the following keys:
- name: Name of the evaluation criteria.
- criteria: Instructions specifying the evaluation criteria.
"""
quality_preset: Literal["best", "high", "medium", "low", "base"]
+ """The quality preset to use for the TLM or Trustworthy RAG API."""
task: Optional[str]
diff --git a/src/codex/types/tlm_score_params.py b/src/codex/types/tlm_score_params.py
index 213da422..a0d90175 100644
--- a/src/codex/types/tlm_score_params.py
+++ b/src/codex/types/tlm_score_params.py
@@ -50,61 +50,64 @@ class TlmScoreParams(TypedDict, total=False):
You can set custom values for these arguments regardless of the quality preset
specified.
- Args: model ({"gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "o1-preview",
- "gpt-3.5-turbo-16k", "gpt-4", "gpt-4.5-preview", "claude-3.7-sonnet",
+ Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
+ "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
+ "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
"claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
"claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
"gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
- faster models yield faster/cheaper results). - Models still in beta: "o1",
- "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-3.7-sonnet",
- "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite",
- "nova-pro". - Recommended models for accuracy: "gpt-4o", "o3-mini", "o1",
- "claude-3.7-sonnet". - Recommended models for low latency/costs: "nova-micro",
- "gpt-4o-mini".
+ faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
+ "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
+ "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
+ "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
+ for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
+ "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
+ "gpt-4.1-nano", "nova-micro".
max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
- Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes.
+ Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
If you experience token/rate limit errors while using TLM, try lowering this number.
For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
- num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM.
- TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
- Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens).
- This parameter must be between 1 and 20.
- When it is 1, TLM simply returns a standard LLM response and does not attempt to auto-improve it.
+ num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+ `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+ This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+ Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs.
+ When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
- num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM response consistency.
- Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher costs/runtimes.
- This consistency helps quantify the epistemic uncertainty associated with
+ num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring.
+ Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs.
+ Measuring consistency helps quantify the epistemic uncertainty associated with
strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
- TLM internally measures consistency via the degree of contradiction between sampled responses that the model considers equally plausible.
+ TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
- use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it
- generated and self-evaluate this response.
- Setting this False disables self-reflection and may worsen trustworthiness scores, but will reduce costs/runtimes.
- Self-reflection helps quantify aleatoric uncertainty associated with challenging prompts
- and catches answers that are obviously incorrect/bad.
+ use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
+ Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
+ Reflection helps quantify aleatoric uncertainty associated with challenging prompts
+ and catches responses that are noticeably incorrect/bad upon further analysis.
- similarity_measure ({"semantic", "string", "embedding", "embedding_large"}, default = "semantic"): how the trustworthiness scoring algorithm measures
- similarity between sampled responses considered by the model in the consistency assessment.
- Supported similarity measures include "semantic" (based on natural language inference), "string" (based on character/word overlap),
- "embedding" (based on embedding similarity), and "embedding_large" (based on embedding similarity with a larger embedding model).
- Set this to "string" to improve latency/costs.
+ similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
+ trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
+ Supported similarity measures include: "semantic" (based on natural language inference),
+ "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
+ "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
+ and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
- reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the LLM can reason (number of thinking tokens)
- when considering alternative possible responses and double-checking responses.
- Higher efforts here may produce better TLM trustworthiness scores and LLM responses. Reduce this value to improve latency/costs.
+ reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+ when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+ Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs.
log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
- custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria.
+ custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring.
The expected input format is a list of dictionaries, where each dictionary has the following keys:
- name: Name of the evaluation criteria.
- criteria: Instructions specifying the evaluation criteria.
"""
quality_preset: Literal["best", "high", "medium", "low", "base"]
+ """The quality preset to use for the TLM or Trustworthy RAG API."""
task: Optional[str]
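
[Editor's note] The TLM options documented in the docstrings above are passed as a plain dict. The sketch below builds one tuned for low runtimes/costs using only keys and recommendations taken from the docstring; it shows the dict alone, and deliberately does not assume which client method it is passed to.

    # Minimal sketch of a TLM options dict using only keys documented above.
    # Values follow the docstring's low-latency/low-cost advice; they are
    # illustrative choices, not defaults.
    low_cost_tlm_options = {
        "model": "gpt-4.1-nano",         # recommended for low latency/costs
        "max_tokens": 128,               # OpenAI models accept 64-4096
        "num_candidate_responses": 1,    # 1 = no auto-improvement in TLM.prompt()
        "num_consistency_samples": 4,    # fewer samples -> cheaper, noisier scores
        "use_self_reflection": True,
        "similarity_measure": "string",  # cheapest consistency measure
        "reasoning_effort": "low",
    }
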
From 0642776fb76e03811b6ecb70fc7cd6b8f5515fc3 Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
<142633134+stainless-app[bot]@users.noreply.github.com>
Date: Wed, 7 May 2025 17:16:56 +0000
Subject: [PATCH 5/7] codegen metadata
---
.stats.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.stats.yml b/.stats.yml
index bcd9e36c..d087a84d 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,3 +1,3 @@
configured_endpoints: 43
-openapi_spec_hash: f0d588a39e2040ff516a5cff26c4ab58
+openapi_spec_hash: 97719fe7ae4c641a5a020dd21f2978dd
config_hash: 5e459b33c53ffa6e554087a779bdb790
From 099ac1b68ffc0cbfc5ce4bd23fd5f9899e0b64f9 Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
<142633134+stainless-app[bot]@users.noreply.github.com>
Date: Wed, 7 May 2025 17:36:40 +0000
Subject: [PATCH 6/7] feat(api): add validate endpoint
---
.stats.yml | 4 +-
api.md | 2 +
src/codex/resources/projects/projects.py | 378 ++++++++++++++++++-
src/codex/types/__init__.py | 2 +
src/codex/types/project_validate_params.py | 169 +++++++++
src/codex/types/project_validate_response.py | 36 ++
tests/api_resources/test_projects.py | 195 ++++++++++
7 files changed, 782 insertions(+), 4 deletions(-)
create mode 100644 src/codex/types/project_validate_params.py
create mode 100644 src/codex/types/project_validate_response.py
diff --git a/.stats.yml b/.stats.yml
index d087a84d..f01e1b9f 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,3 +1,3 @@
-configured_endpoints: 43
+configured_endpoints: 44
openapi_spec_hash: 97719fe7ae4c641a5a020dd21f2978dd
-config_hash: 5e459b33c53ffa6e554087a779bdb790
+config_hash: 659f65b6ccf5612986f920f7f9abbcb5
diff --git a/api.md b/api.md
index 3bd2cf4f..f3a2ea14 100644
--- a/api.md
+++ b/api.md
@@ -142,6 +142,7 @@ from codex.types import (
ProjectExportResponse,
ProjectIncrementQueriesResponse,
ProjectRetrieveAnalyticsResponse,
+ ProjectValidateResponse,
)
```
@@ -155,6 +156,7 @@ Methods:
- client.projects.export(project_id) -> object
- client.projects.increment_queries(project_id, \*\*params) -> object
- client.projects.retrieve_analytics(project_id, \*\*params) -> ProjectRetrieveAnalyticsResponse
+- client.projects.validate(project_id, \*\*params) -> ProjectValidateResponse
## AccessKeys
diff --git a/src/codex/resources/projects/projects.py b/src/codex/resources/projects/projects.py
index 7c676b6d..b8fbaf7e 100644
--- a/src/codex/resources/projects/projects.py
+++ b/src/codex/resources/projects/projects.py
@@ -2,7 +2,7 @@
from __future__ import annotations
-from typing import Optional
+from typing import Dict, List, Optional
from typing_extensions import Literal
import httpx
@@ -11,6 +11,7 @@
project_list_params,
project_create_params,
project_update_params,
+ project_validate_params,
project_increment_queries_params,
project_retrieve_analytics_params,
)
@@ -23,7 +24,7 @@
AsyncEntriesResourceWithStreamingResponse,
)
from ..._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven
-from ..._utils import maybe_transform, async_maybe_transform
+from ..._utils import maybe_transform, strip_not_given, async_maybe_transform
from .clusters import (
ClustersResource,
AsyncClustersResource,
@@ -52,6 +53,7 @@
from ...types.project_list_response import ProjectListResponse
from ...types.project_return_schema import ProjectReturnSchema
from ...types.project_retrieve_response import ProjectRetrieveResponse
+from ...types.project_validate_response import ProjectValidateResponse
from ...types.project_retrieve_analytics_response import ProjectRetrieveAnalyticsResponse
__all__ = ["ProjectsResource", "AsyncProjectsResource"]
@@ -415,6 +417,186 @@ def retrieve_analytics(
cast_to=ProjectRetrieveAnalyticsResponse,
)
+ def validate(
+ self,
+ project_id: str,
+ *,
+ context: str,
+ prompt: str,
+ query: str,
+ response: str,
+ use_llm_matching: bool | NotGiven = NOT_GIVEN,
+ bad_response_thresholds: project_validate_params.BadResponseThresholds | NotGiven = NOT_GIVEN,
+ constrain_outputs: Optional[List[str]] | NotGiven = NOT_GIVEN,
+ custom_metadata: Optional[object] | NotGiven = NOT_GIVEN,
+ eval_scores: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
+ options: Optional[project_validate_params.Options] | NotGiven = NOT_GIVEN,
+ quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN,
+ task: Optional[str] | NotGiven = NOT_GIVEN,
+ x_client_library_version: str | NotGiven = NOT_GIVEN,
+ x_integration_type: str | NotGiven = NOT_GIVEN,
+ x_source: str | NotGiven = NOT_GIVEN,
+ x_stainless_package_version: str | NotGiven = NOT_GIVEN,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+ ) -> ProjectValidateResponse:
+ """
+ Evaluate whether a response, given the provided query and context, is
+ potentially bad. If the response is flagged as bad, a lookup is performed to
+ find an alternate expert answer. If there is no expert answer available, this
+ query will be recorded in the project for SMEs to answer.
+
+ Args:
+ custom_metadata: Arbitrary metadata supplied by the user/system
+
+ eval_scores: Evaluation scores to use for flagging a response as bad. If not provided, TLM
+ will be used to generate scores.
+
+ options: Typed dict of advanced configuration options for the Trustworthy Language Model.
+ Many of these configurations are determined by the quality preset selected
+ (learn about quality presets in the TLM [initialization method](./#class-tlm)).
+ Specifying TLMOptions values directly overrides any default values set from the
+ quality preset.
+
+ For all options described below, higher settings will lead to longer runtimes
+ and may consume more tokens internally. You may not be able to run long prompts
+ (or prompts with long responses) in your account, unless your token/rate limits
+ are increased. If you hit token limit issues, try lower/less expensive
+ TLMOptions to be able to run longer prompts/responses, or contact Cleanlab to
+ increase your limits.
+
+ The default values corresponding to each quality preset are:
+
+ - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8,
+ `use_self_reflection` = True. This preset improves LLM responses.
+ - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8,
+ `use_self_reflection` = True. This preset improves LLM responses.
+ - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8,
+ `use_self_reflection` = True.
+ - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4,
+ `use_self_reflection` = True.
+ - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0,
+ `use_self_reflection` = False. When using `get_trustworthiness_score()` on
+ "base" preset, a cheaper self-reflection will be used to compute the
+ trustworthiness score.
+
+ By default, the TLM uses the "medium" quality preset. The default base LLM
+ `model` used is "gpt-4o-mini", and `max_tokens` is 512 for all quality presets.
+ You can set custom values for these arguments regardless of the quality preset
+ specified.
+
+ Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
+ "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
+ "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
+ "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
+ "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
+ "gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
+ faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
+ "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
+ "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
+ "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
+ for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
+ "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
+ "gpt-4.1-nano", "nova-micro".
+
+ max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
+ Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
+ If you experience token/rate limit errors while using TLM, try lowering this number.
+ For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
+
+ num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+ `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+ This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+ Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs.
+ When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
+
+ num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring.
+ Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs.
+ Measuring consistency helps quantify the epistemic uncertainty associated with
+ strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
+ TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
+
+ use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
+ Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
+ Reflection helps quantify aleatoric uncertainty associated with challenging prompts
+ and catches responses that are noticeably incorrect/bad upon further analysis.
+
+ similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
+ trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
+ Supported similarity measures include: "semantic" (based on natural language inference),
+ "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
+ "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
+ and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
+
+ reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+ when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+ Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs.
+
+ log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
+ For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
+
+ custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring.
+ The expected input format is a list of dictionaries, where each dictionary has the following keys:
+ - name: Name of the evaluation criteria.
+ - criteria: Instructions specifying the evaluation criteria.
+
+ quality_preset: The quality preset to use for the TLM or Trustworthy RAG API.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ if not project_id:
+ raise ValueError(f"Expected a non-empty value for `project_id` but received {project_id!r}")
+ extra_headers = {
+ **strip_not_given(
+ {
+ "x-client-library-version": x_client_library_version,
+ "x-integration-type": x_integration_type,
+ "x-source": x_source,
+ "x-stainless-package-version": x_stainless_package_version,
+ }
+ ),
+ **(extra_headers or {}),
+ }
+ return self._post(
+ f"/api/projects/{project_id}/validate",
+ body=maybe_transform(
+ {
+ "context": context,
+ "prompt": prompt,
+ "query": query,
+ "response": response,
+ "bad_response_thresholds": bad_response_thresholds,
+ "constrain_outputs": constrain_outputs,
+ "custom_metadata": custom_metadata,
+ "eval_scores": eval_scores,
+ "options": options,
+ "quality_preset": quality_preset,
+ "task": task,
+ },
+ project_validate_params.ProjectValidateParams,
+ ),
+ options=make_request_options(
+ extra_headers=extra_headers,
+ extra_query=extra_query,
+ extra_body=extra_body,
+ timeout=timeout,
+ query=maybe_transform(
+ {"use_llm_matching": use_llm_matching}, project_validate_params.ProjectValidateParams
+ ),
+ ),
+ cast_to=ProjectValidateResponse,
+ )
+
class AsyncProjectsResource(AsyncAPIResource):
@cached_property
@@ -776,6 +958,186 @@ async def retrieve_analytics(
cast_to=ProjectRetrieveAnalyticsResponse,
)
+ async def validate(
+ self,
+ project_id: str,
+ *,
+ context: str,
+ prompt: str,
+ query: str,
+ response: str,
+ use_llm_matching: bool | NotGiven = NOT_GIVEN,
+ bad_response_thresholds: project_validate_params.BadResponseThresholds | NotGiven = NOT_GIVEN,
+ constrain_outputs: Optional[List[str]] | NotGiven = NOT_GIVEN,
+ custom_metadata: Optional[object] | NotGiven = NOT_GIVEN,
+ eval_scores: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
+ options: Optional[project_validate_params.Options] | NotGiven = NOT_GIVEN,
+ quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN,
+ task: Optional[str] | NotGiven = NOT_GIVEN,
+ x_client_library_version: str | NotGiven = NOT_GIVEN,
+ x_integration_type: str | NotGiven = NOT_GIVEN,
+ x_source: str | NotGiven = NOT_GIVEN,
+ x_stainless_package_version: str | NotGiven = NOT_GIVEN,
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
+ # The extra values given here take precedence over values defined on the client or passed to this method.
+ extra_headers: Headers | None = None,
+ extra_query: Query | None = None,
+ extra_body: Body | None = None,
+ timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+ ) -> ProjectValidateResponse:
+ """
+ Evaluate whether a response, given the provided query and context, is
+ potentially bad. If the response is flagged as bad, a lookup is performed to
+ find an alternate expert answer. If there is no expert answer available, this
+ query will be recorded in the project for SMEs to answer.
+
+ Args:
+ custom_metadata: Arbitrary metadata supplied by the user/system
+
+ eval_scores: Evaluation scores to use for flagging a response as bad. If not provided, TLM
+ will be used to generate scores.
+
+ options: Typed dict of advanced configuration options for the Trustworthy Language Model.
+ Many of these configurations are determined by the quality preset selected
+ (learn about quality presets in the TLM [initialization method](./#class-tlm)).
+ Specifying TLMOptions values directly overrides any default values set from the
+ quality preset.
+
+ For all options described below, higher settings will lead to longer runtimes
+ and may consume more tokens internally. You may not be able to run long prompts
+ (or prompts with long responses) in your account, unless your token/rate limits
+ are increased. If you hit token limit issues, try lower/less expensive
+ TLMOptions to be able to run longer prompts/responses, or contact Cleanlab to
+ increase your limits.
+
+ The default values corresponding to each quality preset are:
+
+ - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8,
+ `use_self_reflection` = True. This preset improves LLM responses.
+ - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8,
+ `use_self_reflection` = True. This preset improves LLM responses.
+ - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8,
+ `use_self_reflection` = True.
+ - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4,
+ `use_self_reflection` = True.
+ - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0,
+ `use_self_reflection` = False. When using `get_trustworthiness_score()` on
+ "base" preset, a cheaper self-reflection will be used to compute the
+ trustworthiness score.
+
+ By default, the TLM uses the "medium" quality preset. The default base LLM
+ `model` used is "gpt-4o-mini", and `max_tokens` is 512 for all quality presets.
+ You can set custom values for these arguments regardless of the quality preset
+ specified.
+
+ Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
+ "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
+ "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
+ "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
+ "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
+ "gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
+ faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
+ "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
+ "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
+ "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
+ for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
+ "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
+ "gpt-4.1-nano", "nova-micro".
+
+ max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
+ Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
+ If you experience token/rate limit errors while using TLM, try lowering this number.
+ For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
+
+ num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+ `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+ This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+ Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs.
+ When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
+
+ num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring.
+ Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs.
+ Measuring consistency helps quantify the epistemic uncertainty associated with
+ strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
+ TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
+
+ use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
+ Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
+ Reflection helps quantify aleatoric uncertainty associated with challenging prompts
+ and catches responses that are noticeably incorrect/bad upon further analysis.
+
+ similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
+ trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
+ Supported similarity measures include: "semantic" (based on natural language inference),
+ "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
+ "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
+ and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
+
+ reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+ when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+ Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs.
+
+ log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
+ For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
+
+ custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring.
+ The expected input format is a list of dictionaries, where each dictionary has the following keys:
+ - name: Name of the evaluation criteria.
+ - criteria: Instructions specifying the evaluation criteria.
+
+ quality_preset: The quality preset to use for the TLM or Trustworthy RAG API.
+
+ extra_headers: Send extra headers
+
+ extra_query: Add additional query parameters to the request
+
+ extra_body: Add additional JSON properties to the request
+
+ timeout: Override the client-level default timeout for this request, in seconds
+ """
+ if not project_id:
+ raise ValueError(f"Expected a non-empty value for `project_id` but received {project_id!r}")
+ extra_headers = {
+ **strip_not_given(
+ {
+ "x-client-library-version": x_client_library_version,
+ "x-integration-type": x_integration_type,
+ "x-source": x_source,
+ "x-stainless-package-version": x_stainless_package_version,
+ }
+ ),
+ **(extra_headers or {}),
+ }
+ return await self._post(
+ f"/api/projects/{project_id}/validate",
+ body=await async_maybe_transform(
+ {
+ "context": context,
+ "prompt": prompt,
+ "query": query,
+ "response": response,
+ "bad_response_thresholds": bad_response_thresholds,
+ "constrain_outputs": constrain_outputs,
+ "custom_metadata": custom_metadata,
+ "eval_scores": eval_scores,
+ "options": options,
+ "quality_preset": quality_preset,
+ "task": task,
+ },
+ project_validate_params.ProjectValidateParams,
+ ),
+ options=make_request_options(
+ extra_headers=extra_headers,
+ extra_query=extra_query,
+ extra_body=extra_body,
+ timeout=timeout,
+ query=await async_maybe_transform(
+ {"use_llm_matching": use_llm_matching}, project_validate_params.ProjectValidateParams
+ ),
+ ),
+ cast_to=ProjectValidateResponse,
+ )
+
class ProjectsResourceWithRawResponse:
def __init__(self, projects: ProjectsResource) -> None:
@@ -805,6 +1167,9 @@ def __init__(self, projects: ProjectsResource) -> None:
self.retrieve_analytics = to_raw_response_wrapper(
projects.retrieve_analytics,
)
+ self.validate = to_raw_response_wrapper(
+ projects.validate,
+ )
@cached_property
def access_keys(self) -> AccessKeysResourceWithRawResponse:
@@ -847,6 +1212,9 @@ def __init__(self, projects: AsyncProjectsResource) -> None:
self.retrieve_analytics = async_to_raw_response_wrapper(
projects.retrieve_analytics,
)
+ self.validate = async_to_raw_response_wrapper(
+ projects.validate,
+ )
@cached_property
def access_keys(self) -> AsyncAccessKeysResourceWithRawResponse:
@@ -889,6 +1257,9 @@ def __init__(self, projects: ProjectsResource) -> None:
self.retrieve_analytics = to_streamed_response_wrapper(
projects.retrieve_analytics,
)
+ self.validate = to_streamed_response_wrapper(
+ projects.validate,
+ )
@cached_property
def access_keys(self) -> AccessKeysResourceWithStreamingResponse:
@@ -931,6 +1302,9 @@ def __init__(self, projects: AsyncProjectsResource) -> None:
self.retrieve_analytics = async_to_streamed_response_wrapper(
projects.retrieve_analytics,
)
+ self.validate = async_to_streamed_response_wrapper(
+ projects.validate,
+ )
@cached_property
def access_keys(self) -> AsyncAccessKeysResourceWithStreamingResponse:
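
[Editor's note] Taken together, the new resource methods above let a caller validate a RAG response in one call. A hedged usage sketch follows; the `Codex` import path and the constructor's credential argument are assumptions inferred from the test fixtures, not confirmed by this patch.

    # Hedged sketch: call the new validate endpoint with the required fields.
    from codex import Codex  # assumed export; tests annotate clients as `Codex`

    client = Codex(api_key="YOUR_API_KEY")  # credential argument name is assumed
    result = client.projects.validate(
        project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
        context="retrieved context shown to the LLM",
        prompt="full prompt sent to the LLM",
        query="original user question",
        response="LLM response to evaluate",
        bad_response_thresholds={"trustworthiness": 0.7},  # illustrative threshold
    )
    if result.is_bad_response and result.expert_answer is not None:
        print("Using expert answer:", result.expert_answer)
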
diff --git a/src/codex/types/__init__.py b/src/codex/types/__init__.py
index 7f18b9c1..8e0cc4a4 100644
--- a/src/codex/types/__init__.py
+++ b/src/codex/types/__init__.py
@@ -12,7 +12,9 @@
from .project_list_response import ProjectListResponse as ProjectListResponse
from .project_return_schema import ProjectReturnSchema as ProjectReturnSchema
from .project_update_params import ProjectUpdateParams as ProjectUpdateParams
+from .project_validate_params import ProjectValidateParams as ProjectValidateParams
from .project_retrieve_response import ProjectRetrieveResponse as ProjectRetrieveResponse
+from .project_validate_response import ProjectValidateResponse as ProjectValidateResponse
from .organization_schema_public import OrganizationSchemaPublic as OrganizationSchemaPublic
from .user_activate_account_params import UserActivateAccountParams as UserActivateAccountParams
from .project_increment_queries_params import ProjectIncrementQueriesParams as ProjectIncrementQueriesParams
diff --git a/src/codex/types/project_validate_params.py b/src/codex/types/project_validate_params.py
new file mode 100644
index 00000000..f6214cbe
--- /dev/null
+++ b/src/codex/types/project_validate_params.py
@@ -0,0 +1,169 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from __future__ import annotations
+
+from typing import Dict, List, Iterable, Optional
+from typing_extensions import Literal, Required, Annotated, TypedDict
+
+from .._utils import PropertyInfo
+
+__all__ = ["ProjectValidateParams", "BadResponseThresholds", "Options"]
+
+
+class ProjectValidateParams(TypedDict, total=False):
+ context: Required[str]
+
+ prompt: Required[str]
+
+ query: Required[str]
+
+ response: Required[str]
+
+ use_llm_matching: bool
+
+ bad_response_thresholds: BadResponseThresholds
+
+ constrain_outputs: Optional[List[str]]
+
+ custom_metadata: Optional[object]
+ """Arbitrary metadata supplied by the user/system"""
+
+ eval_scores: Optional[Dict[str, float]]
+ """Evaluation scores to use for flagging a response as bad.
+
+ If not provided, TLM will be used to generate scores.
+ """
+
+ options: Optional[Options]
+ """
+ Typed dict of advanced configuration options for the Trustworthy Language Model.
+ Many of these configurations are determined by the quality preset selected
+ (learn about quality presets in the TLM [initialization method](./#class-tlm)).
+ Specifying TLMOptions values directly overrides any default values set from the
+ quality preset.
+
+ For all options described below, higher settings will lead to longer runtimes
+ and may consume more tokens internally. You may not be able to run long prompts
+ (or prompts with long responses) in your account, unless your token/rate limits
+ are increased. If you hit token limit issues, try lower/less expensive
+ TLMOptions to be able to run longer prompts/responses, or contact Cleanlab to
+ increase your limits.
+
+ The default values corresponding to each quality preset are:
+
+ - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8,
+ `use_self_reflection` = True. This preset improves LLM responses.
+ - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8,
+ `use_self_reflection` = True. This preset improves LLM responses.
+ - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8,
+ `use_self_reflection` = True.
+ - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4,
+ `use_self_reflection` = True.
+ - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0,
+ `use_self_reflection` = False. When using `get_trustworthiness_score()` on
+ "base" preset, a cheaper self-reflection will be used to compute the
+ trustworthiness score.
+
+ By default, the TLM uses the "medium" quality preset. The default base LLM
+ `model` used is "gpt-4o-mini", and `max_tokens` is 512 for all quality presets.
+ You can set custom values for these arguments regardless of the quality preset
+ specified.
+
+ Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
+ "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
+ "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
+ "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
+ "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
+ "gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
+ faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
+ "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
+ "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
+ "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
+ for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
+ "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
+ "gpt-4.1-nano", "nova-micro".
+
+ max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
+ Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
+ If you experience token/rate limit errors while using TLM, try lowering this number.
+ For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
+
+ num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+ `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+ This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+ Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs.
+ When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
+
+ num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring.
+ Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs.
+ Measuring consistency helps quantify the epistemic uncertainty associated with
+ strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
+ TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
+
+ use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
+ Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
+ Reflection helps quantify aleatoric uncertainty associated with challenging prompts
+ and catches responses that are noticeably incorrect/bad upon further analysis.
+
+ similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
+ trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
+ Supported similarity measures include: "semantic" (based on natural language inference),
+ "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
+ "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
+ and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
+
+ reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+ when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+ Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs.
+
+ log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
+ For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
+
+ custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring.
+ The expected input format is a list of dictionaries, where each dictionary has the following keys:
+ - name: Name of the evaluation criteria.
+ - criteria: Instructions specifying the evaluation criteria.
+ """
+
+ quality_preset: Literal["best", "high", "medium", "low", "base"]
+ """The quality preset to use for the TLM or Trustworthy RAG API."""
+
+ task: Optional[str]
+
+ x_client_library_version: Annotated[str, PropertyInfo(alias="x-client-library-version")]
+
+ x_integration_type: Annotated[str, PropertyInfo(alias="x-integration-type")]
+
+ x_source: Annotated[str, PropertyInfo(alias="x-source")]
+
+ x_stainless_package_version: Annotated[str, PropertyInfo(alias="x-stainless-package-version")]
+
+
+class BadResponseThresholds(TypedDict, total=False):
+ context_sufficiency: Optional[float]
+
+ query_ease: Optional[float]
+
+ response_helpfulness: Optional[float]
+
+ trustworthiness: Optional[float]
+
+
+class Options(TypedDict, total=False):
+ custom_eval_criteria: Iterable[object]
+
+ log: List[str]
+
+ max_tokens: int
+
+ model: str
+
+ num_candidate_responses: int
+
+ num_consistency_samples: int
+
+ reasoning_effort: str
+
+ similarity_measure: str
+
+ use_self_reflection: bool
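
[Editor's note] Because `BadResponseThresholds` and `Options` are `TypedDict`s declared with `total=False`, callers can supply partial plain dicts. A small sketch under that assumption; the threshold values are illustrative only.

    from codex.types.project_validate_params import BadResponseThresholds, Options

    # Scores below these (illustrative) thresholds flag the response as bad.
    thresholds: BadResponseThresholds = {
        "trustworthiness": 0.7,
        "response_helpfulness": 0.5,
        # "context_sufficiency" and "query_ease" can be omitted entirely
    }

    # Any subset of the documented TLM options may be supplied.
    tlm_options: Options = {
        "model": "gpt-4o-mini",
        "num_consistency_samples": 8,
        "use_self_reflection": True,
    }
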
diff --git a/src/codex/types/project_validate_response.py b/src/codex/types/project_validate_response.py
new file mode 100644
index 00000000..e2104360
--- /dev/null
+++ b/src/codex/types/project_validate_response.py
@@ -0,0 +1,36 @@
+# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+from typing import Dict, Optional
+
+from .._models import BaseModel
+
+__all__ = ["ProjectValidateResponse", "EvalScores"]
+
+
+class EvalScores(BaseModel):
+ is_bad: bool
+
+ score: Optional[float] = None
+
+ log: Optional[object] = None
+
+
+class ProjectValidateResponse(BaseModel):
+ eval_scores: Dict[str, EvalScores]
+ """
+ Evaluation scores for the original response along with a boolean flag, `is_bad`,
+ indicating whether the score is below the threshold.
+ """
+
+ expert_answer: Optional[str] = None
+ """
+ Alternate SME-provided answer from Codex if the response was flagged as bad and
+ an answer was found in the Codex Project, or None otherwise.
+ """
+
+ is_bad_response: bool
+ """True if the response is flagged as potentially bad, False otherwise.
+
+ When True, a lookup is performed, which logs this query in the project for SMEs
+ to answer, if it does not already exist.
+ """
diff --git a/tests/api_resources/test_projects.py b/tests/api_resources/test_projects.py
index 40f40eb7..f7ca6e01 100644
--- a/tests/api_resources/test_projects.py
+++ b/tests/api_resources/test_projects.py
@@ -12,6 +12,7 @@
ProjectListResponse,
ProjectReturnSchema,
ProjectRetrieveResponse,
+ ProjectValidateResponse,
ProjectRetrieveAnalyticsResponse,
)
from tests.utils import assert_matches_type
@@ -421,6 +422,103 @@ def test_path_params_retrieve_analytics(self, client: Codex) -> None:
project_id="",
)
+ @pytest.mark.skip()
+ @parametrize
+ def test_method_validate(self, client: Codex) -> None:
+ project = client.projects.validate(
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ context="context",
+ prompt="prompt",
+ query="query",
+ response="response",
+ )
+ assert_matches_type(ProjectValidateResponse, project, path=["response"])
+
+ @pytest.mark.skip()
+ @parametrize
+ def test_method_validate_with_all_params(self, client: Codex) -> None:
+ project = client.projects.validate(
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ context="context",
+ prompt="prompt",
+ query="query",
+ response="response",
+ use_llm_matching=True,
+ bad_response_thresholds={
+ "context_sufficiency": 0,
+ "query_ease": 0,
+ "response_helpfulness": 0,
+ "trustworthiness": 0,
+ },
+ constrain_outputs=["string"],
+ custom_metadata={},
+ eval_scores={"foo": 0},
+ options={
+ "custom_eval_criteria": [{}],
+ "log": ["string"],
+ "max_tokens": 0,
+ "model": "model",
+ "num_candidate_responses": 0,
+ "num_consistency_samples": 0,
+ "reasoning_effort": "reasoning_effort",
+ "similarity_measure": "similarity_measure",
+ "use_self_reflection": True,
+ },
+ quality_preset="best",
+ task="task",
+ x_client_library_version="x-client-library-version",
+ x_integration_type="x-integration-type",
+ x_source="x-source",
+ x_stainless_package_version="x-stainless-package-version",
+ )
+ assert_matches_type(ProjectValidateResponse, project, path=["response"])
+
+ @pytest.mark.skip()
+ @parametrize
+ def test_raw_response_validate(self, client: Codex) -> None:
+ response = client.projects.with_raw_response.validate(
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ context="context",
+ prompt="prompt",
+ query="query",
+ response="response",
+ )
+
+ assert response.is_closed is True
+ assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+ project = response.parse()
+ assert_matches_type(ProjectValidateResponse, project, path=["response"])
+
+ @pytest.mark.skip()
+ @parametrize
+ def test_streaming_response_validate(self, client: Codex) -> None:
+ with client.projects.with_streaming_response.validate(
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ context="context",
+ prompt="prompt",
+ query="query",
+ response="response",
+ ) as response:
+ assert not response.is_closed
+ assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+ project = response.parse()
+ assert_matches_type(ProjectValidateResponse, project, path=["response"])
+
+ assert cast(Any, response.is_closed) is True
+
+ @pytest.mark.skip()
+ @parametrize
+ def test_path_params_validate(self, client: Codex) -> None:
+ with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"):
+ client.projects.with_raw_response.validate(
+ project_id="",
+ context="context",
+ prompt="prompt",
+ query="query",
+ response="response",
+ )
+
class TestAsyncProjects:
parametrize = pytest.mark.parametrize("async_client", [False, True], indirect=True, ids=["loose", "strict"])
@@ -823,3 +921,100 @@ async def test_path_params_retrieve_analytics(self, async_client: AsyncCodex) ->
await async_client.projects.with_raw_response.retrieve_analytics(
project_id="",
)
+
+ @pytest.mark.skip()
+ @parametrize
+ async def test_method_validate(self, async_client: AsyncCodex) -> None:
+ project = await async_client.projects.validate(
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ context="context",
+ prompt="prompt",
+ query="query",
+ response="response",
+ )
+ assert_matches_type(ProjectValidateResponse, project, path=["response"])
+
+ @pytest.mark.skip()
+ @parametrize
+ async def test_method_validate_with_all_params(self, async_client: AsyncCodex) -> None:
+ project = await async_client.projects.validate(
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ context="context",
+ prompt="prompt",
+ query="query",
+ response="response",
+ use_llm_matching=True,
+ bad_response_thresholds={
+ "context_sufficiency": 0,
+ "query_ease": 0,
+ "response_helpfulness": 0,
+ "trustworthiness": 0,
+ },
+ constrain_outputs=["string"],
+ custom_metadata={},
+ eval_scores={"foo": 0},
+ options={
+ "custom_eval_criteria": [{}],
+ "log": ["string"],
+ "max_tokens": 0,
+ "model": "model",
+ "num_candidate_responses": 0,
+ "num_consistency_samples": 0,
+ "reasoning_effort": "reasoning_effort",
+ "similarity_measure": "similarity_measure",
+ "use_self_reflection": True,
+ },
+ quality_preset="best",
+ task="task",
+ x_client_library_version="x-client-library-version",
+ x_integration_type="x-integration-type",
+ x_source="x-source",
+ x_stainless_package_version="x-stainless-package-version",
+ )
+ assert_matches_type(ProjectValidateResponse, project, path=["response"])
+
+ @pytest.mark.skip()
+ @parametrize
+ async def test_raw_response_validate(self, async_client: AsyncCodex) -> None:
+ response = await async_client.projects.with_raw_response.validate(
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ context="context",
+ prompt="prompt",
+ query="query",
+ response="response",
+ )
+
+ assert response.is_closed is True
+ assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+ project = await response.parse()
+ assert_matches_type(ProjectValidateResponse, project, path=["response"])
+
+ @pytest.mark.skip()
+ @parametrize
+ async def test_streaming_response_validate(self, async_client: AsyncCodex) -> None:
+ async with async_client.projects.with_streaming_response.validate(
+ project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
+ context="context",
+ prompt="prompt",
+ query="query",
+ response="response",
+ ) as response:
+ assert not response.is_closed
+ assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+ project = await response.parse()
+ assert_matches_type(ProjectValidateResponse, project, path=["response"])
+
+ assert cast(Any, response.is_closed) is True
+
+ @pytest.mark.skip()
+ @parametrize
+ async def test_path_params_validate(self, async_client: AsyncCodex) -> None:
+ with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"):
+ await async_client.projects.with_raw_response.validate(
+ project_id="",
+ context="context",
+ prompt="prompt",
+ query="query",
+ response="response",
+ )
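
[Editor's note] The async tests mirror the sync ones one-for-one. For completeness, a hedged async usage sketch follows; the `AsyncCodex` export and no-argument construction (credentials resolved from the environment) are assumptions based on the test type hints.

    import asyncio

    from codex import AsyncCodex  # assumed export, matching the test annotations

    async def main() -> None:
        client = AsyncCodex()  # assumes credentials are picked up from the environment
        result = await client.projects.validate(
            project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e",
            context="context",
            prompt="prompt",
            query="query",
            response="response",
        )
        print(result.is_bad_response)

    asyncio.run(main())
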
From 332d23bf58fbada04fa6b9d7979759126dc84a1b Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
<142633134+stainless-app[bot]@users.noreply.github.com>
Date: Wed, 7 May 2025 18:33:29 +0000
Subject: [PATCH 7/7] release: 0.1.0-alpha.19
---
.release-please-manifest.json | 2 +-
CHANGELOG.md | 12 ++++++++++++
pyproject.toml | 2 +-
src/codex/_version.py | 2 +-
4 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/.release-please-manifest.json b/.release-please-manifest.json
index 3cf71e62..b386befd 100644
--- a/.release-please-manifest.json
+++ b/.release-please-manifest.json
@@ -1,3 +1,3 @@
{
- ".": "0.1.0-alpha.18"
+ ".": "0.1.0-alpha.19"
}
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6fd961c7..3032a1a6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,17 @@
# Changelog
+## 0.1.0-alpha.19 (2025-05-07)
+
+Full Changelog: [v0.1.0-alpha.18...v0.1.0-alpha.19](https://github.com/cleanlab/codex-python/compare/v0.1.0-alpha.18...v0.1.0-alpha.19)
+
+### Features
+
+* **api:** add validate endpoint ([099ac1b](https://github.com/cleanlab/codex-python/commit/099ac1b68ffc0cbfc5ce4bd23fd5f9899e0b64f9))
+* **api:** api update ([8e01ccd](https://github.com/cleanlab/codex-python/commit/8e01ccdc8e341d010e8989f30f4fd887effa1871))
+* **api:** api update ([61fdb7a](https://github.com/cleanlab/codex-python/commit/61fdb7aaaa6c2533ebcfdfe3c0aff31474e75d51))
+* **api:** api update ([a52c74a](https://github.com/cleanlab/codex-python/commit/a52c74a22fb720f10265021d057f34874f73846b))
+* **api:** api update ([6947764](https://github.com/cleanlab/codex-python/commit/69477646d7c8eff2bae01199949e4037771ba460))
+
## 0.1.0-alpha.18 (2025-04-24)
Full Changelog: [v0.1.0-alpha.17...v0.1.0-alpha.18](https://github.com/cleanlab/codex-python/compare/v0.1.0-alpha.17...v0.1.0-alpha.18)
diff --git a/pyproject.toml b/pyproject.toml
index 6cd58eba..13866c7e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "codex-sdk"
-version = "0.1.0-alpha.18"
+version = "0.1.0-alpha.19"
description = "Internal SDK used within cleanlab-codex package. Refer to https://pypi.org/project/cleanlab-codex/ instead."
dynamic = ["readme"]
license = "MIT"
diff --git a/src/codex/_version.py b/src/codex/_version.py
index 29c60372..87d42e64 100644
--- a/src/codex/_version.py
+++ b/src/codex/_version.py
@@ -1,4 +1,4 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
__title__ = "codex"
-__version__ = "0.1.0-alpha.18" # x-release-please-version
+__version__ = "0.1.0-alpha.19" # x-release-please-version