diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 3cf71e62..b386befd 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.1.0-alpha.18" + ".": "0.1.0-alpha.19" } \ No newline at end of file diff --git a/.stats.yml b/.stats.yml index 7a2f56a6..f01e1b9f 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ -configured_endpoints: 43 -openapi_spec_hash: b7beefbd38b4fcdd191cdb81a18a023b -config_hash: 5e459b33c53ffa6e554087a779bdb790 +configured_endpoints: 44 +openapi_spec_hash: 97719fe7ae4c641a5a020dd21f2978dd +config_hash: 659f65b6ccf5612986f920f7f9abbcb5 diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fd961c7..3032a1a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Changelog +## 0.1.0-alpha.19 (2025-05-07) + +Full Changelog: [v0.1.0-alpha.18...v0.1.0-alpha.19](https://github.com/cleanlab/codex-python/compare/v0.1.0-alpha.18...v0.1.0-alpha.19) + +### Features + +* **api:** add validate endpoint ([099ac1b](https://github.com/cleanlab/codex-python/commit/099ac1b68ffc0cbfc5ce4bd23fd5f9899e0b64f9)) +* **api:** api update ([8e01ccd](https://github.com/cleanlab/codex-python/commit/8e01ccdc8e341d010e8989f30f4fd887effa1871)) +* **api:** api update ([61fdb7a](https://github.com/cleanlab/codex-python/commit/61fdb7aaaa6c2533ebcfdfe3c0aff31474e75d51)) +* **api:** api update ([a52c74a](https://github.com/cleanlab/codex-python/commit/a52c74a22fb720f10265021d057f34874f73846b)) +* **api:** api update ([6947764](https://github.com/cleanlab/codex-python/commit/69477646d7c8eff2bae01199949e4037771ba460)) + ## 0.1.0-alpha.18 (2025-04-24) Full Changelog: [v0.1.0-alpha.17...v0.1.0-alpha.18](https://github.com/cleanlab/codex-python/compare/v0.1.0-alpha.17...v0.1.0-alpha.18) diff --git a/api.md b/api.md index f240632f..f3a2ea14 100644 --- a/api.md +++ b/api.md @@ -142,6 +142,7 @@ from codex.types import ( ProjectExportResponse, ProjectIncrementQueriesResponse, ProjectRetrieveAnalyticsResponse, + ProjectValidateResponse, ) ``` @@ -153,8 +154,9 @@ Methods: - client.projects.list(\*\*params) -> ProjectListResponse - client.projects.delete(project_id) -> None - client.projects.export(project_id) -> object -- client.projects.increment_queries(project_id) -> object +- client.projects.increment_queries(project_id, \*\*params) -> object - client.projects.retrieve_analytics(project_id, \*\*params) -> ProjectRetrieveAnalyticsResponse +- client.projects.validate(project_id, \*\*params) -> ProjectValidateResponse ## AccessKeys diff --git a/pyproject.toml b/pyproject.toml index 6cd58eba..13866c7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "codex-sdk" -version = "0.1.0-alpha.18" +version = "0.1.0-alpha.19" description = "Internal SDK used within cleanlab-codex package. Refer to https://pypi.org/project/cleanlab-codex/ instead." dynamic = ["readme"] license = "MIT" diff --git a/src/codex/_version.py b/src/codex/_version.py index 29c60372..87d42e64 100644 --- a/src/codex/_version.py +++ b/src/codex/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
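For orientation, here is a hedged usage sketch of two of the smaller changes in this release: the new `count` parameter on `increment_queries` (listed in the api.md excerpt above) and the `query_metadata` field added to `entries.query` in the hunk that follows. It is illustrative only, not part of the diff; the client construction and the shape of `query_metadata` are assumptions.

```python
# Illustrative sketch only, not part of the generated diff.
# Assumes the SDK's standard `Codex` client with credentials taken from the
# environment, and that `project_id` is the first positional argument of
# entries.query, as with the other project-scoped methods in this diff.
from codex import Codex

client = Codex()
project_id = "your-project-id"  # placeholder

# New optional `count` query parameter on increment_queries.
client.projects.increment_queries(project_id, count=3)

# New `query_metadata` field on entries.query; `client_metadata` is now
# deprecated in its favor. The exact shape of QueryMetadata is not shown in
# this diff, so an arbitrary logging dict is used as a placeholder.
client.projects.entries.query(
    project_id,
    question="How do I rotate my access key?",
    query_metadata={"session_id": "abc123"},
)
```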
__title__ = "codex" -__version__ = "0.1.0-alpha.18" # x-release-please-version +__version__ = "0.1.0-alpha.19" # x-release-please-version diff --git a/src/codex/resources/projects/entries.py b/src/codex/resources/projects/entries.py index a9e690b9..346dd353 100644 --- a/src/codex/resources/projects/entries.py +++ b/src/codex/resources/projects/entries.py @@ -319,6 +319,7 @@ def query( question: str, use_llm_matching: bool | NotGiven = NOT_GIVEN, client_metadata: Optional[object] | NotGiven = NOT_GIVEN, + query_metadata: Optional[entry_query_params.QueryMetadata] | NotGiven = NOT_GIVEN, x_client_library_version: str | NotGiven = NOT_GIVEN, x_integration_type: str | NotGiven = NOT_GIVEN, x_source: str | NotGiven = NOT_GIVEN, @@ -334,6 +335,10 @@ def query( Query Entries Route Args: + client_metadata: Deprecated: Use query_metadata instead + + query_metadata: Optional logging data that can be provided by the client. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -361,6 +366,7 @@ def query( { "question": question, "client_metadata": client_metadata, + "query_metadata": query_metadata, }, entry_query_params.EntryQueryParams, ), @@ -708,6 +714,7 @@ async def query( question: str, use_llm_matching: bool | NotGiven = NOT_GIVEN, client_metadata: Optional[object] | NotGiven = NOT_GIVEN, + query_metadata: Optional[entry_query_params.QueryMetadata] | NotGiven = NOT_GIVEN, x_client_library_version: str | NotGiven = NOT_GIVEN, x_integration_type: str | NotGiven = NOT_GIVEN, x_source: str | NotGiven = NOT_GIVEN, @@ -723,6 +730,10 @@ async def query( Query Entries Route Args: + client_metadata: Deprecated: Use query_metadata instead + + query_metadata: Optional logging data that can be provided by the client. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -750,6 +761,7 @@ async def query( { "question": question, "client_metadata": client_metadata, + "query_metadata": query_metadata, }, entry_query_params.EntryQueryParams, ), diff --git a/src/codex/resources/projects/projects.py b/src/codex/resources/projects/projects.py index bd50a684..b8fbaf7e 100644 --- a/src/codex/resources/projects/projects.py +++ b/src/codex/resources/projects/projects.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Optional +from typing import Dict, List, Optional from typing_extensions import Literal import httpx @@ -11,6 +11,8 @@ project_list_params, project_create_params, project_update_params, + project_validate_params, + project_increment_queries_params, project_retrieve_analytics_params, ) from .entries import ( @@ -22,7 +24,7 @@ AsyncEntriesResourceWithStreamingResponse, ) from ..._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven -from ..._utils import maybe_transform, async_maybe_transform +from ..._utils import maybe_transform, strip_not_given, async_maybe_transform from .clusters import ( ClustersResource, AsyncClustersResource, @@ -51,6 +53,7 @@ from ...types.project_list_response import ProjectListResponse from ...types.project_return_schema import ProjectReturnSchema from ...types.project_retrieve_response import ProjectRetrieveResponse +from ...types.project_validate_response import ProjectValidateResponse from ...types.project_retrieve_analytics_response import ProjectRetrieveAnalyticsResponse __all__ = ["ProjectsResource", "AsyncProjectsResource"] @@ -331,6 +334,7 @@ def increment_queries( self, project_id: str, *, + count: int | NotGiven = NOT_GIVEN, # Use the following 
arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -355,7 +359,11 @@ def increment_queries( return self._post( f"/api/projects/{project_id}/increment_queries", options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout, + query=maybe_transform({"count": count}, project_increment_queries_params.ProjectIncrementQueriesParams), ), cast_to=object, ) @@ -409,6 +417,186 @@ def retrieve_analytics( cast_to=ProjectRetrieveAnalyticsResponse, ) + def validate( + self, + project_id: str, + *, + context: str, + prompt: str, + query: str, + response: str, + use_llm_matching: bool | NotGiven = NOT_GIVEN, + bad_response_thresholds: project_validate_params.BadResponseThresholds | NotGiven = NOT_GIVEN, + constrain_outputs: Optional[List[str]] | NotGiven = NOT_GIVEN, + custom_metadata: Optional[object] | NotGiven = NOT_GIVEN, + eval_scores: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN, + options: Optional[project_validate_params.Options] | NotGiven = NOT_GIVEN, + quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN, + task: Optional[str] | NotGiven = NOT_GIVEN, + x_client_library_version: str | NotGiven = NOT_GIVEN, + x_integration_type: str | NotGiven = NOT_GIVEN, + x_source: str | NotGiven = NOT_GIVEN, + x_stainless_package_version: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> ProjectValidateResponse: + """ + Evaluate whether a response, given the provided query and context, is + potentially bad. If the response is flagged as bad, a lookup is performed to + find an alternate expert answer. If there is no expert answer available, this + query will be recorded in the project for SMEs to answer. + + Args: + custom_metadata: Arbitrary metadata supplied by the user/system + + eval_scores: Evaluation scores to use for flagging a response as bad. If not provided, TLM + will be used to generate scores. + + options: Typed dict of advanced configuration options for the Trustworthy Language Model. + Many of these configurations are determined by the quality preset selected + (learn about quality presets in the TLM [initialization method](./#class-tlm)). + Specifying TLMOptions values directly overrides any default values set from the + quality preset. + + For all options described below, higher settings will lead to longer runtimes + and may consume more tokens internally. You may not be able to run long prompts + (or prompts with long responses) in your account, unless your token/rate limits + are increased. If you hit token limit issues, try lower/less expensive + TLMOptions to be able to run longer prompts/responses, or contact Cleanlab to + increase your limits. + + The default values corresponding to each quality preset are: + + - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, + `use_self_reflection` = True. 
This preset improves LLM responses. + - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, + `use_self_reflection` = True. + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, + `use_self_reflection` = True. + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, + `use_self_reflection` = False. When using `get_trustworthiness_score()` on + "base" preset, a cheaper self-reflection will be used to compute the + trustworthiness score. + + By default, the TLM uses the "medium" quality preset. The default base LLM + `model` used is "gpt-4o-mini", and `max_tokens` is 512 for all quality presets. + You can set custom values for these arguments regardless of the quality preset + specified. + + Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini", + "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4", + "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet", + "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", + "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = + "gpt-4o-mini"): Underlying base LLM to use (better models yield better results, + faster models yield faster/cheaper results). - Models still in beta: "o3", "o1", + "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", + "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", + "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models + for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet", + "claude-3.5-sonnet-v2". - Recommended models for low latency/costs: + "gpt-4.1-nano", "nova-micro". + + max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring). + Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs. + If you experience token/rate limit errors while using TLM, try lowering this number. + For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512. + + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs. + When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. + + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring. + Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs. + Measuring consistency helps quantify the epistemic uncertainty associated with + strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. 
+ + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. + + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. + Supported similarity measures include: "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs. + + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. + Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs. + + log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. + For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. + + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria beyond the built-in trustworthiness scoring. + The expected input format is a list of dictionaries, where each dictionary has the following keys: + - name: Name of the evaluation criteria. + - criteria: Instructions specifying the evaluation criteria. + + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. 
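Putting the arguments documented above together, a minimal usage sketch of the new endpoint might look like the following (illustrative only, not part of the diff; the response fields are not shown in this changeset, so the result is printed rather than accessed by attribute):

```python
# Minimal sketch of calling the new validate endpoint with the sync client.
from codex import Codex

client = Codex()

result = client.projects.validate(
    "your-project-id",
    context="Acme's refund window is 30 days from delivery.",
    prompt="Answer using only the provided context: Acme's refund window is 30 days from delivery.",
    query="How long do I have to return an item?",
    response="You can return items within 30 days of delivery.",
    quality_preset="medium",                      # optional; defaults documented above
    custom_metadata={"channel": "support-bot"},   # arbitrary caller-supplied metadata
)
print(result)  # ProjectValidateResponse; its fields are not listed in this diff
```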
+ + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + if not project_id: + raise ValueError(f"Expected a non-empty value for `project_id` but received {project_id!r}") + extra_headers = { + **strip_not_given( + { + "x-client-library-version": x_client_library_version, + "x-integration-type": x_integration_type, + "x-source": x_source, + "x-stainless-package-version": x_stainless_package_version, + } + ), + **(extra_headers or {}), + } + return self._post( + f"/api/projects/{project_id}/validate", + body=maybe_transform( + { + "context": context, + "prompt": prompt, + "query": query, + "response": response, + "bad_response_thresholds": bad_response_thresholds, + "constrain_outputs": constrain_outputs, + "custom_metadata": custom_metadata, + "eval_scores": eval_scores, + "options": options, + "quality_preset": quality_preset, + "task": task, + }, + project_validate_params.ProjectValidateParams, + ), + options=make_request_options( + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout, + query=maybe_transform( + {"use_llm_matching": use_llm_matching}, project_validate_params.ProjectValidateParams + ), + ), + cast_to=ProjectValidateResponse, + ) + class AsyncProjectsResource(AsyncAPIResource): @cached_property @@ -685,6 +873,7 @@ async def increment_queries( self, project_id: str, *, + count: int | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -709,7 +898,13 @@ async def increment_queries( return await self._post( f"/api/projects/{project_id}/increment_queries", options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout, + query=await async_maybe_transform( + {"count": count}, project_increment_queries_params.ProjectIncrementQueriesParams + ), ), cast_to=object, ) @@ -763,6 +958,186 @@ async def retrieve_analytics( cast_to=ProjectRetrieveAnalyticsResponse, ) + async def validate( + self, + project_id: str, + *, + context: str, + prompt: str, + query: str, + response: str, + use_llm_matching: bool | NotGiven = NOT_GIVEN, + bad_response_thresholds: project_validate_params.BadResponseThresholds | NotGiven = NOT_GIVEN, + constrain_outputs: Optional[List[str]] | NotGiven = NOT_GIVEN, + custom_metadata: Optional[object] | NotGiven = NOT_GIVEN, + eval_scores: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN, + options: Optional[project_validate_params.Options] | NotGiven = NOT_GIVEN, + quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN, + task: Optional[str] | NotGiven = NOT_GIVEN, + x_client_library_version: str | NotGiven = NOT_GIVEN, + x_integration_type: str | NotGiven = NOT_GIVEN, + x_source: str | NotGiven = NOT_GIVEN, + x_stainless_package_version: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> ProjectValidateResponse: + """ + Evaluate whether a response, given the provided query and context, is + potentially bad. If the response is flagged as bad, a lookup is performed to + find an alternate expert answer. If there is no expert answer available, this + query will be recorded in the project for SMEs to answer. + + Args: + custom_metadata: Arbitrary metadata supplied by the user/system + + eval_scores: Evaluation scores to use for flagging a response as bad. If not provided, TLM + will be used to generate scores. + + options: Typed dict of advanced configuration options for the Trustworthy Language Model. + Many of these configurations are determined by the quality preset selected + (learn about quality presets in the TLM [initialization method](./#class-tlm)). + Specifying TLMOptions values directly overrides any default values set from the + quality preset. + + For all options described below, higher settings will lead to longer runtimes + and may consume more tokens internally. You may not be able to run long prompts + (or prompts with long responses) in your account, unless your token/rate limits + are increased. If you hit token limit issues, try lower/less expensive + TLMOptions to be able to run longer prompts/responses, or contact Cleanlab to + increase your limits. + + The default values corresponding to each quality preset are: + + - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, + `use_self_reflection` = True. + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, + `use_self_reflection` = True. + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, + `use_self_reflection` = False. When using `get_trustworthiness_score()` on + "base" preset, a cheaper self-reflection will be used to compute the + trustworthiness score. + + By default, the TLM uses the "medium" quality preset. The default base LLM + `model` used is "gpt-4o-mini", and `max_tokens` is 512 for all quality presets. + You can set custom values for these arguments regardless of the quality preset + specified. + + Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini", + "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4", + "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet", + "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", + "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = + "gpt-4o-mini"): Underlying base LLM to use (better models yield better results, + faster models yield faster/cheaper results). - Models still in beta: "o3", "o1", + "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", + "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", + "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models + for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet", + "claude-3.5-sonnet-v2". - Recommended models for low latency/costs: + "gpt-4.1-nano", "nova-micro". 
+ + max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring). + Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs. + If you experience token/rate limit errors while using TLM, try lowering this number. + For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512. + + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs. + When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. + + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring. + Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs. + Measuring consistency helps quantify the epistemic uncertainty associated with + strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. + + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. + + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. + Supported similarity measures include: "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs. + + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. + Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs. + + log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. + For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. 
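For the async variant, a hedged sketch passing precomputed `eval_scores` and an `options` dict built from the settings documented above (the `AsyncCodex` client name and the `"trustworthiness"` score key are assumptions; neither is spelled out in this hunk):

```python
# Hedged sketch of the async validate call; not part of the generated diff.
import asyncio

from codex import AsyncCodex  # assumed async client class name


async def main() -> None:
    client = AsyncCodex()
    result = await client.projects.validate(
        "your-project-id",
        context="...retrieved context...",
        prompt="...full prompt sent to the LLM...",
        query="...end-user question...",
        response="...LLM response to evaluate...",
        # Supplying eval_scores skips TLM scoring; the key name is an assumption.
        eval_scores={"trustworthiness": 0.42},
        # Options keys follow the TLMOptions documentation above.
        options={
            "model": "gpt-4o-mini",
            "reasoning_effort": "low",  # cheaper, per the docs above
            "log": ["explanation"],
        },
    )
    print(result)


asyncio.run(main())
```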
+ + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria beyond the built-in trustworthiness scoring. + The expected input format is a list of dictionaries, where each dictionary has the following keys: + - name: Name of the evaluation criteria. + - criteria: Instructions specifying the evaluation criteria. + + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + if not project_id: + raise ValueError(f"Expected a non-empty value for `project_id` but received {project_id!r}") + extra_headers = { + **strip_not_given( + { + "x-client-library-version": x_client_library_version, + "x-integration-type": x_integration_type, + "x-source": x_source, + "x-stainless-package-version": x_stainless_package_version, + } + ), + **(extra_headers or {}), + } + return await self._post( + f"/api/projects/{project_id}/validate", + body=await async_maybe_transform( + { + "context": context, + "prompt": prompt, + "query": query, + "response": response, + "bad_response_thresholds": bad_response_thresholds, + "constrain_outputs": constrain_outputs, + "custom_metadata": custom_metadata, + "eval_scores": eval_scores, + "options": options, + "quality_preset": quality_preset, + "task": task, + }, + project_validate_params.ProjectValidateParams, + ), + options=make_request_options( + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout, + query=await async_maybe_transform( + {"use_llm_matching": use_llm_matching}, project_validate_params.ProjectValidateParams + ), + ), + cast_to=ProjectValidateResponse, + ) + class ProjectsResourceWithRawResponse: def __init__(self, projects: ProjectsResource) -> None: @@ -792,6 +1167,9 @@ def __init__(self, projects: ProjectsResource) -> None: self.retrieve_analytics = to_raw_response_wrapper( projects.retrieve_analytics, ) + self.validate = to_raw_response_wrapper( + projects.validate, + ) @cached_property def access_keys(self) -> AccessKeysResourceWithRawResponse: @@ -834,6 +1212,9 @@ def __init__(self, projects: AsyncProjectsResource) -> None: self.retrieve_analytics = async_to_raw_response_wrapper( projects.retrieve_analytics, ) + self.validate = async_to_raw_response_wrapper( + projects.validate, + ) @cached_property def access_keys(self) -> AsyncAccessKeysResourceWithRawResponse: @@ -876,6 +1257,9 @@ def __init__(self, projects: ProjectsResource) -> None: self.retrieve_analytics = to_streamed_response_wrapper( projects.retrieve_analytics, ) + self.validate = to_streamed_response_wrapper( + projects.validate, + ) @cached_property def access_keys(self) -> AccessKeysResourceWithStreamingResponse: @@ -918,6 +1302,9 @@ def __init__(self, projects: AsyncProjectsResource) -> None: self.retrieve_analytics = async_to_streamed_response_wrapper( projects.retrieve_analytics, ) + self.validate = async_to_streamed_response_wrapper( + projects.validate, + ) @cached_property def access_keys(self) -> AsyncAccessKeysResourceWithStreamingResponse: diff --git a/src/codex/resources/tlm.py b/src/codex/resources/tlm.py index 78f97e2e..5e4cd7e1 100644 --- a/src/codex/resources/tlm.py +++ b/src/codex/resources/tlm.py @@ -97,59 +97,63 @@ def prompt( You can set custom values for these arguments regardless of the quality preset 
specified. - Args: model ({"gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "o1-preview", - "gpt-3.5-turbo-16k", "gpt-4", "gpt-4.5-preview", "claude-3.7-sonnet", + Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini", + "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4", + "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4o-mini"): Underlying base LLM to use (better models yield better results, - faster models yield faster/cheaper results). - Models still in beta: "o1", - "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-3.7-sonnet", - "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite", - "nova-pro". - Recommended models for accuracy: "gpt-4o", "o3-mini", "o1", - "claude-3.7-sonnet". - Recommended models for low latency/costs: "nova-micro", - "gpt-4o-mini". + faster models yield faster/cheaper results). - Models still in beta: "o3", "o1", + "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", + "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", + "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models + for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet", + "claude-3.5-sonnet-v2". - Recommended models for low latency/costs: + "gpt-4.1-nano", "nova-micro". max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring). - Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes. + Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs. If you experience token/rate limit errors while using TLM, try lowering this number. For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512. - num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM. - TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one. - Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens). - This parameter must be between 1 and 20. - When it is 1, TLM simply returns a standard LLM response and does not attempt to auto-improve it. + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs. + When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. - num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM response consistency. - Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher costs/runtimes. 
- This consistency helps quantify the epistemic uncertainty associated with + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring. + Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs. + Measuring consistency helps quantify the epistemic uncertainty associated with strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. - TLM internally measures consistency via the degree of contradiction between sampled responses that the model considers equally plausible. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it - generated and self-evaluate this response. - Setting this False disables self-reflection and may worsen trustworthiness scores, but will reduce costs/runtimes. - Self-reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches answers that are obviously incorrect/bad. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large"}, default = "semantic"): how the trustworthiness scoring algorithm measures - similarity between sampled responses considered by the model in the consistency assessment. - Supported similarity measures include "semantic" (based on natural language inference), "string" (based on character/word overlap), - "embedding" (based on embedding similarity), and "embedding_large" (based on embedding similarity with a larger embedding model). - Set this to "string" to improve latency/costs. + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. + Supported similarity measures include: "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs. - reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the LLM can reason (number of thinking tokens) - when considering alternative possible responses and double-checking responses. - Higher efforts here may produce better TLM trustworthiness scores and LLM responses. Reduce this value to improve latency/costs. + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. 
+ Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs. log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. - custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria. + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria beyond the built-in trustworthiness scoring. The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -234,59 +238,63 @@ def score( You can set custom values for these arguments regardless of the quality preset specified. - Args: model ({"gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "o1-preview", - "gpt-3.5-turbo-16k", "gpt-4", "gpt-4.5-preview", "claude-3.7-sonnet", + Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini", + "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4", + "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4o-mini"): Underlying base LLM to use (better models yield better results, - faster models yield faster/cheaper results). - Models still in beta: "o1", - "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-3.7-sonnet", - "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite", - "nova-pro". - Recommended models for accuracy: "gpt-4o", "o3-mini", "o1", - "claude-3.7-sonnet". - Recommended models for low latency/costs: "nova-micro", - "gpt-4o-mini". + faster models yield faster/cheaper results). - Models still in beta: "o3", "o1", + "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", + "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", + "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models + for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet", + "claude-3.5-sonnet-v2". - Recommended models for low latency/costs: + "gpt-4.1-nano", "nova-micro". max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring). - Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes. + Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs. If you experience token/rate limit errors while using TLM, try lowering this number. For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512. - num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM. - TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one. - Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens). 
- This parameter must be between 1 and 20. - When it is 1, TLM simply returns a standard LLM response and does not attempt to auto-improve it. + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs. + When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. - num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM response consistency. - Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher costs/runtimes. - This consistency helps quantify the epistemic uncertainty associated with + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring. + Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs. + Measuring consistency helps quantify the epistemic uncertainty associated with strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. - TLM internally measures consistency via the degree of contradiction between sampled responses that the model considers equally plausible. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it - generated and self-evaluate this response. - Setting this False disables self-reflection and may worsen trustworthiness scores, but will reduce costs/runtimes. - Self-reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches answers that are obviously incorrect/bad. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large"}, default = "semantic"): how the trustworthiness scoring algorithm measures - similarity between sampled responses considered by the model in the consistency assessment. - Supported similarity measures include "semantic" (based on natural language inference), "string" (based on character/word overlap), - "embedding" (based on embedding similarity), and "embedding_large" (based on embedding similarity with a larger embedding model). - Set this to "string" to improve latency/costs. + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. 
+ Supported similarity measures include: "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs. - reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the LLM can reason (number of thinking tokens) - when considering alternative possible responses and double-checking responses. - Higher efforts here may produce better TLM trustworthiness scores and LLM responses. Reduce this value to improve latency/costs. + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. + Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs. log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. - custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria. + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria beyond the built-in trustworthiness scoring. The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -387,59 +395,63 @@ async def prompt( You can set custom values for these arguments regardless of the quality preset specified. - Args: model ({"gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "o1-preview", - "gpt-3.5-turbo-16k", "gpt-4", "gpt-4.5-preview", "claude-3.7-sonnet", + Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini", + "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4", + "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4o-mini"): Underlying base LLM to use (better models yield better results, - faster models yield faster/cheaper results). - Models still in beta: "o1", - "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-3.7-sonnet", - "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite", - "nova-pro". - Recommended models for accuracy: "gpt-4o", "o3-mini", "o1", - "claude-3.7-sonnet". - Recommended models for low latency/costs: "nova-micro", - "gpt-4o-mini". + faster models yield faster/cheaper results). - Models still in beta: "o3", "o1", + "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", + "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", + "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models + for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet", + "claude-3.5-sonnet-v2". 
- Recommended models for low latency/costs: + "gpt-4.1-nano", "nova-micro". max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring). - Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes. + Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs. If you experience token/rate limit errors while using TLM, try lowering this number. For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512. - num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM. - TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one. - Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens). - This parameter must be between 1 and 20. - When it is 1, TLM simply returns a standard LLM response and does not attempt to auto-improve it. + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs. + When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. - num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM response consistency. - Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher costs/runtimes. - This consistency helps quantify the epistemic uncertainty associated with + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring. + Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs. + Measuring consistency helps quantify the epistemic uncertainty associated with strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. - TLM internally measures consistency via the degree of contradiction between sampled responses that the model considers equally plausible. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it - generated and self-evaluate this response. - Setting this False disables self-reflection and may worsen trustworthiness scores, but will reduce costs/runtimes. - Self-reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches answers that are obviously incorrect/bad. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. 
+ Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large"}, default = "semantic"): how the trustworthiness scoring algorithm measures - similarity between sampled responses considered by the model in the consistency assessment. - Supported similarity measures include "semantic" (based on natural language inference), "string" (based on character/word overlap), - "embedding" (based on embedding similarity), and "embedding_large" (based on embedding similarity with a larger embedding model). - Set this to "string" to improve latency/costs. + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. + Supported similarity measures include: "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs. - reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the LLM can reason (number of thinking tokens) - when considering alternative possible responses and double-checking responses. - Higher efforts here may produce better TLM trustworthiness scores and LLM responses. Reduce this value to improve latency/costs. + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. + Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs. log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. - custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria. + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria beyond the built-in trustworthiness scoring. The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -524,59 +536,63 @@ async def score( You can set custom values for these arguments regardless of the quality preset specified. 
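The docstring hunks above and below rework the TLMOptions documentation (new models, the added "code" and "discrepancy" similarity measures, reworded option descriptions). As a rough sketch of how those options map onto a request (illustrative only; the parameter names of `client.tlm.prompt` are not visible in these hunks and are assumed):

```python
# Sketch only: build a TLM options dict from the values documented above and
# pass it to the TLM prompt endpoint. The `prompt=`/`options=` parameter names
# are assumptions; they are not shown in these docstring hunks.
from codex import Codex

client = Codex()

tlm_options = {
    "model": "gpt-4.1-nano",         # recommended above for low latency/costs
    "similarity_measure": "string",  # cheapest consistency measure
    "reasoning_effort": "low",
    "num_consistency_samples": 4,
    "log": ["explanation"],
}

result = client.tlm.prompt(
    prompt="Summarize the refund policy in one sentence.",
    options=tlm_options,
    quality_preset="low",
)
print(result)
```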
- Args: model ({"gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "o1-preview", - "gpt-3.5-turbo-16k", "gpt-4", "gpt-4.5-preview", "claude-3.7-sonnet", + Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini", + "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4", + "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4o-mini"): Underlying base LLM to use (better models yield better results, - faster models yield faster/cheaper results). - Models still in beta: "o1", - "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-3.7-sonnet", - "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite", - "nova-pro". - Recommended models for accuracy: "gpt-4o", "o3-mini", "o1", - "claude-3.7-sonnet". - Recommended models for low latency/costs: "nova-micro", - "gpt-4o-mini". + faster models yield faster/cheaper results). - Models still in beta: "o3", "o1", + "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", + "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", + "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models + for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet", + "claude-3.5-sonnet-v2". - Recommended models for low latency/costs: + "gpt-4.1-nano", "nova-micro". max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring). - Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes. + Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs. If you experience token/rate limit errors while using TLM, try lowering this number. For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512. - num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM. - TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one. - Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens). - This parameter must be between 1 and 20. - When it is 1, TLM simply returns a standard LLM response and does not attempt to auto-improve it. + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs. + When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. - num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM response consistency. - Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher costs/runtimes. 
- This consistency helps quantify the epistemic uncertainty associated with + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring. + Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs. + Measuring consistency helps quantify the epistemic uncertainty associated with strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. - TLM internally measures consistency via the degree of contradiction between sampled responses that the model considers equally plausible. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it - generated and self-evaluate this response. - Setting this False disables self-reflection and may worsen trustworthiness scores, but will reduce costs/runtimes. - Self-reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches answers that are obviously incorrect/bad. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large"}, default = "semantic"): how the trustworthiness scoring algorithm measures - similarity between sampled responses considered by the model in the consistency assessment. - Supported similarity measures include "semantic" (based on natural language inference), "string" (based on character/word overlap), - "embedding" (based on embedding similarity), and "embedding_large" (based on embedding similarity with a larger embedding model). - Set this to "string" to improve latency/costs. + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. + Supported similarity measures include: "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs. - reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the LLM can reason (number of thinking tokens) - when considering alternative possible responses and double-checking responses. - Higher efforts here may produce better TLM trustworthiness scores and LLM responses. Reduce this value to improve latency/costs. + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. 
+ Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs. log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. - custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria. + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring. The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. + extra_headers: Send extra headers extra_query: Add additional query parameters to the request diff --git a/src/codex/types/__init__.py b/src/codex/types/__init__.py index 53d1ab6b..8e0cc4a4 100644 --- a/src/codex/types/__init__.py +++ b/src/codex/types/__init__.py @@ -12,9 +12,12 @@ from .project_list_response import ProjectListResponse as ProjectListResponse from .project_return_schema import ProjectReturnSchema as ProjectReturnSchema from .project_update_params import ProjectUpdateParams as ProjectUpdateParams +from .project_validate_params import ProjectValidateParams as ProjectValidateParams from .project_retrieve_response import ProjectRetrieveResponse as ProjectRetrieveResponse +from .project_validate_response import ProjectValidateResponse as ProjectValidateResponse from .organization_schema_public import OrganizationSchemaPublic as OrganizationSchemaPublic from .user_activate_account_params import UserActivateAccountParams as UserActivateAccountParams +from .project_increment_queries_params import ProjectIncrementQueriesParams as ProjectIncrementQueriesParams from .project_retrieve_analytics_params import ProjectRetrieveAnalyticsParams as ProjectRetrieveAnalyticsParams from .organization_list_members_response import OrganizationListMembersResponse as OrganizationListMembersResponse from .project_retrieve_analytics_response import ProjectRetrieveAnalyticsResponse as ProjectRetrieveAnalyticsResponse diff --git a/src/codex/types/project_increment_queries_params.py b/src/codex/types/project_increment_queries_params.py new file mode 100644 index 00000000..f6043a76 --- /dev/null +++ b/src/codex/types/project_increment_queries_params.py @@ -0,0 +1,11 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing_extensions import TypedDict + +__all__ = ["ProjectIncrementQueriesParams"] + + +class ProjectIncrementQueriesParams(TypedDict, total=False): + count: int diff --git a/src/codex/types/project_validate_params.py b/src/codex/types/project_validate_params.py new file mode 100644 index 00000000..f6214cbe --- /dev/null +++ b/src/codex/types/project_validate_params.py @@ -0,0 +1,169 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+ +from __future__ import annotations + +from typing import Dict, List, Iterable, Optional +from typing_extensions import Literal, Required, Annotated, TypedDict + +from .._utils import PropertyInfo + +__all__ = ["ProjectValidateParams", "BadResponseThresholds", "Options"] + + +class ProjectValidateParams(TypedDict, total=False): + context: Required[str] + + prompt: Required[str] + + query: Required[str] + + response: Required[str] + + use_llm_matching: bool + + bad_response_thresholds: BadResponseThresholds + + constrain_outputs: Optional[List[str]] + + custom_metadata: Optional[object] + """Arbitrary metadata supplied by the user/system""" + + eval_scores: Optional[Dict[str, float]] + """Evaluation scores to use for flagging a response as bad. + + If not provided, TLM will be used to generate scores. + """ + + options: Optional[Options] + """ + Typed dict of advanced configuration options for the Trustworthy Language Model. + Many of these configurations are determined by the quality preset selected + (learn about quality presets in the TLM [initialization method](./#class-tlm)). + Specifying TLMOptions values directly overrides any default values set from the + quality preset. + + For all options described below, higher settings will lead to longer runtimes + and may consume more tokens internally. You may not be able to run long prompts + (or prompts with long responses) in your account, unless your token/rate limits + are increased. If you hit token limit issues, try lower/less expensive + TLMOptions to be able to run longer prompts/responses, or contact Cleanlab to + increase your limits. + + The default values corresponding to each quality preset are: + + - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8, + `use_self_reflection` = True. This preset improves LLM responses. + - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8, + `use_self_reflection` = True. + - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4, + `use_self_reflection` = True. + - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0, + `use_self_reflection` = False. When using `get_trustworthiness_score()` on + "base" preset, a cheaper self-reflection will be used to compute the + trustworthiness score. + + By default, the TLM uses the "medium" quality preset. The default base LLM + `model` used is "gpt-4o-mini", and `max_tokens` is 512 for all quality presets. + You can set custom values for these arguments regardless of the quality preset + specified. + + Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini", + "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4", + "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet", + "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", + "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = + "gpt-4o-mini"): Underlying base LLM to use (better models yield better results, + faster models yield faster/cheaper results). - Models still in beta: "o3", "o1", + "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", + "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", + "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models + for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet", + "claude-3.5-sonnet-v2". 
- Recommended models for low latency/costs: + "gpt-4.1-nano", "nova-micro". + + max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring). + Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs. + If you experience token/rate limit errors while using TLM, try lowering this number. + For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512. + + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs. + When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. + + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring. + Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs. + Measuring consistency helps quantify the epistemic uncertainty associated with + strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. + + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. + + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. + Supported similarity measures include: "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs. + + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. + Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs. + + log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. + For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. 
+ + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring. + The expected input format is a list of dictionaries, where each dictionary has the following keys: + - name: Name of the evaluation criteria. + - criteria: Instructions specifying the evaluation criteria. + """ + + quality_preset: Literal["best", "high", "medium", "low", "base"] + """The quality preset to use for the TLM or Trustworthy RAG API.""" + + task: Optional[str] + + x_client_library_version: Annotated[str, PropertyInfo(alias="x-client-library-version")] + + x_integration_type: Annotated[str, PropertyInfo(alias="x-integration-type")] + + x_source: Annotated[str, PropertyInfo(alias="x-source")] + + x_stainless_package_version: Annotated[str, PropertyInfo(alias="x-stainless-package-version")] + + +class BadResponseThresholds(TypedDict, total=False): + context_sufficiency: Optional[float] + + query_ease: Optional[float] + + response_helpfulness: Optional[float] + + trustworthiness: Optional[float] + + +class Options(TypedDict, total=False): + custom_eval_criteria: Iterable[object] + + log: List[str] + + max_tokens: int + + model: str + + num_candidate_responses: int + + num_consistency_samples: int + + reasoning_effort: str + + similarity_measure: str + + use_self_reflection: bool diff --git a/src/codex/types/project_validate_response.py b/src/codex/types/project_validate_response.py new file mode 100644 index 00000000..e2104360 --- /dev/null +++ b/src/codex/types/project_validate_response.py @@ -0,0 +1,36 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Dict, Optional + +from .._models import BaseModel + +__all__ = ["ProjectValidateResponse", "EvalScores"] + + +class EvalScores(BaseModel): + is_bad: bool + + score: Optional[float] = None + + log: Optional[object] = None + + +class ProjectValidateResponse(BaseModel): + eval_scores: Dict[str, EvalScores] + """ + Evaluation scores for the original response along with a boolean flag, `is_bad`, + indicating whether the score is below the threshold. + """ + + expert_answer: Optional[str] = None + """ + Alternate SME-provided answer from Codex if the response was flagged as bad and + an answer was found in the Codex Project, or None otherwise. + """ + + is_bad_response: bool + """True if the response is flagged as potentially bad, False otherwise. + + When True, a lookup is performed, which logs this query in the project for SMEs + to answer, if it does not already exist.
+ """ diff --git a/src/codex/types/projects/entry_query_params.py b/src/codex/types/projects/entry_query_params.py index d58b7bfa..1edabbed 100644 --- a/src/codex/types/projects/entry_query_params.py +++ b/src/codex/types/projects/entry_query_params.py @@ -2,12 +2,12 @@ from __future__ import annotations -from typing import Optional +from typing import Dict, List, Union, Iterable, Optional from typing_extensions import Required, Annotated, TypedDict from ..._utils import PropertyInfo -__all__ = ["EntryQueryParams"] +__all__ = ["EntryQueryParams", "QueryMetadata", "QueryMetadataContextUnionMember3"] class EntryQueryParams(TypedDict, total=False): @@ -16,6 +16,10 @@ class EntryQueryParams(TypedDict, total=False): use_llm_matching: bool client_metadata: Optional[object] + """Deprecated: Use query_metadata instead""" + + query_metadata: Optional[QueryMetadata] + """Optional logging data that can be provided by the client.""" x_client_library_version: Annotated[str, PropertyInfo(alias="x-client-library-version")] @@ -24,3 +28,34 @@ class EntryQueryParams(TypedDict, total=False): x_source: Annotated[str, PropertyInfo(alias="x-source")] x_stainless_package_version: Annotated[str, PropertyInfo(alias="x-stainless-package-version")] + + +class QueryMetadataContextUnionMember3(TypedDict, total=False): + content: Required[str] + """The actual content/text of the document.""" + + id: Optional[str] + """Unique identifier for the document. Useful for tracking documents""" + + source: Optional[str] + """Source or origin of the document. Useful for citations.""" + + tags: Optional[List[str]] + """Tags or categories for the document. Useful for filtering""" + + title: Optional[str] + """Title or heading of the document. Useful for display and context.""" + + +class QueryMetadata(TypedDict, total=False): + context: Union[str, List[str], Iterable[object], Iterable[QueryMetadataContextUnionMember3], None] + """RAG context used for the query""" + + custom_metadata: Optional[object] + """Arbitrary metadata supplied by the user/system""" + + eval_scores: Optional[Dict[str, float]] + """Evaluation scores for the original response""" + + evaluated_response: Optional[str] + """The response being evaluated from the RAG system(before any remediation)""" diff --git a/src/codex/types/tlm_prompt_params.py b/src/codex/types/tlm_prompt_params.py index 860f1a77..94536055 100644 --- a/src/codex/types/tlm_prompt_params.py +++ b/src/codex/types/tlm_prompt_params.py @@ -48,61 +48,64 @@ class TlmPromptParams(TypedDict, total=False): You can set custom values for these arguments regardless of the quality preset specified. - Args: model ({"gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "o1-preview", - "gpt-3.5-turbo-16k", "gpt-4", "gpt-4.5-preview", "claude-3.7-sonnet", + Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini", + "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4", + "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4o-mini"): Underlying base LLM to use (better models yield better results, - faster models yield faster/cheaper results). - Models still in beta: "o1", - "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-3.7-sonnet", - "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite", - "nova-pro". - Recommended models for accuracy: "gpt-4o", "o3-mini", "o1", - "claude-3.7-sonnet". 
- Recommended models for low latency/costs: "nova-micro", - "gpt-4o-mini". + faster models yield faster/cheaper results). - Models still in beta: "o3", "o1", + "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", + "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", + "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models + for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet", + "claude-3.5-sonnet-v2". - Recommended models for low latency/costs: + "gpt-4.1-nano", "nova-micro". max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring). - Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes. + Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs. If you experience token/rate limit errors while using TLM, try lowering this number. For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512. - num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM. - TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one. - Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens). - This parameter must be between 1 and 20. - When it is 1, TLM simply returns a standard LLM response and does not attempt to auto-improve it. + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs. + When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. - num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM response consistency. - Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher costs/runtimes. - This consistency helps quantify the epistemic uncertainty associated with + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring. + Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs. + Measuring consistency helps quantify the epistemic uncertainty associated with strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. - TLM internally measures consistency via the degree of contradiction between sampled responses that the model considers equally plausible. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it - generated and self-evaluate this response. - Setting this False disables self-reflection and may worsen trustworthiness scores, but will reduce costs/runtimes. 
- Self-reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches answers that are obviously incorrect/bad. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large"}, default = "semantic"): how the trustworthiness scoring algorithm measures - similarity between sampled responses considered by the model in the consistency assessment. - Supported similarity measures include "semantic" (based on natural language inference), "string" (based on character/word overlap), - "embedding" (based on embedding similarity), and "embedding_large" (based on embedding similarity with a larger embedding model). - Set this to "string" to improve latency/costs. + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. + Supported similarity measures include: "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs. - reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the LLM can reason (number of thinking tokens) - when considering alternative possible responses and double-checking responses. - Higher efforts here may produce better TLM trustworthiness scores and LLM responses. Reduce this value to improve latency/costs. + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. + Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs. log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. - custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria. + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring. The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria.
""" quality_preset: Literal["best", "high", "medium", "low", "base"] + """The quality preset to use for the TLM or Trustworthy RAG API.""" task: Optional[str] diff --git a/src/codex/types/tlm_score_params.py b/src/codex/types/tlm_score_params.py index 213da422..a0d90175 100644 --- a/src/codex/types/tlm_score_params.py +++ b/src/codex/types/tlm_score_params.py @@ -50,61 +50,64 @@ class TlmScoreParams(TypedDict, total=False): You can set custom values for these arguments regardless of the quality preset specified. - Args: model ({"gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "o1-preview", - "gpt-3.5-turbo-16k", "gpt-4", "gpt-4.5-preview", "claude-3.7-sonnet", + Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini", + "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4", + "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4o-mini"): Underlying base LLM to use (better models yield better results, - faster models yield faster/cheaper results). - Models still in beta: "o1", - "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-3.7-sonnet", - "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite", - "nova-pro". - Recommended models for accuracy: "gpt-4o", "o3-mini", "o1", - "claude-3.7-sonnet". - Recommended models for low latency/costs: "nova-micro", - "gpt-4o-mini". + faster models yield faster/cheaper results). - Models still in beta: "o3", "o1", + "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", + "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", + "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models + for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet", + "claude-3.5-sonnet-v2". - Recommended models for low latency/costs: + "gpt-4.1-nano", "nova-micro". max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring). - Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher costs/runtimes. + Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs. If you experience token/rate limit errors while using TLM, try lowering this number. For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512. - num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated by TLM. - TLM scores the trustworthiness of each candidate response, and then returns the most trustworthy one. - Higher values here can produce better (more accurate) responses from the TLM, but at higher costs/runtimes (and internally consumes more tokens). - This parameter must be between 1 and 20. - When it is 1, TLM simply returns a standard LLM response and does not attempt to auto-improve it. + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs. 
+ When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. - num_consistency_samples (int, default = 8): the amount of internal sampling to evaluate LLM response consistency. - Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher costs/runtimes. - This consistency helps quantify the epistemic uncertainty associated with + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring. + Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs. + Measuring consistency helps quantify the epistemic uncertainty associated with strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. - TLM internally measures consistency via the degree of contradiction between sampled responses that the model considers equally plausible. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. - use_self_reflection (bool, default = `True`): whether the LLM is asked to self-reflect upon the response it - generated and self-evaluate this response. - Setting this False disables self-reflection and may worsen trustworthiness scores, but will reduce costs/runtimes. - Self-reflection helps quantify aleatoric uncertainty associated with challenging prompts - and catches answers that are obviously incorrect/bad. + use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence. + Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts + and catches responses that are noticeably incorrect/bad upon further analysis. - similarity_measure ({"semantic", "string", "embedding", "embedding_large"}, default = "semantic"): how the trustworthiness scoring algorithm measures - similarity between sampled responses considered by the model in the consistency assessment. - Supported similarity measures include "semantic" (based on natural language inference), "string" (based on character/word overlap), - "embedding" (based on embedding similarity), and "embedding_large" (based on embedding similarity with a larger embedding model). - Set this to "string" to improve latency/costs. + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. + Supported similarity measures include: "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs. - reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the LLM can reason (number of thinking tokens) - when considering alternative possible responses and double-checking responses. 
- Higher efforts here may produce better TLM trustworthiness scores and LLM responses. Reduce this value to improve latency/costs. + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. + Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs. log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. - custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evalution criteria. + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring. The expected input format is a list of dictionaries, where each dictionary has the following keys: - name: Name of the evaluation criteria. - criteria: Instructions specifying the evaluation criteria. """ quality_preset: Literal["best", "high", "medium", "low", "base"] + """The quality preset to use for the TLM or Trustworthy RAG API.""" task: Optional[str] diff --git a/src/codex/types/users/user_schema.py b/src/codex/types/users/user_schema.py index b1665f21..d22c9563 100644 --- a/src/codex/types/users/user_schema.py +++ b/src/codex/types/users/user_schema.py @@ -23,6 +23,8 @@ class UserSchema(BaseModel): email: str + email_verified: bool + updated_at: datetime user_provided_company_name: Optional[str] = None diff --git a/src/codex/types/users/user_schema_public.py b/src/codex/types/users/user_schema_public.py index 181113b0..d5e1d9bf 100644 --- a/src/codex/types/users/user_schema_public.py +++ b/src/codex/types/users/user_schema_public.py @@ -14,7 +14,7 @@ class UserSchemaPublic(BaseModel): email: str - email_verified: Optional[bool] = None + email_verified: bool first_name: Optional[str] = None diff --git a/tests/api_resources/projects/test_entries.py b/tests/api_resources/projects/test_entries.py index 31a5e408..73a45ad4 100644 --- a/tests/api_resources/projects/test_entries.py +++ b/tests/api_resources/projects/test_entries.py @@ -396,6 +396,12 @@ def test_method_query_with_all_params(self, client: Codex) -> None: question="question", use_llm_matching=True, client_metadata={}, + query_metadata={ + "context": "string", + "custom_metadata": {}, + "eval_scores": {"foo": 0}, + "evaluated_response": "evaluated_response", + }, x_client_library_version="x-client-library-version", x_integration_type="x-integration-type", x_source="x-source", @@ -871,6 +877,12 @@ async def test_method_query_with_all_params(self, async_client: AsyncCodex) -> N question="question", use_llm_matching=True, client_metadata={}, + query_metadata={ + "context": "string", + "custom_metadata": {}, + "eval_scores": {"foo": 0}, + "evaluated_response": "evaluated_response", + }, x_client_library_version="x-client-library-version", x_integration_type="x-integration-type", x_source="x-source", diff --git a/tests/api_resources/test_projects.py b/tests/api_resources/test_projects.py index 772a7b29..f7ca6e01 100644 --- a/tests/api_resources/test_projects.py +++ b/tests/api_resources/test_projects.py @@ -12,6 +12,7 @@ ProjectListResponse, ProjectReturnSchema, ProjectRetrieveResponse, + ProjectValidateResponse, ProjectRetrieveAnalyticsResponse, ) from
tests.utils import assert_matches_type @@ -322,7 +323,16 @@ def test_path_params_export(self, client: Codex) -> None: @parametrize def test_method_increment_queries(self, client: Codex) -> None: project = client.projects.increment_queries( - "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + ) + assert_matches_type(object, project, path=["response"]) + + @pytest.mark.skip() + @parametrize + def test_method_increment_queries_with_all_params(self, client: Codex) -> None: + project = client.projects.increment_queries( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + count=0, ) assert_matches_type(object, project, path=["response"]) @@ -330,7 +340,7 @@ def test_method_increment_queries(self, client: Codex) -> None: @parametrize def test_raw_response_increment_queries(self, client: Codex) -> None: response = client.projects.with_raw_response.increment_queries( - "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", ) assert response.is_closed is True @@ -342,7 +352,7 @@ def test_raw_response_increment_queries(self, client: Codex) -> None: @parametrize def test_streaming_response_increment_queries(self, client: Codex) -> None: with client.projects.with_streaming_response.increment_queries( - "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -357,7 +367,7 @@ def test_streaming_response_increment_queries(self, client: Codex) -> None: def test_path_params_increment_queries(self, client: Codex) -> None: with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"): client.projects.with_raw_response.increment_queries( - "", + project_id="", ) @pytest.mark.skip() @@ -412,6 +422,103 @@ def test_path_params_retrieve_analytics(self, client: Codex) -> None: project_id="", ) + @pytest.mark.skip() + @parametrize + def test_method_validate(self, client: Codex) -> None: + project = client.projects.validate( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + prompt="prompt", + query="query", + response="response", + ) + assert_matches_type(ProjectValidateResponse, project, path=["response"]) + + @pytest.mark.skip() + @parametrize + def test_method_validate_with_all_params(self, client: Codex) -> None: + project = client.projects.validate( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + prompt="prompt", + query="query", + response="response", + use_llm_matching=True, + bad_response_thresholds={ + "context_sufficiency": 0, + "query_ease": 0, + "response_helpfulness": 0, + "trustworthiness": 0, + }, + constrain_outputs=["string"], + custom_metadata={}, + eval_scores={"foo": 0}, + options={ + "custom_eval_criteria": [{}], + "log": ["string"], + "max_tokens": 0, + "model": "model", + "num_candidate_responses": 0, + "num_consistency_samples": 0, + "reasoning_effort": "reasoning_effort", + "similarity_measure": "similarity_measure", + "use_self_reflection": True, + }, + quality_preset="best", + task="task", + x_client_library_version="x-client-library-version", + x_integration_type="x-integration-type", + x_source="x-source", + x_stainless_package_version="x-stainless-package-version", + ) + assert_matches_type(ProjectValidateResponse, project, path=["response"]) + + @pytest.mark.skip() + @parametrize + def test_raw_response_validate(self, client: Codex) -> None: + response = 
client.projects.with_raw_response.validate( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + prompt="prompt", + query="query", + response="response", + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + project = response.parse() + assert_matches_type(ProjectValidateResponse, project, path=["response"]) + + @pytest.mark.skip() + @parametrize + def test_streaming_response_validate(self, client: Codex) -> None: + with client.projects.with_streaming_response.validate( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + prompt="prompt", + query="query", + response="response", + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + project = response.parse() + assert_matches_type(ProjectValidateResponse, project, path=["response"]) + + assert cast(Any, response.is_closed) is True + + @pytest.mark.skip() + @parametrize + def test_path_params_validate(self, client: Codex) -> None: + with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"): + client.projects.with_raw_response.validate( + project_id="", + context="context", + prompt="prompt", + query="query", + response="response", + ) + class TestAsyncProjects: parametrize = pytest.mark.parametrize("async_client", [False, True], indirect=True, ids=["loose", "strict"]) @@ -716,7 +823,16 @@ async def test_path_params_export(self, async_client: AsyncCodex) -> None: @parametrize async def test_method_increment_queries(self, async_client: AsyncCodex) -> None: project = await async_client.projects.increment_queries( - "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + ) + assert_matches_type(object, project, path=["response"]) + + @pytest.mark.skip() + @parametrize + async def test_method_increment_queries_with_all_params(self, async_client: AsyncCodex) -> None: + project = await async_client.projects.increment_queries( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + count=0, ) assert_matches_type(object, project, path=["response"]) @@ -724,7 +840,7 @@ async def test_method_increment_queries(self, async_client: AsyncCodex) -> None: @parametrize async def test_raw_response_increment_queries(self, async_client: AsyncCodex) -> None: response = await async_client.projects.with_raw_response.increment_queries( - "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", ) assert response.is_closed is True @@ -736,7 +852,7 @@ async def test_raw_response_increment_queries(self, async_client: AsyncCodex) -> @parametrize async def test_streaming_response_increment_queries(self, async_client: AsyncCodex) -> None: async with async_client.projects.with_streaming_response.increment_queries( - "182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -751,7 +867,7 @@ async def test_streaming_response_increment_queries(self, async_client: AsyncCod async def test_path_params_increment_queries(self, async_client: AsyncCodex) -> None: with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"): await async_client.projects.with_raw_response.increment_queries( - "", + project_id="", ) @pytest.mark.skip() @@ -805,3 +921,100 @@ async def 
test_path_params_retrieve_analytics(self, async_client: AsyncCodex) -> await async_client.projects.with_raw_response.retrieve_analytics( project_id="", ) + + @pytest.mark.skip() + @parametrize + async def test_method_validate(self, async_client: AsyncCodex) -> None: + project = await async_client.projects.validate( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + prompt="prompt", + query="query", + response="response", + ) + assert_matches_type(ProjectValidateResponse, project, path=["response"]) + + @pytest.mark.skip() + @parametrize + async def test_method_validate_with_all_params(self, async_client: AsyncCodex) -> None: + project = await async_client.projects.validate( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + prompt="prompt", + query="query", + response="response", + use_llm_matching=True, + bad_response_thresholds={ + "context_sufficiency": 0, + "query_ease": 0, + "response_helpfulness": 0, + "trustworthiness": 0, + }, + constrain_outputs=["string"], + custom_metadata={}, + eval_scores={"foo": 0}, + options={ + "custom_eval_criteria": [{}], + "log": ["string"], + "max_tokens": 0, + "model": "model", + "num_candidate_responses": 0, + "num_consistency_samples": 0, + "reasoning_effort": "reasoning_effort", + "similarity_measure": "similarity_measure", + "use_self_reflection": True, + }, + quality_preset="best", + task="task", + x_client_library_version="x-client-library-version", + x_integration_type="x-integration-type", + x_source="x-source", + x_stainless_package_version="x-stainless-package-version", + ) + assert_matches_type(ProjectValidateResponse, project, path=["response"]) + + @pytest.mark.skip() + @parametrize + async def test_raw_response_validate(self, async_client: AsyncCodex) -> None: + response = await async_client.projects.with_raw_response.validate( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + prompt="prompt", + query="query", + response="response", + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + project = await response.parse() + assert_matches_type(ProjectValidateResponse, project, path=["response"]) + + @pytest.mark.skip() + @parametrize + async def test_streaming_response_validate(self, async_client: AsyncCodex) -> None: + async with async_client.projects.with_streaming_response.validate( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + prompt="prompt", + query="query", + response="response", + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + project = await response.parse() + assert_matches_type(ProjectValidateResponse, project, path=["response"]) + + assert cast(Any, response.is_closed) is True + + @pytest.mark.skip() + @parametrize + async def test_path_params_validate(self, async_client: AsyncCodex) -> None: + with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"): + await async_client.projects.with_raw_response.validate( + project_id="", + context="context", + prompt="prompt", + query="query", + response="response", + )
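
Usage sketch for the new validate endpoint introduced in this release: the snippet below shows roughly how `client.projects.validate()` and the returned `ProjectValidateResponse` fields fit together, assuming an already-configured `Codex` client (constructor arguments omitted). The project ID, the context/query/prompt/response strings, and the `bad_response_thresholds` value are illustrative placeholders rather than recommended settings.

from codex import Codex
from codex.types import ProjectValidateResponse


def validate_rag_response(client: Codex, project_id: str) -> ProjectValidateResponse:
    # Score a RAG response against the project's evals and check whether Codex flags it as bad.
    result = client.projects.validate(
        project_id=project_id,
        context="Acme's return window is 30 days.",  # retrieved context (placeholder)
        query="How long do I have to return an item?",  # end-user query (placeholder)
        prompt="Answer the question using only the context above.",  # full LLM prompt (placeholder)
        response="You have 30 days to return an item.",  # LLM response being validated (placeholder)
        bad_response_thresholds={"trustworthiness": 0.5},  # assumed threshold, not a recommendation
    )
    for name, eval_score in result.eval_scores.items():
        # Each entry carries the eval's score plus an `is_bad` flag relative to its threshold.
        print(f"{name}: score={eval_score.score} is_bad={eval_score.is_bad}")
    if result.is_bad_response and result.expert_answer is not None:
        # The response was flagged and an SME-provided answer already exists in the Codex project.
        print("Serving expert answer instead:", result.expert_answer)
    return result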