diff --git a/pyproject.toml b/pyproject.toml index 79c66e780..32dc19a4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -127,8 +127,8 @@ renderers = false [tool.uv.sources] # Pinned to renderers PR #11 until the next PyPI release lands; drop after. -# 1f3de65 = Dynamo chat nvext transport for token-in /chat/completions. -renderers = { git = "https://github.com/PrimeIntellect-ai/renderers.git", rev = "7ca1ab3" } +# 17005dd = Dynamo chat nvext transport with engine_data response support. +renderers = { git = "https://github.com/PrimeIntellect-ai/renderers.git", rev = "17005dd" } [tool.uv.extra-build-dependencies] flash-attn = [{ requirement = "torch", match-runtime = true }] diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index ff9049473..6d1dee485 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -1,3 +1,4 @@ +from types import SimpleNamespace from typing import Any, cast import pytest @@ -169,7 +170,9 @@ async def test_get_native_response_falls_back_to_super_when_no_prefix_match( sentinel = {"source": "super"} calls: list[dict[str, Any]] = [] - async def fake_get_prompt_ids(self, state, prompt_messages, oai_tools): # noqa: ANN001 + async def fake_get_prompt_ids( # noqa: ANN001 + self, state, prompt_messages, oai_tools, chat_template_kwargs=None + ): return None async def fake_super_get_native_response( # noqa: ANN001 @@ -235,7 +238,9 @@ async def test_get_native_response_uses_token_route_when_prompt_ids_available( recording_client = _RecordingClient() client = OpenAIChatCompletionsTokenClient(recording_client) - async def fake_get_prompt_ids(self, state, prompt_messages, oai_tools): # noqa: ANN001 + async def fake_get_prompt_ids( # noqa: ANN001 + self, state, prompt_messages, oai_tools, chat_template_kwargs=None + ): return [10, 20] monkeypatch.setattr( @@ -270,3 +275,222 @@ async def fake_get_prompt_ids(self, state, prompt_messages, oai_tools): # noqa: assert len(recording_client.calls) == 1 assert recording_client.calls[0]["path"] == "/chat/completions/tokens" assert recording_client.calls[0]["body"]["tokens"] == [10, 20] + + +# --------------------------------------------------------------------------- +# dynamo_chat_nvext transport (Dynamo bis/dynamo-rl) +# --------------------------------------------------------------------------- + + +class _StubRenderer: + """Renderer stand-in for the dynamo_chat_nvext transport tests. + + Returns deterministic ids so we can assert on body shape without pulling + in a real HuggingFace tokenizer download. ``render_ids`` returns a + fixed sequence; ``get_stop_token_ids`` returns a marker pair. + """ + + def __init__(self) -> None: + self.render_calls: list[dict[str, Any]] = [] + + def render_ids( + self, + messages, + *, + tools=None, + add_generation_prompt: bool = False, + ) -> list[int]: + self.render_calls.append( + { + "messages": messages, + "tools": tools, + "add_generation_prompt": add_generation_prompt, + } + ) + # Encode the call shape into ids so tests can disambiguate the two + # bridge tokenize calls without a real tokenizer. + return [42, len(messages), int(add_generation_prompt)] + + def get_stop_token_ids(self) -> list[int]: + return [99, 100] + + +class _DynamoTestClient(OpenAIChatCompletionsTokenClient): + """Dynamo-transport TITO client with a stubbed renderer. + + Subclass override is the cleanest way to inject the stub without going + through ``ClientConfig`` (which would require a real ``api_base_url`` + and ``setup_client`` to construct the AsyncOpenAI). The recording + client captures the eventual ``self.client.post(...)`` call. + """ + + _stub_renderer: _StubRenderer + + def __init__(self, recording_client) -> None: + super().__init__(recording_client) + self._stub_renderer = _StubRenderer() + + @property + def renderer_transport(self) -> str: # type: ignore[override] + return "dynamo_chat_nvext" + + def _get_renderer(self, model: str): # type: ignore[override] + return self._stub_renderer + + +@pytest.mark.asyncio +async def test_local_tokenize_uses_renderer_under_dynamo_transport(): + """Bridge tokenize must NOT hit any HTTP route under dynamo_chat_nvext. + + Goes straight through ``_local_tokenize`` -> ``renderer.render_ids``. + The recording client would record any errant POST; we assert it sees + none. + """ + recording_client = _RecordingClient() + client = _DynamoTestClient(recording_client) + + ids_full = await client.tokenize( + messages=[{"role": "user", "content": "u"}], + tools=None, + model="test-model", + ) + ids_base = await client.tokenize( + messages=[{"role": "user", "content": "u"}], + tools=None, + model="test-model", + extra_kwargs={"add_generation_prompt": False}, + ) + + # Both calls hit the renderer, neither hit the wire. + assert recording_client.calls == [] + assert client._stub_renderer.render_calls[0]["add_generation_prompt"] is True + assert client._stub_renderer.render_calls[1]["add_generation_prompt"] is False + # And the stub encodes that into the returned ids' last element. + assert ids_full[-1] == 1 + assert ids_base[-1] == 0 + + +@pytest.mark.asyncio +async def test_get_native_response_uses_dynamo_chat_nvext_under_transport( + monkeypatch: pytest.MonkeyPatch, +): + """Dynamo transport must POST to /chat/completions with nvext.token_data. + + Mirrors test_get_native_response_uses_token_route_when_prompt_ids_available + but for the new transport. + """ + recording_client = _RecordingClient() + client = _DynamoTestClient(recording_client) + + async def fake_get_prompt_ids( # noqa: ANN001 + self, state, prompt_messages, oai_tools, chat_template_kwargs=None + ): + return [10, 20, 30] + + monkeypatch.setattr( + OpenAIChatCompletionsTokenClient, "get_prompt_ids", fake_get_prompt_ids + ) + + state = cast( + State, + { + "model": "test-model", + "trajectory": [ + _make_step( + prompt=[{"role": "user", "content": "u1"}], + completion=[{"role": "assistant", "content": "a1"}], + prompt_ids=[1], + completion_ids=[2], + ) + ], + }, + ) + prompt = cast(Any, [{"role": "user", "content": "u2"}]) + + response = await client.get_native_response( + prompt=prompt, + model="test-model", + sampling_args={ + "max_completion_tokens": 16, + "temperature": 0.5, + "extra_body": { + "nvext": { + "extra_fields": ["timing"], + "cache_salt": "ckpt-42", + }, + "cache_salt": "top-level-salt", + }, + }, + tools=None, + state=state, + ) + + assert response["ok"] is True + assert len(recording_client.calls) == 1 + call = recording_client.calls[0] + + # Wire-shape assertions: route, nvext.token_data, stop_token_ids, + # placeholder messages, sampling fields promoted. + assert call["path"] == "/chat/completions" + body = call["body"] + assert body["nvext"]["token_data"] == [10, 20, 30] + assert body["nvext"]["extra_fields"] == ["timing", "engine_data"] + assert body["nvext"]["cache_salt"] == "ckpt-42" + assert body["cache_salt"] == "top-level-salt" + assert body["stop_token_ids"] == [99, 100] + assert body["messages"] == [{"role": "user", "content": "(token-in mode)"}] + assert body["max_completion_tokens"] == 16 + assert body["temperature"] == 0.5 + assert body["logprobs"] is True + assert body["stream"] is False + + # No /chat/completions/tokens, no /tokenize for the dynamo transport. + assert all( + c["path"] != "/chat/completions/tokens" and not c["path"].endswith("/tokenize") + for c in recording_client.calls + ) + + +@pytest.mark.asyncio +async def test_from_native_response_grafts_dynamo_engine_data_tokens(): + client = OpenAIChatCompletionsClient(_NoopClient()) + message = SimpleNamespace( + content="ok", + tool_calls=None, + model_dump=lambda: {}, + ) + response = SimpleNamespace( + id="chatcmpl-test", + created=0, + model="test-model", + usage=SimpleNamespace( + prompt_tokens=3, + completion_tokens=2, + total_tokens=5, + ), + nvext={ + "engine_data": { + "prompt_token_ids": [1, 2, 3], + "completion_token_ids": [4, 5], + }, + }, + choices=[ + SimpleNamespace( + finish_reason="stop", + message=message, + logprobs={ + "content": [ + {"logprob": -0.1}, + {"logprob": -0.2}, + ] + }, + ) + ], + ) + + parsed = await client.from_native_response(cast(Any, response)) + + assert parsed.message.tokens is not None + assert parsed.message.tokens.prompt_ids == [1, 2, 3] + assert parsed.message.tokens.completion_ids == [4, 5] + assert parsed.message.tokens.completion_logprobs == [-0.1, -0.2] diff --git a/tests/test_renderer_client.py b/tests/test_renderer_client.py index cd438590e..e810980b0 100644 --- a/tests/test_renderer_client.py +++ b/tests/test_renderer_client.py @@ -193,7 +193,7 @@ async def fake_generate(**kwargs): assert response == {"content": "ok"} assert len(calls) == 1 - assert calls[0]["transport"] == "dynamo_chat_nvext" + assert calls[0]["transport"] == "dynamo" assert calls[0]["prompt_ids"] == [10, 20] @@ -469,7 +469,7 @@ async def test_get_incremental_prompt_ids_accepts_multimodal_tool_user_tail(): "auto", id="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", ), - pytest.param("openai/gpt-oss-20b", "gpt_oss", id="openai/gpt-oss-20b"), + pytest.param("openai/gpt-oss-20b", "gpt-oss", id="openai/gpt-oss-20b"), ] diff --git a/uv.lock b/uv.lock index 4ed78a6df..38c49a7b4 100644 --- a/uv.lock +++ b/uv.lock @@ -4845,8 +4845,8 @@ wheels = [ [[package]] name = "renderers" -version = "0.1.7" -source = { git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=7ca1ab3#7ca1ab357f3ae2262ad10ffe757670739a8ec2c5" } +version = "0.1.8.dev12+g17005dd" +source = { git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=17005dd#17005dd79f031f7993fb8e0b26b52aff346ad07e" } dependencies = [ { name = "jinja2" }, { name = "numpy" }, @@ -6209,7 +6209,7 @@ requires-dist = [ { name = "pyzmq", specifier = ">=27.1.0" }, { name = "reasoning-gym", marker = "extra == 'rg'" }, { name = "regex", specifier = "<2026.4.4" }, - { name = "renderers", marker = "extra == 'renderers'", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=7ca1ab3" }, + { name = "renderers", marker = "extra == 'renderers'", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=17005dd" }, { name = "requests" }, { name = "requests", marker = "extra == 'rl'" }, { name = "rich" }, @@ -6242,7 +6242,7 @@ dev = [ { name = "pytest-xdist", specifier = ">=3.8.0" }, { name = "python-dotenv", specifier = ">=1.0.0" }, { name = "reasoning-gym" }, - { name = "renderers", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=7ca1ab3" }, + { name = "renderers", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=17005dd" }, { name = "ruff" }, { name = "stagehand", specifier = ">=3.0.0" }, { name = "textarena" }, diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index c755d8dd4..f8bb772fb 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -420,8 +420,44 @@ def parse_finish_reason(response: OpenAIChatResponse) -> FinishReason: case _: return None + def _graft_engine_data(response: OpenAIChatResponse) -> None: + nvext = getattr(response, "nvext", None) + if nvext is None and hasattr(response, "model_dump"): + nvext = response.model_dump().get("nvext") + if not isinstance(nvext, dict): + return + + choice = response.choices[0] + engine_data = nvext.get("engine_data") + completion_token_ids_top = nvext.get("completion_token_ids") + prompt_token_ids_top = nvext.get("prompt_token_ids") + + completion_token_ids: list[int] | None = None + prompt_token_ids: list[int] | None = None + if isinstance(engine_data, dict): + if engine_data.get("completion_token_ids") is not None: + completion_token_ids = list(engine_data["completion_token_ids"]) + if engine_data.get("prompt_token_ids") is not None: + prompt_token_ids = list(engine_data["prompt_token_ids"]) + if completion_token_ids is None and completion_token_ids_top is not None: + completion_token_ids = list(completion_token_ids_top) + if prompt_token_ids is None and prompt_token_ids_top is not None: + prompt_token_ids = list(prompt_token_ids_top) + + if ( + getattr(choice, "token_ids", None) is None + and completion_token_ids is not None + ): + object.__setattr__(choice, "token_ids", completion_token_ids) + if ( + getattr(response, "prompt_token_ids", None) is None + and prompt_token_ids is not None + ): + object.__setattr__(response, "prompt_token_ids", prompt_token_ids) + def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None: assert len(response.choices) == 1, "Response should always have one choice" + _graft_engine_data(response) choice = response.choices[0] if not hasattr(choice, "token_ids"): return None diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 50f34b54f..a2bface81 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -1,3 +1,4 @@ +import asyncio from collections.abc import Mapping from typing import Any, Optional, cast @@ -19,7 +20,11 @@ OpenAITool, handle_openai_overlong_prompt, ) -from verifiers.types import SamplingArgs, State +from verifiers.types import RendererTransport, SamplingArgs, State + +# Sentinel returned by transports that don't tokenize over HTTP. Lets callers +# route around the legacy /tokenize body shape without changing the signature. +_DEFAULT_TRANSPORT: RendererTransport = "prime_vllm_generate" def _has_multimodal_content(messages) -> bool: @@ -64,7 +69,22 @@ class TokenizeResponse(BaseModel): class OpenAIChatCompletionsTokenClient(OpenAIChatCompletionsClient): - """Wrapper for custom vLLM route /v1/chat/completions/tokens via AsyncOpenAI client.""" + """Token-in/token-out chat client. + + Two transports share this class: + + * ``prime_vllm_generate`` (default): the historical TITO surface that + posts to vLLM's ``/v1/chat/completions/tokens`` and uses the engine's + ``/tokenize`` for bridge-token computation. This is what vanilla vLLM + ``>=0.20`` exposes. + * ``dynamo_chat_nvext``: posts pre-tokenized prompts to Dynamo's standard + ``/v1/chat/completions`` route with ``nvext.token_data`` carrying the + stitched ``prompt_ids``. Bridge tokenization runs locally via the + ``renderers`` package (no ``/tokenize`` round-trip) since Dynamo + doesn't expose vLLM's token routes. Selection is via + ``ClientConfig.renderer_transport``; same field the renderer client + consults so a single config option drives both clients consistently. + """ @property def token_client(self) -> AsyncOpenAI: @@ -74,6 +94,51 @@ def token_client(self) -> AsyncOpenAI: base_url = base_url[:-3] return self.client.with_options(base_url=base_url) + @property + def renderer_transport(self) -> RendererTransport: + """Wire-shape selector. ``ClientConfig.renderer_transport`` if set, + else the default vLLM TITO shape. Mirrors the same field used by + ``RendererClient`` so backend selection stays in one place.""" + return cast( + RendererTransport, + getattr(self._config, "renderer_transport", _DEFAULT_TRANSPORT) + if self._config is not None + else _DEFAULT_TRANSPORT, + ) + + def _get_renderer(self, model: str): + """Lazy, per-model renderer cache. Used only by the ``dynamo_chat_nvext`` + transport for client-side tokenization and stop-token resolution. + + Loaded on first use and reused across calls so we pay the + ``AutoTokenizer.from_pretrained`` cost once. The renderer's + underlying tokenizer is HuggingFace fast-tokenizer-backed, so the + wrapping ``asyncio.to_thread`` calls in ``tokenize()`` get real + parallelism (the Rust encode releases the GIL). + """ + cache: dict[str, Any] = self.__dict__.setdefault("_renderer_cache", {}) + if model in cache: + return cache[model] + try: + from renderers import create_renderer + from transformers import AutoTokenizer + except ImportError as exc: # pragma: no cover - dependency surface + raise ImportError( + "OpenAIChatCompletionsTokenClient with renderer_transport=" + "'dynamo_chat_nvext' requires the 'renderers' and 'transformers' " + "packages. Install via `pip install verifiers[renderers]` or add " + "renderers + transformers to your environment." + ) from exc + tokenizer = AutoTokenizer.from_pretrained(model) + renderer_name = ( + getattr(self._config, "renderer", "auto") + if self._config is not None + else "auto" + ) + renderer = create_renderer(tokenizer, renderer=renderer_name or "auto") + cache[model] = renderer + return renderer + @handle_openai_overlong_prompt async def get_native_response( self, @@ -88,12 +153,35 @@ def normalize_sampling_args(sampling_args: SamplingArgs): if "max_tokens" in sampling_args: sampling_args["max_completion_tokens"] = sampling_args.pop("max_tokens") sampling_args["logprobs"] = True - extra_body = dict(return_token_ids=True) - if "extra_body" in sampling_args: - sampling_args["extra_body"] = { - **sampling_args["extra_body"], - **extra_body, + + if self.renderer_transport == "dynamo_chat_nvext": + extra_body: dict[str, Any] = { + "nvext": {"extra_fields": ["engine_data"]} } + else: + extra_body = {"return_token_ids": True} + + if "extra_body" in sampling_args: + merged = {**sampling_args["extra_body"]} + if "nvext" in merged and "nvext" in extra_body: + merged_nvext = merged.get("nvext") + extra_nvext = extra_body.get("nvext") + base = ( + dict(merged_nvext) if isinstance(merged_nvext, Mapping) else {} + ) + inc = dict(extra_nvext) if isinstance(extra_nvext, Mapping) else {} + base_extra_fields = list(base.get("extra_fields") or []) + inc_extra_fields = list(inc.get("extra_fields") or []) + extra_fields = list( + dict.fromkeys(base_extra_fields + inc_extra_fields) + ) + merged["nvext"] = {**base, **inc, "extra_fields": extra_fields} + sampling_args["extra_body"] = { + **{k: v for k, v in extra_body.items() if k != "nvext"}, + **merged, + } + else: + sampling_args["extra_body"] = {**merged, **extra_body} else: sampling_args["extra_body"] = extra_body return {k: v for k, v in sampling_args.items() if v is not None} @@ -115,7 +203,12 @@ def normalize_sampling_args(sampling_args: SamplingArgs): return await super().get_native_response( prompt, model, sampling_args, tools, extra_headers=extra_headers ) - prompt_ids = await self.get_prompt_ids(state, prompt, tools) + chat_template_kwargs = sampling_args["extra_body"].get( + "chat_template_kwargs", {} + ) + prompt_ids = await self.get_prompt_ids( + state, prompt, tools, chat_template_kwargs=chat_template_kwargs + ) if prompt_ids is None: # Reaching this branch means we have a non-empty trajectory but # could not stitch — surface it loudly so ops catches regressions. @@ -126,6 +219,16 @@ def normalize_sampling_args(sampling_args: SamplingArgs): prompt, model, sampling_args, tools, extra_headers=extra_headers ) + if self.renderer_transport == "dynamo_chat_nvext": + return await self._post_dynamo_chat_nvext( + prompt=prompt, + prompt_ids=prompt_ids, + model=model, + tools=tools, + sampling_args=sampling_args, + extra_headers=extra_headers, + ) + extra_body = sampling_args.pop("extra_body", {}) body = dict( model=model, @@ -143,11 +246,89 @@ def normalize_sampling_args(sampling_args: SamplingArgs): options={"headers": extra_headers} if extra_headers else {}, ) + async def _post_dynamo_chat_nvext( + self, + prompt: OpenAIChatMessages, + prompt_ids: list[int], + model: str, + tools: list[OpenAITool] | None, + sampling_args: dict, + extra_headers: Mapping[str, str] | None, + ) -> OpenAIChatResponse: + """Post stitched prompt_ids to Dynamo's chat-completions route. + + The engine sees ``nvext.token_data`` and skips tokenization. Response + token IDs come back through ``nvext.engine_data.completion_token_ids`` + and are grafted onto the standard token fields by + ``OpenAIChatCompletionsClient.from_native_response``. + """ + renderer = self._get_renderer(model) + stop_token_ids = list(renderer.get_stop_token_ids()) + + extra_body = dict(sampling_args.pop("extra_body", {}) or {}) + + nvext = dict(extra_body.pop("nvext", None) or {}) + nvext["token_data"] = prompt_ids + priority = sampling_args.get("priority", extra_body.get("priority")) + if priority is not None: + nvext["agent_hints"] = {"priority": priority} + + body: dict[str, Any] = { + "model": model, + "messages": [{"role": "user", "content": "(token-in mode)"}], + "stream": False, + "logprobs": True, + "stop_token_ids": stop_token_ids, + "nvext": nvext, + } + if tools: + body["tools"] = tools + + # Promote sampling fields that Dynamo's chat-completions surface + # accepts directly. Anything else stays in extra_body and rides as + # an unrecognized passthrough field (validate.rs:104 allowlist). + promotable = ( + "max_completion_tokens", + "max_tokens", + "temperature", + "top_p", + "top_k", + "min_p", + "seed", + "n", + "repetition_penalty", + "min_tokens", + "top_logprobs", + "stop", + ) + for key in promotable: + value = sampling_args.get(key, extra_body.get(key)) + if value is not None: + body[key] = value + + # Pass any remaining unhandled extra_body keys straight through (e.g. + # cache_salt, return_token_ids). Dynamo's PASSTHROUGH_EXTRA_FIELDS + # allowlist accepts these without rejection. + passthrough = { + k: v + for k, v in extra_body.items() + if k not in promotable and v is not None and k not in body + } + body.update(passthrough) + + return await self.client.post( + "/chat/completions", + body=body, + cast_to=ChatCompletion, + options={"headers": extra_headers} if extra_headers else {}, + ) + async def get_prompt_ids( self, state: State, prompt_messages: OpenAIChatMessages, oai_tools: list[OpenAITool] | None, + chat_template_kwargs: dict | None = None, ) -> list[int] | None: """ Build prompt_ids for the next turn by stitching engine tokens with @@ -175,6 +356,13 @@ def normalize_for_comparison(value: Any) -> Any: # prefix-match equality is unaffected. if normalized.get("content") == "": normalized["content"] = None + # Drop None-valued keys so model_dump's exhaustive view (which + # carries e.g. thinking_blocks=None on AssistantMessage) is + # equivalent to to_native_prompt's slimmer view (which omits + # the field entirely). Without this, vf.Message-shaped input + # never matches the to_native_prompt-normalized step messages, + # which breaks the prefix match for MultiTurnEnv rollouts. + normalized = {k: v for k, v in normalized.items() if v is not None} return normalized if isinstance(value, list): return [normalize_for_comparison(item) for item in value] @@ -269,6 +457,7 @@ async def find_largest_prefix_match() -> tuple[list[int], bool, int] | None: if tool_call_ids: dummy_assistant: OpenAIChatMessage = ChatCompletionAssistantMessageParam( role="assistant", + reasoning_content="", # type: ignore[typeddict-unknown-key] tool_calls=[ ChatCompletionMessageFunctionToolCallParam( id=tc_id, @@ -280,20 +469,29 @@ async def find_largest_prefix_match() -> tuple[list[int], bool, int] | None: ) else: dummy_assistant: OpenAIChatMessage = ChatCompletionAssistantMessageParam( - role="assistant", content="x" + role="assistant", + reasoning_content="", # type: ignore[typeddict-unknown-key] + content="x", ) + forwarded_ctk = ( + {"chat_template_kwargs": dict(chat_template_kwargs)} + if chat_template_kwargs + else {} + ) + try: bridge_full_ids = await self.tokenize( messages=[dummy_assistant] + env_messages, tools=oai_tools, model=state["model"], + extra_kwargs=dict(forwarded_ctk), ) bridge_base_ids = await self.tokenize( messages=[dummy_assistant], tools=oai_tools, model=state["model"], - extra_kwargs=dict(add_generation_prompt=False), + extra_kwargs=dict(add_generation_prompt=False, **forwarded_ctk), ) except Exception: self.logger.debug("TITO: bridge tokenization failed, falling back to MITO") @@ -338,9 +536,27 @@ async def tokenize( extra_kwargs: dict | None = None, **kwargs, ) -> list[int]: - """Tokenize messages using the vLLM /tokenize API.""" + """Tokenize messages. + + ``dynamo_chat_nvext`` transport: tokenizes locally via the + ``renderers`` package, no network call. Runs on a worker thread so + the event loop stays free; HuggingFace fast tokenizers release the + GIL during the Rust encode pass. + + Default transport: posts to vLLM's ``/tokenize`` route on the + host root. + """ if extra_kwargs is None: extra_kwargs = {} + + if self.renderer_transport == "dynamo_chat_nvext": + return await self._local_tokenize( + messages=messages, + tools=tools, + model=model, + extra_kwargs=extra_kwargs, + ) + if isinstance(messages, str): body = dict( model=model, @@ -361,3 +577,46 @@ async def tokenize( "/tokenize", body=body, cast_to=TokenizeResponse ) return tokenize_response.tokens + + async def _local_tokenize( + self, + messages: str | OpenAIChatMessages, + tools: list[OpenAITool] | None, + model: str, + extra_kwargs: dict, + ) -> list[int]: + """Local in-process tokenization for the dynamo transport. + + Bridge tokenization under TITO calls this twice per turn (once for + ``add_generation_prompt=True`` and once for ``False``). Both calls + go through the same renderer, so the chat-template + tool-call + normalization is consistent with whatever Dynamo's worker would + produce server-side. + """ + renderer = self._get_renderer(model) + + def _render() -> list[int]: + if isinstance(messages, str): + tokenizer = getattr(renderer, "tokenizer", None) + if tokenizer is None: + raise RuntimeError( + "Renderer for model %r does not expose a tokenizer; " + "cannot tokenize a raw string under dynamo_chat_nvext." % model + ) + # Strip BOS for parity with vLLM /tokenize (which never + # prepends a BOS for raw-prompt tokenize requests). + encoded = tokenizer(messages, add_special_tokens=False) + return list(encoded["input_ids"]) + + add_generation_prompt = bool( + extra_kwargs.get("add_generation_prompt", True) + ) + return list( + renderer.render_ids( + cast(Any, list(messages)), + tools=cast(Any, tools), + add_generation_prompt=add_generation_prompt, + ) + ) + + return await asyncio.to_thread(_render) diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py index 45d21416c..b0686f57e 100644 --- a/verifiers/clients/renderer_client.py +++ b/verifiers/clients/renderer_client.py @@ -27,6 +27,7 @@ ) from renderers import ToolCall as RendererToolCall from renderers import ToolCallFunction +from renderers.client import RendererTransport as RenderersTransport from renderers.client import generate from verifiers.clients.client import Client @@ -352,7 +353,9 @@ async def _get_incremental_prompt_ids( ), ) _record_bridge(success=bridged is not None) - return bridged + if bridged is None: + return None + return list(getattr(bridged, "token_ids", bridged)) return None @@ -518,6 +521,13 @@ async def get_native_response( tools=tools, ) + renderer_transport = getattr( + self._config, "renderer_transport", "prime_vllm_generate" + ) + transport: RenderersTransport = ( + "dynamo" if renderer_transport == "dynamo_chat_nvext" else "vllm" + ) + return await generate( client=self.client, renderer=renderer, @@ -530,9 +540,7 @@ async def get_native_response( or sampling_params.pop("cache_salt", None), priority=args.get("priority") or sampling_params.pop("priority", None), extra_headers=args.get("extra_headers"), - transport=getattr( - self._config, "renderer_transport", "prime_vllm_generate" - ), + transport=transport, ) async def raise_from_native_response(self, response: dict[str, Any]) -> None: