diff --git a/src/copilot_usage/models.py b/src/copilot_usage/models.py index 6d177ba..fbd41e3 100644 --- a/src/copilot_usage/models.py +++ b/src/copilot_usage/models.py @@ -40,6 +40,7 @@ "session_sort_key", "shutdown_output_tokens", "total_output_tokens", + "parse_token_int", ] # --------------------------------------------------------------------------- @@ -208,6 +209,36 @@ class ToolRequest(BaseModel): type: str = "" +def parse_token_int(raw: object) -> int | None: + """Parse a raw ``outputTokens`` value into a positive ``int``, or ``None``. + + Centralises the token-validation rules shared by + :meth:`AssistantMessageData._sanitize_non_numeric_tokens` (Pydantic + boundary) and :func:`~copilot_usage.parser._extract_output_tokens` + (parser fast path). + + Rules: + + - ``bool`` / ``str`` → ``None`` (invalid, not coerced) + - non-whole ``float`` → ``None`` + - zero or negative ``int`` / ``float`` → ``None`` + - positive whole-number ``float`` → coerced to ``int`` + - positive ``int`` → returned as-is + - any other type → ``None`` + """ + if isinstance(raw, (bool, str)): + return None + if isinstance(raw, float): + if not raw.is_integer(): + return None + tokens = int(raw) + elif isinstance(raw, int): + tokens = raw + else: + return None + return tokens if tokens > 0 else None + + class AssistantMessageData(BaseModel): """Payload for ``assistant.message`` events.""" @@ -221,25 +252,20 @@ class AssistantMessageData(BaseModel): def _sanitize_non_numeric_tokens(cls, v: object) -> object: """Map non-positive, non-numeric, and non-whole-float token counts to ``0``. - JSON ``true``/``false``, numeric strings like ``"100"``, - non-positive numeric values, and non-integer floats (e.g. ``1.5``) - are not valid token counts. Returning ``0`` preserves parsing of - the rest of the assistant message payload while preventing these - values from being lax-coerced into token counts. - - This aligns with ``_extract_output_tokens`` in the parser fast path: - both paths agree that only positive whole-number values contribute - tokens. + Delegates to :func:`parse_token_int` for the actual validation + logic. ``None`` (JSON ``null``) and types the helper recognises + (``bool``, ``str``, ``int``, ``float``) are mapped to ``0`` when + they don't represent a positive whole-number count, so that + Pydantic's downstream ``int`` coercion always succeeds. Unknown + types are passed through so that Pydantic can raise its own + ``ValidationError``. """ - if isinstance(v, (bool, str)): - return 0 - if isinstance(v, float): - if not v.is_integer() or v <= 0: - return 0 - return int(v) - if isinstance(v, int) and v <= 0: + if v is None: return 0 - return v + if not isinstance(v, (bool, str, int, float)): + return v + result = parse_token_int(v) + return result if result is not None else 0 reasoningText: str | None = None reasoningOpaque: str | None = None diff --git a/src/copilot_usage/parser.py b/src/copilot_usage/parser.py index c459016..1f0ffb7 100644 --- a/src/copilot_usage/parser.py +++ b/src/copilot_usage/parser.py @@ -36,6 +36,7 @@ TokenUsage, add_to_model_metrics, copy_model_metrics, + parse_token_int, session_sort_key, ) @@ -275,35 +276,17 @@ def _read_config_model(config_path: Path | None = None) -> str | None: def _extract_output_tokens(ev: SessionEvent) -> int | None: """Extract ``outputTokens`` from an ``assistant.message`` event via direct dict access. - Mirrors the domain intent of :class:`AssistantMessageData`'s - ``_sanitize_non_numeric_tokens`` field-validator: only positive - whole-number values contribute tokens. Both paths agree on whether a - value contributes tokens; the representation differs for - non-contributing values — this function returns ``None``, whereas the - Pydantic model stores ``0``. - - Specifically: - - - ``bool`` / ``str`` → ``None`` (invalid, not coerced) - - zero or negative ``int`` / ``float`` → ``None`` (non-positive) - - whole-number positive ``float`` → coerced to ``int`` - - non-whole ``float`` / other non-numeric → ``None`` + Delegates to :func:`~copilot_usage.models.parse_token_int` for the + actual validation logic, ensuring that the fast path and the Pydantic + model path agree on whether a value contributes to token totals. For + contributing values, they also agree on the numeric token count; + non-contributing values may be represented differently here as ``None`` + and in the model as ``0``. Callers are responsible for verifying ``ev.type`` before calling; this function reads only the ``outputTokens`` key from the event data dict. """ - raw = ev.data.get("outputTokens") - if raw is None or isinstance(raw, (bool, str)): - return None - if isinstance(raw, float): - if not raw.is_integer(): - return None - tokens = int(raw) - elif isinstance(raw, int): - tokens = raw - else: - return None - return tokens if tokens > 0 else None + return parse_token_int(ev.data.get("outputTokens")) def _full_scandir_discovery( diff --git a/tests/copilot_usage/test_parser.py b/tests/copilot_usage/test_parser.py index 56f2deb..fdc9682 100644 --- a/tests/copilot_usage/test_parser.py +++ b/tests/copilot_usage/test_parser.py @@ -4808,6 +4808,7 @@ def test_missing_key_no_pydantic(self) -> None: # --------------------------------------------------------------------------- _EQUIVALENCE_CASES: list[tuple[str, object]] = [ + ("none_null", None), ("bool_true", True), ("bool_false", False), ("str_numeric", "100"), @@ -4850,6 +4851,49 @@ def test_positive_contribution_agreement( ) +_VALUE_EQUALITY_CASES: list[tuple[str, object, int]] = [ + ("positive_int_1", 1, 1), + ("positive_int_42", 42, 42), + ("whole_float_1234", 1234.0, 1234), + ("whole_float_1", 1.0, 1), +] + + +class TestExtractOutputTokensValueEquality: + """When both paths agree a value contributes, the numeric counts must be identical.""" + + @pytest.mark.parametrize( + ("label", "raw_value", "expected"), + _VALUE_EQUALITY_CASES, + ids=[c[0] for c in _VALUE_EQUALITY_CASES], + ) + def test_token_value_agreement( + self, label: str, raw_value: object, expected: int + ) -> None: + """For contributing values, fast-path and model-path produce the same integer count.""" + _ = label + fast_path_result = _extract_output_tokens(_make_assistant_event(raw_value)) + model = AssistantMessageData.model_validate({"outputTokens": raw_value}) + + assert fast_path_result is not None, ( + f"Expected fast path to contribute for {raw_value!r}" + ) + assert model.outputTokens > 0, f"Expected model to contribute for {raw_value!r}" + assert fast_path_result == expected, ( + f"Fast-path count mismatch for {raw_value!r}: " + f"got {fast_path_result}, expected {expected}" + ) + assert model.outputTokens == expected, ( + f"Model count mismatch for {raw_value!r}: " + f"got {model.outputTokens}, expected {expected}" + ) + assert fast_path_result == model.outputTokens, ( + f"Value divergence for {raw_value!r}: " + f"_extract_output_tokens → {fast_path_result}, " + f"AssistantMessageData.outputTokens → {model.outputTokens}" + ) + + class TestExtractOutputTokensIntegration: """Integration tests exercising _extract_output_tokens through parse → build_session_summary."""