From fb7b3ff3006069b2c7f17cc8f830d61d21413d9d Mon Sep 17 00:00:00 2001
From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com>
Date: Thu, 9 Apr 2026 03:21:52 -0700
Subject: [PATCH 1/3] fix: extract shared parse_token_int and add
 value-equality tests (#880)

Extract a shared parse_token_int() helper in models.py that centralises
the token-validation rules used by both _sanitize_non_numeric_tokens
(Pydantic boundary) and _extract_output_tokens (parser fast path).
This eliminates the duplicated logic and makes divergence structurally
impossible.

Add TestExtractOutputTokensValueEquality with test_token_value_agreement
parametrized test that verifies numeric counts match (not just the
binary contributes/does-not-contribute verdict) for contributing values.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/copilot_usage/models.py        | 58 ++++++++++++++++++++----------
 src/copilot_usage/parser.py        | 31 ++++------------
 tests/copilot_usage/test_parser.py | 43 ++++++++++++++++++++++
 3 files changed, 89 insertions(+), 43 deletions(-)

diff --git a/src/copilot_usage/models.py b/src/copilot_usage/models.py
index 6d177ba..c559a33 100644
--- a/src/copilot_usage/models.py
+++ b/src/copilot_usage/models.py
@@ -40,6 +40,7 @@
     "session_sort_key",
     "shutdown_output_tokens",
     "total_output_tokens",
+    "parse_token_int",
 ]
 
 # ---------------------------------------------------------------------------
@@ -208,6 +209,35 @@ class ToolRequest(BaseModel):
     type: str = ""
 
 
+def parse_token_int(raw: object) -> int | None:
+    """Parse a raw ``outputTokens`` value into a positive ``int``, or ``None``.
+
+    Centralises the token-validation rules shared by
+    :meth:`AssistantMessageData._sanitize_non_numeric_tokens` (Pydantic
+    boundary) and :func:`_extract_output_tokens` (parser fast path).
+
+    Rules:
+
+    - ``bool`` / ``str`` → ``None`` (invalid, not coerced)
+    - non-whole ``float`` → ``None``
+    - zero or negative ``int`` / ``float`` → ``None``
+    - positive whole-number ``float`` → coerced to ``int``
+    - positive ``int`` → returned as-is
+    - any other type → ``None``
+    """
+    if isinstance(raw, (bool, str)):
+        return None
+    if isinstance(raw, float):
+        if not raw.is_integer():
+            return None
+        tokens = int(raw)
+    elif isinstance(raw, int):
+        tokens = raw
+    else:
+        return None
+    return tokens if tokens > 0 else None
+
+
 class AssistantMessageData(BaseModel):
     """Payload for ``assistant.message`` events."""
 
@@ -221,25 +251,17 @@ class AssistantMessageData(BaseModel):
     def _sanitize_non_numeric_tokens(cls, v: object) -> object:
         """Map non-positive, non-numeric, and non-whole-float token counts to ``0``.
 
-        JSON ``true``/``false``, numeric strings like ``"100"``,
-        non-positive numeric values, and non-integer floats (e.g. ``1.5``)
-        are not valid token counts.  Returning ``0`` preserves parsing of
-        the rest of the assistant message payload while preventing these
-        values from being lax-coerced into token counts.
-
-        This aligns with ``_extract_output_tokens`` in the parser fast path:
-        both paths agree that only positive whole-number values contribute
-        tokens.
+        Delegates to :func:`parse_token_int` for the actual validation
+        logic.  For types the helper recognises (``bool``, ``str``,
+        ``int``, ``float``), a ``None`` result is mapped to ``0`` so that
+        Pydantic's downstream ``int`` coercion always succeeds.  Unknown
+        types are passed through so that Pydantic can raise its own
+        ``ValidationError``.
         """
-        if isinstance(v, (bool, str)):
-            return 0
-        if isinstance(v, float):
-            if not v.is_integer() or v <= 0:
-                return 0
-            return int(v)
-        if isinstance(v, int) and v <= 0:
-            return 0
-        return v
+        if not isinstance(v, (bool, str, int, float)):
+            return v
+        result = parse_token_int(v)
+        return result if result is not None else 0
 
     reasoningText: str | None = None
     reasoningOpaque: str | None = None
diff --git a/src/copilot_usage/parser.py b/src/copilot_usage/parser.py
index c459016..191887d 100644
--- a/src/copilot_usage/parser.py
+++ b/src/copilot_usage/parser.py
@@ -36,6 +36,7 @@
     TokenUsage,
     add_to_model_metrics,
     copy_model_metrics,
+    parse_token_int,
     session_sort_key,
 )
 
@@ -275,35 +276,15 @@ def _read_config_model(config_path: Path | None = None) -> str | None:
 def _extract_output_tokens(ev: SessionEvent) -> int | None:
     """Extract ``outputTokens`` from an ``assistant.message`` event via direct dict access.
 
-    Mirrors the domain intent of :class:`AssistantMessageData`'s
-    ``_sanitize_non_numeric_tokens`` field-validator: only positive
-    whole-number values contribute tokens.  Both paths agree on whether a
-    value contributes tokens; the representation differs for
-    non-contributing values — this function returns ``None``, whereas the
-    Pydantic model stores ``0``.
-
-    Specifically:
-
-    - ``bool`` / ``str`` → ``None`` (invalid, not coerced)
-    - zero or negative ``int`` / ``float`` → ``None`` (non-positive)
-    - whole-number positive ``float`` → coerced to ``int``
-    - non-whole ``float`` / other non-numeric → ``None``
+    Delegates to :func:`~copilot_usage.models.parse_token_int` for the
+    actual validation logic, ensuring that the fast path and the Pydantic
+    model path always agree on both the contributing/non-contributing
+    verdict **and** the numeric token count.
 
     Callers are responsible for verifying ``ev.type`` before calling; this
     function reads only the ``outputTokens`` key from the event data dict.
     """
-    raw = ev.data.get("outputTokens")
-    if raw is None or isinstance(raw, (bool, str)):
-        return None
-    if isinstance(raw, float):
-        if not raw.is_integer():
-            return None
-        tokens = int(raw)
-    elif isinstance(raw, int):
-        tokens = raw
-    else:
-        return None
-    return tokens if tokens > 0 else None
+    return parse_token_int(ev.data.get("outputTokens"))
 
 
 def _full_scandir_discovery(
diff --git a/tests/copilot_usage/test_parser.py b/tests/copilot_usage/test_parser.py
index 56f2deb..ac5dfe7 100644
--- a/tests/copilot_usage/test_parser.py
+++ b/tests/copilot_usage/test_parser.py
@@ -4850,6 +4850,49 @@ def test_positive_contribution_agreement(
         )
 
 
+_VALUE_EQUALITY_CASES: list[tuple[str, object, int]] = [
+    ("positive_int_1", 1, 1),
+    ("positive_int_42", 42, 42),
+    ("whole_float_1234", 1234.0, 1234),
+    ("whole_float_1", 1.0, 1),
+]
+
+
+class TestExtractOutputTokensValueEquality:
+    """When both paths agree a value contributes, the numeric counts must be identical."""
+
+    @pytest.mark.parametrize(
+        ("label", "raw_value", "expected"),
+        _VALUE_EQUALITY_CASES,
+        ids=[c[0] for c in _VALUE_EQUALITY_CASES],
+    )
+    def test_token_value_agreement(
+        self, label: str, raw_value: object, expected: int
+    ) -> None:
+        """For contributing values, fast-path and model-path produce the same integer count."""
+        _ = label
+        fast_path_result = _extract_output_tokens(_make_assistant_event(raw_value))
+        model = AssistantMessageData.model_validate({"outputTokens": raw_value})
+
+        assert fast_path_result is not None, (
+            f"Expected fast path to contribute for {raw_value!r}"
+        )
+        assert model.outputTokens > 0, f"Expected model to contribute for {raw_value!r}"
+        assert fast_path_result == expected, (
+            f"Fast-path count mismatch for {raw_value!r}: "
+            f"got {fast_path_result}, expected {expected}"
+        )
+        assert model.outputTokens == expected, (
+            f"Model count mismatch for {raw_value!r}: "
+            f"got {model.outputTokens}, expected {expected}"
+        )
+        assert fast_path_result == model.outputTokens, (
+            f"Value divergence for {raw_value!r}: "
+            f"_extract_output_tokens → {fast_path_result}, "
+            f"AssistantMessageData.outputTokens → {model.outputTokens}"
+        )
+
+
 class TestExtractOutputTokensIntegration:
     """Integration tests exercising _extract_output_tokens through parse → build_session_summary."""
 

From 503735318d1be79716ee473bad5f9bd0da3fbd63 Mon Sep 17 00:00:00 2001
From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com>
Date: Thu, 9 Apr 2026 03:32:10 -0700
Subject: [PATCH 2/3] fix: address review comments

- Handle None (JSON null) in _sanitize_non_numeric_tokens by mapping it
  to 0, preventing ValidationError on model_validate({"outputTokens": None})
- Fix overclaiming docstring in _extract_output_tokens to clarify that
  numeric equality is guaranteed only for contributing values
- Add None to _EQUIVALENCE_CASES as regression test

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/copilot_usage/models.py        | 7 +++++--
 src/copilot_usage/parser.py        | 6 ++++--
 tests/copilot_usage/test_parser.py | 1 +
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/copilot_usage/models.py b/src/copilot_usage/models.py
index c559a33..576ae1d 100644
--- a/src/copilot_usage/models.py
+++ b/src/copilot_usage/models.py
@@ -252,12 +252,15 @@ def _sanitize_non_numeric_tokens(cls, v: object) -> object:
         """Map non-positive, non-numeric, and non-whole-float token counts to ``0``.
 
         Delegates to :func:`parse_token_int` for the actual validation
-        logic.  For types the helper recognises (``bool``, ``str``,
-        ``int``, ``float``), a ``None`` result is mapped to ``0`` so that
+        logic.  ``None`` (JSON ``null``) and types the helper recognises
+        (``bool``, ``str``, ``int``, ``float``) are mapped to ``0`` when
+        they don't represent a positive whole-number count, so that
         Pydantic's downstream ``int`` coercion always succeeds.  Unknown
         types are passed through so that Pydantic can raise its own
         ``ValidationError``.
         """
+        if v is None:
+            return 0
         if not isinstance(v, (bool, str, int, float)):
             return v
         result = parse_token_int(v)
diff --git a/src/copilot_usage/parser.py b/src/copilot_usage/parser.py
index 191887d..1f0ffb7 100644
--- a/src/copilot_usage/parser.py
+++ b/src/copilot_usage/parser.py
@@ -278,8 +278,10 @@ def _extract_output_tokens(ev: SessionEvent) -> int | None:
 
     Delegates to :func:`~copilot_usage.models.parse_token_int` for the
     actual validation logic, ensuring that the fast path and the Pydantic
-    model path always agree on both the contributing/non-contributing
-    verdict **and** the numeric token count.
+    model path agree on whether a value contributes to token totals.  For
+    contributing values, they also agree on the numeric token count;
+    non-contributing values may be represented differently here as ``None``
+    and in the model as ``0``.
 
     Callers are responsible for verifying ``ev.type`` before calling; this
     function reads only the ``outputTokens`` key from the event data dict.
diff --git a/tests/copilot_usage/test_parser.py b/tests/copilot_usage/test_parser.py
index ac5dfe7..fdc9682 100644
--- a/tests/copilot_usage/test_parser.py
+++ b/tests/copilot_usage/test_parser.py
@@ -4808,6 +4808,7 @@ def test_missing_key_no_pydantic(self) -> None:
 # ---------------------------------------------------------------------------
 
 _EQUIVALENCE_CASES: list[tuple[str, object]] = [
+    ("none_null", None),
     ("bool_true", True),
     ("bool_false", False),
     ("str_numeric", "100"),

From aac155e0669b54b785e0a089cf922c001a06b3f4 Mon Sep 17 00:00:00 2001
From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com>
Date: Thu, 9 Apr 2026 03:41:45 -0700
Subject: [PATCH 3/3] fix: address review comments

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/copilot_usage/models.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/copilot_usage/models.py b/src/copilot_usage/models.py
index 576ae1d..fbd41e3 100644
--- a/src/copilot_usage/models.py
+++ b/src/copilot_usage/models.py
@@ -214,7 +214,8 @@ def parse_token_int(raw: object) -> int | None:
 
     Centralises the token-validation rules shared by
     :meth:`AssistantMessageData._sanitize_non_numeric_tokens` (Pydantic
-    boundary) and :func:`_extract_output_tokens` (parser fast path).
+    boundary) and :func:`~copilot_usage.parser._extract_output_tokens`
+    (parser fast path).
 
     Rules: