electric-sql · kevin-dp · Jun 4, 2026
diff --git a/.changeset/agent-token-usage.md b/.changeset/agent-token-usage.md
@@ -0,0 +1,27 @@
+---
+'@electric-ax/agents-server-ui': patch
+'@electric-ax/agents-runtime': patch
+'@electric-ax/agents-desktop': patch
+---
+
+Show per-response token usage in the agent meta row, e.g.
+`1.2k ↑ 412 ↓`. Updates as each step settles — for a single-turn call
+this lands once, when the response is done; for tool-using runs the
+counter jumps at each step boundary (the LLM SDK only emits `usage` at
+end-of-step, so we can't tick smoothly between tokens).
+
+Plumbing:
+
+- `StepValue` gains optional `input_tokens` / `output_tokens` columns
+  (Zod + TS). Strictly additive: events recorded before this change
+  stay valid since both fields are optional, so no migration.
+- `outbound-bridge.ts:onStepEnd` now persists the `tokenInput` /
+  `tokenOutput` it already received from `pi-adapter.ts` — previously
+  those values were accepted and silently dropped.
+- `EntityTimelineStepItem` / `IncludesStep` surface the new fields,
+  and the three sites that materialize steps (`buildIncludesRuns`
+  plus the two `.select()` projections) include them.
+- The cached `agent_response` section gets a `tokens?: { input?,
+  output? }` summed across the run's steps at section-build time, and
+  the section-cache fingerprint factors in step token deltas so a
+  late-arriving `onStepEnd` invalidates a stale section.
diff --git a/packages/agents-runtime/src/entity-schema.ts b/packages/agents-runtime/src/entity-schema.ts
@@ -123,6 +123,12 @@ type StepValue = {
   model_provider?: string
   model_id?: string
   duration_ms?: number
+  // Token usage for this step as reported by the provider's
+  // end-of-message `usage` payload. Populated on `onStepEnd` when the
+  // adapter has the data — older events without these fields stay
+  // valid (both optional), so this is a strictly additive change.
+  input_tokens?: number
+  output_tokens?: number
 }
 type TextValue = {
   key?: string
@@ -436,6 +442,8 @@ function createStepSchema(): Schema<StepValue> {
     model_provider: z.string().optional(),
     model_id: z.string().optional(),
     duration_ms: z.number().int().optional(),
+    input_tokens: z.number().int().nonnegative().optional(),
+    output_tokens: z.number().int().nonnegative().optional(),
   })
 }
 

diff --git a/packages/agents-runtime/src/entity-timeline.ts b/packages/agents-runtime/src/entity-timeline.ts
@@ -57,6 +57,13 @@ export type EntityTimelineSection =
       items: Array<EntityTimelineContentItem>
       done?: true
       error?: string
+      // Summed across all steps of the run that produced this section.
+      // Either side may be missing if the provider didn't report it
+      // (e.g. older events recorded before tokens were persisted).
+      tokens?: {
+        input?: number
+        output?: number
+      }
     }
   | {
       kind: `wake`
@@ -103,6 +110,8 @@ export interface IncludesStep {
   status: `started` | `completed`
   model_id?: string
   duration_ms?: number
+  input_tokens?: number
+  output_tokens?: number
 }
 
 export interface IncludesError {
@@ -228,6 +237,8 @@ export interface EntityTimelineStepItem {
   status: `started` | `completed`
   model_id?: string
   duration_ms?: number
+  input_tokens?: number
+  output_tokens?: number
 }
 
 export interface EntityTimelineErrorItem {
@@ -778,6 +789,8 @@ function buildIncludesRuns(input: {
       status: step.status,
       model_id: step.model_id,
       duration_ms: step.duration_ms,
+      input_tokens: step.input_tokens,
+      output_tokens: step.output_tokens,
     })
     stepsByRun.set(step.run_id, entries)
   }
@@ -1361,6 +1374,8 @@ function buildEntityTimelineQuery(
         status: step.status,
         model_id: step.model_id,
         duration_ms: step.duration_ms,
+        input_tokens: step.input_tokens,
+        output_tokens: step.output_tokens,
       })),
     errors: q
       .from({ error: db.collections.errors })
@@ -1490,6 +1505,8 @@ export function createEntityIncludesQuery(
                   status: step.status,
                   model_id: step.model_id,
                   duration_ms: step.duration_ms,
+                  input_tokens: step.input_tokens,
+                  output_tokens: step.output_tokens,
                 }))
             ),
             errors: toArray(

diff --git a/packages/agents-runtime/src/outbound-bridge.ts b/packages/agents-runtime/src/outbound-bridge.ts
@@ -231,6 +231,12 @@ export function createOutboundBridge(
             ...(opts?.durationMs !== undefined && {
               duration_ms: opts.durationMs,
             }),
+            ...(opts?.tokenInput !== undefined && {
+              input_tokens: opts.tokenInput,
+            }),
+            ...(opts?.tokenOutput !== undefined && {
+              output_tokens: opts.tokenOutput,
+            }),
           } as never,
         }) as ChangeEvent
       )

diff --git a/packages/agents-runtime/src/use-chat.ts b/packages/agents-runtime/src/use-chat.ts
@@ -148,6 +148,14 @@ function fingerprintRun(run: IncludesRun): string {
   for (const tc of run.toolCalls) {
     fp += `:${tc.key}.${tc.status}${payloadSniff(`a`, tc.args)}${payloadSniff(`r`, tc.result)}`
   }
+  // Steps participate in the fingerprint because the section now
+  // surfaces summed token counts from them — without this, a step
+  // landing its `input_tokens` / `output_tokens` after the run
+  // already settled would not invalidate the cached section.
+  fp += `|s:${run.steps.length}`
+  for (const s of run.steps) {
+    fp += `:${s.key}.${s.status}.${s.input_tokens ?? `-`}.${s.output_tokens ?? `-`}`
+  }
   return fp
 }
 
@@ -327,11 +335,39 @@ function buildAgentSection(run: IncludesRun): AgentResponseSection {
       failedToolText ?? finishReason ?? `Run failed (no error details recorded)`
   }
 
+  // Token totals across this run's steps. We accumulate per side and
+  // only attach `tokens` to the section if at least one step reported
+  // a number — that way a run whose provider never emitted usage data
+  // (older events, test fixtures, future providers without `usage`)
+  // continues to render with no token row instead of "0 ↑ 0 ↓".
+  let tokenInputSum = 0
+  let tokenOutputSum = 0
+  let sawTokenInput = false
+  let sawTokenOutput = false
+  for (const step of run.steps) {
+    if (typeof step.input_tokens === `number`) {
+      tokenInputSum += step.input_tokens
+      sawTokenInput = true
+    }
+    if (typeof step.output_tokens === `number`) {
+      tokenOutputSum += step.output_tokens
+      sawTokenOutput = true
+    }
+  }
+  const tokens =
+    sawTokenInput || sawTokenOutput
+      ? {
+          ...(sawTokenInput && { input: tokenInputSum }),
+          ...(sawTokenOutput && { output: tokenOutputSum }),
+        }
+      : undefined
+
   const section: AgentResponseSection = {
     kind: `agent_response`,
     items: contentItems,
     ...(run.status === `completed` && { done: true as const }),
     ...(errorText && { error: errorText }),
+    ...(tokens && { tokens }),
   }
   // Always cache (terminal or in-flight). Fingerprint check above
   // guarantees we never serve a stale streaming section — text growth

diff --git a/packages/agents-server-ui/src/components/AgentResponse.tsx b/packages/agents-server-ui/src/components/AgentResponse.tsx
@@ -26,6 +26,7 @@ import { ToolCallView } from './ToolCallView'
 import { TimeText } from './TimeText'
 import { ThinkingIndicator } from './ThinkingIndicator'
 import { ElapsedTime } from './ElapsedTime'
+import { TokenUsage } from './TokenUsage'
 import { formatElapsedDuration, toMillis } from '../lib/formatTime'
 import styles from './AgentResponse.module.css'
 import type {
@@ -400,6 +401,39 @@ export const AgentResponseLive = memo(function AgentResponseLive({
     (q) => (run.errors ? q.from({ error: run.errors }) : undefined),
     [run.errors]
   )
+  // Live token aggregation: subscribe to this run's step rows and
+  // sum `input_tokens` / `output_tokens` across them. Steps land
+  // their token counts on `onStepEnd`, so for a single-turn LLM call
+  // this updates once; for a tool-using run with N model calls it
+  // jumps N times as each step settles.
+  const { data: stepRows = [] } = useLiveQuery(
+    (q) => (run.steps ? q.from({ step: run.steps }) : undefined),
+    [run.steps]
+  )
+  const liveTokens = useMemo(() => {
+    let inSum = 0
+    let outSum = 0
+    let sawIn = false
+    let sawOut = false
+    for (const s of stepRows as Array<{
+      input_tokens?: number
+      output_tokens?: number
+    }>) {
+      if (typeof s.input_tokens === `number`) {
+        inSum += s.input_tokens
+        sawIn = true
+      }
+      if (typeof s.output_tokens === `number`) {
+        outSum += s.output_tokens
+        sawOut = true
+      }
+    }
+    if (!sawIn && !sawOut) return null
+    return {
+      input: sawIn ? inSum : undefined,
+      output: sawOut ? outSum : undefined,
+    }
+  }, [stepRows])
   const sortedItems = useMemo(
     () => [...items].sort(compareLiveRunItems),
     [items]
@@ -532,9 +566,24 @@ export const AgentResponseLive = memo(function AgentResponseLive({
             <ElapsedTime ts={timestamp} enabled={isStreaming} />
           </>
         )}
+        {/* Token usage — sums every step's `input_tokens` /
+            `output_tokens` as they land. Updates at step boundaries
+            (the LLM SDK only emits `usage` at end-of-step), so for a
+            single-turn call it appears once at done; for tool-using
+            runs it jumps as each step completes. */}
+        {liveTokens && (
+          <>
+            {(hasLeadingMeta || (isStreaming && timestamp != null)) && (
+              <Text size={1} tone="muted" className={styles.metaSeparator}>
+                ·
+              </Text>
+            )}
+            <TokenUsage input={liveTokens.input} output={liveTokens.output} />
+          </>
+        )}
         {showTimestamp && (
           <>
-            {hasLeadingMeta && (
+            {(hasLeadingMeta || liveTokens) && (
               <Text size={1} tone="muted" className={styles.metaSeparator}>
                 ·
               </Text>
@@ -682,13 +731,29 @@ export const AgentResponse = memo(function AgentResponse({
             <ElapsedTime ts={timestamp} enabled={isStreaming} />
           </>
         )}
+        {/* Token usage — `section.tokens` is the sum across the
+            run's steps, materialized at section-build time. Mirrors
+            the live render above so cached + live look identical. */}
+        {section.tokens && (
+          <>
+            {(hasLeadingMeta || (isStreaming && timestamp != null)) && (
+              <Text size={1} tone="muted" className={styles.metaSeparator}>
+                ·
+              </Text>
+            )}
+            <TokenUsage
+              input={section.tokens.input}
+              output={section.tokens.output}
+            />
+          </>
+        )}
         {/* Timestamp only on a settled response — while the agent is
             still streaming we let `ThinkingIndicator` + `ElapsedTime`
             own the meta row so it doesn't sit inline with a timestamp
             that hasn't really happened yet. */}
         {showTimestamp && (
           <>
-            {hasLeadingMeta && (
+            {(hasLeadingMeta || section.tokens) && (
               <Text size={1} tone="muted" className={styles.metaSeparator}>
                 ·
               </Text>

diff --git a/packages/agents-server-ui/src/components/TokenUsage.module.css b/packages/agents-server-ui/src/components/TokenUsage.module.css
@@ -0,0 +1,8 @@
+/* Dim the label to match its meta-row siblings (done text, elapsed
+ * time, timestamp). `tabular-nums` gives every digit the same width so
+ * the label doesn't jitter as the counts change at step boundaries. */
+.usage {
+  color: var(--ds-text-4);
+  opacity: 0.7;
+  font-variant-numeric: tabular-nums;
+}
diff --git a/packages/agents-server-ui/src/components/TokenUsage.tsx b/packages/agents-server-ui/src/components/TokenUsage.tsx
@@ -0,0 +1,57 @@
+import { Text } from '../ui'
+import styles from './TokenUsage.module.css'
+
+/**
+ * Compact token-usage label, e.g. `1.2k ↑ 412 ↓` (↑ input, ↓ output).
+ *
+ * Rendered in the agent response meta row; `tabular-nums` (from the
+ * CSS module) keeps the digit column from jittering as counts change.
+ * The visible text is abbreviated by `formatTokenCount`, while the
+ * `aria-label` spells out the exact, unabbreviated counts.
+ *
+ * Either side may be `undefined` (the provider didn't emit it, or the
+ * section was recorded before tokens were persisted) — the missing
+ * half is skipped rather than printed as `0`; both missing → `null`.
+ */
+export function TokenUsage({
+  input,
+  output,
+}: {
+  input: number | undefined
+  output: number | undefined
+}): React.ReactElement | null {
+  if (input == null && output == null) return null
+  const parts: Array<string> = []
+  if (input != null) parts.push(`${formatTokenCount(input)} ↑`)
+  if (output != null) parts.push(`${formatTokenCount(output)} ↓`)
+  const text = parts.join(` `)
+  const ariaParts: Array<string> = []
+  if (input != null) ariaParts.push(`${input} input tokens`)
+  if (output != null) ariaParts.push(`${output} output tokens`)
+  return (
+    <Text
+      size={1}
+      tone="muted"
+      className={styles.usage}
+      aria-label={ariaParts.join(`, `)}
+    >
+      {text}
+    </Text>
+  )
+}
+
+/**
+ * Formats counts >= 1000; `formatTokenCount` prints smaller ones
+ * verbatim. `notation: 'compact'` yields "1.2K", "12K", "1.2M" etc.
+ * in English locales (no locale is pinned, so the runtime default
+ * applies). Lowercased afterward to match the muted meta row.
+ */
+const compactFormatter = new Intl.NumberFormat(undefined, {
+  notation: `compact`,
+  maximumFractionDigits: 1,
+})
+
+function formatTokenCount(n: number): string {
+  if (n < 1000) return String(n)
+  return compactFormatter.format(n).toLowerCase()
+}