From 9f70d0df0be77d5ab387cd1bc26dd838ff6fba3a Mon Sep 17 00:00:00 2001
From: Bilal Godil <bg2002@gmail.com>
Date: Wed, 20 May 2026 15:19:12 -0700
Subject: [PATCH 1/3] Fix /internal/metrics ClickHouse OOM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sentry STACK-BACKEND-16H — the endpoint was triggering the cluster's
10.8 GiB OvercommitTracker kill on tenants with months of $token-refresh
history. Three queries did GROUP BY user_id over the events table without
a lower event_at bound, so their working set scaled with cumulative-users-
ever-seen instead of with the 30-day metrics window.

Changes:
- Add 30-day event_at lower bound to loadUsersByCountry and to the
  analyticsUserJoin inner subquery (used by dailyEvents, totalVisitors,
  topReferrers).
- New getClickhouseAdminClientForMetrics factory in lib/clickhouse.tsx
  applying memory caps (max_memory_usage, max_memory_usage_for_user) +
  external GROUP BY spill + join_algorithm fallback list with grace_hash
  first.
- Document loadDailyActiveSplitFromClickhouse's unbounded min(event_at)
  subquery + the long-term fix (option C: stamp is_anonymous at ingest +
  ASOF backfill).
- Extend the benchmark script with BENCH_HISTORICAL_DAYS, BENCH_BACKFILL_COMPARE,
  and BENCH_JOIN_ALGO_COMPARE modes used to validate the choices above.

Measured at 300k users seeded across 365 days:
  Sum peak memory: 2.18 GiB -> 515 MiB (4.3x less)
  Max query duration: 1293 ms -> 101 ms (12.8x faster)
---
 .../scripts/benchmark-internal-metrics.ts     | 770 +++++++++++++++++-
 .../app/api/latest/internal/metrics/route.tsx |  83 +-
 apps/backend/src/lib/clickhouse.tsx           |  68 +-
 3 files changed, 894 insertions(+), 27 deletions(-)

diff --git a/apps/backend/scripts/benchmark-internal-metrics.ts b/apps/backend/scripts/benchmark-internal-metrics.ts
index 605a988305..05c23e2b98 100644
--- a/apps/backend/scripts/benchmark-internal-metrics.ts
+++ b/apps/backend/scripts/benchmark-internal-metrics.ts
@@ -48,7 +48,7 @@
  *   BENCH_TEAM_RATIO      (default 0.3)     – fraction of users with a team
  */
 
-import { getClickhouseAdminClient } from "@/lib/clickhouse";
+import { getClickhouseAdminClient, getClickhouseAdminClientForMetrics } from "@/lib/clickhouse";
 import { getEnvVariable } from "@stackframe/stack-shared/dist/utils/env";
 import { randomUUID } from "node:crypto";
 
@@ -377,7 +377,11 @@ const ANALYTICS_USER_JOIN = `
 `;
 const NON_ANON_FILTER = "({includeAnonymous:UInt8} = 1 OR coalesce(JSONExtract(toJSONString(e.data), 'is_anonymous', 'Nullable(UInt8)'), token_refresh_users.latest_is_anonymous, 0) = 0)";
 
-// Same joins/filters after fix 1 (direct CAST instead of JSONExtract(toJSONString(...)))
+// Post-fix state of the analyticsUserJoin: lower `event_at >= since` bound
+// added to the inner subquery's WHERE clause (the option A patch shipped for
+// Sentry STACK-BACKEND-16H). Without this bound the inner GROUP BY hash
+// table held one row per ever-seen user — see route.tsx for the full
+// explanation and links to option C as the permanent fix.
 const ANALYTICS_USER_JOIN_AFTER = `
   LEFT JOIN (
     SELECT
@@ -388,6 +392,7 @@ const ANALYTICS_USER_JOIN_AFTER = `
       AND project_id = {projectId:String}
       AND branch_id = {branchId:String}
       AND user_id IS NOT NULL
+      AND event_at >= {since:DateTime}
       AND event_at < {untilExclusive:DateTime}
     GROUP BY user_id
   ) AS token_refresh_users
@@ -398,7 +403,7 @@ const NON_ANON_FILTER_AFTER = "({includeAnonymous:UInt8} = 1 OR coalesce(CAST(e.
 const ROUTE_QUERIES_BEFORE: RouteQuery[] = [
   {
     name: "loadUsersByCountry",
-    desc: "argMax country per user over all $token-refresh events (no window)",
+    desc: "argMax country per user over all $token-refresh events (NO time window)",
     sql: `
       SELECT
         country_code,
@@ -412,7 +417,7 @@ const ROUTE_QUERIES_BEFORE: RouteQuery[] = [
             user_id,
             event_at,
             CAST(data.ip_info.country_code, 'Nullable(String)') AS cc,
-            CAST(data.is_anonymous, 'UInt8') AS is_anonymous
+            coalesce(CAST(data.is_anonymous, 'Nullable(UInt8)'), 0) AS is_anonymous
           FROM analytics_internal.events
           WHERE event_type = '$token-refresh'
             AND project_id = {projectId:String}
@@ -676,8 +681,42 @@ function splitSqlAfter(idCol: "user_id" | "team_id", withAnonFilter: boolean): s
 }
 
 const ROUTE_QUERIES_AFTER: RouteQuery[] = [
-  // Unchanged by fix 1/3 (already uses CAST).
-  ROUTE_QUERIES_BEFORE[0], // loadUsersByCountry
+  // Option A fix: add the lower `event_at >= since` bound so the outer
+  // GROUP BY user_id only sees users active in the 30-day window.
+  {
+    name: "loadUsersByCountry",
+    desc: "A: 30-day window added (was unbounded scan of all $token-refresh history)",
+    sql: `
+      SELECT
+        country_code,
+        count() AS userCount
+      FROM (
+        SELECT
+          user_id,
+          argMax(cc, event_at) AS country_code
+        FROM (
+          SELECT
+            user_id,
+            event_at,
+            CAST(data.ip_info.country_code, 'Nullable(String)') AS cc,
+            coalesce(CAST(data.is_anonymous, 'Nullable(UInt8)'), 0) AS is_anonymous
+          FROM analytics_internal.events
+          WHERE event_type = '$token-refresh'
+            AND project_id = {projectId:String}
+            AND branch_id = {branchId:String}
+            AND user_id IS NOT NULL
+            AND event_at >= {since:DateTime}
+            AND event_at < {untilExclusive:DateTime}
+        )
+        WHERE cc IS NOT NULL
+          AND ({includeAnonymous:UInt8} = 1 OR is_anonymous = 0)
+        GROUP BY user_id
+      )
+      WHERE country_code IS NOT NULL
+      GROUP BY country_code
+      ORDER BY userCount DESC
+    `,
+  },
   {
     name: "loadDailyActiveUsers",
     desc: "DAU per day (fix 1: CAST instead of JSONExtract)",
@@ -1037,8 +1076,389 @@ const ROUTE_QUERIES_OPTIMIZED: RouteQuery[] = [
   },
 ];
 
-async function runRouteQuery(rq: RouteQuery, p: QueryParams, now: Date): Promise<string> {
+// ── Backfill comparison (option A / B / C / D / E) ───────────────────────────
+//
+// Each backfill option (A-E from the analysis doc) leaves the metrics queries
+// in one of three structurally-distinct shapes. We benchmark the three shapes
+// against the same seeded dataset.
+//
+//   A         → bounded LEFT JOIN (event_at >= since on the inner subquery).
+//                Still has the join, but the GROUP BY hash table only contains
+//                users with a token-refresh in the last 30d.
+//   B / C / E → drop the join, classify from e.data.is_anonymous (JSON access).
+//                B = argMax-latest semantics (matches today).
+//                C = ASOF event-at-time semantics.
+//                E = same as B but via partition swap.
+//                All three produce IDENTICAL post-backfill query SQL — the
+//                only differences are in *how the data got there*, not how
+//                the metrics query reads it.
+//   D         → drop the join, classify from a top-level Nullable(UInt8) column.
+//                Same shape as B/C/E but skips the per-row JSON parse.
+//
+// To bench option D we need a real top-level column. We add it under a unique
+// name (BENCH_OPTION_D_COLUMN) before seeding and drop it on cleanup.
+
+// Scoped to RUN_ID so concurrent bench runs don't collide on the shared
+// analytics_internal.events schema, and so a SIGKILL'd run that skips
+// cleanup leaks a uniquely-named column instead of clobbering a future run's
+// `bench_is_anon_d`. Cleanup still drops via IF EXISTS, and the bench's row
+// filter (project_id = BENCH_PROJECT_ID) keeps mutation cost trivial.
+const BENCH_OPTION_D_COLUMN = `bench_is_anon_d_${RUN_ID.replace(/-/g, "_")}`;
+
+const analyticsUserJoinBounded = `
+  LEFT JOIN (
+    SELECT
+      user_id,
+      argMax(coalesce(CAST(data.is_anonymous, 'Nullable(UInt8)'), 0), event_at) AS latest_is_anonymous
+    FROM analytics_internal.events
+    WHERE event_type = '$token-refresh'
+      AND project_id = {projectId:String}
+      AND branch_id = {branchId:String}
+      AND user_id IS NOT NULL
+      AND event_at >= {since:DateTime}        -- ★ option A's lower bound
+      AND event_at < {untilExclusive:DateTime}
+    GROUP BY user_id
+  ) AS token_refresh_users
+    ON e.user_id = token_refresh_users.user_id
+`;
+
+const ROUTE_QUERIES_BACKFILL_A: RouteQuery[] = [
+  {
+    name: "analyticsOverview:dailyEvents",
+    desc: "A: bounded LEFT JOIN (event_at >= since on inner)",
+    sql: `
+      SELECT
+        toDate(e.event_at) AS day,
+        countIf(
+          e.event_type = '$page-view' AND e.user_id IS NOT NULL
+            AND ${NON_ANON_FILTER_AFTER}
+        ) AS pv,
+        countIf(
+          e.event_type = '$click' AND e.user_id IS NOT NULL
+            AND ${NON_ANON_FILTER_AFTER}
+        ) AS cl,
+        uniqExactIf(
+          assumeNotNull(e.user_id),
+          e.event_type = '$page-view' AND e.user_id IS NOT NULL
+            AND ${NON_ANON_FILTER_AFTER}
+        ) AS visitors
+      FROM analytics_internal.events AS e
+      ${analyticsUserJoinBounded}
+      WHERE e.event_type IN ('$page-view', '$click')
+        AND e.project_id = {projectId:String}
+        AND e.branch_id = {branchId:String}
+        AND e.event_at >= {since:DateTime}
+        AND e.event_at < {untilExclusive:DateTime}
+      GROUP BY day
+      ORDER BY day ASC
+    `,
+  },
+  {
+    name: "analyticsOverview:totalVisitors",
+    desc: "A: bounded LEFT JOIN (event_at >= since on inner)",
+    sql: `
+      SELECT
+        uniqExactIf(
+          assumeNotNull(e.user_id),
+          e.user_id IS NOT NULL AND ${NON_ANON_FILTER_AFTER}
+        ) AS visitors
+      FROM analytics_internal.events AS e
+      ${analyticsUserJoinBounded}
+      WHERE e.event_type = '$page-view'
+        AND e.project_id = {projectId:String}
+        AND e.branch_id = {branchId:String}
+        AND e.user_id IS NOT NULL
+        AND e.event_at >= {since:DateTime}
+        AND e.event_at < {untilExclusive:DateTime}
+    `,
+  },
+  {
+    name: "analyticsOverview:topReferrers",
+    desc: "A: bounded LEFT JOIN (event_at >= since on inner)",
+    sql: `
+      SELECT
+        nullIf(CAST(e.data.referrer, 'String'), '') AS referrer,
+        uniqExactIf(
+          assumeNotNull(e.user_id),
+          e.user_id IS NOT NULL AND ${NON_ANON_FILTER_AFTER}
+        ) AS visitors
+      FROM analytics_internal.events AS e
+      ${analyticsUserJoinBounded}
+      WHERE e.event_type = '$page-view'
+        AND e.project_id = {projectId:String}
+        AND e.branch_id = {branchId:String}
+        AND e.event_at >= {since:DateTime}
+        AND e.event_at < {untilExclusive:DateTime}
+      GROUP BY referrer
+      HAVING visitors > 0
+      ORDER BY visitors DESC
+      LIMIT 100
+    `,
+  },
+];
+
+// Options B/C/E all collapse to the same post-backfill SQL shape: drop the
+// LEFT JOIN entirely and trust e.data.is_anonymous (the field that the
+// ingestion fix will populate). The OPTIMIZED array already contains exactly
+// these queries — reuse them so we have a single source of truth.
+const ROUTE_QUERIES_BACKFILL_BCE: RouteQuery[] = ROUTE_QUERIES_OPTIMIZED.filter((q) =>
+  q.name === "analyticsOverview:dailyEvents"
+  || q.name === "analyticsOverview:totalVisitors"
+  || q.name === "analyticsOverview:topReferrers",
+);
+
+const ROUTE_QUERIES_BACKFILL_D: RouteQuery[] = [
+  {
+    name: "analyticsOverview:dailyEvents",
+    desc: "D: drop join, top-level UInt8 column (no JSON parse)",
+    sql: `
+      SELECT
+        toDate(e.event_at) AS day,
+        countIf(
+          e.event_type = '$page-view' AND e.user_id IS NOT NULL
+            AND ({includeAnonymous:UInt8} = 1 OR coalesce(e.${BENCH_OPTION_D_COLUMN}, 0) = 0)
+        ) AS pv,
+        countIf(
+          e.event_type = '$click' AND e.user_id IS NOT NULL
+            AND ({includeAnonymous:UInt8} = 1 OR coalesce(e.${BENCH_OPTION_D_COLUMN}, 0) = 0)
+        ) AS cl,
+        uniqExactIf(
+          assumeNotNull(e.user_id),
+          e.event_type = '$page-view' AND e.user_id IS NOT NULL
+            AND ({includeAnonymous:UInt8} = 1 OR coalesce(e.${BENCH_OPTION_D_COLUMN}, 0) = 0)
+        ) AS visitors
+      FROM analytics_internal.events AS e
+      WHERE e.event_type IN ('$page-view', '$click')
+        AND e.project_id = {projectId:String}
+        AND e.branch_id = {branchId:String}
+        AND e.event_at >= {since:DateTime}
+        AND e.event_at < {untilExclusive:DateTime}
+      GROUP BY day
+      ORDER BY day ASC
+    `,
+  },
+  {
+    name: "analyticsOverview:totalVisitors",
+    desc: "D: drop join, top-level UInt8 column (no JSON parse)",
+    sql: `
+      SELECT
+        uniqExactIf(
+          assumeNotNull(e.user_id),
+          e.user_id IS NOT NULL
+            AND ({includeAnonymous:UInt8} = 1 OR coalesce(e.${BENCH_OPTION_D_COLUMN}, 0) = 0)
+        ) AS visitors
+      FROM analytics_internal.events AS e
+      WHERE e.event_type = '$page-view'
+        AND e.project_id = {projectId:String}
+        AND e.branch_id = {branchId:String}
+        AND e.user_id IS NOT NULL
+        AND e.event_at >= {since:DateTime}
+        AND e.event_at < {untilExclusive:DateTime}
+    `,
+  },
+  {
+    name: "analyticsOverview:topReferrers",
+    desc: "D: drop join, top-level UInt8 column (no JSON parse)",
+    sql: `
+      SELECT
+        nullIf(CAST(e.data.referrer, 'String'), '') AS referrer,
+        uniqExactIf(
+          assumeNotNull(e.user_id),
+          e.user_id IS NOT NULL
+            AND ({includeAnonymous:UInt8} = 1 OR coalesce(e.${BENCH_OPTION_D_COLUMN}, 0) = 0)
+        ) AS visitors
+      FROM analytics_internal.events AS e
+      WHERE e.event_type = '$page-view'
+        AND e.project_id = {projectId:String}
+        AND e.branch_id = {branchId:String}
+        AND e.event_at >= {since:DateTime}
+        AND e.event_at < {untilExclusive:DateTime}
+      GROUP BY referrer
+      HAVING visitors > 0
+      ORDER BY visitors DESC
+      LIMIT 100
+    `,
+  },
+];
+
+async function ensureOptionDColumn(): Promise<void> {
+  const client = getClickhouseAdminClient();
+  const res = await client.query({
+    query: `
+      SELECT count() AS c FROM system.columns
+      WHERE database = 'analytics_internal'
+        AND table = 'events'
+        AND name = {col:String}
+    `,
+    query_params: { col: BENCH_OPTION_D_COLUMN },
+    format: "JSONEachRow",
+  });
+  const rows = (await res.json()) as { c: string | number }[];
+  if (Number(rows[0]?.c ?? 0) === 0) {
+    await client.command({
+      query: `
+        ALTER TABLE analytics_internal.events
+        ADD COLUMN IF NOT EXISTS ${BENCH_OPTION_D_COLUMN} Nullable(UInt8)
+      `,
+    });
+  }
+}
+
+async function populateOptionDColumn(): Promise<void> {
+  const client = getClickhouseAdminClient();
+  await client.command({
+    query: `
+      ALTER TABLE analytics_internal.events
+      UPDATE ${BENCH_OPTION_D_COLUMN} = CAST(data.is_anonymous, 'Nullable(UInt8)')
+      WHERE project_id = {projectId:String}
+    `,
+    query_params: { projectId: BENCH_PROJECT_ID },
+    clickhouse_settings: { mutations_sync: "2" },
+  });
+}
+
+async function dropOptionDColumn(): Promise<void> {
   const client = getClickhouseAdminClient();
+  await client.command({
+    query: `ALTER TABLE analytics_internal.events DROP COLUMN IF EXISTS ${BENCH_OPTION_D_COLUMN}`,
+  });
+}
+
+// Pulls only the 3 analyticsOverview queries that touch the join from the
+// current (post-fixes-1+3) shipped SQL, so we have a baseline to compare against.
+const ROUTE_QUERIES_BACKFILL_BASELINE: RouteQuery[] = ROUTE_QUERIES_AFTER.filter((q) =>
+  q.name === "analyticsOverview:dailyEvents"
+  || q.name === "analyticsOverview:totalVisitors"
+  || q.name === "analyticsOverview:topReferrers",
+);
+
+async function benchmarkBackfillCompare(now: Date): Promise<void> {
+  const untilExclusive = new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate()) + ONE_DAY_MS);
+  const since = new Date(untilExclusive.getTime() - METRICS_WINDOW_MS);
+  const params: QueryParams = {
+    projectId: BENCH_PROJECT_ID,
+    branchId: PERF_BRANCH_ID,
+    since,
+    untilExclusive,
+    includeAnonymous: false,
+  };
+
+  console.log("\n── Backfill option comparison (post-backfill query memory) ──");
+  console.log("  Each option leaves the metrics queries in one of three shapes:");
+  console.log("    Today / A (bounded join) / B,C,E (drop join, JSON) / D (drop join, top-level column)\n");
+
+  // Set up option D's column.
+  console.log("  Setting up option D top-level column…");
+  await ensureOptionDColumn();
+  await populateOptionDColumn();
+  console.log("  done.\n");
+
+  async function runShape(label: string, list: RouteQuery[]): Promise<Map<string, QueryStats>> {
+    const out = new Map<string, QueryStats>();
+    for (const rq of list) {
+      const qid = await runRouteQuery(rq, params, now);
+      out.set(rq.name, await readStats(qid));
+    }
+    return out;
+  }
+
+  // Warm cache once with a tiny query.
+  await runRouteQuery(ROUTE_QUERIES_BEFORE[1], params, now);
+
+  const baseline = await runShape("today", ROUTE_QUERIES_BACKFILL_BASELINE);
+  const a = await runShape("A", ROUTE_QUERIES_BACKFILL_A);
+  const bce = await runShape("B/C/E", ROUTE_QUERIES_BACKFILL_BCE);
+  const d = await runShape("D", ROUTE_QUERIES_BACKFILL_D);
+
+  const padL = (s: string, n: number) => s.padEnd(n);
+  const padR = (s: string, n: number) => s.padStart(n);
+  const mem = (n: number | undefined) => n == null ? "—" : padR(fmtBytes(n), 12);
+  const dur = (n: number | undefined) => n == null ? "—" : padR(`${n} ms`, 9);
+
+  const queryNames = ["analyticsOverview:dailyEvents", "analyticsOverview:totalVisitors", "analyticsOverview:topReferrers"];
+
+  console.log("  Peak memory per query, per backfill shape:");
+  console.log([
+    padL("query", 36),
+    padR("Today", 12),
+    padR("A: bounded", 12),
+    padR("B/C/E: drop", 12),
+    padR("D: column", 12),
+  ].join("  "));
+  console.log("  " + "─".repeat(96));
+  for (const name of queryNames) {
+    const t = baseline.get(name);
+    const av = a.get(name);
+    const bv = bce.get(name);
+    const dv = d.get(name);
+    console.log([
+      "  " + padL(name, 34),
+      mem(t?.memory_usage),
+      mem(av?.memory_usage),
+      mem(bv?.memory_usage),
+      mem(dv?.memory_usage),
+    ].join("  "));
+  }
+
+  // Totals
+  const sum = (m: Map<string, QueryStats>) =>
+    queryNames.reduce((acc, n) => acc + (m.get(n)?.memory_usage ?? 0), 0);
+  const tSum = sum(baseline);
+  const aSum = sum(a);
+  const bSum = sum(bce);
+  const dSum = sum(d);
+
+  console.log("  " + "─".repeat(96));
+  console.log([
+    "  " + padL("SUM peak memory", 34),
+    padR(fmtBytes(tSum), 12),
+    padR(fmtBytes(aSum), 12),
+    padR(fmtBytes(bSum), 12),
+    padR(fmtBytes(dSum), 12),
+  ].join("  "));
+
+  const ratio = (s: number) => s === 0 ? "—" : `${(tSum / s).toFixed(2)}× less`;
+  console.log([
+    "  " + padL("vs. Today", 34),
+    padR("—", 12),
+    padR(ratio(aSum), 12),
+    padR(ratio(bSum), 12),
+    padR(ratio(dSum), 12),
+  ].join("  "));
+
+  // Duration too
+  console.log("\n  Query duration per shape:");
+  console.log([
+    padL("query", 36),
+    padR("Today", 9),
+    padR("A", 9),
+    padR("B/C/E", 9),
+    padR("D", 9),
+  ].join("  "));
+  console.log("  " + "─".repeat(84));
+  for (const name of queryNames) {
+    console.log([
+      "  " + padL(name, 34),
+      dur(baseline.get(name)?.query_duration_ms),
+      dur(a.get(name)?.query_duration_ms),
+      dur(bce.get(name)?.query_duration_ms),
+      dur(d.get(name)?.query_duration_ms),
+    ].join("  "));
+  }
+
+  console.log("\n  Notes:");
+  console.log("    • B, C, E all produce the same query SQL; their column above is one number.");
+  console.log("    • The semantic differences (argMax-latest vs ASOF vs partition-swap) live");
+  console.log("      in the backfill operation, not in the query that runs afterwards.");
+  console.log("    • Option D mutates a stored column instead of the JSON field; the runtime");
+  console.log("      win comes from skipping per-row JSON access.");
+}
+
+async function runRouteQuery(rq: RouteQuery, p: QueryParams, now: Date, opts: { useMetricsClient?: boolean } = {}): Promise<string> {
+  // `useMetricsClient` runs through getClickhouseAdminClientForMetrics, which
+  // applies the connection-level SETTINGS (caps + grace_hash) — so AFTER
+  // measurements reflect what actually ships in route.tsx, not the raw SQL.
+  const client = opts.useMetricsClient ? getClickhouseAdminClientForMetrics() : getClickhouseAdminClient();
   const queryId = `bench-route-${rq.name.replace(/[^a-z0-9]/gi, "-")}-${randomUUID()}`;
   const baseParams: Record<string, unknown> = {
     projectId: p.projectId,
@@ -1058,6 +1478,302 @@ async function runRouteQuery(rq: RouteQuery, p: QueryParams, now: Date): Promise
   return queryId;
 }
 
+// ── Join algorithm comparison (BENCH_JOIN_ALGO_COMPARE=1) ─────────────────────
+//
+// For each of ClickHouse's 6 join algorithms, run the 3 analyticsOverview
+// queries under two cases:
+//
+//   normal  = bounded analyticsUserJoin (the SQL we just shipped). Small build
+//             side; isolated from cumulative-history scan.
+//   heavy   = UNBOUNDED analyticsUserJoin (pre-fix). Large build side; the
+//             pattern that caused Sentry STACK-BACKEND-16H.
+//
+// Algorithms:
+//   default            - leave the cluster default (typically `direct,parallel_hash,hash`)
+//   direct             - KV lookup on right side; only works if right is a Dictionary
+//                        (it isn't here, so we expect this to fall back or error)
+//   hash               - classic single-threaded hash join
+//   parallel_hash      - parallel build, fastest on small/medium, most memory
+//   grace_hash         - partitioned hash, spills to disk
+//   full_sorting_merge - sorts both sides; can beat hash when input is huge
+//   partial_merge      - sorts only the right side; lowest memory, slowest
+//
+// Some algorithms will error or be silently rejected for shapes they can't
+// handle (e.g. `direct` requires a Dictionary). Errors are caught and shown
+// as `ERR` in the output table.
+
+const ANALYTICS_OVERVIEW_QUERY_NAMES = [
+  "analyticsOverview:dailyEvents",
+  "analyticsOverview:totalVisitors",
+  "analyticsOverview:topReferrers",
+] as const;
+
+const JOIN_ALGORITHMS = [
+  "default",
+  "direct",
+  "hash",
+  "parallel_hash",
+  "grace_hash",
+  "full_sorting_merge",
+  "partial_merge",
+] as const;
+type JoinAlgorithm = typeof JOIN_ALGORITHMS[number];
+
+// Build the 3 analyticsOverview queries with the given join SQL, filter SQL,
+// and a per-query SETTINGS clause (used to force `join_algorithm`).
+function buildAnalyticsOverviewVariant(opts: {
+  joinSql: string,
+  nonAnonFilter: string,
+  joinAlgorithm: JoinAlgorithm,
+}): RouteQuery[] {
+  const settings = opts.joinAlgorithm === "default"
+    ? ""
+    : `SETTINGS join_algorithm = '${opts.joinAlgorithm}'`;
+  return [
+    {
+      name: "analyticsOverview:dailyEvents",
+      desc: `analyticsOverview daily events (join_algorithm=${opts.joinAlgorithm})`,
+      sql: `
+        SELECT
+          toDate(e.event_at) AS day,
+          countIf(
+            e.event_type = '$page-view'
+              AND e.user_id IS NOT NULL
+              AND ${opts.nonAnonFilter}
+          ) AS pv,
+          countIf(
+            e.event_type = '$click'
+              AND e.user_id IS NOT NULL
+              AND ${opts.nonAnonFilter}
+          ) AS cl,
+          uniqExactIf(
+            assumeNotNull(e.user_id),
+            e.event_type = '$page-view'
+              AND e.user_id IS NOT NULL
+              AND ${opts.nonAnonFilter}
+          ) AS visitors
+        FROM analytics_internal.events AS e
+        ${opts.joinSql}
+        WHERE e.event_type IN ('$page-view', '$click')
+          AND e.project_id = {projectId:String}
+          AND e.branch_id = {branchId:String}
+          AND e.event_at >= {since:DateTime}
+          AND e.event_at < {untilExclusive:DateTime}
+        GROUP BY day
+        ORDER BY day ASC
+        ${settings}
+      `,
+    },
+    {
+      name: "analyticsOverview:totalVisitors",
+      desc: `analyticsOverview total visitors (join_algorithm=${opts.joinAlgorithm})`,
+      sql: `
+        SELECT
+          uniqExactIf(
+            assumeNotNull(e.user_id),
+            e.user_id IS NOT NULL
+              AND ${opts.nonAnonFilter}
+          ) AS visitors
+        FROM analytics_internal.events AS e
+        ${opts.joinSql}
+        WHERE e.event_type = '$page-view'
+          AND e.project_id = {projectId:String}
+          AND e.branch_id = {branchId:String}
+          AND e.user_id IS NOT NULL
+          AND e.event_at >= {since:DateTime}
+          AND e.event_at < {untilExclusive:DateTime}
+        ${settings}
+      `,
+    },
+    {
+      name: "analyticsOverview:topReferrers",
+      desc: `analyticsOverview top referrers (join_algorithm=${opts.joinAlgorithm})`,
+      sql: `
+        SELECT
+          nullIf(CAST(e.data.referrer, 'String'), '') AS referrer,
+          uniqExactIf(
+            assumeNotNull(e.user_id),
+            e.user_id IS NOT NULL
+              AND ${opts.nonAnonFilter}
+          ) AS visitors
+        FROM analytics_internal.events AS e
+        ${opts.joinSql}
+        WHERE e.event_type = '$page-view'
+          AND e.project_id = {projectId:String}
+          AND e.branch_id = {branchId:String}
+          AND e.event_at >= {since:DateTime}
+          AND e.event_at < {untilExclusive:DateTime}
+        GROUP BY referrer
+        HAVING visitors > 0
+        ORDER BY visitors DESC
+        LIMIT 100
+        ${settings}
+      `,
+    },
+  ];
+}
+
+// Run a query and either return its QueryStats, or null if the query errored
+// (e.g. an unsupported `join_algorithm` for the query's shape).
+async function tryRunAndReadStats(rq: RouteQuery, p: QueryParams, now: Date): Promise<QueryStats | null> {
+  try {
+    const qid = await runRouteQuery(rq, p, now);
+    return await readStats(qid);
+  } catch {
+    return null;
+  }
+}
+
+async function benchmarkJoinAlgorithms(now: Date): Promise<void> {
+  const untilExclusive = new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate()) + ONE_DAY_MS);
+  const since = new Date(untilExclusive.getTime() - METRICS_WINDOW_MS);
+  const params: QueryParams = {
+    projectId: BENCH_PROJECT_ID,
+    branchId: PERF_BRANCH_ID,
+    since,
+    untilExclusive,
+    includeAnonymous: false,
+  };
+
+  console.log("\n── Join algorithm comparison (6 algorithms × 2 cases) ──");
+  console.log("  Cases:");
+  console.log("    normal = bounded analyticsUserJoin (the SQL we just shipped)");
+  console.log("    heavy  = UNBOUNDED analyticsUserJoin (pre-fix; what caused the Sentry OOM)");
+  console.log("");
+
+  // Warm cache once.
+  await runRouteQuery(
+    buildAnalyticsOverviewVariant({
+      joinSql: ANALYTICS_USER_JOIN_AFTER,
+      nonAnonFilter: NON_ANON_FILTER_AFTER,
+      joinAlgorithm: "default",
+    })[0],
+    params,
+    now,
+  );
+
+  type CaseName = "normal" | "heavy";
+  const cases: Record<CaseName, { joinSql: string, nonAnonFilter: string }> = {
+    normal: { joinSql: ANALYTICS_USER_JOIN_AFTER, nonAnonFilter: NON_ANON_FILTER_AFTER },
+    heavy: { joinSql: ANALYTICS_USER_JOIN, nonAnonFilter: NON_ANON_FILTER },
+  };
+
+  type StatsByCaseByAlgo = Record<CaseName, Record<JoinAlgorithm, Map<string, QueryStats | null>>>;
+  const stats: StatsByCaseByAlgo = {
+    normal: {} as Record<JoinAlgorithm, Map<string, QueryStats | null>>,
+    heavy: {} as Record<JoinAlgorithm, Map<string, QueryStats | null>>,
+  };
+
+  for (const caseName of ["normal", "heavy"] as const) {
+    for (const algo of JOIN_ALGORITHMS) {
+      const queries = buildAnalyticsOverviewVariant({
+        joinSql: cases[caseName].joinSql,
+        nonAnonFilter: cases[caseName].nonAnonFilter,
+        joinAlgorithm: algo,
+      });
+      const out = new Map<string, QueryStats | null>();
+      for (const rq of queries) {
+        out.set(rq.name, await tryRunAndReadStats(rq, params, now));
+      }
+      stats[caseName][algo] = out;
+    }
+  }
+
+  const padL = (s: string, n: number) => s.padEnd(n);
+  const padR = (s: string, n: number) => s.padStart(n);
+  const memCell = (s: QueryStats | null | undefined) =>
+    s == null ? padR("ERR", 11) : padR(fmtBytes(s.memory_usage), 11);
+  const durCell = (s: QueryStats | null | undefined) =>
+    s == null ? padR("ERR", 9) : padR(`${s.query_duration_ms} ms`, 9);
+
+  function printCaseTable(caseName: CaseName): void {
+    console.log(`\n  ── ${caseName.toUpperCase()} case (${caseName === "normal" ? "bounded join" : "UNBOUNDED join"}) ──`);
+    // Header row
+    console.log([
+      padL("query", 32),
+      ...JOIN_ALGORITHMS.map((a) => padR(a, 11)),
+    ].join("  "));
+    console.log("  " + "─".repeat(32 + JOIN_ALGORITHMS.length * 13));
+    // Per-query memory
+    for (const name of ANALYTICS_OVERVIEW_QUERY_NAMES) {
+      console.log([
+        "  " + padL(name, 30),
+        ...JOIN_ALGORITHMS.map((a) => memCell(stats[caseName][a].get(name))),
+      ].join("  "));
+    }
+    // Sum memory row
+    console.log("  " + "─".repeat(32 + JOIN_ALGORITHMS.length * 13));
+    console.log([
+      "  " + padL("SUM peak memory", 30),
+      ...JOIN_ALGORITHMS.map((a) => {
+        const sum = ANALYTICS_OVERVIEW_QUERY_NAMES.reduce((acc, n) => {
+          const s = stats[caseName][a].get(n);
+          return s == null ? acc : acc + s.memory_usage;
+        }, 0);
+        const anyErr = ANALYTICS_OVERVIEW_QUERY_NAMES.some((n) => stats[caseName][a].get(n) == null);
+        return anyErr ? padR("partial", 11) : padR(fmtBytes(sum), 11);
+      }),
+    ].join("  "));
+    // Sum duration row
+    console.log([
+      "  " + padL("SUM duration", 30),
+      ...JOIN_ALGORITHMS.map((a) => {
+        const sum = ANALYTICS_OVERVIEW_QUERY_NAMES.reduce((acc, n) => {
+          const s = stats[caseName][a].get(n);
+          return s == null ? acc : acc + s.query_duration_ms;
+        }, 0);
+        const anyErr = ANALYTICS_OVERVIEW_QUERY_NAMES.some((n) => stats[caseName][a].get(n) == null);
+        return anyErr ? padR("partial", 11) : padR(`${sum} ms`, 11);
+      }),
+    ].join("  "));
+  }
+
+  printCaseTable("normal");
+  printCaseTable("heavy");
+
+  // Find the best algorithm per case by memory and by duration
+  function bestByMemory(caseName: CaseName): { algo: JoinAlgorithm, mem: number } | null {
+    let best: { algo: JoinAlgorithm, mem: number } | null = null;
+    for (const a of JOIN_ALGORITHMS) {
+      const sum = ANALYTICS_OVERVIEW_QUERY_NAMES.reduce((acc, n) => {
+        const s = stats[caseName][a].get(n);
+        return s == null ? acc : acc + s.memory_usage;
+      }, 0);
+      const anyErr = ANALYTICS_OVERVIEW_QUERY_NAMES.some((n) => stats[caseName][a].get(n) == null);
+      if (anyErr) continue;
+      if (best == null || sum < best.mem) best = { algo: a, mem: sum };
+    }
+    return best;
+  }
+  function bestByDuration(caseName: CaseName): { algo: JoinAlgorithm, dur: number } | null {
+    let best: { algo: JoinAlgorithm, dur: number } | null = null;
+    for (const a of JOIN_ALGORITHMS) {
+      const sum = ANALYTICS_OVERVIEW_QUERY_NAMES.reduce((acc, n) => {
+        const s = stats[caseName][a].get(n);
+        return s == null ? acc : acc + s.query_duration_ms;
+      }, 0);
+      const anyErr = ANALYTICS_OVERVIEW_QUERY_NAMES.some((n) => stats[caseName][a].get(n) == null);
+      if (anyErr) continue;
+      if (best == null || sum < best.dur) best = { algo: a, dur: sum };
+    }
+    return best;
+  }
+
+  console.log("\n  Headlines:");
+  for (const c of ["normal", "heavy"] as const) {
+    const bm = bestByMemory(c);
+    const bd = bestByDuration(c);
+    if (bm) console.log(`    ${c.padEnd(7)} | best memory: ${bm.algo.padEnd(20)} (${fmtBytes(bm.mem)})`);
+    if (bd) console.log(`    ${c.padEnd(7)} | best speed:  ${bd.algo.padEnd(20)} (${bd.dur} ms)`);
+  }
+  console.log("\n  Notes:");
+  console.log("    • ERR / partial = the algorithm errored or didn't apply for that query shape");
+  console.log("      (typical for `direct` which requires a Dictionary right-hand side).");
+  console.log("    • The OOM bottleneck is the inner GROUP BY user_id aggregation, not the");
+  console.log("      join's hash table — algorithms that only target the join (everything");
+  console.log("      except sorting-merge variants) cap out at ~modest savings on the heavy case.");
+}
+
 async function benchmarkRouteQueries(now: Date): Promise<void> {
   const untilExclusive = new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate()) + ONE_DAY_MS);
   const since = new Date(untilExclusive.getTime() - METRICS_WINDOW_MS);
@@ -1085,11 +1801,14 @@ async function benchmarkRouteQueries(now: Date): Promise<void> {
 
   // Also capture the actual row payload so we can check correctness for OPT
   // variants (e.g., dropping the LEFT JOIN on analyticsOverview must not change counts).
-  async function runAndCollect(list: RouteQuery[]): Promise<{ stats: Map<string, QueryStats>, payloads: Map<string, unknown[]> }> {
+  // `useMetricsClient` routes through getClickhouseAdminClientForMetrics so the
+  // connection-level SETTINGS (caps + grace_hash) apply — used for AFTER to
+  // mirror what actually ships.
+  async function runAndCollect(list: RouteQuery[], opts: { useMetricsClient?: boolean } = {}): Promise<{ stats: Map<string, QueryStats>, payloads: Map<string, unknown[]> }> {
     const stats = new Map<string, QueryStats>();
     const payloads = new Map<string, unknown[]>();
     for (const rq of list) {
-      const client = getClickhouseAdminClient();
+      const client = opts.useMetricsClient ? getClickhouseAdminClientForMetrics() : getClickhouseAdminClient();
       const queryId = `bench-route-${rq.name.replace(/[^a-z0-9]/gi, "-")}-${randomUUID()}`;
       const baseParams: Record<string, unknown> = {
         projectId: params.projectId,
@@ -1114,8 +1833,8 @@ async function benchmarkRouteQueries(now: Date): Promise<void> {
   }
 
   const before = await runAndCollect(ROUTE_QUERIES_BEFORE);
-  const after = await runAndCollect(ROUTE_QUERIES_AFTER);
-  const opt = await runAndCollect(ROUTE_QUERIES_OPTIMIZED);
+  const after = await runAndCollect(ROUTE_QUERIES_AFTER, { useMetricsClient: true });
+  const opt = await runAndCollect(ROUTE_QUERIES_OPTIMIZED, { useMetricsClient: true });
   const beforeStats = before.stats;
   const afterStats = after.stats;
 
@@ -1280,7 +1999,10 @@ type QueryStats = {
 async function readStats(queryId: string): Promise<QueryStats> {
   const client = getClickhouseAdminClient();
   await client.command({ query: "SYSTEM FLUSH LOGS" });
-  const delays = [100, 200, 400, 800, 1600];
+  // Total budget ~12.7s. With many heavy queries running in sequence the
+  // query_log async flush can take longer than the original 3.1s budget, which
+  // was producing spurious "no query_log row" failures at the 300k-user scale.
+  const delays = [100, 200, 400, 800, 1600, 3200, 6400];
   for (let i = 0; i <= delays.length; i++) {
     const res = await client.query({
       query: `
@@ -1337,6 +2059,13 @@ async function cleanup(): Promise<void> {
     // Block until the mutation is applied so the script exits clean.
     clickhouse_settings: { mutations_sync: "2" },
   });
+  // Best-effort: drop the option-D bench column if we added it. Safe to run
+  // unconditionally because of the IF EXISTS guard.
+  try {
+    await dropOptionDColumn();
+  } catch (e) {
+    console.error("  (could not drop option-D column:", e, ")");
+  }
 }
 
 // ── Edge-case matrix ─────────────────────────────────────────────────────────
@@ -1607,9 +2336,18 @@ async function seedPerf(now: Date): Promise<void> {
   );
 
   const batchRows = envInt("BENCH_BATCH", 50_000);
+  // BENCH_HISTORICAL_DAYS extends the event seed back beyond the 30-day metrics
+  // window so unbounded-scan queries (loadUsersByCountry, the LEFT JOIN in the
+  // splits) read more data than windowed queries — mirroring the prod skew that
+  // caused Sentry STACK-BACKEND-16H. Default 365 days. Set to 30 to recover the
+  // original behavior.
+  const historicalDays = envInt("BENCH_HISTORICAL_DAYS", 365);
   const windowEnd = new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate()) + ONE_DAY_MS);
-  const windowStart = new Date(windowEnd.getTime() - METRICS_WINDOW_MS);
+  const windowStart = new Date(windowEnd.getTime() - historicalDays * 24 * 60 * 60 * 1000);
   const spanMs = windowEnd.getTime() - windowStart.getTime();
+  if (historicalDays !== METRICS_WINDOW_DAYS) {
+    console.log(`  (event_at spans last ${historicalDays} days; only the most recent ${METRICS_WINDOW_DAYS} fall inside the metrics window)`);
+  }
   const teamIds: string[] = Array.from({ length: teamCount }, () => mkUuid());
 
   const t0 = Date.now();
@@ -1815,10 +2553,14 @@ async function main(): Promise<void> {
 
     const doPerf = matrixOk && !envBool("BENCH_SKIP_PERF");
     const doRouteQueries = matrixOk && envBool("BENCH_ROUTE_QUERIES");
-    if (doPerf || doRouteQueries) {
+    const doBackfillCompare = matrixOk && envBool("BENCH_BACKFILL_COMPARE");
+    const doJoinAlgoCompare = matrixOk && (envBool("BENCH_JOIN_ALGO_COMPARE") || envBool("BENCH_GRACE_HASH_COMPARE"));
+    if (doPerf || doRouteQueries || doBackfillCompare || doJoinAlgoCompare) {
       await seedPerf(now);
       if (doPerf) await runPerf(now);
       if (doRouteQueries) await benchmarkRouteQueries(now);
+      if (doBackfillCompare) await benchmarkBackfillCompare(now);
+      if (doJoinAlgoCompare) await benchmarkJoinAlgorithms(now);
     } else if (envBool("BENCH_SKIP_PERF")) {
       console.log("Skipping perf run (BENCH_SKIP_PERF=1)");
     }
diff --git a/apps/backend/src/app/api/latest/internal/metrics/route.tsx b/apps/backend/src/app/api/latest/internal/metrics/route.tsx
index faf8a0df6c..f0d17ce170 100644
--- a/apps/backend/src/app/api/latest/internal/metrics/route.tsx
+++ b/apps/backend/src/app/api/latest/internal/metrics/route.tsx
@@ -1,6 +1,6 @@
 import { Prisma } from "@/generated/prisma/client";
 import { EmailOutboxSimpleStatus } from "@/generated/prisma/enums";
-import { getClickhouseAdminClient } from "@/lib/clickhouse";
+import { getClickhouseAdminClientForMetrics } from "@/lib/clickhouse";
 import { ClickHouseError } from "@clickhouse/client";
 import { ActivitySplit } from "@/lib/metrics-activity-split";
 import { Tenancy } from "@/lib/tenancies";
@@ -30,6 +30,9 @@ const MAX_USERS_FOR_COUNTRY_SAMPLE = 10_000;
 const METRICS_WINDOW_DAYS = 30;
 const METRICS_WINDOW_MS = METRICS_WINDOW_DAYS * 24 * 60 * 60 * 1000;
 const ONE_DAY_MS = 24 * 60 * 60 * 1000;
+// All ClickHouse queries in this file run through `getClickhouseAdminClientForMetrics`,
+// which configures memory caps + GROUP BY spill defaults at the connection level —
+// see lib/clickhouse.tsx for the rationale.
 export const METRICS_REVENUE_INVOICE_STATUSES = ["paid", "succeeded"] as const;
 const METRICS_REVENUE_INVOICE_STATUSES_SQL = Prisma.raw(
   METRICS_REVENUE_INVOICE_STATUSES.map((status) => `'${status}'`).join(", "),
@@ -70,8 +73,17 @@ function normalizeUuidFromEvent(value: string): string | null {
 // ClickHouse `match()` uses re2; pattern matches UUID_V4_JS_RE.source.
 const MAU_UUID_V4_REGEX = UUID_V4_JS_RE.source;
 
-async function loadUsersByCountry(tenancy: Tenancy, includeAnonymous: boolean = false): Promise<Record<string, number>> {
-  const clickhouseClient = getClickhouseAdminClient();
+async function loadUsersByCountry(tenancy: Tenancy, now: Date, includeAnonymous: boolean = false): Promise<Record<string, number>> {
+  // Bound the scan to the 30-day metrics window. Before this lower bound,
+  // the inner `GROUP BY user_id` materialized one row per ever-seen user for
+  // the tenant — see Sentry STACK-BACKEND-16H, where this query was a top
+  // memory consumer on tenants with many months of $token-refresh history.
+  // Users without a token-refresh in the last 30 days are no longer counted;
+  // token-refresh fires every few minutes for each open session, so this only
+  // omits users with no active sessions in the last 30 days, which is the
+  // intent of a "metrics window" anyway.
+  const { since, untilExclusive } = getMetricsWindowBounds(now);
+  const clickhouseClient = getClickhouseAdminClientForMetrics();
   const res = await clickhouseClient.query({
     query: `
       SELECT
@@ -92,6 +104,8 @@ async function loadUsersByCountry(tenancy: Tenancy, includeAnonymous: boolean =
             AND project_id = {projectId:String}
             AND branch_id = {branchId:String}
             AND user_id IS NOT NULL
+            AND event_at >= {since:DateTime}
+            AND event_at < {untilExclusive:DateTime}
         )
         WHERE cc IS NOT NULL
           AND ({includeAnonymous:UInt8} = 1 OR is_anonymous = 0)
@@ -105,6 +119,8 @@ async function loadUsersByCountry(tenancy: Tenancy, includeAnonymous: boolean =
       projectId: tenancy.project.id,
       branchId: tenancy.branchId,
       includeAnonymous: includeAnonymous ? 1 : 0,
+      since: formatClickhouseDateTimeParam(since),
+      untilExclusive: formatClickhouseDateTimeParam(untilExclusive),
     },
     format: "JSONEachRow",
   });
@@ -139,7 +155,7 @@ async function loadActiveUsersByCountry(
 ): Promise<Record<string, MetricsRecentUser[]>> {
   const since = new Date(now.getTime() - ACTIVE_USERS_BY_COUNTRY_WINDOW_MS);
 
-  const clickhouseClient = getClickhouseAdminClient();
+  const clickhouseClient = getClickhouseAdminClientForMetrics();
   const res = await clickhouseClient.query({
     query: `
       SELECT
@@ -269,7 +285,7 @@ async function loadLiveUsersCount(
   const since = new Date(now.getTime() - ACTIVE_USERS_BY_COUNTRY_WINDOW_MS);
 
   try {
-    const clickhouseClient = getClickhouseAdminClient();
+    const clickhouseClient = getClickhouseAdminClientForMetrics();
     const res = await clickhouseClient.query({
       query: `
         SELECT uniqExact(user_id) AS live_users
@@ -347,7 +363,7 @@ async function loadDailyActiveUsers(tenancy: Tenancy, now: Date, includeAnonymou
   const since = new Date(todayUtc.getTime() - METRICS_WINDOW_MS);
   const untilExclusive = new Date(todayUtc.getTime() + ONE_DAY_MS);
 
-  const clickhouseClient = getClickhouseAdminClient();
+  const clickhouseClient = getClickhouseAdminClientForMetrics();
   const result = await clickhouseClient.query({
     query: `
       SELECT
@@ -412,11 +428,24 @@ async function loadDailyActiveSplitFromClickhouse(options: {
     ? "AND ({includeAnonymous:UInt8} = 1 OR coalesce(CAST(data.is_anonymous, 'Nullable(UInt8)'), 0) = 0)"
     : "";
 
-  const clickhouseClient = getClickhouseAdminClient();
+  const clickhouseClient = getClickhouseAdminClientForMetrics();
   // Note: the inner `assumeNotNull(${idCol}) AS entity_id` must not reuse the
   // column name, or ClickHouse re-resolves `WHERE ${idCol} IS NOT NULL`
   // against the alias (assumeNotNull returns '' for NULLs, which passes the
   // not-null test) and phantom rows slip through.
+  //
+  // The LEFT JOIN's `min(event_at)` subquery below scans all $token-refresh
+  // history per request (no `event_at >= ...` lower bound) — same unbounded
+  // pattern as Sentry STACK-BACKEND-16H, and was ranked #5/#6 by peak memory
+  // in the benchmark. It can't be bounded the same way as the other queries
+  // because `first_date` is used to classify entities as new vs reactivated:
+  // bounding the subquery to 30 days would cause entities first seen >30d ago
+  // and active today to be (incorrectly) reported as "new". The proper fix
+  // requires either materializing a per-entity "first-seen-ever" table or
+  // shipping the option C backfill so we can read first_date from the event
+  // stream cheaply. For now the per-user memory cap configured on
+  // getClickhouseAdminClientForMetrics (see lib/clickhouse.tsx) spills the
+  // GROUP BY hash table and caps the query if it runs out of memory.
   const result = await clickhouseClient.query({
     query: `
       SELECT
@@ -552,7 +581,7 @@ async function loadAnonymousVisitorsFromTokenRefresh(
   now: Date,
 ): Promise<{ dailyVisitors: DataPoints, visitors: number }> {
   const { since, untilExclusive } = getMetricsWindowBounds(now);
-  const clickhouseClient = getClickhouseAdminClient();
+  const clickhouseClient = getClickhouseAdminClientForMetrics();
 
   const query = `
     SELECT
@@ -630,7 +659,7 @@ async function loadAnonymousVisitorsFromTokenRefresh(
 async function loadMonthlyActiveUsers(tenancy: Tenancy, now: Date, includeAnonymous: boolean = false): Promise<number> {
   const { since, untilExclusive } = getMetricsWindowBounds(now);
 
-  const clickhouseClient = getClickhouseAdminClient();
+  const clickhouseClient = getClickhouseAdminClientForMetrics();
   try {
     const result = await clickhouseClient.query({
       query: `
@@ -1038,7 +1067,7 @@ async function loadAnalyticsOverview(tenancy: Tenancy, now: Date, includeAnonymo
   const since = new Date(todayUtc.getTime() - METRICS_WINDOW_MS);
   const untilExclusive = new Date(todayUtc.getTime() + ONE_DAY_MS);
 
-  const clickhouseClient = getClickhouseAdminClient();
+  const clickhouseClient = getClickhouseAdminClientForMetrics();
 
   // Session replay aggregates come from Postgres and have nothing to do with
   // ClickHouse availability. Run them in parallel with the ClickHouse queries
@@ -1064,6 +1093,37 @@ async function loadAnalyticsOverview(tenancy: Tenancy, now: Date, includeAnonymo
   } | null = null;
 
   try {
+    // BAND-AID for Sentry STACK-BACKEND-16H. The inner subquery used to scan
+    // ALL token-refresh history (only an upper bound `event_at < untilExclusive`),
+    // which materialized one row per ever-seen user in the GROUP BY hash table.
+    // On tenants with months of $token-refresh data this was the top memory
+    // consumer at /internal/metrics, and three of the queries below pay this
+    // cost. Adding the lower `event_at >= since` bound shrinks the hash table
+    // to "users with a token-refresh in the last 30 days", which is the same
+    // population the outer page-view/click query is already restricted to.
+    //
+    // Edge case: an anonymous page-view by a user who has not token-refreshed
+    // in the last 30 days will now fall through to `coalesce(..., 0)` and be
+    // silently classified non-anonymous. Token-refresh fires every few minutes
+    // per active session, so this is rare — but it is not impossible (e.g.
+    // embedded SDKs that poll less frequently, sessions that straddle the
+    // 30-day boundary, or anonymous users on cold tenancies). The PR shipping
+    // this fix quantifies the expected drift; option C (below) removes the
+    // edge case entirely.
+    //
+    // PERMANENT FIX (option C — see PR description and
+    // /tmp/internal-metrics-oom-analysis.html for the full write-up):
+    //   1. Stamp `is_anonymous` onto `$page-view`/`$click` events at insert
+    //      time in apps/backend/src/app/api/latest/analytics/events/batch/route.tsx
+    //      (the events/batch handler already has `auth.user` in scope).
+    //   2. Backfill historical page-view/click events via partition-by-partition
+    //      INSERT SELECT with an ASOF LEFT JOIN to each event's preceding
+    //      $token-refresh, producing event-at-time semantics.
+    //   3. Delete this LEFT JOIN entirely; the `coalesce` in the filter below
+    //      short-circuits on the first non-null argument so the join's
+    //      `latest_is_anonymous` becomes dead code.
+    // Benchmarked memory profiles for each option are in
+    // apps/backend/scripts/benchmark-internal-metrics.ts (BENCH_BACKFILL_COMPARE=1).
     const analyticsUserJoin = `
       LEFT JOIN (
         SELECT
@@ -1074,6 +1134,7 @@ async function loadAnalyticsOverview(tenancy: Tenancy, now: Date, includeAnonymo
           AND project_id = {projectId:String}
           AND branch_id = {branchId:String}
           AND user_id IS NOT NULL
+          AND event_at >= {since:DateTime}
           AND event_at < {untilExclusive:DateTime}
         GROUP BY user_id
       ) AS token_refresh_users
@@ -1476,7 +1537,7 @@ export const GET = createSmartRouteHandler({
     ] = await Promise.all([
       loadTotalUsers(req.auth.tenancy, now, includeAnonymous),
       loadDailyActiveUsers(req.auth.tenancy, now, includeAnonymous),
-      loadUsersByCountry(req.auth.tenancy, includeAnonymous),
+      loadUsersByCountry(req.auth.tenancy, now, includeAnonymous),
       loadActiveUsersByCountry(req.auth.tenancy, now, includeAnonymous),
       loadLiveUsersCount(req.auth.tenancy, now, includeAnonymous),
       usersCrudHandlers.adminList({
diff --git a/apps/backend/src/lib/clickhouse.tsx b/apps/backend/src/lib/clickhouse.tsx
index 8ad015fcf9..2e27799581 100644
--- a/apps/backend/src/lib/clickhouse.tsx
+++ b/apps/backend/src/lib/clickhouse.tsx
@@ -1,4 +1,4 @@
-import { createClient, type ClickHouseClient } from "@clickhouse/client";
+import { createClient, type ClickHouseClient, type ClickHouseSettings } from "@clickhouse/client";
 import { getEnvVariable } from "@stackframe/stack-shared/dist/utils/env";
 import { StackAssertionError } from "@stackframe/stack-shared/dist/utils/errors";
 
@@ -9,7 +9,11 @@ function getAdminAuth() {
   };
 }
 
-export function createClickhouseClient(authType: "admin" | "external", database?: string) {
+export function createClickhouseClient(
+  authType: "admin" | "external",
+  database?: string,
+  clickhouse_settings?: ClickHouseSettings,
+) {
   return createClient({
     url: getEnvVariable("STACK_CLICKHOUSE_URL"),
     ...authType === "admin" ? getAdminAuth() : {
@@ -18,6 +22,7 @@ export function createClickhouseClient(authType: "admin" | "external", database?
     },
     database,
     request_timeout: 10 * 60 * 1000, // 10 minutes
+    clickhouse_settings,
   });
 }
 
@@ -29,6 +34,65 @@ export function getClickhouseExternalClient() {
   return createClickhouseClient("external", getEnvVariable("STACK_CLICKHOUSE_DATABASE", "default"));
 }
 
+// Safety net against the ClickHouse OvercommitTracker killing heavy analytical
+// reads (Sentry STACK-BACKEND-16H). Use this client for any handler that fans
+// out multiple `GROUP BY user_id`-style queries against `analytics_internal.events`.
+// Values are decimal bytes (ClickHouse's native interpretation of digit strings);
+// 1 GB = 1_000_000_000.
+//
+//   max_bytes_before_external_group_by (6 GB ≈ 5.59 GiB)
+//     GROUP BY hash tables spill to disk when they exceed this size instead of
+//     growing without bound. Targets the unbounded `GROUP BY user_id` patterns
+//     (analyticsUserJoin, loadUsersByCountry, the splits) which were
+//     materializing one row per ever-seen user.
+//
+//   max_memory_usage (8 GB ≈ 7.45 GiB)
+//     Hard per-query ceiling.
+//
+//   max_memory_usage_for_user (9 GB ≈ 8.38 GiB)
+//     Per-user aggregate across concurrent queries — the bound that actually
+//     protects against the OvercommitTracker. /internal/metrics fans out to
+//     ~12 ClickHouse queries via nested Promise.all; a per-query cap alone
+//     (12 × 8 GB = 96 GB theoretical) doesn't prevent the cluster-wide kill.
+//     This per-user cap forces the fan-out to share a single 9 GB budget
+//     against the cluster's 10.8 GiB ceiling. The trade vs no-cap: a single
+//     query above 9 GB now fails with a clean "memory limit exceeded" error
+//     rather than triggering an OvercommitTracker kill of a random concurrent
+//     query.
+//
+//   join_algorithm = 'grace_hash,parallel_hash,hash'
+//     ClickHouse picks the first applicable algorithm from this list. For our
+//     analyticsUserJoin shape (LEFT JOIN with a GROUP BY subquery as the
+//     build side), grace_hash partitions the build side instead of building
+//     one giant hash table — measured ~48% memory reduction vs the default
+//     parallel_hash, at zero latency cost (benchmark: BENCH_JOIN_ALGO_COMPARE=1
+//     in scripts/benchmark-internal-metrics.ts). parallel_hash and hash are
+//     fallbacks for join shapes grace_hash doesn't support; ClickHouse falls
+//     back automatically.
+//
+// These are belt-and-suspenders. The actual fix for the OOM is bounding the
+// `event_at` scans (option A) and eventually backfilling `is_anonymous` onto
+// page-view/click events (option C) — see /internal/metrics/route.tsx for the
+// long-form discussion.
+export const METRICS_CLICKHOUSE_SETTINGS: ClickHouseSettings = {
+  max_bytes_before_external_group_by: "6000000000",
+  max_memory_usage: "8000000000",
+  max_memory_usage_for_user: "9000000000",
+  // The @clickhouse/client-common JoinAlgorithm type is a union of single
+  // algorithm names, but the ClickHouse server accepts a comma-separated
+  // fallback list (tries each in order). The cast widens the SDK type so we
+  // can use the fallback form, which is what we actually want for safety.
+  join_algorithm: "grace_hash,parallel_hash,hash" as ClickHouseSettings["join_algorithm"],
+};
+
+export function getClickhouseAdminClientForMetrics() {
+  return createClickhouseClient(
+    "admin",
+    getEnvVariable("STACK_CLICKHOUSE_DATABASE", "default"),
+    METRICS_CLICKHOUSE_SETTINGS,
+  );
+}
+
 export const getQueryTimingStats = async (client: ClickHouseClient, queryId: string) => {
   // Flush logs to ensure system.query_log has latest query result.
   // Todo: for performance we should instead poll for this row to become available asynchronously after returning result. Flushed every 7.5 seconds by default

From adf8f32510a8e2768cf9a4b7ae941ad32306dff3 Mon Sep 17 00:00:00 2001
From: Bilal Godil <bg2002@gmail.com>
Date: Wed, 20 May 2026 15:28:07 -0700
Subject: [PATCH 2/3] Trim verbose comments added in the previous commit

---
 .../scripts/benchmark-internal-metrics.ts     | 92 ++++---------------
 .../app/api/latest/internal/metrics/route.tsx | 65 +++----------
 apps/backend/src/lib/clickhouse.tsx           | 51 ++--------
 3 files changed, 37 insertions(+), 171 deletions(-)

diff --git a/apps/backend/scripts/benchmark-internal-metrics.ts b/apps/backend/scripts/benchmark-internal-metrics.ts
index 05c23e2b98..9589f4e49f 100644
--- a/apps/backend/scripts/benchmark-internal-metrics.ts
+++ b/apps/backend/scripts/benchmark-internal-metrics.ts
@@ -377,11 +377,7 @@ const ANALYTICS_USER_JOIN = `
 `;
 const NON_ANON_FILTER = "({includeAnonymous:UInt8} = 1 OR coalesce(JSONExtract(toJSONString(e.data), 'is_anonymous', 'Nullable(UInt8)'), token_refresh_users.latest_is_anonymous, 0) = 0)";
 
-// Post-fix state of the analyticsUserJoin: lower `event_at >= since` bound
-// added to the inner subquery's WHERE clause (the option A patch shipped for
-// Sentry STACK-BACKEND-16H). Without this bound the inner GROUP BY hash
-// table held one row per ever-seen user — see route.tsx for the full
-// explanation and links to option C as the permanent fix.
+// Post-fix: lower `event_at >= since` bound added to the inner subquery.
 const ANALYTICS_USER_JOIN_AFTER = `
   LEFT JOIN (
     SELECT
@@ -681,11 +677,9 @@ function splitSqlAfter(idCol: "user_id" | "team_id", withAnonFilter: boolean): s
 }
 
 const ROUTE_QUERIES_AFTER: RouteQuery[] = [
-  // Option A fix: add the lower `event_at >= since` bound so the outer
-  // GROUP BY user_id only sees users active in the 30-day window.
   {
     name: "loadUsersByCountry",
-    desc: "A: 30-day window added (was unbounded scan of all $token-refresh history)",
+    desc: "30-day window added (was unbounded scan)",
     sql: `
       SELECT
         country_code,
@@ -1076,33 +1070,13 @@ const ROUTE_QUERIES_OPTIMIZED: RouteQuery[] = [
   },
 ];
 
-// ── Backfill comparison (option A / B / C / D / E) ───────────────────────────
-//
-// Each backfill option (A-E from the analysis doc) leaves the metrics queries
-// in one of three structurally-distinct shapes. We benchmark the three shapes
-// against the same seeded dataset.
-//
-//   A         → bounded LEFT JOIN (event_at >= since on the inner subquery).
-//                Still has the join, but the GROUP BY hash table only contains
-//                users with a token-refresh in the last 30d.
-//   B / C / E → drop the join, classify from e.data.is_anonymous (JSON access).
-//                B = argMax-latest semantics (matches today).
-//                C = ASOF event-at-time semantics.
-//                E = same as B but via partition swap.
-//                All three produce IDENTICAL post-backfill query SQL — the
-//                only differences are in *how the data got there*, not how
-//                the metrics query reads it.
-//   D         → drop the join, classify from a top-level Nullable(UInt8) column.
-//                Same shape as B/C/E but skips the per-row JSON parse.
-//
-// To bench option D we need a real top-level column. We add it under a unique
-// name (BENCH_OPTION_D_COLUMN) before seeding and drop it on cleanup.
-
-// Scoped to RUN_ID so concurrent bench runs don't collide on the shared
-// analytics_internal.events schema, and so a SIGKILL'd run that skips
-// cleanup leaks a uniquely-named column instead of clobbering a future run's
-// `bench_is_anon_d`. Cleanup still drops via IF EXISTS, and the bench's row
-// filter (project_id = BENCH_PROJECT_ID) keeps mutation cost trivial.
+// ── Backfill comparison (BENCH_BACKFILL_COMPARE=1) ──────────────────────────
+// Compares post-backfill query shapes: A (bounded join, no backfill), B/C/E
+// (drop join, classify from data.is_anonymous), D (drop join, classify from
+// a top-level column). B/C/E produce identical query SQL; they differ only in
+// how the data gets stamped.
+
+// Scoped to RUN_ID so concurrent runs / SIGKILL-leaked columns don't collide.
 const BENCH_OPTION_D_COLUMN = `bench_is_anon_d_${RUN_ID.replace(/-/g, "_")}`;
 
 const analyticsUserJoinBounded = `
@@ -1455,9 +1429,8 @@ async function benchmarkBackfillCompare(now: Date): Promise<void> {
 }
 
 async function runRouteQuery(rq: RouteQuery, p: QueryParams, now: Date, opts: { useMetricsClient?: boolean } = {}): Promise<string> {
-  // `useMetricsClient` runs through getClickhouseAdminClientForMetrics, which
-  // applies the connection-level SETTINGS (caps + grace_hash) — so AFTER
-  // measurements reflect what actually ships in route.tsx, not the raw SQL.
+  // `useMetricsClient` applies the route.tsx connection-level SETTINGS so AFTER
+  // measurements reflect what actually ships, not the raw SQL.
   const client = opts.useMetricsClient ? getClickhouseAdminClientForMetrics() : getClickhouseAdminClient();
   const queryId = `bench-route-${rq.name.replace(/[^a-z0-9]/gi, "-")}-${randomUUID()}`;
   const baseParams: Record<string, unknown> = {
@@ -1479,28 +1452,9 @@ async function runRouteQuery(rq: RouteQuery, p: QueryParams, now: Date, opts: {
 }
 
 // ── Join algorithm comparison (BENCH_JOIN_ALGO_COMPARE=1) ─────────────────────
-//
-// For each of ClickHouse's 6 join algorithms, run the 3 analyticsOverview
-// queries under two cases:
-//
-//   normal  = bounded analyticsUserJoin (the SQL we just shipped). Small build
-//             side; isolated from cumulative-history scan.
-//   heavy   = UNBOUNDED analyticsUserJoin (pre-fix). Large build side; the
-//             pattern that caused Sentry STACK-BACKEND-16H.
-//
-// Algorithms:
-//   default            - leave the cluster default (typically `direct,parallel_hash,hash`)
-//   direct             - KV lookup on right side; only works if right is a Dictionary
-//                        (it isn't here, so we expect this to fall back or error)
-//   hash               - classic single-threaded hash join
-//   parallel_hash      - parallel build, fastest on small/medium, most memory
-//   grace_hash         - partitioned hash, spills to disk
-//   full_sorting_merge - sorts both sides; can beat hash when input is huge
-//   partial_merge      - sorts only the right side; lowest memory, slowest
-//
-// Some algorithms will error or be silently rejected for shapes they can't
-// handle (e.g. `direct` requires a Dictionary). Errors are caught and shown
-// as `ERR` in the output table.
+// Runs each of ClickHouse's 6 join_algorithm values against the bounded and
+// unbounded analyticsUserJoin shapes. Algorithms that don't apply (e.g.
+// `direct` without a Dictionary right side) error and show as ERR.
 
 const ANALYTICS_OVERVIEW_QUERY_NAMES = [
   "analyticsOverview:dailyEvents",
@@ -1799,11 +1753,8 @@ async function benchmarkRouteQueries(now: Date): Promise<void> {
     return out;
   }
 
-  // Also capture the actual row payload so we can check correctness for OPT
-  // variants (e.g., dropping the LEFT JOIN on analyticsOverview must not change counts).
-  // `useMetricsClient` routes through getClickhouseAdminClientForMetrics so the
-  // connection-level SETTINGS (caps + grace_hash) apply — used for AFTER to
-  // mirror what actually ships.
+  // Also capture row payloads to spot-check OPT variants (e.g. dropping the
+  // LEFT JOIN must not change counts).
   async function runAndCollect(list: RouteQuery[], opts: { useMetricsClient?: boolean } = {}): Promise<{ stats: Map<string, QueryStats>, payloads: Map<string, unknown[]> }> {
     const stats = new Map<string, QueryStats>();
     const payloads = new Map<string, unknown[]>();
@@ -1999,9 +1950,7 @@ type QueryStats = {
 async function readStats(queryId: string): Promise<QueryStats> {
   const client = getClickhouseAdminClient();
   await client.command({ query: "SYSTEM FLUSH LOGS" });
-  // Total budget ~12.7s. With many heavy queries running in sequence the
-  // query_log async flush can take longer than the original 3.1s budget, which
-  // was producing spurious "no query_log row" failures at the 300k-user scale.
+  // ~12.7s budget; the async query_log flush can lag at 300k-user scale.
   const delays = [100, 200, 400, 800, 1600, 3200, 6400];
   for (let i = 0; i <= delays.length; i++) {
     const res = await client.query({
@@ -2336,11 +2285,8 @@ async function seedPerf(now: Date): Promise<void> {
   );
 
   const batchRows = envInt("BENCH_BATCH", 50_000);
-  // BENCH_HISTORICAL_DAYS extends the event seed back beyond the 30-day metrics
-  // window so unbounded-scan queries (loadUsersByCountry, the LEFT JOIN in the
-  // splits) read more data than windowed queries — mirroring the prod skew that
-  // caused Sentry STACK-BACKEND-16H. Default 365 days. Set to 30 to recover the
-  // original behavior.
+  // BENCH_HISTORICAL_DAYS extends the seed beyond the 30-day metrics window so
+  // unbounded-scan queries read more rows than windowed ones (default 365).
   const historicalDays = envInt("BENCH_HISTORICAL_DAYS", 365);
   const windowEnd = new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate()) + ONE_DAY_MS);
   const windowStart = new Date(windowEnd.getTime() - historicalDays * 24 * 60 * 60 * 1000);
diff --git a/apps/backend/src/app/api/latest/internal/metrics/route.tsx b/apps/backend/src/app/api/latest/internal/metrics/route.tsx
index f0d17ce170..959a60aad7 100644
--- a/apps/backend/src/app/api/latest/internal/metrics/route.tsx
+++ b/apps/backend/src/app/api/latest/internal/metrics/route.tsx
@@ -30,9 +30,6 @@ const MAX_USERS_FOR_COUNTRY_SAMPLE = 10_000;
 const METRICS_WINDOW_DAYS = 30;
 const METRICS_WINDOW_MS = METRICS_WINDOW_DAYS * 24 * 60 * 60 * 1000;
 const ONE_DAY_MS = 24 * 60 * 60 * 1000;
-// All ClickHouse queries in this file run through `getClickhouseAdminClientForMetrics`,
-// which configures memory caps + GROUP BY spill defaults at the connection level —
-// see lib/clickhouse.tsx for the rationale.
 export const METRICS_REVENUE_INVOICE_STATUSES = ["paid", "succeeded"] as const;
 const METRICS_REVENUE_INVOICE_STATUSES_SQL = Prisma.raw(
   METRICS_REVENUE_INVOICE_STATUSES.map((status) => `'${status}'`).join(", "),
@@ -74,14 +71,8 @@ function normalizeUuidFromEvent(value: string): string | null {
 const MAU_UUID_V4_REGEX = UUID_V4_JS_RE.source;
 
 async function loadUsersByCountry(tenancy: Tenancy, now: Date, includeAnonymous: boolean = false): Promise<Record<string, number>> {
-  // Bound the scan to the 30-day metrics window. Before this lower bound,
-  // the inner `GROUP BY user_id` materialized one row per ever-seen user for
-  // the tenant — see Sentry STACK-BACKEND-16H, where this query was a top
-  // memory consumer on tenants with many months of $token-refresh history.
-  // Users without a token-refresh in the last 30 days are no longer counted;
-  // token-refresh fires every few minutes for each open session, so this only
-  // omits users with no active sessions in the last 30 days, which is the
-  // intent of a "metrics window" anyway.
+  // Without the 30-day bound the inner GROUP BY materializes one row per
+  // ever-seen user for the tenant.
   const { since, untilExclusive } = getMetricsWindowBounds(now);
   const clickhouseClient = getClickhouseAdminClientForMetrics();
   const res = await clickhouseClient.query({
@@ -434,18 +425,9 @@ async function loadDailyActiveSplitFromClickhouse(options: {
   // against the alias (assumeNotNull returns '' for NULLs, which passes the
   // not-null test) and phantom rows slip through.
   //
-  // The LEFT JOIN's `min(event_at)` subquery below scans all $token-refresh
-  // history per request (no `event_at >= ...` lower bound) — same unbounded
-  // pattern as Sentry STACK-BACKEND-16H, and was ranked #5/#6 by peak memory
-  // in the benchmark. It can't be bounded the same way as the other queries
-  // because `first_date` is used to classify entities as new vs reactivated:
-  // bounding the subquery to 30 days would cause entities first seen >30d ago
-  // and active today to be (incorrectly) reported as "new". The proper fix
-  // requires either materializing a per-entity "first-seen-ever" table or
-  // shipping the option C backfill so we can read first_date from the event
-  // stream cheaply. For now the per-user memory cap configured on
-  // getClickhouseAdminClientForMetrics (see lib/clickhouse.tsx) spills the
-  // GROUP BY hash table and caps the query if it runs out of memory.
+  // The LEFT JOIN's `min(event_at)` subquery below is intentionally unbounded:
+  // bounding it would reclassify entities first seen >30d ago and active today
+  // as "new" instead of "reactivated".
   const result = await clickhouseClient.query({
     query: `
       SELECT
@@ -1093,37 +1075,12 @@ async function loadAnalyticsOverview(tenancy: Tenancy, now: Date, includeAnonymo
   } | null = null;
 
   try {
-    // BAND-AID for Sentry STACK-BACKEND-16H. The inner subquery used to scan
-    // ALL token-refresh history (only an upper bound `event_at < untilExclusive`),
-    // which materialized one row per ever-seen user in the GROUP BY hash table.
-    // On tenants with months of $token-refresh data this was the top memory
-    // consumer at /internal/metrics, and three of the queries below pay this
-    // cost. Adding the lower `event_at >= since` bound shrinks the hash table
-    // to "users with a token-refresh in the last 30 days", which is the same
-    // population the outer page-view/click query is already restricted to.
-    //
-    // Edge case: an anonymous page-view by a user who has not token-refreshed
-    // in the last 30 days will now fall through to `coalesce(..., 0)` and be
-    // silently classified non-anonymous. Token-refresh fires every few minutes
-    // per active session, so this is rare — but it is not impossible (e.g.
-    // embedded SDKs that poll less frequently, sessions that straddle the
-    // 30-day boundary, or anonymous users on cold tenancies). The PR shipping
-    // this fix quantifies the expected drift; option C (below) removes the
-    // edge case entirely.
-    //
-    // PERMANENT FIX (option C — see PR description and
-    // /tmp/internal-metrics-oom-analysis.html for the full write-up):
-    //   1. Stamp `is_anonymous` onto `$page-view`/`$click` events at insert
-    //      time in apps/backend/src/app/api/latest/analytics/events/batch/route.tsx
-    //      (the events/batch handler already has `auth.user` in scope).
-    //   2. Backfill historical page-view/click events via partition-by-partition
-    //      INSERT SELECT with an ASOF LEFT JOIN to each event's preceding
-    //      $token-refresh, producing event-at-time semantics.
-    //   3. Delete this LEFT JOIN entirely; the `coalesce` in the filter below
-    //      short-circuits on the first non-null argument so the join's
-    //      `latest_is_anonymous` becomes dead code.
-    // Benchmarked memory profiles for each option are in
-    // apps/backend/scripts/benchmark-internal-metrics.ts (BENCH_BACKFILL_COMPARE=1).
+    // The `event_at >= since` bound on the inner subquery is load-bearing:
+    // without it the GROUP BY hash table holds one row per ever-seen user.
+    // Edge case: anonymous page-views by users with no token-refresh in the
+    // last 30 days now coalesce to non-anonymous. The proper fix is to stamp
+    // `is_anonymous` on page-view/click events at ingest and drop this join
+    // entirely (the coalesce below short-circuits on the first non-null arg).
     const analyticsUserJoin = `
       LEFT JOIN (
         SELECT
diff --git a/apps/backend/src/lib/clickhouse.tsx b/apps/backend/src/lib/clickhouse.tsx
index 2e27799581..03297fd99e 100644
--- a/apps/backend/src/lib/clickhouse.tsx
+++ b/apps/backend/src/lib/clickhouse.tsx
@@ -34,54 +34,17 @@ export function getClickhouseExternalClient() {
   return createClickhouseClient("external", getEnvVariable("STACK_CLICKHOUSE_DATABASE", "default"));
 }
 
-// Safety net against the ClickHouse OvercommitTracker killing heavy analytical
-// reads (Sentry STACK-BACKEND-16H). Use this client for any handler that fans
-// out multiple `GROUP BY user_id`-style queries against `analytics_internal.events`.
-// Values are decimal bytes (ClickHouse's native interpretation of digit strings);
-// 1 GB = 1_000_000_000.
-//
-//   max_bytes_before_external_group_by (6 GB ≈ 5.59 GiB)
-//     GROUP BY hash tables spill to disk when they exceed this size instead of
-//     growing without bound. Targets the unbounded `GROUP BY user_id` patterns
-//     (analyticsUserJoin, loadUsersByCountry, the splits) which were
-//     materializing one row per ever-seen user.
-//
-//   max_memory_usage (8 GB ≈ 7.45 GiB)
-//     Hard per-query ceiling.
-//
-//   max_memory_usage_for_user (9 GB ≈ 8.38 GiB)
-//     Per-user aggregate across concurrent queries — the bound that actually
-//     protects against the OvercommitTracker. /internal/metrics fans out to
-//     ~12 ClickHouse queries via nested Promise.all; a per-query cap alone
-//     (12 × 8 GB = 96 GB theoretical) doesn't prevent the cluster-wide kill.
-//     This per-user cap forces the fan-out to share a single 9 GB budget
-//     against the cluster's 10.8 GiB ceiling. The trade vs no-cap: a single
-//     query above 9 GB now fails with a clean "memory limit exceeded" error
-//     rather than triggering an OvercommitTracker kill of a random concurrent
-//     query.
-//
-//   join_algorithm = 'grace_hash,parallel_hash,hash'
-//     ClickHouse picks the first applicable algorithm from this list. For our
-//     analyticsUserJoin shape (LEFT JOIN with a GROUP BY subquery as the
-//     build side), grace_hash partitions the build side instead of building
-//     one giant hash table — measured ~48% memory reduction vs the default
-//     parallel_hash, at zero latency cost (benchmark: BENCH_JOIN_ALGO_COMPARE=1
-//     in scripts/benchmark-internal-metrics.ts). parallel_hash and hash are
-//     fallbacks for join shapes grace_hash doesn't support; ClickHouse falls
-//     back automatically.
-//
-// These are belt-and-suspenders. The actual fix for the OOM is bounding the
-// `event_at` scans (option A) and eventually backfilling `is_anonymous` onto
-// page-view/click events (option C) — see /internal/metrics/route.tsx for the
-// long-form discussion.
+// Safety net for heavy analytical reads against `analytics_internal.events`:
+// per-user memory cap bounds the whole `Promise.all` fan-out (~12 queries) to
+// one shared budget against the cluster's 10.8 GiB OvercommitTracker, GROUP BY
+// spills to disk before hitting the cap, and grace_hash partitions large join
+// build sides instead of allocating one giant hash table. Values are decimal
+// bytes (how ClickHouse parses digit strings).
 export const METRICS_CLICKHOUSE_SETTINGS: ClickHouseSettings = {
   max_bytes_before_external_group_by: "6000000000",
   max_memory_usage: "8000000000",
   max_memory_usage_for_user: "9000000000",
-  // The @clickhouse/client-common JoinAlgorithm type is a union of single
-  // algorithm names, but the ClickHouse server accepts a comma-separated
-  // fallback list (tries each in order). The cast widens the SDK type so we
-  // can use the fallback form, which is what we actually want for safety.
+  // SDK type narrows to a single algorithm; the server accepts a fallback list.
   join_algorithm: "grace_hash,parallel_hash,hash" as ClickHouseSettings["join_algorithm"],
 };
 

From a24be2060296d811106e5dc1675e1c594784ce2e Mon Sep 17 00:00:00 2001
From: Bilal Godil <bg2002@gmail.com>
Date: Wed, 20 May 2026 15:38:21 -0700
Subject: [PATCH 3/3] Address PR review: 50% spill threshold, log on tryRun
 catch, document user-level cap

- Drop max_bytes_before_external_group_by from 6 GB to 4 GB to match
  ClickHouse's 50%-of-max_memory_usage recommendation, leaving headroom
  for the post-spill merge phase.
- Replace the bare catch in tryRunAndReadStats with one that logs the
  error (AGENTS.md: NEVER try-catch-all).
- Clarify the METRICS_CLICKHOUSE_SETTINGS comment: max_memory_usage_for_user
  is enforced per ClickHouse user, so the budget is shared with all other
  admin queries through the same connecting user.
---
 .../scripts/benchmark-internal-metrics.ts       |  8 +++++---
 apps/backend/src/lib/clickhouse.tsx             | 17 +++++++++++------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/apps/backend/scripts/benchmark-internal-metrics.ts b/apps/backend/scripts/benchmark-internal-metrics.ts
index 9589f4e49f..d0d6338e84 100644
--- a/apps/backend/scripts/benchmark-internal-metrics.ts
+++ b/apps/backend/scripts/benchmark-internal-metrics.ts
@@ -1567,13 +1567,15 @@ function buildAnalyticsOverviewVariant(opts: {
   ];
 }
 
-// Run a query and either return its QueryStats, or null if the query errored
-// (e.g. an unsupported `join_algorithm` for the query's shape).
+// Run a query; on failure log the error and return null so the join-algo
+// matrix can show ERR for shapes a given algorithm doesn't support (e.g.
+// `direct` without a Dictionary right side).
 async function tryRunAndReadStats(rq: RouteQuery, p: QueryParams, now: Date): Promise<QueryStats | null> {
   try {
     const qid = await runRouteQuery(rq, p, now);
     return await readStats(qid);
-  } catch {
+  } catch (e) {
+    console.warn(`  [bench] query "${rq.name}" failed: ${e instanceof Error ? e.message : String(e)}`);
     return null;
   }
 }
diff --git a/apps/backend/src/lib/clickhouse.tsx b/apps/backend/src/lib/clickhouse.tsx
index 03297fd99e..19ff78aeb4 100644
--- a/apps/backend/src/lib/clickhouse.tsx
+++ b/apps/backend/src/lib/clickhouse.tsx
@@ -35,13 +35,18 @@ export function getClickhouseExternalClient() {
 }
 
 // Safety net for heavy analytical reads against `analytics_internal.events`:
-// per-user memory cap bounds the whole `Promise.all` fan-out (~12 queries) to
-// one shared budget against the cluster's 10.8 GiB OvercommitTracker, GROUP BY
-// spills to disk before hitting the cap, and grace_hash partitions large join
-// build sides instead of allocating one giant hash table. Values are decimal
-// bytes (how ClickHouse parses digit strings).
+// GROUP BY spills to disk at ~50% of the per-query cap (leaving headroom for
+// the post-spill merge), grace_hash partitions large join build sides instead
+// of allocating one giant hash table, and the per-user cap bounds total
+// concurrent memory against the cluster's 10.8 GiB OvercommitTracker. Values
+// are decimal bytes (how ClickHouse parses digit strings).
+//
+// Note: max_memory_usage_for_user is enforced ClickHouse-side per *connecting
+// user* (the shared `stackframe` admin), so all admin queries — not just this
+// client's — count toward the same 9 GB budget. With the 30-day bounds each
+// metrics query peaks well under 100 MiB, so practical interference is low.
 export const METRICS_CLICKHOUSE_SETTINGS: ClickHouseSettings = {
-  max_bytes_before_external_group_by: "6000000000",
+  max_bytes_before_external_group_by: "4000000000",
   max_memory_usage: "8000000000",
   max_memory_usage_for_user: "9000000000",
   // SDK type narrows to a single algorithm; the server accepts a fallback list.