From 9f70d0df0be77d5ab387cd1bc26dd838ff6fba3a Mon Sep 17 00:00:00 2001 From: Bilal Godil Date: Wed, 20 May 2026 15:19:12 -0700 Subject: [PATCH 1/3] Fix /internal/metrics ClickHouse OOM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sentry STACK-BACKEND-16H — the endpoint was triggering the cluster's 10.8 GiB OvercommitTracker kill on tenants with months of $token-refresh history. Three queries did GROUP BY user_id over the events table without a lower event_at bound, so their working set scaled with cumulative-users- ever-seen instead of with the 30-day metrics window. Changes: - Add 30-day event_at lower bound to loadUsersByCountry and to the analyticsUserJoin inner subquery (used by dailyEvents, totalVisitors, topReferrers). - New getClickhouseAdminClientForMetrics factory in lib/clickhouse.tsx applying memory caps (max_memory_usage, max_memory_usage_for_user) + external GROUP BY spill + join_algorithm fallback list with grace_hash first. - Document loadDailyActiveSplitFromClickhouse's unbounded min(event_at) subquery + the long-term fix (option C: stamp is_anonymous at ingest + ASOF backfill). - Extend the benchmark script with BENCH_HISTORICAL_DAYS, BENCH_BACKFILL_COMPARE, and BENCH_JOIN_ALGO_COMPARE modes used to validate the choices above. Measured at 300k users seeded across 365 days: Sum peak memory: 2.18 GiB -> 515 MiB (4.3x less) Max query duration: 1293 ms -> 101 ms (12.8x faster) --- .../scripts/benchmark-internal-metrics.ts | 770 +++++++++++++++++- .../app/api/latest/internal/metrics/route.tsx | 83 +- apps/backend/src/lib/clickhouse.tsx | 68 +- 3 files changed, 894 insertions(+), 27 deletions(-) diff --git a/apps/backend/scripts/benchmark-internal-metrics.ts b/apps/backend/scripts/benchmark-internal-metrics.ts index 605a988305..05c23e2b98 100644 --- a/apps/backend/scripts/benchmark-internal-metrics.ts +++ b/apps/backend/scripts/benchmark-internal-metrics.ts @@ -48,7 +48,7 @@ * BENCH_TEAM_RATIO (default 0.3) – fraction of users with a team */ -import { getClickhouseAdminClient } from "@/lib/clickhouse"; +import { getClickhouseAdminClient, getClickhouseAdminClientForMetrics } from "@/lib/clickhouse"; import { getEnvVariable } from "@stackframe/stack-shared/dist/utils/env"; import { randomUUID } from "node:crypto"; @@ -377,7 +377,11 @@ const ANALYTICS_USER_JOIN = ` `; const NON_ANON_FILTER = "({includeAnonymous:UInt8} = 1 OR coalesce(JSONExtract(toJSONString(e.data), 'is_anonymous', 'Nullable(UInt8)'), token_refresh_users.latest_is_anonymous, 0) = 0)"; -// Same joins/filters after fix 1 (direct CAST instead of JSONExtract(toJSONString(...))) +// Post-fix state of the analyticsUserJoin: lower `event_at >= since` bound +// added to the inner subquery's WHERE clause (the option A patch shipped for +// Sentry STACK-BACKEND-16H). Without this bound the inner GROUP BY hash +// table held one row per ever-seen user — see route.tsx for the full +// explanation and links to option C as the permanent fix. const ANALYTICS_USER_JOIN_AFTER = ` LEFT JOIN ( SELECT @@ -388,6 +392,7 @@ const ANALYTICS_USER_JOIN_AFTER = ` AND project_id = {projectId:String} AND branch_id = {branchId:String} AND user_id IS NOT NULL + AND event_at >= {since:DateTime} AND event_at < {untilExclusive:DateTime} GROUP BY user_id ) AS token_refresh_users @@ -398,7 +403,7 @@ const NON_ANON_FILTER_AFTER = "({includeAnonymous:UInt8} = 1 OR coalesce(CAST(e. const ROUTE_QUERIES_BEFORE: RouteQuery[] = [ { name: "loadUsersByCountry", - desc: "argMax country per user over all $token-refresh events (no window)", + desc: "argMax country per user over all $token-refresh events (NO time window)", sql: ` SELECT country_code, @@ -412,7 +417,7 @@ const ROUTE_QUERIES_BEFORE: RouteQuery[] = [ user_id, event_at, CAST(data.ip_info.country_code, 'Nullable(String)') AS cc, - CAST(data.is_anonymous, 'UInt8') AS is_anonymous + coalesce(CAST(data.is_anonymous, 'Nullable(UInt8)'), 0) AS is_anonymous FROM analytics_internal.events WHERE event_type = '$token-refresh' AND project_id = {projectId:String} @@ -676,8 +681,42 @@ function splitSqlAfter(idCol: "user_id" | "team_id", withAnonFilter: boolean): s } const ROUTE_QUERIES_AFTER: RouteQuery[] = [ - // Unchanged by fix 1/3 (already uses CAST). - ROUTE_QUERIES_BEFORE[0], // loadUsersByCountry + // Option A fix: add the lower `event_at >= since` bound so the outer + // GROUP BY user_id only sees users active in the 30-day window. + { + name: "loadUsersByCountry", + desc: "A: 30-day window added (was unbounded scan of all $token-refresh history)", + sql: ` + SELECT + country_code, + count() AS userCount + FROM ( + SELECT + user_id, + argMax(cc, event_at) AS country_code + FROM ( + SELECT + user_id, + event_at, + CAST(data.ip_info.country_code, 'Nullable(String)') AS cc, + coalesce(CAST(data.is_anonymous, 'Nullable(UInt8)'), 0) AS is_anonymous + FROM analytics_internal.events + WHERE event_type = '$token-refresh' + AND project_id = {projectId:String} + AND branch_id = {branchId:String} + AND user_id IS NOT NULL + AND event_at >= {since:DateTime} + AND event_at < {untilExclusive:DateTime} + ) + WHERE cc IS NOT NULL + AND ({includeAnonymous:UInt8} = 1 OR is_anonymous = 0) + GROUP BY user_id + ) + WHERE country_code IS NOT NULL + GROUP BY country_code + ORDER BY userCount DESC + `, + }, { name: "loadDailyActiveUsers", desc: "DAU per day (fix 1: CAST instead of JSONExtract)", @@ -1037,8 +1076,389 @@ const ROUTE_QUERIES_OPTIMIZED: RouteQuery[] = [ }, ]; -async function runRouteQuery(rq: RouteQuery, p: QueryParams, now: Date): Promise { +// ── Backfill comparison (option A / B / C / D / E) ─────────────────────────── +// +// Each backfill option (A-E from the analysis doc) leaves the metrics queries +// in one of three structurally-distinct shapes. We benchmark the three shapes +// against the same seeded dataset. +// +// A → bounded LEFT JOIN (event_at >= since on the inner subquery). +// Still has the join, but the GROUP BY hash table only contains +// users with a token-refresh in the last 30d. +// B / C / E → drop the join, classify from e.data.is_anonymous (JSON access). +// B = argMax-latest semantics (matches today). +// C = ASOF event-at-time semantics. +// E = same as B but via partition swap. +// All three produce IDENTICAL post-backfill query SQL — the +// only differences are in *how the data got there*, not how +// the metrics query reads it. +// D → drop the join, classify from a top-level Nullable(UInt8) column. +// Same shape as B/C/E but skips the per-row JSON parse. +// +// To bench option D we need a real top-level column. We add it under a unique +// name (BENCH_OPTION_D_COLUMN) before seeding and drop it on cleanup. + +// Scoped to RUN_ID so concurrent bench runs don't collide on the shared +// analytics_internal.events schema, and so a SIGKILL'd run that skips +// cleanup leaks a uniquely-named column instead of clobbering a future run's +// `bench_is_anon_d`. Cleanup still drops via IF EXISTS, and the bench's row +// filter (project_id = BENCH_PROJECT_ID) keeps mutation cost trivial. +const BENCH_OPTION_D_COLUMN = `bench_is_anon_d_${RUN_ID.replace(/-/g, "_")}`; + +const analyticsUserJoinBounded = ` + LEFT JOIN ( + SELECT + user_id, + argMax(coalesce(CAST(data.is_anonymous, 'Nullable(UInt8)'), 0), event_at) AS latest_is_anonymous + FROM analytics_internal.events + WHERE event_type = '$token-refresh' + AND project_id = {projectId:String} + AND branch_id = {branchId:String} + AND user_id IS NOT NULL + AND event_at >= {since:DateTime} -- ★ option A's lower bound + AND event_at < {untilExclusive:DateTime} + GROUP BY user_id + ) AS token_refresh_users + ON e.user_id = token_refresh_users.user_id +`; + +const ROUTE_QUERIES_BACKFILL_A: RouteQuery[] = [ + { + name: "analyticsOverview:dailyEvents", + desc: "A: bounded LEFT JOIN (event_at >= since on inner)", + sql: ` + SELECT + toDate(e.event_at) AS day, + countIf( + e.event_type = '$page-view' AND e.user_id IS NOT NULL + AND ${NON_ANON_FILTER_AFTER} + ) AS pv, + countIf( + e.event_type = '$click' AND e.user_id IS NOT NULL + AND ${NON_ANON_FILTER_AFTER} + ) AS cl, + uniqExactIf( + assumeNotNull(e.user_id), + e.event_type = '$page-view' AND e.user_id IS NOT NULL + AND ${NON_ANON_FILTER_AFTER} + ) AS visitors + FROM analytics_internal.events AS e + ${analyticsUserJoinBounded} + WHERE e.event_type IN ('$page-view', '$click') + AND e.project_id = {projectId:String} + AND e.branch_id = {branchId:String} + AND e.event_at >= {since:DateTime} + AND e.event_at < {untilExclusive:DateTime} + GROUP BY day + ORDER BY day ASC + `, + }, + { + name: "analyticsOverview:totalVisitors", + desc: "A: bounded LEFT JOIN (event_at >= since on inner)", + sql: ` + SELECT + uniqExactIf( + assumeNotNull(e.user_id), + e.user_id IS NOT NULL AND ${NON_ANON_FILTER_AFTER} + ) AS visitors + FROM analytics_internal.events AS e + ${analyticsUserJoinBounded} + WHERE e.event_type = '$page-view' + AND e.project_id = {projectId:String} + AND e.branch_id = {branchId:String} + AND e.user_id IS NOT NULL + AND e.event_at >= {since:DateTime} + AND e.event_at < {untilExclusive:DateTime} + `, + }, + { + name: "analyticsOverview:topReferrers", + desc: "A: bounded LEFT JOIN (event_at >= since on inner)", + sql: ` + SELECT + nullIf(CAST(e.data.referrer, 'String'), '') AS referrer, + uniqExactIf( + assumeNotNull(e.user_id), + e.user_id IS NOT NULL AND ${NON_ANON_FILTER_AFTER} + ) AS visitors + FROM analytics_internal.events AS e + ${analyticsUserJoinBounded} + WHERE e.event_type = '$page-view' + AND e.project_id = {projectId:String} + AND e.branch_id = {branchId:String} + AND e.event_at >= {since:DateTime} + AND e.event_at < {untilExclusive:DateTime} + GROUP BY referrer + HAVING visitors > 0 + ORDER BY visitors DESC + LIMIT 100 + `, + }, +]; + +// Options B/C/E all collapse to the same post-backfill SQL shape: drop the +// LEFT JOIN entirely and trust e.data.is_anonymous (the field that the +// ingestion fix will populate). The OPTIMIZED array already contains exactly +// these queries — reuse them so we have a single source of truth. +const ROUTE_QUERIES_BACKFILL_BCE: RouteQuery[] = ROUTE_QUERIES_OPTIMIZED.filter((q) => + q.name === "analyticsOverview:dailyEvents" + || q.name === "analyticsOverview:totalVisitors" + || q.name === "analyticsOverview:topReferrers", +); + +const ROUTE_QUERIES_BACKFILL_D: RouteQuery[] = [ + { + name: "analyticsOverview:dailyEvents", + desc: "D: drop join, top-level UInt8 column (no JSON parse)", + sql: ` + SELECT + toDate(e.event_at) AS day, + countIf( + e.event_type = '$page-view' AND e.user_id IS NOT NULL + AND ({includeAnonymous:UInt8} = 1 OR coalesce(e.${BENCH_OPTION_D_COLUMN}, 0) = 0) + ) AS pv, + countIf( + e.event_type = '$click' AND e.user_id IS NOT NULL + AND ({includeAnonymous:UInt8} = 1 OR coalesce(e.${BENCH_OPTION_D_COLUMN}, 0) = 0) + ) AS cl, + uniqExactIf( + assumeNotNull(e.user_id), + e.event_type = '$page-view' AND e.user_id IS NOT NULL + AND ({includeAnonymous:UInt8} = 1 OR coalesce(e.${BENCH_OPTION_D_COLUMN}, 0) = 0) + ) AS visitors + FROM analytics_internal.events AS e + WHERE e.event_type IN ('$page-view', '$click') + AND e.project_id = {projectId:String} + AND e.branch_id = {branchId:String} + AND e.event_at >= {since:DateTime} + AND e.event_at < {untilExclusive:DateTime} + GROUP BY day + ORDER BY day ASC + `, + }, + { + name: "analyticsOverview:totalVisitors", + desc: "D: drop join, top-level UInt8 column (no JSON parse)", + sql: ` + SELECT + uniqExactIf( + assumeNotNull(e.user_id), + e.user_id IS NOT NULL + AND ({includeAnonymous:UInt8} = 1 OR coalesce(e.${BENCH_OPTION_D_COLUMN}, 0) = 0) + ) AS visitors + FROM analytics_internal.events AS e + WHERE e.event_type = '$page-view' + AND e.project_id = {projectId:String} + AND e.branch_id = {branchId:String} + AND e.user_id IS NOT NULL + AND e.event_at >= {since:DateTime} + AND e.event_at < {untilExclusive:DateTime} + `, + }, + { + name: "analyticsOverview:topReferrers", + desc: "D: drop join, top-level UInt8 column (no JSON parse)", + sql: ` + SELECT + nullIf(CAST(e.data.referrer, 'String'), '') AS referrer, + uniqExactIf( + assumeNotNull(e.user_id), + e.user_id IS NOT NULL + AND ({includeAnonymous:UInt8} = 1 OR coalesce(e.${BENCH_OPTION_D_COLUMN}, 0) = 0) + ) AS visitors + FROM analytics_internal.events AS e + WHERE e.event_type = '$page-view' + AND e.project_id = {projectId:String} + AND e.branch_id = {branchId:String} + AND e.event_at >= {since:DateTime} + AND e.event_at < {untilExclusive:DateTime} + GROUP BY referrer + HAVING visitors > 0 + ORDER BY visitors DESC + LIMIT 100 + `, + }, +]; + +async function ensureOptionDColumn(): Promise { + const client = getClickhouseAdminClient(); + const res = await client.query({ + query: ` + SELECT count() AS c FROM system.columns + WHERE database = 'analytics_internal' + AND table = 'events' + AND name = {col:String} + `, + query_params: { col: BENCH_OPTION_D_COLUMN }, + format: "JSONEachRow", + }); + const rows = (await res.json()) as { c: string | number }[]; + if (Number(rows[0]?.c ?? 0) === 0) { + await client.command({ + query: ` + ALTER TABLE analytics_internal.events + ADD COLUMN IF NOT EXISTS ${BENCH_OPTION_D_COLUMN} Nullable(UInt8) + `, + }); + } +} + +async function populateOptionDColumn(): Promise { + const client = getClickhouseAdminClient(); + await client.command({ + query: ` + ALTER TABLE analytics_internal.events + UPDATE ${BENCH_OPTION_D_COLUMN} = CAST(data.is_anonymous, 'Nullable(UInt8)') + WHERE project_id = {projectId:String} + `, + query_params: { projectId: BENCH_PROJECT_ID }, + clickhouse_settings: { mutations_sync: "2" }, + }); +} + +async function dropOptionDColumn(): Promise { const client = getClickhouseAdminClient(); + await client.command({ + query: `ALTER TABLE analytics_internal.events DROP COLUMN IF EXISTS ${BENCH_OPTION_D_COLUMN}`, + }); +} + +// Pulls only the 3 analyticsOverview queries that touch the join from the +// current (post-fixes-1+3) shipped SQL, so we have a baseline to compare against. +const ROUTE_QUERIES_BACKFILL_BASELINE: RouteQuery[] = ROUTE_QUERIES_AFTER.filter((q) => + q.name === "analyticsOverview:dailyEvents" + || q.name === "analyticsOverview:totalVisitors" + || q.name === "analyticsOverview:topReferrers", +); + +async function benchmarkBackfillCompare(now: Date): Promise { + const untilExclusive = new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate()) + ONE_DAY_MS); + const since = new Date(untilExclusive.getTime() - METRICS_WINDOW_MS); + const params: QueryParams = { + projectId: BENCH_PROJECT_ID, + branchId: PERF_BRANCH_ID, + since, + untilExclusive, + includeAnonymous: false, + }; + + console.log("\n── Backfill option comparison (post-backfill query memory) ──"); + console.log(" Each option leaves the metrics queries in one of three shapes:"); + console.log(" Today / A (bounded join) / B,C,E (drop join, JSON) / D (drop join, top-level column)\n"); + + // Set up option D's column. + console.log(" Setting up option D top-level column…"); + await ensureOptionDColumn(); + await populateOptionDColumn(); + console.log(" done.\n"); + + async function runShape(label: string, list: RouteQuery[]): Promise> { + const out = new Map(); + for (const rq of list) { + const qid = await runRouteQuery(rq, params, now); + out.set(rq.name, await readStats(qid)); + } + return out; + } + + // Warm cache once with a tiny query. + await runRouteQuery(ROUTE_QUERIES_BEFORE[1], params, now); + + const baseline = await runShape("today", ROUTE_QUERIES_BACKFILL_BASELINE); + const a = await runShape("A", ROUTE_QUERIES_BACKFILL_A); + const bce = await runShape("B/C/E", ROUTE_QUERIES_BACKFILL_BCE); + const d = await runShape("D", ROUTE_QUERIES_BACKFILL_D); + + const padL = (s: string, n: number) => s.padEnd(n); + const padR = (s: string, n: number) => s.padStart(n); + const mem = (n: number | undefined) => n == null ? "—" : padR(fmtBytes(n), 12); + const dur = (n: number | undefined) => n == null ? "—" : padR(`${n} ms`, 9); + + const queryNames = ["analyticsOverview:dailyEvents", "analyticsOverview:totalVisitors", "analyticsOverview:topReferrers"]; + + console.log(" Peak memory per query, per backfill shape:"); + console.log([ + padL("query", 36), + padR("Today", 12), + padR("A: bounded", 12), + padR("B/C/E: drop", 12), + padR("D: column", 12), + ].join(" ")); + console.log(" " + "─".repeat(96)); + for (const name of queryNames) { + const t = baseline.get(name); + const av = a.get(name); + const bv = bce.get(name); + const dv = d.get(name); + console.log([ + " " + padL(name, 34), + mem(t?.memory_usage), + mem(av?.memory_usage), + mem(bv?.memory_usage), + mem(dv?.memory_usage), + ].join(" ")); + } + + // Totals + const sum = (m: Map) => + queryNames.reduce((acc, n) => acc + (m.get(n)?.memory_usage ?? 0), 0); + const tSum = sum(baseline); + const aSum = sum(a); + const bSum = sum(bce); + const dSum = sum(d); + + console.log(" " + "─".repeat(96)); + console.log([ + " " + padL("SUM peak memory", 34), + padR(fmtBytes(tSum), 12), + padR(fmtBytes(aSum), 12), + padR(fmtBytes(bSum), 12), + padR(fmtBytes(dSum), 12), + ].join(" ")); + + const ratio = (s: number) => s === 0 ? "—" : `${(tSum / s).toFixed(2)}× less`; + console.log([ + " " + padL("vs. Today", 34), + padR("—", 12), + padR(ratio(aSum), 12), + padR(ratio(bSum), 12), + padR(ratio(dSum), 12), + ].join(" ")); + + // Duration too + console.log("\n Query duration per shape:"); + console.log([ + padL("query", 36), + padR("Today", 9), + padR("A", 9), + padR("B/C/E", 9), + padR("D", 9), + ].join(" ")); + console.log(" " + "─".repeat(84)); + for (const name of queryNames) { + console.log([ + " " + padL(name, 34), + dur(baseline.get(name)?.query_duration_ms), + dur(a.get(name)?.query_duration_ms), + dur(bce.get(name)?.query_duration_ms), + dur(d.get(name)?.query_duration_ms), + ].join(" ")); + } + + console.log("\n Notes:"); + console.log(" • B, C, E all produce the same query SQL; their column above is one number."); + console.log(" • The semantic differences (argMax-latest vs ASOF vs partition-swap) live"); + console.log(" in the backfill operation, not in the query that runs afterwards."); + console.log(" • Option D mutates a stored column instead of the JSON field; the runtime"); + console.log(" win comes from skipping per-row JSON access."); +} + +async function runRouteQuery(rq: RouteQuery, p: QueryParams, now: Date, opts: { useMetricsClient?: boolean } = {}): Promise { + // `useMetricsClient` runs through getClickhouseAdminClientForMetrics, which + // applies the connection-level SETTINGS (caps + grace_hash) — so AFTER + // measurements reflect what actually ships in route.tsx, not the raw SQL. + const client = opts.useMetricsClient ? getClickhouseAdminClientForMetrics() : getClickhouseAdminClient(); const queryId = `bench-route-${rq.name.replace(/[^a-z0-9]/gi, "-")}-${randomUUID()}`; const baseParams: Record = { projectId: p.projectId, @@ -1058,6 +1478,302 @@ async function runRouteQuery(rq: RouteQuery, p: QueryParams, now: Date): Promise return queryId; } +// ── Join algorithm comparison (BENCH_JOIN_ALGO_COMPARE=1) ───────────────────── +// +// For each of ClickHouse's 6 join algorithms, run the 3 analyticsOverview +// queries under two cases: +// +// normal = bounded analyticsUserJoin (the SQL we just shipped). Small build +// side; isolated from cumulative-history scan. +// heavy = UNBOUNDED analyticsUserJoin (pre-fix). Large build side; the +// pattern that caused Sentry STACK-BACKEND-16H. +// +// Algorithms: +// default - leave the cluster default (typically `direct,parallel_hash,hash`) +// direct - KV lookup on right side; only works if right is a Dictionary +// (it isn't here, so we expect this to fall back or error) +// hash - classic single-threaded hash join +// parallel_hash - parallel build, fastest on small/medium, most memory +// grace_hash - partitioned hash, spills to disk +// full_sorting_merge - sorts both sides; can beat hash when input is huge +// partial_merge - sorts only the right side; lowest memory, slowest +// +// Some algorithms will error or be silently rejected for shapes they can't +// handle (e.g. `direct` requires a Dictionary). Errors are caught and shown +// as `ERR` in the output table. + +const ANALYTICS_OVERVIEW_QUERY_NAMES = [ + "analyticsOverview:dailyEvents", + "analyticsOverview:totalVisitors", + "analyticsOverview:topReferrers", +] as const; + +const JOIN_ALGORITHMS = [ + "default", + "direct", + "hash", + "parallel_hash", + "grace_hash", + "full_sorting_merge", + "partial_merge", +] as const; +type JoinAlgorithm = typeof JOIN_ALGORITHMS[number]; + +// Build the 3 analyticsOverview queries with the given join SQL, filter SQL, +// and a per-query SETTINGS clause (used to force `join_algorithm`). +function buildAnalyticsOverviewVariant(opts: { + joinSql: string, + nonAnonFilter: string, + joinAlgorithm: JoinAlgorithm, +}): RouteQuery[] { + const settings = opts.joinAlgorithm === "default" + ? "" + : `SETTINGS join_algorithm = '${opts.joinAlgorithm}'`; + return [ + { + name: "analyticsOverview:dailyEvents", + desc: `analyticsOverview daily events (join_algorithm=${opts.joinAlgorithm})`, + sql: ` + SELECT + toDate(e.event_at) AS day, + countIf( + e.event_type = '$page-view' + AND e.user_id IS NOT NULL + AND ${opts.nonAnonFilter} + ) AS pv, + countIf( + e.event_type = '$click' + AND e.user_id IS NOT NULL + AND ${opts.nonAnonFilter} + ) AS cl, + uniqExactIf( + assumeNotNull(e.user_id), + e.event_type = '$page-view' + AND e.user_id IS NOT NULL + AND ${opts.nonAnonFilter} + ) AS visitors + FROM analytics_internal.events AS e + ${opts.joinSql} + WHERE e.event_type IN ('$page-view', '$click') + AND e.project_id = {projectId:String} + AND e.branch_id = {branchId:String} + AND e.event_at >= {since:DateTime} + AND e.event_at < {untilExclusive:DateTime} + GROUP BY day + ORDER BY day ASC + ${settings} + `, + }, + { + name: "analyticsOverview:totalVisitors", + desc: `analyticsOverview total visitors (join_algorithm=${opts.joinAlgorithm})`, + sql: ` + SELECT + uniqExactIf( + assumeNotNull(e.user_id), + e.user_id IS NOT NULL + AND ${opts.nonAnonFilter} + ) AS visitors + FROM analytics_internal.events AS e + ${opts.joinSql} + WHERE e.event_type = '$page-view' + AND e.project_id = {projectId:String} + AND e.branch_id = {branchId:String} + AND e.user_id IS NOT NULL + AND e.event_at >= {since:DateTime} + AND e.event_at < {untilExclusive:DateTime} + ${settings} + `, + }, + { + name: "analyticsOverview:topReferrers", + desc: `analyticsOverview top referrers (join_algorithm=${opts.joinAlgorithm})`, + sql: ` + SELECT + nullIf(CAST(e.data.referrer, 'String'), '') AS referrer, + uniqExactIf( + assumeNotNull(e.user_id), + e.user_id IS NOT NULL + AND ${opts.nonAnonFilter} + ) AS visitors + FROM analytics_internal.events AS e + ${opts.joinSql} + WHERE e.event_type = '$page-view' + AND e.project_id = {projectId:String} + AND e.branch_id = {branchId:String} + AND e.event_at >= {since:DateTime} + AND e.event_at < {untilExclusive:DateTime} + GROUP BY referrer + HAVING visitors > 0 + ORDER BY visitors DESC + LIMIT 100 + ${settings} + `, + }, + ]; +} + +// Run a query and either return its QueryStats, or null if the query errored +// (e.g. an unsupported `join_algorithm` for the query's shape). +async function tryRunAndReadStats(rq: RouteQuery, p: QueryParams, now: Date): Promise { + try { + const qid = await runRouteQuery(rq, p, now); + return await readStats(qid); + } catch { + return null; + } +} + +async function benchmarkJoinAlgorithms(now: Date): Promise { + const untilExclusive = new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate()) + ONE_DAY_MS); + const since = new Date(untilExclusive.getTime() - METRICS_WINDOW_MS); + const params: QueryParams = { + projectId: BENCH_PROJECT_ID, + branchId: PERF_BRANCH_ID, + since, + untilExclusive, + includeAnonymous: false, + }; + + console.log("\n── Join algorithm comparison (6 algorithms × 2 cases) ──"); + console.log(" Cases:"); + console.log(" normal = bounded analyticsUserJoin (the SQL we just shipped)"); + console.log(" heavy = UNBOUNDED analyticsUserJoin (pre-fix; what caused the Sentry OOM)"); + console.log(""); + + // Warm cache once. + await runRouteQuery( + buildAnalyticsOverviewVariant({ + joinSql: ANALYTICS_USER_JOIN_AFTER, + nonAnonFilter: NON_ANON_FILTER_AFTER, + joinAlgorithm: "default", + })[0], + params, + now, + ); + + type CaseName = "normal" | "heavy"; + const cases: Record = { + normal: { joinSql: ANALYTICS_USER_JOIN_AFTER, nonAnonFilter: NON_ANON_FILTER_AFTER }, + heavy: { joinSql: ANALYTICS_USER_JOIN, nonAnonFilter: NON_ANON_FILTER }, + }; + + type StatsByCaseByAlgo = Record>>; + const stats: StatsByCaseByAlgo = { + normal: {} as Record>, + heavy: {} as Record>, + }; + + for (const caseName of ["normal", "heavy"] as const) { + for (const algo of JOIN_ALGORITHMS) { + const queries = buildAnalyticsOverviewVariant({ + joinSql: cases[caseName].joinSql, + nonAnonFilter: cases[caseName].nonAnonFilter, + joinAlgorithm: algo, + }); + const out = new Map(); + for (const rq of queries) { + out.set(rq.name, await tryRunAndReadStats(rq, params, now)); + } + stats[caseName][algo] = out; + } + } + + const padL = (s: string, n: number) => s.padEnd(n); + const padR = (s: string, n: number) => s.padStart(n); + const memCell = (s: QueryStats | null | undefined) => + s == null ? padR("ERR", 11) : padR(fmtBytes(s.memory_usage), 11); + const durCell = (s: QueryStats | null | undefined) => + s == null ? padR("ERR", 9) : padR(`${s.query_duration_ms} ms`, 9); + + function printCaseTable(caseName: CaseName): void { + console.log(`\n ── ${caseName.toUpperCase()} case (${caseName === "normal" ? "bounded join" : "UNBOUNDED join"}) ──`); + // Header row + console.log([ + padL("query", 32), + ...JOIN_ALGORITHMS.map((a) => padR(a, 11)), + ].join(" ")); + console.log(" " + "─".repeat(32 + JOIN_ALGORITHMS.length * 13)); + // Per-query memory + for (const name of ANALYTICS_OVERVIEW_QUERY_NAMES) { + console.log([ + " " + padL(name, 30), + ...JOIN_ALGORITHMS.map((a) => memCell(stats[caseName][a].get(name))), + ].join(" ")); + } + // Sum memory row + console.log(" " + "─".repeat(32 + JOIN_ALGORITHMS.length * 13)); + console.log([ + " " + padL("SUM peak memory", 30), + ...JOIN_ALGORITHMS.map((a) => { + const sum = ANALYTICS_OVERVIEW_QUERY_NAMES.reduce((acc, n) => { + const s = stats[caseName][a].get(n); + return s == null ? acc : acc + s.memory_usage; + }, 0); + const anyErr = ANALYTICS_OVERVIEW_QUERY_NAMES.some((n) => stats[caseName][a].get(n) == null); + return anyErr ? padR("partial", 11) : padR(fmtBytes(sum), 11); + }), + ].join(" ")); + // Sum duration row + console.log([ + " " + padL("SUM duration", 30), + ...JOIN_ALGORITHMS.map((a) => { + const sum = ANALYTICS_OVERVIEW_QUERY_NAMES.reduce((acc, n) => { + const s = stats[caseName][a].get(n); + return s == null ? acc : acc + s.query_duration_ms; + }, 0); + const anyErr = ANALYTICS_OVERVIEW_QUERY_NAMES.some((n) => stats[caseName][a].get(n) == null); + return anyErr ? padR("partial", 11) : padR(`${sum} ms`, 11); + }), + ].join(" ")); + } + + printCaseTable("normal"); + printCaseTable("heavy"); + + // Find the best algorithm per case by memory and by duration + function bestByMemory(caseName: CaseName): { algo: JoinAlgorithm, mem: number } | null { + let best: { algo: JoinAlgorithm, mem: number } | null = null; + for (const a of JOIN_ALGORITHMS) { + const sum = ANALYTICS_OVERVIEW_QUERY_NAMES.reduce((acc, n) => { + const s = stats[caseName][a].get(n); + return s == null ? acc : acc + s.memory_usage; + }, 0); + const anyErr = ANALYTICS_OVERVIEW_QUERY_NAMES.some((n) => stats[caseName][a].get(n) == null); + if (anyErr) continue; + if (best == null || sum < best.mem) best = { algo: a, mem: sum }; + } + return best; + } + function bestByDuration(caseName: CaseName): { algo: JoinAlgorithm, dur: number } | null { + let best: { algo: JoinAlgorithm, dur: number } | null = null; + for (const a of JOIN_ALGORITHMS) { + const sum = ANALYTICS_OVERVIEW_QUERY_NAMES.reduce((acc, n) => { + const s = stats[caseName][a].get(n); + return s == null ? acc : acc + s.query_duration_ms; + }, 0); + const anyErr = ANALYTICS_OVERVIEW_QUERY_NAMES.some((n) => stats[caseName][a].get(n) == null); + if (anyErr) continue; + if (best == null || sum < best.dur) best = { algo: a, dur: sum }; + } + return best; + } + + console.log("\n Headlines:"); + for (const c of ["normal", "heavy"] as const) { + const bm = bestByMemory(c); + const bd = bestByDuration(c); + if (bm) console.log(` ${c.padEnd(7)} | best memory: ${bm.algo.padEnd(20)} (${fmtBytes(bm.mem)})`); + if (bd) console.log(` ${c.padEnd(7)} | best speed: ${bd.algo.padEnd(20)} (${bd.dur} ms)`); + } + console.log("\n Notes:"); + console.log(" • ERR / partial = the algorithm errored or didn't apply for that query shape"); + console.log(" (typical for `direct` which requires a Dictionary right-hand side)."); + console.log(" • The OOM bottleneck is the inner GROUP BY user_id aggregation, not the"); + console.log(" join's hash table — algorithms that only target the join (everything"); + console.log(" except sorting-merge variants) cap out at ~modest savings on the heavy case."); +} + async function benchmarkRouteQueries(now: Date): Promise { const untilExclusive = new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate()) + ONE_DAY_MS); const since = new Date(untilExclusive.getTime() - METRICS_WINDOW_MS); @@ -1085,11 +1801,14 @@ async function benchmarkRouteQueries(now: Date): Promise { // Also capture the actual row payload so we can check correctness for OPT // variants (e.g., dropping the LEFT JOIN on analyticsOverview must not change counts). - async function runAndCollect(list: RouteQuery[]): Promise<{ stats: Map, payloads: Map }> { + // `useMetricsClient` routes through getClickhouseAdminClientForMetrics so the + // connection-level SETTINGS (caps + grace_hash) apply — used for AFTER to + // mirror what actually ships. + async function runAndCollect(list: RouteQuery[], opts: { useMetricsClient?: boolean } = {}): Promise<{ stats: Map, payloads: Map }> { const stats = new Map(); const payloads = new Map(); for (const rq of list) { - const client = getClickhouseAdminClient(); + const client = opts.useMetricsClient ? getClickhouseAdminClientForMetrics() : getClickhouseAdminClient(); const queryId = `bench-route-${rq.name.replace(/[^a-z0-9]/gi, "-")}-${randomUUID()}`; const baseParams: Record = { projectId: params.projectId, @@ -1114,8 +1833,8 @@ async function benchmarkRouteQueries(now: Date): Promise { } const before = await runAndCollect(ROUTE_QUERIES_BEFORE); - const after = await runAndCollect(ROUTE_QUERIES_AFTER); - const opt = await runAndCollect(ROUTE_QUERIES_OPTIMIZED); + const after = await runAndCollect(ROUTE_QUERIES_AFTER, { useMetricsClient: true }); + const opt = await runAndCollect(ROUTE_QUERIES_OPTIMIZED, { useMetricsClient: true }); const beforeStats = before.stats; const afterStats = after.stats; @@ -1280,7 +1999,10 @@ type QueryStats = { async function readStats(queryId: string): Promise { const client = getClickhouseAdminClient(); await client.command({ query: "SYSTEM FLUSH LOGS" }); - const delays = [100, 200, 400, 800, 1600]; + // Total budget ~12.7s. With many heavy queries running in sequence the + // query_log async flush can take longer than the original 3.1s budget, which + // was producing spurious "no query_log row" failures at the 300k-user scale. + const delays = [100, 200, 400, 800, 1600, 3200, 6400]; for (let i = 0; i <= delays.length; i++) { const res = await client.query({ query: ` @@ -1337,6 +2059,13 @@ async function cleanup(): Promise { // Block until the mutation is applied so the script exits clean. clickhouse_settings: { mutations_sync: "2" }, }); + // Best-effort: drop the option-D bench column if we added it. Safe to run + // unconditionally because of the IF EXISTS guard. + try { + await dropOptionDColumn(); + } catch (e) { + console.error(" (could not drop option-D column:", e, ")"); + } } // ── Edge-case matrix ───────────────────────────────────────────────────────── @@ -1607,9 +2336,18 @@ async function seedPerf(now: Date): Promise { ); const batchRows = envInt("BENCH_BATCH", 50_000); + // BENCH_HISTORICAL_DAYS extends the event seed back beyond the 30-day metrics + // window so unbounded-scan queries (loadUsersByCountry, the LEFT JOIN in the + // splits) read more data than windowed queries — mirroring the prod skew that + // caused Sentry STACK-BACKEND-16H. Default 365 days. Set to 30 to recover the + // original behavior. + const historicalDays = envInt("BENCH_HISTORICAL_DAYS", 365); const windowEnd = new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate()) + ONE_DAY_MS); - const windowStart = new Date(windowEnd.getTime() - METRICS_WINDOW_MS); + const windowStart = new Date(windowEnd.getTime() - historicalDays * 24 * 60 * 60 * 1000); const spanMs = windowEnd.getTime() - windowStart.getTime(); + if (historicalDays !== METRICS_WINDOW_DAYS) { + console.log(` (event_at spans last ${historicalDays} days; only the most recent ${METRICS_WINDOW_DAYS} fall inside the metrics window)`); + } const teamIds: string[] = Array.from({ length: teamCount }, () => mkUuid()); const t0 = Date.now(); @@ -1815,10 +2553,14 @@ async function main(): Promise { const doPerf = matrixOk && !envBool("BENCH_SKIP_PERF"); const doRouteQueries = matrixOk && envBool("BENCH_ROUTE_QUERIES"); - if (doPerf || doRouteQueries) { + const doBackfillCompare = matrixOk && envBool("BENCH_BACKFILL_COMPARE"); + const doJoinAlgoCompare = matrixOk && (envBool("BENCH_JOIN_ALGO_COMPARE") || envBool("BENCH_GRACE_HASH_COMPARE")); + if (doPerf || doRouteQueries || doBackfillCompare || doJoinAlgoCompare) { await seedPerf(now); if (doPerf) await runPerf(now); if (doRouteQueries) await benchmarkRouteQueries(now); + if (doBackfillCompare) await benchmarkBackfillCompare(now); + if (doJoinAlgoCompare) await benchmarkJoinAlgorithms(now); } else if (envBool("BENCH_SKIP_PERF")) { console.log("Skipping perf run (BENCH_SKIP_PERF=1)"); } diff --git a/apps/backend/src/app/api/latest/internal/metrics/route.tsx b/apps/backend/src/app/api/latest/internal/metrics/route.tsx index faf8a0df6c..f0d17ce170 100644 --- a/apps/backend/src/app/api/latest/internal/metrics/route.tsx +++ b/apps/backend/src/app/api/latest/internal/metrics/route.tsx @@ -1,6 +1,6 @@ import { Prisma } from "@/generated/prisma/client"; import { EmailOutboxSimpleStatus } from "@/generated/prisma/enums"; -import { getClickhouseAdminClient } from "@/lib/clickhouse"; +import { getClickhouseAdminClientForMetrics } from "@/lib/clickhouse"; import { ClickHouseError } from "@clickhouse/client"; import { ActivitySplit } from "@/lib/metrics-activity-split"; import { Tenancy } from "@/lib/tenancies"; @@ -30,6 +30,9 @@ const MAX_USERS_FOR_COUNTRY_SAMPLE = 10_000; const METRICS_WINDOW_DAYS = 30; const METRICS_WINDOW_MS = METRICS_WINDOW_DAYS * 24 * 60 * 60 * 1000; const ONE_DAY_MS = 24 * 60 * 60 * 1000; +// All ClickHouse queries in this file run through `getClickhouseAdminClientForMetrics`, +// which configures memory caps + GROUP BY spill defaults at the connection level — +// see lib/clickhouse.tsx for the rationale. export const METRICS_REVENUE_INVOICE_STATUSES = ["paid", "succeeded"] as const; const METRICS_REVENUE_INVOICE_STATUSES_SQL = Prisma.raw( METRICS_REVENUE_INVOICE_STATUSES.map((status) => `'${status}'`).join(", "), @@ -70,8 +73,17 @@ function normalizeUuidFromEvent(value: string): string | null { // ClickHouse `match()` uses re2; pattern matches UUID_V4_JS_RE.source. const MAU_UUID_V4_REGEX = UUID_V4_JS_RE.source; -async function loadUsersByCountry(tenancy: Tenancy, includeAnonymous: boolean = false): Promise> { - const clickhouseClient = getClickhouseAdminClient(); +async function loadUsersByCountry(tenancy: Tenancy, now: Date, includeAnonymous: boolean = false): Promise> { + // Bound the scan to the 30-day metrics window. Before this lower bound, + // the inner `GROUP BY user_id` materialized one row per ever-seen user for + // the tenant — see Sentry STACK-BACKEND-16H, where this query was a top + // memory consumer on tenants with many months of $token-refresh history. + // Users without a token-refresh in the last 30 days are no longer counted; + // token-refresh fires every few minutes for each open session, so this only + // omits users with no active sessions in the last 30 days, which is the + // intent of a "metrics window" anyway. + const { since, untilExclusive } = getMetricsWindowBounds(now); + const clickhouseClient = getClickhouseAdminClientForMetrics(); const res = await clickhouseClient.query({ query: ` SELECT @@ -92,6 +104,8 @@ async function loadUsersByCountry(tenancy: Tenancy, includeAnonymous: boolean = AND project_id = {projectId:String} AND branch_id = {branchId:String} AND user_id IS NOT NULL + AND event_at >= {since:DateTime} + AND event_at < {untilExclusive:DateTime} ) WHERE cc IS NOT NULL AND ({includeAnonymous:UInt8} = 1 OR is_anonymous = 0) @@ -105,6 +119,8 @@ async function loadUsersByCountry(tenancy: Tenancy, includeAnonymous: boolean = projectId: tenancy.project.id, branchId: tenancy.branchId, includeAnonymous: includeAnonymous ? 1 : 0, + since: formatClickhouseDateTimeParam(since), + untilExclusive: formatClickhouseDateTimeParam(untilExclusive), }, format: "JSONEachRow", }); @@ -139,7 +155,7 @@ async function loadActiveUsersByCountry( ): Promise> { const since = new Date(now.getTime() - ACTIVE_USERS_BY_COUNTRY_WINDOW_MS); - const clickhouseClient = getClickhouseAdminClient(); + const clickhouseClient = getClickhouseAdminClientForMetrics(); const res = await clickhouseClient.query({ query: ` SELECT @@ -269,7 +285,7 @@ async function loadLiveUsersCount( const since = new Date(now.getTime() - ACTIVE_USERS_BY_COUNTRY_WINDOW_MS); try { - const clickhouseClient = getClickhouseAdminClient(); + const clickhouseClient = getClickhouseAdminClientForMetrics(); const res = await clickhouseClient.query({ query: ` SELECT uniqExact(user_id) AS live_users @@ -347,7 +363,7 @@ async function loadDailyActiveUsers(tenancy: Tenancy, now: Date, includeAnonymou const since = new Date(todayUtc.getTime() - METRICS_WINDOW_MS); const untilExclusive = new Date(todayUtc.getTime() + ONE_DAY_MS); - const clickhouseClient = getClickhouseAdminClient(); + const clickhouseClient = getClickhouseAdminClientForMetrics(); const result = await clickhouseClient.query({ query: ` SELECT @@ -412,11 +428,24 @@ async function loadDailyActiveSplitFromClickhouse(options: { ? "AND ({includeAnonymous:UInt8} = 1 OR coalesce(CAST(data.is_anonymous, 'Nullable(UInt8)'), 0) = 0)" : ""; - const clickhouseClient = getClickhouseAdminClient(); + const clickhouseClient = getClickhouseAdminClientForMetrics(); // Note: the inner `assumeNotNull(${idCol}) AS entity_id` must not reuse the // column name, or ClickHouse re-resolves `WHERE ${idCol} IS NOT NULL` // against the alias (assumeNotNull returns '' for NULLs, which passes the // not-null test) and phantom rows slip through. + // + // The LEFT JOIN's `min(event_at)` subquery below scans all $token-refresh + // history per request (no `event_at >= ...` lower bound) — same unbounded + // pattern as Sentry STACK-BACKEND-16H, and was ranked #5/#6 by peak memory + // in the benchmark. It can't be bounded the same way as the other queries + // because `first_date` is used to classify entities as new vs reactivated: + // bounding the subquery to 30 days would cause entities first seen >30d ago + // and active today to be (incorrectly) reported as "new". The proper fix + // requires either materializing a per-entity "first-seen-ever" table or + // shipping the option C backfill so we can read first_date from the event + // stream cheaply. For now the per-user memory cap configured on + // getClickhouseAdminClientForMetrics (see lib/clickhouse.tsx) spills the + // GROUP BY hash table and caps the query if it runs out of memory. const result = await clickhouseClient.query({ query: ` SELECT @@ -552,7 +581,7 @@ async function loadAnonymousVisitorsFromTokenRefresh( now: Date, ): Promise<{ dailyVisitors: DataPoints, visitors: number }> { const { since, untilExclusive } = getMetricsWindowBounds(now); - const clickhouseClient = getClickhouseAdminClient(); + const clickhouseClient = getClickhouseAdminClientForMetrics(); const query = ` SELECT @@ -630,7 +659,7 @@ async function loadAnonymousVisitorsFromTokenRefresh( async function loadMonthlyActiveUsers(tenancy: Tenancy, now: Date, includeAnonymous: boolean = false): Promise { const { since, untilExclusive } = getMetricsWindowBounds(now); - const clickhouseClient = getClickhouseAdminClient(); + const clickhouseClient = getClickhouseAdminClientForMetrics(); try { const result = await clickhouseClient.query({ query: ` @@ -1038,7 +1067,7 @@ async function loadAnalyticsOverview(tenancy: Tenancy, now: Date, includeAnonymo const since = new Date(todayUtc.getTime() - METRICS_WINDOW_MS); const untilExclusive = new Date(todayUtc.getTime() + ONE_DAY_MS); - const clickhouseClient = getClickhouseAdminClient(); + const clickhouseClient = getClickhouseAdminClientForMetrics(); // Session replay aggregates come from Postgres and have nothing to do with // ClickHouse availability. Run them in parallel with the ClickHouse queries @@ -1064,6 +1093,37 @@ async function loadAnalyticsOverview(tenancy: Tenancy, now: Date, includeAnonymo } | null = null; try { + // BAND-AID for Sentry STACK-BACKEND-16H. The inner subquery used to scan + // ALL token-refresh history (only an upper bound `event_at < untilExclusive`), + // which materialized one row per ever-seen user in the GROUP BY hash table. + // On tenants with months of $token-refresh data this was the top memory + // consumer at /internal/metrics, and three of the queries below pay this + // cost. Adding the lower `event_at >= since` bound shrinks the hash table + // to "users with a token-refresh in the last 30 days", which is the same + // population the outer page-view/click query is already restricted to. + // + // Edge case: an anonymous page-view by a user who has not token-refreshed + // in the last 30 days will now fall through to `coalesce(..., 0)` and be + // silently classified non-anonymous. Token-refresh fires every few minutes + // per active session, so this is rare — but it is not impossible (e.g. + // embedded SDKs that poll less frequently, sessions that straddle the + // 30-day boundary, or anonymous users on cold tenancies). The PR shipping + // this fix quantifies the expected drift; option C (below) removes the + // edge case entirely. + // + // PERMANENT FIX (option C — see PR description and + // /tmp/internal-metrics-oom-analysis.html for the full write-up): + // 1. Stamp `is_anonymous` onto `$page-view`/`$click` events at insert + // time in apps/backend/src/app/api/latest/analytics/events/batch/route.tsx + // (the events/batch handler already has `auth.user` in scope). + // 2. Backfill historical page-view/click events via partition-by-partition + // INSERT SELECT with an ASOF LEFT JOIN to each event's preceding + // $token-refresh, producing event-at-time semantics. + // 3. Delete this LEFT JOIN entirely; the `coalesce` in the filter below + // short-circuits on the first non-null argument so the join's + // `latest_is_anonymous` becomes dead code. + // Benchmarked memory profiles for each option are in + // apps/backend/scripts/benchmark-internal-metrics.ts (BENCH_BACKFILL_COMPARE=1). const analyticsUserJoin = ` LEFT JOIN ( SELECT @@ -1074,6 +1134,7 @@ async function loadAnalyticsOverview(tenancy: Tenancy, now: Date, includeAnonymo AND project_id = {projectId:String} AND branch_id = {branchId:String} AND user_id IS NOT NULL + AND event_at >= {since:DateTime} AND event_at < {untilExclusive:DateTime} GROUP BY user_id ) AS token_refresh_users @@ -1476,7 +1537,7 @@ export const GET = createSmartRouteHandler({ ] = await Promise.all([ loadTotalUsers(req.auth.tenancy, now, includeAnonymous), loadDailyActiveUsers(req.auth.tenancy, now, includeAnonymous), - loadUsersByCountry(req.auth.tenancy, includeAnonymous), + loadUsersByCountry(req.auth.tenancy, now, includeAnonymous), loadActiveUsersByCountry(req.auth.tenancy, now, includeAnonymous), loadLiveUsersCount(req.auth.tenancy, now, includeAnonymous), usersCrudHandlers.adminList({ diff --git a/apps/backend/src/lib/clickhouse.tsx b/apps/backend/src/lib/clickhouse.tsx index 8ad015fcf9..2e27799581 100644 --- a/apps/backend/src/lib/clickhouse.tsx +++ b/apps/backend/src/lib/clickhouse.tsx @@ -1,4 +1,4 @@ -import { createClient, type ClickHouseClient } from "@clickhouse/client"; +import { createClient, type ClickHouseClient, type ClickHouseSettings } from "@clickhouse/client"; import { getEnvVariable } from "@stackframe/stack-shared/dist/utils/env"; import { StackAssertionError } from "@stackframe/stack-shared/dist/utils/errors"; @@ -9,7 +9,11 @@ function getAdminAuth() { }; } -export function createClickhouseClient(authType: "admin" | "external", database?: string) { +export function createClickhouseClient( + authType: "admin" | "external", + database?: string, + clickhouse_settings?: ClickHouseSettings, +) { return createClient({ url: getEnvVariable("STACK_CLICKHOUSE_URL"), ...authType === "admin" ? getAdminAuth() : { @@ -18,6 +22,7 @@ export function createClickhouseClient(authType: "admin" | "external", database? }, database, request_timeout: 10 * 60 * 1000, // 10 minutes + clickhouse_settings, }); } @@ -29,6 +34,65 @@ export function getClickhouseExternalClient() { return createClickhouseClient("external", getEnvVariable("STACK_CLICKHOUSE_DATABASE", "default")); } +// Safety net against the ClickHouse OvercommitTracker killing heavy analytical +// reads (Sentry STACK-BACKEND-16H). Use this client for any handler that fans +// out multiple `GROUP BY user_id`-style queries against `analytics_internal.events`. +// Values are decimal bytes (ClickHouse's native interpretation of digit strings); +// 1 GB = 1_000_000_000. +// +// max_bytes_before_external_group_by (6 GB ≈ 5.59 GiB) +// GROUP BY hash tables spill to disk when they exceed this size instead of +// growing without bound. Targets the unbounded `GROUP BY user_id` patterns +// (analyticsUserJoin, loadUsersByCountry, the splits) which were +// materializing one row per ever-seen user. +// +// max_memory_usage (8 GB ≈ 7.45 GiB) +// Hard per-query ceiling. +// +// max_memory_usage_for_user (9 GB ≈ 8.38 GiB) +// Per-user aggregate across concurrent queries — the bound that actually +// protects against the OvercommitTracker. /internal/metrics fans out to +// ~12 ClickHouse queries via nested Promise.all; a per-query cap alone +// (12 × 8 GB = 96 GB theoretical) doesn't prevent the cluster-wide kill. +// This per-user cap forces the fan-out to share a single 9 GB budget +// against the cluster's 10.8 GiB ceiling. The trade vs no-cap: a single +// query above 9 GB now fails with a clean "memory limit exceeded" error +// rather than triggering an OvercommitTracker kill of a random concurrent +// query. +// +// join_algorithm = 'grace_hash,parallel_hash,hash' +// ClickHouse picks the first applicable algorithm from this list. For our +// analyticsUserJoin shape (LEFT JOIN with a GROUP BY subquery as the +// build side), grace_hash partitions the build side instead of building +// one giant hash table — measured ~48% memory reduction vs the default +// parallel_hash, at zero latency cost (benchmark: BENCH_JOIN_ALGO_COMPARE=1 +// in scripts/benchmark-internal-metrics.ts). parallel_hash and hash are +// fallbacks for join shapes grace_hash doesn't support; ClickHouse falls +// back automatically. +// +// These are belt-and-suspenders. The actual fix for the OOM is bounding the +// `event_at` scans (option A) and eventually backfilling `is_anonymous` onto +// page-view/click events (option C) — see /internal/metrics/route.tsx for the +// long-form discussion. +export const METRICS_CLICKHOUSE_SETTINGS: ClickHouseSettings = { + max_bytes_before_external_group_by: "6000000000", + max_memory_usage: "8000000000", + max_memory_usage_for_user: "9000000000", + // The @clickhouse/client-common JoinAlgorithm type is a union of single + // algorithm names, but the ClickHouse server accepts a comma-separated + // fallback list (tries each in order). The cast widens the SDK type so we + // can use the fallback form, which is what we actually want for safety. + join_algorithm: "grace_hash,parallel_hash,hash" as ClickHouseSettings["join_algorithm"], +}; + +export function getClickhouseAdminClientForMetrics() { + return createClickhouseClient( + "admin", + getEnvVariable("STACK_CLICKHOUSE_DATABASE", "default"), + METRICS_CLICKHOUSE_SETTINGS, + ); +} + export const getQueryTimingStats = async (client: ClickHouseClient, queryId: string) => { // Flush logs to ensure system.query_log has latest query result. // Todo: for performance we should instead poll for this row to become available asynchronously after returning result. Flushed every 7.5 seconds by default From adf8f32510a8e2768cf9a4b7ae941ad32306dff3 Mon Sep 17 00:00:00 2001 From: Bilal Godil Date: Wed, 20 May 2026 15:28:07 -0700 Subject: [PATCH 2/3] Trim verbose comments added in the previous commit --- .../scripts/benchmark-internal-metrics.ts | 92 ++++--------------- .../app/api/latest/internal/metrics/route.tsx | 65 +++---------- apps/backend/src/lib/clickhouse.tsx | 51 ++-------- 3 files changed, 37 insertions(+), 171 deletions(-) diff --git a/apps/backend/scripts/benchmark-internal-metrics.ts b/apps/backend/scripts/benchmark-internal-metrics.ts index 05c23e2b98..9589f4e49f 100644 --- a/apps/backend/scripts/benchmark-internal-metrics.ts +++ b/apps/backend/scripts/benchmark-internal-metrics.ts @@ -377,11 +377,7 @@ const ANALYTICS_USER_JOIN = ` `; const NON_ANON_FILTER = "({includeAnonymous:UInt8} = 1 OR coalesce(JSONExtract(toJSONString(e.data), 'is_anonymous', 'Nullable(UInt8)'), token_refresh_users.latest_is_anonymous, 0) = 0)"; -// Post-fix state of the analyticsUserJoin: lower `event_at >= since` bound -// added to the inner subquery's WHERE clause (the option A patch shipped for -// Sentry STACK-BACKEND-16H). Without this bound the inner GROUP BY hash -// table held one row per ever-seen user — see route.tsx for the full -// explanation and links to option C as the permanent fix. +// Post-fix: lower `event_at >= since` bound added to the inner subquery. const ANALYTICS_USER_JOIN_AFTER = ` LEFT JOIN ( SELECT @@ -681,11 +677,9 @@ function splitSqlAfter(idCol: "user_id" | "team_id", withAnonFilter: boolean): s } const ROUTE_QUERIES_AFTER: RouteQuery[] = [ - // Option A fix: add the lower `event_at >= since` bound so the outer - // GROUP BY user_id only sees users active in the 30-day window. { name: "loadUsersByCountry", - desc: "A: 30-day window added (was unbounded scan of all $token-refresh history)", + desc: "30-day window added (was unbounded scan)", sql: ` SELECT country_code, @@ -1076,33 +1070,13 @@ const ROUTE_QUERIES_OPTIMIZED: RouteQuery[] = [ }, ]; -// ── Backfill comparison (option A / B / C / D / E) ─────────────────────────── -// -// Each backfill option (A-E from the analysis doc) leaves the metrics queries -// in one of three structurally-distinct shapes. We benchmark the three shapes -// against the same seeded dataset. -// -// A → bounded LEFT JOIN (event_at >= since on the inner subquery). -// Still has the join, but the GROUP BY hash table only contains -// users with a token-refresh in the last 30d. -// B / C / E → drop the join, classify from e.data.is_anonymous (JSON access). -// B = argMax-latest semantics (matches today). -// C = ASOF event-at-time semantics. -// E = same as B but via partition swap. -// All three produce IDENTICAL post-backfill query SQL — the -// only differences are in *how the data got there*, not how -// the metrics query reads it. -// D → drop the join, classify from a top-level Nullable(UInt8) column. -// Same shape as B/C/E but skips the per-row JSON parse. -// -// To bench option D we need a real top-level column. We add it under a unique -// name (BENCH_OPTION_D_COLUMN) before seeding and drop it on cleanup. - -// Scoped to RUN_ID so concurrent bench runs don't collide on the shared -// analytics_internal.events schema, and so a SIGKILL'd run that skips -// cleanup leaks a uniquely-named column instead of clobbering a future run's -// `bench_is_anon_d`. Cleanup still drops via IF EXISTS, and the bench's row -// filter (project_id = BENCH_PROJECT_ID) keeps mutation cost trivial. +// ── Backfill comparison (BENCH_BACKFILL_COMPARE=1) ────────────────────────── +// Compares post-backfill query shapes: A (bounded join, no backfill), B/C/E +// (drop join, classify from data.is_anonymous), D (drop join, classify from +// a top-level column). B/C/E produce identical query SQL; they differ only in +// how the data gets stamped. + +// Scoped to RUN_ID so concurrent runs / SIGKILL-leaked columns don't collide. const BENCH_OPTION_D_COLUMN = `bench_is_anon_d_${RUN_ID.replace(/-/g, "_")}`; const analyticsUserJoinBounded = ` @@ -1455,9 +1429,8 @@ async function benchmarkBackfillCompare(now: Date): Promise { } async function runRouteQuery(rq: RouteQuery, p: QueryParams, now: Date, opts: { useMetricsClient?: boolean } = {}): Promise { - // `useMetricsClient` runs through getClickhouseAdminClientForMetrics, which - // applies the connection-level SETTINGS (caps + grace_hash) — so AFTER - // measurements reflect what actually ships in route.tsx, not the raw SQL. + // `useMetricsClient` applies the route.tsx connection-level SETTINGS so AFTER + // measurements reflect what actually ships, not the raw SQL. const client = opts.useMetricsClient ? getClickhouseAdminClientForMetrics() : getClickhouseAdminClient(); const queryId = `bench-route-${rq.name.replace(/[^a-z0-9]/gi, "-")}-${randomUUID()}`; const baseParams: Record = { @@ -1479,28 +1452,9 @@ async function runRouteQuery(rq: RouteQuery, p: QueryParams, now: Date, opts: { } // ── Join algorithm comparison (BENCH_JOIN_ALGO_COMPARE=1) ───────────────────── -// -// For each of ClickHouse's 6 join algorithms, run the 3 analyticsOverview -// queries under two cases: -// -// normal = bounded analyticsUserJoin (the SQL we just shipped). Small build -// side; isolated from cumulative-history scan. -// heavy = UNBOUNDED analyticsUserJoin (pre-fix). Large build side; the -// pattern that caused Sentry STACK-BACKEND-16H. -// -// Algorithms: -// default - leave the cluster default (typically `direct,parallel_hash,hash`) -// direct - KV lookup on right side; only works if right is a Dictionary -// (it isn't here, so we expect this to fall back or error) -// hash - classic single-threaded hash join -// parallel_hash - parallel build, fastest on small/medium, most memory -// grace_hash - partitioned hash, spills to disk -// full_sorting_merge - sorts both sides; can beat hash when input is huge -// partial_merge - sorts only the right side; lowest memory, slowest -// -// Some algorithms will error or be silently rejected for shapes they can't -// handle (e.g. `direct` requires a Dictionary). Errors are caught and shown -// as `ERR` in the output table. +// Runs each of ClickHouse's 6 join_algorithm values against the bounded and +// unbounded analyticsUserJoin shapes. Algorithms that don't apply (e.g. +// `direct` without a Dictionary right side) error and show as ERR. const ANALYTICS_OVERVIEW_QUERY_NAMES = [ "analyticsOverview:dailyEvents", @@ -1799,11 +1753,8 @@ async function benchmarkRouteQueries(now: Date): Promise { return out; } - // Also capture the actual row payload so we can check correctness for OPT - // variants (e.g., dropping the LEFT JOIN on analyticsOverview must not change counts). - // `useMetricsClient` routes through getClickhouseAdminClientForMetrics so the - // connection-level SETTINGS (caps + grace_hash) apply — used for AFTER to - // mirror what actually ships. + // Also capture row payloads to spot-check OPT variants (e.g. dropping the + // LEFT JOIN must not change counts). async function runAndCollect(list: RouteQuery[], opts: { useMetricsClient?: boolean } = {}): Promise<{ stats: Map, payloads: Map }> { const stats = new Map(); const payloads = new Map(); @@ -1999,9 +1950,7 @@ type QueryStats = { async function readStats(queryId: string): Promise { const client = getClickhouseAdminClient(); await client.command({ query: "SYSTEM FLUSH LOGS" }); - // Total budget ~12.7s. With many heavy queries running in sequence the - // query_log async flush can take longer than the original 3.1s budget, which - // was producing spurious "no query_log row" failures at the 300k-user scale. + // ~12.7s budget; the async query_log flush can lag at 300k-user scale. const delays = [100, 200, 400, 800, 1600, 3200, 6400]; for (let i = 0; i <= delays.length; i++) { const res = await client.query({ @@ -2336,11 +2285,8 @@ async function seedPerf(now: Date): Promise { ); const batchRows = envInt("BENCH_BATCH", 50_000); - // BENCH_HISTORICAL_DAYS extends the event seed back beyond the 30-day metrics - // window so unbounded-scan queries (loadUsersByCountry, the LEFT JOIN in the - // splits) read more data than windowed queries — mirroring the prod skew that - // caused Sentry STACK-BACKEND-16H. Default 365 days. Set to 30 to recover the - // original behavior. + // BENCH_HISTORICAL_DAYS extends the seed beyond the 30-day metrics window so + // unbounded-scan queries read more rows than windowed ones (default 365). const historicalDays = envInt("BENCH_HISTORICAL_DAYS", 365); const windowEnd = new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate()) + ONE_DAY_MS); const windowStart = new Date(windowEnd.getTime() - historicalDays * 24 * 60 * 60 * 1000); diff --git a/apps/backend/src/app/api/latest/internal/metrics/route.tsx b/apps/backend/src/app/api/latest/internal/metrics/route.tsx index f0d17ce170..959a60aad7 100644 --- a/apps/backend/src/app/api/latest/internal/metrics/route.tsx +++ b/apps/backend/src/app/api/latest/internal/metrics/route.tsx @@ -30,9 +30,6 @@ const MAX_USERS_FOR_COUNTRY_SAMPLE = 10_000; const METRICS_WINDOW_DAYS = 30; const METRICS_WINDOW_MS = METRICS_WINDOW_DAYS * 24 * 60 * 60 * 1000; const ONE_DAY_MS = 24 * 60 * 60 * 1000; -// All ClickHouse queries in this file run through `getClickhouseAdminClientForMetrics`, -// which configures memory caps + GROUP BY spill defaults at the connection level — -// see lib/clickhouse.tsx for the rationale. export const METRICS_REVENUE_INVOICE_STATUSES = ["paid", "succeeded"] as const; const METRICS_REVENUE_INVOICE_STATUSES_SQL = Prisma.raw( METRICS_REVENUE_INVOICE_STATUSES.map((status) => `'${status}'`).join(", "), @@ -74,14 +71,8 @@ function normalizeUuidFromEvent(value: string): string | null { const MAU_UUID_V4_REGEX = UUID_V4_JS_RE.source; async function loadUsersByCountry(tenancy: Tenancy, now: Date, includeAnonymous: boolean = false): Promise> { - // Bound the scan to the 30-day metrics window. Before this lower bound, - // the inner `GROUP BY user_id` materialized one row per ever-seen user for - // the tenant — see Sentry STACK-BACKEND-16H, where this query was a top - // memory consumer on tenants with many months of $token-refresh history. - // Users without a token-refresh in the last 30 days are no longer counted; - // token-refresh fires every few minutes for each open session, so this only - // omits users with no active sessions in the last 30 days, which is the - // intent of a "metrics window" anyway. + // Without the 30-day bound the inner GROUP BY materializes one row per + // ever-seen user for the tenant. const { since, untilExclusive } = getMetricsWindowBounds(now); const clickhouseClient = getClickhouseAdminClientForMetrics(); const res = await clickhouseClient.query({ @@ -434,18 +425,9 @@ async function loadDailyActiveSplitFromClickhouse(options: { // against the alias (assumeNotNull returns '' for NULLs, which passes the // not-null test) and phantom rows slip through. // - // The LEFT JOIN's `min(event_at)` subquery below scans all $token-refresh - // history per request (no `event_at >= ...` lower bound) — same unbounded - // pattern as Sentry STACK-BACKEND-16H, and was ranked #5/#6 by peak memory - // in the benchmark. It can't be bounded the same way as the other queries - // because `first_date` is used to classify entities as new vs reactivated: - // bounding the subquery to 30 days would cause entities first seen >30d ago - // and active today to be (incorrectly) reported as "new". The proper fix - // requires either materializing a per-entity "first-seen-ever" table or - // shipping the option C backfill so we can read first_date from the event - // stream cheaply. For now the per-user memory cap configured on - // getClickhouseAdminClientForMetrics (see lib/clickhouse.tsx) spills the - // GROUP BY hash table and caps the query if it runs out of memory. + // The LEFT JOIN's `min(event_at)` subquery below is intentionally unbounded: + // bounding it would reclassify entities first seen >30d ago and active today + // as "new" instead of "reactivated". const result = await clickhouseClient.query({ query: ` SELECT @@ -1093,37 +1075,12 @@ async function loadAnalyticsOverview(tenancy: Tenancy, now: Date, includeAnonymo } | null = null; try { - // BAND-AID for Sentry STACK-BACKEND-16H. The inner subquery used to scan - // ALL token-refresh history (only an upper bound `event_at < untilExclusive`), - // which materialized one row per ever-seen user in the GROUP BY hash table. - // On tenants with months of $token-refresh data this was the top memory - // consumer at /internal/metrics, and three of the queries below pay this - // cost. Adding the lower `event_at >= since` bound shrinks the hash table - // to "users with a token-refresh in the last 30 days", which is the same - // population the outer page-view/click query is already restricted to. - // - // Edge case: an anonymous page-view by a user who has not token-refreshed - // in the last 30 days will now fall through to `coalesce(..., 0)` and be - // silently classified non-anonymous. Token-refresh fires every few minutes - // per active session, so this is rare — but it is not impossible (e.g. - // embedded SDKs that poll less frequently, sessions that straddle the - // 30-day boundary, or anonymous users on cold tenancies). The PR shipping - // this fix quantifies the expected drift; option C (below) removes the - // edge case entirely. - // - // PERMANENT FIX (option C — see PR description and - // /tmp/internal-metrics-oom-analysis.html for the full write-up): - // 1. Stamp `is_anonymous` onto `$page-view`/`$click` events at insert - // time in apps/backend/src/app/api/latest/analytics/events/batch/route.tsx - // (the events/batch handler already has `auth.user` in scope). - // 2. Backfill historical page-view/click events via partition-by-partition - // INSERT SELECT with an ASOF LEFT JOIN to each event's preceding - // $token-refresh, producing event-at-time semantics. - // 3. Delete this LEFT JOIN entirely; the `coalesce` in the filter below - // short-circuits on the first non-null argument so the join's - // `latest_is_anonymous` becomes dead code. - // Benchmarked memory profiles for each option are in - // apps/backend/scripts/benchmark-internal-metrics.ts (BENCH_BACKFILL_COMPARE=1). + // The `event_at >= since` bound on the inner subquery is load-bearing: + // without it the GROUP BY hash table holds one row per ever-seen user. + // Edge case: anonymous page-views by users with no token-refresh in the + // last 30 days now coalesce to non-anonymous. The proper fix is to stamp + // `is_anonymous` on page-view/click events at ingest and drop this join + // entirely (the coalesce below short-circuits on the first non-null arg). const analyticsUserJoin = ` LEFT JOIN ( SELECT diff --git a/apps/backend/src/lib/clickhouse.tsx b/apps/backend/src/lib/clickhouse.tsx index 2e27799581..03297fd99e 100644 --- a/apps/backend/src/lib/clickhouse.tsx +++ b/apps/backend/src/lib/clickhouse.tsx @@ -34,54 +34,17 @@ export function getClickhouseExternalClient() { return createClickhouseClient("external", getEnvVariable("STACK_CLICKHOUSE_DATABASE", "default")); } -// Safety net against the ClickHouse OvercommitTracker killing heavy analytical -// reads (Sentry STACK-BACKEND-16H). Use this client for any handler that fans -// out multiple `GROUP BY user_id`-style queries against `analytics_internal.events`. -// Values are decimal bytes (ClickHouse's native interpretation of digit strings); -// 1 GB = 1_000_000_000. -// -// max_bytes_before_external_group_by (6 GB ≈ 5.59 GiB) -// GROUP BY hash tables spill to disk when they exceed this size instead of -// growing without bound. Targets the unbounded `GROUP BY user_id` patterns -// (analyticsUserJoin, loadUsersByCountry, the splits) which were -// materializing one row per ever-seen user. -// -// max_memory_usage (8 GB ≈ 7.45 GiB) -// Hard per-query ceiling. -// -// max_memory_usage_for_user (9 GB ≈ 8.38 GiB) -// Per-user aggregate across concurrent queries — the bound that actually -// protects against the OvercommitTracker. /internal/metrics fans out to -// ~12 ClickHouse queries via nested Promise.all; a per-query cap alone -// (12 × 8 GB = 96 GB theoretical) doesn't prevent the cluster-wide kill. -// This per-user cap forces the fan-out to share a single 9 GB budget -// against the cluster's 10.8 GiB ceiling. The trade vs no-cap: a single -// query above 9 GB now fails with a clean "memory limit exceeded" error -// rather than triggering an OvercommitTracker kill of a random concurrent -// query. -// -// join_algorithm = 'grace_hash,parallel_hash,hash' -// ClickHouse picks the first applicable algorithm from this list. For our -// analyticsUserJoin shape (LEFT JOIN with a GROUP BY subquery as the -// build side), grace_hash partitions the build side instead of building -// one giant hash table — measured ~48% memory reduction vs the default -// parallel_hash, at zero latency cost (benchmark: BENCH_JOIN_ALGO_COMPARE=1 -// in scripts/benchmark-internal-metrics.ts). parallel_hash and hash are -// fallbacks for join shapes grace_hash doesn't support; ClickHouse falls -// back automatically. -// -// These are belt-and-suspenders. The actual fix for the OOM is bounding the -// `event_at` scans (option A) and eventually backfilling `is_anonymous` onto -// page-view/click events (option C) — see /internal/metrics/route.tsx for the -// long-form discussion. +// Safety net for heavy analytical reads against `analytics_internal.events`: +// per-user memory cap bounds the whole `Promise.all` fan-out (~12 queries) to +// one shared budget against the cluster's 10.8 GiB OvercommitTracker, GROUP BY +// spills to disk before hitting the cap, and grace_hash partitions large join +// build sides instead of allocating one giant hash table. Values are decimal +// bytes (how ClickHouse parses digit strings). export const METRICS_CLICKHOUSE_SETTINGS: ClickHouseSettings = { max_bytes_before_external_group_by: "6000000000", max_memory_usage: "8000000000", max_memory_usage_for_user: "9000000000", - // The @clickhouse/client-common JoinAlgorithm type is a union of single - // algorithm names, but the ClickHouse server accepts a comma-separated - // fallback list (tries each in order). The cast widens the SDK type so we - // can use the fallback form, which is what we actually want for safety. + // SDK type narrows to a single algorithm; the server accepts a fallback list. join_algorithm: "grace_hash,parallel_hash,hash" as ClickHouseSettings["join_algorithm"], }; From a24be2060296d811106e5dc1675e1c594784ce2e Mon Sep 17 00:00:00 2001 From: Bilal Godil Date: Wed, 20 May 2026 15:38:21 -0700 Subject: [PATCH 3/3] Address PR review: 50% spill threshold, log on tryRun catch, document user-level cap - Drop max_bytes_before_external_group_by from 6 GB to 4 GB to match ClickHouse's 50%-of-max_memory_usage recommendation, leaving headroom for the post-spill merge phase. - Replace the bare catch in tryRunAndReadStats with one that logs the error (AGENTS.md: NEVER try-catch-all). - Clarify the METRICS_CLICKHOUSE_SETTINGS comment: max_memory_usage_for_user is enforced per ClickHouse user, so the budget is shared with all other admin queries through the same connecting user. --- .../scripts/benchmark-internal-metrics.ts | 8 +++++--- apps/backend/src/lib/clickhouse.tsx | 17 +++++++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/apps/backend/scripts/benchmark-internal-metrics.ts b/apps/backend/scripts/benchmark-internal-metrics.ts index 9589f4e49f..d0d6338e84 100644 --- a/apps/backend/scripts/benchmark-internal-metrics.ts +++ b/apps/backend/scripts/benchmark-internal-metrics.ts @@ -1567,13 +1567,15 @@ function buildAnalyticsOverviewVariant(opts: { ]; } -// Run a query and either return its QueryStats, or null if the query errored -// (e.g. an unsupported `join_algorithm` for the query's shape). +// Run a query; on failure log the error and return null so the join-algo +// matrix can show ERR for shapes a given algorithm doesn't support (e.g. +// `direct` without a Dictionary right side). async function tryRunAndReadStats(rq: RouteQuery, p: QueryParams, now: Date): Promise { try { const qid = await runRouteQuery(rq, p, now); return await readStats(qid); - } catch { + } catch (e) { + console.warn(` [bench] query "${rq.name}" failed: ${e instanceof Error ? e.message : String(e)}`); return null; } } diff --git a/apps/backend/src/lib/clickhouse.tsx b/apps/backend/src/lib/clickhouse.tsx index 03297fd99e..19ff78aeb4 100644 --- a/apps/backend/src/lib/clickhouse.tsx +++ b/apps/backend/src/lib/clickhouse.tsx @@ -35,13 +35,18 @@ export function getClickhouseExternalClient() { } // Safety net for heavy analytical reads against `analytics_internal.events`: -// per-user memory cap bounds the whole `Promise.all` fan-out (~12 queries) to -// one shared budget against the cluster's 10.8 GiB OvercommitTracker, GROUP BY -// spills to disk before hitting the cap, and grace_hash partitions large join -// build sides instead of allocating one giant hash table. Values are decimal -// bytes (how ClickHouse parses digit strings). +// GROUP BY spills to disk at ~50% of the per-query cap (leaving headroom for +// the post-spill merge), grace_hash partitions large join build sides instead +// of allocating one giant hash table, and the per-user cap bounds total +// concurrent memory against the cluster's 10.8 GiB OvercommitTracker. Values +// are decimal bytes (how ClickHouse parses digit strings). +// +// Note: max_memory_usage_for_user is enforced ClickHouse-side per *connecting +// user* (the shared `stackframe` admin), so all admin queries — not just this +// client's — count toward the same 9 GB budget. With the 30-day bounds each +// metrics query peaks well under 100 MiB, so practical interference is low. export const METRICS_CLICKHOUSE_SETTINGS: ClickHouseSettings = { - max_bytes_before_external_group_by: "6000000000", + max_bytes_before_external_group_by: "4000000000", max_memory_usage: "8000000000", max_memory_usage_for_user: "9000000000", // SDK type narrows to a single algorithm; the server accepts a fallback list.