Skip to content

Commit ea505f0

Browse files
improvement(tables): versioned CSV snapshot cache for table mounts + parallel multipart uploader (#5108)
* improvement(tables): versioned CSV snapshot cache for table mounts + parallel multipart uploader * chore(db): drop colliding 0239 migration (renumber pending) * chore(db): renumber rows_version migration to 0240 (off staging's 0239) * improvement(tables): mount snapshots by presigned URL so the sandbox fetches directly (raise cap to 500MB) * fix(tables): allow url sandbox entries in the function-execute contract; key snapshot by column shape so schema edits invalidate it * chore(e2b): log sandbox inputs split by url-fetch vs inline write * improvement(tables): order export + snapshot rows by order_key so the CSV matches the grid under fractional ordering
1 parent c907b11 commit ea505f0

22 files changed

Lines changed: 17978 additions & 133 deletions

File tree

apps/sim/background/table-export.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@ import { runTableExport, type TableExportPayload } from '@/lib/table/export-runn
33

44
/**
55
* Trigger.dev wrapper around `runTableExport`. Retry-safe: a retried attempt regenerates the file
6-
* from scratch (failures clean up their partial upload), and the `table_jobs` ownership gate
7-
* stops a run that lost the job. `medium-1x` — the serialized file is buffered in memory before
8-
* the single-shot storage upload (~hundreds of MB worst case for enterprise 1M-row tables).
6+
* from scratch (failures abort/clean up their partial upload), and the `table_jobs` ownership gate
7+
* stops a run that lost the job. The file streams to storage in bounded multipart chunks (no longer
8+
* buffered whole), so `medium-1x` is now headroom rather than a hard requirement.
99
*/
1010
export const tableExportTask = task({
1111
id: 'table-export',

apps/sim/lib/api/contracts/hotspots.ts

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -162,11 +162,20 @@ export const functionExecuteContract = defineRouteContract({
162162
isCustomTool: z.boolean().optional().default(false),
163163
_sandboxFiles: z
164164
.array(
165-
z.object({
166-
path: z.string(),
167-
content: z.string(),
168-
encoding: z.literal('base64').optional(),
169-
})
165+
z.union([
166+
z.object({
167+
type: z.literal('content').optional(),
168+
path: z.string(),
169+
content: z.string(),
170+
encoding: z.literal('base64').optional(),
171+
}),
172+
// Mounted by reference: the sandbox fetches `url` itself (no bytes through the web tier).
173+
z.object({
174+
type: z.literal('url'),
175+
path: z.string(),
176+
url: z.string(),
177+
}),
178+
])
170179
)
171180
.optional(),
172181
}),
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
/**
2+
* @vitest-environment node
3+
*/
4+
import { beforeEach, describe, expect, it, vi } from 'vitest'
5+
6+
const {
7+
mockIsFeatureEnabled,
8+
mockGetTableById,
9+
mockListTables,
10+
mockQueryRows,
11+
mockGetOrCreateTableSnapshot,
12+
mockDownloadFile,
13+
mockGeneratePresignedDownloadUrl,
14+
mockHasCloudStorage,
15+
mockExecuteTool,
16+
} = vi.hoisted(() => ({
17+
mockIsFeatureEnabled: vi.fn(),
18+
mockGetTableById: vi.fn(),
19+
mockListTables: vi.fn(),
20+
mockQueryRows: vi.fn(),
21+
mockGetOrCreateTableSnapshot: vi.fn(),
22+
mockDownloadFile: vi.fn(),
23+
mockGeneratePresignedDownloadUrl: vi.fn(),
24+
mockHasCloudStorage: vi.fn(),
25+
mockExecuteTool: vi.fn(),
26+
}))
27+
28+
vi.mock('@/lib/core/config/feature-flags', () => ({ isFeatureEnabled: mockIsFeatureEnabled }))
29+
vi.mock('@/lib/table/service', () => ({
30+
getTableById: mockGetTableById,
31+
listTables: mockListTables,
32+
}))
33+
vi.mock('@/lib/table/rows/service', () => ({ queryRows: mockQueryRows }))
34+
vi.mock('@/lib/table/snapshot-cache', () => ({
35+
getOrCreateTableSnapshot: mockGetOrCreateTableSnapshot,
36+
SNAPSHOT_MAX_BYTES: 500 * 1024 * 1024,
37+
}))
38+
vi.mock('@/lib/uploads/core/storage-service', () => ({
39+
downloadFile: mockDownloadFile,
40+
generatePresignedDownloadUrl: mockGeneratePresignedDownloadUrl,
41+
hasCloudStorage: mockHasCloudStorage,
42+
}))
43+
vi.mock('@/tools', () => ({ executeTool: mockExecuteTool }))
44+
// Workspace-file + VFS surfaces are unused on the tables-only path; stub to avoid heavy loads.
45+
vi.mock('@/lib/uploads/contexts/workspace/workspace-file-manager', () => ({
46+
fetchWorkspaceFileBuffer: vi.fn(),
47+
findWorkspaceFileRecord: vi.fn(),
48+
getSandboxWorkspaceFilePath: vi.fn(),
49+
listWorkspaceFiles: vi.fn(),
50+
}))
51+
vi.mock('@/lib/uploads/contexts/workspace/workspace-file-folder-manager', () => ({
52+
listWorkspaceFileFolders: vi.fn(),
53+
}))
54+
vi.mock('@/lib/copilot/vfs/path-utils', () => ({
55+
decodeVfsPathSegments: (p: string) => p.split('/'),
56+
encodeVfsPathSegments: (s: string[]) => s.join('/'),
57+
}))
58+
vi.mock('@/lib/copilot/vfs/workflow-alias-resolver', () => ({
59+
resolveWorkflowAliasForWorkspace: vi.fn().mockResolvedValue(null),
60+
}))
61+
vi.mock('@/lib/copilot/vfs/workflow-aliases', () => ({
62+
isPlanAliasPath: () => false,
63+
workflowAliasSandboxPath: (p: string) => p,
64+
}))
65+
66+
import { executeFunctionExecute } from '@/lib/copilot/tools/handlers/function-execute'
67+
68+
const table = {
69+
id: 'tbl_1',
70+
workspaceId: 'ws_1',
71+
rowCount: 1000,
72+
schema: { columns: [{ id: 'col_name', name: 'name', type: 'string' }] },
73+
}
74+
75+
const context = { workspaceId: 'ws_1', userId: 'u1' }
76+
77+
function mountedFiles() {
78+
const params = mockExecuteTool.mock.calls[0][1] as {
79+
_sandboxFiles?: Array<{ path: string; type?: string; content?: string; url?: string }>
80+
}
81+
return params._sandboxFiles ?? []
82+
}
83+
84+
const snapshotCacheOn = (flag: string) => Promise.resolve(flag === 'table-snapshot-cache')
85+
86+
describe('executeFunctionExecute table mounts', () => {
87+
beforeEach(() => {
88+
vi.clearAllMocks()
89+
mockExecuteTool.mockResolvedValue({ success: true })
90+
mockGetTableById.mockResolvedValue(table)
91+
mockIsFeatureEnabled.mockResolvedValue(false)
92+
mockQueryRows.mockResolvedValue({ rows: [{ data: { name: 'Ada' } }] })
93+
mockHasCloudStorage.mockReturnValue(true)
94+
mockGeneratePresignedDownloadUrl.mockResolvedValue('https://s3.example/presigned?sig=abc')
95+
})
96+
97+
it('flag OFF: drains the table inline via queryRows (existing path)', async () => {
98+
await executeFunctionExecute({ inputTables: ['tbl_1'] }, context as never)
99+
100+
expect(mockQueryRows).toHaveBeenCalledTimes(1)
101+
expect(mockGetOrCreateTableSnapshot).not.toHaveBeenCalled()
102+
const files = mountedFiles()
103+
expect(files[0].path).toBe('/home/user/tables/tbl_1.csv')
104+
expect(files[0].content).toBe('name\nAda')
105+
})
106+
107+
it('flag ON + cloud storage: mounts by presigned URL, no bytes through web', async () => {
108+
mockIsFeatureEnabled.mockImplementation(snapshotCacheOn)
109+
mockGetOrCreateTableSnapshot.mockResolvedValue({
110+
key: 'table-snapshots/ws_1/tbl_1/v5.csv',
111+
size: 9,
112+
version: 5,
113+
})
114+
115+
await executeFunctionExecute({ inputTables: ['tbl_1'] }, context as never)
116+
117+
expect(mockGetOrCreateTableSnapshot).toHaveBeenCalledTimes(1)
118+
expect(mockQueryRows).not.toHaveBeenCalled()
119+
expect(mockDownloadFile).not.toHaveBeenCalled()
120+
expect(mockGeneratePresignedDownloadUrl).toHaveBeenCalledWith(
121+
'table-snapshots/ws_1/tbl_1/v5.csv',
122+
'execution',
123+
expect.any(Number)
124+
)
125+
expect(mountedFiles()[0]).toEqual({
126+
type: 'url',
127+
path: '/home/user/tables/tbl_1.csv',
128+
url: 'https://s3.example/presigned?sig=abc',
129+
})
130+
})
131+
132+
it('flag ON + local storage: falls back to a buffered content mount', async () => {
133+
mockIsFeatureEnabled.mockImplementation(snapshotCacheOn)
134+
mockHasCloudStorage.mockReturnValue(false)
135+
mockGetOrCreateTableSnapshot.mockResolvedValue({
136+
key: 'table-snapshots/ws_1/tbl_1/v5.csv',
137+
size: 9,
138+
version: 5,
139+
})
140+
mockDownloadFile.mockResolvedValue(Buffer.from('name\nAda\n'))
141+
142+
await executeFunctionExecute({ inputTables: ['tbl_1'] }, context as never)
143+
144+
expect(mockGeneratePresignedDownloadUrl).not.toHaveBeenCalled()
145+
expect(mockDownloadFile).toHaveBeenCalledWith(
146+
expect.objectContaining({ key: 'table-snapshots/ws_1/tbl_1/v5.csv', context: 'execution' })
147+
)
148+
const file = mountedFiles()[0]
149+
expect(file.path).toBe('/home/user/tables/tbl_1.csv')
150+
expect(file.content).toBe('name\nAda\n')
151+
expect(file.type).toBeUndefined()
152+
})
153+
154+
it('flag ON but small table stays on the inline path', async () => {
155+
mockIsFeatureEnabled.mockImplementation(snapshotCacheOn)
156+
mockGetTableById.mockResolvedValue({ ...table, rowCount: 10 })
157+
158+
await executeFunctionExecute({ inputTables: ['tbl_1'] }, context as never)
159+
160+
expect(mockGetOrCreateTableSnapshot).not.toHaveBeenCalled()
161+
expect(mockQueryRows).toHaveBeenCalledTimes(1)
162+
})
163+
164+
it('flag ON + cloud: throws when the snapshot exceeds the table mount limit', async () => {
165+
mockIsFeatureEnabled.mockImplementation(snapshotCacheOn)
166+
mockGetOrCreateTableSnapshot.mockResolvedValue({
167+
key: 'table-snapshots/ws_1/tbl_1/v5.csv',
168+
size: 600 * 1024 * 1024,
169+
version: 5,
170+
})
171+
172+
await expect(
173+
executeFunctionExecute({ inputTables: ['tbl_1'] }, context as never)
174+
).rejects.toThrow(/table mount limit/)
175+
expect(mockGeneratePresignedDownloadUrl).not.toHaveBeenCalled()
176+
})
177+
178+
it('flag ON + local: throws when the snapshot exceeds the per-file mount limit', async () => {
179+
mockIsFeatureEnabled.mockImplementation(snapshotCacheOn)
180+
mockHasCloudStorage.mockReturnValue(false)
181+
mockGetOrCreateTableSnapshot.mockResolvedValue({
182+
key: 'table-snapshots/ws_1/tbl_1/v5.csv',
183+
size: 20 * 1024 * 1024,
184+
version: 5,
185+
})
186+
187+
await expect(
188+
executeFunctionExecute({ inputTables: ['tbl_1'] }, context as never)
189+
).rejects.toThrow(/per-file mount limit/)
190+
expect(mockDownloadFile).not.toHaveBeenCalled()
191+
})
192+
193+
it('rejects a table that belongs to another workspace (tenant isolation)', async () => {
194+
mockGetTableById.mockResolvedValue({ ...table, workspaceId: 'ws_2' })
195+
196+
await expect(
197+
executeFunctionExecute({ inputTables: ['tbl_1'] }, context as never)
198+
).rejects.toThrow(/Input table not found/)
199+
expect(mockGetOrCreateTableSnapshot).not.toHaveBeenCalled()
200+
})
201+
})

apps/sim/lib/copilot/tools/handlers/function-execute.ts

Lines changed: 74 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,19 @@ import { isPlanAliasPath, workflowAliasSandboxPath } from '@/lib/copilot/vfs/wor
55
import { isFeatureEnabled } from '@/lib/core/config/feature-flags'
66
import { queryRows } from '@/lib/table/rows/service'
77
import { getTableById, listTables } from '@/lib/table/service'
8+
import { getOrCreateTableSnapshot, SNAPSHOT_MAX_BYTES } from '@/lib/table/snapshot-cache'
89
import { listWorkspaceFileFolders } from '@/lib/uploads/contexts/workspace/workspace-file-folder-manager'
910
import {
1011
fetchWorkspaceFileBuffer,
1112
findWorkspaceFileRecord,
1213
getSandboxWorkspaceFilePath,
1314
listWorkspaceFiles,
1415
} from '@/lib/uploads/contexts/workspace/workspace-file-manager'
16+
import {
17+
downloadFile,
18+
generatePresignedDownloadUrl,
19+
hasCloudStorage,
20+
} from '@/lib/uploads/core/storage-service'
1521
import { executeTool as executeAppTool } from '@/tools'
1622
import type { ToolExecutionContext, ToolExecutionResult } from '../../tool-executor/types'
1723

@@ -21,11 +27,22 @@ const MAX_FILE_SIZE = 10 * 1024 * 1024
2127
const MAX_TOTAL_SIZE = 50 * 1024 * 1024
2228
const MAX_MOUNTED_FILES = 500
2329

24-
interface SandboxFile {
25-
path: string
26-
content: string
27-
encoding?: 'base64'
28-
}
30+
/**
31+
* Below this row count a table mounts via the direct inline CSV path — the version-keyed snapshot
32+
* cache (storage round-trip) only pays off for larger/hot tables. Behind the feature flag either
33+
* way; this just keeps tiny one-shot tables on the cheaper path.
34+
*/
35+
const SNAPSHOT_MIN_ROWS = 500
36+
37+
/**
38+
* Lifetime of the presigned URL handed to the sandbox to fetch a snapshot. Long enough to download
39+
* a large file at sandbox startup; the URL grants read to only that one version-pinned object.
40+
*/
41+
const SNAPSHOT_URL_TTL_SECONDS = 600
42+
43+
type SandboxFile =
44+
| { type?: 'content'; path: string; content: string; encoding?: 'base64' }
45+
| { type: 'url'; path: string; url: string }
2946

3047
interface CanonicalFileInput {
3148
path: string
@@ -249,6 +266,7 @@ async function resolveInputFiles(
249266
const tablePathLookup = hasTablePathRefs
250267
? new Map((await listTables(workspaceId)).map((table) => [table.name, table]))
251268
: undefined
269+
const snapshotCacheEnabled = await isFeatureEnabled('table-snapshot-cache')
252270
for (const tableRef of inputTables) {
253271
const tableId =
254272
typeof tableRef === 'string'
@@ -263,6 +281,56 @@ async function resolveInputFiles(
263281
`Input table not found: "${tableId}". Pass the table id (tbl_...) from tables/{name}/meta.json, or a tables/{name}/meta.json path.`
264282
)
265283
}
284+
const sandboxPath =
285+
typeof tableRef === 'object' && tableRef !== null
286+
? (tableRef as CanonicalTableInput).sandboxPath
287+
: undefined
288+
const mountPath = sandboxPath || `/home/user/tables/${table.id}.csv`
289+
290+
// Large/hot tables mount by reference from a version-keyed CSV snapshot in object storage.
291+
if (snapshotCacheEnabled && table.rowCount >= SNAPSHOT_MIN_ROWS) {
292+
const snapshot = await getOrCreateTableSnapshot(table, 'copilot-fn-exec')
293+
294+
if (hasCloudStorage()) {
295+
// Mount by reference: the sandbox fetches the snapshot straight from storage via a
296+
// presigned URL, so the bytes never pass through the web process — the only ceiling is
297+
// sandbox disk (enforced at materialization by SNAPSHOT_MAX_BYTES).
298+
if (snapshot.size > SNAPSHOT_MAX_BYTES) {
299+
throw new Error(
300+
`Input table "${tableId}" is ${Math.round(snapshot.size / 1024 / 1024)}MB, over the ${SNAPSHOT_MAX_BYTES / 1024 / 1024}MB table mount limit.`
301+
)
302+
}
303+
const url = await generatePresignedDownloadUrl(
304+
snapshot.key,
305+
'execution',
306+
SNAPSHOT_URL_TTL_SECONDS
307+
)
308+
sandboxFiles.push({ type: 'url', path: mountPath, url })
309+
continue
310+
}
311+
312+
// Local storage: a presigned URL is an app-internal serve path a remote sandbox can't
313+
// reach, so fall back to buffering the bytes through the web process (file-mount guards).
314+
if (snapshot.size > MAX_FILE_SIZE) {
315+
throw new Error(
316+
`Input table "${tableId}" is ${Math.round(snapshot.size / 1024 / 1024)}MB, over the ${MAX_FILE_SIZE / 1024 / 1024}MB per-file mount limit.`
317+
)
318+
}
319+
if (totalSize + snapshot.size > MAX_TOTAL_SIZE) {
320+
throw new Error(
321+
`Mounting "${tableId}" would exceed the ${MAX_TOTAL_SIZE / 1024 / 1024}MB total mount limit. Mount fewer or smaller tables.`
322+
)
323+
}
324+
const buffer = await downloadFile({
325+
key: snapshot.key,
326+
context: 'execution',
327+
maxBytes: MAX_FILE_SIZE,
328+
})
329+
totalSize += buffer.length
330+
sandboxFiles.push({ path: mountPath, content: buffer.toString('utf-8') })
331+
continue
332+
}
333+
266334
const rows = await queryRows(table, {}, 'copilot-fn-exec')
267335

268336
const allKeys = new Set(table.schema.columns.map((column) => column.name))
@@ -290,14 +358,7 @@ async function resolveInputFiles(
290358
)
291359
}
292360
const csvContent = csvLines.join('\n')
293-
const sandboxPath =
294-
typeof tableRef === 'object' && tableRef !== null
295-
? (tableRef as CanonicalTableInput).sandboxPath
296-
: undefined
297-
sandboxFiles.push({
298-
path: sandboxPath || `/home/user/tables/${table.id}.csv`,
299-
content: csvContent,
300-
})
361+
sandboxFiles.push({ path: mountPath, content: csvContent })
301362
}
302363
}
303364

apps/sim/lib/core/config/env.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ export const env = createEnv({
7373
BILLING_ENABLED: z.boolean().optional(), // Enable billing enforcement and usage tracking
7474
FREE_API_DEPLOYMENT_GATE_ENABLED: z.boolean().optional(), // Block free-plan accounts from programmatic execution (API/MCP/A2A/generic webhooks/chat embeds). Requires BILLING_ENABLED. Off by default for dark rollout
7575
TABLES_FRACTIONAL_ORDERING: z.boolean().optional(), // Order table rows by fractional order_key (O(1) insert/delete) instead of integer position
76+
TABLE_SNAPSHOT_CACHE: z.boolean().optional(), // Mount tables into sandboxes by reference via a version-keyed CSV snapshot in object storage instead of draining the whole table into web-process heap
7677

7778
// Table feature limits (per plan). Apply when billing is disabled (free tier defaults) or for billed plans.
7879
FREE_TABLES_LIMIT: z.number().optional(), // Max user tables per workspace on free tier (default: 3)

apps/sim/lib/core/config/feature-flags.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,14 @@ const FEATURE_FLAGS = {
7474
'user context — use enabled:true for global rollout rather than per-user targeting.',
7575
fallback: 'MOTHERSHIP_BETA_FEATURES',
7676
},
77+
'table-snapshot-cache': {
78+
description:
79+
'Mount Sim tables into code sandboxes by reference via a version-keyed CSV snapshot in ' +
80+
'object storage (reused across runs until the table mutates) instead of draining the whole ' +
81+
'table into web-process heap. resolveInputFiles evaluates without user context — use ' +
82+
'enabled:true for global rollout rather than per-user targeting.',
83+
fallback: 'TABLE_SNAPSHOT_CACHE',
84+
},
7785
} satisfies Record<string, FeatureFlagDefinition>
7886

7987
/**

0 commit comments

Comments
 (0)