Skip to content

Commit 34ab1d2

Browse files
fix(logs): run PII redaction over HTTP and fix Presidio provisioning
- resolve the guardrails venv via candidate paths and fail fast instead of silently falling back to the system python3 (that fallback surfaced as a misleading "Presidio not installed" error, which broke redaction and the guardrails block in deployed runtimes)
- install the en_core_web_lg spaCy model in setup.sh and app.Dockerfile
- route log redaction through an internal /api/guardrails/mask-batch endpoint so Presidio always runs in the app container, including for async executions that persist inside the trigger.dev runtime
1 parent 7349bf4 commit 34ab1d2

10 files changed

Lines changed: 213 additions & 22 deletions

File tree

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/**
2+
* @vitest-environment node
3+
*/
4+
import { createMockRequest } from '@sim/testing'
5+
import { beforeEach, describe, expect, it, vi } from 'vitest'
6+
7+
const { mockCheckInternalAuth, mockMaskPIIBatch } = vi.hoisted(() => ({
8+
mockCheckInternalAuth: vi.fn(),
9+
mockMaskPIIBatch: vi.fn(),
10+
}))
11+
12+
vi.mock('@/lib/auth/hybrid', () => ({
13+
checkInternalAuth: mockCheckInternalAuth,
14+
}))
15+
16+
vi.mock('@/lib/guardrails/validate_pii', () => ({
17+
maskPIIBatch: mockMaskPIIBatch,
18+
}))
19+
20+
import { POST } from '@/app/api/guardrails/mask-batch/route'
21+
22+
/**
 * Route tests for the internal mask-batch endpoint. `checkInternalAuth` and
 * `maskPIIBatch` are replaced by the hoisted mocks, so these cases only cover
 * the handler's own wiring: auth gate, body validation, and pass-through.
 */
describe('POST /api/guardrails/mask-batch', () => {
  /** Sends a POST with the given JSON body straight to the route handler. */
  const post = (body: Record<string, unknown>) => POST(createMockRequest('POST', body))

  beforeEach(() => {
    vi.clearAllMocks()
    // Default to an authorized caller and a masker that tags each input, so
    // both ordering and pass-through are observable in the response.
    mockCheckInternalAuth.mockResolvedValue({ success: true })
    mockMaskPIIBatch.mockImplementation(async (texts: string[]) => {
      const tagged: string[] = []
      for (const text of texts) {
        tagged.push(`M(${text})`)
      }
      return tagged
    })
  })

  it('returns 401 without internal auth', async () => {
    mockCheckInternalAuth.mockResolvedValue({
      success: false,
      error: 'Internal authentication required',
    })

    const response = await post({ texts: ['a@b.com'], entityTypes: ['EMAIL_ADDRESS'] })

    expect(response.status).toBe(401)
    expect(mockMaskPIIBatch).not.toHaveBeenCalled()
  })

  it('masks the batch in-process and preserves order', async () => {
    const response = await post({
      texts: ['a@b.com', 'hello'],
      entityTypes: ['EMAIL_ADDRESS'],
      language: 'en',
    })

    expect(response.status).toBe(200)
    const payload = await response.json()
    expect(payload.masked).toEqual(['M(a@b.com)', 'M(hello)'])
    expect(mockMaskPIIBatch).toHaveBeenCalledWith(['a@b.com', 'hello'], ['EMAIL_ADDRESS'], 'en')
  })

  it('rejects an invalid body with 400', async () => {
    const response = await post({ texts: 'not-an-array', entityTypes: [] })

    expect(response.status).toBe(400)
    expect(mockMaskPIIBatch).not.toHaveBeenCalled()
  })
})
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import { createLogger } from '@sim/logger'
2+
import { type NextRequest, NextResponse } from 'next/server'
3+
import { guardrailsMaskBatchContract } from '@/lib/api/contracts'
4+
import { parseRequest } from '@/lib/api/server'
5+
import { checkInternalAuth } from '@/lib/auth/hybrid'
6+
import { withRouteHandler } from '@/lib/core/utils/with-route-handler'
7+
import { maskPIIBatch } from '@/lib/guardrails/validate_pii'
8+
9+
const logger = createLogger('GuardrailsMaskBatchAPI')
10+
11+
/**
 * Internal batch PII masking endpoint.
 *
 * The log-redaction persist path runs in both the Next.js server and the
 * trigger.dev runtime, but Presidio (Python venv) lives only in the app
 * container. Redaction therefore calls this endpoint server-to-server
 * (internal JWT) so Presidio stays centralized here.
 *
 * Responds 401 when internal auth fails, returns the contract parser's own
 * response when the body is invalid, and otherwise replies with `{ masked }`.
 */
export const POST = withRouteHandler(async (request: NextRequest) => {
  const { success: authorized } = await checkInternalAuth(request, { requireWorkflowId: false })
  if (!authorized) {
    return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
  }

  const parsed = await parseRequest(guardrailsMaskBatchContract, request, {})
  if (!parsed.success) {
    return parsed.response
  }

  const body = parsed.data.body
  const masked = await maskPIIBatch(body.texts, body.entityTypes, body.language)

  // Log only the batch size, never the texts themselves.
  logger.info('Masked PII batch', { count: body.texts.length })
  return NextResponse.json({ masked })
})

apps/sim/lib/api/contracts/hotspots.ts

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,34 @@ export const guardrailsValidateContract = defineRouteContract({
4545
},
4646
})
4747

48+
/**
 * Request body for the mask-batch route: at most 100,000 `texts`, at most 200
 * non-empty `entityTypes`, and an optional `language` string of 1–20 characters.
 */
const guardrailsMaskBatchBodySchema = z.object({
49+
texts: z.array(z.string()).max(100_000),
50+
entityTypes: z.array(z.string().min(1, 'Entity type cannot be empty')).max(200),
51+
language: z.string().min(1).max(20).optional(),
52+
})
53+
54+
/**
 * Response body for the mask-batch route: `masked`, an array of strings. The
 * schema itself does not constrain the array's length.
 */
const guardrailsMaskBatchResponseSchema = z.object({
55+
masked: z.array(z.string()),
56+
})
57+
58+
/**
 * Internal batch PII masking. Called server-to-server (internal JWT) from the
 * log-redaction persist path so Presidio always runs in the app container,
 * including for async executions that persist inside the trigger.dev runtime.
 */
63+
export const guardrailsMaskBatchContract = defineRouteContract({
64+
method: 'POST',
65+
path: '/api/guardrails/mask-batch',
66+
body: guardrailsMaskBatchBodySchema,
67+
response: {
68+
mode: 'json',
69+
schema: guardrailsMaskBatchResponseSchema,
70+
},
71+
})
72+
73+
/** Input (pre-parse) type of the mask-batch request body. */
export type GuardrailsMaskBatchBody = z.input<typeof guardrailsMaskBatchBodySchema>
74+
/** Output (parsed) type of the mask-batch response body. */
export type GuardrailsMaskBatchResult = z.output<typeof guardrailsMaskBatchResponseSchema>
75+
4876
const chatMessageSchema = z.object({
4977
role: z.enum(['user', 'assistant', 'system']),
5078
content: z.string(),
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import type { GuardrailsMaskBatchResult } from '@/lib/api/contracts'
2+
import { generateInternalToken } from '@/lib/auth/internal'
3+
import { getInternalApiBaseUrl } from '@/lib/core/utils/urls'
4+
5+
/**
 * Mask PII across many strings via the internal app-container endpoint.
 *
 * Presidio (a Python venv) only exists in the app container, but the
 * log-redaction persist path also runs inside the trigger.dev runtime — so
 * redaction always routes through HTTP, the same way the guardrails tool does.
 *
 * The endpoint is expected to return one masked string per input, in order.
 * This function verifies the response length and that every element is a
 * string; it cannot verify ordering.
 *
 * An empty `texts` array resolves to `[]` without minting a token or making a
 * request.
 *
 * @param texts - Strings to mask.
 * @param entityTypes - Entity type names forwarded to the endpoint.
 * @param language - Optional language, forwarded as-is (dropped from the JSON
 *   body when undefined).
 * @returns The masked strings; same length as `texts`.
 * @throws Error on any non-2xx response, or when the response body is not
 *   `{ masked: string[] }` with `masked.length === texts.length`, so the caller
 *   can apply its own fail-safe (scrubbing rather than leaking).
 */
export async function maskPIIBatchViaHttp(
  texts: string[],
  entityTypes: string[],
  language?: string
): Promise<string[]> {
  // Nothing to mask: skip the token mint and the network round trip.
  if (texts.length === 0) {
    return []
  }

  const token = await generateInternalToken()
  const url = `${getInternalApiBaseUrl()}/api/guardrails/mask-batch`

  // boundary-raw-fetch: internal server-to-server call to the app container (internal JWT auth, configurable base URL)
  const response = await fetch(url, {
    method: 'POST',
    headers: {
      'content-type': 'application/json',
      authorization: `Bearer ${token}`,
    },
    body: JSON.stringify({ texts, entityTypes, language }),
  })

  if (!response.ok) {
    // Best-effort detail; truncated so a large error page cannot bloat the message.
    const detail = await response.text().catch(() => '')
    throw new Error(`PII mask-batch request failed (${response.status}): ${detail.slice(0, 200)}`)
  }

  // The body is unvalidated JSON: it may be null, a non-object, or carry
  // non-string entries. Treat every field as possibly absent and check the
  // full shape before handing values back to the log-redaction path.
  const data = (await response.json()) as Partial<GuardrailsMaskBatchResult> | null
  const masked = data?.masked
  if (
    !Array.isArray(masked) ||
    masked.length !== texts.length ||
    !masked.every((item) => typeof item === 'string')
  ) {
    throw new Error('PII mask-batch returned an unexpected result')
  }
  return masked
}

apps/sim/lib/guardrails/setup.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ source "$VENV_DIR/bin/activate"
3030
pip install --upgrade pip
3131
pip install -r "$SCRIPT_DIR/requirements.txt"
3232

33+
# Presidio's default AnalyzerEngine loads the en_core_web_lg spaCy model; it is
34+
# not a pip dependency, so download the version compatible with the installed spaCy.
35+
echo "Downloading spaCy model (en_core_web_lg)..."
36+
python -m spacy download en_core_web_lg
37+
3338
echo ""
3439
echo "✅ Setup complete! Guardrails validators are ready to use."
3540
echo ""

apps/sim/lib/guardrails/validate_pii.ts

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,33 @@ const DEFAULT_TIMEOUT = 30000 // 30 seconds
1313
*/
1414
const PII_CHUNK_MAX_BYTES = 256 * 1024
1515

16+
/**
 * Resolve the guardrails Presidio interpreter + script path.
 *
 * `process.cwd()` is not stable across runtimes — the Next standalone container
 * launches from the monorepo root while local dev and some paths run from
 * `apps/sim` — so probe both layouts (mirrors the candidate-path resolution in
 * `lib/execution/isolated-vm.ts`).
 *
 * A candidate directory is accepted only when it contains BOTH the bundled
 * venv interpreter (`venv/bin/python3`) and `validate_pii.py`. The first such
 * directory wins, monorepo-root layout first.
 *
 * @returns Absolute paths of the venv interpreter and the validator script.
 * @throws Error when no candidate has the venv. There is deliberately no
 *   fallback to the system `python3`, which has no Presidio and reports a
 *   misleading "not installed".
 * @throws Error when a venv exists but `validate_pii.py` is not next to it, so
 *   the problem surfaces here instead of as an opaque spawn failure.
 */
function resolveGuardrailsPython(): { pythonCmd: string; scriptPath: string } {
  const cwd = process.cwd()
  // Build each candidate's paths once; reused for probing and error messages.
  const candidates = [
    path.join(cwd, 'apps', 'sim', 'lib', 'guardrails'),
    path.join(cwd, 'lib', 'guardrails'),
  ].map((dir) => ({
    pythonCmd: path.join(dir, 'venv', 'bin', 'python3'),
    scriptPath: path.join(dir, 'validate_pii.py'),
  }))

  const withVenv = candidates.filter((candidate) => fs.existsSync(candidate.pythonCmd))
  const usable = withVenv.find((candidate) => fs.existsSync(candidate.scriptPath))
  if (usable) {
    return usable
  }

  if (withVenv.length > 0) {
    const missing = withVenv.map((candidate) => candidate.scriptPath).join(', ')
    throw new Error(
      `Guardrails Presidio venv found but validate_pii.py is missing (looked in ${missing}). Verify the checkout or image build includes apps/sim/lib/guardrails/validate_pii.py.`
    )
  }

  const probed = candidates.map((candidate) => candidate.pythonCmd).join(', ')
  throw new Error(
    `Guardrails Presidio venv not found (looked in ${probed}). Provision it with apps/sim/lib/guardrails/setup.sh locally, or verify the image build installs it.`
  )
}
42+
1643
export interface PIIValidationInput {
1744
text: string
1845
entityTypes: string[] // e.g., ["PERSON", "EMAIL_ADDRESS", "CREDIT_CARD"]
@@ -136,10 +163,7 @@ export async function maskPIIBatch(
136163
*/
137164
function runPythonScript<T>(payload: Record<string, unknown>): Promise<T> {
138165
return new Promise((resolve, reject) => {
139-
const guardrailsDir = path.join(process.cwd(), 'lib/guardrails')
140-
const scriptPath = path.join(guardrailsDir, 'validate_pii.py')
141-
const venvPython = path.join(guardrailsDir, 'venv/bin/python3')
142-
const pythonCmd = fs.existsSync(venvPython) ? venvPython : 'python3'
166+
const { pythonCmd, scriptPath } = resolveGuardrailsPython()
143167

144168
const python = spawn(pythonCmd, [scriptPath])
145169
let stdout = ''
@@ -208,14 +232,7 @@ async function executePythonPIIDetection(
208232
requestId: string
209233
): Promise<PIIValidationResult> {
210234
return new Promise((resolve, reject) => {
211-
// Use path relative to project root
212-
// In Next.js, process.cwd() returns the project root
213-
const guardrailsDir = path.join(process.cwd(), 'lib/guardrails')
214-
const scriptPath = path.join(guardrailsDir, 'validate_pii.py')
215-
const venvPython = path.join(guardrailsDir, 'venv/bin/python3')
216-
217-
// Use venv Python if it exists, otherwise fall back to system python3
218-
const pythonCmd = fs.existsSync(venvPython) ? venvPython : 'python3'
235+
const { pythonCmd, scriptPath } = resolveGuardrailsPython()
219236

220237
const python = spawn(pythonCmd, [scriptPath])
221238

apps/sim/lib/logs/execution/pii-redaction.test.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ const { mockMaskPIIBatch } = vi.hoisted(() => ({
77
mockMaskPIIBatch: vi.fn(),
88
}))
99

10-
vi.mock('@/lib/guardrails/validate_pii', () => ({
11-
maskPIIBatch: mockMaskPIIBatch,
10+
vi.mock('@/lib/guardrails/mask-client', () => ({
11+
maskPIIBatchViaHttp: mockMaskPIIBatch,
1212
}))
1313

1414
import { REDACTION_FAILED_MARKER, redactPIIFromExecution } from '@/lib/logs/execution/pii-redaction'

apps/sim/lib/logs/execution/pii-redaction.ts

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { createLogger } from '@sim/logger'
22
import { getErrorMessage } from '@sim/utils/errors'
3+
import { maskPIIBatchViaHttp } from '@/lib/guardrails/mask-client'
34

45
const logger = createLogger('PiiRedaction')
56

@@ -158,11 +159,9 @@ export async function redactPIIFromExecution(
158159
masked = collected.map(() => REDACTION_FAILED_MARKER)
159160
} else {
160161
try {
161-
// Lazy import keeps the Python-spawning guardrails module (child_process +
162-
// a `lib/guardrails` dir reference) out of the static middleware/RSC graph;
163-
// it's only loaded at runtime on the Node log-persist path.
164-
const { maskPIIBatch } = await import('@/lib/guardrails/validate_pii')
165-
masked = await maskPIIBatch(collected, entityTypes, language)
162+
// Presidio runs only in the app container; the persist path also runs in
163+
// the trigger.dev runtime, so masking always goes over HTTP to the app.
164+
masked = await maskPIIBatchViaHttp(collected, entityTypes, language)
166165
} catch (error) {
167166
logger.error('PII masking failed; scrubbing text to avoid leaking PII', {
168167
error: getErrorMessage(error),

docker/app.Dockerfile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,11 +118,14 @@ COPY --from=builder --chown=nextjs:nodejs /app/apps/sim/lib/execution/sandbox/bu
118118
COPY --from=builder --chown=nextjs:nodejs /app/apps/sim/lib/guardrails/requirements.txt ./apps/sim/lib/guardrails/requirements.txt
119119
COPY --from=builder --chown=nextjs:nodejs /app/apps/sim/lib/guardrails/validate_pii.py ./apps/sim/lib/guardrails/validate_pii.py
120120

121-
# Install Python dependencies with pip cache mount for faster rebuilds
121+
# Install Python dependencies with pip cache mount for faster rebuilds.
122+
# Presidio's default AnalyzerEngine loads en_core_web_lg, which is not a pip
123+
# dependency — download the spaCy model into the venv after installing Presidio.
122124
RUN --mount=type=cache,target=/root/.cache/pip \
123125
python3 -m venv ./apps/sim/lib/guardrails/venv && \
124126
./apps/sim/lib/guardrails/venv/bin/pip install --upgrade pip && \
125127
./apps/sim/lib/guardrails/venv/bin/pip install -r ./apps/sim/lib/guardrails/requirements.txt && \
128+
./apps/sim/lib/guardrails/venv/bin/python -m spacy download en_core_web_lg && \
126129
chown -R nextjs:nodejs /app/apps/sim/lib/guardrails
127130

128131
# Create .next/cache directory with correct ownership

scripts/check-api-validation-contracts.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ const QUERY_HOOKS_DIR = path.join(ROOT, 'apps/sim/hooks/queries')
99
const SELECTOR_HOOKS_DIR = path.join(ROOT, 'apps/sim/hooks/selectors')
1010

1111
const BASELINE = {
12-
totalRoutes: 859,
13-
zodRoutes: 859,
12+
totalRoutes: 860,
13+
zodRoutes: 860,
1414
nonZodRoutes: 0,
1515
} as const
1616

0 commit comments

Comments
 (0)