From 28ca06e56c350230cae6444348bbbcb25a962f80 Mon Sep 17 00:00:00 2001 From: Tofik Hasanov Date: Mon, 8 Jun 2026 14:53:37 -0400 Subject: [PATCH 1/2] fix(cloud-security): retry transient AssumeRole failures (first-scan "could not assume role") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Customer-reported: the first AWS scan fails with a "Could not assume AWS role" finding; switching the scan engine and back, then re-running, works — on every account, with no change in AWS. Customer correctly concluded it's not the role (it assumes fine the second time). Root cause (verified): assuming the customer's cross-account role is a two-hop STS flow with NO application-level retry, and the AWS SDK's default retry only covers throttling/5xx/network — NOT the classes that actually occur here: - IAM/STS eventual consistency on a new or just-edited role/trust-policy returns AccessDenied on the first assume and succeeds seconds later (no AWS change). - The base/roleAssumer session can briefly return ExpiredToken. So a single transient failure surfaced as a sticky finding. The "switch switch" is INCIDENTAL — updateMode only writes metadata (idempotent, no cache/cred refresh); what actually fixes it is the elapsed time before the retry. The visible finding is written by the integration-platform checks path (resolveAwsSessionOrFail in manifests/aws/checks/shared.ts), not the Cloud Tests scan (which returns success:false with no finding) — so both assume implementations are fixed. - new shared helper retryAssume / isRetryableAssumeError (manifests/aws/checks/assume-retry.ts): bounded exponential backoff + jitter, retries only transient / eventual-consistency classes (AccessDenied, ExpiredToken, IDPCommunicationError, throttling, 5xx, network). Hard config errors (bad ARN, ValidationError) and a persistently-denied role still propagate; backoff capped so a genuinely broken role fails within seconds. - wrap both AssumeRole hops in shared.ts (checks path) and aws-security.service.ts assumeRole (Cloud Tests scan path). - export retryAssume from the package barrel for apps/api. - tests: retry-then-succeed, no-retry on config errors, give-up after N, classifier. Note: not yet live-verified; the exact STS class (AccessDenied vs ExpiredToken) is already captured in evidence.error on the failing finding — the fix covers both. cloud-security suite green (311); helper tests green; typecheck clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../providers/aws-security.service.ts | 31 ++--- packages/integration-platform/src/index.ts | 7 ++ .../aws/checks/__tests__/assume-retry.test.ts | 89 +++++++++++++++ .../src/manifests/aws/checks/assume-retry.ts | 106 ++++++++++++++++++ .../src/manifests/aws/checks/shared.ts | 31 ++--- 5 files changed, 238 insertions(+), 26 deletions(-) create mode 100644 packages/integration-platform/src/manifests/aws/checks/__tests__/assume-retry.test.ts create mode 100644 packages/integration-platform/src/manifests/aws/checks/assume-retry.ts diff --git a/apps/api/src/cloud-security/providers/aws-security.service.ts b/apps/api/src/cloud-security/providers/aws-security.service.ts index de43d89555..e09ef7c5ab 100644 --- a/apps/api/src/cloud-security/providers/aws-security.service.ts +++ b/apps/api/src/cloud-security/providers/aws-security.service.ts @@ -1,5 +1,6 @@ import { Injectable, Logger } from '@nestjs/common'; import { AssumeRoleCommand, STSClient } from '@aws-sdk/client-sts'; +import { retryAssume } from '@trycompai/integration-platform'; import { CostExplorerClient, GetCostAndUsageCommand, @@ -514,12 +515,14 @@ export class AWSSecurityService { region, credentials: getAwsBaseCredentials(partition), }); - const roleAssumerResp = await baseSts.send( - new AssumeRoleCommand({ - RoleArn: roleAssumerArn, - RoleSessionName: 'CompRoleAssumer', - DurationSeconds: 3600, - }), + const roleAssumerResp = await retryAssume(() => + baseSts.send( + new AssumeRoleCommand({ + RoleArn: roleAssumerArn, + RoleSessionName: 'CompRoleAssumer', + DurationSeconds: 3600, + }), + ), ); const roleAssumerCreds = roleAssumerResp.Credentials; @@ -541,13 +544,15 @@ export class AWSSecurityService { this.logger.log(`Assuming customer role ${roleArn} in region ${region}`); - const customerResp = await roleAssumerSts.send( - new AssumeRoleCommand({ - RoleArn: roleArn, - ExternalId: externalId, - RoleSessionName: sessionName ?? 'CompSecurityAudit', - DurationSeconds: 3600, - }), + const customerResp = await retryAssume(() => + roleAssumerSts.send( + new AssumeRoleCommand({ + RoleArn: roleArn, + ExternalId: externalId, + RoleSessionName: sessionName ?? 'CompSecurityAudit', + DurationSeconds: 3600, + }), + ), ); const customerCreds = customerResp.Credentials; diff --git a/packages/integration-platform/src/index.ts b/packages/integration-platform/src/index.ts index 6131578d9c..b50fed9c72 100644 --- a/packages/integration-platform/src/index.ts +++ b/packages/integration-platform/src/index.ts @@ -145,6 +145,13 @@ export { } from './manifests/aws/credentials'; export type { AwsEnvironment } from './manifests/aws/credentials'; +// Shared AWS STS AssumeRole retry (transient / IAM-eventual-consistency safe), +// reused by the Cloud Tests scanner in apps/api. +export { + retryAssume, + isRetryableAssumeError, +} from './manifests/aws/checks/assume-retry'; + // API Response types (for frontend and API type sharing) export type { diff --git a/packages/integration-platform/src/manifests/aws/checks/__tests__/assume-retry.test.ts b/packages/integration-platform/src/manifests/aws/checks/__tests__/assume-retry.test.ts new file mode 100644 index 0000000000..bb2eed1e8c --- /dev/null +++ b/packages/integration-platform/src/manifests/aws/checks/__tests__/assume-retry.test.ts @@ -0,0 +1,89 @@ +import { describe, expect, it } from 'bun:test'; +import { isRetryableAssumeError, retryAssume } from '../assume-retry'; + +// Fast, deterministic options for tests — never actually sleep. +const fastOpts = { + sleep: async () => {}, + random: () => 0, + baseDelayMs: 1, + maxDelayMs: 1, +}; + +function awsError(name: string, extra: Record = {}): Error { + return Object.assign(new Error(`${name} occurred`), { name, ...extra }); +} + +describe('isRetryableAssumeError', () => { + it('retries IAM/STS eventual-consistency AccessDenied (the customer bug)', () => { + expect(isRetryableAssumeError(awsError('AccessDenied'))).toBe(true); + expect(isRetryableAssumeError(awsError('AccessDeniedException'))).toBe(true); + }); + + it('retries expired-token, throttling, 5xx and network errors', () => { + expect(isRetryableAssumeError(awsError('ExpiredToken'))).toBe(true); + expect(isRetryableAssumeError(awsError('ThrottlingException'))).toBe(true); + expect( + isRetryableAssumeError({ name: 'X', $metadata: { httpStatusCode: 503 } }), + ).toBe(true); + expect(isRetryableAssumeError(new Error('socket hang up'))).toBe(true); + }); + + it('does NOT retry hard configuration errors', () => { + expect(isRetryableAssumeError(awsError('ValidationError'))).toBe(false); + expect( + isRetryableAssumeError(new Error('Invalid IAM Role ARN format')), + ).toBe(false); + expect(isRetryableAssumeError(null)).toBe(false); + expect(isRetryableAssumeError(undefined)).toBe(false); + }); +}); + +describe('retryAssume', () => { + it('retries a transient AccessDenied then succeeds (no AWS change needed)', async () => { + let calls = 0; + const result = await retryAssume(async () => { + calls++; + if (calls < 3) throw awsError('AccessDenied'); + return 'creds'; + }, fastOpts); + expect(result).toBe('creds'); + expect(calls).toBe(3); + }); + + it('does not retry a non-transient error (single attempt)', async () => { + let calls = 0; + await expect( + retryAssume(async () => { + calls++; + throw awsError('ValidationError'); + }, fastOpts), + ).rejects.toThrow(); + expect(calls).toBe(1); + }); + + it('gives up after the configured attempts on a persistent transient error', async () => { + let calls = 0; + await expect( + retryAssume( + async () => { + calls++; + throw awsError('AccessDenied'); + }, + { ...fastOpts, attempts: 4 }, + ), + ).rejects.toThrow(); + expect(calls).toBe(4); + }); + + it('succeeds on the first attempt without sleeping', async () => { + let slept = false; + const result = await retryAssume(async () => 'ok', { + ...fastOpts, + sleep: async () => { + slept = true; + }, + }); + expect(result).toBe('ok'); + expect(slept).toBe(false); + }); +}); diff --git a/packages/integration-platform/src/manifests/aws/checks/assume-retry.ts b/packages/integration-platform/src/manifests/aws/checks/assume-retry.ts new file mode 100644 index 0000000000..d81fe7795b --- /dev/null +++ b/packages/integration-platform/src/manifests/aws/checks/assume-retry.ts @@ -0,0 +1,106 @@ +/** + * Bounded retry for AWS STS `AssumeRole` calls. + * + * Why this exists: assuming a customer's cross-account role is a two-hop STS + * flow, and the FIRST attempt right after a connection is set up (or its trust + * policy edited) can fail transiently with NO change in AWS — IAM/STS is + * eventually consistent, so a brand-new/edited role can return `AccessDenied` + * on the first assume and then succeed seconds later. The base/roleAssumer + * session can likewise briefly return `ExpiredToken`. None of these classes are + * retried by the AWS SDK's default strategy (it only retries throttling/5xx/ + * network), so a single transient failure surfaced to customers as a sticky + * "Could not assume AWS role" finding that "fixed itself" on the next run. + * + * This helper retries only those transient/eventually-consistent classes with + * capped exponential backoff + jitter. Hard configuration errors (bad ARN, + * ValidationError) and a role that is *persistently* denied still propagate — + * the backoff is capped so a genuinely broken role fails within a few seconds. + */ + +const RETRYABLE_ASSUME_ERROR_NAMES = new Set([ + // IAM/STS eventual consistency on a new or just-edited cross-account role. + 'AccessDenied', + 'AccessDeniedException', + // Base/roleAssumer session momentarily expired, or an STS hiccup. + 'ExpiredToken', + 'ExpiredTokenException', + 'IDPCommunicationError', + 'RegionDisabledException', + // Standard transient classes (belt-and-suspenders alongside the SDK). + 'Throttling', + 'ThrottlingException', + 'TooManyRequestsException', + 'RequestLimitExceeded', + 'ServiceUnavailable', + 'ServiceUnavailableException', + 'InternalError', + 'InternalFailure', +]); + +export function isRetryableAssumeError(error: unknown): boolean { + if (!error || typeof error !== 'object') return false; + const e = error as { + name?: string; + Code?: string; + code?: string; + __type?: string; + message?: string; + $metadata?: { httpStatusCode?: number }; + }; + const name = e.name ?? e.Code ?? e.code ?? e.__type ?? ''; + if (RETRYABLE_ASSUME_ERROR_NAMES.has(name)) return true; + + const status = e.$metadata?.httpStatusCode; + if (typeof status === 'number' && status >= 500) return true; + + return /ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|socket hang up|network|timeout/i.test( + e.message ?? '', + ); +} + +export interface RetryAssumeOptions { + /** Total attempts including the first (default 4). */ + attempts?: number; + /** Base backoff in ms; grows exponentially, capped by maxDelayMs (default 500). */ + baseDelayMs?: number; + /** Maximum backoff per attempt in ms (default 5000). */ + maxDelayMs?: number; + /** Injectable sleep so tests don't actually wait. */ + sleep?: (ms: number) => Promise; + /** Injectable jitter source in [0, 1); defaults to Math.random. */ + random?: () => number; +} + +const defaultSleep = (ms: number): Promise => + new Promise((resolve) => setTimeout(resolve, ms)); + +/** + * Run `fn` (an AssumeRole call) with bounded exponential backoff + jitter, + * retrying only transient/eventually-consistent failures. Non-transient errors + * propagate immediately; transient ones that never recover propagate after the + * attempts are exhausted. + */ +export async function retryAssume( + fn: () => Promise, + options: RetryAssumeOptions = {}, +): Promise { + const attempts = Math.max(1, options.attempts ?? 4); + const baseDelayMs = options.baseDelayMs ?? 500; + const maxDelayMs = options.maxDelayMs ?? 5000; + const sleep = options.sleep ?? defaultSleep; + const random = options.random ?? Math.random; + + let lastError: unknown; + for (let attempt = 0; attempt < attempts; attempt++) { + try { + return await fn(); + } catch (error) { + lastError = error; + const isLastAttempt = attempt === attempts - 1; + if (isLastAttempt || !isRetryableAssumeError(error)) throw error; + const backoff = Math.min(maxDelayMs, baseDelayMs * 2 ** attempt); + await sleep(backoff + Math.floor(backoff * 0.5 * random())); + } + } + throw lastError; +} diff --git a/packages/integration-platform/src/manifests/aws/checks/shared.ts b/packages/integration-platform/src/manifests/aws/checks/shared.ts index 8a7e48e6eb..c95735ef94 100644 --- a/packages/integration-platform/src/manifests/aws/checks/shared.ts +++ b/packages/integration-platform/src/manifests/aws/checks/shared.ts @@ -1,5 +1,6 @@ import { AssumeRoleCommand, STSClient } from '@aws-sdk/client-sts'; import type { CheckContext, FindingSeverity } from '../../../types'; +import { retryAssume } from './assume-retry'; export interface AwsSession { credentials: { @@ -113,12 +114,14 @@ export async function assumeAwsSession( region, credentials: awsBaseCredentials(partition), }); - const assumerResp = await baseSts.send( - new AssumeRoleCommand({ - RoleArn: roleAssumerArn, - RoleSessionName: 'CompRoleAssumer', - DurationSeconds: 3600, - }), + const assumerResp = await retryAssume(() => + baseSts.send( + new AssumeRoleCommand({ + RoleArn: roleAssumerArn, + RoleSessionName: 'CompRoleAssumer', + DurationSeconds: 3600, + }), + ), ); const assumer = assumerResp.Credentials; if ( @@ -139,13 +142,15 @@ export async function assumeAwsSession( sessionToken: assumer.SessionToken, }, }); - const res = await assumerSts.send( - new AssumeRoleCommand({ - RoleArn: roleArn, - ExternalId: externalId, - RoleSessionName: 'CompEvidenceCheck', - DurationSeconds: 3600, - }), + const res = await retryAssume(() => + assumerSts.send( + new AssumeRoleCommand({ + RoleArn: roleArn, + ExternalId: externalId, + RoleSessionName: 'CompEvidenceCheck', + DurationSeconds: 3600, + }), + ), ); const c = res.Credentials; if (!c?.AccessKeyId || !c.SecretAccessKey || !c.SessionToken) return null; From b0bcd2118dc195fb2ba6c1057c70bfcdebc1bc98 Mon Sep 17 00:00:00 2001 From: Tofik Hasanov Date: Mon, 8 Jun 2026 15:54:42 -0400 Subject: [PATCH 2/2] fix(cloud-security): resolve aws check-path session in ecs, not trigger.dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The integration-platform AWS checks run inside the Trigger.dev runtime and assumed the cross-account role there via process.env.SECURITY_HUB_ROLE_ASSUMER_ARN. That env var (and any base AWS credentials) only exist on the ECS API task role, not in Trigger.dev, so every AWS connection's checks failed with "Could not assume AWS role" — while scans worked, because the scan task calls back to ECS. Resolve the session in ECS instead: a new internal, service-token-only resolve-session endpoint performs the partition-aware two-hop assume (reusing the scan path's logic) and returns only short-lived, read-only, single-customer temp creds. The check task injects those into the check credentials, and assumeAwsSession uses them directly. The cross-tenant roleAssumer credential never leaves ECS. The in-runtime two-hop is kept as a fallback for the ECS controller callers and local dev. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/auth/service-token-only.guard.spec.ts | 45 +++++++ apps/api/src/auth/service-token-only.guard.ts | 30 +++++ .../cloud-security.controller.spec.ts | 93 ++++++++++++- .../cloud-security.controller.ts | 26 ++++ ...d-security.service.resolve-session.spec.ts | 125 ++++++++++++++++++ .../cloud-security/cloud-security.service.ts | 70 ++++++++++ .../providers/aws-security.service.ts | 54 ++++++++ .../checks-aws-session.ts | 125 ++++++++++++++++++ .../run-connection-checks.ts | 12 ++ .../run-task-integration-checks.ts | 12 ++ .../aws/checks/__tests__/aws-checks.test.ts | 70 +++++++++- .../src/manifests/aws/checks/shared.ts | 49 +++++++ 12 files changed, 709 insertions(+), 2 deletions(-) create mode 100644 apps/api/src/auth/service-token-only.guard.spec.ts create mode 100644 apps/api/src/auth/service-token-only.guard.ts create mode 100644 apps/api/src/cloud-security/cloud-security.service.resolve-session.spec.ts create mode 100644 apps/api/src/trigger/integration-platform/checks-aws-session.ts diff --git a/apps/api/src/auth/service-token-only.guard.spec.ts b/apps/api/src/auth/service-token-only.guard.spec.ts new file mode 100644 index 0000000000..40e16b67bd --- /dev/null +++ b/apps/api/src/auth/service-token-only.guard.spec.ts @@ -0,0 +1,45 @@ +import { ExecutionContext, ForbiddenException } from '@nestjs/common'; +import { ServiceTokenOnlyGuard } from './service-token-only.guard'; +import type { AuthenticatedRequest } from './types'; + +function contextFor( + request: Partial, +): ExecutionContext { + return { + switchToHttp: () => ({ + getRequest: () => request as AuthenticatedRequest, + }), + } as unknown as ExecutionContext; +} + +describe('ServiceTokenOnlyGuard', () => { + const guard = new ServiceTokenOnlyGuard(); + + it('allows internal service-token requests', () => { + expect( + guard.canActivate(contextFor({ isServiceToken: true })), + ).toBe(true); + }); + + it('rejects session requests', () => { + expect(() => + guard.canActivate( + contextFor({ authType: 'session', isApiKey: false, isServiceToken: false }), + ), + ).toThrow(ForbiddenException); + }); + + it('rejects API-key requests', () => { + expect(() => + guard.canActivate( + contextFor({ authType: 'api-key', isApiKey: true, isServiceToken: false }), + ), + ).toThrow(ForbiddenException); + }); + + it('rejects requests with no service-token flag set', () => { + expect(() => guard.canActivate(contextFor({}))).toThrow( + ForbiddenException, + ); + }); +}); diff --git a/apps/api/src/auth/service-token-only.guard.ts b/apps/api/src/auth/service-token-only.guard.ts new file mode 100644 index 0000000000..a3adaae49a --- /dev/null +++ b/apps/api/src/auth/service-token-only.guard.ts @@ -0,0 +1,30 @@ +import { + CanActivate, + ExecutionContext, + ForbiddenException, + Injectable, +} from '@nestjs/common'; +import { AuthenticatedRequest } from './types'; + +/** + * Guard that rejects everything except internal service-token auth. + * Use on internal-only endpoints that return sensitive data not meant for + * customers (e.g., minting temporary AWS credentials for background workers). + * + * Place between HybridAuthGuard and PermissionGuard: + * @UseGuards(HybridAuthGuard, ServiceTokenOnlyGuard, PermissionGuard) + */ +@Injectable() +export class ServiceTokenOnlyGuard implements CanActivate { + canActivate(context: ExecutionContext): boolean { + const request = context.switchToHttp().getRequest(); + + if (!request.isServiceToken) { + throw new ForbiddenException( + 'This endpoint is only available for internal service-token requests.', + ); + } + + return true; + } +} diff --git a/apps/api/src/cloud-security/cloud-security.controller.spec.ts b/apps/api/src/cloud-security/cloud-security.controller.spec.ts index a08dd6404b..3940fa8f21 100644 --- a/apps/api/src/cloud-security/cloud-security.controller.spec.ts +++ b/apps/api/src/cloud-security/cloud-security.controller.spec.ts @@ -16,7 +16,10 @@ import { Test, TestingModule } from '@nestjs/testing'; import { HttpException, HttpStatus } from '@nestjs/common'; import { CloudSecurityController } from './cloud-security.controller'; -import { CloudSecurityService } from './cloud-security.service'; +import { + CloudSecurityService, + ConnectionNotFoundError, +} from './cloud-security.service'; import { CloudSecurityQueryService } from './cloud-security-query.service'; import { CloudSecurityLegacyService } from './cloud-security-legacy.service'; import { CloudSecurityActivityService } from './cloud-security-activity.service'; @@ -29,6 +32,7 @@ import { CloudAwsScanModeService } from './aws-scan-mode.service'; import { ActingUserResolver } from '../auth/acting-user.service'; import { HybridAuthGuard } from '../auth/hybrid-auth.guard'; import { PermissionGuard } from '../auth/permission.guard'; +import { ServiceTokenOnlyGuard } from '../auth/service-token-only.guard'; import type { AuthenticatedRequest } from '../auth/types'; /** @@ -87,6 +91,8 @@ describe('CloudSecurityController — API-key mutation support', () => { .useValue(mockGuard) .overrideGuard(PermissionGuard) .useValue(mockGuard) + .overrideGuard(ServiceTokenOnlyGuard) + .useValue(mockGuard) .compile(); controller = module.get(CloudSecurityController); @@ -309,3 +315,88 @@ describe('CloudSecurityController — API-key mutation support', () => { }); }); }); + +describe('CloudSecurityController — resolveSession (internal)', () => { + let controller: CloudSecurityController; + let cloudSecurityService: { resolveAwsSession: jest.Mock }; + const mockGuard = { canActivate: jest.fn().mockReturnValue(true) }; + + beforeEach(async () => { + cloudSecurityService = { resolveAwsSession: jest.fn() }; + + const module: TestingModule = await Test.createTestingModule({ + controllers: [CloudSecurityController], + providers: [ + { provide: CloudSecurityService, useValue: cloudSecurityService }, + { provide: CloudSecurityQueryService, useValue: {} }, + { provide: CloudSecurityLegacyService, useValue: {} }, + { provide: CloudSecurityActivityService, useValue: {} }, + { provide: GCPSecurityService, useValue: {} }, + { provide: AzureSecurityService, useValue: {} }, + { provide: CheckDefinitionService, useValue: {} }, + { provide: CloudExceptionService, useValue: {} }, + { provide: CloudHistoryService, useValue: {} }, + { provide: CloudAwsScanModeService, useValue: {} }, + { provide: ActingUserResolver, useValue: {} }, + ], + }) + .overrideGuard(HybridAuthGuard) + .useValue(mockGuard) + .overrideGuard(PermissionGuard) + .useValue(mockGuard) + .overrideGuard(ServiceTokenOnlyGuard) + .useValue(mockGuard) + .compile(); + + controller = module.get(CloudSecurityController); + jest.clearAllMocks(); + }); + + it('returns the resolved session from the service', async () => { + const session = { + ok: true, + session: { + accessKeyId: 'AKIA_TEMP', + secretAccessKey: 'secret', + sessionToken: 'token', + }, + }; + cloudSecurityService.resolveAwsSession.mockResolvedValueOnce(session); + + const result = await controller.resolveSession('conn_1', 'org_1'); + + expect(cloudSecurityService.resolveAwsSession).toHaveBeenCalledWith( + 'conn_1', + 'org_1', + ); + expect(result).toEqual(session); + }); + + it('passes through not_configured / assume_failed results', async () => { + cloudSecurityService.resolveAwsSession.mockResolvedValueOnce({ + ok: false, + reason: 'assume_failed', + error: 'denied', + }); + + const result = await controller.resolveSession('conn_1', 'org_1'); + expect(result).toEqual({ + ok: false, + reason: 'assume_failed', + error: 'denied', + }); + }); + + it('maps ConnectionNotFoundError to a 404', async () => { + cloudSecurityService.resolveAwsSession.mockRejectedValueOnce( + new ConnectionNotFoundError(), + ); + + const error = await controller + .resolveSession('conn_missing', 'org_1') + .catch((e) => e); + + expect(error).toBeInstanceOf(HttpException); + expect((error as HttpException).getStatus()).toBe(HttpStatus.NOT_FOUND); + }); +}); diff --git a/apps/api/src/cloud-security/cloud-security.controller.ts b/apps/api/src/cloud-security/cloud-security.controller.ts index 28ed2f1f0d..c919d5d970 100644 --- a/apps/api/src/cloud-security/cloud-security.controller.ts +++ b/apps/api/src/cloud-security/cloud-security.controller.ts @@ -17,6 +17,7 @@ import { ApiOperation } from '@nestjs/swagger'; import { SkipThrottle } from '@nestjs/throttler'; import { HybridAuthGuard } from '../auth/hybrid-auth.guard'; import { PermissionGuard } from '../auth/permission.guard'; +import { ServiceTokenOnlyGuard } from '../auth/service-token-only.guard'; import { RequirePermission } from '../auth/require-permission.decorator'; import { OrganizationId } from '../auth/auth-context.decorator'; import { @@ -254,6 +255,31 @@ export class CloudSecurityController { return { data: definition }; } + @Post('resolve-session/:connectionId') + @SkipThrottle() + @UseGuards(HybridAuthGuard, ServiceTokenOnlyGuard, PermissionGuard) + @RequirePermission('integration', 'update') + @ApiOperation({ + summary: + 'Resolve short-lived AWS credentials for a connection (internal only)', + }) + async resolveSession( + @Param('connectionId') connectionId: string, + @OrganizationId() organizationId: string, + ) { + try { + return await this.cloudSecurityService.resolveAwsSession( + connectionId, + organizationId, + ); + } catch (error) { + if (error instanceof ConnectionNotFoundError) { + throw new HttpException('Connection not found', HttpStatus.NOT_FOUND); + } + throw error; + } + } + @Post('scan/:connectionId') @UseGuards(HybridAuthGuard, PermissionGuard) @RequirePermission('integration', 'update') diff --git a/apps/api/src/cloud-security/cloud-security.service.resolve-session.spec.ts b/apps/api/src/cloud-security/cloud-security.service.resolve-session.spec.ts new file mode 100644 index 0000000000..aa499ce846 --- /dev/null +++ b/apps/api/src/cloud-security/cloud-security.service.resolve-session.spec.ts @@ -0,0 +1,125 @@ +jest.mock('@db', () => ({ + db: { integrationConnection: { findFirst: jest.fn() } }, + Prisma: {}, +})); + +import { db } from '@db'; +import { + CloudSecurityService, + ConnectionNotFoundError, +} from './cloud-security.service'; + +type Ctor = ConstructorParameters; + +const findFirst = (db as unknown as { + integrationConnection: { findFirst: jest.Mock }; +}).integrationConnection.findFirst; + +describe('CloudSecurityService.resolveAwsSession', () => { + let credentialVault: { getDecryptedCredentials: jest.Mock }; + let awsService: { resolveRoleSession: jest.Mock }; + let service: CloudSecurityService; + + beforeEach(() => { + credentialVault = { getDecryptedCredentials: jest.fn() }; + awsService = { resolveRoleSession: jest.fn() }; + service = new CloudSecurityService( + credentialVault as unknown as Ctor[0], + {} as unknown as Ctor[1], + {} as unknown as Ctor[2], + awsService as unknown as Ctor[3], + {} as unknown as Ctor[4], + {} as unknown as Ctor[5], + ); + jest.clearAllMocks(); + }); + + it('scopes the connection lookup by organizationId (tenant isolation)', async () => { + findFirst.mockResolvedValueOnce(null); + + await expect( + service.resolveAwsSession('conn_1', 'org_1'), + ).rejects.toBeInstanceOf(ConnectionNotFoundError); + + expect(findFirst).toHaveBeenCalledWith( + expect.objectContaining({ + where: { id: 'conn_1', organizationId: 'org_1', status: 'active' }, + }), + ); + }); + + it('returns not_configured for a non-AWS provider (never assumes)', async () => { + findFirst.mockResolvedValueOnce({ provider: { slug: 'gcp' } }); + + const result = await service.resolveAwsSession('conn_1', 'org_1'); + + expect(result).toEqual({ ok: false, reason: 'not_configured' }); + expect(awsService.resolveRoleSession).not.toHaveBeenCalled(); + }); + + it('returns not_configured when roleArn/externalId are missing', async () => { + findFirst.mockResolvedValueOnce({ + provider: { slug: 'aws' }, + variables: {}, + }); + credentialVault.getDecryptedCredentials.mockResolvedValueOnce({ + externalId: 'eid', // roleArn missing + }); + + const result = await service.resolveAwsSession('conn_1', 'org_1'); + + expect(result).toEqual({ ok: false, reason: 'not_configured' }); + expect(awsService.resolveRoleSession).not.toHaveBeenCalled(); + }); + + it('returns the resolved session on success', async () => { + findFirst.mockResolvedValueOnce({ + provider: { slug: 'aws' }, + variables: { regions: ['us-east-1'] }, + }); + credentialVault.getDecryptedCredentials.mockResolvedValueOnce({ + roleArn: 'arn:aws:iam::123456789012:role/x', + externalId: 'eid', + regions: ['us-east-1'], + }); + awsService.resolveRoleSession.mockResolvedValueOnce({ + accessKeyId: 'AKIA_TEMP', + secretAccessKey: 'secret', + sessionToken: 'token', + }); + + const result = await service.resolveAwsSession('conn_1', 'org_1'); + + expect(result).toEqual({ + ok: true, + session: { + accessKeyId: 'AKIA_TEMP', + secretAccessKey: 'secret', + sessionToken: 'token', + }, + }); + }); + + it('returns assume_failed with the real reason when the assume throws', async () => { + findFirst.mockResolvedValueOnce({ + provider: { slug: 'aws' }, + variables: {}, + }); + credentialVault.getDecryptedCredentials.mockResolvedValueOnce({ + roleArn: 'arn:aws:iam::123456789012:role/x', + externalId: 'eid', + regions: ['us-east-1'], + }); + awsService.resolveRoleSession.mockRejectedValueOnce( + new Error('not authorized to perform sts:AssumeRole'), + ); + + const result = await service.resolveAwsSession('conn_1', 'org_1'); + + expect(result).toEqual({ + ok: false, + reason: 'assume_failed', + error: 'not authorized to perform sts:AssumeRole', + }); + }); +}); diff --git a/apps/api/src/cloud-security/cloud-security.service.ts b/apps/api/src/cloud-security/cloud-security.service.ts index 38de61ce74..f3b5b0fccf 100644 --- a/apps/api/src/cloud-security/cloud-security.service.ts +++ b/apps/api/src/cloud-security/cloud-security.service.ts @@ -32,6 +32,23 @@ export interface ScanResult { error?: string; } +/** + * Outcome of resolving short-lived AWS session credentials for a connection. + * `not_configured` = nothing to do (check should no-op); `assume_failed` = the + * assume genuinely failed and the check should surface a finding. + */ +export type ResolveAwsSessionResult = + | { + ok: true; + session: { + accessKeyId: string; + secretAccessKey: string; + sessionToken: string; + }; + } + | { ok: false; reason: 'not_configured' } + | { ok: false; reason: 'assume_failed'; error: string }; + export class ConnectionNotFoundError extends Error { constructor() { super('Connection not found'); @@ -392,6 +409,59 @@ export class CloudSecurityService { } } + /** + * Resolve short-lived, customer-scoped AWS credentials for a connection's + * IAM role — performed here in ECS, which holds the roleAssumer task role and + * the SECURITY_HUB_ROLE_ASSUMER_ARN env. + * + * The Cloud Tests CHECK path runs in the Trigger.dev runtime, which has no + * base AWS credentials or roleAssumer ARN, so it cannot assume the + * cross-account role itself. It calls this (via the internal `resolve-session` + * endpoint) and injects the returned temp creds into the check credentials, + * so the cross-tenant master credential never leaves ECS. + * + * Org-scoped: a service-token caller cannot resolve another org's connection. + */ + async resolveAwsSession( + connectionId: string, + organizationId: string, + ): Promise { + const connection = await db.integrationConnection.findFirst({ + where: { id: connectionId, organizationId, status: 'active' }, + include: { provider: true }, + }); + + if (!connection) { + throw new ConnectionNotFoundError(); + } + + if (connection.provider.slug !== 'aws') { + return { ok: false, reason: 'not_configured' }; + } + + const decrypted = + await this.credentialVaultService.getDecryptedCredentials(connectionId); + if (!decrypted || !(decrypted.roleArn && decrypted.externalId)) { + return { ok: false, reason: 'not_configured' }; + } + + const variables = (connection.variables as Record) || {}; + + try { + const session = await this.awsService.resolveRoleSession( + decrypted, + variables, + ); + return { ok: true, session }; + } catch (error) { + return { + ok: false, + reason: 'assume_failed', + error: error instanceof Error ? error.message : String(error), + }; + } + } + /** * Detect which AWS services are actively used (via Cost Explorer). * Saves detected services to connection variables for the frontend. diff --git a/apps/api/src/cloud-security/providers/aws-security.service.ts b/apps/api/src/cloud-security/providers/aws-security.service.ts index de43d89555..418c781d23 100644 --- a/apps/api/src/cloud-security/providers/aws-security.service.ts +++ b/apps/api/src/cloud-security/providers/aws-security.service.ts @@ -252,6 +252,60 @@ export class AWSSecurityService { }); } + /** + * Resolve short-lived, customer-scoped AWS credentials for a role-auth + * connection, using the SAME partition/region normalization and two-hop + * assume as a scan. Returns ONLY the temporary session credentials. + * + * This exists so the Cloud Tests CHECK path can perform the cross-account + * assume in ECS (which holds the roleAssumer task role + env), instead of in + * the Trigger.dev runtime where no base AWS credentials or roleAssumer ARN + * are available. Throws when the connection is not role-auth or the assume + * fails — the caller maps those to "not_configured" / "assume_failed". + */ + async resolveRoleSession( + credentials: Record, + variables: Record, + ): Promise<{ + accessKeyId: string; + secretAccessKey: string; + sessionToken: string; + }> { + const isRoleAuth = Boolean(credentials.roleArn && credentials.externalId); + if (!isRoleAuth) { + throw new Error( + 'AWS connection is not configured for IAM role access (roleArn/externalId missing).', + ); + } + + const partition = normalizeAwsPartition( + credentials.awsType ?? variables.awsType, + ); + const configuredRegions = this.getConfiguredRegions( + credentials, + variables, + partition, + ); + + const session = await this.resolveRoleCredentials({ + roleArn: credentials.roleArn as string, + externalId: credentials.externalId as string, + partition, + regions: configuredRegions, + primaryRegion: configuredRegions[0], + }); + + if (!session.sessionToken) { + throw new Error('Assumed role returned no session token.'); + } + + return { + accessKeyId: session.accessKeyId, + secretAccessKey: session.secretAccessKey, + sessionToken: session.sessionToken, + }; + } + /** * Today's default scan engine. Runs each registered service adapter * (global once, regional per region) and aggregates the findings. diff --git a/apps/api/src/trigger/integration-platform/checks-aws-session.ts b/apps/api/src/trigger/integration-platform/checks-aws-session.ts new file mode 100644 index 0000000000..ffb032614b --- /dev/null +++ b/apps/api/src/trigger/integration-platform/checks-aws-session.ts @@ -0,0 +1,125 @@ +import { logger } from '@trigger.dev/sdk'; +import type { IntegrationCredentialValues } from './ensure-valid-credentials'; + +const RESOLVE_SESSION_TIMEOUT_MS = 30_000; + +type ResolveSessionResponse = + | { + ok: true; + session: { + accessKeyId: string; + secretAccessKey: string; + sessionToken: string; + }; + } + | { ok: false; reason: 'not_configured' } + | { ok: false; reason: 'assume_failed'; error?: string }; + +/** + * Credential keys injected by {@link injectAwsResolvedSession} and consumed by + * the AWS manifest checks (`assumeAwsSession` in the integration-platform + * package). Underscore-prefixed so they never collide with real AWS connection + * credential fields (roleArn, externalId, regions, awsType, remediationRoleArn). + */ +export const RESOLVED_AWS_SESSION_KEYS = { + accessKeyId: '__resolvedAccessKeyId', + secretAccessKey: '__resolvedSecretAccessKey', + sessionToken: '__resolvedSessionToken', + error: '__resolvedSessionError', +} as const; + +/** + * For AWS connections, resolve the cross-account session in ECS (which holds the + * roleAssumer task role + `SECURITY_HUB_ROLE_ASSUMER_ARN`) and inject the + * resulting short-lived, customer-scoped credentials into `credentials`. + * + * Why: the Cloud Tests CHECK path runs inside the Trigger.dev runtime, which has + * no base AWS credentials or roleAssumer ARN, so it cannot perform the two-hop + * assume itself. Resolving in ECS keeps the cross-tenant master credential out + * of Trigger.dev; the check just consumes the temp creds. + * + * On a genuine assume failure (or any transport error) an error marker is + * injected so the AWS check surfaces a real "Could not assume AWS role" finding + * with the true reason, rather than silently failing or falsely passing. + * Non-AWS providers and not-configured connections are left untouched. + * + * Mutates and returns the same credentials object. + */ +export async function injectAwsResolvedSession(params: { + credentials: IntegrationCredentialValues; + apiUrl: string; + connectionId: string; + organizationId: string; + providerSlug: string; +}): Promise { + const { credentials, apiUrl, connectionId, organizationId, providerSlug } = + params; + + if (providerSlug !== 'aws') return credentials; + + const serviceToken = process.env.SERVICE_TOKEN_TRIGGER; + if (!serviceToken) { + credentials[RESOLVED_AWS_SESSION_KEYS.error] = + 'SERVICE_TOKEN_TRIGGER is not configured'; + return credentials; + } + + const abortController = new AbortController(); + const timeoutId = setTimeout( + () => abortController.abort(), + RESOLVE_SESSION_TIMEOUT_MS, + ); + + try { + const response = await fetch( + `${apiUrl}/v1/cloud-security/resolve-session/${connectionId}`, + { + method: 'POST', + signal: abortController.signal, + headers: { + 'Content-Type': 'application/json', + 'x-service-token': serviceToken, + 'x-organization-id': organizationId, + }, + }, + ); + + if (!response.ok) { + credentials[RESOLVED_AWS_SESSION_KEYS.error] = + `Could not resolve AWS session (status ${response.status}).`; + return credentials; + } + + const result = (await response.json()) as ResolveSessionResponse; + + if (result.ok) { + credentials[RESOLVED_AWS_SESSION_KEYS.accessKeyId] = + result.session.accessKeyId; + credentials[RESOLVED_AWS_SESSION_KEYS.secretAccessKey] = + result.session.secretAccessKey; + credentials[RESOLVED_AWS_SESSION_KEYS.sessionToken] = + result.session.sessionToken; + logger.info('Resolved AWS session via ECS for connection', { + connectionId, + }); + return credentials; + } + + if (result.reason === 'assume_failed') { + credentials[RESOLVED_AWS_SESSION_KEYS.error] = + result.error || 'The cross-account IAM role could not be assumed.'; + } + // not_configured -> inject nothing; the check no-ops naturally. + return credentials; + } catch (error) { + credentials[RESOLVED_AWS_SESSION_KEYS.error] = abortController.signal + .aborted + ? `Timed out resolving AWS session after ${RESOLVE_SESSION_TIMEOUT_MS}ms.` + : error instanceof Error + ? error.message + : String(error); + return credentials; + } finally { + clearTimeout(timeoutId); + } +} diff --git a/apps/api/src/trigger/integration-platform/run-connection-checks.ts b/apps/api/src/trigger/integration-platform/run-connection-checks.ts index 7523b63b9a..e7515540ed 100644 --- a/apps/api/src/trigger/integration-platform/run-connection-checks.ts +++ b/apps/api/src/trigger/integration-platform/run-connection-checks.ts @@ -6,6 +6,7 @@ import { requestValidCredentials, type IntegrationCredentialValues, } from './ensure-valid-credentials'; +import { injectAwsResolvedSession } from './checks-aws-session'; /** * Trigger task that runs all checks for a connection. @@ -147,6 +148,17 @@ export const runConnectionChecks = task({ return { success: false, error: 'No credentials found' }; } + // For AWS, resolve the cross-account session in ECS and inject the temp + // creds — the checks run in the Trigger.dev runtime, which cannot assume the + // role itself (no base creds / roleAssumer ARN there). + credentials = await injectAwsResolvedSession({ + credentials, + apiUrl, + connectionId, + organizationId, + providerSlug, + }); + const variables = (connection.variables as Record< string, diff --git a/apps/api/src/trigger/integration-platform/run-task-integration-checks.ts b/apps/api/src/trigger/integration-platform/run-task-integration-checks.ts index 8d697fe3dc..4d68611326 100644 --- a/apps/api/src/trigger/integration-platform/run-task-integration-checks.ts +++ b/apps/api/src/trigger/integration-platform/run-task-integration-checks.ts @@ -10,6 +10,7 @@ import { requestValidCredentials, type IntegrationCredentialValues, } from './ensure-valid-credentials'; +import { injectAwsResolvedSession } from './checks-aws-session'; /** * Send email notifications for task status change @@ -292,6 +293,17 @@ export const runTaskIntegrationChecks = task({ }; } + // For AWS, resolve the cross-account session in ECS and inject the temp + // creds — the checks run in the Trigger.dev runtime, which cannot assume the + // role itself (no base creds / roleAssumer ARN there). + credentials = await injectAwsResolvedSession({ + credentials, + apiUrl, + connectionId, + organizationId, + providerSlug, + }); + const variables = (connection.variables as Record< string, diff --git a/packages/integration-platform/src/manifests/aws/checks/__tests__/aws-checks.test.ts b/packages/integration-platform/src/manifests/aws/checks/__tests__/aws-checks.test.ts index 19259d9698..51bc6b6b70 100644 --- a/packages/integration-platform/src/manifests/aws/checks/__tests__/aws-checks.test.ts +++ b/packages/integration-platform/src/manifests/aws/checks/__tests__/aws-checks.test.ts @@ -14,10 +14,78 @@ import { evaluateRdsEncryption, } from '../rds'; import { evaluateS3Encryption, evaluateS3PublicAccess } from '../s3'; -import { resolveAwsCredentialInputs } from '../shared'; +import { assumeAwsSession, resolveAwsCredentialInputs } from '../shared'; const kinds = (os: { kind: string }[]) => os.map((o) => o.kind); +// Minimal CheckContext stub — assumeAwsSession only reads ctx.credentials. +const ctxWith = (credentials: Record) => + ({ credentials }) as unknown as Parameters[0]; + +describe('assumeAwsSession — ECS-resolved session injection (CHECK path)', () => { + const base = { + roleArn: 'arn:aws:iam::123456789012:role/x', + externalId: 'eid', + regions: ['us-east-1', 'eu-west-1'], + }; + + it('uses injected ECS-resolved credentials directly (no STS, no env needed)', async () => { + const session = await assumeAwsSession( + ctxWith({ + ...base, + __resolvedAccessKeyId: 'AKIA_TEMP', + __resolvedSecretAccessKey: 'secret_temp', + __resolvedSessionToken: 'token_temp', + }), + ); + expect(session).toEqual({ + credentials: { + accessKeyId: 'AKIA_TEMP', + secretAccessKey: 'secret_temp', + sessionToken: 'token_temp', + }, + regions: ['us-east-1', 'eu-west-1'], + }); + }); + + it('throws the injected error so the caller surfaces the real reason', async () => { + await expect( + assumeAwsSession( + ctxWith({ + ...base, + __resolvedSessionError: 'The cross-account IAM role could not be assumed.', + }), + ), + ).rejects.toThrow('The cross-account IAM role could not be assumed.'); + }); + + it('returns null for a not-configured connection even if creds are injected', async () => { + const session = await assumeAwsSession( + ctxWith({ + externalId: 'eid', // roleArn missing -> not configured + __resolvedAccessKeyId: 'AKIA_TEMP', + __resolvedSecretAccessKey: 'secret_temp', + __resolvedSessionToken: 'token_temp', + }), + ); + expect(session).toBeNull(); + }); + + it('falls back to the in-runtime two-hop when nothing is injected (ECS/dev path)', async () => { + const prev = process.env.SECURITY_HUB_ROLE_ASSUMER_ARN; + delete process.env.SECURITY_HUB_ROLE_ASSUMER_ARN; + try { + // No injected creds + no roleAssumer env -> the fallback runs and fails + // fast (before any STS call), proving the original path is preserved. + await expect(assumeAwsSession(ctxWith(base))).rejects.toThrow( + /SECURITY_HUB_ROLE_ASSUMER_ARN/, + ); + } finally { + if (prev !== undefined) process.env.SECURITY_HUB_ROLE_ASSUMER_ARN = prev; + } + }); +}); + describe('AWS credential resolution (regions shape)', () => { const base = { roleArn: 'arn:aws:iam::123456789012:role/x', externalId: 'eid' }; diff --git a/packages/integration-platform/src/manifests/aws/checks/shared.ts b/packages/integration-platform/src/manifests/aws/checks/shared.ts index 8a7e48e6eb..d75bea6d5e 100644 --- a/packages/integration-platform/src/manifests/aws/checks/shared.ts +++ b/packages/integration-platform/src/manifests/aws/checks/shared.ts @@ -47,6 +47,43 @@ export function resolveAwsCredentialInputs( return { roleArn, externalId, regions }; } +/** + * Short-lived AWS session credentials (or an error) injected into the connection + * credentials by the Cloud Tests CHECK runner (apps/api `checks-aws-session.ts`) + * when the cross-account assume was performed in ECS. The underscore-prefixed + * keys never collide with real connection fields. When present, the check uses + * these directly instead of assuming the role itself — it runs in the Trigger.dev + * runtime, which has no base AWS credentials or roleAssumer ARN. + */ +function readInjectedAwsSession( + credentials: Record, +): + | { + credentials: { + accessKeyId: string; + secretAccessKey: string; + sessionToken: string; + }; + } + | { error: string } + | null { + const error = credentials.__resolvedSessionError; + if (typeof error === 'string' && error.length > 0) { + return { error }; + } + const accessKeyId = credentials.__resolvedAccessKeyId; + const secretAccessKey = credentials.__resolvedSecretAccessKey; + const sessionToken = credentials.__resolvedSessionToken; + if ( + typeof accessKeyId === 'string' && + typeof secretAccessKey === 'string' && + typeof sessionToken === 'string' + ) { + return { credentials: { accessKeyId, secretAccessKey, sessionToken } }; + } + return null; +} + type AwsPartition = 'aws' | 'aws-us-gov'; function awsPartitionForRegion(region: string): AwsPartition { @@ -95,6 +132,18 @@ export async function assumeAwsSession( if (!inputs) return null; const { roleArn, externalId, regions } = inputs; + // If the CHECK runner already resolved a session in ECS (the cross-account + // assume cannot run in the Trigger.dev runtime, which lacks base AWS creds and + // the roleAssumer ARN), use it directly. An injected error surfaces the real + // failure reason via the caller's "Could not assume AWS role" finding. + const injected = readInjectedAwsSession( + ctx.credentials as Record, + ); + if (injected) { + if ('error' in injected) throw new Error(injected.error); + return { credentials: injected.credentials, regions }; + } + // IAM is global — assume once in the first region; the creds work everywhere. const region = regions[0]; const partition = awsPartitionForRegion(region);