diff --git a/apps/api/src/auth/service-token-only.guard.spec.ts b/apps/api/src/auth/service-token-only.guard.spec.ts new file mode 100644 index 0000000000..40e16b67bd --- /dev/null +++ b/apps/api/src/auth/service-token-only.guard.spec.ts @@ -0,0 +1,45 @@ +import { ExecutionContext, ForbiddenException } from '@nestjs/common'; +import { ServiceTokenOnlyGuard } from './service-token-only.guard'; +import type { AuthenticatedRequest } from './types'; + +function contextFor( + request: Partial, +): ExecutionContext { + return { + switchToHttp: () => ({ + getRequest: () => request as AuthenticatedRequest, + }), + } as unknown as ExecutionContext; +} + +describe('ServiceTokenOnlyGuard', () => { + const guard = new ServiceTokenOnlyGuard(); + + it('allows internal service-token requests', () => { + expect( + guard.canActivate(contextFor({ isServiceToken: true })), + ).toBe(true); + }); + + it('rejects session requests', () => { + expect(() => + guard.canActivate( + contextFor({ authType: 'session', isApiKey: false, isServiceToken: false }), + ), + ).toThrow(ForbiddenException); + }); + + it('rejects API-key requests', () => { + expect(() => + guard.canActivate( + contextFor({ authType: 'api-key', isApiKey: true, isServiceToken: false }), + ), + ).toThrow(ForbiddenException); + }); + + it('rejects requests with no service-token flag set', () => { + expect(() => guard.canActivate(contextFor({}))).toThrow( + ForbiddenException, + ); + }); +}); diff --git a/apps/api/src/auth/service-token-only.guard.ts b/apps/api/src/auth/service-token-only.guard.ts new file mode 100644 index 0000000000..a3adaae49a --- /dev/null +++ b/apps/api/src/auth/service-token-only.guard.ts @@ -0,0 +1,30 @@ +import { + CanActivate, + ExecutionContext, + ForbiddenException, + Injectable, +} from '@nestjs/common'; +import { AuthenticatedRequest } from './types'; + +/** + * Guard that rejects everything except internal service-token auth. + * Use on internal-only endpoints that return sensitive data not meant for + * customers (e.g., minting temporary AWS credentials for background workers). + * + * Place between HybridAuthGuard and PermissionGuard: + * @UseGuards(HybridAuthGuard, ServiceTokenOnlyGuard, PermissionGuard) + */ +@Injectable() +export class ServiceTokenOnlyGuard implements CanActivate { + canActivate(context: ExecutionContext): boolean { + const request = context.switchToHttp().getRequest(); + + if (!request.isServiceToken) { + throw new ForbiddenException( + 'This endpoint is only available for internal service-token requests.', + ); + } + + return true; + } +} diff --git a/apps/api/src/cloud-security/cloud-security.controller.spec.ts b/apps/api/src/cloud-security/cloud-security.controller.spec.ts index a08dd6404b..3940fa8f21 100644 --- a/apps/api/src/cloud-security/cloud-security.controller.spec.ts +++ b/apps/api/src/cloud-security/cloud-security.controller.spec.ts @@ -16,7 +16,10 @@ import { Test, TestingModule } from '@nestjs/testing'; import { HttpException, HttpStatus } from '@nestjs/common'; import { CloudSecurityController } from './cloud-security.controller'; -import { CloudSecurityService } from './cloud-security.service'; +import { + CloudSecurityService, + ConnectionNotFoundError, +} from './cloud-security.service'; import { CloudSecurityQueryService } from './cloud-security-query.service'; import { CloudSecurityLegacyService } from './cloud-security-legacy.service'; import { CloudSecurityActivityService } from './cloud-security-activity.service'; @@ -29,6 +32,7 @@ import { CloudAwsScanModeService } from './aws-scan-mode.service'; import { ActingUserResolver } from '../auth/acting-user.service'; import { HybridAuthGuard } from '../auth/hybrid-auth.guard'; import { PermissionGuard } from '../auth/permission.guard'; +import { ServiceTokenOnlyGuard } from '../auth/service-token-only.guard'; import type { AuthenticatedRequest } from '../auth/types'; /** @@ -87,6 +91,8 @@ describe('CloudSecurityController — API-key mutation support', () => { .useValue(mockGuard) .overrideGuard(PermissionGuard) .useValue(mockGuard) + .overrideGuard(ServiceTokenOnlyGuard) + .useValue(mockGuard) .compile(); controller = module.get(CloudSecurityController); @@ -309,3 +315,88 @@ describe('CloudSecurityController — API-key mutation support', () => { }); }); }); + +describe('CloudSecurityController — resolveSession (internal)', () => { + let controller: CloudSecurityController; + let cloudSecurityService: { resolveAwsSession: jest.Mock }; + const mockGuard = { canActivate: jest.fn().mockReturnValue(true) }; + + beforeEach(async () => { + cloudSecurityService = { resolveAwsSession: jest.fn() }; + + const module: TestingModule = await Test.createTestingModule({ + controllers: [CloudSecurityController], + providers: [ + { provide: CloudSecurityService, useValue: cloudSecurityService }, + { provide: CloudSecurityQueryService, useValue: {} }, + { provide: CloudSecurityLegacyService, useValue: {} }, + { provide: CloudSecurityActivityService, useValue: {} }, + { provide: GCPSecurityService, useValue: {} }, + { provide: AzureSecurityService, useValue: {} }, + { provide: CheckDefinitionService, useValue: {} }, + { provide: CloudExceptionService, useValue: {} }, + { provide: CloudHistoryService, useValue: {} }, + { provide: CloudAwsScanModeService, useValue: {} }, + { provide: ActingUserResolver, useValue: {} }, + ], + }) + .overrideGuard(HybridAuthGuard) + .useValue(mockGuard) + .overrideGuard(PermissionGuard) + .useValue(mockGuard) + .overrideGuard(ServiceTokenOnlyGuard) + .useValue(mockGuard) + .compile(); + + controller = module.get(CloudSecurityController); + jest.clearAllMocks(); + }); + + it('returns the resolved session from the service', async () => { + const session = { + ok: true, + session: { + accessKeyId: 'AKIA_TEMP', + secretAccessKey: 'secret', + sessionToken: 'token', + }, + }; + cloudSecurityService.resolveAwsSession.mockResolvedValueOnce(session); + + const result = await controller.resolveSession('conn_1', 'org_1'); + + expect(cloudSecurityService.resolveAwsSession).toHaveBeenCalledWith( + 'conn_1', + 'org_1', + ); + expect(result).toEqual(session); + }); + + it('passes through not_configured / assume_failed results', async () => { + cloudSecurityService.resolveAwsSession.mockResolvedValueOnce({ + ok: false, + reason: 'assume_failed', + error: 'denied', + }); + + const result = await controller.resolveSession('conn_1', 'org_1'); + expect(result).toEqual({ + ok: false, + reason: 'assume_failed', + error: 'denied', + }); + }); + + it('maps ConnectionNotFoundError to a 404', async () => { + cloudSecurityService.resolveAwsSession.mockRejectedValueOnce( + new ConnectionNotFoundError(), + ); + + const error = await controller + .resolveSession('conn_missing', 'org_1') + .catch((e) => e); + + expect(error).toBeInstanceOf(HttpException); + expect((error as HttpException).getStatus()).toBe(HttpStatus.NOT_FOUND); + }); +}); diff --git a/apps/api/src/cloud-security/cloud-security.controller.ts b/apps/api/src/cloud-security/cloud-security.controller.ts index 28ed2f1f0d..c919d5d970 100644 --- a/apps/api/src/cloud-security/cloud-security.controller.ts +++ b/apps/api/src/cloud-security/cloud-security.controller.ts @@ -17,6 +17,7 @@ import { ApiOperation } from '@nestjs/swagger'; import { SkipThrottle } from '@nestjs/throttler'; import { HybridAuthGuard } from '../auth/hybrid-auth.guard'; import { PermissionGuard } from '../auth/permission.guard'; +import { ServiceTokenOnlyGuard } from '../auth/service-token-only.guard'; import { RequirePermission } from '../auth/require-permission.decorator'; import { OrganizationId } from '../auth/auth-context.decorator'; import { @@ -254,6 +255,31 @@ export class CloudSecurityController { return { data: definition }; } + @Post('resolve-session/:connectionId') + @SkipThrottle() + @UseGuards(HybridAuthGuard, ServiceTokenOnlyGuard, PermissionGuard) + @RequirePermission('integration', 'update') + @ApiOperation({ + summary: + 'Resolve short-lived AWS credentials for a connection (internal only)', + }) + async resolveSession( + @Param('connectionId') connectionId: string, + @OrganizationId() organizationId: string, + ) { + try { + return await this.cloudSecurityService.resolveAwsSession( + connectionId, + organizationId, + ); + } catch (error) { + if (error instanceof ConnectionNotFoundError) { + throw new HttpException('Connection not found', HttpStatus.NOT_FOUND); + } + throw error; + } + } + @Post('scan/:connectionId') @UseGuards(HybridAuthGuard, PermissionGuard) @RequirePermission('integration', 'update') diff --git a/apps/api/src/cloud-security/cloud-security.service.resolve-session.spec.ts b/apps/api/src/cloud-security/cloud-security.service.resolve-session.spec.ts new file mode 100644 index 0000000000..aa499ce846 --- /dev/null +++ b/apps/api/src/cloud-security/cloud-security.service.resolve-session.spec.ts @@ -0,0 +1,125 @@ +jest.mock('@db', () => ({ + db: { integrationConnection: { findFirst: jest.fn() } }, + Prisma: {}, +})); + +import { db } from '@db'; +import { + CloudSecurityService, + ConnectionNotFoundError, +} from './cloud-security.service'; + +type Ctor = ConstructorParameters; + +const findFirst = (db as unknown as { + integrationConnection: { findFirst: jest.Mock }; +}).integrationConnection.findFirst; + +describe('CloudSecurityService.resolveAwsSession', () => { + let credentialVault: { getDecryptedCredentials: jest.Mock }; + let awsService: { resolveRoleSession: jest.Mock }; + let service: CloudSecurityService; + + beforeEach(() => { + credentialVault = { getDecryptedCredentials: jest.fn() }; + awsService = { resolveRoleSession: jest.fn() }; + service = new CloudSecurityService( + credentialVault as unknown as Ctor[0], + {} as unknown as Ctor[1], + {} as unknown as Ctor[2], + awsService as unknown as Ctor[3], + {} as unknown as Ctor[4], + {} as unknown as Ctor[5], + ); + jest.clearAllMocks(); + }); + + it('scopes the connection lookup by organizationId (tenant isolation)', async () => { + findFirst.mockResolvedValueOnce(null); + + await expect( + service.resolveAwsSession('conn_1', 'org_1'), + ).rejects.toBeInstanceOf(ConnectionNotFoundError); + + expect(findFirst).toHaveBeenCalledWith( + expect.objectContaining({ + where: { id: 'conn_1', organizationId: 'org_1', status: 'active' }, + }), + ); + }); + + it('returns not_configured for a non-AWS provider (never assumes)', async () => { + findFirst.mockResolvedValueOnce({ provider: { slug: 'gcp' } }); + + const result = await service.resolveAwsSession('conn_1', 'org_1'); + + expect(result).toEqual({ ok: false, reason: 'not_configured' }); + expect(awsService.resolveRoleSession).not.toHaveBeenCalled(); + }); + + it('returns not_configured when roleArn/externalId are missing', async () => { + findFirst.mockResolvedValueOnce({ + provider: { slug: 'aws' }, + variables: {}, + }); + credentialVault.getDecryptedCredentials.mockResolvedValueOnce({ + externalId: 'eid', // roleArn missing + }); + + const result = await service.resolveAwsSession('conn_1', 'org_1'); + + expect(result).toEqual({ ok: false, reason: 'not_configured' }); + expect(awsService.resolveRoleSession).not.toHaveBeenCalled(); + }); + + it('returns the resolved session on success', async () => { + findFirst.mockResolvedValueOnce({ + provider: { slug: 'aws' }, + variables: { regions: ['us-east-1'] }, + }); + credentialVault.getDecryptedCredentials.mockResolvedValueOnce({ + roleArn: 'arn:aws:iam::123456789012:role/x', + externalId: 'eid', + regions: ['us-east-1'], + }); + awsService.resolveRoleSession.mockResolvedValueOnce({ + accessKeyId: 'AKIA_TEMP', + secretAccessKey: 'secret', + sessionToken: 'token', + }); + + const result = await service.resolveAwsSession('conn_1', 'org_1'); + + expect(result).toEqual({ + ok: true, + session: { + accessKeyId: 'AKIA_TEMP', + secretAccessKey: 'secret', + sessionToken: 'token', + }, + }); + }); + + it('returns assume_failed with the real reason when the assume throws', async () => { + findFirst.mockResolvedValueOnce({ + provider: { slug: 'aws' }, + variables: {}, + }); + credentialVault.getDecryptedCredentials.mockResolvedValueOnce({ + roleArn: 'arn:aws:iam::123456789012:role/x', + externalId: 'eid', + regions: ['us-east-1'], + }); + awsService.resolveRoleSession.mockRejectedValueOnce( + new Error('not authorized to perform sts:AssumeRole'), + ); + + const result = await service.resolveAwsSession('conn_1', 'org_1'); + + expect(result).toEqual({ + ok: false, + reason: 'assume_failed', + error: 'not authorized to perform sts:AssumeRole', + }); + }); +}); diff --git a/apps/api/src/cloud-security/cloud-security.service.ts b/apps/api/src/cloud-security/cloud-security.service.ts index 38de61ce74..f3b5b0fccf 100644 --- a/apps/api/src/cloud-security/cloud-security.service.ts +++ b/apps/api/src/cloud-security/cloud-security.service.ts @@ -32,6 +32,23 @@ export interface ScanResult { error?: string; } +/** + * Outcome of resolving short-lived AWS session credentials for a connection. + * `not_configured` = nothing to do (check should no-op); `assume_failed` = the + * assume genuinely failed and the check should surface a finding. + */ +export type ResolveAwsSessionResult = + | { + ok: true; + session: { + accessKeyId: string; + secretAccessKey: string; + sessionToken: string; + }; + } + | { ok: false; reason: 'not_configured' } + | { ok: false; reason: 'assume_failed'; error: string }; + export class ConnectionNotFoundError extends Error { constructor() { super('Connection not found'); @@ -392,6 +409,59 @@ export class CloudSecurityService { } } + /** + * Resolve short-lived, customer-scoped AWS credentials for a connection's + * IAM role — performed here in ECS, which holds the roleAssumer task role and + * the SECURITY_HUB_ROLE_ASSUMER_ARN env. + * + * The Cloud Tests CHECK path runs in the Trigger.dev runtime, which has no + * base AWS credentials or roleAssumer ARN, so it cannot assume the + * cross-account role itself. It calls this (via the internal `resolve-session` + * endpoint) and injects the returned temp creds into the check credentials, + * so the cross-tenant master credential never leaves ECS. + * + * Org-scoped: a service-token caller cannot resolve another org's connection. + */ + async resolveAwsSession( + connectionId: string, + organizationId: string, + ): Promise { + const connection = await db.integrationConnection.findFirst({ + where: { id: connectionId, organizationId, status: 'active' }, + include: { provider: true }, + }); + + if (!connection) { + throw new ConnectionNotFoundError(); + } + + if (connection.provider.slug !== 'aws') { + return { ok: false, reason: 'not_configured' }; + } + + const decrypted = + await this.credentialVaultService.getDecryptedCredentials(connectionId); + if (!decrypted || !(decrypted.roleArn && decrypted.externalId)) { + return { ok: false, reason: 'not_configured' }; + } + + const variables = (connection.variables as Record) || {}; + + try { + const session = await this.awsService.resolveRoleSession( + decrypted, + variables, + ); + return { ok: true, session }; + } catch (error) { + return { + ok: false, + reason: 'assume_failed', + error: error instanceof Error ? error.message : String(error), + }; + } + } + /** * Detect which AWS services are actively used (via Cost Explorer). * Saves detected services to connection variables for the frontend. diff --git a/apps/api/src/cloud-security/providers/aws-security.service.ts b/apps/api/src/cloud-security/providers/aws-security.service.ts index de43d89555..58088ca135 100644 --- a/apps/api/src/cloud-security/providers/aws-security.service.ts +++ b/apps/api/src/cloud-security/providers/aws-security.service.ts @@ -1,5 +1,6 @@ import { Injectable, Logger } from '@nestjs/common'; import { AssumeRoleCommand, STSClient } from '@aws-sdk/client-sts'; +import { retryAssume } from '@trycompai/integration-platform'; import { CostExplorerClient, GetCostAndUsageCommand, @@ -252,6 +253,60 @@ export class AWSSecurityService { }); } + /** + * Resolve short-lived, customer-scoped AWS credentials for a role-auth + * connection, using the SAME partition/region normalization and two-hop + * assume as a scan. Returns ONLY the temporary session credentials. + * + * This exists so the Cloud Tests CHECK path can perform the cross-account + * assume in ECS (which holds the roleAssumer task role + env), instead of in + * the Trigger.dev runtime where no base AWS credentials or roleAssumer ARN + * are available. Throws when the connection is not role-auth or the assume + * fails — the caller maps those to "not_configured" / "assume_failed". + */ + async resolveRoleSession( + credentials: Record, + variables: Record, + ): Promise<{ + accessKeyId: string; + secretAccessKey: string; + sessionToken: string; + }> { + const isRoleAuth = Boolean(credentials.roleArn && credentials.externalId); + if (!isRoleAuth) { + throw new Error( + 'AWS connection is not configured for IAM role access (roleArn/externalId missing).', + ); + } + + const partition = normalizeAwsPartition( + credentials.awsType ?? variables.awsType, + ); + const configuredRegions = this.getConfiguredRegions( + credentials, + variables, + partition, + ); + + const session = await this.resolveRoleCredentials({ + roleArn: credentials.roleArn as string, + externalId: credentials.externalId as string, + partition, + regions: configuredRegions, + primaryRegion: configuredRegions[0], + }); + + if (!session.sessionToken) { + throw new Error('Assumed role returned no session token.'); + } + + return { + accessKeyId: session.accessKeyId, + secretAccessKey: session.secretAccessKey, + sessionToken: session.sessionToken, + }; + } + /** * Today's default scan engine. Runs each registered service adapter * (global once, regional per region) and aggregates the findings. @@ -514,12 +569,14 @@ export class AWSSecurityService { region, credentials: getAwsBaseCredentials(partition), }); - const roleAssumerResp = await baseSts.send( - new AssumeRoleCommand({ - RoleArn: roleAssumerArn, - RoleSessionName: 'CompRoleAssumer', - DurationSeconds: 3600, - }), + const roleAssumerResp = await retryAssume(() => + baseSts.send( + new AssumeRoleCommand({ + RoleArn: roleAssumerArn, + RoleSessionName: 'CompRoleAssumer', + DurationSeconds: 3600, + }), + ), ); const roleAssumerCreds = roleAssumerResp.Credentials; @@ -541,13 +598,15 @@ export class AWSSecurityService { this.logger.log(`Assuming customer role ${roleArn} in region ${region}`); - const customerResp = await roleAssumerSts.send( - new AssumeRoleCommand({ - RoleArn: roleArn, - ExternalId: externalId, - RoleSessionName: sessionName ?? 'CompSecurityAudit', - DurationSeconds: 3600, - }), + const customerResp = await retryAssume(() => + roleAssumerSts.send( + new AssumeRoleCommand({ + RoleArn: roleArn, + ExternalId: externalId, + RoleSessionName: sessionName ?? 'CompSecurityAudit', + DurationSeconds: 3600, + }), + ), ); const customerCreds = customerResp.Credentials; diff --git a/apps/api/src/trigger/integration-platform/checks-aws-session.ts b/apps/api/src/trigger/integration-platform/checks-aws-session.ts new file mode 100644 index 0000000000..ffb032614b --- /dev/null +++ b/apps/api/src/trigger/integration-platform/checks-aws-session.ts @@ -0,0 +1,125 @@ +import { logger } from '@trigger.dev/sdk'; +import type { IntegrationCredentialValues } from './ensure-valid-credentials'; + +const RESOLVE_SESSION_TIMEOUT_MS = 30_000; + +type ResolveSessionResponse = + | { + ok: true; + session: { + accessKeyId: string; + secretAccessKey: string; + sessionToken: string; + }; + } + | { ok: false; reason: 'not_configured' } + | { ok: false; reason: 'assume_failed'; error?: string }; + +/** + * Credential keys injected by {@link injectAwsResolvedSession} and consumed by + * the AWS manifest checks (`assumeAwsSession` in the integration-platform + * package). Underscore-prefixed so they never collide with real AWS connection + * credential fields (roleArn, externalId, regions, awsType, remediationRoleArn). + */ +export const RESOLVED_AWS_SESSION_KEYS = { + accessKeyId: '__resolvedAccessKeyId', + secretAccessKey: '__resolvedSecretAccessKey', + sessionToken: '__resolvedSessionToken', + error: '__resolvedSessionError', +} as const; + +/** + * For AWS connections, resolve the cross-account session in ECS (which holds the + * roleAssumer task role + `SECURITY_HUB_ROLE_ASSUMER_ARN`) and inject the + * resulting short-lived, customer-scoped credentials into `credentials`. + * + * Why: the Cloud Tests CHECK path runs inside the Trigger.dev runtime, which has + * no base AWS credentials or roleAssumer ARN, so it cannot perform the two-hop + * assume itself. Resolving in ECS keeps the cross-tenant master credential out + * of Trigger.dev; the check just consumes the temp creds. + * + * On a genuine assume failure (or any transport error) an error marker is + * injected so the AWS check surfaces a real "Could not assume AWS role" finding + * with the true reason, rather than silently failing or falsely passing. + * Non-AWS providers and not-configured connections are left untouched. + * + * Mutates and returns the same credentials object. + */ +export async function injectAwsResolvedSession(params: { + credentials: IntegrationCredentialValues; + apiUrl: string; + connectionId: string; + organizationId: string; + providerSlug: string; +}): Promise { + const { credentials, apiUrl, connectionId, organizationId, providerSlug } = + params; + + if (providerSlug !== 'aws') return credentials; + + const serviceToken = process.env.SERVICE_TOKEN_TRIGGER; + if (!serviceToken) { + credentials[RESOLVED_AWS_SESSION_KEYS.error] = + 'SERVICE_TOKEN_TRIGGER is not configured'; + return credentials; + } + + const abortController = new AbortController(); + const timeoutId = setTimeout( + () => abortController.abort(), + RESOLVE_SESSION_TIMEOUT_MS, + ); + + try { + const response = await fetch( + `${apiUrl}/v1/cloud-security/resolve-session/${connectionId}`, + { + method: 'POST', + signal: abortController.signal, + headers: { + 'Content-Type': 'application/json', + 'x-service-token': serviceToken, + 'x-organization-id': organizationId, + }, + }, + ); + + if (!response.ok) { + credentials[RESOLVED_AWS_SESSION_KEYS.error] = + `Could not resolve AWS session (status ${response.status}).`; + return credentials; + } + + const result = (await response.json()) as ResolveSessionResponse; + + if (result.ok) { + credentials[RESOLVED_AWS_SESSION_KEYS.accessKeyId] = + result.session.accessKeyId; + credentials[RESOLVED_AWS_SESSION_KEYS.secretAccessKey] = + result.session.secretAccessKey; + credentials[RESOLVED_AWS_SESSION_KEYS.sessionToken] = + result.session.sessionToken; + logger.info('Resolved AWS session via ECS for connection', { + connectionId, + }); + return credentials; + } + + if (result.reason === 'assume_failed') { + credentials[RESOLVED_AWS_SESSION_KEYS.error] = + result.error || 'The cross-account IAM role could not be assumed.'; + } + // not_configured -> inject nothing; the check no-ops naturally. + return credentials; + } catch (error) { + credentials[RESOLVED_AWS_SESSION_KEYS.error] = abortController.signal + .aborted + ? `Timed out resolving AWS session after ${RESOLVE_SESSION_TIMEOUT_MS}ms.` + : error instanceof Error + ? error.message + : String(error); + return credentials; + } finally { + clearTimeout(timeoutId); + } +} diff --git a/apps/api/src/trigger/integration-platform/run-connection-checks.ts b/apps/api/src/trigger/integration-platform/run-connection-checks.ts index 7523b63b9a..e7515540ed 100644 --- a/apps/api/src/trigger/integration-platform/run-connection-checks.ts +++ b/apps/api/src/trigger/integration-platform/run-connection-checks.ts @@ -6,6 +6,7 @@ import { requestValidCredentials, type IntegrationCredentialValues, } from './ensure-valid-credentials'; +import { injectAwsResolvedSession } from './checks-aws-session'; /** * Trigger task that runs all checks for a connection. @@ -147,6 +148,17 @@ export const runConnectionChecks = task({ return { success: false, error: 'No credentials found' }; } + // For AWS, resolve the cross-account session in ECS and inject the temp + // creds — the checks run in the Trigger.dev runtime, which cannot assume the + // role itself (no base creds / roleAssumer ARN there). + credentials = await injectAwsResolvedSession({ + credentials, + apiUrl, + connectionId, + organizationId, + providerSlug, + }); + const variables = (connection.variables as Record< string, diff --git a/apps/api/src/trigger/integration-platform/run-task-integration-checks.ts b/apps/api/src/trigger/integration-platform/run-task-integration-checks.ts index 8d697fe3dc..4d68611326 100644 --- a/apps/api/src/trigger/integration-platform/run-task-integration-checks.ts +++ b/apps/api/src/trigger/integration-platform/run-task-integration-checks.ts @@ -10,6 +10,7 @@ import { requestValidCredentials, type IntegrationCredentialValues, } from './ensure-valid-credentials'; +import { injectAwsResolvedSession } from './checks-aws-session'; /** * Send email notifications for task status change @@ -292,6 +293,17 @@ export const runTaskIntegrationChecks = task({ }; } + // For AWS, resolve the cross-account session in ECS and inject the temp + // creds — the checks run in the Trigger.dev runtime, which cannot assume the + // role itself (no base creds / roleAssumer ARN there). + credentials = await injectAwsResolvedSession({ + credentials, + apiUrl, + connectionId, + organizationId, + providerSlug, + }); + const variables = (connection.variables as Record< string, diff --git a/packages/integration-platform/src/index.ts b/packages/integration-platform/src/index.ts index 6131578d9c..b50fed9c72 100644 --- a/packages/integration-platform/src/index.ts +++ b/packages/integration-platform/src/index.ts @@ -145,6 +145,13 @@ export { } from './manifests/aws/credentials'; export type { AwsEnvironment } from './manifests/aws/credentials'; +// Shared AWS STS AssumeRole retry (transient / IAM-eventual-consistency safe), +// reused by the Cloud Tests scanner in apps/api. +export { + retryAssume, + isRetryableAssumeError, +} from './manifests/aws/checks/assume-retry'; + // API Response types (for frontend and API type sharing) export type { diff --git a/packages/integration-platform/src/manifests/aws/checks/__tests__/assume-retry.test.ts b/packages/integration-platform/src/manifests/aws/checks/__tests__/assume-retry.test.ts new file mode 100644 index 0000000000..bb2eed1e8c --- /dev/null +++ b/packages/integration-platform/src/manifests/aws/checks/__tests__/assume-retry.test.ts @@ -0,0 +1,89 @@ +import { describe, expect, it } from 'bun:test'; +import { isRetryableAssumeError, retryAssume } from '../assume-retry'; + +// Fast, deterministic options for tests — never actually sleep. +const fastOpts = { + sleep: async () => {}, + random: () => 0, + baseDelayMs: 1, + maxDelayMs: 1, +}; + +function awsError(name: string, extra: Record = {}): Error { + return Object.assign(new Error(`${name} occurred`), { name, ...extra }); +} + +describe('isRetryableAssumeError', () => { + it('retries IAM/STS eventual-consistency AccessDenied (the customer bug)', () => { + expect(isRetryableAssumeError(awsError('AccessDenied'))).toBe(true); + expect(isRetryableAssumeError(awsError('AccessDeniedException'))).toBe(true); + }); + + it('retries expired-token, throttling, 5xx and network errors', () => { + expect(isRetryableAssumeError(awsError('ExpiredToken'))).toBe(true); + expect(isRetryableAssumeError(awsError('ThrottlingException'))).toBe(true); + expect( + isRetryableAssumeError({ name: 'X', $metadata: { httpStatusCode: 503 } }), + ).toBe(true); + expect(isRetryableAssumeError(new Error('socket hang up'))).toBe(true); + }); + + it('does NOT retry hard configuration errors', () => { + expect(isRetryableAssumeError(awsError('ValidationError'))).toBe(false); + expect( + isRetryableAssumeError(new Error('Invalid IAM Role ARN format')), + ).toBe(false); + expect(isRetryableAssumeError(null)).toBe(false); + expect(isRetryableAssumeError(undefined)).toBe(false); + }); +}); + +describe('retryAssume', () => { + it('retries a transient AccessDenied then succeeds (no AWS change needed)', async () => { + let calls = 0; + const result = await retryAssume(async () => { + calls++; + if (calls < 3) throw awsError('AccessDenied'); + return 'creds'; + }, fastOpts); + expect(result).toBe('creds'); + expect(calls).toBe(3); + }); + + it('does not retry a non-transient error (single attempt)', async () => { + let calls = 0; + await expect( + retryAssume(async () => { + calls++; + throw awsError('ValidationError'); + }, fastOpts), + ).rejects.toThrow(); + expect(calls).toBe(1); + }); + + it('gives up after the configured attempts on a persistent transient error', async () => { + let calls = 0; + await expect( + retryAssume( + async () => { + calls++; + throw awsError('AccessDenied'); + }, + { ...fastOpts, attempts: 4 }, + ), + ).rejects.toThrow(); + expect(calls).toBe(4); + }); + + it('succeeds on the first attempt without sleeping', async () => { + let slept = false; + const result = await retryAssume(async () => 'ok', { + ...fastOpts, + sleep: async () => { + slept = true; + }, + }); + expect(result).toBe('ok'); + expect(slept).toBe(false); + }); +}); diff --git a/packages/integration-platform/src/manifests/aws/checks/__tests__/aws-checks.test.ts b/packages/integration-platform/src/manifests/aws/checks/__tests__/aws-checks.test.ts index 19259d9698..51bc6b6b70 100644 --- a/packages/integration-platform/src/manifests/aws/checks/__tests__/aws-checks.test.ts +++ b/packages/integration-platform/src/manifests/aws/checks/__tests__/aws-checks.test.ts @@ -14,10 +14,78 @@ import { evaluateRdsEncryption, } from '../rds'; import { evaluateS3Encryption, evaluateS3PublicAccess } from '../s3'; -import { resolveAwsCredentialInputs } from '../shared'; +import { assumeAwsSession, resolveAwsCredentialInputs } from '../shared'; const kinds = (os: { kind: string }[]) => os.map((o) => o.kind); +// Minimal CheckContext stub — assumeAwsSession only reads ctx.credentials. +const ctxWith = (credentials: Record) => + ({ credentials }) as unknown as Parameters[0]; + +describe('assumeAwsSession — ECS-resolved session injection (CHECK path)', () => { + const base = { + roleArn: 'arn:aws:iam::123456789012:role/x', + externalId: 'eid', + regions: ['us-east-1', 'eu-west-1'], + }; + + it('uses injected ECS-resolved credentials directly (no STS, no env needed)', async () => { + const session = await assumeAwsSession( + ctxWith({ + ...base, + __resolvedAccessKeyId: 'AKIA_TEMP', + __resolvedSecretAccessKey: 'secret_temp', + __resolvedSessionToken: 'token_temp', + }), + ); + expect(session).toEqual({ + credentials: { + accessKeyId: 'AKIA_TEMP', + secretAccessKey: 'secret_temp', + sessionToken: 'token_temp', + }, + regions: ['us-east-1', 'eu-west-1'], + }); + }); + + it('throws the injected error so the caller surfaces the real reason', async () => { + await expect( + assumeAwsSession( + ctxWith({ + ...base, + __resolvedSessionError: 'The cross-account IAM role could not be assumed.', + }), + ), + ).rejects.toThrow('The cross-account IAM role could not be assumed.'); + }); + + it('returns null for a not-configured connection even if creds are injected', async () => { + const session = await assumeAwsSession( + ctxWith({ + externalId: 'eid', // roleArn missing -> not configured + __resolvedAccessKeyId: 'AKIA_TEMP', + __resolvedSecretAccessKey: 'secret_temp', + __resolvedSessionToken: 'token_temp', + }), + ); + expect(session).toBeNull(); + }); + + it('falls back to the in-runtime two-hop when nothing is injected (ECS/dev path)', async () => { + const prev = process.env.SECURITY_HUB_ROLE_ASSUMER_ARN; + delete process.env.SECURITY_HUB_ROLE_ASSUMER_ARN; + try { + // No injected creds + no roleAssumer env -> the fallback runs and fails + // fast (before any STS call), proving the original path is preserved. + await expect(assumeAwsSession(ctxWith(base))).rejects.toThrow( + /SECURITY_HUB_ROLE_ASSUMER_ARN/, + ); + } finally { + if (prev !== undefined) process.env.SECURITY_HUB_ROLE_ASSUMER_ARN = prev; + } + }); +}); + describe('AWS credential resolution (regions shape)', () => { const base = { roleArn: 'arn:aws:iam::123456789012:role/x', externalId: 'eid' }; diff --git a/packages/integration-platform/src/manifests/aws/checks/assume-retry.ts b/packages/integration-platform/src/manifests/aws/checks/assume-retry.ts new file mode 100644 index 0000000000..d81fe7795b --- /dev/null +++ b/packages/integration-platform/src/manifests/aws/checks/assume-retry.ts @@ -0,0 +1,106 @@ +/** + * Bounded retry for AWS STS `AssumeRole` calls. + * + * Why this exists: assuming a customer's cross-account role is a two-hop STS + * flow, and the FIRST attempt right after a connection is set up (or its trust + * policy edited) can fail transiently with NO change in AWS — IAM/STS is + * eventually consistent, so a brand-new/edited role can return `AccessDenied` + * on the first assume and then succeed seconds later. The base/roleAssumer + * session can likewise briefly return `ExpiredToken`. None of these classes are + * retried by the AWS SDK's default strategy (it only retries throttling/5xx/ + * network), so a single transient failure surfaced to customers as a sticky + * "Could not assume AWS role" finding that "fixed itself" on the next run. + * + * This helper retries only those transient/eventually-consistent classes with + * capped exponential backoff + jitter. Hard configuration errors (bad ARN, + * ValidationError) and a role that is *persistently* denied still propagate — + * the backoff is capped so a genuinely broken role fails within a few seconds. + */ + +const RETRYABLE_ASSUME_ERROR_NAMES = new Set([ + // IAM/STS eventual consistency on a new or just-edited cross-account role. + 'AccessDenied', + 'AccessDeniedException', + // Base/roleAssumer session momentarily expired, or an STS hiccup. + 'ExpiredToken', + 'ExpiredTokenException', + 'IDPCommunicationError', + 'RegionDisabledException', + // Standard transient classes (belt-and-suspenders alongside the SDK). + 'Throttling', + 'ThrottlingException', + 'TooManyRequestsException', + 'RequestLimitExceeded', + 'ServiceUnavailable', + 'ServiceUnavailableException', + 'InternalError', + 'InternalFailure', +]); + +export function isRetryableAssumeError(error: unknown): boolean { + if (!error || typeof error !== 'object') return false; + const e = error as { + name?: string; + Code?: string; + code?: string; + __type?: string; + message?: string; + $metadata?: { httpStatusCode?: number }; + }; + const name = e.name ?? e.Code ?? e.code ?? e.__type ?? ''; + if (RETRYABLE_ASSUME_ERROR_NAMES.has(name)) return true; + + const status = e.$metadata?.httpStatusCode; + if (typeof status === 'number' && status >= 500) return true; + + return /ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|socket hang up|network|timeout/i.test( + e.message ?? '', + ); +} + +export interface RetryAssumeOptions { + /** Total attempts including the first (default 4). */ + attempts?: number; + /** Base backoff in ms; grows exponentially, capped by maxDelayMs (default 500). */ + baseDelayMs?: number; + /** Maximum backoff per attempt in ms (default 5000). */ + maxDelayMs?: number; + /** Injectable sleep so tests don't actually wait. */ + sleep?: (ms: number) => Promise; + /** Injectable jitter source in [0, 1); defaults to Math.random. */ + random?: () => number; +} + +const defaultSleep = (ms: number): Promise => + new Promise((resolve) => setTimeout(resolve, ms)); + +/** + * Run `fn` (an AssumeRole call) with bounded exponential backoff + jitter, + * retrying only transient/eventually-consistent failures. Non-transient errors + * propagate immediately; transient ones that never recover propagate after the + * attempts are exhausted. + */ +export async function retryAssume( + fn: () => Promise, + options: RetryAssumeOptions = {}, +): Promise { + const attempts = Math.max(1, options.attempts ?? 4); + const baseDelayMs = options.baseDelayMs ?? 500; + const maxDelayMs = options.maxDelayMs ?? 5000; + const sleep = options.sleep ?? defaultSleep; + const random = options.random ?? Math.random; + + let lastError: unknown; + for (let attempt = 0; attempt < attempts; attempt++) { + try { + return await fn(); + } catch (error) { + lastError = error; + const isLastAttempt = attempt === attempts - 1; + if (isLastAttempt || !isRetryableAssumeError(error)) throw error; + const backoff = Math.min(maxDelayMs, baseDelayMs * 2 ** attempt); + await sleep(backoff + Math.floor(backoff * 0.5 * random())); + } + } + throw lastError; +} diff --git a/packages/integration-platform/src/manifests/aws/checks/shared.ts b/packages/integration-platform/src/manifests/aws/checks/shared.ts index 8a7e48e6eb..7581ca024d 100644 --- a/packages/integration-platform/src/manifests/aws/checks/shared.ts +++ b/packages/integration-platform/src/manifests/aws/checks/shared.ts @@ -1,5 +1,6 @@ import { AssumeRoleCommand, STSClient } from '@aws-sdk/client-sts'; import type { CheckContext, FindingSeverity } from '../../../types'; +import { retryAssume } from './assume-retry'; export interface AwsSession { credentials: { @@ -47,6 +48,43 @@ export function resolveAwsCredentialInputs( return { roleArn, externalId, regions }; } +/** + * Short-lived AWS session credentials (or an error) injected into the connection + * credentials by the Cloud Tests CHECK runner (apps/api `checks-aws-session.ts`) + * when the cross-account assume was performed in ECS. The underscore-prefixed + * keys never collide with real connection fields. When present, the check uses + * these directly instead of assuming the role itself — it runs in the Trigger.dev + * runtime, which has no base AWS credentials or roleAssumer ARN. + */ +function readInjectedAwsSession( + credentials: Record, +): + | { + credentials: { + accessKeyId: string; + secretAccessKey: string; + sessionToken: string; + }; + } + | { error: string } + | null { + const error = credentials.__resolvedSessionError; + if (typeof error === 'string' && error.length > 0) { + return { error }; + } + const accessKeyId = credentials.__resolvedAccessKeyId; + const secretAccessKey = credentials.__resolvedSecretAccessKey; + const sessionToken = credentials.__resolvedSessionToken; + if ( + typeof accessKeyId === 'string' && + typeof secretAccessKey === 'string' && + typeof sessionToken === 'string' + ) { + return { credentials: { accessKeyId, secretAccessKey, sessionToken } }; + } + return null; +} + type AwsPartition = 'aws' | 'aws-us-gov'; function awsPartitionForRegion(region: string): AwsPartition { @@ -95,6 +133,18 @@ export async function assumeAwsSession( if (!inputs) return null; const { roleArn, externalId, regions } = inputs; + // If the CHECK runner already resolved a session in ECS (the cross-account + // assume cannot run in the Trigger.dev runtime, which lacks base AWS creds and + // the roleAssumer ARN), use it directly. An injected error surfaces the real + // failure reason via the caller's "Could not assume AWS role" finding. + const injected = readInjectedAwsSession( + ctx.credentials as Record, + ); + if (injected) { + if ('error' in injected) throw new Error(injected.error); + return { credentials: injected.credentials, regions }; + } + // IAM is global — assume once in the first region; the creds work everywhere. const region = regions[0]; const partition = awsPartitionForRegion(region); @@ -113,12 +163,14 @@ export async function assumeAwsSession( region, credentials: awsBaseCredentials(partition), }); - const assumerResp = await baseSts.send( - new AssumeRoleCommand({ - RoleArn: roleAssumerArn, - RoleSessionName: 'CompRoleAssumer', - DurationSeconds: 3600, - }), + const assumerResp = await retryAssume(() => + baseSts.send( + new AssumeRoleCommand({ + RoleArn: roleAssumerArn, + RoleSessionName: 'CompRoleAssumer', + DurationSeconds: 3600, + }), + ), ); const assumer = assumerResp.Credentials; if ( @@ -139,13 +191,15 @@ export async function assumeAwsSession( sessionToken: assumer.SessionToken, }, }); - const res = await assumerSts.send( - new AssumeRoleCommand({ - RoleArn: roleArn, - ExternalId: externalId, - RoleSessionName: 'CompEvidenceCheck', - DurationSeconds: 3600, - }), + const res = await retryAssume(() => + assumerSts.send( + new AssumeRoleCommand({ + RoleArn: roleArn, + ExternalId: externalId, + RoleSessionName: 'CompEvidenceCheck', + DurationSeconds: 3600, + }), + ), ); const c = res.Credentials; if (!c?.AccessKeyId || !c.SecretAccessKey || !c.SessionToken) return null;