diff --git a/.server-changes/verify-deployment-image-before-finalize.md b/.server-changes/verify-deployment-image-before-finalize.md new file mode 100644 index 00000000000..f821da49443 --- /dev/null +++ b/.server-changes/verify-deployment-image-before-finalize.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Verify a deployment's image exists in the registry before marking it deployed, so a deploy whose image wasn't pushed fails instead of silently breaking runs (can be turned off via `DEPLOY_IMAGE_VERIFICATION_ENABLED=0` for setups that push images out of band) diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 2da52942477..e793cd086ba 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -553,6 +553,9 @@ const EnvironmentSchema = z // log-only mode before enforcement. DEPRECATE_V3_CLI_DEPLOYS_ENABLED: z.string().default("0"), + // Verify the deploy image exists before promoting. Disable for out-of-band/air-gapped push. ECR only. + DEPLOY_IMAGE_VERIFICATION_ENABLED: BoolEnv.default(true), + OBJECT_STORE_BASE_URL: z.string().optional(), OBJECT_STORE_BUCKET: z.string().optional(), OBJECT_STORE_ACCESS_KEY_ID: z.string().optional(), diff --git a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts index 7ca8a379a3d..464dd8aabef 100644 --- a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts +++ b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts @@ -16,6 +16,7 @@ import { getEcrAuthToken, isEcrRegistry } from "../getDeploymentImageRef.server" import { tryCatch } from "@trigger.dev/core"; import { getRegistryConfig, type RegistryConfig } from "../registryConfig.server"; import { ComputeTemplateCreationService } from "./computeTemplateCreation.server"; +import { ecrImageExists } from "./verifyDeploymentImage.server"; export class FinalizeDeploymentV2Service extends BaseService { public async call( @@ -75,6 +76,11 @@ export class FinalizeDeploymentV2Service extends BaseService { }); } + // The CLI claims the image is already in the registry (local build, or a + // self-hosted setup). Verify before promoting so we never mark a + // deployment DEPLOYED when nothing was actually pushed. + await this.#assertImagePullable(deployment, body); + await this.#createTemplateIfNeeded(deployment, id, authenticatedEnv, writer); return finalizeService.call(authenticatedEnv, id, body); } @@ -142,10 +148,53 @@ export class FinalizeDeploymentV2Service extends BaseService { pushedImage: pushResult.image, }); + // Belt and suspenders: confirm the push actually landed before promoting. + await this.#assertImagePullable(deployment, body); + await this.#createTemplateIfNeeded(deployment, id, authenticatedEnv, writer); return finalizeService.call(authenticatedEnv, id, body); } + async #assertImagePullable( + deployment: { imageReference: string | null; type: string | null }, + body: FinalizeDeploymentRequestBody + ): Promise { + if (!env.DEPLOY_IMAGE_VERIFICATION_ENABLED) { + return; + } + + if (!deployment.imageReference) { + return; + } + + const registryConfig = getRegistryConfig(deployment.type === "MANAGED"); + + // ECR-only: non-ECR (self-hosted) registries can't be checked this way, so skip. + if (!isEcrRegistry(registryConfig.host)) { + return; + } + + const result = await ecrImageExists({ + imageReference: deployment.imageReference, + imageDigest: body.imageDigest, + registryConfig, + }); + + if (result === "missing") { + throw new ServiceValidationError( + "Deployment image was not found in the registry. It may not have been pushed (for example a local build without a push, or a push to a different registry). Aborting the deploy to avoid promoting a version that cannot start." + ); + } + + // Fail closed: if we can't confirm the image is present, don't promote a version + // that might not start. Set DEPLOY_IMAGE_VERIFICATION_ENABLED=0 for out-of-band pushes. + if (result === "unknown") { + throw new ServiceValidationError( + "Could not verify the deployment image exists in the registry. Aborting the deploy." + ); + } + } + async #createTemplateIfNeeded( deployment: { imageReference: string | null; worker: { project: { id: string } } | null }, deploymentFriendlyId: string, diff --git a/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts b/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts new file mode 100644 index 00000000000..7062f89d88e --- /dev/null +++ b/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts @@ -0,0 +1,187 @@ +import { + BatchGetImageCommand, + type BatchGetImageCommandOutput, + RepositoryNotFoundException, +} from "@aws-sdk/client-ecr"; +import { tryCatch } from "@trigger.dev/core"; +import pRetry, { AbortError } from "p-retry"; +import { logger } from "~/services/logger.server"; +import { + type AssumeRoleConfig, + createEcrClient, + isEcrRegistry, + parseEcrRegistryDomain, +} from "../getDeploymentImageRef.server"; +import { type RegistryConfig } from "../registryConfig.server"; + +const SHA256_DIGEST = /^sha256:[a-f0-9]{64}$/; + +export type ImageLookupResult = "found" | "missing" | "unknown"; + +/** + * Split a stored ECR image reference into repository + tag. + * + * Trust boundary: the ref is platform-generated, but we still bind the lookup to + * our configured host (region/account come from the env host) and only parse refs + * that sit under it. Returns null otherwise. + */ +export function parseEcrImageReference( + imageReference: string, + registryHost: string +): { repositoryName: string; tag: string } | null { + const prefix = `${registryHost}/`; + if (!imageReference.startsWith(prefix)) { + return null; + } + + // namespace/projectRef:tag, optionally @sha256:... which we drop here + const remainder = imageReference.slice(prefix.length).split("@")[0]; + const lastColon = remainder.lastIndexOf(":"); + + if (lastColon <= 0) { + return null; + } + + const repositoryName = remainder.slice(0, lastColon); + const tag = remainder.slice(lastColon + 1); + + if (!repositoryName || !tag || tag.includes("/")) { + return null; + } + + return { repositoryName, tag }; +} + +export function interpretBatchGetImageResponse( + response: BatchGetImageCommandOutput +): ImageLookupResult { + if (response.images && response.images.length > 0) { + return "found"; + } + + if (response.failures?.some((failure) => failure.failureCode === "ImageNotFound")) { + return "missing"; + } + + // No image and no explicit not-found failure (some other failure code) - + // we can't say it's missing, so don't block the deploy on it. + return "unknown"; +} + +type BatchGetImageInput = { + region: string; + assumeRole?: AssumeRoleConfig; + registryId?: string; + repositoryName: string; + imageIds: { imageTag?: string; imageDigest?: string }[]; +}; + +type BatchGetImageSender = (input: BatchGetImageInput) => Promise; + +const sendBatchGetImage: BatchGetImageSender = async ({ + region, + assumeRole, + registryId, + repositoryName, + imageIds, +}) => { + const ecr = await createEcrClient({ region, assumeRole }); + // No acceptedMediaTypes: only the single-manifest types are valid enum values, and + // we only care whether the image exists, not its manifest format. + return ecr.send(new BatchGetImageCommand({ repositoryName, registryId, imageIds })); +}; + +/** + * Pre-promotion backstop: check the deployment image actually exists in ECR. + * + * "found"/"missing" are definitive (a nonexistent repo counts as missing). + * "unknown" means we couldn't determine it - non-ECR registry, unparseable ref, or + * an API error; the caller decides what to do with each. `_send` is a test seam. + */ +export async function ecrImageExists( + { + imageReference, + imageDigest, + registryConfig, + }: { + imageReference: string; + imageDigest?: string; + registryConfig: RegistryConfig; + }, + _send: BatchGetImageSender = sendBatchGetImage +): Promise { + if (!isEcrRegistry(registryConfig.host)) { + return "unknown"; + } + + const parsed = parseEcrImageReference(imageReference, registryConfig.host); + + if (!parsed) { + logger.warn("Could not parse deployment image reference for verification", { imageReference }); + return "unknown"; + } + + const { accountId, region } = parseEcrRegistryDomain(registryConfig.host); + + // imageDigest is supplied by the CLI request body - validate before trusting it. + // Prefer it when valid (catches a tag that resolves to a different image), else + // fall back to the platform-generated tag. + const validDigest = + imageDigest && SHA256_DIGEST.test(imageDigest.trim()) ? imageDigest.trim() : undefined; + const imageId = validDigest ? { imageDigest: validDigest } : { imageTag: parsed.tag }; + + const assumeRole = registryConfig.ecrAssumeRoleArn + ? { + roleArn: registryConfig.ecrAssumeRoleArn, + externalId: registryConfig.ecrAssumeRoleExternalId, + } + : undefined; + + // Retry transient ECR failures (throttling/network) before giving up, so a blip + // doesn't fail an otherwise-fine deploy. A missing repo is definitive - don't retry. + const [error, response] = await tryCatch( + pRetry( + () => + _send({ + region, + assumeRole, + registryId: accountId, + repositoryName: parsed.repositoryName, + imageIds: [imageId], + }).catch((err) => { + if (err instanceof RepositoryNotFoundException) { + throw new AbortError(err); + } + throw err; + }), + { + retries: 2, + minTimeout: 200, + maxTimeout: 1000, + onFailedAttempt: (e) => { + logger.warn("Retrying ECR image verification", { + imageReference, + attempt: e.attemptNumber, + error: e.message, + }); + }, + } + ) + ); + + if (error) { + // A missing repo is a definitive miss, not an ambiguous error. + if (error instanceof RepositoryNotFoundException) { + return "missing"; + } + + logger.error("Failed to verify deployment image in ECR", { + imageReference, + repositoryName: parsed.repositoryName, + error: error.message, + }); + return "unknown"; + } + + return interpretBatchGetImageResponse(response); +} diff --git a/apps/webapp/test/verifyDeploymentImage.test.ts b/apps/webapp/test/verifyDeploymentImage.test.ts new file mode 100644 index 00000000000..43508141cfc --- /dev/null +++ b/apps/webapp/test/verifyDeploymentImage.test.ts @@ -0,0 +1,177 @@ +import { RepositoryNotFoundException } from "@aws-sdk/client-ecr"; +import { describe, expect, it } from "vitest"; +import { + ecrImageExists, + interpretBatchGetImageResponse, + parseEcrImageReference, +} from "~/v3/services/verifyDeploymentImage.server"; +import { type RegistryConfig } from "~/v3/registryConfig.server"; + +const ECR_HOST = "123456789012.dkr.ecr.us-east-1.amazonaws.com"; +const ecrConfig: RegistryConfig = { host: ECR_HOST, namespace: "deployments-test" }; + +describe("parseEcrImageReference", () => { + it("splits repository and tag for a ref under the configured host", () => { + const ref = `${ECR_HOST}/deployments-test/proj_abc:20240101.1.prod.a1b2c3d4`; + expect(parseEcrImageReference(ref, ECR_HOST)).toEqual({ + repositoryName: "deployments-test/proj_abc", + tag: "20240101.1.prod.a1b2c3d4", + }); + }); + + it("drops a trailing @sha256 digest", () => { + const ref = `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4@sha256:${"a".repeat(64)}`; + expect(parseEcrImageReference(ref, ECR_HOST)).toEqual({ + repositoryName: "deployments-test/proj_abc", + tag: "v1.prod.a1b2c3d4", + }); + }); + + it("returns null when the ref is not under the configured host (trust boundary)", () => { + const ref = "evil.example.com/whatever/proj_abc:v1"; + expect(parseEcrImageReference(ref, ECR_HOST)).toBeNull(); + }); + + it("returns null when there is no tag", () => { + expect(parseEcrImageReference(`${ECR_HOST}/deployments-test/proj_abc`, ECR_HOST)).toBeNull(); + }); + + it("returns null when the tag segment contains a slash", () => { + // a stray colon earlier in the path must not be treated as the tag separator + expect(parseEcrImageReference(`${ECR_HOST}/ns:weird/proj_abc`, ECR_HOST)).toBeNull(); + }); +}); + +describe("interpretBatchGetImageResponse", () => { + it("returns found when an image is present", () => { + expect(interpretBatchGetImageResponse({ images: [{}] } as any)).toBe("found"); + }); + + it("returns missing on an ImageNotFound failure", () => { + expect( + interpretBatchGetImageResponse({ failures: [{ failureCode: "ImageNotFound" }] } as any) + ).toBe("missing"); + }); + + it("returns unknown when there is neither an image nor a not-found failure", () => { + expect(interpretBatchGetImageResponse({ failures: [{ failureCode: "Other" }] } as any)).toBe( + "unknown" + ); + expect(interpretBatchGetImageResponse({} as any)).toBe("unknown"); + }); +}); + +describe("ecrImageExists", () => { + it("returns unknown for a non-ECR registry without calling the registry", async () => { + let called = false; + const result = await ecrImageExists( + { + imageReference: "registry.digitalocean.com/trigger-deployments/proj_abc:v1", + registryConfig: { host: "registry.digitalocean.com", namespace: "trigger-deployments" }, + }, + async () => { + called = true; + return {} as any; + } + ); + expect(result).toBe("unknown"); + expect(called).toBe(false); + }); + + it("returns unknown for an unparseable ECR ref without calling the registry", async () => { + let called = false; + const result = await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc`, + registryConfig: ecrConfig, + }, + async () => { + called = true; + return {} as any; + } + ); + expect(result).toBe("unknown"); + expect(called).toBe(false); + }); + + it("returns found when the image exists", async () => { + const result = await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4`, + registryConfig: ecrConfig, + }, + async () => ({ images: [{}] }) as any + ); + expect(result).toBe("found"); + }); + + it("returns missing when the registry reports ImageNotFound", async () => { + const result = await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4`, + registryConfig: ecrConfig, + }, + async () => ({ failures: [{ failureCode: "ImageNotFound" }] }) as any + ); + expect(result).toBe("missing"); + }); + + it("returns unknown when the registry call throws an ambiguous error", async () => { + const result = await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4`, + registryConfig: ecrConfig, + }, + async () => { + throw new Error("AccessDenied"); + } + ); + expect(result).toBe("unknown"); + }); + + it("returns missing when the repository does not exist", async () => { + const result = await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4`, + registryConfig: ecrConfig, + }, + async () => { + throw new RepositoryNotFoundException({ message: "not found", $metadata: {} }); + } + ); + expect(result).toBe("missing"); + }); + + it("queries by digest when a valid digest is supplied", async () => { + const digest = `sha256:${"b".repeat(64)}`; + let seen: any; + await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4`, + imageDigest: digest, + registryConfig: ecrConfig, + }, + async (input) => { + seen = input; + return { images: [{}] } as any; + } + ); + expect(seen.imageIds).toEqual([{ imageDigest: digest }]); + }); + + it("falls back to the tag when the supplied digest is malformed", async () => { + let seen: any; + await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4`, + imageDigest: "not-a-digest", + registryConfig: ecrConfig, + }, + async (input) => { + seen = input; + return { images: [{}] } as any; + } + ); + expect(seen.imageIds).toEqual([{ imageTag: "v1.prod.a1b2c3d4" }]); + }); +});