From 8261b7bb335678a029586b2238e81cb783988009 Mon Sep 17 00:00:00 2001 From: Bradley Mclain <764990+BradMclain@users.noreply.github.com> Date: Mon, 22 Jun 2026 16:16:48 +1000 Subject: [PATCH 1/3] fix(deploy): isolate per-app failures so one bad app doesn't abort the batch App upgrades run concurrently via asyncio.gather() with no exception isolation. The helm upgrade itself is guarded with suppress_errors=True, but the steps before it (cloning a custom git chart, helm dependency build) raise on failure. A custom chart pointing at a git branch that doesn't exist therefore raised an exception that escaped the per-app coroutine, propagated into the bare gather(), and cancelled every other app's deployment in the batch. Wrap _update_app_deployment in update_app_deployment with a try/except that converts any per-app exception into a failed UpdateAppResult routed through the existing post_result path (Slack alert, GitHub status, results summary). The broken app is now reported as failed while the rest of the batch deploys normally. Co-Authored-By: Claude Opus 4.8 (1M context) --- gitops_server/workers/deployer/deploy.py | 20 ++++++++++- tests/test_deploy.py | 44 ++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/gitops_server/workers/deployer/deploy.py b/gitops_server/workers/deployer/deploy.py index f473a2e..467d2b2 100644 --- a/gitops_server/workers/deployer/deploy.py +++ b/gitops_server/workers/deployer/deploy.py @@ -162,9 +162,27 @@ async def rollback_deployment(self, app: App) -> None: ) async def update_app_deployment(self, app: App) -> UpdateAppResult | None: + from_timestamp = time.time() async with self.semaphore_manager.app_semaphore(app.name): async with helm_parallel_semaphore: - return await self._update_app_deployment(app) + try: + return await self._update_app_deployment(app) + except Exception as e: + # Isolate per-app failures so one broken app doesn't abort the + # whole batch. Steps before the helm upgrade (e.g. cloning a + # custom git chart from a branch that doesn't exist, or + # `helm dependency build`) raise rather than returning a + # non-zero result, and would otherwise propagate up through + # asyncio.gather() and cancel every other app's deployment. + logger.exception(f"Failed to deploy app {app.name!r}.") + result = UpdateAppResult( + app_name=app.name, + slack_message="", + exit_code=1, + output=str(e), + ) + await self.post_result(app=app, result=result, deployer=self, from_timestamp=from_timestamp) + return result async def _update_app_deployment(self, app: App) -> UpdateAppResult | None: app.set_value("deployment.labels.gitops/deploy_id", self.deploy_id) diff --git a/tests/test_deploy.py b/tests/test_deploy.py index bde808a..dfd31a3 100644 --- a/tests/test_deploy.py +++ b/tests/test_deploy.py @@ -145,6 +145,50 @@ async def test_deployer_skip_migrations_in_commit_message_should_run_helm_withou ) assert post_mock.call_count == 2 + @patch("gitops_server.workers.deployer.deploy.run") + @patch("gitops_server.utils.slack.post") + @patch("gitops_server.workers.deployer.deploy.Deployer.post_result") + @patch("gitops_server.workers.deployer.deploy.load_app_definitions", mock_load_app_definitions) + @patch("gitops_server.workers.deployer.deploy.temp_repo") + async def test_one_app_failing_does_not_abort_the_batch( + self, temp_repo_mock, post_result_mock, post_mock, run_mock + ): + """A single app failing (e.g. cloning a custom chart from a branch that + doesn't exist) must not prevent the rest of the batch from deploying.""" + run_mock.return_value = {"exit_code": 0, "output": ""} + + class _FakeTempRepo: + async def __aenter__(self): + return "mock-repo" + + async def __aexit__(self, *args): + return False + + # SAMPLE_GITHUB_PAYLOAD deploys two apps. Make the first chart fetch blow + # up (as a non-existent git branch would), the second succeed. + calls = {"n": 0} + + def temp_repo_side_effect(*args, **kwargs): + calls["n"] += 1 + if calls["n"] == 1: + raise Exception("git clone failed: branch 'does-not-exist' not found") + return _FakeTempRepo() + + temp_repo_mock.side_effect = temp_repo_side_effect + + semaphore_manager = AppSemaphoreManager() + deployer = await Deployer.from_push_event(SAMPLE_GITHUB_PAYLOAD, semaphore_manager) + + # Must not raise even though one app errors out. + await deployer.deploy() + + # The surviving app still ran its full helm flow (dependency build + upgrade). + assert run_mock.call_count == 2 + # post_result was called for both apps: one failure, one success. + assert post_result_mock.call_count == 2 + exit_codes = sorted(call.kwargs["result"]["exit_code"] for call in post_result_mock.call_args_list) + assert exit_codes == [0, 1] + @patch("gitops_server.workers.deployer.deploy.run") @patch("gitops_server.utils.slack.post") @patch("gitops_server.workers.deployer.deploy.load_app_definitions", mock_load_app_definitions) From 34a1a51c1857353ba7c25e1e949654a29c6a498d Mon Sep 17 00:00:00 2001 From: Bradley Mclain <764990+BradMclain@users.noreply.github.com> Date: Mon, 22 Jun 2026 16:27:33 +1000 Subject: [PATCH 2/3] chore: keep uv.lock version in sync via release-please release-please's python release-type bumps version strings in pyproject.toml, __init__.py and version.py, but does not regenerate uv.lock. The lockfile embeds the project's own version in its [[package]] entry, so it drifted behind pyproject.toml after the 1.6.1 release (lock still recorded 1.6.0). Add uv.lock to the root package's extra-files with the toml updater targeting the gitops package version, so future releases bump it automatically (per googleapis/release-please#2561). Also resync the lockfile to the current 1.6.1. Co-Authored-By: Claude Opus 4.8 (1M context) --- release-please-config.json | 7 +++++++ uv.lock | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/release-please-config.json b/release-please-config.json index bf4f413..51e8993 100644 --- a/release-please-config.json +++ b/release-please-config.json @@ -7,6 +7,13 @@ "exclude-paths": [ "tests/", "charts/gitops" + ], + "extra-files": [ + { + "type": "toml", + "path": "uv.lock", + "jsonpath": "$.package[?(@.name.value=='gitops')].version" + } ] }, "charts/gitops": { diff --git a/uv.lock b/uv.lock index e483991..9e99d9d 100644 --- a/uv.lock +++ b/uv.lock @@ -295,7 +295,7 @@ wheels = [ [[package]] name = "gitops" -version = "1.6.0" +version = "1.6.1" source = { editable = "." } dependencies = [ { name = "boto3" }, From 83dbc8e217e358ea51168eb955007ac76bceb9aa Mon Sep 17 00:00:00 2001 From: Bradley Mclain <764990+BradMclain@users.noreply.github.com> Date: Tue, 23 Jun 2026 10:41:35 +1000 Subject: [PATCH 3/3] fix(deploy): make post_result fail-proof so reporting errors don't abort the batch Addresses PR review: post_result makes Slack/GitHub network calls and runs inside the asyncio.gather() in deploy(). It is invoked from call sites not covered by update_app_deployment's guard (its own except handler and uninstall_app), so a reporting failure could still cancel every other app's deployment. Wrap post_result's body in try/except that logs and swallows. The success/failure bookkeeping (successful_apps/failed_apps) is recorded before the network call, so it survives a notification failure. Co-Authored-By: Claude Opus 4.8 (1M context) --- gitops_server/workers/deployer/deploy.py | 30 +++++++++++++++--------- tests/test_deploy.py | 29 +++++++++++++++++++++++ 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/gitops_server/workers/deployer/deploy.py b/gitops_server/workers/deployer/deploy.py index 467d2b2..14e968f 100644 --- a/gitops_server/workers/deployer/deploy.py +++ b/gitops_server/workers/deployer/deploy.py @@ -293,17 +293,25 @@ def calculate_app_deltas(self) -> tuple[set[str], set[str], set[str]]: @tracer.start_as_current_span("post_result") async def post_result(self, app: App, result: UpdateAppResult, deployer: "Deployer", **kwargs: Any) -> None: - if result["exit_code"] != 0: - self.failed_apps.add(app.name) - deploy_result = await handle_failed_deploy(app, result, deployer, **kwargs) - message = ( - deploy_result["slack_message"] - or f"Failed to deploy app `{result['app_name']}` for cluster `{settings.CLUSTER_NAME}`:\n>>>{result['output']}" - ) - await slack.post_alert(message) - else: - self.successful_apps.add(app.name) - await handle_successful_deploy(app, result, deployer) + # Reporting a result makes Slack/GitHub network calls and runs inside the + # asyncio.gather() in deploy(). A failure here (e.g. a Slack or GitHub API + # error) must not propagate, or it would cancel every other app's + # deployment in the batch. The success/failure bookkeeping is recorded + # before any network call so it survives a notification failure. + try: + if result["exit_code"] != 0: + self.failed_apps.add(app.name) + deploy_result = await handle_failed_deploy(app, result, deployer, **kwargs) + message = ( + deploy_result["slack_message"] + or f"Failed to deploy app `{result['app_name']}` for cluster `{settings.CLUSTER_NAME}`:\n>>>{result['output']}" + ) + await slack.post_alert(message) + else: + self.successful_apps.add(app.name) + await handle_successful_deploy(app, result, deployer) + except Exception: + logger.exception(f"Failed to post deploy result for app {app.name!r}.") # TODO: Fix this in the future. We want to update the message with the latest status. # But we can't because of the messge length (when messaging > 4000 characters) diff --git a/tests/test_deploy.py b/tests/test_deploy.py index dfd31a3..2e73261 100644 --- a/tests/test_deploy.py +++ b/tests/test_deploy.py @@ -189,6 +189,35 @@ def temp_repo_side_effect(*args, **kwargs): exit_codes = sorted(call.kwargs["result"]["exit_code"] for call in post_result_mock.call_args_list) assert exit_codes == [0, 1] + @patch("gitops_server.workers.deployer.deploy.handle_failed_deploy") + @patch("gitops_server.workers.deployer.deploy.handle_successful_deploy") + @patch("gitops_server.workers.deployer.deploy.load_app_definitions", mock_load_app_definitions) + async def test_post_result_swallows_reporting_failures(self, handle_success_mock, handle_failed_mock): + """Reporting a result makes Slack/GitHub network calls inside the gather() + in deploy(); a failure there must never propagate, or it would abort the + rest of the batch. post_result must swallow such errors itself, since it + is also invoked from places not wrapped by update_app_deployment's guard + (its own except handler and uninstall_app).""" + handle_success_mock.side_effect = Exception("slack/github exploded") + handle_failed_mock.side_effect = Exception("slack/github exploded") + + semaphore_manager = AppSemaphoreManager() + deployer = await Deployer.from_push_event(SAMPLE_GITHUB_PAYLOAD, semaphore_manager) + app = App("some-app", path=create_test_yaml()) + + # Neither the success nor the failure reporting path may raise... + await deployer.post_result( + app, {"app_name": app.name, "slack_message": "", "exit_code": 0, "output": ""}, deployer + ) + await deployer.post_result( + app, {"app_name": app.name, "slack_message": "", "exit_code": 1, "output": "boom"}, deployer + ) + + # ...but the success/failure bookkeeping is still recorded (it happens + # before the network call that blows up). + assert "some-app" in deployer.successful_apps + assert "some-app" in deployer.failed_apps + @patch("gitops_server.workers.deployer.deploy.run") @patch("gitops_server.utils.slack.post") @patch("gitops_server.workers.deployer.deploy.load_app_definitions", mock_load_app_definitions)