diff --git a/gitops_server/workers/deployer/deploy.py b/gitops_server/workers/deployer/deploy.py index f473a2e..14e968f 100644 --- a/gitops_server/workers/deployer/deploy.py +++ b/gitops_server/workers/deployer/deploy.py @@ -162,9 +162,27 @@ async def rollback_deployment(self, app: App) -> None: ) async def update_app_deployment(self, app: App) -> UpdateAppResult | None: + from_timestamp = time.time() async with self.semaphore_manager.app_semaphore(app.name): async with helm_parallel_semaphore: - return await self._update_app_deployment(app) + try: + return await self._update_app_deployment(app) + except Exception as e: + # Isolate per-app failures so one broken app doesn't abort the + # whole batch. Steps before the helm upgrade (e.g. cloning a + # custom git chart from a branch that doesn't exist, or + # `helm dependency build`) raise rather than returning a + # non-zero result, and would otherwise propagate up through + # asyncio.gather() and cancel every other app's deployment. + logger.exception(f"Failed to deploy app {app.name!r}.") + result = UpdateAppResult( + app_name=app.name, + slack_message="", + exit_code=1, + output=str(e), + ) + await self.post_result(app=app, result=result, deployer=self, from_timestamp=from_timestamp) + return result async def _update_app_deployment(self, app: App) -> UpdateAppResult | None: app.set_value("deployment.labels.gitops/deploy_id", self.deploy_id) @@ -275,17 +293,25 @@ def calculate_app_deltas(self) -> tuple[set[str], set[str], set[str]]: @tracer.start_as_current_span("post_result") async def post_result(self, app: App, result: UpdateAppResult, deployer: "Deployer", **kwargs: Any) -> None: - if result["exit_code"] != 0: - self.failed_apps.add(app.name) - deploy_result = await handle_failed_deploy(app, result, deployer, **kwargs) - message = ( - deploy_result["slack_message"] - or f"Failed to deploy app `{result['app_name']}` for cluster `{settings.CLUSTER_NAME}`:\n>>>{result['output']}" - ) - await slack.post_alert(message) - else: - self.successful_apps.add(app.name) - await handle_successful_deploy(app, result, deployer) + # Reporting a result makes Slack/GitHub network calls and runs inside the + # asyncio.gather() in deploy(). A failure here (e.g. a Slack or GitHub API + # error) must not propagate, or it would cancel every other app's + # deployment in the batch. The success/failure bookkeeping is recorded + # before any network call so it survives a notification failure. + try: + if result["exit_code"] != 0: + self.failed_apps.add(app.name) + deploy_result = await handle_failed_deploy(app, result, deployer, **kwargs) + message = ( + deploy_result["slack_message"] + or f"Failed to deploy app `{result['app_name']}` for cluster `{settings.CLUSTER_NAME}`:\n>>>{result['output']}" + ) + await slack.post_alert(message) + else: + self.successful_apps.add(app.name) + await handle_successful_deploy(app, result, deployer) + except Exception: + logger.exception(f"Failed to post deploy result for app {app.name!r}.") # TODO: Fix this in the future. We want to update the message with the latest status. # But we can't because of the messge length (when messaging > 4000 characters) diff --git a/release-please-config.json b/release-please-config.json index bf4f413..51e8993 100644 --- a/release-please-config.json +++ b/release-please-config.json @@ -7,6 +7,13 @@ "exclude-paths": [ "tests/", "charts/gitops" + ], + "extra-files": [ + { + "type": "toml", + "path": "uv.lock", + "jsonpath": "$.package[?(@.name.value=='gitops')].version" + } ] }, "charts/gitops": { diff --git a/tests/test_deploy.py b/tests/test_deploy.py index bde808a..2e73261 100644 --- a/tests/test_deploy.py +++ b/tests/test_deploy.py @@ -145,6 +145,79 @@ async def test_deployer_skip_migrations_in_commit_message_should_run_helm_withou ) assert post_mock.call_count == 2 + @patch("gitops_server.workers.deployer.deploy.run") + @patch("gitops_server.utils.slack.post") + @patch("gitops_server.workers.deployer.deploy.Deployer.post_result") + @patch("gitops_server.workers.deployer.deploy.load_app_definitions", mock_load_app_definitions) + @patch("gitops_server.workers.deployer.deploy.temp_repo") + async def test_one_app_failing_does_not_abort_the_batch( + self, temp_repo_mock, post_result_mock, post_mock, run_mock + ): + """A single app failing (e.g. cloning a custom chart from a branch that + doesn't exist) must not prevent the rest of the batch from deploying.""" + run_mock.return_value = {"exit_code": 0, "output": ""} + + class _FakeTempRepo: + async def __aenter__(self): + return "mock-repo" + + async def __aexit__(self, *args): + return False + + # SAMPLE_GITHUB_PAYLOAD deploys two apps. Make the first chart fetch blow + # up (as a non-existent git branch would), the second succeed. + calls = {"n": 0} + + def temp_repo_side_effect(*args, **kwargs): + calls["n"] += 1 + if calls["n"] == 1: + raise Exception("git clone failed: branch 'does-not-exist' not found") + return _FakeTempRepo() + + temp_repo_mock.side_effect = temp_repo_side_effect + + semaphore_manager = AppSemaphoreManager() + deployer = await Deployer.from_push_event(SAMPLE_GITHUB_PAYLOAD, semaphore_manager) + + # Must not raise even though one app errors out. + await deployer.deploy() + + # The surviving app still ran its full helm flow (dependency build + upgrade). + assert run_mock.call_count == 2 + # post_result was called for both apps: one failure, one success. + assert post_result_mock.call_count == 2 + exit_codes = sorted(call.kwargs["result"]["exit_code"] for call in post_result_mock.call_args_list) + assert exit_codes == [0, 1] + + @patch("gitops_server.workers.deployer.deploy.handle_failed_deploy") + @patch("gitops_server.workers.deployer.deploy.handle_successful_deploy") + @patch("gitops_server.workers.deployer.deploy.load_app_definitions", mock_load_app_definitions) + async def test_post_result_swallows_reporting_failures(self, handle_success_mock, handle_failed_mock): + """Reporting a result makes Slack/GitHub network calls inside the gather() + in deploy(); a failure there must never propagate, or it would abort the + rest of the batch. post_result must swallow such errors itself, since it + is also invoked from places not wrapped by update_app_deployment's guard + (its own except handler and uninstall_app).""" + handle_success_mock.side_effect = Exception("slack/github exploded") + handle_failed_mock.side_effect = Exception("slack/github exploded") + + semaphore_manager = AppSemaphoreManager() + deployer = await Deployer.from_push_event(SAMPLE_GITHUB_PAYLOAD, semaphore_manager) + app = App("some-app", path=create_test_yaml()) + + # Neither the success nor the failure reporting path may raise... + await deployer.post_result( + app, {"app_name": app.name, "slack_message": "", "exit_code": 0, "output": ""}, deployer + ) + await deployer.post_result( + app, {"app_name": app.name, "slack_message": "", "exit_code": 1, "output": "boom"}, deployer + ) + + # ...but the success/failure bookkeeping is still recorded (it happens + # before the network call that blows up). + assert "some-app" in deployer.successful_apps + assert "some-app" in deployer.failed_apps + @patch("gitops_server.workers.deployer.deploy.run") @patch("gitops_server.utils.slack.post") @patch("gitops_server.workers.deployer.deploy.load_app_definitions", mock_load_app_definitions) diff --git a/uv.lock b/uv.lock index e483991..9e99d9d 100644 --- a/uv.lock +++ b/uv.lock @@ -295,7 +295,7 @@ wheels = [ [[package]] name = "gitops" -version = "1.6.0" +version = "1.6.1" source = { editable = "." } dependencies = [ { name = "boto3" },