Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 38 additions & 12 deletions gitops_server/workers/deployer/deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,27 @@ async def rollback_deployment(self, app: App) -> None:
)

async def update_app_deployment(self, app: App) -> UpdateAppResult | None:
from_timestamp = time.time()
async with self.semaphore_manager.app_semaphore(app.name):
async with helm_parallel_semaphore:
return await self._update_app_deployment(app)
try:
return await self._update_app_deployment(app)
except Exception as e:
# Isolate per-app failures so one broken app doesn't abort the
# whole batch. Steps before the helm upgrade (e.g. cloning a
# custom git chart from a branch that doesn't exist, or
# `helm dependency build`) raise rather than returning a
# non-zero result, and would otherwise propagate up through
# asyncio.gather() and cancel every other app's deployment.
logger.exception(f"Failed to deploy app {app.name!r}.")
result = UpdateAppResult(
app_name=app.name,
slack_message="",
exit_code=1,
output=str(e),
)
await self.post_result(app=app, result=result, deployer=self, from_timestamp=from_timestamp)
Comment thread
BradMclain marked this conversation as resolved.
return result

async def _update_app_deployment(self, app: App) -> UpdateAppResult | None:
app.set_value("deployment.labels.gitops/deploy_id", self.deploy_id)
Expand Down Expand Up @@ -275,17 +293,25 @@ def calculate_app_deltas(self) -> tuple[set[str], set[str], set[str]]:

@tracer.start_as_current_span("post_result")
async def post_result(self, app: App, result: UpdateAppResult, deployer: "Deployer", **kwargs: Any) -> None:
if result["exit_code"] != 0:
self.failed_apps.add(app.name)
deploy_result = await handle_failed_deploy(app, result, deployer, **kwargs)
message = (
deploy_result["slack_message"]
or f"Failed to deploy app `{result['app_name']}` for cluster `{settings.CLUSTER_NAME}`:\n>>>{result['output']}"
)
await slack.post_alert(message)
else:
self.successful_apps.add(app.name)
await handle_successful_deploy(app, result, deployer)
# Reporting a result makes Slack/GitHub network calls and runs inside the
# asyncio.gather() in deploy(). A failure here (e.g. a Slack or GitHub API
# error) must not propagate, or it would cancel every other app's
# deployment in the batch. The success/failure bookkeeping is recorded
# before any network call so it survives a notification failure.
try:
if result["exit_code"] != 0:
self.failed_apps.add(app.name)
deploy_result = await handle_failed_deploy(app, result, deployer, **kwargs)
message = (
deploy_result["slack_message"]
or f"Failed to deploy app `{result['app_name']}` for cluster `{settings.CLUSTER_NAME}`:\n>>>{result['output']}"
)
await slack.post_alert(message)
else:
self.successful_apps.add(app.name)
await handle_successful_deploy(app, result, deployer)
except Exception:
logger.exception(f"Failed to post deploy result for app {app.name!r}.")

# TODO: Fix this in the future. We want to update the message with the latest status.
# But we can't because of the messge length (when messaging > 4000 characters)
Expand Down
7 changes: 7 additions & 0 deletions release-please-config.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@
"exclude-paths": [
"tests/",
"charts/gitops"
],
"extra-files": [
{
"type": "toml",
"path": "uv.lock",
"jsonpath": "$.package[?(@.name.value=='gitops')].version"
}
]
},
"charts/gitops": {
Expand Down
73 changes: 73 additions & 0 deletions tests/test_deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,79 @@ async def test_deployer_skip_migrations_in_commit_message_should_run_helm_withou
)
assert post_mock.call_count == 2

@patch("gitops_server.workers.deployer.deploy.run")
@patch("gitops_server.utils.slack.post")
@patch("gitops_server.workers.deployer.deploy.Deployer.post_result")
@patch("gitops_server.workers.deployer.deploy.load_app_definitions", mock_load_app_definitions)
@patch("gitops_server.workers.deployer.deploy.temp_repo")
async def test_one_app_failing_does_not_abort_the_batch(
self, temp_repo_mock, post_result_mock, post_mock, run_mock
):
"""A single app failing (e.g. cloning a custom chart from a branch that
doesn't exist) must not prevent the rest of the batch from deploying."""
run_mock.return_value = {"exit_code": 0, "output": ""}

class _FakeTempRepo:
async def __aenter__(self):
return "mock-repo"

async def __aexit__(self, *args):
return False

# SAMPLE_GITHUB_PAYLOAD deploys two apps. Make the first chart fetch blow
# up (as a non-existent git branch would), the second succeed.
calls = {"n": 0}

def temp_repo_side_effect(*args, **kwargs):
calls["n"] += 1
if calls["n"] == 1:
raise Exception("git clone failed: branch 'does-not-exist' not found")
return _FakeTempRepo()

temp_repo_mock.side_effect = temp_repo_side_effect

semaphore_manager = AppSemaphoreManager()
deployer = await Deployer.from_push_event(SAMPLE_GITHUB_PAYLOAD, semaphore_manager)

# Must not raise even though one app errors out.
await deployer.deploy()

# The surviving app still ran its full helm flow (dependency build + upgrade).
assert run_mock.call_count == 2
# post_result was called for both apps: one failure, one success.
assert post_result_mock.call_count == 2
exit_codes = sorted(call.kwargs["result"]["exit_code"] for call in post_result_mock.call_args_list)
assert exit_codes == [0, 1]

@patch("gitops_server.workers.deployer.deploy.handle_failed_deploy")
@patch("gitops_server.workers.deployer.deploy.handle_successful_deploy")
@patch("gitops_server.workers.deployer.deploy.load_app_definitions", mock_load_app_definitions)
async def test_post_result_swallows_reporting_failures(self, handle_success_mock, handle_failed_mock):
"""Reporting a result makes Slack/GitHub network calls inside the gather()
in deploy(); a failure there must never propagate, or it would abort the
rest of the batch. post_result must swallow such errors itself, since it
is also invoked from places not wrapped by update_app_deployment's guard
(its own except handler and uninstall_app)."""
handle_success_mock.side_effect = Exception("slack/github exploded")
handle_failed_mock.side_effect = Exception("slack/github exploded")

semaphore_manager = AppSemaphoreManager()
deployer = await Deployer.from_push_event(SAMPLE_GITHUB_PAYLOAD, semaphore_manager)
app = App("some-app", path=create_test_yaml())

# Neither the success nor the failure reporting path may raise...
await deployer.post_result(
app, {"app_name": app.name, "slack_message": "", "exit_code": 0, "output": ""}, deployer
)
await deployer.post_result(
app, {"app_name": app.name, "slack_message": "", "exit_code": 1, "output": "boom"}, deployer
)

# ...but the success/failure bookkeeping is still recorded (it happens
# before the network call that blows up).
assert "some-app" in deployer.successful_apps
assert "some-app" in deployer.failed_apps

@patch("gitops_server.workers.deployer.deploy.run")
@patch("gitops_server.utils.slack.post")
@patch("gitops_server.workers.deployer.deploy.load_app_definitions", mock_load_app_definitions)
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading