Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions applications/Unity.GrantManager/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,27 @@ services:
networks:
- common-network

  # Local-only Prometheus instance for the docker-compose stack.
  # Scrapes the app per ./scripts/prometheus/prometheus.yml and evaluates
  # ./scripts/prometheus/alert-rules.yml. In OpenShift this is replaced by
  # the cluster Prometheus Operator (see scripts/openshift/*.yaml).
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      # Both files are mounted read-only; edit them on the host and restart.
      - ./scripts/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./scripts/prometheus/alert-rules.yml:/etc/prometheus/alert-rules.yml:ro
    depends_on:
      - unity-grantmanager-web
    networks:
      - common-network

  # Local-only Alertmanager; routes firing alerts to the app's webhook
  # endpoint as configured in ./scripts/prometheus/alertmanager.yml.
  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./scripts/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    networks:
      - common-network
Comment on lines +149 to +168

volumes:
postgres_data:
redis_volume_data:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# PrometheusRule CRD — loaded by the OpenShift cluster Prometheus Operator
# Deploy with: oc apply -f scripts/openshift/prometheus-rule.yaml -n d18498-<env>
#
# Replaces: scripts/prometheus/alert-rules.yml (docker-compose local only)
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: unity-grantmanager-exceptions
  labels:
    # These labels must match the Prometheus Operator's ruleSelector in your namespace.
    # On BC Gov Silver cluster the label below is standard.
    role: alert-rules
spec:
  groups:
    - name: unity-grantmanager-exceptions
      rules:
        # Fire if any exception type exceeds 5 occurrences in a 5-minute window
        - alert: HighExceptionRate
          expr: |
            increase(application_exceptions_total[5m]) > 5
          for: 1m  # condition must hold for 1 minute before the alert fires
          labels:
            severity: critical
          annotations:
            summary: "High exception rate in Unity GrantManager"
            description: >
              Exception type {{ $labels.type }} has fired {{ $value | humanize }} times
              in the last 5 minutes (namespace: {{ $labels.namespace }}).

        # Fire if a new exception type appears (catches regressions after deploys)
        # i.e. the series increased in the last 10m but not in the 10m before that.
        - alert: NewExceptionType
          expr: |
            increase(application_exceptions_total[10m]) > 0
            unless (
              increase(application_exceptions_total[10m] offset 10m) > 0
            )
          for: 0m  # fire immediately once the expression matches
          labels:
            severity: warning
          annotations:
            summary: "New exception type detected in Unity GrantManager"
            description: >
              A new exception type {{ $labels.type }} appeared for the first time
              in the last 10 minutes (namespace: {{ $labels.namespace }}).
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# ServiceMonitor CRD — tells the Prometheus Operator how to scrape /metrics from the app
# Deploy with: oc apply -f scripts/openshift/service-monitor.yaml -n d18498-<env>
#
# Replaces: scrape_configs in scripts/prometheus/prometheus.yml (docker-compose local only)
#
# Prerequisites:
#   The app Service must exist and expose port 8080 (or 80).
#   Adjust 'port' below to match your Service's named port.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: unity-grantmanager
  labels:
    app: unity-grantmanager
spec:
  selector:
    matchLabels:
      app: unity-grantmanager # must match labels on your OpenShift Service
  endpoints:
    - port: http # named port on the Service pointing to 8080
      path: /metrics
      interval: 15s # matches the 15s scrape_interval used by the local stack
      scheme: http
29 changes: 29 additions & 0 deletions applications/Unity.GrantManager/scripts/prometheus/alert-rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Alert rules for the local docker-compose Prometheus only.
# The OpenShift equivalent lives in scripts/openshift/prometheus-rule.yaml.
groups:
  - name: unity-grantmanager-exceptions
    rules:
      # Fire if any exception type exceeds 5 occurrences in a 5-minute window
      - alert: HighExceptionRate
        expr: |
          increase(application_exceptions_total[5m]) > 5
        for: 1m  # condition must hold for 1 minute before the alert fires
        labels:
          severity: critical
        annotations:
          summary: "High exception rate in Unity GrantManager"
          description: >
            Exception type {{ $labels.type }} has fired {{ $value | humanize }} times
            in the last 5 minutes (job: {{ $labels.job }}, instance: {{ $labels.instance }}).

      # Fire if a new exception type appears (catches regressions after deploys)
      # i.e. the series increased in the last 10m but not in the 10m before that.
      - alert: NewExceptionType
        expr: |
          increase(application_exceptions_total[10m]) > 0
          unless (
            increase(application_exceptions_total[10m] offset 10m) > 0
          )
        for: 0m  # fire immediately once the expression matches
        labels:
          severity: warning
        annotations:
          summary: "New exception type detected in Unity GrantManager"
          description: >
            A new exception type {{ $labels.type }} appeared for the first time
            in the last 10 minutes.
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Alertmanager configuration for the local docker-compose stack.
# Routes every alert to the app's /api/monitoring/alert webhook, which
# forwards a summary to Teams (see AlertWebhookController).
global:
  resolve_timeout: 5m

route:
  # Group notifications by alert name and exception type label.
  group_by: ["alertname", "type"]
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 1h
  receiver: unity-webhook

receivers:
  - name: unity-webhook
    webhook_configs:
      - url: "http://unity-grantmanager-web:8080/api/monitoring/alert"
        send_resolved: false # do not notify when alerts resolve
17 changes: 17 additions & 0 deletions applications/Unity.GrantManager/scripts/prometheus/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Prometheus configuration for the local docker-compose stack only.
# In OpenShift, scraping is configured by scripts/openshift/service-monitor.yaml
# and rules by scripts/openshift/prometheus-rule.yaml instead.
global:
  scrape_interval: 15s     # how often targets are scraped
  evaluation_interval: 15s # how often alert rules are evaluated

alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"] # docker-compose service name

rule_files:
  - /etc/prometheus/alert-rules.yml # mounted by docker-compose

scrape_configs:
  - job_name: unity-grantmanager
    static_configs:
      - targets: ["unity-grantmanager-web:8080"]
    metrics_path: /metrics
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
using System;
using System.Collections.Generic;
using System.Text.Json.Serialization;

namespace Unity.GrantManager.Web.Controllers.Monitoring;

/// <summary>
/// Root payload POSTed by Prometheus Alertmanager to webhook receivers.
/// Property names mirror the Alertmanager webhook JSON schema.
/// </summary>
public class AlertManagerPayload
{
    private List<AlertItem> _alertItems = [];

    /// <summary>Name of the Alertmanager receiver that routed this notification.</summary>
    [JsonPropertyName("receiver")]
    public string Receiver { get; set; } = string.Empty;

    /// <summary>Aggregate status of the alert group (e.g. "firing").</summary>
    [JsonPropertyName("status")]
    public string Status { get; set; } = string.Empty;

    /// <summary>
    /// Alerts contained in this notification. Never null: assigning null
    /// stores an empty list so callers can enumerate without guarding.
    /// </summary>
    [JsonPropertyName("alerts")]
    public List<AlertItem> Alerts
    {
        get { return _alertItems; }
        set { _alertItems = value ?? []; }
    }
}

/// <summary>
/// A single alert within an Alertmanager webhook payload.
/// Property names mirror the Alertmanager webhook JSON schema.
/// </summary>
public class AlertItem
{
    private Dictionary<string, string> _labelMap = [];
    private Dictionary<string, string> _annotationMap = [];

    /// <summary>Status of this individual alert (e.g. "firing" or "resolved").</summary>
    [JsonPropertyName("status")]
    public string Status { get; set; } = string.Empty;

    /// <summary>
    /// Prometheus labels (alertname, severity, …). Never null: assigning null
    /// stores an empty dictionary so lookups never need a null guard.
    /// </summary>
    [JsonPropertyName("labels")]
    public Dictionary<string, string> Labels
    {
        get { return _labelMap; }
        set { _labelMap = value ?? []; }
    }

    /// <summary>
    /// Rule annotations (summary, description, …). Never null: assigning null
    /// stores an empty dictionary so lookups never need a null guard.
    /// </summary>
    [JsonPropertyName("annotations")]
    public Dictionary<string, string> Annotations
    {
        get { return _annotationMap; }
        set { _annotationMap = value ?? []; }
    }

    /// <summary>Timestamp at which the alert started firing.</summary>
    [JsonPropertyName("startsAt")]
    public DateTimeOffset StartsAt { get; set; }

    /// <summary>URL of the Prometheus expression that generated the alert.</summary>
    [JsonPropertyName("generatorURL")]
    public string GeneratorURL { get; set; } = string.Empty;

    /// <summary>Stable hash identifying this alert instance in Alertmanager.</summary>
    [JsonPropertyName("fingerprint")]
    public string Fingerprint { get; set; } = string.Empty;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Extensions.Logging;
using Unity.GrantManager.Notifications;
using Unity.Notifications.TeamsNotifications;
using Volo.Abp.AspNetCore.Mvc;

namespace Unity.GrantManager.Web.Controllers.Monitoring;

[ApiController]
[Route("api/monitoring")]
// NOTE(review): this endpoint is intentionally unauthenticated so the local
// Alertmanager container can POST to it — confirm it is reachable only from
// the internal network (or add a shared-secret header check) before exposing
// it in any routed environment.
[AllowAnonymous]
[IgnoreAntiforgeryToken]
public class AlertWebhookController(
    INotificationsAppService notificationsAppService,
    ILogger<AlertWebhookController> logger) : AbpController
{
    /// <summary>
    /// Receives Alertmanager webhook payloads and forwards a concise summary to Teams.
    /// Returns 400 for a null/empty payload, 200 on success (or when nothing is
    /// firing), and 500 when forwarding to Teams fails.
    /// </summary>
    /// <param name="payload">Deserialized Alertmanager webhook body.</param>
    [HttpPost("alert")]
    public async Task<IActionResult> ProcessAlert([FromBody] AlertManagerPayload? payload)
    {
        // Defensive re-check: [ApiController] already rejects invalid model state
        // with an automatic 400, but a null body or empty alert list still lands here.
        if (payload is null || !ModelState.IsValid || payload.Alerts.Count == 0)
        {
            return BadRequest();
        }

        try
        {
            // Only "firing" alerts are forwarded; "resolved" notifications are
            // acknowledged with 200 and dropped silently.
            var firing = payload.Alerts
                .Where(a => a is not null && string.Equals(a.Status, "firing", StringComparison.OrdinalIgnoreCase))
                .ToList();

            if (firing.Count == 0)
            {
                return Ok();
            }

            // Pick the most severe alert as the headline (critical > error > warning > info > unknown)
            var lead = firing
                .OrderBy(a => SeverityOrder(a.Labels.GetValueOrDefault("severity", "unknown")))
                .First();

            string alertName = lead.Labels.GetValueOrDefault("alertname", "Unknown Alert");
            string severity = lead.Labels.GetValueOrDefault("severity", "unknown");
            string summary = lead.Annotations.GetValueOrDefault("summary", alertName);
            string description = lead.Annotations.GetValueOrDefault("description", string.Empty);
            // Cluster-managed Prometheus presumably labels the namespace as
            // "kubernetes_namespace_name"; fall back to plain "namespace" as used
            // by the local rules — TODO confirm against the OpenShift alert labels.
            string @namespace = lead.Labels.GetValueOrDefault("kubernetes_namespace_name",
                lead.Labels.GetValueOrDefault("namespace", string.Empty));
            string endpoint = lead.Labels.GetValueOrDefault("handler",
                lead.Labels.GetValueOrDefault("endpoint", string.Empty));
            // May be null outside ASP.NET hosting; the subtitle then shows an empty value.
            string? envInfo = Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT");

            string activityTitle = $"[{severity.ToUpperInvariant()}] {summary}";
            string activitySubtitle = $"Environment: {envInfo} | Namespace: {@namespace}";

            // Build the Teams card facts; optional entries are skipped when absent.
            var facts = new List<Fact>();

            if (!string.IsNullOrEmpty(description))
            {
                facts.Add(new Fact { Name = "Description", Value = description });
            }

            if (firing.Count > 1)
            {
                facts.Add(new Fact { Name = "Firing alerts", Value = firing.Count.ToString() });
            }

            if (!string.IsNullOrEmpty(endpoint))
            {
                facts.Add(new Fact { Name = "Affected endpoint", Value = endpoint });
            }

            // "u" = universal sortable format, e.g. "2024-01-31 12:00:00Z".
            facts.Add(new Fact { Name = "First seen", Value = lead.StartsAt.ToString("u") });

            if (!string.IsNullOrEmpty(lead.GeneratorURL))
            {
                facts.Add(new Fact { Name = "Source", Value = lead.GeneratorURL });
            }

            await notificationsAppService.PostToTeamsAsync(activityTitle, activitySubtitle, facts);

            return Ok();
        }
        catch (Exception ex)
        {
            // Log the first alert's name for correlation; return 500 so
            // Alertmanager will retry delivery.
            logger.LogError(ex, "Failed to forward alert {AlertName} to Teams",
                payload.Alerts.FirstOrDefault()?.Labels?.GetValueOrDefault("alertname"));
            return StatusCode(500);
        }
    }

    /// <summary>
    /// Maps a severity label to a sort key; lower values sort first (more severe).
    /// Unrecognized or null severities sort last.
    /// </summary>
    private static int SeverityOrder(string? severity) => severity?.ToLowerInvariant() switch
    {
        "critical" => 0,
        "error" => 1,
        "warning" => 2,
        "info" => 3,
        _ => 4
    };
}
Loading
Loading