diff --git a/.dockerignore b/.dockerignore index 01cdf2c..0d871aa 100644 --- a/.dockerignore +++ b/.dockerignore @@ -12,3 +12,6 @@ # Re-include template files for go:embed !**/*.gotmpl + +# Re-include yaml for default mapping config +!**/*.yaml diff --git a/Dockerfile b/Dockerfile index 52af1e2..2bbc285 100644 --- a/Dockerfile +++ b/Dockerfile @@ -51,7 +51,7 @@ COPY --from=agent-builder /workspace/switch-agent-server . COPY --from=agent-builder /workspace/switch-agent-client . # Expose the service ports -EXPOSE 50051 50051 +EXPOSE 50051 9100 ENTRYPOINT ["/switch-agent-server"] diff --git a/agent b/agent new file mode 100755 index 0000000..9286f30 Binary files /dev/null and b/agent differ diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 35375a7..d859c86 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1,6 +1,6 @@ //go:build !ignore_autogenerated -// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors // SPDX-License-Identifier: Apache-2.0 // Code generated by controller-gen. DO NOT EDIT. diff --git a/docs/.vitepress/config.mts b/docs/.vitepress/config.mts index 600467a..30caeb6 100644 --- a/docs/.vitepress/config.mts +++ b/docs/.vitepress/config.mts @@ -76,6 +76,7 @@ export default withMermaid({ { text: 'Getting started', link: '/usage/getting-started' }, { text: 'Provisioning', link: '/usage/provisioning' }, { text: 'Agent', link: '/usage/agent' }, + { text: 'Agent Metrics', link: '/usage/metrics' }, ] }, { diff --git a/docs/usage/agent.md b/docs/usage/agent.md index 4bc6901..bbdb78b 100644 --- a/docs/usage/agent.md +++ b/docs/usage/agent.md @@ -15,3 +15,6 @@ The switch agent runs on the switch and exposes device and interface operations ## Notes The current implementation uses SONiC Redis as the data source for switch state. 
+ +## Metrics +The agent exposes Prometheus metrics on port 9100. See [Agent metrics](./metrics.md) for the full metric reference and configuration schema. diff --git a/docs/usage/metrics.md b/docs/usage/metrics.md new file mode 100644 index 0000000..ef43e81 --- /dev/null +++ b/docs/usage/metrics.md @@ -0,0 +1,274 @@ +# Agent metrics + +The switch agent exposes a Prometheus-compatible `/metrics` endpoint for monitoring switch health, interface state, and transceiver optics. Metrics are collected just-in-time from SONiC Redis on every Prometheus scrape — there is no background polling or caching. + +## Endpoints + +| Path | Description | +|------|-------------| +| `/metrics` | Prometheus metrics | +| `/healthz` | Health check — returns `200 OK` if Redis is reachable, `500` otherwise | + +## Configuration + +The agent accepts two flags for metrics: + +| Flag | Default | Description | +|------|---------|-------------| +| `-metrics-port` | `9100` | HTTP port for the metrics server | +| `-metrics-config` | _(empty)_ | Path to a custom metrics mapping YAML. When empty, the built-in default config is used | + +## Metric types + +Metrics come from two sources: + +### Built-in collectors + +These require custom logic (cross-database joins, aggregate counting, error fallbacks) and are always registered. + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `sonic_switch_info` | gauge | `mac`, `firmware`, `hwsku`, `asic`, `platform` | Device metadata, always 1. 
Firmware and ASIC fall back to `/etc/sonic/sonic_version.yml` when absent from Redis | +| `sonic_switch_ready` | gauge | — | 1 if the switch is ready, 0 otherwise | +| `sonic_switch_interface_oper_state` | gauge | `interface` | Operational state (1=up, 0=down) | +| `sonic_switch_interface_admin_state` | gauge | `interface` | Admin state (1=up, 0=down) | +| `sonic_switch_interfaces_total` | gauge | `operational_status` | Number of interfaces by status | +| `sonic_switch_ports_total` | gauge | — | Total physical ports | +| `sonic_scrape_duration_seconds` | gauge | — | Duration of the last metrics scrape | + +### Config-driven collectors + +These are defined in YAML and can be customized or extended by operators. The default config ships the following metrics: + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `sonic_switch_transceiver_dom_temperature_celsius` | gauge | `interface` | Transceiver temperature | +| `sonic_switch_transceiver_dom_voltage_volts` | gauge | `interface` | Transceiver supply voltage | +| `sonic_switch_transceiver_dom_rx_power_dbm` | gauge | `interface`, `lane` | RX power per lane | +| `sonic_switch_transceiver_dom_tx_bias_milliamps` | gauge | `interface`, `lane` | TX bias current per lane | +| `sonic_switch_transceiver_dom_threshold` | gauge | `interface`, `sensor`, `level`, `direction` | DOM threshold values | +| `sonic_switch_transceiver_info` | gauge | `interface`, `type`, `vendor`, `model`, `serial` | Transceiver metadata, always 1 | +| `sonic_switch_transceiver_rxlos` | gauge | `interface`, `lane` | RX loss of signal per lane (1=loss, 0=ok) | +| `sonic_switch_transceiver_txfault` | gauge | `interface`, `lane` | TX fault per lane (1=fault, 0=ok) | +| `sonic_switch_interface_neighbor_info` | gauge | `interface`, `neighbor_mac`, `neighbor_name`, `neighbor_port` | LLDP neighbor metadata, always 1 | +| `sonic_switch_temperature_celsius` | gauge | `sensor` | Chassis temperature sensor reading | +| 
`sonic_switch_temperature_high_threshold_celsius` | gauge | `sensor` | Chassis temperature sensor high threshold | +| `sonic_switch_temperature_warning` | gauge | `sensor` | Chassis temperature warning status (1=warning, 0=ok) | +| `sonic_switch_interface_bytes_total` | counter | `interface`, `direction` | Bytes transferred | +| `sonic_switch_interface_packets_total` | counter | `interface`, `direction`, `type` | Packets by type (unicast, multicast, broadcast, non_unicast) | +| `sonic_switch_interface_errors_total` | counter | `interface`, `direction` | Interface error counters | +| `sonic_switch_interface_discards_total` | counter | `interface`, `direction` | Interface discard counters | +| `sonic_switch_interface_dropped_packets_total` | counter | `interface`, `direction` | SAI-level dropped packets | +| `sonic_switch_interface_fec_frames_total` | counter | `interface`, `type` | FEC frame counters (correctable, uncorrectable, symbol_errors) | +| `sonic_switch_interface_queue_length` | gauge | `interface` | Current output queue length | +| `sonic_switch_interface_pfc_packets_total` | counter | `interface`, `direction`, `priority` | PFC packets per priority (0-7) | +| `sonic_switch_interface_rx_packet_size_bytes` | histogram | `interface` | RX packet size distribution (buckets: 64, 127, 255, 511, 1023, 1518, 2047, 4095, 9216, 16383) | +| `sonic_switch_interface_tx_packet_size_bytes` | histogram | `interface` | TX packet size distribution (buckets: 64, 127, 255, 511, 1023, 1518, 2047, 4095, 9216, 16383) | +| `sonic_switch_interface_anomaly_packets_total` | counter | `interface`, `type` | Anomalous packets (undersize, oversize, fragments, jabbers, unknown_protos) | + +## Metrics configuration schema + +A custom config file replaces all config-driven metrics. The file is YAML with a single top-level key: + +```yaml +metrics: + - redis_db: ... + key_pattern: ... + fields: + - metric: ... + ... 
+``` + +### `metrics[]` — Metric mapping + +Each entry maps a set of Redis keys to one or more Prometheus metrics. + +| Field | Required | Default | Description | +|-------|----------|---------|-------------| +| `redis_db` | yes | — | SONiC Redis database name (`CONFIG_DB`, `STATE_DB`, `COUNTERS_DB`, `APPL_DB`) | +| `key_pattern` | yes | — | Redis `KEYS` glob pattern (e.g. `TRANSCEIVER_INFO\|*`) | +| `key_separator` | no | `\|` | Character separating the table prefix from the key suffix | +| `key_resolver` | no | — | Name of a Redis hash that maps logical names to key suffixes (e.g. `COUNTERS_PORT_NAME_MAP`) | +| `fields` | yes | — | List of field-to-metric mappings | + +### `fields[]` — Field mapping + +Each entry maps a Redis hash field (or set of fields) to a Prometheus metric. + +| Field | Required | Default | Description | +|-------|----------|---------|-------------| +| `field` | no | — | Specific Redis hash field name. Mutually exclusive with `field_pattern` | +| `field_pattern` | no | — | Set to `*` to iterate all hash fields. Mutually exclusive with `field` | +| `metric` | yes | — | Prometheus metric name | +| `type` | yes | — | `gauge`, `counter`, or `histogram` | +| `help` | no | — | Metric help string | +| `value` | no | — | Fixed metric value (ignores field value). Use for `_info` pattern metrics | +| `labels` | no | — | Map of label names to [value templates](#label-value-templates) | +| `transform` | no | — | [Value transformation](#transforms) | + +When neither `field` nor `field_pattern` is set, the metric is emitted once per key using `value` or label data from the hash. + +### Label value templates + +Label values are strings that can reference dynamic data using `$` prefixes: + +| Template | Resolves to | +|----------|-------------| +| `$key_suffix` | Part of the Redis key after the separator (e.g. `Ethernet0` from `TRANSCEIVER_INFO\|Ethernet0`) | +| `$port_name` | Resolved name from `key_resolver` (e.g.
`Ethernet0` resolved via `COUNTERS_PORT_NAME_MAP`) | +| `$field_name` | The Redis hash field name (useful with `field_pattern: "*"`) | +| `$<field>` | Value of a hash field (e.g. `$vendor_name` reads the `vendor_name` field) | +| _(literal)_ | Any string without a `$` prefix is used as-is | + +### Transforms + +Transforms modify how the metric value is derived. At most one transform should be set per field mapping, with one exception: `regex_capture` may be combined with `map` (see below). + +#### `map` + +Maps string field values to floats. Unmapped values are silently skipped. + +```yaml +transform: + map: + up: 1 + down: 0 +``` + +#### `regex_capture` + +Matches field names against a Go regex with [named capture groups](https://pkg.go.dev/regexp/syntax). Non-matching fields are skipped. Capture group names become additional Prometheus labels. Requires `field_pattern: "*"`. + +```yaml +field_pattern: "*" +metric: sonic_switch_transceiver_dom_rx_power_dbm +labels: + interface: "$key_suffix" +transform: + regex_capture: + pattern: "^rx(?P<lane>\\d+)power$" +``` + +This matches `rx1power`, `rx2power`, etc. and produces a `lane` label with the captured digit. + +`regex_capture` can be combined with `map` to filter field names by regex while also converting string values. For example, to expose per-lane boolean fields as numeric gauges: + +```yaml +field_pattern: "*" +metric: sonic_switch_transceiver_rxlos +labels: + interface: "$key_suffix" +transform: + regex_capture: + pattern: "^rxlos(?P<lane>\\d+)$" + map: + "True": 1 + "False": 0 +``` + +#### `parse_threshold_field` + +Parses SONiC DOM threshold field names (e.g. `temphighalarm`) into three additional labels: `sensor`, `level`, and `direction`. Requires `field_pattern: "*"`.
+ +```yaml +transform: + parse_threshold_field: true +``` + +| Field name | sensor | level | direction | +|------------|--------|-------|-----------| +| `temphighalarm` | temperature | alarm | high | +| `vcclowwarning` | voltage | warning | low | +| `rxpowerhighwarning` | rx_power | warning | high | +| `txbiaslowalarm` | tx_bias | alarm | low | +| `txpowerhighalarm` | tx_power | alarm | high | + +#### `dom_flag_severity` + +Computes a severity rollup from all DOM flag fields in the hash. Each field is parsed as a threshold field name; if its value is `"true"`, it contributes to the severity. Returns the highest severity found: `0` (ok), `1` (warning), or `2` (alarm). Note: this transform is available but not included in the default config because the `TRANSCEIVER_DOM_FLAG` table is not present on all platforms. + +```yaml +transform: + dom_flag_severity: true +``` + +#### `histogram` + +Maps multiple Redis hash fields to a single Prometheus histogram. Each entry in `buckets` maps an upper bound (float64) to a Redis hash field name. The transform reads each field, parses the count as an unsigned integer, and accumulates cumulative bucket counts. The resulting histogram has `sum=0` because SAI counters don't provide total bytes — but bucket-based percentile queries and heatmap visualizations still work. Requires `type: "histogram"`. + +```yaml +- metric: sonic_switch_interface_rx_packet_size_bytes + type: histogram + help: "RX packet size distribution" + labels: + interface: "$port_name" + transform: + histogram: + buckets: + 64: SAI_PORT_STAT_ETHER_IN_PKTS_64_OCTETS + 127: SAI_PORT_STAT_ETHER_IN_PKTS_65_TO_127_OCTETS + 255: SAI_PORT_STAT_ETHER_IN_PKTS_128_TO_255_OCTETS + 511: SAI_PORT_STAT_ETHER_IN_PKTS_256_TO_511_OCTETS + 1023: SAI_PORT_STAT_ETHER_IN_PKTS_512_TO_1023_OCTETS + 1518: SAI_PORT_STAT_ETHER_IN_PKTS_1024_TO_1518_OCTETS +``` + +This emits `_bucket`, `_count`, and `_sum` series automatically — Prometheus handles the histogram suffixes. 
+ +## Examples + +### Adding a new counter from COUNTERS_DB + +```yaml +metrics: + - redis_db: COUNTERS_DB + key_pattern: "COUNTERS:*" + key_separator: ":" + key_resolver: COUNTERS_PORT_NAME_MAP + fields: + - field: SAI_PORT_STAT_IF_IN_UCAST_PKTS + metric: sonic_switch_interface_unicast_packets_total + type: counter + help: "Total unicast packets received" + labels: + interface: "$port_name" + direction: "rx" +``` + +### Exposing a string field as an enum gauge + +```yaml +metrics: + - redis_db: STATE_DB + key_pattern: "PORT_TABLE|*" + key_separator: "|" + fields: + - field: oper_status + metric: sonic_switch_interface_oper_state + type: gauge + help: "Operational state of the interface" + labels: + interface: "$key_suffix" + transform: + map: + up: 1 + down: 0 +``` + +### Metadata as labels (info pattern) + +```yaml +metrics: + - redis_db: STATE_DB + key_pattern: "TRANSCEIVER_INFO|*" + key_separator: "|" + fields: + - metric: sonic_switch_transceiver_info + type: gauge + help: "Transceiver metadata" + value: 1 + labels: + interface: "$key_suffix" + vendor: "$manufacturer" + serial: "$serial" +``` diff --git a/go.mod b/go.mod index f20d696..4871e04 100644 --- a/go.mod +++ b/go.mod @@ -4,10 +4,12 @@ go 1.24.5 require ( github.com/go-logr/logr v1.4.3 + github.com/go-redis/redismock/v9 v9.2.0 github.com/ironcore-dev/controller-utils v0.11.0 github.com/jedib0t/go-pretty/v6 v6.7.8 github.com/onsi/ginkgo/v2 v2.28.1 github.com/onsi/gomega v1.39.1 + github.com/prometheus/client_golang v1.23.2 github.com/redis/go-redis/v9 v9.18.0 github.com/spf13/cobra v1.10.2 github.com/spf13/pflag v1.0.10 @@ -18,6 +20,7 @@ require ( k8s.io/apimachinery v0.34.1 k8s.io/client-go v0.34.1 sigs.k8s.io/controller-runtime v0.22.3 + sigs.k8s.io/yaml v1.6.0 ) require ( @@ -62,12 +65,12 @@ require ( github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/kylelemons/godebug 
v1.1.0 // indirect github.com/mattn/go-runewidth v0.0.16 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.1 // indirect github.com/prometheus/procfs v0.19.1 // indirect @@ -114,5 +117,4 @@ require ( sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect - sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index 57cb4dd..8584249 100644 --- a/go.sum +++ b/go.sum @@ -78,6 +78,8 @@ github.com/go-openapi/swag/typeutils v0.25.1 h1:rD/9HsEQieewNt6/k+JBwkxuAHktFtH3 github.com/go-openapi/swag/typeutils v0.25.1/go.mod h1:9McMC/oCdS4BKwk2shEB7x17P6HmMmA6dQRtAkSnNb8= github.com/go-openapi/swag/yamlutils v0.25.1 h1:mry5ez8joJwzvMbaTGLhw8pXUnhDK91oSJLDPF1bmGk= github.com/go-openapi/swag/yamlutils v0.25.1/go.mod h1:cm9ywbzncy3y6uPm/97ysW8+wZ09qsks+9RS8fLWKqg= +github.com/go-redis/redismock/v9 v9.2.0 h1:ZrMYQeKPECZPjOj5u9eyOjg8Nnb0BS9lkVIZ6IpsKLw= +github.com/go-redis/redismock/v9 v9.2.0/go.mod h1:18KHfGDK4Y6c2R0H38EUGWAdc7ZQS9gfYxc94k7rWT0= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= @@ -139,6 +141,10 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFd github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= 
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= +github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= +github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= +github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= github.com/onsi/ginkgo/v2 v2.28.1 h1:S4hj+HbZp40fNKuLUQOYLDgZLwNUVn19N3Atb98NCyI= github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE= github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28= @@ -300,6 +306,8 @@ gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnf gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/hack/boilerplate.go.txt b/hack/boilerplate.go.txt index 1fb47b5..f376d90 100644 --- a/hack/boilerplate.go.txt +++ b/hack/boilerplate.go.txt @@ -1,2 +1,2 @@ -// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors // 
SPDX-License-Identifier: Apache-2.0 diff --git a/hack/license-header.txt b/hack/license-header.txt index 3ceae8e..62b052b 100644 --- a/hack/license-header.txt +++ b/hack/license-header.txt @@ -1,2 +1,2 @@ -SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors SPDX-License-Identifier: Apache-2.0 diff --git a/internal/agent/agent_server/server.go b/internal/agent/agent_server/server.go index c59b1d5..b7ac62d 100644 --- a/internal/agent/agent_server/server.go +++ b/internal/agent/agent_server/server.go @@ -9,20 +9,23 @@ import ( "fmt" "log" "net" - - pb "github.com/ironcore-dev/sonic-operator/internal/agent/proto" - agent "github.com/ironcore-dev/sonic-operator/internal/agent/types" + "net/http" switchAgent "github.com/ironcore-dev/sonic-operator/internal/agent/interface" + "github.com/ironcore-dev/sonic-operator/internal/agent/metrics" + pb "github.com/ironcore-dev/sonic-operator/internal/agent/proto" "github.com/ironcore-dev/sonic-operator/internal/agent/sonic" + agent "github.com/ironcore-dev/sonic-operator/internal/agent/types" "google.golang.org/grpc" "google.golang.org/grpc/reflection" ) var ( - port = flag.Int("port", 50051, "The server port") - redisAddr = flag.String("redis-addr", "127.0.0.1:6379", "The Redis address") + port = flag.Int("port", 50051, "The server port") + redisAddr = flag.String("redis-addr", "127.0.0.1:6379", "The Redis address") + metricsPort = flag.Int("metrics-port", 9100, "The metrics server port") + metricsConfig = flag.String("metrics-config", "", "Path to metrics mapping config YAML (uses built-in defaults if empty)") ) type proxyServer struct { @@ -234,6 +237,15 @@ func StartServer() { panic(err) } + // Start Prometheus metrics HTTP server + metricsSrv := metrics.NewMetricsServer(fmt.Sprintf("0.0.0.0:%d", *metricsPort), swAgent, sonic.GetSonicVersionInfo, *metricsConfig) + go func() { + log.Printf("metrics 
server listening at 0.0.0.0:%d", *metricsPort) + if err := metricsSrv.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.Fatalf("metrics server failed: %v", err) + } + }() + pb.RegisterSwitchAgentServiceServer(s, &proxyServer{ SwitchAgent: swAgent, }) diff --git a/internal/agent/metrics/collector.go b/internal/agent/metrics/collector.go new file mode 100644 index 0000000..494e5eb --- /dev/null +++ b/internal/agent/metrics/collector.go @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import ( + "context" + "strings" + "time" + + "github.com/redis/go-redis/v9" +) + +// RedisConnector abstracts Redis database access for testability. +// *sonic.SonicAgent satisfies this interface via its Connect method. +type RedisConnector interface { + Connect(dbName string) (*redis.Client, error) +} + +// collectTimeout is the context timeout for Redis reads during a Prometheus scrape. +const collectTimeout = 8 * time.Second + +// batchHGetAll fetches all hash fields for the given keys using a Redis pipeline. +// Returns a map from key to field-value map. Keys that fail are silently skipped. +func batchHGetAll(ctx context.Context, client *redis.Client, keys []string) map[string]map[string]string { + if len(keys) == 0 { + return nil + } + + pipe := client.Pipeline() + cmds := make(map[string]*redis.MapStringStringCmd, len(keys)) + for _, key := range keys { + cmds[key] = pipe.HGetAll(ctx, key) + } + _, _ = pipe.Exec(ctx) + + result := make(map[string]map[string]string, len(keys)) + for key, cmd := range cmds { + fields, err := cmd.Result() + if err != nil || len(fields) == 0 { + continue + } + result[key] = fields + } + return result +} + +// extractKeySuffix returns the portion of a Redis key after the first separator +// ("|" or ":"). For example, "PORT_TABLE|Ethernet0" returns "Ethernet0". 
+func extractKeySuffix(key, sep string) string { + _, after, ok := strings.Cut(key, sep) + if !ok { + return key + } + return after +} diff --git a/internal/agent/metrics/config.go b/internal/agent/metrics/config.go new file mode 100644 index 0000000..800343e --- /dev/null +++ b/internal/agent/metrics/config.go @@ -0,0 +1,203 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import ( + "embed" + "encoding/json" + "fmt" + "os" + "regexp" + "strconv" + + "sigs.k8s.io/yaml" +) + +//go:embed default_config.yaml +var defaultConfigFS embed.FS + +// Metric type constants used in configuration and validation. +const ( + metricTypeGauge = "gauge" + metricTypeCounter = "counter" + metricTypeHistogram = "histogram" +) + +// MetricsConfig is the top-level YAML configuration for config-driven metrics. +type MetricsConfig struct { + Metrics []MetricMapping `json:"metrics"` +} + +// MetricMapping defines how a set of Redis keys maps to Prometheus metrics. +type MetricMapping struct { + // RedisDB is the SONiC Redis database name (e.g. "STATE_DB", "COUNTERS_DB"). + RedisDB string `json:"redis_db"` + // KeyPattern is the Redis KEYS glob pattern (e.g. "TRANSCEIVER_INFO|*"). + KeyPattern string `json:"key_pattern"` + // KeySeparator is the character separating the table prefix from the key suffix. + // Defaults to "|". + KeySeparator string `json:"key_separator,omitempty"` + // KeyResolver is the name of a Redis hash that maps logical names to key suffixes. + // Used for COUNTERS_DB where keys are OIDs resolved via COUNTERS_PORT_NAME_MAP. + KeyResolver string `json:"key_resolver,omitempty"` + // Fields defines the field-to-metric mappings for each matched key. + Fields []FieldMapping `json:"fields"` +} + +// FieldMapping defines how a Redis hash field maps to a Prometheus metric. +type FieldMapping struct { + // Field is a specific Redis hash field name. 
Mutually exclusive with FieldPattern. + Field string `json:"field,omitempty"` + // FieldPattern iterates all fields when set to "*". Mutually exclusive with Field. + FieldPattern string `json:"field_pattern,omitempty"` + // Metric is the Prometheus metric name. + Metric string `json:"metric"` + // Type is the Prometheus metric type: "gauge" or "counter". + Type string `json:"type"` + // Help is the metric help string. + Help string `json:"help,omitempty"` + // Value is a fixed metric value (e.g. 1 for _info pattern). When set, the Redis + // field value is ignored and this value is used instead. + Value *float64 `json:"value,omitempty"` + // Labels maps Prometheus label names to value templates. + // Templates: "$key_suffix", "$port_name", "$field_name", "$", or literal. + Labels map[string]string `json:"labels,omitempty"` + // Transform defines optional value transformations. + Transform *Transform `json:"transform,omitempty"` +} + +// Transform defines value transformations for a field mapping. +type Transform struct { + // Map converts string field values to float64 (e.g. {"up": 1, "down": 0}). + Map map[string]float64 `json:"map,omitempty"` + // ParseThresholdField enables special DOM threshold field name parsing, + // which extracts sensor/level/direction labels from field names like "temphighalarm". + ParseThresholdField bool `json:"parse_threshold_field,omitempty"` + // RegexCapture matches field names against a regex and extracts capture groups as labels. + RegexCapture *RegexCapture `json:"regex_capture,omitempty"` + // DOMFlagSeverity computes a severity rollup (0=ok, 1=warning, 2=alarm) from all hash fields. + DOMFlagSeverity bool `json:"dom_flag_severity,omitempty"` + // Histogram maps upper bounds to Redis field names, emitting a Prometheus histogram. + Histogram *HistogramBuckets `json:"histogram,omitempty"` +} + +// RegexCapture defines a regex-based field name matching transform. +// The pattern must use Go named capture groups (?P...) 
to define label names. +// For example, "^rx(?P\\d+)power$" matches "rx1power" and produces label lane="1". +type RegexCapture struct { + // Pattern is a Go regex with named capture groups (e.g. "^rx(?P\\d+)power$"). + Pattern string `json:"pattern"` +} + +// HistogramBuckets defines a histogram transform that maps Redis field names to +// Prometheus histogram bucket upper bounds. +type HistogramBuckets struct { + // Buckets maps upper bounds (in bytes, seconds, etc.) to Redis hash field names. + // Values are read, parsed as uint64, and accumulated into cumulative histogram buckets. + Buckets map[float64]string `json:"buckets"` +} + +// UnmarshalJSON implements custom JSON unmarshaling for HistogramBuckets. +// sigs.k8s.io/yaml converts YAML→JSON, so numeric YAML keys become JSON string keys. +// This method parses those string keys back to float64. +func (hb *HistogramBuckets) UnmarshalJSON(data []byte) error { + var raw struct { + Buckets map[string]string `json:"buckets"` + } + if err := json.Unmarshal(data, &raw); err != nil { + return err + } + hb.Buckets = make(map[float64]string, len(raw.Buckets)) + for k, v := range raw.Buckets { + f, err := strconv.ParseFloat(k, 64) + if err != nil { + return fmt.Errorf("histogram bucket key %q is not a valid number: %w", k, err) + } + hb.Buckets[f] = v + } + return nil +} + +// effectiveSeparator returns the key separator, defaulting to "|". +func (m *MetricMapping) effectiveSeparator() string { + if m.KeySeparator != "" { + return m.KeySeparator + } + return "|" +} + +// LoadConfig loads a MetricsConfig from a YAML file path. +func LoadConfig(path string) (*MetricsConfig, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("reading metrics config %s: %w", path, err) + } + return parseConfig(data) +} + +// DefaultConfig returns the built-in default metrics configuration. 
+func DefaultConfig() (*MetricsConfig, error) { + data, err := defaultConfigFS.ReadFile("default_config.yaml") + if err != nil { + return nil, fmt.Errorf("reading embedded default config: %w", err) + } + return parseConfig(data) +} + +func parseConfig(data []byte) (*MetricsConfig, error) { + var cfg MetricsConfig + if err := yaml.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("parsing metrics config: %w", err) + } + if err := validateConfig(&cfg); err != nil { + return nil, err + } + return &cfg, nil +} + +func validateConfig(cfg *MetricsConfig) error { + for i, m := range cfg.Metrics { + if m.RedisDB == "" { + return fmt.Errorf("metrics[%d]: redis_db is required", i) + } + if m.KeyPattern == "" { + return fmt.Errorf("metrics[%d]: key_pattern is required", i) + } + for j, f := range m.Fields { + if f.Metric == "" { + return fmt.Errorf("metrics[%d].fields[%d]: metric is required", i, j) + } + if f.Type != metricTypeGauge && f.Type != metricTypeCounter && f.Type != metricTypeHistogram { + return fmt.Errorf("metrics[%d].fields[%d]: type must be 'gauge', 'counter', or 'histogram', got %q", i, j, f.Type) + } + if f.Type == metricTypeHistogram { + if f.Transform == nil || f.Transform.Histogram == nil || len(f.Transform.Histogram.Buckets) == 0 { + return fmt.Errorf("metrics[%d].fields[%d]: histogram type requires transform.histogram.buckets", i, j) + } + } + if f.Field != "" && f.FieldPattern != "" { + return fmt.Errorf("metrics[%d].fields[%d]: field and field_pattern are mutually exclusive", i, j) + } + if f.Transform != nil && f.Transform.RegexCapture != nil { + rc := f.Transform.RegexCapture + if rc.Pattern == "" { + return fmt.Errorf("metrics[%d].fields[%d]: regex_capture.pattern is required", i, j) + } + re, err := regexp.Compile(rc.Pattern) + if err != nil { + return fmt.Errorf("metrics[%d].fields[%d]: regex_capture.pattern is invalid: %w", i, j, err) + } + if re.NumSubexp() == 0 { + return fmt.Errorf("metrics[%d].fields[%d]: regex_capture.pattern must 
have at least one named capture group", i, j) + } + for idx, name := range re.SubexpNames()[1:] { + if name == "" { + return fmt.Errorf("metrics[%d].fields[%d]: regex_capture.pattern group %d is unnamed; use (?P...) syntax", i, j, idx+1) + } + } + } + } + } + return nil +} diff --git a/internal/agent/metrics/config_collector.go b/internal/agent/metrics/config_collector.go new file mode 100644 index 0000000..6fc9fd8 --- /dev/null +++ b/internal/agent/metrics/config_collector.go @@ -0,0 +1,365 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import ( + "context" + "log" + "regexp" + "sort" + "strconv" + "strings" + + "github.com/prometheus/client_golang/prometheus" +) + +// ConfigCollector is a generic Prometheus collector driven by a MetricMapping. +// It reads Redis keys matching a pattern and emits metrics based on field mappings. +type ConfigCollector struct { + connector RedisConnector + mapping MetricMapping + + // descs holds one descriptor per unique metric name. + descs map[string]*prometheus.Desc + // compiledRegex holds pre-compiled regexes keyed by field mapping index. + compiledRegex map[int]*regexp.Regexp +} + +// NewConfigCollector creates a collector from a MetricMapping configuration entry. 
+func NewConfigCollector(connector RedisConnector, mapping MetricMapping) *ConfigCollector { + descs := make(map[string]*prometheus.Desc) + compiledRegex := make(map[int]*regexp.Regexp) + + for i, f := range mapping.Fields { + if _, exists := descs[f.Metric]; exists { + continue + } + labels := sortedLabelNames(f, mapping.KeyResolver != "") + // If this field uses parse_threshold_field, additional labels are added dynamically + if f.Transform != nil && f.Transform.ParseThresholdField { + labels = appendUnique(labels, "sensor", "level", "direction") + } + // If this field uses regex_capture, append capture labels and pre-compile regex + if f.Transform != nil && f.Transform.RegexCapture != nil { + re := regexp.MustCompile(f.Transform.RegexCapture.Pattern) + compiledRegex[i] = re + // Extract label names from named capture groups + for _, name := range re.SubexpNames()[1:] { + labels = appendUnique(labels, name) + } + } + descs[f.Metric] = prometheus.NewDesc(f.Metric, f.Help, labels, nil) + } + + return &ConfigCollector{ + connector: connector, + mapping: mapping, + descs: descs, + compiledRegex: compiledRegex, + } +} + +func (c *ConfigCollector) Describe(ch chan<- *prometheus.Desc) { + for _, d := range c.descs { + ch <- d + } +} + +func (c *ConfigCollector) Collect(ch chan<- prometheus.Metric) { + ctx, cancel := context.WithTimeout(context.Background(), collectTimeout) + defer cancel() + + client, err := c.connector.Connect(c.mapping.RedisDB) + if err != nil { + log.Printf("ConfigCollector[%s]: failed to connect to %s: %v", + c.mapping.KeyPattern, c.mapping.RedisDB, err) + return + } + + sep := c.mapping.effectiveSeparator() + + // If key_resolver is set, resolve names→keys first + var resolverMap map[string]string // keyInRedis → resolvedName + if c.mapping.KeyResolver != "" { + nameToOID, err := client.HGetAll(ctx, c.mapping.KeyResolver).Result() + if err != nil { + log.Printf("ConfigCollector[%s]: failed to read resolver %s: %v", + c.mapping.KeyPattern, 
c.mapping.KeyResolver, err) + return + } + // Build reverse map: "COUNTERS:oid:..." → "Ethernet0" + resolverMap = make(map[string]string, len(nameToOID)) + prefix := strings.SplitN(c.mapping.KeyPattern, "*", 2)[0] // "COUNTERS:" + for name, oid := range nameToOID { + resolverMap[prefix+oid] = name + } + } + + // Fetch all matching keys + keys, err := client.Keys(ctx, c.mapping.KeyPattern).Result() + if err != nil { + log.Printf("ConfigCollector[%s]: failed to list keys: %v", + c.mapping.KeyPattern, err) + return + } + + data := batchHGetAll(ctx, client, keys) + + for _, key := range keys { + fields, ok := data[key] + if !ok { + continue + } + keySuffix := extractKeySuffix(key, sep) + + // Resolved port name (for key_resolver) + portName := "" + if resolverMap != nil { + portName, ok = resolverMap[key] + if !ok { + continue // skip keys not in the resolver map + } + } + + for fi, fm := range c.mapping.Fields { + // dom_flag_severity operates on the whole hash, not per-field + if fm.Transform != nil && fm.Transform.DOMFlagSeverity { + desc := c.descs[fm.Metric] + metricType := prometheus.GaugeValue + if fm.Type == metricTypeCounter { + metricType = prometheus.CounterValue + } + severity := domFlagSeverity(fields) + labels := resolveLabels(fm.Labels, keySuffix, portName, "", fields) + ch <- prometheus.MustNewConstMetric(desc, metricType, severity, labels...) 
+ continue + } + + // histogram operates on the whole hash — reads specific bucket fields + if fm.Type == "histogram" && fm.Transform != nil && fm.Transform.Histogram != nil { + desc := c.descs[fm.Metric] + labels := resolveLabels(fm.Labels, keySuffix, portName, "", fields) + collectHistogram(ch, desc, fm.Transform.Histogram, fields, labels) + continue + } + + if fm.FieldPattern == "*" { + // Iterate all fields + c.collectAllFields(ch, fi, fm, fields, keySuffix, portName) + } else if fm.Field != "" { + // Specific field + c.collectField(ch, fi, fm, fm.Field, fields, keySuffix, portName) + } else { + // No field specified — emit using fixed value or labels from hash fields + c.collectField(ch, fi, fm, "", fields, keySuffix, portName) + } + } + } +} + +func (c *ConfigCollector) collectAllFields( + ch chan<- prometheus.Metric, + fieldIdx int, + fm FieldMapping, + hashFields map[string]string, + keySuffix, portName string, +) { + for fieldName, fieldVal := range hashFields { + c.collectFieldEntry(ch, fieldIdx, fm, fieldName, fieldVal, hashFields, keySuffix, portName) + } +} + +func (c *ConfigCollector) collectField( + ch chan<- prometheus.Metric, + fieldIdx int, + fm FieldMapping, + fieldName string, + hashFields map[string]string, + keySuffix, portName string, +) { + fieldVal := "" + if fieldName != "" { + var ok bool + fieldVal, ok = hashFields[fieldName] + if !ok { + return + } + } + c.collectFieldEntry(ch, fieldIdx, fm, fieldName, fieldVal, hashFields, keySuffix, portName) +} + +func (c *ConfigCollector) collectFieldEntry( + ch chan<- prometheus.Metric, + fieldIdx int, + fm FieldMapping, + fieldName, fieldVal string, + hashFields map[string]string, + keySuffix, portName string, +) { + desc := c.descs[fm.Metric] + metricType := prometheus.GaugeValue + if fm.Type == "counter" { + metricType = prometheus.CounterValue + } + + // Handle regex_capture transform — filter by field name and extract capture group labels. 
+ // Does NOT determine the value; falls through to the value resolution below. + var captureLabels []string + if fm.Transform != nil && fm.Transform.RegexCapture != nil { + re := c.compiledRegex[fieldIdx] + m := re.FindStringSubmatch(fieldName) + if m == nil { + return // field doesn't match, skip + } + captureLabels = append(captureLabels, m[1:]...) + } + + // Handle parse_threshold_field transform + if fm.Transform != nil && fm.Transform.ParseThresholdField { + parsed := parseThresholdField(fieldName) + if parsed == nil { + return + } + v, err := strconv.ParseFloat(fieldVal, 64) + if err != nil { + return + } + labels := resolveLabels(fm.Labels, keySuffix, portName, fieldName, hashFields) + labels = append(labels, parsed.Sensor, parsed.Level, parsed.Direction) + ch <- prometheus.MustNewConstMetric(desc, metricType, v, labels...) + return + } + + // Determine value + var v float64 + if fm.Value != nil { + v = *fm.Value + } else if fm.Transform != nil && fm.Transform.Map != nil { + mapped, ok := fm.Transform.Map[fieldVal] + if !ok { + return + } + v = mapped + } else { + var err error + v, err = strconv.ParseFloat(fieldVal, 64) + if err != nil { + return + } + } + + labels := resolveLabels(fm.Labels, keySuffix, portName, fieldName, hashFields) + labels = append(labels, captureLabels...) + ch <- prometheus.MustNewConstMetric(desc, metricType, v, labels...) +} + +// resolveLabels resolves label value templates into concrete values, returning +// them in sorted label-name order (matching the desc's label order). 
+func resolveLabels( + labelTemplates map[string]string, + keySuffix, portName, fieldName string, + hashFields map[string]string, +) []string { + // Sort label names to match prometheus.Desc label order + names := make([]string, 0, len(labelTemplates)) + for name := range labelTemplates { + names = append(names, name) + } + sort.Strings(names) + + values := make([]string, 0, len(names)) + for _, name := range names { + tmpl := labelTemplates[name] + values = append(values, resolveTemplate(tmpl, keySuffix, portName, fieldName, hashFields)) + } + return values +} + +// resolveTemplate resolves a single label value template. +func resolveTemplate(tmpl, keySuffix, portName, fieldName string, hashFields map[string]string) string { + if !strings.HasPrefix(tmpl, "$") { + return tmpl // literal value + } + varName := tmpl[1:] + switch varName { + case "key_suffix": + return keySuffix + case "port_name": + return portName + case "field_name": + return fieldName + default: + // Treat as a hash field reference (e.g. "$vendor_name") + return hashFields[varName] + } +} + +// sortedLabelNames extracts label names from a FieldMapping in sorted order. +func sortedLabelNames(f FieldMapping, _ bool) []string { + names := make([]string, 0, len(f.Labels)) + for name := range f.Labels { + names = append(names, name) + } + sort.Strings(names) + return names +} + +// appendUnique appends items to a slice only if they're not already present. +func appendUnique(slice []string, items ...string) []string { + set := make(map[string]bool, len(slice)) + for _, s := range slice { + set[s] = true + } + for _, item := range items { + if !set[item] { + slice = append(slice, item) + set[item] = true + } + } + return slice +} + +// collectHistogram reads bucket fields from the hash, accumulates cumulative counts, +// and emits a prometheus.MustNewConstHistogram. 
+func collectHistogram( + ch chan<- prometheus.Metric, + desc *prometheus.Desc, + hb *HistogramBuckets, + hashFields map[string]string, + labels []string, +) { + // Sort upper bounds + bounds := make([]float64, 0, len(hb.Buckets)) + for ub := range hb.Buckets { + bounds = append(bounds, ub) + } + sort.Float64s(bounds) + + // Read non-cumulative counts from Redis and accumulate into cumulative buckets. + var totalCount uint64 + cumBuckets := make(map[float64]uint64, len(bounds)) + var cumulative uint64 + for _, ub := range bounds { + fieldName := hb.Buckets[ub] + val, ok := hashFields[fieldName] + if !ok { + cumBuckets[ub] = cumulative + continue + } + n, err := strconv.ParseUint(val, 10, 64) + if err != nil { + cumBuckets[ub] = cumulative + continue + } + cumulative += n + cumBuckets[ub] = cumulative + } + totalCount = cumulative + + // +Inf bucket count equals totalCount (Prometheus adds it automatically). + // sum is 0 — SAI doesn't provide total bytes, only bucket counts. + ch <- prometheus.MustNewConstHistogram( + desc, totalCount, 0, cumBuckets, labels..., + ) +} diff --git a/internal/agent/metrics/default_config.yaml b/internal/agent/metrics/default_config.yaml new file mode 100644 index 0000000..fbe02b3 --- /dev/null +++ b/internal/agent/metrics/default_config.yaml @@ -0,0 +1,525 @@ +# SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors +# SPDX-License-Identifier: Apache-2.0 + +# Default metrics configuration for the sonic-agent. +# This file is embedded in the agent binary and used when no custom config is provided. +# Operators can override by providing their own file via the -metrics-config flag. 
+
+metrics:
+  # Transceiver DOM sensor readings
+  - redis_db: STATE_DB
+    key_pattern: "TRANSCEIVER_DOM_SENSOR|*"
+    key_separator: "|"
+    fields:
+      - field: temperature
+        metric: sonic_switch_transceiver_dom_temperature_celsius
+        type: gauge
+        help: "Transceiver temperature in Celsius"
+        labels:
+          interface: "$key_suffix"
+      - field: voltage
+        metric: sonic_switch_transceiver_dom_voltage_volts
+        type: gauge
+        help: "Transceiver supply voltage in Volts"
+        labels:
+          interface: "$key_suffix"
+      - field_pattern: "*"
+        metric: sonic_switch_transceiver_dom_rx_power_dbm
+        type: gauge
+        help: "Transceiver RX power in dBm"
+        labels:
+          interface: "$key_suffix"
+        transform:
+          regex_capture:
+            # Named group required by the config validator; captures the lane number.
+            pattern: "^rx(?P<lane>\\d+)power$"
+      - field_pattern: "*"
+        metric: sonic_switch_transceiver_dom_tx_bias_milliamps
+        type: gauge
+        help: "Transceiver TX bias current in milliamps"
+        labels:
+          interface: "$key_suffix"
+        transform:
+          regex_capture:
+            pattern: "^tx(?P<lane>\\d+)bias$"
+
+  # Transceiver DOM thresholds — single metric with sensor/level/direction labels
+  - redis_db: STATE_DB
+    key_pattern: "TRANSCEIVER_DOM_THRESHOLD|*"
+    key_separator: "|"
+    fields:
+      - field_pattern: "*"
+        metric: sonic_switch_transceiver_dom_threshold
+        type: gauge
+        help: "Transceiver DOM threshold value"
+        labels:
+          interface: "$key_suffix"
+        transform:
+          parse_threshold_field: true
+
+  # Transceiver static info — _info pattern with metadata as labels
+  - redis_db: STATE_DB
+    key_pattern: "TRANSCEIVER_INFO|*"
+    key_separator: "|"
+    fields:
+      - metric: sonic_switch_transceiver_info
+        type: gauge
+        help: "Transceiver static metadata as labels, always 1"
+        value: 1
+        labels:
+          interface: "$key_suffix"
+          type: "$type"
+          vendor: "$manufacturer"
+          model: "$model"
+          serial: "$serial"
+
+  # Transceiver status — per-lane RX loss of signal and TX fault
+  - redis_db: STATE_DB
+    key_pattern: "TRANSCEIVER_STATUS|*"
+    key_separator: "|"
+    fields:
+      - field_pattern: "*"
+        metric: sonic_switch_transceiver_rxlos
+        type: gauge
+        help: 
"Transceiver RX loss of signal (1=loss, 0=ok)"
+        labels:
+          interface: "$key_suffix"
+        transform:
+          regex_capture:
+            # Named group required by the config validator; captures the lane number.
+            pattern: "^rxlos(?P<lane>\\d+)$"
+          map:
+            "True": 1
+            "False": 0
+      - field_pattern: "*"
+        metric: sonic_switch_transceiver_txfault
+        type: gauge
+        help: "Transceiver TX fault (1=fault, 0=ok)"
+        labels:
+          interface: "$key_suffix"
+        transform:
+          regex_capture:
+            pattern: "^txfault(?P<lane>\\d+)$"
+          map:
+            "True": 1
+            "False": 0
+
+  # Chassis temperature sensors
+  - redis_db: STATE_DB
+    key_pattern: "TEMPERATURE_INFO|*"
+    key_separator: "|"
+    fields:
+      - field: temperature
+        metric: sonic_switch_temperature_celsius
+        type: gauge
+        help: "Chassis temperature sensor reading in Celsius"
+        labels:
+          sensor: "$key_suffix"
+      - field: high_threshold
+        metric: sonic_switch_temperature_high_threshold_celsius
+        type: gauge
+        help: "Chassis temperature sensor high threshold in Celsius"
+        labels:
+          sensor: "$key_suffix"
+      - field: warning_status
+        metric: sonic_switch_temperature_warning
+        type: gauge
+        help: "Chassis temperature sensor warning status (1=warning, 0=ok)"
+        labels:
+          sensor: "$key_suffix"
+        transform:
+          map:
+            "True": 1
+            "False": 0
+
+  # LLDP neighbor info — metadata as labels
+  - redis_db: APPL_DB
+    key_pattern: "LLDP_ENTRY_TABLE:*"
+    key_separator: ":"
+    fields:
+      - metric: sonic_switch_interface_neighbor_info
+        type: gauge
+        help: "LLDP neighbor metadata as labels, always 1"
+        value: 1
+        labels:
+          interface: "$key_suffix"
+          neighbor_mac: "$lldp_rem_chassis_id"
+          neighbor_name: "$lldp_rem_sys_name"
+          neighbor_port: "$lldp_rem_port_desc"
+
+  # Interface counters — OID resolved via COUNTERS_PORT_NAME_MAP
+  - redis_db: COUNTERS_DB
+    key_pattern: "COUNTERS:*"
+    key_separator: ":"
+    key_resolver: COUNTERS_PORT_NAME_MAP
+    fields:
+      # Traffic volume (bytes)
+      - field: SAI_PORT_STAT_IF_IN_OCTETS
+        metric: sonic_switch_interface_bytes_total
+        type: counter
+        help: "Total bytes transferred"
+        labels:
+          interface: "$port_name"
+          direction: "rx"
+      - field: 
SAI_PORT_STAT_IF_OUT_OCTETS + metric: sonic_switch_interface_bytes_total + type: counter + help: "Total bytes transferred" + labels: + interface: "$port_name" + direction: "tx" + + # Packet counts by type + - field: SAI_PORT_STAT_IF_IN_UCAST_PKTS + metric: sonic_switch_interface_packets_total + type: counter + help: "Total packets transferred" + labels: + interface: "$port_name" + direction: "rx" + type: "unicast" + - field: SAI_PORT_STAT_IF_OUT_UCAST_PKTS + metric: sonic_switch_interface_packets_total + type: counter + help: "Total packets transferred" + labels: + interface: "$port_name" + direction: "tx" + type: "unicast" + - field: SAI_PORT_STAT_IF_IN_MULTICAST_PKTS + metric: sonic_switch_interface_packets_total + type: counter + help: "Total packets transferred" + labels: + interface: "$port_name" + direction: "rx" + type: "multicast" + - field: SAI_PORT_STAT_IF_OUT_MULTICAST_PKTS + metric: sonic_switch_interface_packets_total + type: counter + help: "Total packets transferred" + labels: + interface: "$port_name" + direction: "tx" + type: "multicast" + - field: SAI_PORT_STAT_IF_IN_BROADCAST_PKTS + metric: sonic_switch_interface_packets_total + type: counter + help: "Total packets transferred" + labels: + interface: "$port_name" + direction: "rx" + type: "broadcast" + - field: SAI_PORT_STAT_IF_OUT_BROADCAST_PKTS + metric: sonic_switch_interface_packets_total + type: counter + help: "Total packets transferred" + labels: + interface: "$port_name" + direction: "tx" + type: "broadcast" + - field: SAI_PORT_STAT_IF_IN_NON_UCAST_PKTS + metric: sonic_switch_interface_packets_total + type: counter + help: "Total packets transferred" + labels: + interface: "$port_name" + direction: "rx" + type: "non_unicast" + - field: SAI_PORT_STAT_IF_OUT_NON_UCAST_PKTS + metric: sonic_switch_interface_packets_total + type: counter + help: "Total packets transferred" + labels: + interface: "$port_name" + direction: "tx" + type: "non_unicast" + + # Errors and discards + - field: 
SAI_PORT_STAT_IF_IN_ERRORS + metric: sonic_switch_interface_errors_total + type: counter + help: "Total interface errors" + labels: + interface: "$port_name" + direction: "rx" + - field: SAI_PORT_STAT_IF_OUT_ERRORS + metric: sonic_switch_interface_errors_total + type: counter + help: "Total interface errors" + labels: + interface: "$port_name" + direction: "tx" + - field: SAI_PORT_STAT_IF_IN_DISCARDS + metric: sonic_switch_interface_discards_total + type: counter + help: "Total interface discards" + labels: + interface: "$port_name" + direction: "rx" + - field: SAI_PORT_STAT_IF_OUT_DISCARDS + metric: sonic_switch_interface_discards_total + type: counter + help: "Total interface discards" + labels: + interface: "$port_name" + direction: "tx" + + # Dropped packets (SAI-level drops) + - field: SAI_PORT_STAT_IN_DROPPED_PKTS + metric: sonic_switch_interface_dropped_packets_total + type: counter + help: "Total SAI-level dropped packets" + labels: + interface: "$port_name" + direction: "rx" + - field: SAI_PORT_STAT_OUT_DROPPED_PKTS + metric: sonic_switch_interface_dropped_packets_total + type: counter + help: "Total SAI-level dropped packets" + labels: + interface: "$port_name" + direction: "tx" + + # FEC counters + - field: SAI_PORT_STAT_IF_IN_FEC_CORRECTABLE_FRAMES + metric: sonic_switch_interface_fec_frames_total + type: counter + help: "Total FEC frames" + labels: + interface: "$port_name" + type: "correctable" + - field: SAI_PORT_STAT_IF_IN_FEC_NOT_CORRECTABLE_FRAMES + metric: sonic_switch_interface_fec_frames_total + type: counter + help: "Total FEC frames" + labels: + interface: "$port_name" + type: "uncorrectable" + - field: SAI_PORT_STAT_IF_IN_FEC_SYMBOL_ERRORS + metric: sonic_switch_interface_fec_frames_total + type: counter + help: "Total FEC frames" + labels: + interface: "$port_name" + type: "symbol_errors" + + # Queue depth + - field: SAI_PORT_STAT_IF_OUT_QLEN + metric: sonic_switch_interface_queue_length + type: gauge + help: "Current output queue length" + 
labels: + interface: "$port_name" + + # PFC (Priority Flow Control) counters + - field: SAI_PORT_STAT_PFC_0_RX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "rx" + priority: "0" + - field: SAI_PORT_STAT_PFC_1_RX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "rx" + priority: "1" + - field: SAI_PORT_STAT_PFC_2_RX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "rx" + priority: "2" + - field: SAI_PORT_STAT_PFC_3_RX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "rx" + priority: "3" + - field: SAI_PORT_STAT_PFC_4_RX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "rx" + priority: "4" + - field: SAI_PORT_STAT_PFC_5_RX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "rx" + priority: "5" + - field: SAI_PORT_STAT_PFC_6_RX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "rx" + priority: "6" + - field: SAI_PORT_STAT_PFC_7_RX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "rx" + priority: "7" + - field: SAI_PORT_STAT_PFC_0_TX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "tx" + priority: "0" + - field: SAI_PORT_STAT_PFC_1_TX_PKTS + metric: 
sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "tx" + priority: "1" + - field: SAI_PORT_STAT_PFC_2_TX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "tx" + priority: "2" + - field: SAI_PORT_STAT_PFC_3_TX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "tx" + priority: "3" + - field: SAI_PORT_STAT_PFC_4_TX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "tx" + priority: "4" + - field: SAI_PORT_STAT_PFC_5_TX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "tx" + priority: "5" + - field: SAI_PORT_STAT_PFC_6_TX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "tx" + priority: "6" + - field: SAI_PORT_STAT_PFC_7_TX_PKTS + metric: sonic_switch_interface_pfc_packets_total + type: counter + help: "Total PFC packets" + labels: + interface: "$port_name" + direction: "tx" + priority: "7" + + # Packet size distribution (RX) — Prometheus histogram + - metric: sonic_switch_interface_rx_packet_size_bytes + type: histogram + help: "RX packet size distribution" + labels: + interface: "$port_name" + transform: + histogram: + buckets: + 64: SAI_PORT_STAT_ETHER_IN_PKTS_64_OCTETS + 127: SAI_PORT_STAT_ETHER_IN_PKTS_65_TO_127_OCTETS + 255: SAI_PORT_STAT_ETHER_IN_PKTS_128_TO_255_OCTETS + 511: SAI_PORT_STAT_ETHER_IN_PKTS_256_TO_511_OCTETS + 1023: SAI_PORT_STAT_ETHER_IN_PKTS_512_TO_1023_OCTETS + 1518: SAI_PORT_STAT_ETHER_IN_PKTS_1024_TO_1518_OCTETS + 2047: SAI_PORT_STAT_ETHER_IN_PKTS_1519_TO_2047_OCTETS + 4095: 
SAI_PORT_STAT_ETHER_IN_PKTS_2048_TO_4095_OCTETS + 9216: SAI_PORT_STAT_ETHER_IN_PKTS_4096_TO_9216_OCTETS + 16383: SAI_PORT_STAT_ETHER_IN_PKTS_9217_TO_16383_OCTETS + + # Packet size distribution (TX) — Prometheus histogram + - metric: sonic_switch_interface_tx_packet_size_bytes + type: histogram + help: "TX packet size distribution" + labels: + interface: "$port_name" + transform: + histogram: + buckets: + 64: SAI_PORT_STAT_ETHER_OUT_PKTS_64_OCTETS + 127: SAI_PORT_STAT_ETHER_OUT_PKTS_65_TO_127_OCTETS + 255: SAI_PORT_STAT_ETHER_OUT_PKTS_128_TO_255_OCTETS + 511: SAI_PORT_STAT_ETHER_OUT_PKTS_256_TO_511_OCTETS + 1023: SAI_PORT_STAT_ETHER_OUT_PKTS_512_TO_1023_OCTETS + 1518: SAI_PORT_STAT_ETHER_OUT_PKTS_1024_TO_1518_OCTETS + 2047: SAI_PORT_STAT_ETHER_OUT_PKTS_1519_TO_2047_OCTETS + 4095: SAI_PORT_STAT_ETHER_OUT_PKTS_2048_TO_4095_OCTETS + 9216: SAI_PORT_STAT_ETHER_OUT_PKTS_4096_TO_9216_OCTETS + 16383: SAI_PORT_STAT_ETHER_OUT_PKTS_9217_TO_16383_OCTETS + + # Anomaly counters + - field: SAI_PORT_STAT_ETHER_STATS_UNDERSIZE_PKTS + metric: sonic_switch_interface_anomaly_packets_total + type: counter + help: "Total anomalous packets" + labels: + interface: "$port_name" + type: "undersize" + - field: SAI_PORT_STAT_ETHER_RX_OVERSIZE_PKTS + metric: sonic_switch_interface_anomaly_packets_total + type: counter + help: "Total anomalous packets" + labels: + interface: "$port_name" + type: "rx_oversize" + - field: SAI_PORT_STAT_ETHER_TX_OVERSIZE_PKTS + metric: sonic_switch_interface_anomaly_packets_total + type: counter + help: "Total anomalous packets" + labels: + interface: "$port_name" + type: "tx_oversize" + - field: SAI_PORT_STAT_ETHER_STATS_FRAGMENTS + metric: sonic_switch_interface_anomaly_packets_total + type: counter + help: "Total anomalous packets" + labels: + interface: "$port_name" + type: "fragments" + - field: SAI_PORT_STAT_ETHER_STATS_JABBERS + metric: sonic_switch_interface_anomaly_packets_total + type: counter + help: "Total anomalous packets" + labels: + interface: 
"$port_name" + type: "jabbers" + - field: SAI_PORT_STAT_IF_IN_UNKNOWN_PROTOS + metric: sonic_switch_interface_anomaly_packets_total + type: counter + help: "Total anomalous packets" + labels: + interface: "$port_name" + type: "unknown_protos" diff --git a/internal/agent/metrics/device.go b/internal/agent/metrics/device.go new file mode 100644 index 0000000..290382d --- /dev/null +++ b/internal/agent/metrics/device.go @@ -0,0 +1,111 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import ( + "context" + "log" + + "github.com/prometheus/client_golang/prometheus" +) + +// VersionInfoFunc returns static device metadata (e.g. from /etc/sonic/sonic_version.yml). +// It is called on every collect as a fallback when Redis fields are empty. +type VersionInfoFunc func() (map[string]string, error) + +// DeviceCollector collects device metadata and readiness metrics from CONFIG_DB, +// with optional fallback to static version info for missing fields. +type DeviceCollector struct { + connector RedisConnector + versionInfo VersionInfoFunc + + infoDesc *prometheus.Desc + readyDesc *prometheus.Desc +} + +// NewDeviceCollector creates a collector for sonic_switch_info and sonic_switch_ready. +// versionInfo is optional — when non-nil it is called to fill in missing Redis fields. 
+func NewDeviceCollector(connector RedisConnector, versionInfo VersionInfoFunc) *DeviceCollector { + return &DeviceCollector{ + connector: connector, + versionInfo: versionInfo, + infoDesc: prometheus.NewDesc( + "sonic_switch_info", + "Device metadata as labels, always 1", + []string{"asic", "firmware", "hwsku", "mac", "platform"}, + nil, + ), + readyDesc: prometheus.NewDesc( + "sonic_switch_ready", + "Whether the switch is ready (1) or not (0)", + nil, + nil, + ), + } +} + +func (c *DeviceCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.infoDesc + ch <- c.readyDesc +} + +func (c *DeviceCollector) Collect(ch chan<- prometheus.Metric) { + ctx, cancel := context.WithTimeout(context.Background(), collectTimeout) + defer cancel() + + configDB, err := c.connector.Connect("CONFIG_DB") + if err != nil { + log.Printf("DeviceCollector: failed to connect to CONFIG_DB: %v", err) + ch <- prometheus.MustNewConstMetric(c.readyDesc, prometheus.GaugeValue, 0) + return + } + + fields, err := configDB.HGetAll(ctx, "DEVICE_METADATA|localhost").Result() + if err != nil { + log.Printf("DeviceCollector: failed to read DEVICE_METADATA: %v", err) + ch <- prometheus.MustNewConstMetric(c.readyDesc, prometheus.GaugeValue, 0) + return + } + + mac := fields["mac"] + if mac == "" { + ch <- prometheus.MustNewConstMetric(c.readyDesc, prometheus.GaugeValue, 0) + return + } + + hwsku := fields["hwsku"] + platform := fields["platform"] + firmware := fields["sonic_os_version"] + asic := fields["asic_type"] + + // Fall back to static version info for missing fields + if c.versionInfo != nil && (hwsku == "" || firmware == "" || asic == "") { + info, err := c.versionInfo() + if err != nil { + log.Printf("DeviceCollector: failed to read version info: %v", err) + } else { + if hwsku == "" { + hwsku = info["hwsku"] + } + if firmware == "" { + firmware = info["sonic_os_version"] + } + if asic == "" { + asic = info["asic_type"] + } + } + } + + ch <- prometheus.MustNewConstMetric( + c.infoDesc, + 
prometheus.GaugeValue, + 1, + asic, + firmware, + hwsku, + mac, + platform, + ) + ch <- prometheus.MustNewConstMetric(c.readyDesc, prometheus.GaugeValue, 1) +} diff --git a/internal/agent/metrics/helpers.go b/internal/agent/metrics/helpers.go new file mode 100644 index 0000000..4b32cef --- /dev/null +++ b/internal/agent/metrics/helpers.go @@ -0,0 +1,93 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import "strings" + +// Severity level constants for DOM threshold and flag fields. +const ( + levelAlarm = "alarm" + levelWarning = "warning" +) + +// thresholdField represents a parsed DOM threshold field name. +type thresholdField struct { + Sensor string // e.g. "temperature", "voltage", "rx_power", "tx_bias", "tx_power" + Level string // levelAlarm or levelWarning + Direction string // "high" or "low" +} + +// thresholdPrefixes maps field name prefixes to sensor names and direction. +// SONiC TRANSCEIVER_DOM_THRESHOLD field names follow the pattern: +// +// {sensor_prefix}{direction}{level} +// +// e.g. "temphighalarm", "vcclowwarning", "rxpowerhighwarning", "txbiaslowalarm" +var thresholdPrefixes = []struct { + prefix string + sensor string + direction string +}{ + {"temphigh", "temperature", "high"}, + {"templow", "temperature", "low"}, + {"vcchigh", "voltage", "high"}, + {"vcclow", "voltage", "low"}, + {"rxpowerhigh", "rx_power", "high"}, + {"rxpowerlow", "rx_power", "low"}, + {"txbiashigh", "tx_bias", "high"}, + {"txbiaslow", "tx_bias", "low"}, + {"txpowerhigh", "tx_power", "high"}, + {"txpowerlow", "tx_power", "low"}, +} + +// parseThresholdField parses a SONiC DOM threshold field name into its components. +// Returns nil if the field name is not recognized. 
+func parseThresholdField(fieldName string) *thresholdField { + lower := strings.ToLower(fieldName) + for _, tp := range thresholdPrefixes { + if !strings.HasPrefix(lower, tp.prefix) { + continue + } + remainder := lower[len(tp.prefix):] + var level string + switch remainder { + case levelAlarm: + level = levelAlarm + case levelWarning: + level = levelWarning + default: + continue + } + return &thresholdField{ + Sensor: tp.sensor, + Level: level, + Direction: tp.direction, + } + } + return nil +} + +// domFlagSeverity computes the overall severity from DOM flag fields for a single interface. +// Flag field names follow the same pattern as thresholds: {sensor_prefix}{direction}{level}. +// Values are "true" or "false" strings from Redis. +// Returns 0 (ok), 1 (warning), or 2 (alarm). +func domFlagSeverity(fields map[string]string) float64 { + severity := 0.0 + for fieldName, val := range fields { + if val != "true" { + continue + } + parsed := parseThresholdField(fieldName) + if parsed == nil { + continue + } + switch parsed.Level { + case levelAlarm: + return 2 // alarm is highest, return immediately + case levelWarning: + severity = 1 + } + } + return severity +} diff --git a/internal/agent/metrics/interfaces.go b/internal/agent/metrics/interfaces.go new file mode 100644 index 0000000..afec357 --- /dev/null +++ b/internal/agent/metrics/interfaces.go @@ -0,0 +1,126 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import ( + "context" + "log" + + "github.com/prometheus/client_golang/prometheus" +) + +// InterfaceCollector collects interface state and count metrics. +type InterfaceCollector struct { + connector RedisConnector + + operStateDesc *prometheus.Desc + adminStateDesc *prometheus.Desc + interfaceTotalDesc *prometheus.Desc + portsTotalDesc *prometheus.Desc +} + +// NewInterfaceCollector creates a collector for interface state and count metrics. 
+func NewInterfaceCollector(connector RedisConnector) *InterfaceCollector { + return &InterfaceCollector{ + connector: connector, + operStateDesc: prometheus.NewDesc( + "sonic_switch_interface_oper_state", + "Operational state of the interface (1=up, 0=down)", + []string{"interface"}, + nil, + ), + adminStateDesc: prometheus.NewDesc( + "sonic_switch_interface_admin_state", + "Admin state of the interface (1=up, 0=down)", + []string{"interface"}, + nil, + ), + interfaceTotalDesc: prometheus.NewDesc( + "sonic_switch_interfaces_total", + "Number of interfaces by operational status", + []string{"operational_status"}, + nil, + ), + portsTotalDesc: prometheus.NewDesc( + "sonic_switch_ports_total", + "Total number of physical ports", + nil, + nil, + ), + } +} + +func (c *InterfaceCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.operStateDesc + ch <- c.adminStateDesc + ch <- c.interfaceTotalDesc + ch <- c.portsTotalDesc +} + +func (c *InterfaceCollector) Collect(ch chan<- prometheus.Metric) { + ctx, cancel := context.WithTimeout(context.Background(), collectTimeout) + defer cancel() + + configDB, err := c.connector.Connect("CONFIG_DB") + if err != nil { + log.Printf("InterfaceCollector: failed to connect to CONFIG_DB: %v", err) + return + } + + stateDB, err := c.connector.Connect("STATE_DB") + if err != nil { + log.Printf("InterfaceCollector: failed to connect to STATE_DB: %v", err) + return + } + + // Get all configured port keys from CONFIG_DB + portKeys, err := configDB.Keys(ctx, "PORT|*").Result() + if err != nil { + log.Printf("InterfaceCollector: failed to list PORT keys: %v", err) + return + } + + ch <- prometheus.MustNewConstMetric(c.portsTotalDesc, prometheus.GaugeValue, float64(len(portKeys))) + + // Build state key list and fetch all state hashes in one pipeline + ifaceNames := make([]string, 0, len(portKeys)) + stateKeys := make([]string, 0, len(portKeys)) + for _, key := range portKeys { + name := extractKeySuffix(key, "|") + ifaceNames = 
append(ifaceNames, name) + stateKeys = append(stateKeys, "PORT_TABLE|"+name) + } + + stateData := batchHGetAll(ctx, stateDB, stateKeys) + + upCount := 0 + downCount := 0 + + for i, name := range ifaceNames { + stateKey := stateKeys[i] + fields := stateData[stateKey] + + operUp := fields["netdev_oper_status"] == "up" + adminUp := fields["admin_status"] == "up" + + operVal := 0.0 + if operUp { + operVal = 1.0 + upCount++ + } else { + downCount++ + } + + adminVal := 0.0 + if adminUp { + adminVal = 1.0 + } + + ch <- prometheus.MustNewConstMetric(c.operStateDesc, prometheus.GaugeValue, operVal, name) + ch <- prometheus.MustNewConstMetric(c.adminStateDesc, prometheus.GaugeValue, adminVal, name) + } + + ch <- prometheus.MustNewConstMetric(c.interfaceTotalDesc, prometheus.GaugeValue, float64(upCount), "up") + ch <- prometheus.MustNewConstMetric(c.interfaceTotalDesc, prometheus.GaugeValue, float64(downCount), "down") +} diff --git a/internal/agent/metrics/metrics_test.go b/internal/agent/metrics/metrics_test.go new file mode 100644 index 0000000..f75f049 --- /dev/null +++ b/internal/agent/metrics/metrics_test.go @@ -0,0 +1,1127 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import ( + "fmt" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/go-redis/redismock/v9" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/redis/go-redis/v9" +) + +// mockConnector implements RedisConnector for testing using redismock. 
+type mockConnector struct { + clients map[string]*redis.Client + mocks map[string]redismock.ClientMock +} + +func newMockConnector(dbNames ...string) *mockConnector { + mc := &mockConnector{ + clients: make(map[string]*redis.Client), + mocks: make(map[string]redismock.ClientMock), + } + for _, name := range dbNames { + client, mock := redismock.NewClientMock() + mc.clients[name] = client + mc.mocks[name] = mock + } + return mc +} + +func (mc *mockConnector) Connect(dbName string) (*redis.Client, error) { + c, ok := mc.clients[dbName] + if !ok { + return nil, fmt.Errorf("no mock for database %s", dbName) + } + return c, nil +} + +func (mc *mockConnector) expectationsMet(t *testing.T) { + t.Helper() + for name, mock := range mc.mocks { + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet expectations for %s: %v", name, err) + } + } +} + +// --- Device Collector Tests --- + +func TestDeviceCollector(t *testing.T) { + mc := newMockConnector("CONFIG_DB") + mc.mocks["CONFIG_DB"].ExpectHGetAll("DEVICE_METADATA|localhost").SetVal(map[string]string{ + "mac": "aa:bb:cc:dd:ee:ff", + "hwsku": "Accton-AS7726-32X", + "platform": "x86_64-accton_as7726_32x-r0", + "sonic_os_version": "4.2.0", + "asic_type": "broadcom", + }) + + collector := NewDeviceCollector(mc, nil) + expected := ` + # HELP sonic_switch_info Device metadata as labels, always 1 + # TYPE sonic_switch_info gauge + sonic_switch_info{asic="broadcom",firmware="4.2.0",hwsku="Accton-AS7726-32X",mac="aa:bb:cc:dd:ee:ff",platform="x86_64-accton_as7726_32x-r0"} 1 + # HELP sonic_switch_ready Whether the switch is ready (1) or not (0) + # TYPE sonic_switch_ready gauge + sonic_switch_ready 1 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected)); err != nil { + t.Errorf("DeviceCollector mismatch: %v", err) + } + mc.expectationsMet(t) +} + +func TestDeviceCollectorFallback(t *testing.T) { + mc := newMockConnector("CONFIG_DB") + 
mc.mocks["CONFIG_DB"].ExpectHGetAll("DEVICE_METADATA|localhost").SetVal(map[string]string{ + "mac": "aa:bb:cc:dd:ee:ff", + "hwsku": "Accton-AS7726-32X", + "platform": "x86_64-accton_as7726_32x-r0", + // No sonic_os_version or asic_type in Redis + }) + + versionInfo := func() (map[string]string, error) { + return map[string]string{ + "sonic_os_version": "4.4.0", + "asic_type": "broadcom", + }, nil + } + + collector := NewDeviceCollector(mc, versionInfo) + expected := ` + # HELP sonic_switch_info Device metadata as labels, always 1 + # TYPE sonic_switch_info gauge + sonic_switch_info{asic="broadcom",firmware="4.4.0",hwsku="Accton-AS7726-32X",mac="aa:bb:cc:dd:ee:ff",platform="x86_64-accton_as7726_32x-r0"} 1 + # HELP sonic_switch_ready Whether the switch is ready (1) or not (0) + # TYPE sonic_switch_ready gauge + sonic_switch_ready 1 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected)); err != nil { + t.Errorf("DeviceCollector fallback mismatch: %v", err) + } + mc.expectationsMet(t) +} + +func TestDeviceCollectorNotReady(t *testing.T) { + mc := newMockConnector("CONFIG_DB") + mc.mocks["CONFIG_DB"].ExpectHGetAll("DEVICE_METADATA|localhost").SetVal(map[string]string{ + // No "mac" field → not ready + "hwsku": "Accton-AS7726-32X", + }) + + collector := NewDeviceCollector(mc, nil) + expected := ` + # HELP sonic_switch_ready Whether the switch is ready (1) or not (0) + # TYPE sonic_switch_ready gauge + sonic_switch_ready 0 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected), "sonic_switch_ready"); err != nil { + t.Errorf("DeviceCollector not-ready mismatch: %v", err) + } + mc.expectationsMet(t) +} + +// --- Interface Collector Tests --- + +func TestInterfaceCollector(t *testing.T) { + mc := newMockConnector("CONFIG_DB", "STATE_DB") + + mc.mocks["CONFIG_DB"].ExpectKeys("PORT|*").SetVal([]string{ + "PORT|Ethernet0", "PORT|Ethernet4", + }) + 
mc.mocks["STATE_DB"].ExpectHGetAll("PORT_TABLE|Ethernet0").SetVal(map[string]string{ + "netdev_oper_status": "up", + "admin_status": "up", + }) + mc.mocks["STATE_DB"].ExpectHGetAll("PORT_TABLE|Ethernet4").SetVal(map[string]string{ + "netdev_oper_status": "down", + "admin_status": "up", + }) + + collector := NewInterfaceCollector(mc) + + expected := ` + # HELP sonic_switch_ports_total Total number of physical ports + # TYPE sonic_switch_ports_total gauge + sonic_switch_ports_total 2 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected), "sonic_switch_ports_total"); err != nil { + t.Errorf("InterfaceCollector ports_total mismatch: %v", err) + } + mc.expectationsMet(t) +} + +func TestInterfaceCollectorInterfaceTotals(t *testing.T) { + mc := newMockConnector("CONFIG_DB", "STATE_DB") + + mc.mocks["CONFIG_DB"].ExpectKeys("PORT|*").SetVal([]string{ + "PORT|Ethernet0", "PORT|Ethernet4", "PORT|Ethernet8", + }) + mc.mocks["STATE_DB"].ExpectHGetAll("PORT_TABLE|Ethernet0").SetVal(map[string]string{ + "netdev_oper_status": "up", + "admin_status": "up", + }) + mc.mocks["STATE_DB"].ExpectHGetAll("PORT_TABLE|Ethernet4").SetVal(map[string]string{ + "netdev_oper_status": "up", + "admin_status": "up", + }) + mc.mocks["STATE_DB"].ExpectHGetAll("PORT_TABLE|Ethernet8").SetVal(map[string]string{ + "netdev_oper_status": "down", + "admin_status": "down", + }) + + collector := NewInterfaceCollector(mc) + expected := ` + # HELP sonic_switch_interfaces_total Number of interfaces by operational status + # TYPE sonic_switch_interfaces_total gauge + sonic_switch_interfaces_total{operational_status="up"} 2 + sonic_switch_interfaces_total{operational_status="down"} 1 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected), "sonic_switch_interfaces_total"); err != nil { + t.Errorf("InterfaceCollector interfaces_total mismatch: %v", err) + } + mc.expectationsMet(t) +} + +// --- DOM Sensor Collector Tests (config-driven) --- + +func 
TestConfigCollectorDOMSensors(t *testing.T) { + mc := newMockConnector("STATE_DB") + + mc.mocks["STATE_DB"].ExpectKeys("TRANSCEIVER_DOM_SENSOR|*").SetVal([]string{ + "TRANSCEIVER_DOM_SENSOR|Ethernet0", + }) + mc.mocks["STATE_DB"].ExpectHGetAll("TRANSCEIVER_DOM_SENSOR|Ethernet0").SetVal(map[string]string{ + "temperature": "32.5", + "voltage": "3.31", + "rx1power": "-8.42", + "rx2power": "-7.50", + "tx1bias": "6.75", + "tx2bias": "6.80", + }) + + // Temperature — simple field + tempMapping := MetricMapping{ + RedisDB: "STATE_DB", + KeyPattern: "TRANSCEIVER_DOM_SENSOR|*", + KeySeparator: "|", + Fields: []FieldMapping{ + { + Field: "temperature", + Metric: "sonic_switch_transceiver_dom_temperature_celsius", + Type: "gauge", + Help: "Transceiver temperature in Celsius", + Labels: map[string]string{"interface": "$key_suffix"}, + }, + { + Field: "voltage", + Metric: "sonic_switch_transceiver_dom_voltage_volts", + Type: "gauge", + Help: "Transceiver supply voltage in Volts", + Labels: map[string]string{"interface": "$key_suffix"}, + }, + { + FieldPattern: "*", + Metric: "sonic_switch_transceiver_dom_rx_power_dbm", + Type: "gauge", + Help: "Transceiver RX power in dBm", + Labels: map[string]string{"interface": "$key_suffix"}, + Transform: &Transform{ + RegexCapture: &RegexCapture{ + Pattern: `^rx(?P\d+)power$`, + }, + }, + }, + { + FieldPattern: "*", + Metric: "sonic_switch_transceiver_dom_tx_bias_milliamps", + Type: "gauge", + Help: "Transceiver TX bias current in milliamps", + Labels: map[string]string{"interface": "$key_suffix"}, + Transform: &Transform{ + RegexCapture: &RegexCapture{ + Pattern: `^tx(?P\d+)bias$`, + }, + }, + }, + }, + } + + collector := NewConfigCollector(mc, tempMapping) + expected := ` + # HELP sonic_switch_transceiver_dom_temperature_celsius Transceiver temperature in Celsius + # TYPE sonic_switch_transceiver_dom_temperature_celsius gauge + sonic_switch_transceiver_dom_temperature_celsius{interface="Ethernet0"} 32.5 + # HELP 
sonic_switch_transceiver_dom_voltage_volts Transceiver supply voltage in Volts + # TYPE sonic_switch_transceiver_dom_voltage_volts gauge + sonic_switch_transceiver_dom_voltage_volts{interface="Ethernet0"} 3.31 + # HELP sonic_switch_transceiver_dom_rx_power_dbm Transceiver RX power in dBm + # TYPE sonic_switch_transceiver_dom_rx_power_dbm gauge + sonic_switch_transceiver_dom_rx_power_dbm{interface="Ethernet0",lane="1"} -8.42 + sonic_switch_transceiver_dom_rx_power_dbm{interface="Ethernet0",lane="2"} -7.5 + # HELP sonic_switch_transceiver_dom_tx_bias_milliamps Transceiver TX bias current in milliamps + # TYPE sonic_switch_transceiver_dom_tx_bias_milliamps gauge + sonic_switch_transceiver_dom_tx_bias_milliamps{interface="Ethernet0",lane="1"} 6.75 + sonic_switch_transceiver_dom_tx_bias_milliamps{interface="Ethernet0",lane="2"} 6.8 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected)); err != nil { + t.Errorf("ConfigCollector DOM sensors mismatch: %v", err) + } + mc.expectationsMet(t) +} + +// --- Threshold Field Parsing Tests --- + +func TestParseThresholdField(t *testing.T) { + tests := []struct { + input string + expected *thresholdField + }{ + {"temphighalarm", &thresholdField{"temperature", "alarm", "high"}}, + {"temphighwarning", &thresholdField{"temperature", "warning", "high"}}, + {"templowalarm", &thresholdField{"temperature", "alarm", "low"}}, + {"templowwarning", &thresholdField{"temperature", "warning", "low"}}, + {"vcchighalarm", &thresholdField{"voltage", "alarm", "high"}}, + {"vcclowwarning", &thresholdField{"voltage", "warning", "low"}}, + {"rxpowerhighalarm", &thresholdField{"rx_power", "alarm", "high"}}, + {"rxpowerlowwarning", &thresholdField{"rx_power", "warning", "low"}}, + {"txbiashighalarm", &thresholdField{"tx_bias", "alarm", "high"}}, + {"txbiaslowalarm", &thresholdField{"tx_bias", "alarm", "low"}}, + {"txpowerhighalarm", &thresholdField{"tx_power", "alarm", "high"}}, + {"txpowerlowwarning", 
&thresholdField{"tx_power", "warning", "low"}}, + {"unknownfield", nil}, + {"", nil}, + } + + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + result := parseThresholdField(tt.input) + if tt.expected == nil { + if result != nil { + t.Errorf("expected nil, got %+v", result) + } + return + } + if result == nil { + t.Fatalf("expected %+v, got nil", tt.expected) + } + if *result != *tt.expected { + t.Errorf("expected %+v, got %+v", tt.expected, result) + } + }) + } +} + +// --- DOM Flag Severity Tests --- + +func TestDomFlagSeverity(t *testing.T) { + tests := []struct { + name string + fields map[string]string + expected float64 + }{ + {"all ok", map[string]string{ + "temphighalarm": "false", "temphighwarning": "false", + }, 0}, + {"warning", map[string]string{ + "temphighalarm": "false", "temphighwarning": "true", + }, 1}, + {"alarm", map[string]string{ + "temphighalarm": "true", "temphighwarning": "false", + }, 2}, + {"alarm overrides warning", map[string]string{ + "temphighalarm": "true", "vcchighwarning": "true", + }, 2}, + {"empty fields", map[string]string{}, 0}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := domFlagSeverity(tt.fields) + if result != tt.expected { + t.Errorf("expected %v, got %v", tt.expected, result) + } + }) + } +} + +// --- DOM Flag Collector Tests (config-driven) --- + +func TestConfigCollectorDOMFlagSeverity(t *testing.T) { + mc := newMockConnector("STATE_DB") + + mc.mocks["STATE_DB"].ExpectKeys("TRANSCEIVER_DOM_FLAG|*").SetVal([]string{ + "TRANSCEIVER_DOM_FLAG|Ethernet0", + "TRANSCEIVER_DOM_FLAG|Ethernet4", + }) + mc.mocks["STATE_DB"].ExpectHGetAll("TRANSCEIVER_DOM_FLAG|Ethernet0").SetVal(map[string]string{ + "temphighalarm": "false", + "temphighwarning": "false", + }) + mc.mocks["STATE_DB"].ExpectHGetAll("TRANSCEIVER_DOM_FLAG|Ethernet4").SetVal(map[string]string{ + "temphighalarm": "false", + "temphighwarning": "true", + }) + + mapping := MetricMapping{ + RedisDB: "STATE_DB", + 
KeyPattern: "TRANSCEIVER_DOM_FLAG|*", + KeySeparator: "|", + Fields: []FieldMapping{ + { + Metric: "sonic_switch_transceiver_dom_status", + Type: "gauge", + Help: "Transceiver DOM status severity (0=ok, 1=warning, 2=alarm)", + Labels: map[string]string{"interface": "$key_suffix"}, + Transform: &Transform{ + DOMFlagSeverity: true, + }, + }, + }, + } + + collector := NewConfigCollector(mc, mapping) + expected := ` + # HELP sonic_switch_transceiver_dom_status Transceiver DOM status severity (0=ok, 1=warning, 2=alarm) + # TYPE sonic_switch_transceiver_dom_status gauge + sonic_switch_transceiver_dom_status{interface="Ethernet0"} 0 + sonic_switch_transceiver_dom_status{interface="Ethernet4"} 1 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected)); err != nil { + t.Errorf("ConfigCollector DOM flag severity mismatch: %v", err) + } + mc.expectationsMet(t) +} + +// --- Config-driven Collector Tests --- + +func TestConfigCollectorTransceiverInfo(t *testing.T) { + mc := newMockConnector("STATE_DB") + + mc.mocks["STATE_DB"].ExpectKeys("TRANSCEIVER_INFO|*").SetVal([]string{ + "TRANSCEIVER_INFO|Ethernet0", + }) + mc.mocks["STATE_DB"].ExpectHGetAll("TRANSCEIVER_INFO|Ethernet0").SetVal(map[string]string{ + "type": "QSFP28", + "manufacturer": "Finisar", + "model": "FTLX8574D3BCL", + "serial": "ABC1234", + }) + + one := 1.0 + mapping := MetricMapping{ + RedisDB: "STATE_DB", + KeyPattern: "TRANSCEIVER_INFO|*", + KeySeparator: "|", + Fields: []FieldMapping{ + { + Metric: "sonic_switch_transceiver_info", + Type: "gauge", + Help: "Transceiver static metadata as labels, always 1", + Value: &one, + Labels: map[string]string{ + "interface": "$key_suffix", + "type": "$type", + "vendor": "$manufacturer", + "model": "$model", + "serial": "$serial", + }, + }, + }, + } + + collector := NewConfigCollector(mc, mapping) + expected := ` + # HELP sonic_switch_transceiver_info Transceiver static metadata as labels, always 1 + # TYPE sonic_switch_transceiver_info gauge + 
sonic_switch_transceiver_info{interface="Ethernet0",model="FTLX8574D3BCL",serial="ABC1234",type="QSFP28",vendor="Finisar"} 1
+	`
+	if err := testutil.CollectAndCompare(collector, strings.NewReader(expected)); err != nil {
+		t.Errorf("ConfigCollector transceiver info mismatch: %v", err)
+	}
+	mc.expectationsMet(t)
+}
+
+func TestConfigCollectorTransceiverStatus(t *testing.T) {
+	mc := newMockConnector("STATE_DB")
+
+	mc.mocks["STATE_DB"].ExpectKeys("TRANSCEIVER_STATUS|*").SetVal([]string{
+		"TRANSCEIVER_STATUS|Ethernet0",
+	})
+	mc.mocks["STATE_DB"].ExpectHGetAll("TRANSCEIVER_STATUS|Ethernet0").SetVal(map[string]string{
+		"status":              "1",
+		"error":               "N/A",
+		"rxlos1":              "False",
+		"rxlos2":              "True",
+		"txfault1":            "False",
+		"txfault2":            "False",
+		"tx1disable":          "False",
+		"tx2disable":          "False",
+		"tx_disabled_channel": "0",
+	})
+
+	mapping := MetricMapping{
+		RedisDB:      "STATE_DB",
+		KeyPattern:   "TRANSCEIVER_STATUS|*",
+		KeySeparator: "|",
+		Fields: []FieldMapping{
+			{
+				FieldPattern: "*",
+				Metric:       "sonic_switch_transceiver_rxlos",
+				Type:         "gauge",
+				Help:         "Transceiver RX loss of signal (1=loss, 0=ok)",
+				Labels:       map[string]string{"interface": "$key_suffix"},
+				Transform: &Transform{
+					RegexCapture: &RegexCapture{
+						Pattern: `^rxlos(?P<lane>\d+)$`,
+					},
+					Map: map[string]float64{"True": 1, "False": 0},
+				},
+			},
+			{
+				FieldPattern: "*",
+				Metric:       "sonic_switch_transceiver_txfault",
+				Type:         "gauge",
+				Help:         "Transceiver TX fault (1=fault, 0=ok)",
+				Labels:       map[string]string{"interface": "$key_suffix"},
+				Transform: &Transform{
+					RegexCapture: &RegexCapture{
+						Pattern: `^txfault(?P<lane>\d+)$`,
+					},
+					Map: map[string]float64{"True": 1, "False": 0},
+				},
+			},
+		},
+	}
+
+	collector := NewConfigCollector(mc, mapping)
+	expected := `
+	# HELP sonic_switch_transceiver_rxlos Transceiver RX loss of signal (1=loss, 0=ok)
+	# TYPE sonic_switch_transceiver_rxlos gauge
+	sonic_switch_transceiver_rxlos{interface="Ethernet0",lane="1"} 0
+	
sonic_switch_transceiver_rxlos{interface="Ethernet0",lane="2"} 1 + # HELP sonic_switch_transceiver_txfault Transceiver TX fault (1=fault, 0=ok) + # TYPE sonic_switch_transceiver_txfault gauge + sonic_switch_transceiver_txfault{interface="Ethernet0",lane="1"} 0 + sonic_switch_transceiver_txfault{interface="Ethernet0",lane="2"} 0 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected)); err != nil { + t.Errorf("ConfigCollector transceiver status mismatch: %v", err) + } + mc.expectationsMet(t) +} + +func TestConfigCollectorTemperature(t *testing.T) { + mc := newMockConnector("STATE_DB") + + mc.mocks["STATE_DB"].ExpectKeys("TEMPERATURE_INFO|*").SetVal([]string{ + "TEMPERATURE_INFO|MB_RearMAC_temp(0x48)", + "TEMPERATURE_INFO|CPU_temp(0x4b)", + }) + mc.mocks["STATE_DB"].ExpectHGetAll("TEMPERATURE_INFO|MB_RearMAC_temp(0x48)").SetVal(map[string]string{ + "temperature": "34.5", + "high_threshold": "80.0", + "low_threshold": "N/A", + "warning_status": "False", + }) + mc.mocks["STATE_DB"].ExpectHGetAll("TEMPERATURE_INFO|CPU_temp(0x4b)").SetVal(map[string]string{ + "temperature": "42.0", + "high_threshold": "95.0", + "low_threshold": "N/A", + "warning_status": "False", + }) + + mapping := MetricMapping{ + RedisDB: "STATE_DB", + KeyPattern: "TEMPERATURE_INFO|*", + KeySeparator: "|", + Fields: []FieldMapping{ + { + Field: "temperature", + Metric: "sonic_switch_temperature_celsius", + Type: "gauge", + Help: "Chassis temperature sensor reading in Celsius", + Labels: map[string]string{"sensor": "$key_suffix"}, + }, + { + Field: "high_threshold", + Metric: "sonic_switch_temperature_high_threshold_celsius", + Type: "gauge", + Help: "Chassis temperature sensor high threshold in Celsius", + Labels: map[string]string{"sensor": "$key_suffix"}, + }, + { + Field: "warning_status", + Metric: "sonic_switch_temperature_warning", + Type: "gauge", + Help: "Chassis temperature sensor warning status (1=warning, 0=ok)", + Labels: map[string]string{"sensor": 
"$key_suffix"}, + Transform: &Transform{ + Map: map[string]float64{"True": 1, "False": 0}, + }, + }, + }, + } + + collector := NewConfigCollector(mc, mapping) + expected := ` + # HELP sonic_switch_temperature_celsius Chassis temperature sensor reading in Celsius + # TYPE sonic_switch_temperature_celsius gauge + sonic_switch_temperature_celsius{sensor="CPU_temp(0x4b)"} 42 + sonic_switch_temperature_celsius{sensor="MB_RearMAC_temp(0x48)"} 34.5 + # HELP sonic_switch_temperature_high_threshold_celsius Chassis temperature sensor high threshold in Celsius + # TYPE sonic_switch_temperature_high_threshold_celsius gauge + sonic_switch_temperature_high_threshold_celsius{sensor="CPU_temp(0x4b)"} 95 + sonic_switch_temperature_high_threshold_celsius{sensor="MB_RearMAC_temp(0x48)"} 80 + # HELP sonic_switch_temperature_warning Chassis temperature sensor warning status (1=warning, 0=ok) + # TYPE sonic_switch_temperature_warning gauge + sonic_switch_temperature_warning{sensor="CPU_temp(0x4b)"} 0 + sonic_switch_temperature_warning{sensor="MB_RearMAC_temp(0x48)"} 0 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected)); err != nil { + t.Errorf("ConfigCollector temperature mismatch: %v", err) + } + mc.expectationsMet(t) +} + +func TestConfigCollectorLLDPNeighborInfo(t *testing.T) { + mc := newMockConnector("APPL_DB") + + mc.mocks["APPL_DB"].ExpectKeys("LLDP_ENTRY_TABLE:*").SetVal([]string{ + "LLDP_ENTRY_TABLE:Ethernet120", + }) + mc.mocks["APPL_DB"].ExpectHGetAll("LLDP_ENTRY_TABLE:Ethernet120").SetVal(map[string]string{ + "lldp_rem_chassis_id": "94:ef:97:94:65:42", + "lldp_rem_sys_name": "swi2-wdf4g-1", + "lldp_rem_port_desc": "Ethernet8", + "lldp_rem_port_id": "Eth3(Port3)", + "lldp_rem_man_addr": "240.127.1.1", + "lldp_rem_chassis_id_subtype": "4", + "lldp_rem_port_id_subtype": "7", + "lldp_rem_index": "1", + "lldp_rem_time_mark": "44873750", + "lldp_rem_sys_desc": "SONiC Software Version: SONiC.Edgecore", + "lldp_rem_sys_cap_supported": "28 00", + 
"lldp_rem_sys_cap_enabled": "28 00", + }) + + one := 1.0 + mapping := MetricMapping{ + RedisDB: "APPL_DB", + KeyPattern: "LLDP_ENTRY_TABLE:*", + KeySeparator: ":", + Fields: []FieldMapping{ + { + Metric: "sonic_switch_interface_neighbor_info", + Type: "gauge", + Help: "LLDP neighbor metadata as labels, always 1", + Value: &one, + Labels: map[string]string{ + "interface": "$key_suffix", + "neighbor_mac": "$lldp_rem_chassis_id", + "neighbor_name": "$lldp_rem_sys_name", + "neighbor_port": "$lldp_rem_port_desc", + }, + }, + }, + } + + collector := NewConfigCollector(mc, mapping) + expected := ` + # HELP sonic_switch_interface_neighbor_info LLDP neighbor metadata as labels, always 1 + # TYPE sonic_switch_interface_neighbor_info gauge + sonic_switch_interface_neighbor_info{interface="Ethernet120",neighbor_mac="94:ef:97:94:65:42",neighbor_name="swi2-wdf4g-1",neighbor_port="Ethernet8"} 1 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected)); err != nil { + t.Errorf("ConfigCollector LLDP neighbor info mismatch: %v", err) + } + mc.expectationsMet(t) +} + +func TestConfigCollectorDOMThresholds(t *testing.T) { + mc := newMockConnector("STATE_DB") + + mc.mocks["STATE_DB"].ExpectKeys("TRANSCEIVER_DOM_THRESHOLD|*").SetVal([]string{ + "TRANSCEIVER_DOM_THRESHOLD|Ethernet0", + }) + mc.mocks["STATE_DB"].ExpectHGetAll("TRANSCEIVER_DOM_THRESHOLD|Ethernet0").SetVal(map[string]string{ + "temphighalarm": "70.0", + "templowalarm": "-5.0", + "rxpowerlowwarning": "-14.0", + }) + + mapping := MetricMapping{ + RedisDB: "STATE_DB", + KeyPattern: "TRANSCEIVER_DOM_THRESHOLD|*", + KeySeparator: "|", + Fields: []FieldMapping{ + { + FieldPattern: "*", + Metric: "sonic_switch_transceiver_dom_threshold", + Type: "gauge", + Help: "Transceiver DOM threshold value", + Labels: map[string]string{ + "interface": "$key_suffix", + }, + Transform: &Transform{ + ParseThresholdField: true, + }, + }, + }, + } + + collector := NewConfigCollector(mc, mapping) + expected := ` + # HELP 
sonic_switch_transceiver_dom_threshold Transceiver DOM threshold value + # TYPE sonic_switch_transceiver_dom_threshold gauge + sonic_switch_transceiver_dom_threshold{direction="high",interface="Ethernet0",level="alarm",sensor="temperature"} 70 + sonic_switch_transceiver_dom_threshold{direction="low",interface="Ethernet0",level="alarm",sensor="temperature"} -5 + sonic_switch_transceiver_dom_threshold{direction="low",interface="Ethernet0",level="warning",sensor="rx_power"} -14 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected)); err != nil { + t.Errorf("ConfigCollector DOM thresholds mismatch: %v", err) + } + mc.expectationsMet(t) +} + +func TestConfigCollectorCounters(t *testing.T) { + mc := newMockConnector("COUNTERS_DB") + + mc.mocks["COUNTERS_DB"].ExpectHGetAll("COUNTERS_PORT_NAME_MAP").SetVal(map[string]string{ + "Ethernet0": "oid:0x100000000003", + }) + mc.mocks["COUNTERS_DB"].ExpectKeys("COUNTERS:*").SetVal([]string{ + "COUNTERS:oid:0x100000000003", + }) + mc.mocks["COUNTERS_DB"].ExpectHGetAll("COUNTERS:oid:0x100000000003").SetVal(map[string]string{ + "SAI_PORT_STAT_IF_IN_OCTETS": "123456789", + "SAI_PORT_STAT_IF_OUT_OCTETS": "987654321", + "SAI_PORT_STAT_IF_IN_UCAST_PKTS": "100000", + "SAI_PORT_STAT_IF_OUT_UCAST_PKTS": "200000", + "SAI_PORT_STAT_IF_IN_MULTICAST_PKTS": "500", + "SAI_PORT_STAT_IF_OUT_MULTICAST_PKTS": "300", + "SAI_PORT_STAT_IF_IN_BROADCAST_PKTS": "50", + "SAI_PORT_STAT_IF_OUT_BROADCAST_PKTS": "20", + "SAI_PORT_STAT_IF_IN_NON_UCAST_PKTS": "550", + "SAI_PORT_STAT_IF_OUT_NON_UCAST_PKTS": "320", + "SAI_PORT_STAT_IF_IN_ERRORS": "42", + "SAI_PORT_STAT_IF_OUT_ERRORS": "0", + "SAI_PORT_STAT_IF_IN_DISCARDS": "7", + "SAI_PORT_STAT_IF_OUT_DISCARDS": "3", + "SAI_PORT_STAT_IN_DROPPED_PKTS": "2", + "SAI_PORT_STAT_OUT_DROPPED_PKTS": "1", + "SAI_PORT_STAT_IF_IN_FEC_CORRECTABLE_FRAMES": "1580", + "SAI_PORT_STAT_IF_IN_FEC_NOT_CORRECTABLE_FRAMES": "0", + "SAI_PORT_STAT_IF_IN_FEC_SYMBOL_ERRORS": "23", + "SAI_PORT_STAT_IF_OUT_QLEN": 
"5", + "SAI_PORT_STAT_PFC_0_RX_PKTS": "100", + "SAI_PORT_STAT_PFC_0_TX_PKTS": "50", + "SAI_PORT_STAT_ETHER_STATS_UNDERSIZE_PKTS": "0", + "SAI_PORT_STAT_ETHER_STATS_FRAGMENTS": "0", + "SAI_PORT_STAT_ETHER_STATS_JABBERS": "0", + "SAI_PORT_STAT_IF_IN_UNKNOWN_PROTOS": "0", + "SAI_PORT_STAT_ETHER_RX_OVERSIZE_PKTS": "0", + "SAI_PORT_STAT_ETHER_TX_OVERSIZE_PKTS": "0", + }) + + mapping := MetricMapping{ + RedisDB: "COUNTERS_DB", + KeyPattern: "COUNTERS:*", + KeySeparator: ":", + KeyResolver: "COUNTERS_PORT_NAME_MAP", + Fields: []FieldMapping{ + {Field: "SAI_PORT_STAT_IF_IN_OCTETS", Metric: "sonic_switch_interface_bytes_total", Type: "counter", Help: "Total bytes transferred", Labels: map[string]string{"interface": "$port_name", "direction": "rx"}}, + {Field: "SAI_PORT_STAT_IF_OUT_OCTETS", Metric: "sonic_switch_interface_bytes_total", Type: "counter", Help: "Total bytes transferred", Labels: map[string]string{"interface": "$port_name", "direction": "tx"}}, + {Field: "SAI_PORT_STAT_IF_IN_UCAST_PKTS", Metric: "sonic_switch_interface_packets_total", Type: "counter", Help: "Total packets transferred", Labels: map[string]string{"interface": "$port_name", "direction": "rx", "type": "unicast"}}, + {Field: "SAI_PORT_STAT_IF_OUT_UCAST_PKTS", Metric: "sonic_switch_interface_packets_total", Type: "counter", Help: "Total packets transferred", Labels: map[string]string{"interface": "$port_name", "direction": "tx", "type": "unicast"}}, + {Field: "SAI_PORT_STAT_IF_IN_MULTICAST_PKTS", Metric: "sonic_switch_interface_packets_total", Type: "counter", Help: "Total packets transferred", Labels: map[string]string{"interface": "$port_name", "direction": "rx", "type": "multicast"}}, + {Field: "SAI_PORT_STAT_IF_OUT_MULTICAST_PKTS", Metric: "sonic_switch_interface_packets_total", Type: "counter", Help: "Total packets transferred", Labels: map[string]string{"interface": "$port_name", "direction": "tx", "type": "multicast"}}, + {Field: "SAI_PORT_STAT_IF_IN_BROADCAST_PKTS", Metric: 
"sonic_switch_interface_packets_total", Type: "counter", Help: "Total packets transferred", Labels: map[string]string{"interface": "$port_name", "direction": "rx", "type": "broadcast"}}, + {Field: "SAI_PORT_STAT_IF_OUT_BROADCAST_PKTS", Metric: "sonic_switch_interface_packets_total", Type: "counter", Help: "Total packets transferred", Labels: map[string]string{"interface": "$port_name", "direction": "tx", "type": "broadcast"}}, + {Field: "SAI_PORT_STAT_IF_IN_NON_UCAST_PKTS", Metric: "sonic_switch_interface_packets_total", Type: "counter", Help: "Total packets transferred", Labels: map[string]string{"interface": "$port_name", "direction": "rx", "type": "non_unicast"}}, + {Field: "SAI_PORT_STAT_IF_OUT_NON_UCAST_PKTS", Metric: "sonic_switch_interface_packets_total", Type: "counter", Help: "Total packets transferred", Labels: map[string]string{"interface": "$port_name", "direction": "tx", "type": "non_unicast"}}, + {Field: "SAI_PORT_STAT_IF_IN_ERRORS", Metric: "sonic_switch_interface_errors_total", Type: "counter", Help: "Total interface errors", Labels: map[string]string{"interface": "$port_name", "direction": "rx"}}, + {Field: "SAI_PORT_STAT_IF_OUT_ERRORS", Metric: "sonic_switch_interface_errors_total", Type: "counter", Help: "Total interface errors", Labels: map[string]string{"interface": "$port_name", "direction": "tx"}}, + {Field: "SAI_PORT_STAT_IF_IN_DISCARDS", Metric: "sonic_switch_interface_discards_total", Type: "counter", Help: "Total interface discards", Labels: map[string]string{"interface": "$port_name", "direction": "rx"}}, + {Field: "SAI_PORT_STAT_IF_OUT_DISCARDS", Metric: "sonic_switch_interface_discards_total", Type: "counter", Help: "Total interface discards", Labels: map[string]string{"interface": "$port_name", "direction": "tx"}}, + {Field: "SAI_PORT_STAT_IN_DROPPED_PKTS", Metric: "sonic_switch_interface_dropped_packets_total", Type: "counter", Help: "Total SAI-level dropped packets", Labels: map[string]string{"interface": "$port_name", "direction": 
"rx"}}, + {Field: "SAI_PORT_STAT_OUT_DROPPED_PKTS", Metric: "sonic_switch_interface_dropped_packets_total", Type: "counter", Help: "Total SAI-level dropped packets", Labels: map[string]string{"interface": "$port_name", "direction": "tx"}}, + {Field: "SAI_PORT_STAT_IF_IN_FEC_CORRECTABLE_FRAMES", Metric: "sonic_switch_interface_fec_frames_total", Type: "counter", Help: "Total FEC frames", Labels: map[string]string{"interface": "$port_name", "type": "correctable"}}, + {Field: "SAI_PORT_STAT_IF_IN_FEC_NOT_CORRECTABLE_FRAMES", Metric: "sonic_switch_interface_fec_frames_total", Type: "counter", Help: "Total FEC frames", Labels: map[string]string{"interface": "$port_name", "type": "uncorrectable"}}, + {Field: "SAI_PORT_STAT_IF_IN_FEC_SYMBOL_ERRORS", Metric: "sonic_switch_interface_fec_frames_total", Type: "counter", Help: "Total FEC frames", Labels: map[string]string{"interface": "$port_name", "type": "symbol_errors"}}, + {Field: "SAI_PORT_STAT_IF_OUT_QLEN", Metric: "sonic_switch_interface_queue_length", Type: "gauge", Help: "Current output queue length", Labels: map[string]string{"interface": "$port_name"}}, + {Field: "SAI_PORT_STAT_PFC_0_RX_PKTS", Metric: "sonic_switch_interface_pfc_packets_total", Type: "counter", Help: "Total PFC packets", Labels: map[string]string{"interface": "$port_name", "direction": "rx", "priority": "0"}}, + {Field: "SAI_PORT_STAT_PFC_0_TX_PKTS", Metric: "sonic_switch_interface_pfc_packets_total", Type: "counter", Help: "Total PFC packets", Labels: map[string]string{"interface": "$port_name", "direction": "tx", "priority": "0"}}, + {Field: "SAI_PORT_STAT_ETHER_STATS_UNDERSIZE_PKTS", Metric: "sonic_switch_interface_anomaly_packets_total", Type: "counter", Help: "Total anomalous packets", Labels: map[string]string{"interface": "$port_name", "type": "undersize"}}, + {Field: "SAI_PORT_STAT_ETHER_STATS_FRAGMENTS", Metric: "sonic_switch_interface_anomaly_packets_total", Type: "counter", Help: "Total anomalous packets", Labels: 
map[string]string{"interface": "$port_name", "type": "fragments"}}, + {Field: "SAI_PORT_STAT_ETHER_STATS_JABBERS", Metric: "sonic_switch_interface_anomaly_packets_total", Type: "counter", Help: "Total anomalous packets", Labels: map[string]string{"interface": "$port_name", "type": "jabbers"}}, + {Field: "SAI_PORT_STAT_IF_IN_UNKNOWN_PROTOS", Metric: "sonic_switch_interface_anomaly_packets_total", Type: "counter", Help: "Total anomalous packets", Labels: map[string]string{"interface": "$port_name", "type": "unknown_protos"}}, + {Field: "SAI_PORT_STAT_ETHER_RX_OVERSIZE_PKTS", Metric: "sonic_switch_interface_anomaly_packets_total", Type: "counter", Help: "Total anomalous packets", Labels: map[string]string{"interface": "$port_name", "type": "rx_oversize"}}, + {Field: "SAI_PORT_STAT_ETHER_TX_OVERSIZE_PKTS", Metric: "sonic_switch_interface_anomaly_packets_total", Type: "counter", Help: "Total anomalous packets", Labels: map[string]string{"interface": "$port_name", "type": "tx_oversize"}}, + }, + } + + collector := NewConfigCollector(mc, mapping) + + // Verify a representative subset of metrics + expected := ` + # HELP sonic_switch_interface_bytes_total Total bytes transferred + # TYPE sonic_switch_interface_bytes_total counter + sonic_switch_interface_bytes_total{direction="rx",interface="Ethernet0"} 1.23456789e+08 + sonic_switch_interface_bytes_total{direction="tx",interface="Ethernet0"} 9.87654321e+08 + # HELP sonic_switch_interface_errors_total Total interface errors + # TYPE sonic_switch_interface_errors_total counter + sonic_switch_interface_errors_total{direction="rx",interface="Ethernet0"} 42 + sonic_switch_interface_errors_total{direction="tx",interface="Ethernet0"} 0 + # HELP sonic_switch_interface_discards_total Total interface discards + # TYPE sonic_switch_interface_discards_total counter + sonic_switch_interface_discards_total{direction="rx",interface="Ethernet0"} 7 + sonic_switch_interface_discards_total{direction="tx",interface="Ethernet0"} 3 + # HELP 
sonic_switch_interface_dropped_packets_total Total SAI-level dropped packets + # TYPE sonic_switch_interface_dropped_packets_total counter + sonic_switch_interface_dropped_packets_total{direction="rx",interface="Ethernet0"} 2 + sonic_switch_interface_dropped_packets_total{direction="tx",interface="Ethernet0"} 1 + # HELP sonic_switch_interface_fec_frames_total Total FEC frames + # TYPE sonic_switch_interface_fec_frames_total counter + sonic_switch_interface_fec_frames_total{interface="Ethernet0",type="correctable"} 1580 + sonic_switch_interface_fec_frames_total{interface="Ethernet0",type="symbol_errors"} 23 + sonic_switch_interface_fec_frames_total{interface="Ethernet0",type="uncorrectable"} 0 + # HELP sonic_switch_interface_queue_length Current output queue length + # TYPE sonic_switch_interface_queue_length gauge + sonic_switch_interface_queue_length{interface="Ethernet0"} 5 + # HELP sonic_switch_interface_packets_total Total packets transferred + # TYPE sonic_switch_interface_packets_total counter + sonic_switch_interface_packets_total{direction="rx",interface="Ethernet0",type="broadcast"} 50 + sonic_switch_interface_packets_total{direction="rx",interface="Ethernet0",type="multicast"} 500 + sonic_switch_interface_packets_total{direction="rx",interface="Ethernet0",type="non_unicast"} 550 + sonic_switch_interface_packets_total{direction="rx",interface="Ethernet0",type="unicast"} 100000 + sonic_switch_interface_packets_total{direction="tx",interface="Ethernet0",type="broadcast"} 20 + sonic_switch_interface_packets_total{direction="tx",interface="Ethernet0",type="multicast"} 300 + sonic_switch_interface_packets_total{direction="tx",interface="Ethernet0",type="non_unicast"} 320 + sonic_switch_interface_packets_total{direction="tx",interface="Ethernet0",type="unicast"} 200000 + # HELP sonic_switch_interface_pfc_packets_total Total PFC packets + # TYPE sonic_switch_interface_pfc_packets_total counter + 
sonic_switch_interface_pfc_packets_total{direction="rx",interface="Ethernet0",priority="0"} 100 + sonic_switch_interface_pfc_packets_total{direction="tx",interface="Ethernet0",priority="0"} 50 + # HELP sonic_switch_interface_anomaly_packets_total Total anomalous packets + # TYPE sonic_switch_interface_anomaly_packets_total counter + sonic_switch_interface_anomaly_packets_total{interface="Ethernet0",type="fragments"} 0 + sonic_switch_interface_anomaly_packets_total{interface="Ethernet0",type="jabbers"} 0 + sonic_switch_interface_anomaly_packets_total{interface="Ethernet0",type="rx_oversize"} 0 + sonic_switch_interface_anomaly_packets_total{interface="Ethernet0",type="tx_oversize"} 0 + sonic_switch_interface_anomaly_packets_total{interface="Ethernet0",type="undersize"} 0 + sonic_switch_interface_anomaly_packets_total{interface="Ethernet0",type="unknown_protos"} 0 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected)); err != nil { + t.Errorf("ConfigCollector counters mismatch: %v", err) + } + mc.expectationsMet(t) +} + +func TestConfigCollectorHistogram(t *testing.T) { + mc := newMockConnector("COUNTERS_DB") + + mc.mocks["COUNTERS_DB"].ExpectHGetAll("COUNTERS_PORT_NAME_MAP").SetVal(map[string]string{ + "Ethernet0": "oid:0x100000000003", + }) + mc.mocks["COUNTERS_DB"].ExpectKeys("COUNTERS:*").SetVal([]string{ + "COUNTERS:oid:0x100000000003", + }) + mc.mocks["COUNTERS_DB"].ExpectHGetAll("COUNTERS:oid:0x100000000003").SetVal(map[string]string{ + "SAI_PORT_STAT_ETHER_IN_PKTS_64_OCTETS": "10000", + "SAI_PORT_STAT_ETHER_IN_PKTS_65_TO_127_OCTETS": "5000", + "SAI_PORT_STAT_ETHER_IN_PKTS_128_TO_255_OCTETS": "2000", + "SAI_PORT_STAT_ETHER_IN_PKTS_256_TO_511_OCTETS": "1000", + "SAI_PORT_STAT_ETHER_IN_PKTS_512_TO_1023_OCTETS": "500", + "SAI_PORT_STAT_ETHER_IN_PKTS_1024_TO_1518_OCTETS": "200", + "SAI_PORT_STAT_ETHER_IN_PKTS_1519_TO_2047_OCTETS": "50", + "SAI_PORT_STAT_ETHER_IN_PKTS_2048_TO_4095_OCTETS": "10", + 
"SAI_PORT_STAT_ETHER_IN_PKTS_4096_TO_9216_OCTETS": "5", + "SAI_PORT_STAT_ETHER_IN_PKTS_9217_TO_16383_OCTETS": "0", + }) + + mapping := MetricMapping{ + RedisDB: "COUNTERS_DB", + KeyPattern: "COUNTERS:*", + KeySeparator: ":", + KeyResolver: "COUNTERS_PORT_NAME_MAP", + Fields: []FieldMapping{ + { + Metric: "sonic_switch_interface_rx_packet_size_bytes", + Type: "histogram", + Help: "RX packet size distribution", + Labels: map[string]string{"interface": "$port_name"}, + Transform: &Transform{ + Histogram: &HistogramBuckets{ + Buckets: map[float64]string{ + 64: "SAI_PORT_STAT_ETHER_IN_PKTS_64_OCTETS", + 127: "SAI_PORT_STAT_ETHER_IN_PKTS_65_TO_127_OCTETS", + 255: "SAI_PORT_STAT_ETHER_IN_PKTS_128_TO_255_OCTETS", + 511: "SAI_PORT_STAT_ETHER_IN_PKTS_256_TO_511_OCTETS", + 1023: "SAI_PORT_STAT_ETHER_IN_PKTS_512_TO_1023_OCTETS", + 1518: "SAI_PORT_STAT_ETHER_IN_PKTS_1024_TO_1518_OCTETS", + 2047: "SAI_PORT_STAT_ETHER_IN_PKTS_1519_TO_2047_OCTETS", + 4095: "SAI_PORT_STAT_ETHER_IN_PKTS_2048_TO_4095_OCTETS", + 9216: "SAI_PORT_STAT_ETHER_IN_PKTS_4096_TO_9216_OCTETS", + 16383: "SAI_PORT_STAT_ETHER_IN_PKTS_9217_TO_16383_OCTETS", + }, + }, + }, + }, + }, + } + + collector := NewConfigCollector(mc, mapping) + + // Cumulative counts: + // 64: 10000, 127: 15000, 255: 17000, 511: 18000, 1023: 18500, + // 1518: 18700, 2047: 18750, 4095: 18760, 9216: 18765, 16383: 18765 + // count=18765, sum=0 + expected := ` + # HELP sonic_switch_interface_rx_packet_size_bytes RX packet size distribution + # TYPE sonic_switch_interface_rx_packet_size_bytes histogram + sonic_switch_interface_rx_packet_size_bytes_bucket{interface="Ethernet0",le="64"} 10000 + sonic_switch_interface_rx_packet_size_bytes_bucket{interface="Ethernet0",le="127"} 15000 + sonic_switch_interface_rx_packet_size_bytes_bucket{interface="Ethernet0",le="255"} 17000 + sonic_switch_interface_rx_packet_size_bytes_bucket{interface="Ethernet0",le="511"} 18000 + sonic_switch_interface_rx_packet_size_bytes_bucket{interface="Ethernet0",le="1023"} 
18500 + sonic_switch_interface_rx_packet_size_bytes_bucket{interface="Ethernet0",le="1518"} 18700 + sonic_switch_interface_rx_packet_size_bytes_bucket{interface="Ethernet0",le="2047"} 18750 + sonic_switch_interface_rx_packet_size_bytes_bucket{interface="Ethernet0",le="4095"} 18760 + sonic_switch_interface_rx_packet_size_bytes_bucket{interface="Ethernet0",le="9216"} 18765 + sonic_switch_interface_rx_packet_size_bytes_bucket{interface="Ethernet0",le="16383"} 18765 + sonic_switch_interface_rx_packet_size_bytes_bucket{interface="Ethernet0",le="+Inf"} 18765 + sonic_switch_interface_rx_packet_size_bytes_sum{interface="Ethernet0"} 0 + sonic_switch_interface_rx_packet_size_bytes_count{interface="Ethernet0"} 18765 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected)); err != nil { + t.Errorf("ConfigCollector histogram mismatch: %v", err) + } + mc.expectationsMet(t) +} + +func TestConfigCollectorMapTransform(t *testing.T) { + mc := newMockConnector("STATE_DB") + + mc.mocks["STATE_DB"].ExpectKeys("PORT_TABLE|*").SetVal([]string{ + "PORT_TABLE|Ethernet0", + }) + mc.mocks["STATE_DB"].ExpectHGetAll("PORT_TABLE|Ethernet0").SetVal(map[string]string{ + "oper_status": "up", + }) + + mapping := MetricMapping{ + RedisDB: "STATE_DB", + KeyPattern: "PORT_TABLE|*", + KeySeparator: "|", + Fields: []FieldMapping{ + { + Field: "oper_status", + Metric: "sonic_switch_interface_oper_state", + Type: "gauge", + Help: "Operational state of the interface", + Labels: map[string]string{ + "interface": "$key_suffix", + }, + Transform: &Transform{ + Map: map[string]float64{"up": 1, "down": 0}, + }, + }, + }, + } + + collector := NewConfigCollector(mc, mapping) + expected := ` + # HELP sonic_switch_interface_oper_state Operational state of the interface + # TYPE sonic_switch_interface_oper_state gauge + sonic_switch_interface_oper_state{interface="Ethernet0"} 1 + ` + if err := testutil.CollectAndCompare(collector, strings.NewReader(expected)); err != nil { + 
t.Errorf("ConfigCollector map transform mismatch: %v", err) + } + mc.expectationsMet(t) +} + +// --- Config Loading Tests --- + +func TestDefaultConfigLoads(t *testing.T) { + cfg, err := DefaultConfig() + if err != nil { + t.Fatalf("DefaultConfig() failed: %v", err) + } + if len(cfg.Metrics) == 0 { + t.Fatal("DefaultConfig() returned empty metrics") + } + // Verify expected metric mappings exist + metricNames := make(map[string]bool) + for _, m := range cfg.Metrics { + for _, f := range m.Fields { + metricNames[f.Metric] = true + } + } + for _, want := range []string{ + "sonic_switch_transceiver_dom_threshold", + "sonic_switch_transceiver_info", + "sonic_switch_interface_errors_total", + "sonic_switch_interface_discards_total", + "sonic_switch_interface_fec_frames_total", + "sonic_switch_transceiver_dom_temperature_celsius", + "sonic_switch_transceiver_dom_voltage_volts", + "sonic_switch_transceiver_dom_rx_power_dbm", + "sonic_switch_transceiver_dom_tx_bias_milliamps", + "sonic_switch_transceiver_rxlos", + "sonic_switch_transceiver_txfault", + "sonic_switch_interface_neighbor_info", + "sonic_switch_temperature_celsius", + "sonic_switch_temperature_high_threshold_celsius", + "sonic_switch_temperature_warning", + "sonic_switch_interface_bytes_total", + "sonic_switch_interface_packets_total", + "sonic_switch_interface_dropped_packets_total", + "sonic_switch_interface_queue_length", + "sonic_switch_interface_pfc_packets_total", + "sonic_switch_interface_rx_packet_size_bytes", + "sonic_switch_interface_tx_packet_size_bytes", + "sonic_switch_interface_anomaly_packets_total", + } { + if !metricNames[want] { + t.Errorf("DefaultConfig missing expected metric %q", want) + } + } +} + +func TestConfigValidation(t *testing.T) { + tests := []struct { + name string + cfg MetricsConfig + wantErr bool + }{ + {"valid", MetricsConfig{Metrics: []MetricMapping{{ + RedisDB: "STATE_DB", KeyPattern: "FOO|*", + Fields: []FieldMapping{{Metric: "test_metric", Type: "gauge"}}, + }}}, false}, 
+ {"missing redis_db", MetricsConfig{Metrics: []MetricMapping{{ + KeyPattern: "FOO|*", + Fields: []FieldMapping{{Metric: "test_metric", Type: "gauge"}}, + }}}, true}, + {"missing key_pattern", MetricsConfig{Metrics: []MetricMapping{{ + RedisDB: "STATE_DB", + Fields: []FieldMapping{{Metric: "test_metric", Type: "gauge"}}, + }}}, true}, + {"missing metric name", MetricsConfig{Metrics: []MetricMapping{{ + RedisDB: "STATE_DB", KeyPattern: "FOO|*", + Fields: []FieldMapping{{Type: "gauge"}}, + }}}, true}, + {"invalid type", MetricsConfig{Metrics: []MetricMapping{{ + RedisDB: "STATE_DB", KeyPattern: "FOO|*", + Fields: []FieldMapping{{Metric: "test_metric", Type: "histogram"}}, + }}}, true}, + {"field and field_pattern exclusive", MetricsConfig{Metrics: []MetricMapping{{ + RedisDB: "STATE_DB", KeyPattern: "FOO|*", + Fields: []FieldMapping{{Metric: "test_metric", Type: "gauge", Field: "foo", FieldPattern: "*"}}, + }}}, true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateConfig(&tt.cfg) + if (err != nil) != tt.wantErr { + t.Errorf("validateConfig() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +// --- Health Endpoint Test --- + +func TestHealthEndpointOK(t *testing.T) { + mc := newMockConnector("CONFIG_DB") + mc.mocks["CONFIG_DB"].ExpectPing().SetVal("PONG") + + srv := NewMetricsServer(":0", mc, nil, "") + + req := httptest.NewRequest("GET", "/healthz", nil) + w := httptest.NewRecorder() + srv.Handler.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d", w.Code) + } + if w.Body.String() != "ok" { + t.Errorf("expected 'ok', got %q", w.Body.String()) + } + mc.expectationsMet(t) +} + +func TestHealthEndpointRedisDown(t *testing.T) { + mc := newMockConnector() + + srv := NewMetricsServer(":0", mc, nil, "") + + req := httptest.NewRequest("GET", "/healthz", nil) + w := httptest.NewRecorder() + srv.Handler.ServeHTTP(w, req) + + if w.Code != http.StatusInternalServerError { + t.Errorf("expected 
500, got %d", w.Code) + } +} + +// --- Scrape Duration Test --- + +func TestScrapeDurationMetric(t *testing.T) { + mc := newMockConnector("CONFIG_DB") + // DeviceCollector will read DEVICE_METADATA + mc.mocks["CONFIG_DB"].ExpectHGetAll("DEVICE_METADATA|localhost").SetVal(map[string]string{ + "mac": "aa:bb:cc:dd:ee:ff", + }) + // InterfaceCollector will list ports + mc.mocks["CONFIG_DB"].ExpectKeys("PORT|*").SetVal([]string{}) + + srv := NewMetricsServer(":0", mc, nil, "") + + req := httptest.NewRequest("GET", "/metrics", nil) + w := httptest.NewRecorder() + srv.Handler.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + body := w.Body.String() + if !strings.Contains(body, "sonic_scrape_duration_seconds") { + t.Error("response missing sonic_scrape_duration_seconds metric") + } +} + +// --- Error Handling Test --- + +func TestDeviceCollectorRedisDown(t *testing.T) { + mc := newMockConnector() + collector := NewDeviceCollector(mc, nil) + + reg := prometheus.NewRegistry() + reg.MustRegister(collector) + metrics, err := reg.Gather() + if err != nil { + t.Fatalf("Gather failed: %v", err) + } + found := false + for _, mf := range metrics { + if mf.GetName() == "sonic_switch_ready" { + found = true + if mf.GetMetric()[0].GetGauge().GetValue() != 0 { + t.Errorf("expected sonic_switch_ready=0, got %v", mf.GetMetric()[0].GetGauge().GetValue()) + } + } + } + if !found { + t.Error("sonic_switch_ready metric not found") + } +} diff --git a/internal/agent/metrics/server.go b/internal/agent/metrics/server.go new file mode 100644 index 0000000..c4eca21 --- /dev/null +++ b/internal/agent/metrics/server.go @@ -0,0 +1,81 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import ( + "fmt" + "net/http" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +// 
NewMetricsServer creates an HTTP server that serves Prometheus metrics on /metrics +// and a health check on /healthz. All metrics are collected just-in-time from Redis. +// +// versionInfo is optional: when non-nil it provides fallback device metadata. +// configPath is optional: when empty, the embedded default config is used. +// When set, the config is loaded from that file path. +func NewMetricsServer(addr string, connector RedisConnector, versionInfo VersionInfoFunc, configPath string) *http.Server { + registry := prometheus.NewRegistry() + + // Register built-in collectors (require custom logic not expressible in config) + registry.MustRegister( + NewDeviceCollector(connector, versionInfo), + NewInterfaceCollector(connector), + ) + + // Load and register config-driven collectors + var cfg *MetricsConfig + var err error + if configPath != "" { + cfg, err = LoadConfig(configPath) + } else { + cfg, err = DefaultConfig() + } + if err != nil { + // Log but don't crash — built-in collectors still work + fmt.Printf("WARNING: failed to load metrics config: %v\n", err) + } else { + for _, mapping := range cfg.Metrics { + registry.MustRegister(NewConfigCollector(connector, mapping)) + } + } + + // Scrape duration gauge — records wall-clock time of the last /metrics request + scrapeDuration := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "sonic_scrape_duration_seconds", + Help: "Duration of the last metrics scrape in seconds", + }) + registry.MustRegister(scrapeDuration) + + mux := http.NewServeMux() + handler := promhttp.HandlerFor(registry, promhttp.HandlerOpts{}) + mux.Handle("GET /metrics", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + start := time.Now() + handler.ServeHTTP(w, r) + scrapeDuration.Set(time.Since(start).Seconds()) + })) + mux.HandleFunc("GET /healthz", func(w http.ResponseWriter, r *http.Request) { + client, err := connector.Connect("CONFIG_DB") + if err != nil { + w.WriteHeader(http.StatusInternalServerError) + _, _ = 
fmt.Fprintf(w, "redis unhealthy: %v", err) + return + } + if err := client.Ping(r.Context()).Err(); err != nil { + w.WriteHeader(http.StatusInternalServerError) + _, _ = fmt.Fprintf(w, "redis ping failed: %v", err) + return + } + w.WriteHeader(http.StatusOK) + _, _ = fmt.Fprint(w, "ok") + }) + + return &http.Server{ + Addr: addr, + Handler: mux, + } +}