Skip to content

Commit 1fa2099

Browse files
sradco and AI Assistant committed
Add multipath collector for NVMe-oF subsystem path health
Add a new disabled-by-default collector (--collector.multipath) that exposes NVMe over Fabrics connectivity metrics by reading /sys/class/nvme-subsystem/:

- node_multipath_nvme_subsystem_info: subsystem metadata (NQN, model, etc.)
- node_multipath_nvme_subsystem_paths_total: total controller paths
- node_multipath_nvme_subsystem_paths_live: live controller paths
- node_multipath_nvme_path_state: per-controller state (live/dead/etc.)

This fills a monitoring gap for storage connectivity — the existing NVMe collector reports hardware health but is blind to fabric path failures.

No special permissions required; sysfs is world-readable.

Signed-off-by: Shirly Radco <sradco@redhat.com>
Co-authored-by: AI Assistant <noreply@cursor.com>
1 parent a1cbf81 commit 1fa2099

5 files changed

Lines changed: 553 additions & 13 deletions

File tree

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ lnstat | Exposes stats from `/proc/net/stat/`. | Linux
201201
logind | Exposes session counts from [logind](http://www.freedesktop.org/wiki/Software/systemd/logind/). | Linux
202202
meminfo\_numa | Exposes memory statistics from `/sys/devices/system/node/node[0-9]*/meminfo`, `/sys/devices/system/node/node[0-9]*/numastat`. | Linux
203203
mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux
204+
multipath | Exposes NVMe-oF subsystem path health from `/sys/class/nvme-subsystem/`. | Linux
204205
network_route | Exposes the routing table as metrics | Linux
205206
pcidevice | Exposes pci devices' information including their link status and parent devices. | Linux
206207
perf | Exposes perf based metrics (Warning: Metrics are dependent on kernel configuration and settings). | Linux
@@ -339,6 +340,18 @@ echo 'role{role="application_server"} 1' > /path/to/directory/role.prom.$$
339340
mv /path/to/directory/role.prom.$$ /path/to/directory/role.prom
340341
```
341342

343+
### Multipath Collector
344+
345+
The `multipath` collector exposes NVMe-oF (NVMe over Fabrics) subsystem path
346+
health by reading `/sys/class/nvme-subsystem/`. It provides connectivity-layer
347+
visibility that the standard `nvme` collector does not cover — specifically,
348+
per-controller path state and path redundancy counts at the subsystem level.
349+
350+
This enables alerting on fabric path failures (e.g. disconnected FC cables or
351+
failed switches) before they cause total storage loss.
352+
353+
No special permissions are required — the sysfs files are world-readable.
354+
342355
### Filtering enabled collectors
343356

344357
The `node_exporter` will expose all metrics from enabled collectors by default. This is the recommended way to collect metrics to avoid errors when comparing metrics of different families.

collector/fixtures/sys.ttar

Lines changed: 111 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2219,40 +2219,138 @@ Lines: 1
22192219
Samsung SSD 970 PRO 512GB
22202220
Mode: 444
22212221
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2222-
Path: sys/class/nvme/nvme0/serial
2222+
Directory: sys/class/nvme/nvme0/nvme0c0n0
2223+
Mode: 755
2224+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2225+
Path: sys/class/nvme/nvme0/nvme0c0n0/ana_state
22232226
Lines: 1
2224-
S680HF8N190894I
2227+
optimized
22252228
Mode: 444
22262229
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2227-
Path: sys/class/nvme/nvme0/state
2230+
Path: sys/class/nvme/nvme0/nvme0c0n0/nuse
22282231
Lines: 1
2229-
live
2232+
488281250
22302233
Mode: 444
22312234
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2232-
Directory: sys/class/nvme/nvme0/nvme0c0n0
2235+
Directory: sys/class/nvme/nvme0/nvme0c0n0/queue
22332236
Mode: 755
22342237
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2235-
Path: sys/class/nvme/nvme0/nvme0c0n0/ana_state
2238+
Path: sys/class/nvme/nvme0/nvme0c0n0/queue/logical_block_size
22362239
Lines: 1
2237-
optimized
2238-
Mode: 444
2240+
4096
2241+
Mode: 644
22392242
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
22402243
Path: sys/class/nvme/nvme0/nvme0c0n0/size
22412244
Lines: 1
22422245
3906250000
22432246
Mode: 444
22442247
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2245-
Path: sys/class/nvme/nvme0/nvme0c0n0/nuse
2248+
Path: sys/class/nvme/nvme0/serial
22462249
Lines: 1
2247-
488281250
2250+
S680HF8N190894I
22482251
Mode: 444
22492252
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2250-
Directory: sys/class/nvme/nvme0/nvme0c0n0/queue
2253+
Path: sys/class/nvme/nvme0/state
2254+
Lines: 1
2255+
live
2256+
Mode: 444
2257+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2258+
Directory: sys/class/nvme-subsystem
22512259
Mode: 755
22522260
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2253-
Path: sys/class/nvme/nvme0/nvme0c0n0/queue/logical_block_size
2261+
Directory: sys/class/nvme-subsystem/nvme-subsys0
2262+
Mode: 755
2263+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2264+
Path: sys/class/nvme-subsystem/nvme-subsys0/iopolicy
22542265
Lines: 1
2255-
4096
2266+
round-robinEOF
2267+
Mode: 644
2268+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2269+
Path: sys/class/nvme-subsystem/nvme-subsys0/model
2270+
Lines: 1
2271+
Dell PowerStoreEOF
2272+
Mode: 644
2273+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2274+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme0
2275+
Mode: 755
2276+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2277+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/address
2278+
Lines: 1
2279+
nn-0x200000109b123456:pn-0x100000109b123456EOF
2280+
Mode: 644
2281+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2282+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/state
2283+
Lines: 1
2284+
liveEOF
2285+
Mode: 644
2286+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2287+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/transport
2288+
Lines: 1
2289+
fcEOF
2290+
Mode: 644
2291+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2292+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme1
2293+
Mode: 755
2294+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2295+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/address
2296+
Lines: 1
2297+
nn-0x200000109b123457:pn-0x100000109b123457EOF
2298+
Mode: 644
2299+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2300+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/state
2301+
Lines: 1
2302+
liveEOF
2303+
Mode: 644
2304+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2305+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/transport
2306+
Lines: 1
2307+
fcEOF
2308+
Mode: 644
2309+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2310+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme2
2311+
Mode: 755
2312+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2313+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/address
2314+
Lines: 1
2315+
nn-0x200000109b123458:pn-0x100000109b123458EOF
2316+
Mode: 644
2317+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2318+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/state
2319+
Lines: 1
2320+
liveEOF
2321+
Mode: 644
2322+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2323+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/transport
2324+
Lines: 1
2325+
fcEOF
2326+
Mode: 644
2327+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2328+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme3
2329+
Mode: 755
2330+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2331+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/address
2332+
Lines: 1
2333+
nn-0x200000109b123459:pn-0x100000109b123459EOF
2334+
Mode: 644
2335+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2336+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/state
2337+
Lines: 1
2338+
deadEOF
2339+
Mode: 644
2340+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2341+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/transport
2342+
Lines: 1
2343+
fcEOF
2344+
Mode: 644
2345+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2346+
Path: sys/class/nvme-subsystem/nvme-subsys0/serial
2347+
Lines: 1
2348+
SN12345678EOF
2349+
Mode: 644
2350+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2351+
Path: sys/class/nvme-subsystem/nvme-subsys0/subsysnqn
2352+
Lines: 1
2353+
nqn.2014-08.org.nvmexpress:uuid:a34c4f3a-0d6f-5cec-dead-beefcafebabeEOF
22562354
Mode: 644
22572355
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
22582356
Directory: sys/class/power_supply

collector/multipath_linux.go

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
// Copyright The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !nomultipath
15+
16+
package collector
17+
18+
import (
19+
"fmt"
20+
"log/slog"
21+
22+
"github.com/prometheus/client_golang/prometheus"
23+
)
24+
25+
type multipathCollector struct {
26+
logger *slog.Logger
27+
scanNVMeSubsystems func() ([]nvmeSubsystem, error)
28+
29+
nvmeSubsystemInfo *prometheus.Desc
30+
nvmeSubsystemPathsTotal *prometheus.Desc
31+
nvmeSubsystemPathsLive *prometheus.Desc
32+
nvmePathState *prometheus.Desc
33+
}
34+
35+
func init() {
36+
registerCollector("multipath", defaultDisabled, NewMultipathCollector)
37+
}
38+
39+
// NewMultipathCollector returns a new Collector exposing multipath storage
40+
// connectivity metrics from /sys/class/nvme-subsystem/.
41+
func NewMultipathCollector(logger *slog.Logger) (Collector, error) {
42+
const subsystem = "multipath"
43+
44+
c := &multipathCollector{
45+
logger: logger,
46+
nvmeSubsystemInfo: prometheus.NewDesc(
47+
prometheus.BuildFQName(namespace, subsystem, "nvme_subsystem_info"),
48+
"Non-numeric information about an NVMe subsystem.",
49+
[]string{"subsystem", "nqn", "model", "serial", "iopolicy"}, nil,
50+
),
51+
nvmeSubsystemPathsTotal: prometheus.NewDesc(
52+
prometheus.BuildFQName(namespace, subsystem, "nvme_subsystem_paths_total"),
53+
"Total number of controller paths for an NVMe subsystem.",
54+
[]string{"subsystem"}, nil,
55+
),
56+
nvmeSubsystemPathsLive: prometheus.NewDesc(
57+
prometheus.BuildFQName(namespace, subsystem, "nvme_subsystem_paths_live"),
58+
"Number of controller paths in live state for an NVMe subsystem.",
59+
[]string{"subsystem"}, nil,
60+
),
61+
nvmePathState: prometheus.NewDesc(
62+
prometheus.BuildFQName(namespace, subsystem, "nvme_path_state"),
63+
"Current NVMe controller path state (1 for the current state, 0 for all others).",
64+
[]string{"subsystem", "controller", "transport", "state"}, nil,
65+
),
66+
}
67+
68+
c.scanNVMeSubsystems = func() ([]nvmeSubsystem, error) {
69+
return scanNVMeSubsystems(*sysPath)
70+
}
71+
72+
return c, nil
73+
}
74+
75+
func (c *multipathCollector) Update(ch chan<- prometheus.Metric) error {
76+
subsystems, err := c.scanNVMeSubsystems()
77+
if err != nil {
78+
return fmt.Errorf("failed to scan NVMe subsystems: %w", err)
79+
}
80+
81+
if len(subsystems) > 0 {
82+
c.emitNVMeSubsystemMetrics(ch, subsystems)
83+
}
84+
85+
return nil
86+
}

0 commit comments

Comments
 (0)