Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ logind | Exposes session counts from [logind](http://www.freedesktop.org/wiki/So
meminfo\_numa | Exposes memory statistics from `/sys/devices/system/node/node[0-9]*/meminfo`, `/sys/devices/system/node/node[0-9]*/numastat`. | Linux
mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux
network_route | Exposes the routing table as metrics | Linux
nvmesubsystem | Exposes NVMe-oF subsystem path health from `/sys/class/nvme-subsystem/`. | Linux
pcidevice | Exposes pci devices' information including their link status and parent devices. | Linux
perf | Exposes perf based metrics (Warning: Metrics are dependent on kernel configuration and settings). | Linux
processes | Exposes aggregate process statistics from `/proc`. | Linux
Expand Down Expand Up @@ -339,6 +340,25 @@ echo 'role{role="application_server"} 1' > /path/to/directory/role.prom.$$
mv /path/to/directory/role.prom.$$ /path/to/directory/role.prom
```

### NVMe Subsystem Collector

The `nvmesubsystem` collector exposes NVMe-oF (NVMe over Fabrics) subsystem
path health by reading `/sys/class/nvme-subsystem/`. It complements the
existing `nvme` collector (which reports per-controller hardware stats) by
monitoring the **connectivity layer** — how many controller paths are live,
connecting, or dead for each NVMe subsystem.

Enable it with `--collector.nvmesubsystem`.

#### Exposed metrics

| Metric | Description |
|--------|-------------|
| `node_nvmesubsystem_info` | Info metric with subsystem NQN, model, serial and I/O policy as labels. |
| `node_nvmesubsystem_paths_total` | Total number of controller paths for the subsystem. |
| `node_nvmesubsystem_paths_live` | Number of controller paths currently in `live` state. |
| `node_nvmesubsystem_path_state` | Per-controller path state (1 for the current state, 0 for others). |

### Filtering enabled collectors

The `node_exporter` will expose all metrics from enabled collectors by default. This is the recommended way to collect metrics to avoid errors when comparing metrics of different families.
Expand Down
98 changes: 98 additions & 0 deletions collector/fixtures/sys.ttar
Original file line number Diff line number Diff line change
Expand Up @@ -2255,6 +2255,104 @@ Lines: 1
4096
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/nvme-subsystem
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/nvme-subsystem/nvme-subsys0
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/iopolicy
Lines: 1
round-robinEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/model
Lines: 1
Dell PowerStoreEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme0
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/address
Lines: 1
nn-0x200000109b123456:pn-0x100000109b123456EOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/state
Lines: 1
liveEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/transport
Lines: 1
fcEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme1
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/address
Lines: 1
nn-0x200000109b123457:pn-0x100000109b123457EOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/state
Lines: 1
liveEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/transport
Lines: 1
fcEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme2
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/address
Lines: 1
nn-0x200000109b123458:pn-0x100000109b123458EOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/state
Lines: 1
liveEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/transport
Lines: 1
fcEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme3
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/address
Lines: 1
nn-0x200000109b123459:pn-0x100000109b123459EOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/state
Lines: 1
deadEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/transport
Lines: 1
fcEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/serial
Lines: 1
SN12345678EOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme-subsystem/nvme-subsys0/subsysnqn
Lines: 1
nqn.2014-08.org.nvmexpress:uuid:a34c4f3a-0d6f-5cec-dead-beefcafebabeEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/power_supply
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand Down
209 changes: 209 additions & 0 deletions collector/nvmesubsystem_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !nonvmesubsystem

package collector

import (
"fmt"
"log/slog"
"os"
"path/filepath"
"regexp"
"strings"

"github.com/prometheus/client_golang/prometheus"
)

// nvmeSubsystemCollector exports path-health metrics for NVMe subsystems.
// It holds the metric descriptors plus the function used to discover
// subsystems on each scrape.
type nvmeSubsystemCollector struct {
logger *slog.Logger
// scanSubsystems returns the subsystems to report. The constructor points
// it at scanNVMeSubsystems(*sysPath); keeping it as a field presumably lets
// tests substitute canned data (TODO confirm against the test file).
scanSubsystems func() ([]nvmeSubsystem, error)

// subsystemInfo is the constant-1 info metric carrying the subsystem
// identity labels.
subsystemInfo *prometheus.Desc
// subsystemPathsTotal counts all controllers found under a subsystem.
subsystemPathsTotal *prometheus.Desc
// subsystemPathsLive counts the controllers whose state is "live".
subsystemPathsLive *prometheus.Desc
// pathState is emitted once per controller and per known state.
pathState *prometheus.Desc
}

// nvmeSubsystem is the parsed content of one nvme-subsys* directory.
// String attributes are empty when the corresponding sysfs file could not be
// read.
type nvmeSubsystem struct {
// Name is the directory entry name, e.g. "nvme-subsys0".
Name string
// NQN is the content of the "subsysnqn" attribute.
NQN string
// Model is the content of the "model" attribute.
Model string
// Serial is the content of the "serial" attribute.
Serial string
// IOPolicy is the content of the "iopolicy" attribute.
IOPolicy string
// Controllers lists the controller entries found inside the subsystem
// directory.
Controllers []nvmeController
}

// nvmeController is one controller (path) entry of an NVMe subsystem.
// String attributes are empty when the corresponding sysfs file could not be
// read.
type nvmeController struct {
// Name is the directory entry name, e.g. "nvme0".
Name string
// State is the raw content of the "state" attribute; it is normalized
// only when metrics are emitted.
State string
// Transport is the content of the "transport" attribute.
Transport string
// Address is the content of the "address" attribute. It is parsed but not
// attached to any metric by the collector's Update method.
Address string
}

var (
	// nvmeControllerRE matches controller entries such as "nvme0" inside a
	// subsystem directory. The pattern is anchored, so names with anything
	// after the digits (e.g. "nvme0n1") do not match.
	nvmeControllerRE = regexp.MustCompile(`^nvme\d+$`)

	// nvmeControllerStates is the complete set of values the "state" label
	// of the path_state metric can take. One series is emitted per entry for
	// every controller, so each state normalizeControllerState can return
	// must be listed here; otherwise a controller in that state would have
	// no series with value 1. The named states were confirmed during review
	// to be all states the kernel reports at the time of writing; "unknown"
	// is the catch-all for anything else.
	nvmeControllerStates = []string{
		"live", "connecting", "resetting", "dead",
		"deleting", "deleting (no IO)", "new",
		"unknown",
	}
)

// normalizeControllerState maps the raw content of a controller's "state"
// attribute onto one of nvmeControllerStates. Recognized states are returned
// unchanged; everything else (including an empty string from an unreadable
// file) becomes "unknown".
//
// The lookup is driven by nvmeControllerStates itself so that the set of
// returned values and the set of emitted state labels cannot drift apart.
func normalizeControllerState(raw string) string {
	for _, known := range nvmeControllerStates {
		if known == raw {
			return raw
		}
	}
	return "unknown"
}

// init registers the collector under the name "nvmesubsystem". It is
// registered with defaultDisabled, so it only runs when enabled explicitly
// (--collector.nvmesubsystem).
func init() {
registerCollector("nvmesubsystem", defaultDisabled, NewNVMeSubsystemCollector)
}

// NewNVMeSubsystemCollector builds the collector that reports NVMe subsystem
// path health read from the nvme-subsystem class directory under the
// configured sysfs root. It never returns an error.
func NewNVMeSubsystemCollector(logger *slog.Logger) (Collector, error) {
	const subsystem = "nvmesubsystem"

	// newDesc creates a descriptor named <namespace>_nvmesubsystem_<name>
	// with the given variable labels and no constant labels.
	newDesc := func(name, help string, labels ...string) *prometheus.Desc {
		return prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, name), help, labels, nil)
	}

	collector := &nvmeSubsystemCollector{logger: logger}
	collector.subsystemInfo = newDesc(
		"info",
		"Non-numeric information about an NVMe subsystem.",
		"subsystem", "nqn", "model", "serial", "iopolicy",
	)
	collector.subsystemPathsTotal = newDesc(
		"paths_total",
		"Total number of controller paths for an NVMe subsystem.",
		"subsystem",
	)
	collector.subsystemPathsLive = newDesc(
		"paths_live",
		"Number of controller paths in live state for an NVMe subsystem.",
		"subsystem",
	)
	collector.pathState = newDesc(
		"path_state",
		"Current NVMe controller path state (1 for the current state, 0 for all others).",
		"subsystem", "controller", "transport", "state",
	)

	// sysPath is dereferenced on every scan rather than once here, so the
	// value in effect at scrape time is the one used.
	collector.scanSubsystems = func() ([]nvmeSubsystem, error) {
		return scanNVMeSubsystems(*sysPath)
	}

	return collector, nil
}

// Update scans the NVMe subsystems and sends their metrics on ch.
//
// For each subsystem it emits, in order: the info metric, one path_state
// series per controller and per entry of nvmeControllerStates (1 when the
// entry equals the controller's normalized state, 0 otherwise), the total
// path count and the live path count. A scan failure is returned wrapped and
// no metrics are sent.
func (c *nvmeSubsystemCollector) Update(ch chan<- prometheus.Metric) error {
	found, err := c.scanSubsystems()
	if err != nil {
		return fmt.Errorf("failed to scan NVMe subsystems: %w", err)
	}

	// Every metric of this collector is a gauge; emit wraps the boilerplate.
	emit := func(desc *prometheus.Desc, value float64, labels ...string) {
		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, labels...)
	}

	for i := range found {
		ss := &found[i]
		emit(c.subsystemInfo, 1, ss.Name, ss.NQN, ss.Model, ss.Serial, ss.IOPolicy)

		liveCount := 0
		for j := range ss.Controllers {
			ctrl := &ss.Controllers[j]
			current := normalizeControllerState(ctrl.State)
			if current == "live" {
				liveCount++
			}

			for _, candidate := range nvmeControllerStates {
				var flag float64
				if candidate == current {
					flag = 1
				}
				emit(c.pathState, flag, ss.Name, ctrl.Name, ctrl.Transport, candidate)
			}
		}

		emit(c.subsystemPathsTotal, float64(len(ss.Controllers)), ss.Name)
		emit(c.subsystemPathsLive, float64(liveCount), ss.Name)
	}

	return nil
}

// scanNVMeSubsystems lists <sysfsBase>/class/nvme-subsystem and parses every
// entry whose name starts with "nvme-subsys".
//
// An error is returned only when the class directory itself cannot be read.
// A subsystem that fails to parse is left out of the result without being
// reported.
func scanNVMeSubsystems(sysfsBase string) ([]nvmeSubsystem, error) {
	root := filepath.Join(sysfsBase, "class", "nvme-subsystem")

	dirEntries, err := os.ReadDir(root)
	if err != nil {
		return nil, err
	}

	var found []nvmeSubsystem
	for i := range dirEntries {
		name := dirEntries[i].Name()
		if strings.HasPrefix(name, "nvme-subsys") {
			parsed, parseErr := parseNVMeSubsystem(name, filepath.Join(root, name))
			if parseErr == nil {
				found = append(found, *parsed)
			}
		}
	}

	return found, nil
}

// parseNVMeSubsystem reads one subsystem directory.
//
// name is the directory entry name (stored as the subsystem's Name) and path
// is the full path to that directory. The subsystem attributes (subsysnqn,
// model, serial, iopolicy) and the per-controller attributes (state,
// transport, address) are read best-effort: a missing or unreadable file
// yields an empty string.
//
// If the directory itself cannot be listed, an error is returned instead of a
// subsystem with no controllers. Reporting such a subsystem would export it
// with zero total and zero live paths, which is indistinguishable from a
// genuine total path loss.
func parseNVMeSubsystem(name, path string) (*nvmeSubsystem, error) {
	entries, err := os.ReadDir(path)
	if err != nil {
		return nil, fmt.Errorf("reading NVMe subsystem %q: %w", name, err)
	}

	subsys := &nvmeSubsystem{
		Name:     name,
		NQN:      readSysfsString(filepath.Join(path, "subsysnqn")),
		Model:    readSysfsString(filepath.Join(path, "model")),
		Serial:   readSysfsString(filepath.Join(path, "serial")),
		IOPolicy: readSysfsString(filepath.Join(path, "iopolicy")),
	}

	for _, entry := range entries {
		// Only controller entries (nvme<N>) are paths; attribute files and
		// any other entries in the directory are skipped.
		if !nvmeControllerRE.MatchString(entry.Name()) {
			continue
		}
		ctrlPath := filepath.Join(path, entry.Name())
		subsys.Controllers = append(subsys.Controllers, nvmeController{
			Name:      entry.Name(),
			State:     readSysfsString(filepath.Join(ctrlPath, "state")),
			Transport: readSysfsString(filepath.Join(ctrlPath, "transport")),
			Address:   readSysfsString(filepath.Join(ctrlPath, "address")),
		})
	}

	return subsys, nil
}

// readSysfsString returns the content of the file at path with leading and
// trailing whitespace removed. Any read error yields the empty string, so a
// missing attribute and an empty attribute look the same to the caller.
func readSysfsString(path string) string {
	if raw, err := os.ReadFile(path); err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
Loading