From 3f5f8e91b9b84611d957e9fe6f721210bc23ea94 Mon Sep 17 00:00:00 2001 From: manogna_grandhi Date: Wed, 4 Mar 2026 16:43:07 +0530 Subject: [PATCH] add error counts / dimm channel for edac collector Signed-off-by: manogna_grandhi --- collector/edac_linux.go | 201 +++++++++++++++++++++++++++++----------- 1 file changed, 148 insertions(+), 53 deletions(-) diff --git a/collector/edac_linux.go b/collector/edac_linux.go index d3a2a07a83..ff9ce76a49 100644 --- a/collector/edac_linux.go +++ b/collector/edac_linux.go @@ -18,8 +18,10 @@ package collector import ( "fmt" "log/slog" + "os" "path/filepath" "regexp" + "strings" "github.com/prometheus/client_golang/prometheus" ) @@ -30,115 +32,208 @@ const ( var ( edacMemControllerRE = regexp.MustCompile(`.*devices/system/edac/mc/mc([0-9]*)`) - edacMemCsrowRE = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]*/csrow([0-9]*)`) + edacMemDimmRE = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]*/dimm([0-9]*)`) ) type edacCollector struct { - ceCount *prometheus.Desc - ueCount *prometheus.Desc - csRowCECount *prometheus.Desc - csRowUECount *prometheus.Desc - logger *slog.Logger + ceCount *prometheus.Desc + ueCount *prometheus.Desc + channelCECount *prometheus.Desc + channelUECount *prometheus.Desc + dimmCECount *prometheus.Desc + dimmUECount *prometheus.Desc + logger *slog.Logger } func init() { registerCollector("edac", defaultEnabled, NewEdacCollector) } -// NewEdacCollector returns a new Collector exposing edac stats. func NewEdacCollector(logger *slog.Logger) (Collector, error) { + return &edacCollector{ + ceCount: prometheus.NewDesc( prometheus.BuildFQName(namespace, edacSubsystem, "correctable_errors_total"), "Total correctable memory errors.", - []string{"controller"}, nil, + []string{"controller"}, + nil, ), + ueCount: prometheus.NewDesc( prometheus.BuildFQName(namespace, edacSubsystem, "uncorrectable_errors_total"), "Total uncorrectable memory errors.", - []string{"controller"}, nil, + []string{"controller"}, + nil, + ), + + channelCECount: prometheus.NewDesc( + prometheus.BuildFQName(namespace, edacSubsystem, "channel_correctable_errors_total"), + "Total correctable memory errors for this channel.", + []string{"controller", "csrow", "channel", "dimm_label"}, + nil, ), - csRowCECount: prometheus.NewDesc( - prometheus.BuildFQName(namespace, edacSubsystem, "csrow_correctable_errors_total"), - "Total correctable memory errors for this csrow.", - []string{"controller", "csrow"}, nil, + + channelUECount: prometheus.NewDesc( + prometheus.BuildFQName(namespace, edacSubsystem, "channel_uncorrectable_errors_total"), + "Total uncorrectable memory errors for this channel.", + []string{"controller", "csrow", "channel", "dimm_label"}, + nil, ), - csRowUECount: prometheus.NewDesc( - prometheus.BuildFQName(namespace, edacSubsystem, "csrow_uncorrectable_errors_total"), - "Total uncorrectable memory errors for this csrow.", - []string{"controller", "csrow"}, nil, + + dimmCECount: prometheus.NewDesc( + prometheus.BuildFQName(namespace, edacSubsystem, "dimm_correctable_errors_total"), + "Total correctable memory errors for this dimm.", + []string{"controller", "dimm"}, + nil, ), + + dimmUECount: prometheus.NewDesc( + prometheus.BuildFQName(namespace, edacSubsystem, "dimm_uncorrectable_errors_total"), + "Total uncorrectable memory errors for this dimm.", + []string{"controller", "dimm"}, + nil, + ), + logger: logger, }, nil } func (c *edacCollector) Update(ch chan<- prometheus.Metric) error { + memControllers, err := filepath.Glob(sysFilePath("devices/system/edac/mc/mc[0-9]*")) if err != nil { return err } + for _, controller := range memControllers { + controllerMatch := edacMemControllerRE.FindStringSubmatch(controller) if controllerMatch == nil { return fmt.Errorf("controller string didn't match regexp: %s", controller) } + controllerNumber := controllerMatch[1] value, err := readUintFromFile(filepath.Join(controller, "ce_count")) - if err != nil { - return fmt.Errorf("couldn't get ce_count for controller %s: %w", controllerNumber, err) + if err == nil { + ch <- prometheus.MustNewConstMetric( + c.ceCount, + prometheus.CounterValue, + float64(value), + controllerNumber, + ) } - ch <- prometheus.MustNewConstMetric( - c.ceCount, prometheus.CounterValue, float64(value), controllerNumber) - value, err = readUintFromFile(filepath.Join(controller, "ce_noinfo_count")) - if err != nil { - return fmt.Errorf("couldn't get ce_noinfo_count for controller %s: %w", controllerNumber, err) + value, err = readUintFromFile(filepath.Join(controller, "ue_count")) + if err == nil { + ch <- prometheus.MustNewConstMetric( + c.ueCount, + prometheus.CounterValue, + float64(value), + controllerNumber, + ) } - ch <- prometheus.MustNewConstMetric( - c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown") - value, err = readUintFromFile(filepath.Join(controller, "ue_count")) + csrows, err := filepath.Glob(controller + "/csrow[0-9]*") + if err != nil { - return fmt.Errorf("couldn't get ue_count for controller %s: %w", controllerNumber, err) + return err } - ch <- prometheus.MustNewConstMetric( - c.ueCount, prometheus.CounterValue, float64(value), controllerNumber) - value, err = readUintFromFile(filepath.Join(controller, "ue_noinfo_count")) - if err != nil { - return fmt.Errorf("couldn't get ue_noinfo_count for controller %s: %w", controllerNumber, err) + for _, csrow := range csrows { + csrowMatch := regexp.MustCompile(`csrow([0-9]+)`).FindStringSubmatch(csrow) + csrowNumber := csrowMatch[1] + + channelFiles, err := filepath.Glob(csrow + "/ch*_ce_count") + if err != nil { + return err + } + + for _, chFile := range channelFiles { + + base := filepath.Base(chFile) + + match := regexp.MustCompile(`ch([0-9]+)_ce_count`).FindStringSubmatch(base) + if match == nil { + continue + } + + channelNumber := match[1] + label := "unknown" + labelBytes, err := os.ReadFile(filepath.Join(csrow, "ch"+channelNumber+"_dimm_label")) + if err == nil { + label = strings.TrimSpace(string(labelBytes)) + // format label + label = strings.ReplaceAll(label, "#", "") + label = strings.ReplaceAll(label, "csrow", "_csrow") + label = strings.ReplaceAll(label, "channel", "_channel") + } + value, err := readUintFromFile(chFile) + if err == nil { + ch <- prometheus.MustNewConstMetric( + c.channelCECount, + prometheus.CounterValue, + float64(value), + controllerNumber, + csrowNumber, + channelNumber, + label, + ) + } + + value, err = readUintFromFile(filepath.Join(csrow, "ch"+channelNumber+"_ue_count")) + if err == nil { + ch <- prometheus.MustNewConstMetric( + c.channelUECount, + prometheus.CounterValue, + float64(value), + controllerNumber, + csrowNumber, + channelNumber, + label, + ) + } + } } - ch <- prometheus.MustNewConstMetric( - c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown") - // For each controller, walk the csrow directories. - csrows, err := filepath.Glob(controller + "/csrow[0-9]*") + dimms, err := filepath.Glob(controller + "/dimm[0-9]*") if err != nil { return err } - for _, csrow := range csrows { - csrowMatch := edacMemCsrowRE.FindStringSubmatch(csrow) - if csrowMatch == nil { - return fmt.Errorf("csrow string didn't match regexp: %s", csrow) + + for _, dimm := range dimms { + + dimmMatch := edacMemDimmRE.FindStringSubmatch(dimm) + if dimmMatch == nil || len(dimmMatch) < 2 { + continue } - csrowNumber := csrowMatch[1] - value, err = readUintFromFile(filepath.Join(csrow, "ce_count")) - if err != nil { - return fmt.Errorf("couldn't get ce_count for controller/csrow %s/%s: %w", controllerNumber, csrowNumber, err) + dimmNumber := dimmMatch[1] + + value, err := readUintFromFile(filepath.Join(dimm, "dimm_ce_count")) + if err == nil { + ch <- prometheus.MustNewConstMetric( + c.dimmCECount, + prometheus.CounterValue, + float64(value), + controllerNumber, + dimmNumber, + ) } - ch <- prometheus.MustNewConstMetric( - c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber) - value, err = readUintFromFile(filepath.Join(csrow, "ue_count")) - if err != nil { - return fmt.Errorf("couldn't get ue_count for controller/csrow %s/%s: %w", controllerNumber, csrowNumber, err) + value, err = readUintFromFile(filepath.Join(dimm, "dimm_ue_count")) + if err == nil { + ch <- prometheus.MustNewConstMetric( + c.dimmUECount, + prometheus.CounterValue, + float64(value), + controllerNumber, + dimmNumber, + ) } - ch <- prometheus.MustNewConstMetric( - c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber) } } - return err + return nil }