// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
13-
14- //go:build !noedac
15-
161package collector
172
183import (
194 "fmt"
205 "log/slog"
6+ "os"
217 "path/filepath"
228 "regexp"
9+ "strings"
2310
2411 "github.com/prometheus/client_golang/prometheus"
2512)
@@ -30,115 +17,211 @@ const (
3017
var (
	// edacMemControllerRE captures the memory controller index from a sysfs
	// path such as ".../devices/system/edac/mc/mc0".
	edacMemControllerRE = regexp.MustCompile(`.*devices/system/edac/mc/mc([0-9]*)`)
	// edacMemDimmRE captures the DIMM index from a sysfs path such as
	// ".../devices/system/edac/mc/mc0/dimm3". The pattern must not contain
	// whitespace between "dimm" and the capture group, otherwise it never
	// matches a real directory name.
	edacMemDimmRE = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]*/dimm([0-9]*)`)
)
3522
3623type edacCollector struct {
37- ceCount * prometheus.Desc
38- ueCount * prometheus.Desc
39- csRowCECount * prometheus.Desc
40- csRowUECount * prometheus.Desc
41- logger * slog.Logger
24+ ceCount * prometheus.Desc
25+ ueCount * prometheus.Desc
26+ channelCECount * prometheus.Desc
27+ channelUECount * prometheus.Desc
28+ dimmCECount * prometheus.Desc
29+ dimmUECount * prometheus.Desc
30+ logger * slog.Logger
4231}
4332
4433func init () {
4534 registerCollector ("edac" , defaultEnabled , NewEdacCollector )
4635}
4736
48- // NewEdacCollector returns a new Collector exposing edac stats.
4937func NewEdacCollector (logger * slog.Logger ) (Collector , error ) {
38+
5039 return & edacCollector {
40+
5141 ceCount : prometheus .NewDesc (
5242 prometheus .BuildFQName (namespace , edacSubsystem , "correctable_errors_total" ),
5343 "Total correctable memory errors." ,
54- []string {"controller" }, nil ,
44+ []string {"controller" },
45+ nil ,
5546 ),
47+
5648 ueCount : prometheus .NewDesc (
5749 prometheus .BuildFQName (namespace , edacSubsystem , "uncorrectable_errors_total" ),
5850 "Total uncorrectable memory errors." ,
59- []string {"controller" }, nil ,
51+ []string {"controller" },
52+ nil ,
6053 ),
61- csRowCECount : prometheus .NewDesc (
62- prometheus .BuildFQName (namespace , edacSubsystem , "csrow_correctable_errors_total" ),
63- "Total correctable memory errors for this csrow." ,
64- []string {"controller" , "csrow" }, nil ,
54+
55+ channelCECount : prometheus .NewDesc (
56+ prometheus .BuildFQName (namespace , edacSubsystem , "channel_correctable_errors_total" ),
57+ "Total correctable memory errors for this channel." ,
58+ []string {"controller" , "csrow" , "channel" , "dimm_label" },
59+ nil ,
6560 ),
66- csRowUECount : prometheus .NewDesc (
67- prometheus .BuildFQName (namespace , edacSubsystem , "csrow_uncorrectable_errors_total" ),
68- "Total uncorrectable memory errors for this csrow." ,
69- []string {"controller" , "csrow" }, nil ,
61+
62+ channelUECount : prometheus .NewDesc (
63+ prometheus .BuildFQName (namespace , edacSubsystem , "channel_uncorrectable_errors_total" ),
64+ "Total uncorrectable memory errors for this channel." ,
65+ []string {"controller" , "csrow" , "channel" , "dimm_label" },
66+ nil ,
7067 ),
68+
69+ dimmCECount : prometheus .NewDesc (
70+ prometheus .BuildFQName (namespace , edacSubsystem , "dimm_correctable_errors_total" ),
71+ "Total correctable memory errors for this dimm." ,
72+ []string {"controller" , "dimm" },
73+ nil ,
74+ ),
75+
76+ dimmUECount : prometheus .NewDesc (
77+ prometheus .BuildFQName (namespace , edacSubsystem , "dimm_uncorrectable_errors_total" ),
78+ "Total uncorrectable memory errors for this dimm." ,
79+ []string {"controller" , "dimm" },
80+ nil ,
81+ ),
82+
7183 logger : logger ,
84+
7285 }, nil
7386}
7487
7588func (c * edacCollector ) Update (ch chan <- prometheus.Metric ) error {
89+
7690 memControllers , err := filepath .Glob (sysFilePath ("devices/system/edac/mc/mc[0-9]*" ))
7791 if err != nil {
7892 return err
7993 }
94+
8095 for _ , controller := range memControllers {
96+
8197 controllerMatch := edacMemControllerRE .FindStringSubmatch (controller )
8298 if controllerMatch == nil {
8399 return fmt .Errorf ("controller string didn't match regexp: %s" , controller )
84100 }
101+
85102 controllerNumber := controllerMatch [1 ]
86103
87104 value , err := readUintFromFile (filepath .Join (controller , "ce_count" ))
88- if err != nil {
89- return fmt .Errorf ("couldn't get ce_count for controller %s: %w" , controllerNumber , err )
90- }
91- ch <- prometheus .MustNewConstMetric (
92- c .ceCount , prometheus .CounterValue , float64 (value ), controllerNumber )
93-
94- value , err = readUintFromFile (filepath .Join (controller , "ce_noinfo_count" ))
95- if err != nil {
96- return fmt .Errorf ("couldn't get ce_noinfo_count for controller %s: %w" , controllerNumber , err )
105+ if err == nil {
106+ ch <- prometheus .MustNewConstMetric (
107+ c .ceCount ,
108+ prometheus .CounterValue ,
109+ float64 (value ),
110+ controllerNumber ,
111+ )
97112 }
98- ch <- prometheus .MustNewConstMetric (
99- c .csRowCECount , prometheus .CounterValue , float64 (value ), controllerNumber , "unknown" )
100113
101114 value , err = readUintFromFile (filepath .Join (controller , "ue_count" ))
102- if err != nil {
103- return fmt .Errorf ("couldn't get ue_count for controller %s: %w" , controllerNumber , err )
115+ if err == nil {
116+ ch <- prometheus .MustNewConstMetric (
117+ c .ueCount ,
118+ prometheus .CounterValue ,
119+ float64 (value ),
120+ controllerNumber ,
121+ )
104122 }
105- ch <- prometheus .MustNewConstMetric (
106- c .ueCount , prometheus .CounterValue , float64 (value ), controllerNumber )
107123
108- value , err = readUintFromFile (filepath .Join (controller , "ue_noinfo_count" ))
109- if err != nil {
110- return fmt .Errorf ("couldn't get ue_noinfo_count for controller %s: %w" , controllerNumber , err )
111- }
112- ch <- prometheus .MustNewConstMetric (
113- c .csRowUECount , prometheus .CounterValue , float64 (value ), controllerNumber , "unknown" )
114124
115- // For each controller, walk the csrow directories.
116125 csrows , err := filepath .Glob (controller + "/csrow[0-9]*" )
126+
117127 if err != nil {
118128 return err
119129 }
130+
120131 for _ , csrow := range csrows {
121- csrowMatch := edacMemCsrowRE .FindStringSubmatch (csrow )
122- if csrowMatch == nil {
123- return fmt .Errorf ("csrow string didn't match regexp: %s" , csrow )
124- }
132+ csrowMatch := regexp .MustCompile (`csrow([0-9]+)` ).FindStringSubmatch (csrow )
125133 csrowNumber := csrowMatch [1 ]
126134
127- value , err = readUintFromFile ( filepath .Join (csrow , "ce_count" ) )
135+ channelFiles , err := filepath .Glob (csrow + "/ch*_ce_count" )
128136 if err != nil {
129- return fmt . Errorf ( "couldn't get ce_count for controller/csrow %s/%s: %w" , controllerNumber , csrowNumber , err )
137+ return err
130138 }
131- ch <- prometheus .MustNewConstMetric (
132- c .csRowCECount , prometheus .CounterValue , float64 (value ), controllerNumber , csrowNumber )
133139
134- value , err = readUintFromFile (filepath .Join (csrow , "ue_count" ))
135- if err != nil {
136- return fmt .Errorf ("couldn't get ue_count for controller/csrow %s/%s: %w" , controllerNumber , csrowNumber , err )
140+ for _ , chFile := range channelFiles {
141+
142+ base := filepath .Base (chFile )
143+
144+ match := regexp .MustCompile (`ch([0-9]+)_ce_count` ).FindStringSubmatch (base )
145+ if match == nil {
146+ continue
147+ }
148+
149+ channelNumber := match [1 ]
150+ label := "unknown"
151+ labelBytes , err := os .ReadFile (filepath .Join (csrow , "ch" + channelNumber + "_dimm_label" ))
152+ if err == nil {
153+ label = strings .TrimSpace (string (labelBytes ))
154+ // format label
155+ label = strings .ReplaceAll (label , "#" , "" )
156+ label = strings .ReplaceAll (label , "csrow" , "_csrow" )
157+ label = strings .ReplaceAll (label , "channel" , "_channel" )
158+ }
159+ value , err := readUintFromFile (chFile )
160+ if err == nil {
161+ ch <- prometheus .MustNewConstMetric (
162+ c .channelCECount ,
163+ prometheus .CounterValue ,
164+ float64 (value ),
165+ controllerNumber ,
166+ csrowNumber ,
167+ channelNumber ,
168+ label ,
169+ )
170+ }
171+
172+ value , err = readUintFromFile (filepath .Join (csrow , "ch" + channelNumber + "_ue_count" ))
173+ if err == nil {
174+ ch <- prometheus .MustNewConstMetric (
175+ c .channelUECount ,
176+ prometheus .CounterValue ,
177+ float64 (value ),
178+ controllerNumber ,
179+ csrowNumber ,
180+ channelNumber ,
181+ label ,
182+ )
183+ }
184+ }
185+ }
186+
187+
188+ dimms , err := filepath .Glob (controller + "/dimm[0-9]*" )
189+ if err != nil {
190+ return err
191+ }
192+
193+ for _ , dimm := range dimms {
194+
195+ dimmMatch := edacMemDimmRE .FindStringSubmatch (dimm )
196+ if dimmMatch == nil || len (dimmMatch ) < 2 {
197+ continue
198+ }
199+
200+ dimmNumber := dimmMatch [1 ]
201+
202+ value , err := readUintFromFile (filepath .Join (dimm , "dimm_ce_count" ))
203+ if err == nil {
204+ ch <- prometheus .MustNewConstMetric (
205+ c .dimmCECount ,
206+ prometheus .CounterValue ,
207+ float64 (value ),
208+ controllerNumber ,
209+ dimmNumber ,
210+ )
211+ }
212+
213+ value , err = readUintFromFile (filepath .Join (dimm , "dimm_ue_count" ))
214+ if err == nil {
215+ ch <- prometheus .MustNewConstMetric (
216+ c .dimmUECount ,
217+ prometheus .CounterValue ,
218+ float64 (value ),
219+ controllerNumber ,
220+ dimmNumber ,
221+ )
137222 }
138- ch <- prometheus .MustNewConstMetric (
139- c .csRowUECount , prometheus .CounterValue , float64 (value ), controllerNumber , csrowNumber )
140223 }
141224 }
142225
143- return err
226+ return nil
144227}
0 commit comments