Skip to content

Commit 9374632

Browse files
author
manogna_grandhi
committed
add error counts / dimm channel for edac collector
Signed-off-by: manogna_grandhi <grandhi.manogna@flipkart.com>
1 parent 9fd21e8 commit 9374632

1 file changed

Lines changed: 151 additions & 68 deletions

File tree

collector/edac_linux.go

Lines changed: 151 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,12 @@
1-
// Copyright 2015 The Prometheus Authors
2-
// Licensed under the Apache License, Version 2.0 (the "License");
3-
// you may not use this file except in compliance with the License.
4-
// You may obtain a copy of the License at
5-
//
6-
// http://www.apache.org/licenses/LICENSE-2.0
7-
//
8-
// Unless required by applicable law or agreed to in writing, software
9-
// distributed under the License is distributed on an "AS IS" BASIS,
10-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11-
// See the License for the specific language governing permissions and
12-
// limitations under the License.
13-
14-
//go:build !noedac
15-
161
package collector
172

183
import (
194
"fmt"
205
"log/slog"
6+
"os"
217
"path/filepath"
228
"regexp"
9+
"strings"
2310

2411
"github.com/prometheus/client_golang/prometheus"
2512
)
@@ -30,115 +17,211 @@ const (
3017

3118
var (
	// edacMemControllerRE captures the numeric index of a memory controller
	// directory path such as ".../devices/system/edac/mc/mc0".
	// At least one digit is required so the captured index (used as the
	// "controller" label value) can never be the empty string.
	edacMemControllerRE = regexp.MustCompile(`.*devices/system/edac/mc/mc([0-9]+)`)
	// edacMemDimmRE captures the numeric index of a dimm directory path such
	// as ".../devices/system/edac/mc/mc0/dimm3"; the index is used as the
	// "dimm" label value and must likewise be non-empty.
	edacMemDimmRE = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]+/dimm([0-9]+)`)
)
3522

3623
type edacCollector struct {
37-
ceCount *prometheus.Desc
38-
ueCount *prometheus.Desc
39-
csRowCECount *prometheus.Desc
40-
csRowUECount *prometheus.Desc
41-
logger *slog.Logger
24+
ceCount *prometheus.Desc
25+
ueCount *prometheus.Desc
26+
channelCECount *prometheus.Desc
27+
channelUECount *prometheus.Desc
28+
dimmCECount *prometheus.Desc
29+
dimmUECount *prometheus.Desc
30+
logger *slog.Logger
4231
}
4332

4433
func init() {
4534
registerCollector("edac", defaultEnabled, NewEdacCollector)
4635
}
4736

48-
// NewEdacCollector returns a new Collector exposing edac stats.
4937
func NewEdacCollector(logger *slog.Logger) (Collector, error) {
38+
5039
return &edacCollector{
40+
5141
ceCount: prometheus.NewDesc(
5242
prometheus.BuildFQName(namespace, edacSubsystem, "correctable_errors_total"),
5343
"Total correctable memory errors.",
54-
[]string{"controller"}, nil,
44+
[]string{"controller"},
45+
nil,
5546
),
47+
5648
ueCount: prometheus.NewDesc(
5749
prometheus.BuildFQName(namespace, edacSubsystem, "uncorrectable_errors_total"),
5850
"Total uncorrectable memory errors.",
59-
[]string{"controller"}, nil,
51+
[]string{"controller"},
52+
nil,
6053
),
61-
csRowCECount: prometheus.NewDesc(
62-
prometheus.BuildFQName(namespace, edacSubsystem, "csrow_correctable_errors_total"),
63-
"Total correctable memory errors for this csrow.",
64-
[]string{"controller", "csrow"}, nil,
54+
55+
channelCECount: prometheus.NewDesc(
56+
prometheus.BuildFQName(namespace, edacSubsystem, "channel_correctable_errors_total"),
57+
"Total correctable memory errors for this channel.",
58+
[]string{"controller", "csrow", "channel", "dimm_label"},
59+
nil,
6560
),
66-
csRowUECount: prometheus.NewDesc(
67-
prometheus.BuildFQName(namespace, edacSubsystem, "csrow_uncorrectable_errors_total"),
68-
"Total uncorrectable memory errors for this csrow.",
69-
[]string{"controller", "csrow"}, nil,
61+
62+
channelUECount: prometheus.NewDesc(
63+
prometheus.BuildFQName(namespace, edacSubsystem, "channel_uncorrectable_errors_total"),
64+
"Total uncorrectable memory errors for this channel.",
65+
[]string{"controller", "csrow", "channel", "dimm_label"},
66+
nil,
7067
),
68+
69+
dimmCECount: prometheus.NewDesc(
70+
prometheus.BuildFQName(namespace, edacSubsystem, "dimm_correctable_errors_total"),
71+
"Total correctable memory errors for this dimm.",
72+
[]string{"controller", "dimm"},
73+
nil,
74+
),
75+
76+
dimmUECount: prometheus.NewDesc(
77+
prometheus.BuildFQName(namespace, edacSubsystem, "dimm_uncorrectable_errors_total"),
78+
"Total uncorrectable memory errors for this dimm.",
79+
[]string{"controller", "dimm"},
80+
nil,
81+
),
82+
7183
logger: logger,
84+
7285
}, nil
7386
}
7487

7588
func (c *edacCollector) Update(ch chan<- prometheus.Metric) error {
89+
7690
memControllers, err := filepath.Glob(sysFilePath("devices/system/edac/mc/mc[0-9]*"))
7791
if err != nil {
7892
return err
7993
}
94+
8095
for _, controller := range memControllers {
96+
8197
controllerMatch := edacMemControllerRE.FindStringSubmatch(controller)
8298
if controllerMatch == nil {
8399
return fmt.Errorf("controller string didn't match regexp: %s", controller)
84100
}
101+
85102
controllerNumber := controllerMatch[1]
86103

87104
value, err := readUintFromFile(filepath.Join(controller, "ce_count"))
88-
if err != nil {
89-
return fmt.Errorf("couldn't get ce_count for controller %s: %w", controllerNumber, err)
90-
}
91-
ch <- prometheus.MustNewConstMetric(
92-
c.ceCount, prometheus.CounterValue, float64(value), controllerNumber)
93-
94-
value, err = readUintFromFile(filepath.Join(controller, "ce_noinfo_count"))
95-
if err != nil {
96-
return fmt.Errorf("couldn't get ce_noinfo_count for controller %s: %w", controllerNumber, err)
105+
if err == nil {
106+
ch <- prometheus.MustNewConstMetric(
107+
c.ceCount,
108+
prometheus.CounterValue,
109+
float64(value),
110+
controllerNumber,
111+
)
97112
}
98-
ch <- prometheus.MustNewConstMetric(
99-
c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown")
100113

101114
value, err = readUintFromFile(filepath.Join(controller, "ue_count"))
102-
if err != nil {
103-
return fmt.Errorf("couldn't get ue_count for controller %s: %w", controllerNumber, err)
115+
if err == nil {
116+
ch <- prometheus.MustNewConstMetric(
117+
c.ueCount,
118+
prometheus.CounterValue,
119+
float64(value),
120+
controllerNumber,
121+
)
104122
}
105-
ch <- prometheus.MustNewConstMetric(
106-
c.ueCount, prometheus.CounterValue, float64(value), controllerNumber)
107123

108-
value, err = readUintFromFile(filepath.Join(controller, "ue_noinfo_count"))
109-
if err != nil {
110-
return fmt.Errorf("couldn't get ue_noinfo_count for controller %s: %w", controllerNumber, err)
111-
}
112-
ch <- prometheus.MustNewConstMetric(
113-
c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown")
114124

115-
// For each controller, walk the csrow directories.
116125
csrows, err := filepath.Glob(controller + "/csrow[0-9]*")
126+
117127
if err != nil {
118128
return err
119129
}
130+
120131
for _, csrow := range csrows {
121-
csrowMatch := edacMemCsrowRE.FindStringSubmatch(csrow)
122-
if csrowMatch == nil {
123-
return fmt.Errorf("csrow string didn't match regexp: %s", csrow)
124-
}
132+
csrowMatch := regexp.MustCompile(`csrow([0-9]+)`).FindStringSubmatch(csrow)
125133
csrowNumber := csrowMatch[1]
126134

127-
value, err = readUintFromFile(filepath.Join(csrow, "ce_count"))
135+
channelFiles, err := filepath.Glob(csrow + "/ch*_ce_count")
128136
if err != nil {
129-
return fmt.Errorf("couldn't get ce_count for controller/csrow %s/%s: %w", controllerNumber, csrowNumber, err)
137+
return err
130138
}
131-
ch <- prometheus.MustNewConstMetric(
132-
c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber)
133139

134-
value, err = readUintFromFile(filepath.Join(csrow, "ue_count"))
135-
if err != nil {
136-
return fmt.Errorf("couldn't get ue_count for controller/csrow %s/%s: %w", controllerNumber, csrowNumber, err)
140+
for _, chFile := range channelFiles {
141+
142+
base := filepath.Base(chFile)
143+
144+
match := regexp.MustCompile(`ch([0-9]+)_ce_count`).FindStringSubmatch(base)
145+
if match == nil {
146+
continue
147+
}
148+
149+
channelNumber := match[1]
150+
label := "unknown"
151+
labelBytes, err := os.ReadFile(filepath.Join(csrow, "ch"+channelNumber+"_dimm_label"))
152+
if err == nil {
153+
label = strings.TrimSpace(string(labelBytes))
154+
// format label
155+
label = strings.ReplaceAll(label, "#", "")
156+
label = strings.ReplaceAll(label, "csrow", "_csrow")
157+
label = strings.ReplaceAll(label, "channel", "_channel")
158+
}
159+
value, err := readUintFromFile(chFile)
160+
if err == nil {
161+
ch <- prometheus.MustNewConstMetric(
162+
c.channelCECount,
163+
prometheus.CounterValue,
164+
float64(value),
165+
controllerNumber,
166+
csrowNumber,
167+
channelNumber,
168+
label,
169+
)
170+
}
171+
172+
value, err = readUintFromFile(filepath.Join(csrow, "ch"+channelNumber+"_ue_count"))
173+
if err == nil {
174+
ch <- prometheus.MustNewConstMetric(
175+
c.channelUECount,
176+
prometheus.CounterValue,
177+
float64(value),
178+
controllerNumber,
179+
csrowNumber,
180+
channelNumber,
181+
label,
182+
)
183+
}
184+
}
185+
}
186+
187+
188+
dimms, err := filepath.Glob(controller + "/dimm[0-9]*")
189+
if err != nil {
190+
return err
191+
}
192+
193+
for _, dimm := range dimms {
194+
195+
dimmMatch := edacMemDimmRE.FindStringSubmatch(dimm)
196+
if dimmMatch == nil || len(dimmMatch) < 2 {
197+
continue
198+
}
199+
200+
dimmNumber := dimmMatch[1]
201+
202+
value, err := readUintFromFile(filepath.Join(dimm, "dimm_ce_count"))
203+
if err == nil {
204+
ch <- prometheus.MustNewConstMetric(
205+
c.dimmCECount,
206+
prometheus.CounterValue,
207+
float64(value),
208+
controllerNumber,
209+
dimmNumber,
210+
)
211+
}
212+
213+
value, err = readUintFromFile(filepath.Join(dimm, "dimm_ue_count"))
214+
if err == nil {
215+
ch <- prometheus.MustNewConstMetric(
216+
c.dimmUECount,
217+
prometheus.CounterValue,
218+
float64(value),
219+
controllerNumber,
220+
dimmNumber,
221+
)
137222
}
138-
ch <- prometheus.MustNewConstMetric(
139-
c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber)
140223
}
141224
}
142225

143-
return err
226+
return nil
144227
}

0 commit comments

Comments
 (0)