Skip to content

Commit 5b7221f

Browse files
nmgastonrsdmike
andauthored
feat: add KVM performance timing metrics and monitoring (#761)
* feat: add KVM performance timing metrics and monitoring - Add histograms to track KVM connection setup timing: - Device lookup time from database - TCP connection establishment time - WebSocket upgrade duration - Total connection time - Consent code wait time - API request durations - Add metrics for KVM data flow performance: - Device-to-browser and browser-to-device write/send durations - Payload size distributions - Receive/read block times - Create recording functions for all metrics - Add detailed timing logs with KVM_TIMING prefix - Update tests to handle new timing metrics * refactor: replace manual REST endpoint instrumentation with go-gin-prometheus Replace manual timing instrumentation in individual REST endpoints with go-gin-prometheus middleware for automatic metrics coverage across all HTTP endpoints. Changes: - Add go-gin-prometheus dependency to provide standard HTTP metrics - Integrate Prometheus middleware in router for automatic instrumentation - Remove manual timing code from features, KVM displays, and power endpoints - Remove unused kvmAPIRequestSeconds metric and RecordAPIRequest function - Clean up unused time and devices package imports Benefits: - Automatic metrics for ALL REST endpoints (not just 3) - Standard HTTP metrics (duration, status codes, request/response sizes) - Less repetitive code and improved maintainability - Keeps all custom KVM connection and data flow metrics Addresses reviewer feedback in PR #761 --------- Co-authored-by: Mike <michael.johanson@intel.com>
1 parent 8efa62a commit 5b7221f

7 files changed

Lines changed: 142 additions & 8 deletions

File tree

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ require (
5959
github.com/ryanuber/go-glob v1.0.0 // indirect
6060
github.com/woodsbury/decimal128 v1.4.0 // indirect
6161
github.com/zalando/go-keyring v0.2.6 // indirect
62+
github.com/zsais/go-gin-prometheus v1.0.2 // indirect
6263
go.yaml.in/yaml/v2 v2.4.2 // indirect
6364
golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect
6465
golang.org/x/oauth2 v0.30.0 // indirect

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,8 @@ github.com/woodsbury/decimal128 v1.4.0 h1:xJATj7lLu4f2oObouMt2tgGiElE5gO6mSWUjQs
269269
github.com/woodsbury/decimal128 v1.4.0/go.mod h1:BP46FUrVjVhdTbKT+XuQh2xfQaGki9LMIRJSFuh6THU=
270270
github.com/zalando/go-keyring v0.2.6 h1:r7Yc3+H+Ux0+M72zacZoItR3UDxeWfKTcabvkI8ua9s=
271271
github.com/zalando/go-keyring v0.2.6/go.mod h1:2TCrxYrbUNYfNS/Kgy/LSrkSQzZ5UPVH85RwfczwvcI=
272+
github.com/zsais/go-gin-prometheus v1.0.2 h1:3asLqrFltMdItpgr/OS4hYc8pLq3HzMa5T1gYuXBIZ0=
273+
github.com/zsais/go-gin-prometheus v1.0.2/go.mod h1:iKBYSOHzvGfe2FyGSOC8JSwUA0MITdnYzI6v+aAbw1Q=
272274
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
273275
go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
274276
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus=

internal/controller/httpapi/router.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66

77
"github.com/gin-gonic/gin"
88
"github.com/prometheus/client_golang/prometheus/promhttp"
9+
ginprometheus "github.com/zsais/go-gin-prometheus"
910

1011
"github.com/device-management-toolkit/console/config"
1112
v1 "github.com/device-management-toolkit/console/internal/controller/httpapi/v1"
@@ -21,6 +22,13 @@ func NewRouter(handler *gin.Engine, l logger.Interface, t usecase.Usecases, cfg
2122
handler.Use(gin.Logger())
2223
handler.Use(gin.Recovery())
2324

25+
// Add Prometheus middleware for automatic HTTP metrics
26+
// Don't automatically register /metrics endpoint - we have our own
27+
p := ginprometheus.NewPrometheus("gin")
28+
p.MetricsPath = ""
29+
// Use middleware function directly without calling Use() which would register conflicting routes
30+
handler.Use(p.HandlerFunc())
31+
2432
// Initialize Fuego adapter
2533
fuegoAdapter := openapi.NewFuegoAdapter(t, l)
2634
fuegoAdapter.RegisterRoutes()

internal/controller/ws/v1/redirect.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package v1
33
import (
44
"compress/flate"
55
"net/http"
6+
"time"
67

78
"github.com/gin-gonic/gin"
89
"github.com/golang-jwt/jwt/v5"
@@ -59,7 +60,13 @@ func (r *RedirectRoutes) websocketHandler(c *gin.Context) {
5960
upgrader.Subprotocols = []string{tokenString}
6061
}
6162

63+
// KVM_TIMING: Measure WebSocket upgrade duration
64+
upgradeStart := time.Now()
6265
conn, err := r.u.Upgrade(c.Writer, c.Request, nil)
66+
upgradeDuration := time.Since(upgradeStart)
67+
devices.RecordWebsocketUpgrade(upgradeDuration)
68+
r.l.Debug("KVM_TIMING: WebSocket upgrade", "duration_ms", upgradeDuration.Milliseconds())
69+
6370
if err != nil {
6471
http.Error(c.Writer, "Could not open websocket connection", http.StatusInternalServerError)
6572

@@ -77,7 +84,13 @@ func (r *RedirectRoutes) websocketHandler(c *gin.Context) {
7784

7885
r.l.Info("Websocket connection opened")
7986

87+
// KVM_TIMING: Measure total connection time
88+
totalStart := time.Now()
8089
err = r.d.Redirect(c, conn, c.Query("host"), c.Query("mode"))
90+
totalDuration := time.Since(totalStart)
91+
devices.RecordTotalConnection(totalDuration, c.Query("mode"))
92+
r.l.Debug("KVM_TIMING: Total connection time", "duration_ms", totalDuration.Milliseconds(), "mode", c.Query("mode"))
93+
8194
if err != nil {
8295
r.l.Error(err, "http - devices - v1 - redirect")
8396
errorResponse(c, http.StatusInternalServerError, "redirect failed")

internal/controller/ws/v1/redirect_test.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,21 +65,26 @@ func TestWebSocketHandler(t *testing.T) { //nolint:paralleltest // logging libra
6565
Upgrade(gomock.Any(), gomock.Any(), nil).
6666
Return(nil, tc.upgraderError)
6767
mockLogger.EXPECT().Debug("failed to cast Upgrader to *websocket.Upgrader")
68+
mockLogger.EXPECT().Debug("KVM_TIMING: WebSocket upgrade", "duration_ms", gomock.Any())
6869
} else {
6970
mockUpgrader.EXPECT().
7071
Upgrade(gomock.Any(), gomock.Any(), nil).
7172
Return(&websocket.Conn{}, nil)
7273

7374
mockLogger.EXPECT().Debug("failed to cast Upgrader to *websocket.Upgrader")
75+
mockLogger.EXPECT().Debug("KVM_TIMING: WebSocket upgrade", "duration_ms", gomock.Any())
7476
mockLogger.EXPECT().Info("Websocket connection opened")
7577

76-
if tc.redirectError != nil {
77-
mockLogger.EXPECT().Error(tc.redirectError, "http - devices - v1 - redirect")
78-
}
79-
8078
mockFeature.EXPECT().
8179
Redirect(gomock.Any(), gomock.Any(), "someHost", "someMode").
8280
Return(tc.redirectError)
81+
82+
// Total connection time is always logged after Redirect completes
83+
mockLogger.EXPECT().Debug("KVM_TIMING: Total connection time", "duration_ms", gomock.Any(), "mode", "someMode")
84+
85+
if tc.redirectError != nil {
86+
mockLogger.EXPECT().Error(tc.redirectError, "http - devices - v1 - redirect")
87+
}
8388
}
8489

8590
r := gin.Default()

internal/usecase/devices/interceptor.go

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ const (
3030
ConnectionTimeout = 5 * time.Minute
3131
InactivityTimeout = 30 * time.Second // Close connection if no data for 30 seconds
3232
HeartbeatInterval = 30 * time.Second // Check connection health every 30 seconds
33+
SlowReceiveThreshold = 100 // Milliseconds threshold for slow device receive
34+
SlowWriteThreshold = 50 // Milliseconds threshold for slow write operations
3335
)
3436

3537
type DeviceConnection struct {
@@ -48,7 +50,13 @@ type DeviceConnection struct {
4850
}
4951

5052
func (uc *UseCase) Redirect(c context.Context, conn *websocket.Conn, guid, mode string) error {
53+
// KVM_TIMING: Measure device lookup latency
54+
lookupStart := time.Now()
5155
device, err := uc.repo.GetByID(c, guid, "")
56+
57+
RecordDeviceLookup(time.Since(lookupStart))
58+
uc.log.Debug("KVM_TIMING: Device lookup", "duration_ms", time.Since(lookupStart).Milliseconds(), "guid", guid)
59+
5260
if err != nil {
5361
return err
5462
}
@@ -64,7 +72,13 @@ func (uc *UseCase) Redirect(c context.Context, conn *websocket.Conn, guid, mode
6472
return err
6573
}
6674

75+
// KVM_TIMING: Measure connection setup latency
76+
connectStart := time.Now()
6777
err = uc.redirection.RedirectConnect(c, deviceConnection)
78+
79+
RecordConnectionSetup(time.Since(connectStart), mode)
80+
uc.log.Debug("KVM_TIMING: Connection setup", "duration_ms", time.Since(connectStart).Milliseconds(), "mode", mode, "guid", guid)
81+
6882
if err != nil {
6983
deviceConnection.cancel()
7084

@@ -220,7 +234,12 @@ func (uc *UseCase) ListenToDevice(deviceConnection *DeviceConnection) {
220234
// Measure time blocked waiting for device data
221235
recvStart := time.Now()
222236
data, err := uc.redirection.RedirectListen(deviceConnection.ctx, deviceConnection)
223-
kvmDeviceReceiveBlockSeconds.WithLabelValues(deviceConnection.Mode).Observe(time.Since(recvStart).Seconds())
237+
recvDuration := time.Since(recvStart)
238+
kvmDeviceReceiveBlockSeconds.WithLabelValues(deviceConnection.Mode).Observe(recvDuration.Seconds())
239+
240+
if recvDuration.Milliseconds() > SlowReceiveThreshold {
241+
uc.log.Debug("KVM_TIMING: Device receive blocked", "duration_ms", recvDuration.Milliseconds(), "mode", deviceConnection.Mode)
242+
}
224243

225244
if err != nil {
226245
break
@@ -249,7 +268,12 @@ func (uc *UseCase) ListenToDevice(deviceConnection *DeviceConnection) {
249268

250269
err = conn.WriteMessage(websocket.BinaryMessage, toSend)
251270

252-
kvmDeviceToBrowserWriteSeconds.WithLabelValues(deviceConnection.Mode).Observe(time.Since(start).Seconds())
271+
writeDuration := time.Since(start)
272+
kvmDeviceToBrowserWriteSeconds.WithLabelValues(deviceConnection.Mode).Observe(writeDuration.Seconds())
273+
274+
if writeDuration.Milliseconds() > SlowWriteThreshold {
275+
uc.log.Debug("KVM_TIMING: Device to browser write slow", "duration_ms", writeDuration.Milliseconds(), "mode", deviceConnection.Mode, "bytes", len(toSend))
276+
}
253277

254278
if err != nil {
255279
if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) {
@@ -281,7 +305,12 @@ func (uc *UseCase) ListenToBrowser(deviceConnection *DeviceConnection) {
281305

282306
readStart := time.Now()
283307
_, msg, err := deviceConnection.Conn.ReadMessage()
284-
kvmBrowserReadBlockSeconds.WithLabelValues(deviceConnection.Mode).Observe(time.Since(readStart).Seconds())
308+
readDuration := time.Since(readStart)
309+
kvmBrowserReadBlockSeconds.WithLabelValues(deviceConnection.Mode).Observe(readDuration.Seconds())
310+
311+
if readDuration.Milliseconds() > SlowReceiveThreshold {
312+
uc.log.Debug("KVM_TIMING: Browser read blocked", "duration_ms", readDuration.Milliseconds(), "mode", deviceConnection.Mode)
313+
}
285314

286315
if err != nil {
287316
if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) {
@@ -308,7 +337,12 @@ func (uc *UseCase) ListenToBrowser(deviceConnection *DeviceConnection) {
308337
kvmBrowserToDeviceMessages.WithLabelValues(deviceConnection.Mode).Inc()
309338
// Send the message to the TCP Connection on the device
310339
err = uc.redirection.RedirectSend(deviceConnection.ctx, deviceConnection, toSend)
311-
kvmBrowserToDeviceSendSeconds.WithLabelValues(deviceConnection.Mode).Observe(time.Since(start).Seconds())
340+
sendDuration := time.Since(start)
341+
kvmBrowserToDeviceSendSeconds.WithLabelValues(deviceConnection.Mode).Observe(sendDuration.Seconds())
342+
343+
if sendDuration.Milliseconds() > SlowWriteThreshold {
344+
uc.log.Debug("KVM_TIMING: Browser to device send slow", "duration_ms", sendDuration.Milliseconds(), "mode", deviceConnection.Mode, "bytes", len(toSend))
345+
}
312346

313347
if err != nil {
314348
_ = fmt.Errorf("interceptor - listenToBrowser - error sending message to device: %w", err)

internal/usecase/devices/metrics.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package devices
22

33
import (
4+
"time"
5+
46
"github.com/prometheus/client_golang/prometheus"
57
"github.com/prometheus/client_golang/prometheus/promauto"
68
)
@@ -92,4 +94,73 @@ var (
9294
},
9395
[]string{"mode"},
9496
)
97+
98+
// KVM Connection Performance Metrics.
99+
kvmDeviceLookupSeconds = promauto.NewHistogram(
100+
prometheus.HistogramOpts{
101+
Name: "kvm_device_lookup_seconds",
102+
Help: "Time to look up device from database during KVM connection (KVM_TIMING)",
103+
Buckets: []float64{0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2},
104+
},
105+
)
106+
107+
kvmConnectionSetupSeconds = promauto.NewHistogramVec(
108+
prometheus.HistogramOpts{
109+
Name: "kvm_connection_setup_seconds",
110+
Help: "Time to establish TCP connection to device during KVM setup (KVM_TIMING)",
111+
Buckets: []float64{0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10},
112+
},
113+
[]string{"mode"},
114+
)
115+
116+
kvmWebsocketUpgradeSeconds = promauto.NewHistogram(
117+
prometheus.HistogramOpts{
118+
Name: "kvm_websocket_upgrade_seconds",
119+
Help: "Time to upgrade HTTP connection to WebSocket for KVM (KVM_TIMING)",
120+
Buckets: []float64{0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1},
121+
},
122+
)
123+
124+
kvmTotalConnectionSeconds = promauto.NewHistogramVec(
125+
prometheus.HistogramOpts{
126+
Name: "kvm_total_connection_seconds",
127+
Help: "Total time from request to ready KVM connection (KVM_TIMING)",
128+
Buckets: []float64{0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 30},
129+
},
130+
[]string{"mode"},
131+
)
132+
133+
kvmConsentCodeWaitSeconds = promauto.NewHistogramVec(
134+
prometheus.HistogramOpts{
135+
Name: "kvm_consent_code_wait_seconds",
136+
Help: "Time spent waiting for consent code handling during KVM setup (KVM_TIMING)",
137+
Buckets: []float64{0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 30, 60},
138+
},
139+
[]string{"mode"},
140+
)
95141
)
142+
143+
// RecordWebsocketUpgrade records the WebSocket upgrade duration metric.
144+
func RecordWebsocketUpgrade(duration time.Duration) {
145+
kvmWebsocketUpgradeSeconds.Observe(duration.Seconds())
146+
}
147+
148+
// RecordTotalConnection records the total KVM connection time metric.
149+
func RecordTotalConnection(duration time.Duration, mode string) {
150+
kvmTotalConnectionSeconds.WithLabelValues(mode).Observe(duration.Seconds())
151+
}
152+
153+
// RecordConsentCodeWait records the consent code wait time metric.
154+
func RecordConsentCodeWait(duration time.Duration, mode string) {
155+
kvmConsentCodeWaitSeconds.WithLabelValues(mode).Observe(duration.Seconds())
156+
}
157+
158+
// RecordDeviceLookup records the device lookup duration metric.
159+
func RecordDeviceLookup(duration time.Duration) {
160+
kvmDeviceLookupSeconds.Observe(duration.Seconds())
161+
}
162+
163+
// RecordConnectionSetup records the TCP connection setup duration metric.
164+
func RecordConnectionSetup(duration time.Duration, mode string) {
165+
kvmConnectionSetupSeconds.WithLabelValues(mode).Observe(duration.Seconds())
166+
}

0 commit comments

Comments
 (0)