Skip to content

Commit 228ac3e

Browse files
committed
Prometheus metrics scraper
apply deislabs#54
1 parent bd2bd73 commit 228ac3e

53 files changed

Lines changed: 6576 additions & 3147 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.golangci.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ linters:
1414
- goimports
1515
- golint
1616
- govet
17-
- lll
17+
# - lll
1818
- maligned
1919
- misspell
2020
- nakedret

README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ The following table lists the supported annotations for Kubernetes `Deployments`
184184
| `osiris.dm.gg/enabled` | Enable the zeroscaler component to scrape and analyze metrics from the deployment's or statefulSet's pods and scale the deployment/statefulSet to zero when idle. Allowed values: `y`, `yes`, `true`, `on`, `1`. | _no value_ (= disabled) |
185185
| `osiris.dm.gg/minReplicas` | The minimum number of replicas to set on the deployment/statefulSet when Osiris will scale up. If you set `2`, Osiris will scale the deployment/statefulSet from `0` to `2` replicas directly. Osiris won't collect metrics from deployments/statefulSets which have more than `minReplicas` replicas - to avoid useless collections of metrics. | `1` |
186186
| `osiris.dm.gg/metricsCheckInterval` | The interval at which Osiris repeatedly checks the pod HTTP request metrics. The value is the number of seconds of the interval. Note that this value overrides the global value defined by the `zeroscaler.metricsCheckInterval` Helm value. | _value of the `zeroscaler.metricsCheckInterval` Helm value_ |
187+
| `osiris.dm.gg/metricsCollector` | Configure the collection of metrics for a pod. The value is a JSON object with at least a `type` string, and an optional `implementation` object. See the *Metrics Scraping Configuration* section for more details. | `{ "type": "osiris" }` |
187188

188189
#### Pod Annotations
189190

@@ -210,6 +211,40 @@ The following table lists the supported annotations for Kubernetes `Services` an
210211

211212
Note that you might see an `osiris.dm.gg/selector` annotation - this is for internal use only, and you shouldn't try to set/update or delete it.
212213

214+
#### Metrics Scraping Configuration
215+
216+
By default, metrics are scraped from the pods automatically, using the sidecar container provided by Osiris. If you don't want to use the auto-injected sidecar container, you can instead configure a custom metrics scraper, using the `osiris.dm.gg/metricsCollector` annotation on your deployment/statefulSet.
217+
218+
The following scrapers are supported:
219+
220+
**osiris**
221+
222+
This is the default scraper, which doesn't need any configuration.
223+
224+
**prometheus**
225+
226+
The prometheus scraper retrieves the request count from your own Prometheus endpoint. To use it, your application needs to expose an endpoint with metrics in the Prometheus format.
227+
You can then set the following annotation:
228+
229+
```
230+
annotations:
231+
osiris.dm.gg/metricsCollector: |
232+
{
233+
"type": "prometheus",
234+
"implementation": {
235+
"port": 8080,
236+
"path": "/metrics",
237+
"requestCountMetricName": "requests"
238+
}
239+
}
240+
```
241+
242+
The schema of the prometheus implementation configuration is:
243+
- a mandatory `port` integer
244+
- an optional `path` string - defaults to `/metrics` if not set
245+
- a mandatory `requestCountMetricName` string, for the name of the metric that exposes the number of requests
246+
- an optional `requestCountMetricLabels` object, listing the labels (name/value pairs) that the request count metric must match
247+
213248
### Demo
214249

215250
Deploy the example application `hello-osiris` :

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ require (
1111
github.com/hashicorp/golang-lru v0.5.4 // indirect
1212
github.com/imdario/mergo v0.3.11 // indirect
1313
github.com/kelseyhightower/envconfig v1.4.0
14+
github.com/prometheus/client_model v0.2.0
15+
github.com/prometheus/common v0.10.0
1416
github.com/satori/go.uuid v1.2.0
1517
github.com/stretchr/testify v1.5.1
1618
golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a // indirect

go.sum

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,7 @@ github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaO
395395
github.com/mattn/go-isatty v0.0.4/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4=
396396
github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
397397
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
398+
github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 h1:I0XW9+e1XWDxdcEniV4rQAIOPUGDq67JSCiRCgGCZLI=
398399
github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4=
399400
github.com/mholt/certmagic v0.6.2-0.20190624175158-6a42ef9fe8c2/go.mod h1:g4cOPxcjV0oFq3qwpjSA30LReKD8AoIfwAY9VvG35NY=
400401
github.com/miekg/dns v1.1.3/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg=
@@ -461,10 +462,12 @@ github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP
461462
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
462463
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
463464
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
465+
github.com/prometheus/client_model v0.2.0 h1:uq5h0d+GuxiXLJLNABMgp2qUWDPiLvgCzz2dUR+/W/M=
464466
github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
465467
github.com/prometheus/common v0.0.0-20181113130724-41aa239b4cce/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
466468
github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
467469
github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
470+
github.com/prometheus/common v0.10.0 h1:RyRA7RzGXQZiW+tGMr7sxa85G1z0yOpM1qq5c8lNawc=
468471
github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo=
469472
github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
470473
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=

pkg/deployments/zeroscaler/metrics_collector.go

Lines changed: 52 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@ import (
44
"context"
55
"encoding/json"
66
"fmt"
7-
"io/ioutil"
8-
"net/http"
97
"sync"
108
"time"
119

@@ -18,56 +16,46 @@ import (
1816
"k8s.io/client-go/tools/cache"
1917

2018
k8s "github.com/dailymotion/osiris/pkg/kubernetes"
21-
"github.com/dailymotion/osiris/pkg/metrics"
2219
)
2320

24-
const (
25-
proxyContainerName = "osiris-proxy"
26-
proxyPortName = "osiris-metrics"
27-
)
28-
29-
type metricsCollector struct {
30-
kubeClient kubernetes.Interface
21+
type metricsCollectorConfig struct {
3122
appKind string
3223
appName string
3324
appNamespace string
3425
selector labels.Selector
3526
metricsCheckInterval time.Duration
36-
podsInformer cache.SharedIndexInformer
37-
appPods map[string]*corev1.Pod
38-
appPodsLock sync.Mutex
39-
httpClient *http.Client
40-
cancelFunc func()
27+
scraperConfig metricsScraperConfig
28+
}
29+
30+
type metricsCollector struct {
31+
config metricsCollectorConfig
32+
scraper metricsScraper
33+
kubeClient kubernetes.Interface
34+
podsInformer cache.SharedIndexInformer
35+
appPods map[string]*corev1.Pod
36+
appPodsLock sync.Mutex
37+
cancelFunc func()
4138
}
4239

4340
func newMetricsCollector(
4441
kubeClient kubernetes.Interface,
45-
appKind string,
46-
appName string,
47-
appNamespace string,
48-
selector labels.Selector,
49-
metricsCheckInterval time.Duration,
50-
) *metricsCollector {
42+
config metricsCollectorConfig,
43+
) (*metricsCollector, error) {
44+
s, err := newMetricsScraper(config.scraperConfig)
45+
if err != nil {
46+
return nil, err
47+
}
5148
m := &metricsCollector{
52-
kubeClient: kubeClient,
53-
appKind: appKind,
54-
appName: appName,
55-
appNamespace: appNamespace,
56-
selector: selector,
57-
metricsCheckInterval: metricsCheckInterval,
49+
config: config,
50+
scraper: s,
51+
kubeClient: kubeClient,
5852
podsInformer: k8s.PodsIndexInformer(
5953
kubeClient,
60-
appNamespace,
54+
config.appNamespace,
6155
nil,
62-
selector,
56+
config.selector,
6357
),
6458
appPods: map[string]*corev1.Pod{},
65-
// A very aggressive timeout. When collecting metrics, we want to do it very
66-
// quickly to minimize the possibility that some pods we've checked on have
67-
// served requests while we've been checking on OTHER pods.
68-
httpClient: &http.Client{
69-
Timeout: 2 * time.Second,
70-
},
7159
}
7260
m.podsInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
7361
AddFunc: m.syncAppPod,
@@ -76,7 +64,7 @@ func newMetricsCollector(
7664
},
7765
DeleteFunc: m.syncDeletedAppPod,
7866
})
79-
return m
67+
return m, nil
8068
}
8169

8270
func (m *metricsCollector) run(ctx context.Context) {
@@ -86,16 +74,16 @@ func (m *metricsCollector) run(ctx context.Context) {
8674
<-ctx.Done()
8775
glog.Infof(
8876
"Stopping metrics collection for %s %s in namespace %s",
89-
m.appKind,
90-
m.appName,
91-
m.appNamespace,
77+
m.config.appKind,
78+
m.config.appName,
79+
m.config.appNamespace,
9280
)
9381
}()
9482
glog.Infof(
9583
"Starting metrics collection for %s %s in namespace %s",
96-
m.appKind,
97-
m.appName,
98-
m.appNamespace,
84+
m.config.appKind,
85+
m.config.appName,
86+
m.config.appNamespace,
9987
)
10088
go m.podsInformer.Run(ctx.Done())
10189
// When this exits, the cancel func will stop the informer
@@ -123,7 +111,7 @@ func (m *metricsCollector) syncDeletedAppPod(obj interface{}) {
123111
func (m *metricsCollector) collectMetrics(ctx context.Context) {
124112
requestCountsByProxy := map[string]uint64{}
125113
var lastTotalRequestCount uint64
126-
ticker := time.NewTicker(m.metricsCheckInterval)
114+
ticker := time.NewTicker(m.config.metricsCheckInterval)
127115
defer ticker.Stop()
128116
for {
129117
select {
@@ -135,26 +123,17 @@ func (m *metricsCollector) collectMetrics(ctx context.Context) {
135123
// all.
136124
timer := time.NewTimer(3 * time.Second)
137125
for _, pod := range m.appPods {
138-
podMetricsPort, ok := getMetricsPort(pod)
139-
if !ok {
140-
mustNotDecide = true
141-
continue
142-
}
143-
url := fmt.Sprintf(
144-
"http://%s:%d/metrics",
145-
pod.Status.PodIP,
146-
podMetricsPort,
147-
)
148126
scrapeWG.Add(1)
149-
go func() {
127+
go func(pod *corev1.Pod) {
150128
defer scrapeWG.Done()
151129
// Get the results
152-
prc, ok := m.scrape(url)
153-
if !ok {
130+
prc := m.scraper.Scrap(pod)
131+
if prc == nil {
154132
mustNotDecide = true
133+
} else {
134+
requestCountsByProxy[prc.ProxyID] = prc.RequestCount
155135
}
156-
requestCountsByProxy[prc.ProxyID] = prc.RequestCount
157-
}()
136+
}(pod)
158137
}
159138
m.appPodsLock.Unlock()
160139
scrapeWG.Wait()
@@ -180,65 +159,12 @@ func (m *metricsCollector) collectMetrics(ctx context.Context) {
180159
}
181160
}
182161

183-
func getMetricsPort(pod *corev1.Pod) (int32, bool) {
184-
for _, c := range pod.Spec.Containers {
185-
if c.Name == proxyContainerName && len(c.Ports) > 0 {
186-
for _, port := range c.Ports {
187-
if port.Name == proxyPortName {
188-
return port.ContainerPort, true
189-
}
190-
}
191-
}
192-
}
193-
return 0, false
194-
}
195-
196-
func (m *metricsCollector) scrape(
197-
target string,
198-
) (metrics.ProxyRequestCount, bool) {
199-
prc := metrics.ProxyRequestCount{}
200-
// Requests made with this client time out after 2 seconds
201-
resp, err := m.httpClient.Get(target)
202-
if err != nil {
203-
glog.Errorf("Error requesting metrics from %s: %s", target, err)
204-
return prc, false
205-
}
206-
defer resp.Body.Close()
207-
if resp.StatusCode != 200 {
208-
glog.Errorf(
209-
"Received unexpected HTTP response code %d when requesting metrics "+
210-
"from %s",
211-
resp.StatusCode,
212-
target,
213-
)
214-
return prc, false
215-
}
216-
bodyBytes, err := ioutil.ReadAll(resp.Body)
217-
if err != nil {
218-
glog.Errorf(
219-
"Error reading metrics request response from %s: %s",
220-
target,
221-
err,
222-
)
223-
return prc, false
224-
}
225-
if err := json.Unmarshal(bodyBytes, &prc); err != nil {
226-
glog.Errorf(
227-
"Error umarshaling metrics request response from %s: %s",
228-
target,
229-
err,
230-
)
231-
return prc, false
232-
}
233-
return prc, true
234-
}
235-
236162
func (m *metricsCollector) scaleToZero(ctx context.Context) {
237163
glog.Infof(
238164
"Scale to zero starting for %s %s in namespace %s",
239-
m.appKind,
240-
m.appName,
241-
m.appNamespace,
165+
m.config.appKind,
166+
m.config.appName,
167+
m.config.appNamespace,
242168
)
243169

244170
patches := []k8s.PatchOperation{{
@@ -248,41 +174,41 @@ func (m *metricsCollector) scaleToZero(ctx context.Context) {
248174
}}
249175
patchesBytes, _ := json.Marshal(patches)
250176
var err error
251-
switch m.appKind {
177+
switch m.config.appKind {
252178
case "Deployment":
253-
_, err = m.kubeClient.AppsV1().Deployments(m.appNamespace).Patch(
179+
_, err = m.kubeClient.AppsV1().Deployments(m.config.appNamespace).Patch(
254180
ctx,
255-
m.appName,
181+
m.config.appName,
256182
k8s_types.JSONPatchType,
257183
patchesBytes,
258184
metav1.PatchOptions{},
259185
)
260186
case "StatefulSet":
261-
_, err = m.kubeClient.AppsV1().StatefulSets(m.appNamespace).Patch(
187+
_, err = m.kubeClient.AppsV1().StatefulSets(m.config.appNamespace).Patch(
262188
ctx,
263-
m.appName,
189+
m.config.appName,
264190
k8s_types.JSONPatchType,
265191
patchesBytes,
266192
metav1.PatchOptions{},
267193
)
268194
default:
269-
err = fmt.Errorf("unknown kind '%s'", m.appKind)
195+
err = fmt.Errorf("unknown kind '%s'", m.config.appKind)
270196
}
271197
if err != nil {
272198
glog.Errorf(
273199
"Error scaling %s %s in namespace %s to zero: %s",
274-
m.appKind,
275-
m.appName,
276-
m.appNamespace,
200+
m.config.appKind,
201+
m.config.appName,
202+
m.config.appNamespace,
277203
err,
278204
)
279205
return
280206
}
281207

282208
glog.Infof(
283209
"Scaled %s %s in namespace %s to zero",
284-
m.appKind,
285-
m.appName,
286-
m.appNamespace,
210+
m.config.appKind,
211+
m.config.appName,
212+
m.config.appNamespace,
287213
)
288214
}

0 commit comments

Comments
 (0)