Skip to content

Commit 8941c12

Browse files
[grafana - oci fetch dash] put on-disk ratio first; exclude on-disk images from fetch stats; label panels clearly (#11406)
<img width="1200" height="2200" alt="new-oci-panels" src="https://github.com/user-attachments/assets/57442ede-bf0f-4af8-8d6d-676c33b1db50" />

---------

Co-authored-by: Shelley <shelley@exe.dev>
1 parent bc5679b commit 8941c12

File tree

1 file changed

+65
-42
lines changed

1 file changed

+65
-42
lines changed

tools/metrics/grafana/dashboards/oci-image-fetches.json

Lines changed: 65 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
"type": "prometheus",
2626
"uid": "vm"
2727
},
28-
"description": "Ratio of successful image fetches to total fetches, per isolation type.",
28+
"description": "Ratio of image fetch attempts where the image was already cached on the executor, per isolation type.",
2929
"fieldConfig": {
3030
"defaults": {
3131
"color": {
@@ -85,7 +85,7 @@
8585
"x": 0,
8686
"y": 0
8787
},
88-
"id": 1,
88+
"id": 5,
8989
"options": {
9090
"legend": {
9191
"calcs": [
@@ -109,21 +109,21 @@
109109
"uid": "vm"
110110
},
111111
"editorMode": "code",
112-
"expr": "sum by (isolation_type) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_count{status=\"ok\"}[$__rate_interval])) / sum by (isolation_type) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_count[$__rate_interval]))",
113-
"legendFormat": "{{isolation_type}}",
112+
"expr": "sum by (isolation_type) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_count{region=\"$region\", on_disk=\"true\"}[$__rate_interval])) / sum by (isolation_type) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_count{region=\"$region\"}[$__rate_interval]))",
113+
"legendFormat": "{{isolation_type}} on-disk ratio",
114114
"range": true,
115115
"refId": "A"
116116
}
117117
],
118-
"title": "OCI Image Fetch Success Ratio",
118+
"title": "OCI Image On-Disk Ratio",
119119
"type": "timeseries"
120120
},
121121
{
122122
"datasource": {
123123
"type": "prometheus",
124124
"uid": "vm"
125125
},
126-
"description": "p90 image fetch latency by isolation type, for successful fetches only. Metric is in microseconds, converted to seconds for display.",
126+
"description": "Ratio of successful image fetches to total fetches, per isolation type. Excludes images already cached on-disk.",
127127
"fieldConfig": {
128128
"defaults": {
129129
"color": {
@@ -138,7 +138,7 @@
138138
"barAlignment": 0,
139139
"barWidthFactor": 0.6,
140140
"drawStyle": "line",
141-
"fillOpacity": 5,
141+
"fillOpacity": 10,
142142
"gradientMode": "none",
143143
"hideFrom": {
144144
"legend": false,
@@ -163,6 +163,8 @@
163163
}
164164
},
165165
"mappings": [],
166+
"max": 1,
167+
"min": 0,
166168
"thresholds": {
167169
"mode": "absolute",
168170
"steps": [
@@ -171,7 +173,7 @@
171173
}
172174
]
173175
},
174-
"unit": "s"
176+
"unit": "percentunit"
175177
},
176178
"overrides": []
177179
},
@@ -181,12 +183,12 @@
181183
"x": 0,
182184
"y": 10
183185
},
184-
"id": 2,
186+
"id": 1,
185187
"options": {
186188
"legend": {
187189
"calcs": [
188190
"median",
189-
"max"
191+
"min"
190192
],
191193
"displayMode": "table",
192194
"placement": "bottom",
@@ -205,21 +207,21 @@
205207
"uid": "vm"
206208
},
207209
"editorMode": "code",
208-
"expr": "histogram_quantile(0.90, sum by (isolation_type, le) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_bucket{status=\"ok\"}[$__rate_interval]))) / 1e6",
209-
"legendFormat": "{{isolation_type}}",
210+
"expr": "sum by (isolation_type) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_count{region=\"$region\", status=\"ok\", on_disk!=\"true\"}[$__rate_interval])) / sum by (isolation_type) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_count{region=\"$region\", on_disk!=\"true\"}[$__rate_interval]))",
211+
"legendFormat": "{{isolation_type}} successful fetch percentage",
210212
"range": true,
211213
"refId": "A"
212214
}
213215
],
214-
"title": "OCI Image Fetch Latency (p90)",
216+
"title": "OCI Image Fetch Success Ratio by Isolation Type (excl. on-disk)",
215217
"type": "timeseries"
216218
},
217219
{
218220
"datasource": {
219221
"type": "prometheus",
220222
"uid": "vm"
221223
},
222-
"description": "Ratio of successful image fetches to total fetches, per upstream registry (eTLD+1).",
224+
"description": "p90 image fetch latency by isolation type, for successful fetches only. Excludes images already cached on-disk. Metric is in microseconds, converted to seconds for display.",
223225
"fieldConfig": {
224226
"defaults": {
225227
"color": {
@@ -234,7 +236,7 @@
234236
"barAlignment": 0,
235237
"barWidthFactor": 0.6,
236238
"drawStyle": "line",
237-
"fillOpacity": 10,
239+
"fillOpacity": 5,
238240
"gradientMode": "none",
239241
"hideFrom": {
240242
"legend": false,
@@ -259,8 +261,6 @@
259261
}
260262
},
261263
"mappings": [],
262-
"max": 1,
263-
"min": 0,
264264
"thresholds": {
265265
"mode": "absolute",
266266
"steps": [
@@ -269,7 +269,7 @@
269269
}
270270
]
271271
},
272-
"unit": "percentunit"
272+
"unit": "s"
273273
},
274274
"overrides": []
275275
},
@@ -279,12 +279,12 @@
279279
"x": 0,
280280
"y": 20
281281
},
282-
"id": 3,
282+
"id": 2,
283283
"options": {
284284
"legend": {
285285
"calcs": [
286286
"median",
287-
"min"
287+
"max"
288288
],
289289
"displayMode": "table",
290290
"placement": "bottom",
@@ -303,21 +303,21 @@
303303
"uid": "vm"
304304
},
305305
"editorMode": "code",
306-
"expr": "sum by (registry) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_count{status=\"ok\"}[$__rate_interval])) / sum by (registry) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_count[$__rate_interval]))",
307-
"legendFormat": "{{registry}}",
306+
"expr": "histogram_quantile(0.90, sum by (isolation_type, le) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_bucket{region=\"$region\", status=\"ok\", on_disk!=\"true\"}[$__rate_interval]))) / 1e6",
307+
"legendFormat": "{{isolation_type}} p90 latency",
308308
"range": true,
309309
"refId": "A"
310310
}
311311
],
312-
"title": "OCI Image Fetch Success Ratio by Registry",
312+
"title": "OCI Image Fetch Latency p90 by Isolation Type (excl. on-disk)",
313313
"type": "timeseries"
314314
},
315315
{
316316
"datasource": {
317317
"type": "prometheus",
318318
"uid": "vm"
319319
},
320-
"description": "p90 image fetch latency by upstream registry (eTLD+1), for successful fetches only. Metric is in microseconds, converted to seconds for display.",
320+
"description": "Ratio of successful image fetches to total fetches, per upstream registry (eTLD+1).",
321321
"fieldConfig": {
322322
"defaults": {
323323
"color": {
@@ -332,7 +332,7 @@
332332
"barAlignment": 0,
333333
"barWidthFactor": 0.6,
334334
"drawStyle": "line",
335-
"fillOpacity": 5,
335+
"fillOpacity": 10,
336336
"gradientMode": "none",
337337
"hideFrom": {
338338
"legend": false,
@@ -357,6 +357,8 @@
357357
}
358358
},
359359
"mappings": [],
360+
"max": 1,
361+
"min": 0,
360362
"thresholds": {
361363
"mode": "absolute",
362364
"steps": [
@@ -365,7 +367,7 @@
365367
}
366368
]
367369
},
368-
"unit": "s"
370+
"unit": "percentunit"
369371
},
370372
"overrides": []
371373
},
@@ -375,12 +377,12 @@
375377
"x": 0,
376378
"y": 30
377379
},
378-
"id": 4,
380+
"id": 3,
379381
"options": {
380382
"legend": {
381383
"calcs": [
382384
"median",
383-
"max"
385+
"min"
384386
],
385387
"displayMode": "table",
386388
"placement": "bottom",
@@ -399,21 +401,21 @@
399401
"uid": "vm"
400402
},
401403
"editorMode": "code",
402-
"expr": "histogram_quantile(0.90, sum by (registry, le) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_bucket{status=\"ok\"}[$__rate_interval]))) / 1e6",
403-
"legendFormat": "{{registry}}",
404+
"expr": "sum by (registry) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_count{region=\"$region\", status=\"ok\"}[$__rate_interval])) / sum by (registry) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_count{region=\"$region\"}[$__rate_interval]))",
405+
"legendFormat": "{{registry}} successful fetch percentage",
404406
"range": true,
405407
"refId": "A"
406408
}
407409
],
408-
"title": "OCI Image Fetch Latency (p90) by Registry",
410+
"title": "OCI Image Fetch Success Ratio by Registry",
409411
"type": "timeseries"
410412
},
411413
{
412414
"datasource": {
413415
"type": "prometheus",
414416
"uid": "vm"
415417
},
416-
"description": "Ratio of image fetch attempts where the image was already cached on the executor, per isolation type.",
418+
"description": "p90 image fetch latency by upstream registry (eTLD+1), for successful fetches only. Metric is in microseconds, converted to seconds for display.",
417419
"fieldConfig": {
418420
"defaults": {
419421
"color": {
@@ -428,7 +430,7 @@
428430
"barAlignment": 0,
429431
"barWidthFactor": 0.6,
430432
"drawStyle": "line",
431-
"fillOpacity": 10,
433+
"fillOpacity": 5,
432434
"gradientMode": "none",
433435
"hideFrom": {
434436
"legend": false,
@@ -453,8 +455,6 @@
453455
}
454456
},
455457
"mappings": [],
456-
"max": 1,
457-
"min": 0,
458458
"thresholds": {
459459
"mode": "absolute",
460460
"steps": [
@@ -463,7 +463,7 @@
463463
}
464464
]
465465
},
466-
"unit": "percentunit"
466+
"unit": "s"
467467
},
468468
"overrides": []
469469
},
@@ -473,12 +473,12 @@
473473
"x": 0,
474474
"y": 40
475475
},
476-
"id": 5,
476+
"id": 4,
477477
"options": {
478478
"legend": {
479479
"calcs": [
480480
"median",
481-
"min"
481+
"max"
482482
],
483483
"displayMode": "table",
484484
"placement": "bottom",
@@ -497,13 +497,13 @@
497497
"uid": "vm"
498498
},
499499
"editorMode": "code",
500-
"expr": "sum by (isolation_type) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_count{on_disk=\"true\"}[$__rate_interval])) / sum by (isolation_type) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_count[$__rate_interval]))",
501-
"legendFormat": "{{isolation_type}}",
500+
"expr": "histogram_quantile(0.90, sum by (registry, le) (rate(buildbuddy_remote_execution_image_fetch_duration_usec_bucket{region=\"$region\", status=\"ok\"}[$__rate_interval]))) / 1e6",
501+
"legendFormat": "{{registry}} p90 latency",
502502
"range": true,
503503
"refId": "A"
504504
}
505505
],
506-
"title": "OCI Image On-Disk Ratio",
506+
"title": "OCI Image Fetch Latency (p90) by Registry",
507507
"type": "timeseries"
508508
}
509509
],
@@ -513,7 +513,30 @@
513513
"file:oci-image-fetches.json"
514514
],
515515
"templating": {
516-
"list": []
516+
"list": [
517+
{
518+
"current": {
519+
"text": "us-west1",
520+
"value": "us-west1"
521+
},
522+
"datasource": {
523+
"type": "prometheus",
524+
"uid": "vm"
525+
},
526+
"definition": "label_values(up, region)",
527+
"includeAll": false,
528+
"name": "region",
529+
"options": [],
530+
"query": {
531+
"query": "label_values(up, region)",
532+
"refId": "Prometheus-region-Variable-Query"
533+
},
534+
"refresh": 1,
535+
"regex": "",
536+
"sort": 1,
537+
"type": "query"
538+
}
539+
]
517540
},
518541
"time": {
519542
"from": "now-6h",

0 commit comments

Comments
 (0)