From 45dccd85c0cdf14235a3832907f96fd2dfe55f63 Mon Sep 17 00:00:00 2001 From: Florian Sesser <florian@private.storage> Date: Fri, 2 Jul 2021 18:10:33 +0000 Subject: [PATCH] Add services dashboard monitoring PaymentServer Known issues: - Error rate tooltip computes over whole range (sum), should be per second or per minute or so - Duration buckets not great. Should also be per second probably. --- .../grafana-config/services-overview.json | 664 ++++++++++++++++++ 1 file changed, 664 insertions(+) create mode 100644 nixos/modules/monitoring/server/grafana-config/services-overview.json diff --git a/nixos/modules/monitoring/server/grafana-config/services-overview.json b/nixos/modules/monitoring/server/grafana-config/services-overview.json new file mode 100644 index 00000000..8ee7b130 --- /dev/null +++ b/nixos/modules/monitoring/server/grafana-config/services-overview.json @@ -0,0 +1,664 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "RED: Requests-Errors-Duration for our services", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 2, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 18, + "panels": [], + "title": "Payments v1/stripe/charge", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "HTTPS responses per second", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(http_responses_total{path=\"v1/stripe/charge\"}[5m])", + "instant": false, + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "v1/stripe/charge RPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 1 + }, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": true, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(http_responses_total{path=\"v1/stripe/charge\", status=\"4XX\"}) / sum(http_responses_total{path=\"v1/redeem\"})", + "legendFormat": "Client error (4XX) rate", + "refId": "A" + }, + { + "expr": "sum(http_responses_total{path=\"v1/stripe/charge\", status=\"5XX\"}) / sum(http_responses_total{path=\"v1/redeem\"})", + "legendFormat": "Server error (5XX) rate", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "v1/stripe/charge error rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Requests taking longer than 1 s, between 1 sec and 10 msec, and 10 msec and below", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 1 + }, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "6.4.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "http_request_duration_seconds_bucket{path=\"v1/stripe/charge\", le=\"0.01\"}", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "=< 0.01s", + "refId": "A" + }, + { + "expr": "http_request_duration_seconds_bucket{path=\"v1/stripe/charge\", le=\"1.0\"} - ignoring(le) http_request_duration_seconds_bucket{path=\"v1/stripe/charge\", le=\"0.01\"}", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "=< 1s", + "refId": "D" + }, + { + "expr": "http_request_duration_seconds_bucket{path=\"v1/stripe/charge\", le=\"+Inf\"} - ignoring(le) http_request_duration_seconds_bucket{path=\"v1/stripe/charge\", le=\"1.0\"}", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "> 1s", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "v1/stripe/charge durations", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 11, + "panels": [], + "title": "Payments v1/redeem", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "HTTPS responses per second", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 9 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(http_responses_total{path=\"v1/redeem\"}[5m])", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "v1/redeem RPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 9 + }, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": true, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(http_responses_total{path=\"v1/redeem\", status=\"4XX\"}) / sum(http_responses_total{path=\"v1/redeem\"})", + "legendFormat": "Client error (4XX) rate", + "refId": "A" + }, + { + "expr": "sum(http_responses_total{path=\"v1/redeem\", status=\"5XX\"}) / sum(http_responses_total{path=\"v1/redeem\"})", + "legendFormat": "Server error (5XX) rate", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "v1/redeem error rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Requests taking longer than 1 s, between 1 sec and 10 msec, and 10 msec and below", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 9 + }, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "6.4.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "http_request_duration_seconds_bucket{path=\"v1/redeem\", le=\"0.01\"}", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "=< 0.01s", + "refId": "A" + }, + { + "expr": "http_request_duration_seconds_bucket{path=\"v1/redeem\", le=\"1.0\"} - ignoring(le) http_request_duration_seconds_bucket{path=\"v1/redeem\", le=\"0.01\"}", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "=< 1s", + "refId": "D" + }, + { + "expr": "http_request_duration_seconds_bucket{path=\"v1/redeem\", le=\"+Inf\"} - ignoring(le) http_request_duration_seconds_bucket{path=\"v1/redeem\", le=\"1.0\"}", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "> 1s", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "v1/redeem durations", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "", + "schemaVersion": 20, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Services overview", + "uid": "JX3RVEk7k", + "version": 6 +} -- GitLab