From 90073e2c8280bf97ffaa623c3562e6098a16d30a Mon Sep 17 00:00:00 2001 From: Florian Sesser <florian@private.storage> Date: Tue, 13 Sep 2022 00:46:33 +0200 Subject: [PATCH] Backup duration alert: Get unit state and trigger times from systemd ... and use a smarter Prometheus query to combine the two. --- .../server/grafana-dashboards/backups.json | 154 +++++++----------- 1 file changed, 56 insertions(+), 98 deletions(-) diff --git a/nixos/modules/monitoring/server/grafana-dashboards/backups.json b/nixos/modules/monitoring/server/grafana-dashboards/backups.json index 51c6ec60..586cd4a5 100644 --- a/nixos/modules/monitoring/server/grafana-dashboards/backups.json +++ b/nixos/modules/monitoring/server/grafana-dashboards/backups.json @@ -22,6 +22,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, + "id": 54, "links": [], "liveNow": false, "panels": [ @@ -92,7 +93,7 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "linear", + "lineInterpolation": "stepAfter", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -162,10 +163,7 @@ "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-job-daily.timer\", state=~\"active\"})", "hide": false, @@ -174,10 +172,7 @@ "refId": "Active timers" }, { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"failed\"})", "hide": false, @@ -251,7 +246,7 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "linear", + "lineInterpolation": "stepAfter", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -318,10 +313,7 @@ "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-check-repo.timer\", state=\"active\"})", "hide": false, @@ -330,10 +322,7 @@ "refId": "Active timers" }, { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"failed\"})", "hide": false, @@ -463,10 +452,7 @@ "pluginVersion": "8.3.5", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": false, "expr": "node_systemd_timer_last_trigger_seconds{name=\"borgbackup-job-daily.timer\"} - time()", "interval": "", @@ -598,10 +584,7 @@ "pluginVersion": "8.3.5", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": false, "expr": "node_systemd_timer_last_trigger_seconds{name=\"borgbackup-check-repo.timer\"} - time()", "interval": "", @@ -628,7 +611,7 @@ { "evaluator": { "params": [ - 0 + 9000 ], "type": "gt" }, @@ -637,9 +620,9 @@ }, "query": { "params": [ - "A", + "JobRunTime", "5m", - "now-3h" + "now" ] }, "reducer": { @@ -653,12 +636,12 @@ "for": "5m", "frequency": "1m", "handler": 1, - "message": "A backup job ran for more than three hours. This means it could run into the check-repo job start time, depending on its \"random\" job delay.", + "message": "A backup job ran for more than 2 ½ hours. After 3 hours it could run into the check-repo job start time, depending on its \"random\" job delay.", "name": "Daily backup job run time alert", "noDataState": "no_data", "notifications": [] }, - "description": "When was the unit active? With alerts", + "description": "When was the systemd unit active?", "fieldConfig": { "defaults": { "color": { @@ -667,8 +650,6 @@ "custom": { "axisLabel": "", "axisPlacement": "left", - "axisSoftMax": -0.8, - "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, @@ -698,18 +679,20 @@ } }, "mappings": [], - "max": 1.2, - "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null + }, + { + "color": "#EAB839", + "value": 7200 } ] }, - "unit": "short" + "unit": "s" }, "overrides": [] }, @@ -733,22 +716,19 @@ }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, - "expr": "node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"active\"}", + "expr": "abs((node_systemd_timer_last_trigger_seconds{name=\"borgbackup-job-daily.timer\"} - time())) * on (instance) node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"active\"}", "interval": "", "legendFormat": "{{instance}}", - "refId": "A" + "refId": "JobRunTime" } ], "thresholds": [ { "colorMode": "critical", "op": "gt", - "value": 0, + "value": 9000, "visible": true } ], @@ -762,7 +742,7 @@ { "evaluator": { "params": [ - 0 + 18000 ], "type": "gt" }, @@ -771,9 +751,9 @@ }, "query": { "params": [ - "A", + "JobRunTime", "5m", - "now-6h" + "now" ] }, "reducer": { @@ -787,12 +767,12 @@ "for": "5m", "frequency": "1m", "handler": 1, - "message": "A borg check-repo job ran for more than six hours. This means it could collide with the daily backup job, depending on that job's \"random\" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter).", + "message": "A borg check-repo job ran for more than five hours. After six hours it could collide with the daily backup job, depending on that job's \"random\" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter).", "name": "Monthly check-repo run time alert", "noDataState": "no_data", "notifications": [] }, - "description": "When was the unit active?", + "description": "When was the systemd unit active?", "fieldConfig": { "defaults": { "color": { @@ -801,8 +781,6 @@ "custom": { "axisLabel": "", "axisPlacement": "left", - "axisSoftMax": -0.8, - "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, @@ -832,18 +810,20 @@ } }, "mappings": [], - "max": 1.2, - "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null + }, + { + "color": "#EAB839", + "value": 15000 } ] }, - "unit": "short" + "unit": "s" }, "overrides": [] }, @@ -867,22 +847,19 @@ }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, - "expr": "node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"active\"}", + "expr": "abs((node_systemd_timer_last_trigger_seconds{name=\"borgbackup-check-repo.timer\"} - time())) * on (instance) node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"active\"}", "interval": "", "legendFormat": "{{instance}}", - "refId": "A" + "refId": "JobRunTime" } ], "thresholds": [ { "colorMode": "critical", "op": "gt", - "value": 0, + "value": 18000, "visible": true } ], @@ -890,10 +867,7 @@ "type": "timeseries" }, { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "description": "The \"duration\" that borgbackup status reports.", "fieldConfig": { "defaults": { @@ -968,17 +942,14 @@ "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"duration\" | pattern \"<_>\\\"duration\\\": <duration>,\"", "legendFormat": "{{host}}", "queryType": "range", "refId": "A" } ], - "title": "Daily backup job run time", + "title": "Daily backup job run time (as reported by borg)", "transformations": [ { "id": "labelsToFields", @@ -1009,10 +980,7 @@ "type": "barchart" }, { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "fieldConfig": { "defaults": { "color": { @@ -1021,29 +989,17 @@ "custom": { "axisLabel": "", "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, + "axisSoftMin": 0, + "fillOpacity": 80, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 5, "scaleDistribution": { "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" } }, "mappings": [], @@ -1068,33 +1024,35 @@ }, "id": 55, "options": { + "barRadius": 0, + "barWidth": 0.97, + "groupWidth": 0.7, "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", "tooltip": { - "mode": "multi", + "mode": "single", "sort": "none" - } + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 }, "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"compressed_size\" | pattern \"<_>\\\"compressed_size\\\": <compressed_size>,\"", "hide": false, "legendFormat": "{{host}} archive", "refId": "This archive size in bytes" }, { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"unique_csize\" | pattern \"<_>\\\"unique_csize\\\": <unique_csize>,\"", "hide": false, "legendFormat": "{{host}} all archives", @@ -1134,7 +1092,7 @@ "options": {} } ], - "type": "timeseries" + "type": "barchart" } ], "refresh": "5m", -- GitLab