From 0e7e464df4147cfe836243f56a0c687e385ee126 Mon Sep 17 00:00:00 2001 From: Florian Sesser <florian@private.storage> Date: Mon, 12 Sep 2022 17:04:41 +0200 Subject: [PATCH] Disable broken alerting for backup durations --- .../server/grafana-dashboards/backups.json | 162 +++--------------- 1 file changed, 22 insertions(+), 140 deletions(-) diff --git a/nixos/modules/monitoring/server/grafana-dashboards/backups.json b/nixos/modules/monitoring/server/grafana-dashboards/backups.json index 51c6ec60..e2ed09b2 100644 --- a/nixos/modules/monitoring/server/grafana-dashboards/backups.json +++ b/nixos/modules/monitoring/server/grafana-dashboards/backups.json @@ -22,6 +22,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, + "id": 58, "links": [], "liveNow": false, "panels": [ @@ -74,6 +75,7 @@ "noDataState": "no_data", "notifications": [] }, + "datasource": "Prometheus", "description": "Daily backup job systemd timer unit state", "fieldConfig": { "defaults": { @@ -162,10 +164,7 @@ "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-job-daily.timer\", state=~\"active\"})", "hide": false, @@ -174,10 +173,7 @@ "refId": "Active timers" }, { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"failed\"})", "hide": false, @@ -233,6 +229,7 @@ "noDataState": "no_data", "notifications": [] }, + "datasource": "Prometheus", "description": "Monthly check-repo systemd timer unit state", "fieldConfig": { "defaults": { @@ -318,10 +315,7 @@ "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-check-repo.timer\", state=\"active\"})", "hide": false, @@ -330,10 +324,7 @@ "refId": "Active timers" }, { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"failed\"})", "hide": false, @@ -390,6 +381,7 @@ "noDataState": "no_data", "notifications": [] }, + "datasource": "Prometheus", "description": "This shows the last triggering of the borgbackup-job-daily.timer systemd unit.", "fieldConfig": { "defaults": { @@ -463,10 +455,7 @@ "pluginVersion": "8.3.5", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": false, "expr": "node_systemd_timer_last_trigger_seconds{name=\"borgbackup-job-daily.timer\"} - time()", "interval": "", @@ -523,6 +512,7 @@ "noDataState": "no_data", "notifications": [] }, + "datasource": "Prometheus", "description": "This shows the last triggering of the borgbackup-job-daily.timer systemd unit.", "fieldConfig": { "defaults": { @@ -598,10 +588,7 @@ "pluginVersion": "8.3.5", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": false, "expr": "node_systemd_timer_last_trigger_seconds{name=\"borgbackup-check-repo.timer\"} - time()", "interval": "", @@ -622,42 +609,7 @@ "type": "timeseries" }, { - "alert": { - "alertRuleTags": {}, - "conditions": [ - { - "evaluator": { - "params": [ - 0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now-3h" - ] - }, - "reducer": { - "params": [], - "type": "last" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "for": "5m", - "frequency": "1m", - "handler": 1, - "message": "A backup job ran for more than three hours. This means it could run into the check-repo job start time, depending on its \"random\" job delay.", - "name": "Daily backup job run time alert", - "noDataState": "no_data", - "notifications": [] - }, + "datasource": "Prometheus", "description": "When was the unit active? With alerts", "fieldConfig": { "defaults": { @@ -733,10 +685,7 @@ }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"active\"}", "interval": "", @@ -744,54 +693,12 @@ "refId": "A" } ], - "thresholds": [ - { - "colorMode": "critical", - "op": "gt", - "value": 0, - "visible": true - } - ], + "thresholds": [], "title": "Daily backup job run time", "type": "timeseries" }, { - "alert": { - "alertRuleTags": {}, - "conditions": [ - { - "evaluator": { - "params": [ - 0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now-6h" - ] - }, - "reducer": { - "params": [], - "type": "last" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "for": "5m", - "frequency": "1m", - "handler": 1, - "message": "A borg check-repo job ran for more than six hours. This means it could collide with the daily backup job, depending on that job's \"random\" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter).", - "name": "Monthly check-repo run time alert", - "noDataState": "no_data", - "notifications": [] - }, + "datasource": "Prometheus", "description": "When was the unit active?", "fieldConfig": { "defaults": { @@ -867,10 +774,7 @@ }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"active\"}", "interval": "", @@ -878,22 +782,12 @@ "refId": "A" } ], - "thresholds": [ - { - "colorMode": "critical", - "op": "gt", - "value": 0, - "visible": true - } - ], + "thresholds": [], "title": "Monthly check-repo run time", "type": "timeseries" }, { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "description": "The \"duration\" that borgbackup status reports.", "fieldConfig": { "defaults": { @@ -968,10 +862,7 @@ "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"duration\" | pattern \"<_>\\\"duration\\\": <duration>,\"", "legendFormat": "{{host}}", "queryType": "range", @@ -1009,10 +900,7 @@ "type": "barchart" }, { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "fieldConfig": { "defaults": { "color": { @@ -1081,20 +969,14 @@ "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"compressed_size\" | pattern \"<_>\\\"compressed_size\\\": <compressed_size>,\"", "hide": false, "legendFormat": "{{host}} archive", "refId": "This archive size in bytes" }, { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"unique_csize\" | pattern \"<_>\\\"unique_csize\\\": <unique_csize>,\"", "hide": false, "legendFormat": "{{host}} all archives", -- GitLab