diff --git a/nixos/modules/monitoring/server/grafana-dashboards/backups.json b/nixos/modules/monitoring/server/grafana-dashboards/backups.json index 51c6ec60444f4f7bac0bf0753e9e0b8e1c3aa1e9..586cd4a59a92c33a507a9b0ee7b6eff52a8f4194 100644 --- a/nixos/modules/monitoring/server/grafana-dashboards/backups.json +++ b/nixos/modules/monitoring/server/grafana-dashboards/backups.json @@ -22,6 +22,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, + "id": 54, "links": [], "liveNow": false, "panels": [ @@ -92,7 +93,7 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "linear", + "lineInterpolation": "stepAfter", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -162,10 +163,7 @@ "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-job-daily.timer\", state=~\"active\"})", "hide": false, @@ -174,10 +172,7 @@ "refId": "Active timers" }, { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"failed\"})", "hide": false, @@ -251,7 +246,7 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "linear", + "lineInterpolation": "stepAfter", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -318,10 +313,7 @@ "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-check-repo.timer\", state=\"active\"})", "hide": false, @@ -330,10 +322,7 @@ "refId": "Active timers" }, { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"failed\"})", "hide": false, @@ -463,10 +452,7 @@ "pluginVersion": "8.3.5", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": false, "expr": "node_systemd_timer_last_trigger_seconds{name=\"borgbackup-job-daily.timer\"} - time()", "interval": "", @@ -598,10 +584,7 @@ "pluginVersion": "8.3.5", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": false, "expr": "node_systemd_timer_last_trigger_seconds{name=\"borgbackup-check-repo.timer\"} - time()", "interval": "", @@ -628,7 +611,7 @@ { "evaluator": { "params": [ - 0 + 9000 ], "type": "gt" }, @@ -637,9 +620,9 @@ }, "query": { "params": [ - "A", + "JobRunTime", "5m", - "now-3h" + "now" ] }, "reducer": { @@ -653,12 +636,12 @@ "for": "5m", "frequency": "1m", "handler": 1, - "message": "A backup job ran for more than three hours. This means it could run into the check-repo job start time, depending on its \"random\" job delay.", + "message": "A backup job ran for more than 2 ½ hours. After 3 hours it could run into the check-repo job start time, depending on its \"random\" job delay.", "name": "Daily backup job run time alert", "noDataState": "no_data", "notifications": [] }, - "description": "When was the unit active? With alerts", + "description": "When was the systemd unit active?", "fieldConfig": { "defaults": { "color": { @@ -667,8 +650,6 @@ "custom": { "axisLabel": "", "axisPlacement": "left", - "axisSoftMax": -0.8, - "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, @@ -698,18 +679,20 @@ } }, "mappings": [], - "max": 1.2, - "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null + }, + { + "color": "#EAB839", + "value": 7200 } ] }, - "unit": "short" + "unit": "s" }, "overrides": [] }, @@ -733,22 +716,19 @@ }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, - "expr": "node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"active\"}", + "expr": "abs((node_systemd_timer_last_trigger_seconds{name=\"borgbackup-job-daily.timer\"} - time())) * on (instance) node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"active\"}", "interval": "", "legendFormat": "{{instance}}", - "refId": "A" + "refId": "JobRunTime" } ], "thresholds": [ { "colorMode": "critical", "op": "gt", - "value": 0, + "value": 9000, "visible": true } ], @@ -762,7 +742,7 @@ { "evaluator": { "params": [ - 0 + 18000 ], "type": "gt" }, @@ -771,9 +751,9 @@ }, "query": { "params": [ - "A", + "JobRunTime", "5m", - "now-6h" + "now" ] }, "reducer": { @@ -787,12 +767,12 @@ "for": "5m", "frequency": "1m", "handler": 1, - "message": "A borg check-repo job ran for more than six hours. This means it could collide with the daily backup job, depending on that job's \"random\" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter).", + "message": "A borg check-repo job ran for more than five hours. After six hours it could collide with the daily backup job, depending on that job's \"random\" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter).", "name": "Monthly check-repo run time alert", "noDataState": "no_data", "notifications": [] }, - "description": "When was the unit active?", + "description": "When was the systemd unit active?", "fieldConfig": { "defaults": { "color": { @@ -801,8 +781,6 @@ "custom": { "axisLabel": "", "axisPlacement": "left", - "axisSoftMax": -0.8, - "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, @@ -832,18 +810,20 @@ } }, "mappings": [], - "max": 1.2, - "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null + }, + { + "color": "#EAB839", + "value": 15000 } ] }, - "unit": "short" + "unit": "s" }, "overrides": [] }, @@ -867,22 +847,19 @@ }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, - "expr": "node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"active\"}", + "expr": "abs((node_systemd_timer_last_trigger_seconds{name=\"borgbackup-check-repo.timer\"} - time())) * on (instance) node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"active\"}", "interval": "", "legendFormat": "{{instance}}", - "refId": "A" + "refId": "JobRunTime" } ], "thresholds": [ { "colorMode": "critical", "op": "gt", - "value": 0, + "value": 18000, "visible": true } ], @@ -890,10 +867,7 @@ "type": "timeseries" }, { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "description": "The \"duration\" that borgbackup status reports.", "fieldConfig": { "defaults": { @@ -968,17 +942,14 @@ "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"duration\" | pattern \"<_>\\\"duration\\\": <duration>,\"", "legendFormat": "{{host}}", "queryType": "range", "refId": "A" } ], - "title": "Daily backup job run time", + "title": "Daily backup job run time (as reported by borg)", "transformations": [ { "id": "labelsToFields", @@ -1009,10 +980,7 @@ "type": "barchart" }, { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "fieldConfig": { "defaults": { "color": { @@ -1021,29 +989,17 @@ "custom": { "axisLabel": "", "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, + "axisSoftMin": 0, + "fillOpacity": 80, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 5, "scaleDistribution": { "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" } }, "mappings": [], @@ -1068,33 +1024,35 @@ }, "id": 55, "options": { + "barRadius": 0, + "barWidth": 0.97, + "groupWidth": 0.7, "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", "tooltip": { - "mode": "multi", + "mode": "single", "sort": "none" - } + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 }, "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"compressed_size\" | pattern \"<_>\\\"compressed_size\\\": <compressed_size>,\"", "hide": false, "legendFormat": "{{host}} archive", "refId": "This archive size in bytes" }, { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"unique_csize\" | pattern \"<_>\\\"unique_csize\\\": <unique_csize>,\"", "hide": false, "legendFormat": "{{host}} all archives", @@ -1134,7 +1092,7 @@ "options": {} } ], - "type": "timeseries" + "type": "barchart" } ], "refresh": "5m",