diff --git a/nixos/modules/monitoring/server/grafana-dashboards/backups.json b/nixos/modules/monitoring/server/grafana-dashboards/backups.json index 10600a0dcc91b72e2a0ea5b4518ff7b3aebfa27b..5c9f9ef28457eac67a73826b46a34eddb6b53619 100644 --- a/nixos/modules/monitoring/server/grafana-dashboards/backups.json +++ b/nixos/modules/monitoring/server/grafana-dashboards/backups.json @@ -26,6 +26,7 @@ "liveNow": false, "panels": [ { + "collapsed": false, "gridPos": { "h": 1, "w": 24, @@ -33,6 +34,7 @@ "y": 0 }, "id": 44, + "panels": [], "title": "Customer ciphertext backup to Borgbase.com", "type": "row" }, @@ -619,11 +621,280 @@ "title": "Monthly check-repo trigger", "type": "timeseries" }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "3h", + "now" + ] + }, + "reducer": { + "params": [], + "type": "min" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "message": "A backup job ran for more than three hours. This means it could run into the check-repo job start time, depending on its \"random\" job delay.", + "name": "Daily backup job run time alert", + "noDataState": "no_data", + "notifications": [] + }, + "description": "When was the unit active? 
With alerts", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "left", + "axisSoftMax": -0.8, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 60, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "stepAfter", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 0, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1.2, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 52, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "exemplar": true, + "expr": "node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"active\"}", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "op": "gt", + "value": 0, + "visible": true + } + ], + "title": "Daily backup job run time", + "type": "timeseries" + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "6h", + "now" + ] + }, + "reducer": { + "params": [], + "type": "min" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "message": "A borg check-repo job ran for more than six 
hours. This means it could collide with the daily backup job, depending on that job's \"random\" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter).", + "name": "Monthly check-repo run time alert", + "noDataState": "no_data", + "notifications": [] + }, + "description": "When was the unit active?", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "left", + "axisSoftMax": -0.8, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 60, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "stepAfter", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 0, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1.2, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 53, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "exemplar": true, + "expr": "node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"active\"}", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "op": "gt", + "value": 0, + "visible": true + } + ], + "title": "Monthly check-repo run time", + "type": "timeseries" + }, { "datasource": { "type": "loki", "uid": "000000002" }, + 
"description": "The \"duration\" that borgbackup status reports.", "fieldConfig": { "defaults": { "color": { @@ -632,6 +903,7 @@ "custom": { "axisLabel": "", "axisPlacement": "auto", + "axisSoftMin": 0, "fillOpacity": 60, "gradientMode": "none", "hideFrom": { @@ -670,7 +942,7 @@ "h": 7, "w": 12, "x": 0, - "y": 15 + "y": 22 }, "id": 49, "options": {