diff --git a/nixos/modules/monitoring/server/grafana-dashboards/backups.json b/nixos/modules/monitoring/server/grafana-dashboards/backups.json index 96a2bceb994104d126fd93fd7bf16ce127810dc2..df77fb3747d17ac8bd86848e966b470dceae8669 100644 --- a/nixos/modules/monitoring/server/grafana-dashboards/backups.json +++ b/nixos/modules/monitoring/server/grafana-dashboards/backups.json @@ -22,11 +22,11 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 39, "links": [], "liveNow": false, "panels": [ { + "collapsed": false, "gridPos": { "h": 1, "w": 24, @@ -34,6 +34,7 @@ "y": 0 }, "id": 44, + "panels": [], "title": "Customer ciphertext backup to Borgbase.com", "type": "row" }, @@ -141,7 +142,7 @@ ] }, "gridPos": { - "h": 7, + "h": 5, "w": 12, "x": 0, "y": 1 @@ -297,7 +298,7 @@ ] }, "gridPos": { - "h": 7, + "h": 5, "w": 12, "x": 12, "y": 1 @@ -442,10 +443,10 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 5, "w": 12, "x": 0, - "y": 8 + "y": 6 }, "id": 41, "options": { @@ -577,10 +578,10 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 5, "w": 12, "x": 12, - "y": 8 + "y": 6 }, "id": 42, "options": { @@ -619,9 +620,524 @@ ], "title": "Monthly check-repo trigger", "type": "timeseries" + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "3h", + "now" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "message": "A backup job ran for more than three hours. This means it could run into the check-repo job start time, depending on its \"random\" job delay.", + "name": "Daily backup job run time alert", + "noDataState": "no_data", + "notifications": [] + }, + "description": "When was the unit active? With alerts", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "left", + "axisSoftMax": -0.8, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 60, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "stepAfter", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 0, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1.2, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 52, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "exemplar": true, + "expr": "node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"active\"}", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "op": "gt", + "value": 0, + "visible": true + } + ], + "title": "Daily backup job run time", + "type": "timeseries" + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "6h", + "now" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "message": "A borg check-repo job ran for more than six hours. This means it could collide with the daily backup job, depending on that job's \"random\" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter).", + "name": "Monthly check-repo run time alert", + "noDataState": "no_data", + "notifications": [] + }, + "description": "When was the unit active?", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "left", + "axisSoftMax": -0.8, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 60, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "stepAfter", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 0, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1.2, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 53, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "exemplar": true, + "expr": "node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"active\"}", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "op": "gt", + "value": 0, + "visible": true + } + ], + "title": "Monthly check-repo run time", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "000000002" + }, + "description": "The \"duration\" that borgbackup status reports.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "fillOpacity": 60, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 3600 + }, + { + "color": "red", + "value": 10800 + } + ] + }, + "unit": "dtdurations" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 49, + "options": { + "barRadius": 0, + "barWidth": 0.1, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xField": "host", + "xTickLabelRotation": -45, + "xTickLabelSpacing": 0 + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "000000002" + }, + "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"duration\" | pattern \"<_>\\\"duration\\\": <duration>,\"", + "legendFormat": "{{host}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Daily backup job run time", + "transformations": [ + { + "id": "labelsToFields", + "options": { + "keepLabels": [ + "duration", + "host" + ] + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "duration" + } + ], + "fields": {} + } + }, + { + "id": "merge", + "options": {} + } + ], + "type": "barchart" + }, + { + "datasource": { + "type": "loki", + "uid": "000000002" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 55, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "000000002" + }, + "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"compressed_size\" | pattern \"<_>\\\"compressed_size\\\": <compressed_size>,\"", + "hide": false, + "legendFormat": "{{host}} archive", + "refId": "This archive size in bytes" + }, + { + "datasource": { + "type": "loki", + "uid": "000000002" + }, + "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"unique_csize\" | pattern \"<_>\\\"unique_csize\\\": <unique_csize>,\"", + "hide": false, + "legendFormat": "{{host}} all archives", + "refId": "All archives deduplicated size" + } + ], + "title": "Backup set size", + "transformations": [ + { + "id": "labelsToFields", + "options": { + "keepLabels": [ + "host", + "unique_csize", + "compressed_size" + ] + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "unique_csize" + }, + { + "destinationType": "number", + "targetField": "compressed_size" + } + ], + "fields": {} + } + }, + { + "id": "merge", + "options": {} + } + ], + "type": "timeseries" } ], - "refresh": "1m", + "refresh": "5m", "schemaVersion": 35, "style": "dark", "tags": [],