Skip to content
Snippets Groups Projects

Backup duration alert: Get unit state and trigger times from systemd

Merged Florian Sesser requested to merge 125.get-durations-from-systemd into develop
1 file
+ 56
98
Compare changes
  • Side-by-side
  • Inline
@@ -22,6 +22,7 @@
@@ -22,6 +22,7 @@
"editable": true,
"editable": true,
"fiscalYearStartMonth": 0,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"graphTooltip": 0,
 
"id": 54,
"links": [],
"links": [],
"liveNow": false,
"liveNow": false,
"panels": [
"panels": [
@@ -92,7 +93,7 @@
@@ -92,7 +93,7 @@
"tooltip": false,
"tooltip": false,
"viz": false
"viz": false
},
},
"lineInterpolation": "linear",
"lineInterpolation": "stepAfter",
"lineWidth": 1,
"lineWidth": 1,
"pointSize": 5,
"pointSize": 5,
"scaleDistribution": {
"scaleDistribution": {
@@ -162,10 +163,7 @@
@@ -162,10 +163,7 @@
"pluginVersion": "8.4.7",
"pluginVersion": "8.4.7",
"targets": [
"targets": [
{
{
"datasource": {
"datasource": "Prometheus",
"type": "prometheus",
"uid": "000000001"
},
"exemplar": true,
"exemplar": true,
"expr": "sum(node_systemd_unit_state{name=\"borgbackup-job-daily.timer\", state=~\"active\"})",
"expr": "sum(node_systemd_unit_state{name=\"borgbackup-job-daily.timer\", state=~\"active\"})",
"hide": false,
"hide": false,
@@ -174,10 +172,7 @@
@@ -174,10 +172,7 @@
"refId": "Active timers"
"refId": "Active timers"
},
},
{
{
"datasource": {
"datasource": "Prometheus",
"type": "prometheus",
"uid": "000000001"
},
"exemplar": true,
"exemplar": true,
"expr": "sum(node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"failed\"})",
"expr": "sum(node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"failed\"})",
"hide": false,
"hide": false,
@@ -251,7 +246,7 @@
@@ -251,7 +246,7 @@
"tooltip": false,
"tooltip": false,
"viz": false
"viz": false
},
},
"lineInterpolation": "linear",
"lineInterpolation": "stepAfter",
"lineWidth": 1,
"lineWidth": 1,
"pointSize": 5,
"pointSize": 5,
"scaleDistribution": {
"scaleDistribution": {
@@ -318,10 +313,7 @@
@@ -318,10 +313,7 @@
"pluginVersion": "8.4.7",
"pluginVersion": "8.4.7",
"targets": [
"targets": [
{
{
"datasource": {
"datasource": "Prometheus",
"type": "prometheus",
"uid": "000000001"
},
"exemplar": true,
"exemplar": true,
"expr": "sum(node_systemd_unit_state{name=\"borgbackup-check-repo.timer\", state=\"active\"})",
"expr": "sum(node_systemd_unit_state{name=\"borgbackup-check-repo.timer\", state=\"active\"})",
"hide": false,
"hide": false,
@@ -330,10 +322,7 @@
@@ -330,10 +322,7 @@
"refId": "Active timers"
"refId": "Active timers"
},
},
{
{
"datasource": {
"datasource": "Prometheus",
"type": "prometheus",
"uid": "000000001"
},
"exemplar": true,
"exemplar": true,
"expr": "sum(node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"failed\"})",
"expr": "sum(node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"failed\"})",
"hide": false,
"hide": false,
@@ -463,10 +452,7 @@
@@ -463,10 +452,7 @@
"pluginVersion": "8.3.5",
"pluginVersion": "8.3.5",
"targets": [
"targets": [
{
{
"datasource": {
"datasource": "Prometheus",
"type": "prometheus",
"uid": "000000001"
},
"exemplar": false,
"exemplar": false,
"expr": "node_systemd_timer_last_trigger_seconds{name=\"borgbackup-job-daily.timer\"} - time()",
"expr": "node_systemd_timer_last_trigger_seconds{name=\"borgbackup-job-daily.timer\"} - time()",
"interval": "",
"interval": "",
@@ -598,10 +584,7 @@
@@ -598,10 +584,7 @@
"pluginVersion": "8.3.5",
"pluginVersion": "8.3.5",
"targets": [
"targets": [
{
{
"datasource": {
"datasource": "Prometheus",
"type": "prometheus",
"uid": "000000001"
},
"exemplar": false,
"exemplar": false,
"expr": "node_systemd_timer_last_trigger_seconds{name=\"borgbackup-check-repo.timer\"} - time()",
"expr": "node_systemd_timer_last_trigger_seconds{name=\"borgbackup-check-repo.timer\"} - time()",
"interval": "",
"interval": "",
@@ -628,7 +611,7 @@
@@ -628,7 +611,7 @@
{
{
"evaluator": {
"evaluator": {
"params": [
"params": [
0
9000
],
],
"type": "gt"
"type": "gt"
},
},
@@ -637,9 +620,9 @@
@@ -637,9 +620,9 @@
},
},
"query": {
"query": {
"params": [
"params": [
"A",
"JobRunTime",
"5m",
"5m",
"now-3h"
"now"
]
]
},
},
"reducer": {
"reducer": {
@@ -653,12 +636,12 @@
@@ -653,12 +636,12 @@
"for": "5m",
"for": "5m",
"frequency": "1m",
"frequency": "1m",
"handler": 1,
"handler": 1,
"message": "A backup job ran for more than three hours. This means it could run into the check-repo job start time, depending on its \"random\" job delay.",
"message": "A backup job ran for more than 2 ½ hours. After 3 hours it could run into the check-repo job start time, depending on its \"random\" job delay.",
"name": "Daily backup job run time alert",
"name": "Daily backup job run time alert",
"noDataState": "no_data",
"noDataState": "no_data",
"notifications": []
"notifications": []
},
},
"description": "When was the unit active? With alerts",
"description": "When was the systemd unit active?",
"fieldConfig": {
"fieldConfig": {
"defaults": {
"defaults": {
"color": {
"color": {
@@ -667,8 +650,6 @@
@@ -667,8 +650,6 @@
"custom": {
"custom": {
"axisLabel": "",
"axisLabel": "",
"axisPlacement": "left",
"axisPlacement": "left",
"axisSoftMax": -0.8,
"axisSoftMin": 0,
"barAlignment": 0,
"barAlignment": 0,
"drawStyle": "line",
"drawStyle": "line",
"fillOpacity": 60,
"fillOpacity": 60,
@@ -698,18 +679,20 @@
@@ -698,18 +679,20 @@
}
}
},
},
"mappings": [],
"mappings": [],
"max": 1.2,
"min": 0,
"thresholds": {
"thresholds": {
"mode": "absolute",
"mode": "absolute",
"steps": [
"steps": [
{
{
"color": "green",
"color": "green",
"value": null
"value": null
 
},
 
{
 
"color": "#EAB839",
 
"value": 7200
}
}
]
]
},
},
"unit": "short"
"unit": "s"
},
},
"overrides": []
"overrides": []
},
},
@@ -733,22 +716,19 @@
@@ -733,22 +716,19 @@
},
},
"targets": [
"targets": [
{
{
"datasource": {
"datasource": "Prometheus",
"type": "prometheus",
"uid": "000000001"
},
"exemplar": true,
"exemplar": true,
"expr": "node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"active\"}",
"expr": "abs((node_systemd_timer_last_trigger_seconds{name=\"borgbackup-job-daily.timer\"} - time())) * on (instance) node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"active\"}",
"interval": "",
"interval": "",
"legendFormat": "{{instance}}",
"legendFormat": "{{instance}}",
"refId": "A"
"refId": "JobRunTime"
}
}
],
],
"thresholds": [
"thresholds": [
{
{
"colorMode": "critical",
"colorMode": "critical",
"op": "gt",
"op": "gt",
"value": 0,
"value": 9000,
"visible": true
"visible": true
}
}
],
],
@@ -762,7 +742,7 @@
@@ -762,7 +742,7 @@
{
{
"evaluator": {
"evaluator": {
"params": [
"params": [
0
18000
],
],
"type": "gt"
"type": "gt"
},
},
@@ -771,9 +751,9 @@
@@ -771,9 +751,9 @@
},
},
"query": {
"query": {
"params": [
"params": [
"A",
"JobRunTime",
"5m",
"5m",
"now-6h"
"now"
]
]
},
},
"reducer": {
"reducer": {
@@ -787,12 +767,12 @@
@@ -787,12 +767,12 @@
"for": "5m",
"for": "5m",
"frequency": "1m",
"frequency": "1m",
"handler": 1,
"handler": 1,
"message": "A borg check-repo job ran for more than six hours. This means it could collide with the daily backup job, depending on that job's \"random\" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter).",
"message": "A borg check-repo job ran for more than five hours. After six hours it could collide with the daily backup job, depending on that job's \"random\" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter).",
"name": "Monthly check-repo run time alert",
"name": "Monthly check-repo run time alert",
"noDataState": "no_data",
"noDataState": "no_data",
"notifications": []
"notifications": []
},
},
"description": "When was the unit active?",
"description": "When was the systemd unit active?",
"fieldConfig": {
"fieldConfig": {
"defaults": {
"defaults": {
"color": {
"color": {
@@ -801,8 +781,6 @@
@@ -801,8 +781,6 @@
"custom": {
"custom": {
"axisLabel": "",
"axisLabel": "",
"axisPlacement": "left",
"axisPlacement": "left",
"axisSoftMax": -0.8,
"axisSoftMin": 0,
"barAlignment": 0,
"barAlignment": 0,
"drawStyle": "line",
"drawStyle": "line",
"fillOpacity": 60,
"fillOpacity": 60,
@@ -832,18 +810,20 @@
@@ -832,18 +810,20 @@
}
}
},
},
"mappings": [],
"mappings": [],
"max": 1.2,
"min": 0,
"thresholds": {
"thresholds": {
"mode": "absolute",
"mode": "absolute",
"steps": [
"steps": [
{
{
"color": "green",
"color": "green",
"value": null
"value": null
 
},
 
{
 
"color": "#EAB839",
 
"value": 15000
}
}
]
]
},
},
"unit": "short"
"unit": "s"
},
},
"overrides": []
"overrides": []
},
},
@@ -867,22 +847,19 @@
@@ -867,22 +847,19 @@
},
},
"targets": [
"targets": [
{
{
"datasource": {
"datasource": "Prometheus",
"type": "prometheus",
"uid": "000000001"
},
"exemplar": true,
"exemplar": true,
"expr": "node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"active\"}",
"expr": "abs((node_systemd_timer_last_trigger_seconds{name=\"borgbackup-check-repo.timer\"} - time())) * on (instance) node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"active\"}",
"interval": "",
"interval": "",
"legendFormat": "{{instance}}",
"legendFormat": "{{instance}}",
"refId": "A"
"refId": "JobRunTime"
}
}
],
],
"thresholds": [
"thresholds": [
{
{
"colorMode": "critical",
"colorMode": "critical",
"op": "gt",
"op": "gt",
"value": 0,
"value": 18000,
"visible": true
"visible": true
}
}
],
],
@@ -890,10 +867,7 @@
@@ -890,10 +867,7 @@
"type": "timeseries"
"type": "timeseries"
},
},
{
{
"datasource": {
"datasource": "Loki",
"type": "loki",
"uid": "000000002"
},
"description": "The \"duration\" that borgbackup status reports.",
"description": "The \"duration\" that borgbackup status reports.",
"fieldConfig": {
"fieldConfig": {
"defaults": {
"defaults": {
@@ -968,17 +942,14 @@
@@ -968,17 +942,14 @@
"pluginVersion": "8.4.7",
"pluginVersion": "8.4.7",
"targets": [
"targets": [
{
{
"datasource": {
"datasource": "Loki",
"type": "loki",
"uid": "000000002"
},
"expr": "{unit=\"borgbackup-job-daily.service\"} |= \"duration\" | pattern \"<_>\\\"duration\\\": <duration>,\"",
"expr": "{unit=\"borgbackup-job-daily.service\"} |= \"duration\" | pattern \"<_>\\\"duration\\\": <duration>,\"",
"legendFormat": "{{host}}",
"legendFormat": "{{host}}",
"queryType": "range",
"queryType": "range",
"refId": "A"
"refId": "A"
}
}
],
],
"title": "Daily backup job run time",
"title": "Daily backup job run time (as reported by borg)",
"transformations": [
"transformations": [
{
{
"id": "labelsToFields",
"id": "labelsToFields",
@@ -1009,10 +980,7 @@
@@ -1009,10 +980,7 @@
"type": "barchart"
"type": "barchart"
},
},
{
{
"datasource": {
"datasource": "Loki",
"type": "loki",
"uid": "000000002"
},
"fieldConfig": {
"fieldConfig": {
"defaults": {
"defaults": {
"color": {
"color": {
@@ -1021,29 +989,17 @@
@@ -1021,29 +989,17 @@
"custom": {
"custom": {
"axisLabel": "",
"axisLabel": "",
"axisPlacement": "auto",
"axisPlacement": "auto",
"barAlignment": 0,
"axisSoftMin": 0,
"drawStyle": "line",
"fillOpacity": 80,
"fillOpacity": 0,
"gradientMode": "none",
"gradientMode": "none",
"hideFrom": {
"hideFrom": {
"legend": false,
"legend": false,
"tooltip": false,
"tooltip": false,
"viz": false
"viz": false
},
},
"lineInterpolation": "linear",
"lineWidth": 1,
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"scaleDistribution": {
"type": "linear"
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
}
},
},
"mappings": [],
"mappings": [],
@@ -1068,33 +1024,35 @@
@@ -1068,33 +1024,35 @@
},
},
"id": 55,
"id": 55,
"options": {
"options": {
 
"barRadius": 0,
 
"barWidth": 0.97,
 
"groupWidth": 0.7,
"legend": {
"legend": {
"calcs": [],
"calcs": [],
"displayMode": "list",
"displayMode": "list",
"placement": "bottom"
"placement": "bottom"
},
},
 
"orientation": "auto",
 
"showValue": "auto",
 
"stacking": "none",
"tooltip": {
"tooltip": {
"mode": "multi",
"mode": "single",
"sort": "none"
"sort": "none"
}
},
 
"xTickLabelRotation": 0,
 
"xTickLabelSpacing": 0
},
},
"pluginVersion": "8.4.7",
"pluginVersion": "8.4.7",
"targets": [
"targets": [
{
{
"datasource": {
"datasource": "Loki",
"type": "loki",
"uid": "000000002"
},
"expr": "{unit=\"borgbackup-job-daily.service\"} |= \"compressed_size\" | pattern \"<_>\\\"compressed_size\\\": <compressed_size>,\"",
"expr": "{unit=\"borgbackup-job-daily.service\"} |= \"compressed_size\" | pattern \"<_>\\\"compressed_size\\\": <compressed_size>,\"",
"hide": false,
"hide": false,
"legendFormat": "{{host}} archive",
"legendFormat": "{{host}} archive",
"refId": "This archive size in bytes"
"refId": "This archive size in bytes"
},
},
{
{
"datasource": {
"datasource": "Loki",
"type": "loki",
"uid": "000000002"
},
"expr": "{unit=\"borgbackup-job-daily.service\"} |= \"unique_csize\" | pattern \"<_>\\\"unique_csize\\\": <unique_csize>,\"",
"expr": "{unit=\"borgbackup-job-daily.service\"} |= \"unique_csize\" | pattern \"<_>\\\"unique_csize\\\": <unique_csize>,\"",
"hide": false,
"hide": false,
"legendFormat": "{{host}} all archives",
"legendFormat": "{{host}} all archives",
@@ -1134,7 +1092,7 @@
@@ -1134,7 +1092,7 @@
"options": {}
"options": {}
}
}
],
],
"type": "timeseries"
"type": "barchart"
}
}
],
],
"refresh": "5m",
"refresh": "5m",
Loading