From bc17a72360fc82782cef558b70b33fb6cf6470b1 Mon Sep 17 00:00:00 2001
From: Benoit Donneaux <benoit@leastauthority.com>
Date: Fri, 23 Feb 2024 22:39:16 +0100
Subject: [PATCH] Provision Grafana alert rules for all environments

Signed-off-by: Benoit Donneaux <benoit@leastauthority.com>
---
 .../server/grafana-alert-rules.yaml         | 1154 +++++++++++++++++
 nixos/modules/monitoring/server/grafana.nix |    1 +
 2 files changed, 1155 insertions(+)
 create mode 100644 nixos/modules/monitoring/server/grafana-alert-rules.yaml

diff --git a/nixos/modules/monitoring/server/grafana-alert-rules.yaml b/nixos/modules/monitoring/server/grafana-alert-rules.yaml
new file mode 100644
index 00000000..2b4ec6dc
--- /dev/null
+++ b/nixos/modules/monitoring/server/grafana-alert-rules.yaml
@@ -0,0 +1,1154 @@
+apiVersion: 1
+groups:
+  - orgId: 1
+    name: 15 min load average alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: mTy0TxX4k
+        title: 15 min load average alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              expr: node_load15
+              interval: ""
+              intervalFactor: 1
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 1
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: ResourcesOverview
+        panelId: 6
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "171"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "6"
+          message: ""
+        labels:
+          rule_uid: mTy0TxX4k
+        isPaused: false
+  - orgId: 1
+    name: Corruption Advisory rate alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: LTyAoxXVz
+        title: Corruption Advisory rate alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: rate(tahoe_corruption_advisories_total[5m])
+              interval: ""
+              intervalFactor: 1
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: TahoeLAFS
+        panelId: 46
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "181"
+          __dashboardUid__: TahoeLAFS
+          __panelId__: "46"
+          message: ""
+        labels:
+          rule_uid: LTyAoxXVz
+        isPaused: false
+  - orgId: 1
+    name: Daily backup job run time alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: iAsATbu4z
+        title: Daily backup job run time alert
+        condition: A
+        data:
+          - refId: A
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 9000
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - JobRunTime
+                  reducer:
+                    type: last
+              refId: A
+              type: classic_conditions
+          - refId: JobRunTime
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: abs((node_systemd_timer_last_trigger_seconds{name="borgbackup-job-daily.timer"} - time())) * on (instance) node_systemd_unit_state{name="borgbackup-job-daily.service", state="active"}
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: JobRunTime
+        dashboardUid: backups
+        panelId: 52
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "187"
+          __dashboardUid__: backups
+          __panelId__: "52"
+          message: A backup job ran for more than 2 ½ hours. After 3 hours it could run into the check-repo job start time, depending on its "random" job delay.
+        labels:
+          rule_uid: iAsATbu4z
+        isPaused: false
+  - orgId: 1
+    name: Daily backup jobs state alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: yos0TbXVz
+        title: Daily backup jobs state alert
+        condition: A
+        data:
+          - refId: A
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - Failed jobs
+                  reducer:
+                    type: last
+              refId: A
+              type: classic_conditions
+          - refId: Failed jobs
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: sum(node_systemd_unit_state{name="borgbackup-job-daily.service", state="failed"})
+              interval: ""
+              legendFormat: Failed
+              refId: Failed jobs
+        dashboardUid: backups
+        panelId: 46
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "183"
+          __dashboardUid__: backups
+          __panelId__: "46"
+          message: ""
+        labels:
+          rule_uid: yos0TbXVz
+        isPaused: false
+  - orgId: 1
+    name: Daily backup to Borgbase Trigger alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: 6oyAoxX4z
+        title: Daily backup to Borgbase Trigger alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: false
+              expr: node_systemd_timer_last_trigger_seconds{name="borgbackup-job-daily.timer"} - time()
+              interval: ""
+              intervalFactor: 1
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - -90000
+                    type: lt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: backups
+        panelId: 41
+        noDataState: NoData
+        execErrState: Error
+        for: 5m
+        annotations:
+          __alertId__: "185"
+          __dashboardUid__: backups
+          __panelId__: "41"
+          message: The Borgbase.com daily backup job has not been triggered for more than 25 hours.
+        labels:
+          rule_uid: 6oyAoxX4z
+        isPaused: false
+  - orgId: 1
+    name: Degraded RAID alert
+    folder: General Alerting
+    interval: 5m
+    rules:
+      - uid: xTsAobu4z
+        title: Degraded RAID alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              exemplar: true
+              expr: megacli_drives{state="Degraded"}
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: last
+              refId: B
+              type: classic_conditions
+        dashboardUid: ResourcesOverview
+        panelId: 32
+        noDataState: OK
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "177"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "32"
+          message: ""
+        labels:
+          rule_uid: xTsAobu4z
+        isPaused: false
+  - orgId: 1
+    name: Filesystem usage % alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: 2oyAoxu4k
+        title: Filesystem usage % alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              expr: 1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes) > 0.1
+              format: time_series
+              instant: false
+              interval: ""
+              intervalFactor: 2
+              legendFormat: '{{instance}} {{mountpoint}} '
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0.8
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: ResourcesOverview
+        panelId: 4
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "175"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "4"
+          message: ""
+        labels:
+          rule_uid: 2oyAoxu4k
+        isPaused: false
+  - orgId: 1
+    name: Incident Reports rate alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: Eoy0Tbu4k
+        title: Incident Reports rate alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: rate(tahoe_incident_reports_total[5m])
+              interval: ""
+              intervalFactor: 1
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: TahoeLAFS
+        panelId: 54
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "182"
+          __dashboardUid__: TahoeLAFS
+          __panelId__: "54"
+          message: ""
+        labels:
+          rule_uid: Eoy0Tbu4k
+        isPaused: false
+  - orgId: 1
+    name: Monthly check of Borgbase backup Trigger alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: z0s0oxu4k
+        title: Monthly check of Borgbase backup Trigger alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: false
+              expr: node_systemd_timer_last_trigger_seconds{name="borgbackup-check-repo.timer"} - time()
+              interval: ""
+              intervalFactor: 4
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - -2.7648e+06
+                    type: lt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: backups
+        panelId: 42
+        noDataState: NoData
+        execErrState: Error
+        for: 5m
+        annotations:
+          __alertId__: "186"
+          __dashboardUid__: backups
+          __panelId__: "42"
+          message: The Borgbase.com monthly check-repo job has not been triggered for more than 32 days.
+        labels:
+          rule_uid: z0s0oxu4k
+        isPaused: false
+  - orgId: 1
+    name: Monthly check-repo run time alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: M0y0obu4k
+        title: Monthly check-repo run time alert
+        condition: A
+        data:
+          - refId: A
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 18000
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - JobRunTime
+                  reducer:
+                    type: last
+              refId: A
+              type: classic_conditions
+          - refId: JobRunTime
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: abs((node_systemd_timer_last_trigger_seconds{name="borgbackup-check-repo.timer"} - time())) * on (instance) node_systemd_unit_state{name="borgbackup-check-repo.service", state="active"}
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: JobRunTime
+        dashboardUid: backups
+        panelId: 53
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "188"
+          __dashboardUid__: backups
+          __panelId__: "53"
+          message: A borg check-repo job ran for more than five hours. After six hours it could collide with the daily backup job, depending on that job's "random" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter).
+        labels:
+          rule_uid: M0y0obu4k
+        isPaused: false
+  - orgId: 1
+    name: Monthly check-repo timer state alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: 3TsAobu4z
+        title: Monthly check-repo timer state alert
+        condition: A
+        data:
+          - refId: A
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - Failed jobs
+                  reducer:
+                    type: last
+              refId: A
+              type: classic_conditions
+          - refId: Failed jobs
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: sum(node_systemd_unit_state{name="borgbackup-check-repo.service", state="failed"})
+              interval: ""
+              legendFormat: Failed
+              refId: Failed jobs
+        dashboardUid: backups
+        panelId: 47
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "184"
+          __dashboardUid__: backups
+          __panelId__: "47"
+          message: ""
+        labels:
+          rule_uid: 3TsAobu4z
+        isPaused: false
+  - orgId: 1
+    name: Network errors alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: cosAobXVz
+        title: Network errors alert
+        condition: E
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: |
+                rate(node_network_transmit_errs_total{device!="lo"}[5m])
+              interval: ""
+              legendFormat: '{{instance}} {{device}}'
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: rate(node_network_transmit_drop_total{device!="lo"}[5m])
+              interval: ""
+              legendFormat: '{{instance}} {{device}}'
+              refId: B
+          - refId: C
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: '- rate(node_network_receive_drop_total{device!="lo"}[5m])'
+              interval: ""
+              legendFormat: '{{instance}} {{device}}'
+              refId: C
+          - refId: D
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: '- rate(node_network_receive_errs_total{device!="lo"}[5m])'
+              interval: ""
+              legendFormat: '{{instance}} {{device}}'
+              refId: D
+          - refId: E
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - -1
+                      - 1
+                    type: outside_range
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+                - evaluator:
+                    params:
+                      - -1
+                      - 1
+                    type: outside_range
+                  operator:
+                    type: or
+                  query:
+                    params:
+                      - B
+                  reducer:
+                    type: avg
+                - evaluator:
+                    params:
+                      - -1
+                      - 1
+                    type: outside_range
+                  operator:
+                    type: or
+                  query:
+                    params:
+                      - C
+                  reducer:
+                    type: avg
+                - evaluator:
+                    params:
+                      - -1
+                      - 1
+                    type: outside_range
+                  operator:
+                    type: or
+                  query:
+                    params:
+                      - D
+                  reducer:
+                    type: avg
+              refId: E
+              type: classic_conditions
+        dashboardUid: ResourcesOverview
+        panelId: 10
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "174"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "10"
+          message: ""
+        labels:
+          rule_uid: cosAobXVz
+        isPaused: false
+  - orgId: 1
+    name: Probe fails alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: aoy0TxuVz
+        title: Probe fails alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              exemplar: true
+              expr: count by (instance) (probe_http_status_code!=200 and probe_http_status_code!=401 and probe_http_status_code!=404)
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: count
+              refId: B
+              type: classic_conditions
+        dashboardUid: ServicesOverview
+        panelId: 38
+        noDataState: OK
+        execErrState: Alerting
+        for: 10m
+        annotations:
+          __alertId__: "179"
+          __dashboardUid__: ServicesOverview
+          __panelId__: "38"
+          message: ""
+        labels:
+          rule_uid: aoy0TxuVz
+        isPaused: false
+  - orgId: 1
+    name: RAM filling up
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: GosAobuVk
+        title: RAM filling up
+        condition: A
+        data:
+          - refId: A
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0.8
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - Hosts without ZFS
+                  reducer:
+                    type: avg
+                - evaluator:
+                    params:
+                      - 0.8
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - Hosts with ZFS
+                  reducer:
+                    type: avg
+              refId: A
+              type: classic_conditions
+          - refId: Hosts with ZFS
+            relativeTimeRange:
+              from: 900
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: 1 - (node_memory_MemAvailable_bytes + node_zfs_arc_size) / node_memory_MemTotal_bytes
+              instant: false
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: Hosts with ZFS
+          - refId: Hosts without ZFS
+            relativeTimeRange:
+              from: 900
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: 1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes unless node_zfs_arc_size
+              instant: false
+              interval: ""
+              intervalFactor: 4
+              legendFormat: '{{instance}}'
+              refId: Hosts without ZFS
+        dashboardUid: ResourcesOverview
+        panelId: 2
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "172"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "2"
+          message: ""
+        labels:
+          rule_uid: GosAobuVk
+        isPaused: false
+  - orgId: 1
+    name: Response times alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: boyATbXVzz
+        title: Response times alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              exemplar: true
+              expr: probe_duration_seconds
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 3.142
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: ServicesOverview
+        panelId: 36
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "178"
+          __dashboardUid__: ServicesOverview
+          __panelId__: "36"
+          message: ""
+        labels:
+          rule_uid: boyATbXVzz
+        isPaused: false
+  - orgId: 1
+    name: Scraping down
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: gosAobuVk
+        title: Scraping down
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              exemplar: true
+              expr: count by (job, instance) (up == 0)
+              interval: ""
+              legendFormat: '{{job}}/{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: count
+              refId: B
+              type: classic_conditions
+        dashboardUid: MetaMonitoring
+        panelId: 6
+        noDataState: OK
+        execErrState: Alerting
+        for: 10m
+        annotations:
+          __alertId__: "169"
+          __dashboardUid__: MetaMonitoring
+          __panelId__: "6"
+          message: ""
+        labels:
+          rule_uid: gosAobuVk
+        isPaused: false
+  - orgId: 1
+    name: Swap usage alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: OTs0TxXVz
+        title: Swap usage alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              exemplar: true
+              expr: 1 - node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes
+              interval: ""
+              intervalFactor: 1
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0.1
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: ResourcesOverview
+        panelId: 30
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "173"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "30"
+          message: ""
+        labels:
+          rule_uid: OTs0TxXVz
+        isPaused: false
+  - orgId: 1
+    name: TLS certificate expiry alert
+    folder: General Alerting
+    interval: 1h
+    rules:
+      - uid: aoyAoxX4zz
+        title: TLS certificate expiry alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              exemplar: true
+              expr: probe_ssl_earliest_cert_expiry - time()
+              interval: ""
+              intervalFactor: 1
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 2.4192e+06
+                    type: lt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: ServicesOverview
+        panelId: 34
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "180"
+          __dashboardUid__: ServicesOverview
+          __panelId__: "34"
+          message: A TLS certificate is expiring within four weeks.
+        labels:
+          rule_uid: aoyAoxX4zz
+        isPaused: false
+  - orgId: 1
+    name: Textcollector staleness alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: kTsAoxuVz
+        title: Textcollector staleness alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: time() - node_textfile_mtime_seconds
+              interval: ""
+              legendFormat: '{{instance}}/{{file}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 600
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: last
+              refId: B
+              type: classic_conditions
+        dashboardUid: MetaMonitoring
+        panelId: 8
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "170"
+          __dashboardUid__: MetaMonitoring
+          __panelId__: "8"
+          message: A metrics text file is older than 10 minutes.
+        labels:
+          rule_uid: kTsAoxuVz
+        isPaused: false
+  - orgId: 1
+    name: User ciphertext usage % per node alert
+    folder: General Alerting
+    interval: 15m
+    rules:
+      - uid: 0oyATbXVk
+        title: User ciphertext usage % per node alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: 1 - (node_filesystem_avail_bytes{mountpoint="/storage"} / node_filesystem_size_bytes{mountpoint="/storage"})
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0.5
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: ResourcesOverview
+        panelId: 41
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "176"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "41"
+          message: ""
+        labels:
+          rule_uid: 0oyATbXVk
+        isPaused: false
diff --git a/nixos/modules/monitoring/server/grafana.nix b/nixos/modules/monitoring/server/grafana.nix
index 5299829c..64ac1d5b 100644
--- a/nixos/modules/monitoring/server/grafana.nix
+++ b/nixos/modules/monitoring/server/grafana.nix
@@ -195,6 +195,7 @@ in {
             url = "$__file{${toString cfg.grafanaZulipUrlFile}}";
           };
         }]);
+        alerting.rules.path = ./grafana-alert-rules.yaml;
       };
     };
-- 
GitLab
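
Note on wiring (a minimal sketch, not part of the patch): the single line added to grafana.nix relies on the NixOS Grafana module's file-based provisioning. Assuming the hunk above sits inside a services.grafana.provision attribute set (the enclosing attribute path is not visible in the hunk context), the surrounding structure would look roughly like this:

    services.grafana.provision = {
      enable = true;
      # Contact points (e.g. the Zulip webhook referenced via
      # cfg.grafanaZulipUrlFile) are provisioned in the hunk context above;
      # this patch adds the alert rules file alongside them.
      alerting.rules.path = ./grafana-alert-rules.yaml;
    };

With that in place, Grafana should read the YAML at startup and create the rules in the "General Alerting" folder of org 1, keyed by the uids above, on every environment built from this module.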