diff --git a/nixos/modules/monitoring/server/grafana-alert-rules.yaml b/nixos/modules/monitoring/server/grafana-alert-rules.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b4ec6dcdcd8d615d8a43d956e1092925bd04109 --- /dev/null +++ b/nixos/modules/monitoring/server/grafana-alert-rules.yaml @@ -0,0 +1,1154 @@ +apiVersion: 1 +groups: + - orgId: 1 + name: 15 min load average alert + folder: General Alerting + interval: 1m + rules: + - uid: mTy0TxX4k + title: 15 min load average alert + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + expr: node_load15 + interval: "" + intervalFactor: 1 + legendFormat: '{{instance}}' + refId: A + - refId: B + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 1 + type: gt + operator: + type: and + query: + params: + - A + reducer: + type: avg + refId: B + type: classic_conditions + dashboardUid: ResourcesOverview + panelId: 6 + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + __alertId__: "171" + __dashboardUid__: ResourcesOverview + __panelId__: "6" + message: "" + labels: + rule_uid: mTy0TxX4k + isPaused: false + - orgId: 1 + name: Corruption Advisory rate alert + folder: General Alerting + interval: 1m + rules: + - uid: LTyAoxXVz + title: Corruption Advisory rate alert + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + datasource: + type: prometheus + uid: LocalPrometheus + exemplar: true + expr: rate(tahoe_corruption_advisories_total[5m]) + interval: "" + intervalFactor: 1 + legendFormat: '{{instance}}' + refId: A + - refId: B + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - A + reducer: + type: avg + refId: B + type: classic_conditions + dashboardUid: TahoeLAFS + panelId: 46 + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + __alertId__: "181" + __dashboardUid__: TahoeLAFS + __panelId__: "46" + message: "" + labels: + rule_uid: LTyAoxXVz + isPaused: false + - orgId: 1 + name: Daily backup job run time alert + folder: General Alerting + interval: 1m + rules: + - uid: iAsATbu4z + title: Daily backup job run time alert + condition: A + data: + - refId: A + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 9000 + type: gt + operator: + type: and + query: + params: + - JobRunTime + reducer: + type: last + refId: A + type: classic_conditions + - refId: JobRunTime + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + datasource: + type: prometheus + uid: LocalPrometheus + exemplar: true + expr: abs((node_systemd_timer_last_trigger_seconds{name="borgbackup-job-daily.timer"} - time())) * on (instance) node_systemd_unit_state{name="borgbackup-job-daily.service", state="active"} + interval: "" + legendFormat: '{{instance}}' + refId: JobRunTime + dashboardUid: backups + panelId: 52 + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + __alertId__: "187" + __dashboardUid__: backups + __panelId__: "52" + message: A backup job ran for more than 2 ½ hours. After 3 hours it could run into the check-repo job start time, depending on its "random" job delay. 
+ labels: + rule_uid: iAsATbu4z + isPaused: false + - orgId: 1 + name: Daily backup jobs state alert + folder: General Alerting + interval: 1m + rules: + - uid: yos0TbXVz + title: Daily backup jobs state alert + condition: A + data: + - refId: A + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - Failed jobs + reducer: + type: last + refId: A + type: classic_conditions + - refId: Failed jobs + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + datasource: + type: prometheus + uid: LocalPrometheus + exemplar: true + expr: sum(node_systemd_unit_state{name="borgbackup-job-daily.service", state="failed"}) + interval: "" + legendFormat: Failed + refId: Failed jobs + dashboardUid: backups + panelId: 46 + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + __alertId__: "183" + __dashboardUid__: backups + __panelId__: "46" + message: "" + labels: + rule_uid: yos0TbXVz + isPaused: false + - orgId: 1 + name: Daily backup to Borgbase Trigger alert + folder: General Alerting + interval: 1m + rules: + - uid: 6oyAoxX4z + title: Daily backup to Borgbase Trigger alert + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + datasource: + type: prometheus + uid: LocalPrometheus + exemplar: false + expr: node_systemd_timer_last_trigger_seconds{name="borgbackup-job-daily.timer"} - time() + interval: "" + intervalFactor: 1 + legendFormat: '{{instance}}' + refId: A + - refId: B + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - -90000 + type: lt + operator: + type: and + query: + params: + - A + reducer: + type: avg + refId: B + type: classic_conditions + dashboardUid: backups + panelId: 41 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __alertId__: "185" + __dashboardUid__: backups + __panelId__: "41" + message: Borgbase.com daily backup job trigger. 
+ labels: + rule_uid: 6oyAoxX4z + isPaused: false + - orgId: 1 + name: Degraded RAID alert + folder: General Alerting + interval: 5m + rules: + - uid: xTsAobu4z + title: Degraded RAID alert + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + exemplar: true + expr: megacli_drives{state="Degraded"} + interval: "" + legendFormat: '{{instance}}' + refId: A + - refId: B + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - A + reducer: + type: last + refId: B + type: classic_conditions + dashboardUid: ResourcesOverview + panelId: 32 + noDataState: OK + execErrState: Alerting + for: 5m + annotations: + __alertId__: "177" + __dashboardUid__: ResourcesOverview + __panelId__: "32" + message: "" + labels: + rule_uid: xTsAobu4z + isPaused: false + - orgId: 1 + name: Filesystem usage % alert + folder: General Alerting + interval: 1m + rules: + - uid: 2oyAoxu4k + title: Filesystem usage % alert + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + expr: 1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes) > 0.1 + format: time_series + instant: false + interval: "" + intervalFactor: 2 + legendFormat: '{{instance}} {{mountpoint}} ' + refId: A + - refId: B + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 0.8 + type: gt + operator: + type: and + query: + params: + - A + reducer: + type: avg + refId: B + type: classic_conditions + dashboardUid: ResourcesOverview + panelId: 4 + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + __alertId__: "175" + __dashboardUid__: ResourcesOverview + __panelId__: "4" + message: "" + labels: + rule_uid: 2oyAoxu4k + isPaused: false + - orgId: 1 + name: Incident Reports rate alert + folder: General Alerting + interval: 1m + rules: + - uid: Eoy0Tbu4k + title: Incident Reports rate alert + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + datasource: + type: prometheus + uid: LocalPrometheus + exemplar: true + expr: rate(tahoe_incident_reports_total[5m]) + interval: "" + intervalFactor: 1 + legendFormat: '{{instance}}' + refId: A + - refId: B + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - A + reducer: + type: avg + refId: B + type: classic_conditions + dashboardUid: TahoeLAFS + panelId: 54 + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + __alertId__: "182" + __dashboardUid__: TahoeLAFS + __panelId__: "54" + message: "" + labels: + rule_uid: Eoy0Tbu4k + isPaused: false + - orgId: 1 + name: Monthly check of Borgbase backup Trigger alert + folder: General Alerting + interval: 1m + rules: + - uid: z0s0oxu4k + title: Monthly check of Borgbase backup Trigger alert + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + datasource: + type: prometheus + uid: LocalPrometheus + exemplar: false + expr: node_systemd_timer_last_trigger_seconds{name="borgbackup-check-repo.timer"} - time() + interval: "" + intervalFactor: 4 + legendFormat: '{{instance}}' + refId: A + - refId: B + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - -2.7648e+06 + type: lt + operator: + type: and + query: + params: + - A + reducer: + type: avg + refId: B + type: 
classic_conditions + dashboardUid: backups + panelId: 42 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __alertId__: "186" + __dashboardUid__: backups + __panelId__: "42" + message: Borgbase.com monthly check-repo trigger. + labels: + rule_uid: z0s0oxu4k + isPaused: false + - orgId: 1 + name: Monthly check-repo run time alert + folder: General Alerting + interval: 1m + rules: + - uid: M0y0obu4k + title: Monthly check-repo run time alert + condition: A + data: + - refId: A + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 18000 + type: gt + operator: + type: and + query: + params: + - JobRunTime + reducer: + type: last + refId: A + type: classic_conditions + - refId: JobRunTime + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + datasource: + type: prometheus + uid: LocalPrometheus + exemplar: true + expr: abs((node_systemd_timer_last_trigger_seconds{name="borgbackup-check-repo.timer"} - time())) * on (instance) node_systemd_unit_state{name="borgbackup-check-repo.service", state="active"} + interval: "" + legendFormat: '{{instance}}' + refId: JobRunTime + dashboardUid: backups + panelId: 53 + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + __alertId__: "188" + __dashboardUid__: backups + __panelId__: "53" + message: A borg check-repo job ran for more than five hours. After six hours it could collide with the daily backup job, depending on that job's "random" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter). + labels: + rule_uid: M0y0obu4k + isPaused: false + - orgId: 1 + name: Monthly check-repo timer state alert + folder: General Alerting + interval: 1m + rules: + - uid: 3TsAobu4z + title: Monthly check-repo timer state alert + condition: A + data: + - refId: A + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - Failed jobs + reducer: + type: last + refId: A + type: classic_conditions + - refId: Failed jobs + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + datasource: + type: prometheus + uid: LocalPrometheus + exemplar: true + expr: sum(node_systemd_unit_state{name="borgbackup-check-repo.service", state="failed"}) + interval: "" + legendFormat: Failed + refId: Failed jobs + dashboardUid: backups + panelId: 47 + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + __alertId__: "184" + __dashboardUid__: backups + __panelId__: "47" + message: "" + labels: + rule_uid: 3TsAobu4z + isPaused: false + - orgId: 1 + name: Network errors alert + folder: General Alerting + interval: 1m + rules: + - uid: cosAobXVz + title: Network errors alert + condition: E + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + datasource: + type: prometheus + uid: LocalPrometheus + exemplar: true + expr: | + rate(node_network_transmit_errs_total{device!="lo"}[5m]) + interval: "" + legendFormat: '{{instance}} {{device}}' + refId: A + - refId: B + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + datasource: + type: prometheus + uid: LocalPrometheus + exemplar: true + expr: rate(node_network_transmit_drop_total{device!="lo"}[5m]) + interval: "" + legendFormat: '{{instance}} {{device}}' + refId: B + - refId: C + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + 
datasource: + type: prometheus + uid: LocalPrometheus + exemplar: true + expr: '- rate(node_network_receive_drop_total{device!="lo"}[5m])' + interval: "" + legendFormat: '{{instance}} {{device}}' + refId: C + - refId: D + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + datasource: + type: prometheus + uid: LocalPrometheus + exemplar: true + expr: '- rate(node_network_receive_errs_total{device!="lo"}[5m])' + interval: "" + legendFormat: '{{instance}} {{device}}' + refId: D + - refId: E + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - -1 + - 1 + type: outside_range + operator: + type: and + query: + params: + - A + reducer: + type: avg + - evaluator: + params: + - -1 + - 1 + type: outside_range + operator: + type: or + query: + params: + - B + reducer: + type: avg + - evaluator: + params: + - -1 + - 1 + type: outside_range + operator: + type: or + query: + params: + - C + reducer: + type: avg + - evaluator: + params: + - -1 + - 1 + type: outside_range + operator: + type: or + query: + params: + - D + reducer: + type: avg + refId: E + type: classic_conditions + dashboardUid: ResourcesOverview + panelId: 10 + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + __alertId__: "174" + __dashboardUid__: ResourcesOverview + __panelId__: "10" + message: "" + labels: + rule_uid: cosAobXVz + isPaused: false + - orgId: 1 + name: Probe fails alert + folder: General Alerting + interval: 1m + rules: + - uid: aoy0TxuVz + title: Probe fails alert + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + exemplar: true + expr: count by (instance) (probe_http_status_code!=200 and probe_http_status_code!=401 and probe_http_status_code!=404) + interval: "" + legendFormat: '{{instance}}' + refId: A + - refId: B + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - A + reducer: + type: count + refId: B + type: classic_conditions + dashboardUid: ServicesOverview + panelId: 38 + noDataState: OK + execErrState: Alerting + for: 10m + annotations: + __alertId__: "179" + __dashboardUid__: ServicesOverview + __panelId__: "38" + message: "" + labels: + rule_uid: aoy0TxuVz + isPaused: false + - orgId: 1 + name: RAM filling up + folder: General Alerting + interval: 1m + rules: + - uid: GosAobuVk + title: RAM filling up + condition: A + data: + - refId: A + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 0.8 + type: gt + operator: + type: and + query: + params: + - Hosts without ZFS + reducer: + type: avg + - evaluator: + params: + - 0.8 + type: gt + operator: + type: and + query: + params: + - Hosts with ZFS + reducer: + type: avg + refId: A + type: classic_conditions + - refId: Hosts with ZFS + relativeTimeRange: + from: 900 + to: 0 + datasourceUid: LocalPrometheus + model: + datasource: + type: prometheus + uid: LocalPrometheus + exemplar: true + expr: 1 - (node_memory_MemAvailable_bytes + node_zfs_arc_size) / node_memory_MemTotal_bytes + instant: false + interval: "" + legendFormat: '{{instance}}' + refId: Hosts with ZFS + - refId: Hosts without ZFS + relativeTimeRange: + from: 900 + to: 0 + datasourceUid: LocalPrometheus + model: + datasource: + type: prometheus + uid: LocalPrometheus + exemplar: true + expr: 1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes unless node_zfs_arc_size + instant: false + interval: "" + intervalFactor: 4 + legendFormat: 
'{{instance}}' + refId: Hosts without ZFS + dashboardUid: ResourcesOverview + panelId: 2 + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + __alertId__: "172" + __dashboardUid__: ResourcesOverview + __panelId__: "2" + message: "" + labels: + rule_uid: GosAobuVk + isPaused: false + - orgId: 1 + name: Response times alert + folder: General Alerting + interval: 1m + rules: + - uid: boyATbXVzz + title: Response times alert + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + exemplar: true + expr: probe_duration_seconds + interval: "" + legendFormat: '{{instance}}' + refId: A + - refId: B + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 3.142 + type: gt + operator: + type: and + query: + params: + - A + reducer: + type: avg + refId: B + type: classic_conditions + dashboardUid: ServicesOverview + panelId: 36 + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + __alertId__: "178" + __dashboardUid__: ServicesOverview + __panelId__: "36" + message: "" + labels: + rule_uid: boyATbXVzz + isPaused: false + - orgId: 1 + name: Scraping down + folder: General Alerting + interval: 1m + rules: + - uid: gosAobuVk + title: Scraping down + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + exemplar: true + expr: count by (job, instance) (up == 0) + interval: "" + legendFormat: '{{job}}/{{instance}}' + refId: A + - refId: B + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - A + reducer: + type: count + refId: B + type: classic_conditions + dashboardUid: MetaMonitoring + panelId: 6 + noDataState: OK + execErrState: Alerting + for: 10m + annotations: + __alertId__: "169" + __dashboardUid__: MetaMonitoring + __panelId__: "6" + message: "" + labels: + rule_uid: gosAobuVk + isPaused: false + - orgId: 1 + name: Swap usage alert + folder: General Alerting + interval: 1m + rules: + - uid: OTs0TxXVz + title: Swap usage alert + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + exemplar: true + expr: 1 - node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes + interval: "" + intervalFactor: 1 + legendFormat: '{{instance}}' + refId: A + - refId: B + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 0.1 + type: gt + operator: + type: and + query: + params: + - A + reducer: + type: avg + refId: B + type: classic_conditions + dashboardUid: ResourcesOverview + panelId: 30 + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + __alertId__: "173" + __dashboardUid__: ResourcesOverview + __panelId__: "30" + message: "" + labels: + rule_uid: OTs0TxXVz + isPaused: false + - orgId: 1 + name: TLS certificate expiry alert + folder: General Alerting + interval: 1h + rules: + - uid: aoyAoxX4zz + title: TLS certificate expiry alert + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: LocalPrometheus + model: + exemplar: true + expr: probe_ssl_earliest_cert_expiry - time() + interval: "" + intervalFactor: 1 + legendFormat: '{{instance}}' + refId: A + - refId: B + datasourceUid: "-100" + model: + conditions: + - evaluator: + params: + - 2.4192e+06 + type: lt + operator: + type: and + query: + params: + - A + reducer: + type: avg + refId: B + type: classic_conditions + dashboardUid: ServicesOverview + 
panelId: 34
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "180"
+          __dashboardUid__: ServicesOverview
+          __panelId__: "34"
+          message: A TLS certificate is expiring within four weeks.
+        labels:
+          rule_uid: aoyAoxX4zz
+        isPaused: false
+  - orgId: 1
+    name: Textcollector staleness alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: kTsAoxuVz
+        title: Textcollector staleness alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: time() - node_textfile_mtime_seconds
+              interval: ""
+              legendFormat: '{{instance}}/{{file}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 600
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: last
+              refId: B
+              type: classic_conditions
+        dashboardUid: MetaMonitoring
+        panelId: 8
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "170"
+          __dashboardUid__: MetaMonitoring
+          __panelId__: "8"
+          message: A metrics text file is older than 10 minutes.
+        labels:
+          rule_uid: kTsAoxuVz
+        isPaused: false
+  - orgId: 1
+    name: User ciphertext usage % per node alert
+    folder: General Alerting
+    interval: 15m
+    rules:
+      - uid: 0oyATbXVk
+        title: User ciphertext usage % per node alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: 1 - (node_filesystem_avail_bytes{mountpoint="/storage"} / node_filesystem_size_bytes{mountpoint="/storage"})
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0.5
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: ResourcesOverview
+        panelId: 41
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "176"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "41"
+          message: ""
+        labels:
+          rule_uid: 0oyATbXVk
+        isPaused: false
diff --git a/nixos/modules/monitoring/server/grafana.nix b/nixos/modules/monitoring/server/grafana.nix
index 5299829ccc7fec081e9ccb4f0e41c66daa8a0251..64ac1d5b07cbf590ac3c446d92841ffe1653e2ac 100644
--- a/nixos/modules/monitoring/server/grafana.nix
+++ b/nixos/modules/monitoring/server/grafana.nix
@@ -195,6 +195,7 @@ in {
           url = "$__file{${toString cfg.grafanaZulipUrlFile}}";
         };
       }]);
+      alerting.rules.path = ./grafana-alert-rules.yaml;
     };
   };