From bc17a72360fc82782cef558b70b33fb6cf6470b1 Mon Sep 17 00:00:00 2001
From: Benoit Donneaux <benoit@leastauthority.com>
Date: Fri, 23 Feb 2024 22:39:16 +0100
Subject: [PATCH] Provision Grafana alert rules for all environments

Signed-off-by: Benoit Donneaux <benoit@leastauthority.com>
---
 .../server/grafana-alert-rules.yaml         | 1154 +++++++++++++++++
 nixos/modules/monitoring/server/grafana.nix |    1 +
 2 files changed, 1155 insertions(+)
 create mode 100644 nixos/modules/monitoring/server/grafana-alert-rules.yaml

diff --git a/nixos/modules/monitoring/server/grafana-alert-rules.yaml b/nixos/modules/monitoring/server/grafana-alert-rules.yaml
new file mode 100644
index 00000000..2b4ec6dc
--- /dev/null
+++ b/nixos/modules/monitoring/server/grafana-alert-rules.yaml
@@ -0,0 +1,1154 @@
+apiVersion: 1
+groups:
+  - orgId: 1
+    name: 15 min load average alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: mTy0TxX4k
+        title: 15 min load average alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              expr: node_load15
+              interval: ""
+              intervalFactor: 1
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 1
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: ResourcesOverview
+        panelId: 6
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "171"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "6"
+          message: ""
+        labels:
+          rule_uid: mTy0TxX4k
+        isPaused: false
+  - orgId: 1
+    name: Corruption Advisory rate alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: LTyAoxXVz
+        title: Corruption Advisory rate alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: rate(tahoe_corruption_advisories_total[5m])
+              interval: ""
+              intervalFactor: 1
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: TahoeLAFS
+        panelId: 46
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "181"
+          __dashboardUid__: TahoeLAFS
+          __panelId__: "46"
+          message: ""
+        labels:
+          rule_uid: LTyAoxXVz
+        isPaused: false
+  - orgId: 1
+    name: Daily backup job run time alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: iAsATbu4z
+        title: Daily backup job run time alert
+        condition: A
+        data:
+          - refId: A
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 9000
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - JobRunTime
+                  reducer:
+                    type: last
+              refId: A
+              type: classic_conditions
+          - refId: JobRunTime
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: abs((node_systemd_timer_last_trigger_seconds{name="borgbackup-job-daily.timer"} - time())) * on (instance) node_systemd_unit_state{name="borgbackup-job-daily.service", state="active"}
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: JobRunTime
+        dashboardUid: backups
+        panelId: 52
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "187"
+          __dashboardUid__: backups
+          __panelId__: "52"
+          message: A backup job ran for more than 2 ½ hours. After 3 hours it could run into the check-repo job start time, depending on its "random" job delay.
+        labels:
+          rule_uid: iAsATbu4z
+        isPaused: false
+  - orgId: 1
+    name: Daily backup jobs state alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: yos0TbXVz
+        title: Daily backup jobs state alert
+        condition: A
+        data:
+          - refId: A
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - Failed jobs
+                  reducer:
+                    type: last
+              refId: A
+              type: classic_conditions
+          - refId: Failed jobs
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: sum(node_systemd_unit_state{name="borgbackup-job-daily.service", state="failed"})
+              interval: ""
+              legendFormat: Failed
+              refId: Failed jobs
+        dashboardUid: backups
+        panelId: 46
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "183"
+          __dashboardUid__: backups
+          __panelId__: "46"
+          message: ""
+        labels:
+          rule_uid: yos0TbXVz
+        isPaused: false
+  - orgId: 1
+    name: Daily backup to Borgbase Trigger alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: 6oyAoxX4z
+        title: Daily backup to Borgbase Trigger alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: false
+              expr: node_systemd_timer_last_trigger_seconds{name="borgbackup-job-daily.timer"} - time()
+              interval: ""
+              intervalFactor: 1
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - -90000
+                    type: lt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: backups
+        panelId: 41
+        noDataState: NoData
+        execErrState: Error
+        for: 5m
+        annotations:
+          __alertId__: "185"
+          __dashboardUid__: backups
+          __panelId__: "41"
+          message: The Borgbase.com daily backup job has not been triggered for more than 25 hours.
+        labels:
+          rule_uid: 6oyAoxX4z
+        isPaused: false
+  - orgId: 1
+    name: Degraded RAID alert
+    folder: General Alerting
+    interval: 5m
+    rules:
+      - uid: xTsAobu4z
+        title: Degraded RAID alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              exemplar: true
+              expr: megacli_drives{state="Degraded"}
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: last
+              refId: B
+              type: classic_conditions
+        dashboardUid: ResourcesOverview
+        panelId: 32
+        noDataState: OK
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "177"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "32"
+          message: ""
+        labels:
+          rule_uid: xTsAobu4z
+        isPaused: false
+  - orgId: 1
+    name: Filesystem usage % alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: 2oyAoxu4k
+        title: Filesystem usage % alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              expr: 1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes) > 0.1
+              format: time_series
+              instant: false
+              interval: ""
+              intervalFactor: 2
+              legendFormat: '{{instance}} {{mountpoint}} '
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0.8
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: ResourcesOverview
+        panelId: 4
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "175"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "4"
+          message: ""
+        labels:
+          rule_uid: 2oyAoxu4k
+        isPaused: false
+  - orgId: 1
+    name: Incident Reports rate alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: Eoy0Tbu4k
+        title: Incident Reports rate alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: rate(tahoe_incident_reports_total[5m])
+              interval: ""
+              intervalFactor: 1
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: TahoeLAFS
+        panelId: 54
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "182"
+          __dashboardUid__: TahoeLAFS
+          __panelId__: "54"
+          message: ""
+        labels:
+          rule_uid: Eoy0Tbu4k
+        isPaused: false
+  - orgId: 1
+    name: Monthly check of Borgbase backup Trigger alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: z0s0oxu4k
+        title: Monthly check of Borgbase backup Trigger alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: false
+              expr: node_systemd_timer_last_trigger_seconds{name="borgbackup-check-repo.timer"} - time()
+              interval: ""
+              intervalFactor: 4
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - -2.7648e+06
+                    type: lt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: backups
+        panelId: 42
+        noDataState: NoData
+        execErrState: Error
+        for: 5m
+        annotations:
+          __alertId__: "186"
+          __dashboardUid__: backups
+          __panelId__: "42"
+          message: The Borgbase.com monthly check-repo job has not been triggered for more than 32 days.
+        labels:
+          rule_uid: z0s0oxu4k
+        isPaused: false
+  - orgId: 1
+    name: Monthly check-repo run time alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: M0y0obu4k
+        title: Monthly check-repo run time alert
+        condition: A
+        data:
+          - refId: A
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 18000
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - JobRunTime
+                  reducer:
+                    type: last
+              refId: A
+              type: classic_conditions
+          - refId: JobRunTime
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: abs((node_systemd_timer_last_trigger_seconds{name="borgbackup-check-repo.timer"} - time())) * on (instance) node_systemd_unit_state{name="borgbackup-check-repo.service", state="active"}
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: JobRunTime
+        dashboardUid: backups
+        panelId: 53
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "188"
+          __dashboardUid__: backups
+          __panelId__: "53"
+          message: A borg check-repo job ran for more than five hours. After six hours it could collide with the daily backup job, depending on that job's "random" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter).
+        labels:
+          rule_uid: M0y0obu4k
+        isPaused: false
+  - orgId: 1
+    name: Monthly check-repo timer state alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: 3TsAobu4z
+        title: Monthly check-repo timer state alert
+        condition: A
+        data:
+          - refId: A
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - Failed jobs
+                  reducer:
+                    type: last
+              refId: A
+              type: classic_conditions
+          - refId: Failed jobs
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: sum(node_systemd_unit_state{name="borgbackup-check-repo.service", state="failed"})
+              interval: ""
+              legendFormat: Failed
+              refId: Failed jobs
+        dashboardUid: backups
+        panelId: 47
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "184"
+          __dashboardUid__: backups
+          __panelId__: "47"
+          message: ""
+        labels:
+          rule_uid: 3TsAobu4z
+        isPaused: false
+  - orgId: 1
+    name: Network errors alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: cosAobXVz
+        title: Network errors alert
+        condition: E
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: |
+                rate(node_network_transmit_errs_total{device!="lo"}[5m])
+              interval: ""
+              legendFormat: '{{instance}} {{device}}'
+              refId: A
+          - refId: B
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: rate(node_network_transmit_drop_total{device!="lo"}[5m])
+              interval: ""
+              legendFormat: '{{instance}} {{device}}'
+              refId: B
+          - refId: C
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: '- rate(node_network_receive_drop_total{device!="lo"}[5m])'
+              interval: ""
+              legendFormat: '{{instance}} {{device}}'
+              refId: C
+          - refId: D
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: '- rate(node_network_receive_errs_total{device!="lo"}[5m])'
+              interval: ""
+              legendFormat: '{{instance}} {{device}}'
+              refId: D
+          - refId: E
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - -1
+                      - 1
+                    type: outside_range
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+                - evaluator:
+                    params:
+                      - -1
+                      - 1
+                    type: outside_range
+                  operator:
+                    type: or
+                  query:
+                    params:
+                      - B
+                  reducer:
+                    type: avg
+                - evaluator:
+                    params:
+                      - -1
+                      - 1
+                    type: outside_range
+                  operator:
+                    type: or
+                  query:
+                    params:
+                      - C
+                  reducer:
+                    type: avg
+                - evaluator:
+                    params:
+                      - -1
+                      - 1
+                    type: outside_range
+                  operator:
+                    type: or
+                  query:
+                    params:
+                      - D
+                  reducer:
+                    type: avg
+              refId: E
+              type: classic_conditions
+        dashboardUid: ResourcesOverview
+        panelId: 10
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "174"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "10"
+          message: ""
+        labels:
+          rule_uid: cosAobXVz
+        isPaused: false
+  - orgId: 1
+    name: Probe fails alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: aoy0TxuVz
+        title: Probe fails alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              exemplar: true
+              expr: count by (instance) (probe_http_status_code!=200 and probe_http_status_code!=401 and probe_http_status_code!=404)
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: count
+              refId: B
+              type: classic_conditions
+        dashboardUid: ServicesOverview
+        panelId: 38
+        noDataState: OK
+        execErrState: Alerting
+        for: 10m
+        annotations:
+          __alertId__: "179"
+          __dashboardUid__: ServicesOverview
+          __panelId__: "38"
+          message: ""
+        labels:
+          rule_uid: aoy0TxuVz
+        isPaused: false
+  - orgId: 1
+    name: RAM filling up
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: GosAobuVk
+        title: RAM filling up
+        condition: A
+        data:
+          - refId: A
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0.8
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - Hosts without ZFS
+                  reducer:
+                    type: avg
+                - evaluator:
+                    params:
+                      - 0.8
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - Hosts with ZFS
+                  reducer:
+                    type: avg
+              refId: A
+              type: classic_conditions
+          - refId: Hosts with ZFS
+            relativeTimeRange:
+              from: 900
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: 1 - (node_memory_MemAvailable_bytes + node_zfs_arc_size) / node_memory_MemTotal_bytes
+              instant: false
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: Hosts with ZFS
+          - refId: Hosts without ZFS
+            relativeTimeRange:
+              from: 900
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: 1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes unless node_zfs_arc_size
+              instant: false
+              interval: ""
+              intervalFactor: 4
+              legendFormat: '{{instance}}'
+              refId: Hosts without ZFS
+        dashboardUid: ResourcesOverview
+        panelId: 2
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "172"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "2"
+          message: ""
+        labels:
+          rule_uid: GosAobuVk
+        isPaused: false
+  - orgId: 1
+    name: Response times alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: boyATbXVzz
+        title: Response times alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              exemplar: true
+              expr: probe_duration_seconds
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 3.142
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: ServicesOverview
+        panelId: 36
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "178"
+          __dashboardUid__: ServicesOverview
+          __panelId__: "36"
+          message: ""
+        labels:
+          rule_uid: boyATbXVzz
+        isPaused: false
+  - orgId: 1
+    name: Scraping down
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: gosAobuVk
+        title: Scraping down
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              exemplar: true
+              expr: count by (job, instance) (up == 0)
+              interval: ""
+              legendFormat: '{{job}}/{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: count
+              refId: B
+              type: classic_conditions
+        dashboardUid: MetaMonitoring
+        panelId: 6
+        noDataState: OK
+        execErrState: Alerting
+        for: 10m
+        annotations:
+          __alertId__: "169"
+          __dashboardUid__: MetaMonitoring
+          __panelId__: "6"
+          message: ""
+        labels:
+          rule_uid: gosAobuVk
+        isPaused: false
+  - orgId: 1
+    name: Swap usage alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: OTs0TxXVz
+        title: Swap usage alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              exemplar: true
+              expr: 1 - node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes
+              interval: ""
+              intervalFactor: 1
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0.1
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: ResourcesOverview
+        panelId: 30
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "173"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "30"
+          message: ""
+        labels:
+          rule_uid: OTs0TxXVz
+        isPaused: false
+  - orgId: 1
+    name: TLS certificate expiry alert
+    folder: General Alerting
+    interval: 1h
+    rules:
+      - uid: aoyAoxX4zz
+        title: TLS certificate expiry alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              exemplar: true
+              expr: probe_ssl_earliest_cert_expiry - time()
+              interval: ""
+              intervalFactor: 1
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 2.4192e+06
+                    type: lt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: ServicesOverview
+        panelId: 34
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "180"
+          __dashboardUid__: ServicesOverview
+          __panelId__: "34"
+          message: A TLS certificate is expiring within four weeks.
+        labels:
+          rule_uid: aoyAoxX4zz
+        isPaused: false
+  - orgId: 1
+    name: Textcollector staleness alert
+    folder: General Alerting
+    interval: 1m
+    rules:
+      - uid: kTsAoxuVz
+        title: Textcollector staleness alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: time() - node_textfile_mtime_seconds
+              interval: ""
+              legendFormat: '{{instance}}/{{file}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 600
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: last
+              refId: B
+              type: classic_conditions
+        dashboardUid: MetaMonitoring
+        panelId: 8
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "170"
+          __dashboardUid__: MetaMonitoring
+          __panelId__: "8"
+          message: A metrics text file is older than 10 minutes.
+        labels:
+          rule_uid: kTsAoxuVz
+        isPaused: false
+  - orgId: 1
+    name: User ciphertext usage % per node alert
+    folder: General Alerting
+    interval: 15m
+    rules:
+      - uid: 0oyATbXVk
+        title: User ciphertext usage % per node alert
+        condition: B
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 300
+              to: 0
+            datasourceUid: LocalPrometheus
+            model:
+              datasource:
+                type: prometheus
+                uid: LocalPrometheus
+              exemplar: true
+              expr: 1 - (node_filesystem_avail_bytes{mountpoint="/storage"} / node_filesystem_size_bytes{mountpoint="/storage"})
+              interval: ""
+              legendFormat: '{{instance}}'
+              refId: A
+          - refId: B
+            datasourceUid: "-100"
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0.5
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - A
+                  reducer:
+                    type: avg
+              refId: B
+              type: classic_conditions
+        dashboardUid: ResourcesOverview
+        panelId: 41
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          __alertId__: "176"
+          __dashboardUid__: ResourcesOverview
+          __panelId__: "41"
+          message: ""
+        labels:
+          rule_uid: 0oyATbXVk
+        isPaused: false
diff --git a/nixos/modules/monitoring/server/grafana.nix b/nixos/modules/monitoring/server/grafana.nix
index 5299829c..64ac1d5b 100644
--- a/nixos/modules/monitoring/server/grafana.nix
+++ b/nixos/modules/monitoring/server/grafana.nix
@@ -195,6 +195,7 @@ in {
             url = "$__file{${toString cfg.grafanaZulipUrlFile}}";
           };
         }]);
+        alerting.rules.path = ./grafana-alert-rules.yaml;
       };
     };
-- 
GitLab
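
Note on wiring (a minimal sketch, not part of the patch): the single line added to grafana.nix relies on the NixOS Grafana module's file-based provisioning. Assuming the hunk above sits inside a services.grafana.provision attribute set (the enclosing attribute path is not visible in the hunk context), the surrounding structure would look roughly like this:

    services.grafana.provision = {
      enable = true;
      # Contact points (e.g. the Zulip webhook referenced via
      # cfg.grafanaZulipUrlFile) are provisioned in the hunk context above;
      # this patch adds the alert rules file alongside them.
      alerting.rules.path = ./grafana-alert-rules.yaml;
    };

With that in place, Grafana should read the YAML at startup and create the rules in the "General Alerting" folder of org 1, keyed by the uids above, on every environment built from this module.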