diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 50a346987f40a460f38037b3cbc0a8f3a6e568b2..7e7348ffeeca9e8e39a16adabd7ce9b3eed0418f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -67,8 +67,9 @@ unit-tests: stage: "build" script: - | + set -x # GRID is set in one of the "instantiations" of this job template. - nix-shell --run "morph build --show-trace morph/grid/${GRID}/grid.nix" + nix-shell --pure --run "morph build --show-trace morph/grid/${GRID}/grid.nix" morph-build-localdev: @@ -111,7 +112,7 @@ system-tests: timeout: "3 hours" stage: "build" script: - - "nix-build --attr system-tests" + - "nix-shell --pure --run 'nix-build --attr system-tests'" # A template for a job that can update one of the grids. .update-grid: &UPDATE_GRID diff --git a/nixos/modules/monitoring/exporters/tahoe.nix b/nixos/modules/monitoring/exporters/tahoe.nix index a537600ccf45e1634c9d0748ae479dc37ba4715e..6a2cf6a45d8280aee2f073ba00d4dc2dff0ad9fb 100644 --- a/nixos/modules/monitoring/exporters/tahoe.nix +++ b/nixos/modules/monitoring/exporters/tahoe.nix @@ -67,6 +67,9 @@ in { NUM_CORRUPTION_ADVISORIES=$(find /storage/corruption-advisories/ -type f | wc -l) echo "tahoe_corruption_advisories_total $NUM_CORRUPTION_ADVISORIES" > "${cfg.outFile}.tmp" + NUM_INCIDENT_REPORTS=$(find /var/db/tahoe-lafs/storage/logs/incidents/ -type f | wc -l) + echo "tahoe_incident_reports_total $NUM_INCIDENT_REPORTS" >> "${cfg.outFile}.tmp" + curl --silent --show-error --fail-with-body "${cfg.scrapeEndpoint}" >> "${cfg.outFile}.tmp" mv "${cfg.outFile}.tmp" "${cfg.outFile}" ''; diff --git a/nixos/modules/monitoring/server/grafana-dashboards/backups.json b/nixos/modules/monitoring/server/grafana-dashboards/backups.json index df77fb3747d17ac8bd86848e966b470dceae8669..586cd4a59a92c33a507a9b0ee7b6eff52a8f4194 100644 --- a/nixos/modules/monitoring/server/grafana-dashboards/backups.json +++ b/nixos/modules/monitoring/server/grafana-dashboards/backups.json @@ -22,6 +22,7 @@ "editable": true, "fiscalYearStartMonth": 
0, "graphTooltip": 0, + "id": 54, "links": [], "liveNow": false, "panels": [ @@ -92,7 +93,7 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "linear", + "lineInterpolation": "stepAfter", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -162,10 +163,7 @@ "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-job-daily.timer\", state=~\"active\"})", "hide": false, @@ -174,10 +172,7 @@ "refId": "Active timers" }, { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"failed\"})", "hide": false, @@ -251,7 +246,7 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "linear", + "lineInterpolation": "stepAfter", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { @@ -318,10 +313,7 @@ "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-check-repo.timer\", state=\"active\"})", "hide": false, @@ -330,10 +322,7 @@ "refId": "Active timers" }, { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, "expr": "sum(node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"failed\"})", "hide": false, @@ -463,10 +452,7 @@ "pluginVersion": "8.3.5", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": false, "expr": "node_systemd_timer_last_trigger_seconds{name=\"borgbackup-job-daily.timer\"} - time()", "interval": "", @@ -598,10 +584,7 @@ "pluginVersion": "8.3.5", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + 
"datasource": "Prometheus", "exemplar": false, "expr": "node_systemd_timer_last_trigger_seconds{name=\"borgbackup-check-repo.timer\"} - time()", "interval": "", @@ -628,7 +611,7 @@ { "evaluator": { "params": [ - 0 + 9000 ], "type": "gt" }, @@ -637,8 +620,8 @@ }, "query": { "params": [ - "A", - "3h", + "JobRunTime", + "5m", "now" ] }, @@ -653,12 +636,12 @@ "for": "5m", "frequency": "1m", "handler": 1, - "message": "A backup job ran for more than three hours. This means it could run into the check-repo job start time, depending on its \"random\" job delay.", + "message": "A backup job ran for more than 2 ½ hours. After 3 hours it could run into the check-repo job start time, depending on its \"random\" job delay.", "name": "Daily backup job run time alert", "noDataState": "no_data", "notifications": [] }, - "description": "When was the unit active? With alerts", + "description": "When was the systemd unit active?", "fieldConfig": { "defaults": { "color": { @@ -667,8 +650,6 @@ "custom": { "axisLabel": "", "axisPlacement": "left", - "axisSoftMax": -0.8, - "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, @@ -698,18 +679,20 @@ } }, "mappings": [], - "max": 1.2, - "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null + }, + { + "color": "#EAB839", + "value": 7200 } ] }, - "unit": "short" + "unit": "s" }, "overrides": [] }, @@ -733,22 +716,19 @@ }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, - "expr": "node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"active\"}", + "expr": "abs((node_systemd_timer_last_trigger_seconds{name=\"borgbackup-job-daily.timer\"} - time())) * on (instance) node_systemd_unit_state{name=\"borgbackup-job-daily.service\", state=\"active\"}", "interval": "", "legendFormat": "{{instance}}", - "refId": "A" + "refId": "JobRunTime" } ], "thresholds": [ { "colorMode": "critical", "op": 
"gt", - "value": 0, + "value": 9000, "visible": true } ], @@ -762,7 +742,7 @@ { "evaluator": { "params": [ - 0 + 18000 ], "type": "gt" }, @@ -771,8 +751,8 @@ }, "query": { "params": [ - "A", - "6h", + "JobRunTime", + "5m", "now" ] }, @@ -787,12 +767,12 @@ "for": "5m", "frequency": "1m", "handler": 1, - "message": "A borg check-repo job ran for more than six hours. This means it could collide with the daily backup job, depending on that job's \"random\" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter).", + "message": "A borg check-repo job ran for more than five hours. After six hours it could collide with the daily backup job, depending on that job's \"random\" delay. If the backup set is large and this is expected to happen again, consider using borgbackup partial checks (--max-duration SECONDS parameter).", "name": "Monthly check-repo run time alert", "noDataState": "no_data", "notifications": [] }, - "description": "When was the unit active?", + "description": "When was the systemd unit active?", "fieldConfig": { "defaults": { "color": { @@ -801,8 +781,6 @@ "custom": { "axisLabel": "", "axisPlacement": "left", - "axisSoftMax": -0.8, - "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, @@ -832,18 +810,20 @@ } }, "mappings": [], - "max": 1.2, - "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null + }, + { + "color": "#EAB839", + "value": 15000 } ] }, - "unit": "short" + "unit": "s" }, "overrides": [] }, @@ -867,22 +847,19 @@ }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "000000001" - }, + "datasource": "Prometheus", "exemplar": true, - "expr": "node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"active\"}", + "expr": "abs((node_systemd_timer_last_trigger_seconds{name=\"borgbackup-check-repo.timer\"} - time())) * on (instance) 
node_systemd_unit_state{name=\"borgbackup-check-repo.service\", state=\"active\"}", "interval": "", "legendFormat": "{{instance}}", - "refId": "A" + "refId": "JobRunTime" } ], "thresholds": [ { "colorMode": "critical", "op": "gt", - "value": 0, + "value": 18000, "visible": true } ], @@ -890,10 +867,7 @@ "type": "timeseries" }, { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "description": "The \"duration\" that borgbackup status reports.", "fieldConfig": { "defaults": { @@ -968,17 +942,14 @@ "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"duration\" | pattern \"<_>\\\"duration\\\": <duration>,\"", "legendFormat": "{{host}}", "queryType": "range", "refId": "A" } ], - "title": "Daily backup job run time", + "title": "Daily backup job run time (as reported by borg)", "transformations": [ { "id": "labelsToFields", @@ -1009,10 +980,7 @@ "type": "barchart" }, { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "fieldConfig": { "defaults": { "color": { @@ -1021,29 +989,17 @@ "custom": { "axisLabel": "", "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, + "axisSoftMin": 0, + "fillOpacity": 80, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 5, "scaleDistribution": { "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" } }, "mappings": [], @@ -1068,33 +1024,35 @@ }, "id": 55, "options": { + "barRadius": 0, + "barWidth": 0.97, + "groupWidth": 0.7, "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", "tooltip": { - "mode": "multi", + 
"mode": "single", "sort": "none" - } + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 }, "pluginVersion": "8.4.7", "targets": [ { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"compressed_size\" | pattern \"<_>\\\"compressed_size\\\": <compressed_size>,\"", "hide": false, "legendFormat": "{{host}} archive", "refId": "This archive size in bytes" }, { - "datasource": { - "type": "loki", - "uid": "000000002" - }, + "datasource": "Loki", "expr": "{unit=\"borgbackup-job-daily.service\"} |= \"unique_csize\" | pattern \"<_>\\\"unique_csize\\\": <unique_csize>,\"", "hide": false, "legendFormat": "{{host}} all archives", @@ -1134,7 +1092,7 @@ "options": {} } ], - "type": "timeseries" + "type": "barchart" } ], "refresh": "5m", diff --git a/nixos/modules/monitoring/server/grafana-dashboards/resources-overview.json b/nixos/modules/monitoring/server/grafana-dashboards/resources-overview.json index 28af6aeffd37c5695bd27215dfb3b1fb917c555f..8547575f19363ce376caf6d341b1e9bf49434bcb 100644 --- a/nixos/modules/monitoring/server/grafana-dashboards/resources-overview.json +++ b/nixos/modules/monitoring/server/grafana-dashboards/resources-overview.json @@ -22,7 +22,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "id": 41, + "id": 125, "links": [], "liveNow": false, "panels": [ @@ -275,7 +275,30 @@ }, "query": { "params": [ - "A", + "Hosts without ZFS", + "15m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + }, + { + "evaluator": { + "params": [ + 0.8 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "Hosts with ZFS", "15m", "now" ] @@ -399,6 +422,14 @@ "refId": "Hosts with ZFS" } ], + "thresholds": [ + { + "colorMode": "critical", + "op": "gt", + "value": 0.8, + "visible": true + } + ], "title": "RAM used %", "type": "timeseries" }, diff --git 
a/nixos/modules/monitoring/server/grafana-dashboards/tahoe-lafs.json b/nixos/modules/monitoring/server/grafana-dashboards/tahoe-lafs.json index ad7966d08d862d0c791faceb077d7264763162a2..47f916bfe1694cf30ee764754ef6b7e270cc2b4c 100644 --- a/nixos/modules/monitoring/server/grafana-dashboards/tahoe-lafs.json +++ b/nixos/modules/monitoring/server/grafana-dashboards/tahoe-lafs.json @@ -22,7 +22,8 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "iteration": 1662390420143, + "id": 55, + "iteration": 1662661580921, "links": [], "liveNow": false, "panels": [ @@ -1072,6 +1073,10 @@ "type": "row" }, { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, "description": "File count of /storage/corruption-advisories/", "fieldConfig": { "defaults": { @@ -1191,6 +1196,10 @@ "noDataState": "no_data", "notifications": [] }, + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, "description": "Rate of new files in /storage/corruption-advisories/", "fieldConfig": { "defaults": { @@ -1280,6 +1289,231 @@ ], "title": "Corruption Advisory rate", "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 45 + }, + "id": 50, + "panels": [], + "title": "Incident Reports", + "type": "row" + }, + { + "datasource": {}, + "description": "File count of /var/db/tahoe-lafs/storage/logs/incidents/", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 60, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 46 + }, + "id": 53, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "exemplar": true, + "expr": "tahoe_incident_reports_total", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Incident Reports count", + "type": "timeseries" + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "name": "Incident Reports rate alert", + "noDataState": "no_data", + "notifications": [] + }, + "datasource": {}, + "description": "Rate of new files in /var/db/tahoe-lafs/storage/logs/incidents/", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 60, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + 
"h": 8, + "w": 12, + "x": 12, + "y": 46 + }, + "id": 54, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000001" + }, + "exemplar": true, + "expr": "rate(tahoe_incident_reports_total[5m])", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "op": "gt", + "value": 0, + "visible": true + } + ], + "title": "Incident Reports rate", + "type": "timeseries" } ], "schemaVersion": 35, @@ -1289,7 +1523,7 @@ "list": [ { "current": { - "selected": true, + "selected": false, "text": "storage1", "value": "storage1" }, diff --git a/nixos/modules/monitoring/server/grafana-service.nix b/nixos/modules/monitoring/server/grafana-service.nix new file mode 100644 index 0000000000000000000000000000000000000000..fd4055ee396ab4ea450a102402782045920bc851 --- /dev/null +++ b/nixos/modules/monitoring/server/grafana-service.nix @@ -0,0 +1,739 @@ +# This is the NixOS 21.11 Grafana service definition module +# with the backported UID setting for data sources, so that +# we can have the same dashboards in our dev, test and prod +# environments. +# +# The change from nixpkgs 81291cc793cf88bd6eff3fd8512e5eb9d037066c +# will land in NixOS 22.11. +# +# When upgrading PrivateStorageio to 22.05, this file will +# need an upgrade too. When upgrading PrivateStorageio to +# 22.11, it can be removed. + +{ options, config, lib, pkgs, ... 
}: + +with lib; + +let + cfg = config.services.grafana; + opt = options.services.grafana; + declarativePlugins = pkgs.linkFarm "grafana-plugins" (builtins.map (pkg: { name = pkg.pname; path = pkg; }) cfg.declarativePlugins); + useMysql = cfg.database.type == "mysql"; + usePostgresql = cfg.database.type == "postgres"; + + envOptions = { + PATHS_DATA = cfg.dataDir; + PATHS_PLUGINS = if builtins.isNull cfg.declarativePlugins then "${cfg.dataDir}/plugins" else declarativePlugins; + PATHS_LOGS = "${cfg.dataDir}/log"; + + SERVER_PROTOCOL = cfg.protocol; + SERVER_HTTP_ADDR = cfg.addr; + SERVER_HTTP_PORT = cfg.port; + SERVER_SOCKET = cfg.socket; + SERVER_DOMAIN = cfg.domain; + SERVER_ROOT_URL = cfg.rootUrl; + SERVER_STATIC_ROOT_PATH = cfg.staticRootPath; + SERVER_CERT_FILE = cfg.certFile; + SERVER_CERT_KEY = cfg.certKey; + + DATABASE_TYPE = cfg.database.type; + DATABASE_HOST = cfg.database.host; + DATABASE_NAME = cfg.database.name; + DATABASE_USER = cfg.database.user; + DATABASE_PASSWORD = cfg.database.password; + DATABASE_PATH = cfg.database.path; + DATABASE_CONN_MAX_LIFETIME = cfg.database.connMaxLifetime; + + SECURITY_ADMIN_USER = cfg.security.adminUser; + SECURITY_ADMIN_PASSWORD = cfg.security.adminPassword; + SECURITY_SECRET_KEY = cfg.security.secretKey; + + USERS_ALLOW_SIGN_UP = boolToString cfg.users.allowSignUp; + USERS_ALLOW_ORG_CREATE = boolToString cfg.users.allowOrgCreate; + USERS_AUTO_ASSIGN_ORG = boolToString cfg.users.autoAssignOrg; + USERS_AUTO_ASSIGN_ORG_ROLE = cfg.users.autoAssignOrgRole; + + AUTH_ANONYMOUS_ENABLED = boolToString cfg.auth.anonymous.enable; + AUTH_ANONYMOUS_ORG_NAME = cfg.auth.anonymous.org_name; + AUTH_ANONYMOUS_ORG_ROLE = cfg.auth.anonymous.org_role; + AUTH_GOOGLE_ENABLED = boolToString cfg.auth.google.enable; + AUTH_GOOGLE_ALLOW_SIGN_UP = boolToString cfg.auth.google.allowSignUp; + AUTH_GOOGLE_CLIENT_ID = cfg.auth.google.clientId; + + ANALYTICS_REPORTING_ENABLED = boolToString cfg.analytics.reporting.enable; + + SMTP_ENABLED = 
boolToString cfg.smtp.enable; + SMTP_HOST = cfg.smtp.host; + SMTP_USER = cfg.smtp.user; + SMTP_PASSWORD = cfg.smtp.password; + SMTP_FROM_ADDRESS = cfg.smtp.fromAddress; + } // cfg.extraOptions; + + datasourceConfiguration = { + apiVersion = 1; + datasources = cfg.provision.datasources; + }; + + datasourceFile = pkgs.writeText "datasource.yaml" (builtins.toJSON datasourceConfiguration); + + dashboardConfiguration = { + apiVersion = 1; + providers = cfg.provision.dashboards; + }; + + dashboardFile = pkgs.writeText "dashboard.yaml" (builtins.toJSON dashboardConfiguration); + + notifierConfiguration = { + apiVersion = 1; + notifiers = cfg.provision.notifiers; + }; + + notifierFile = pkgs.writeText "notifier.yaml" (builtins.toJSON notifierConfiguration); + + provisionConfDir = pkgs.runCommand "grafana-provisioning" { } '' + mkdir -p $out/{datasources,dashboards,notifiers} + ln -sf ${datasourceFile} $out/datasources/datasource.yaml + ln -sf ${dashboardFile} $out/dashboards/dashboard.yaml + ln -sf ${notifierFile} $out/notifiers/notifier.yaml + ''; + + # Get a submodule without any embedded metadata: + _filter = x: filterAttrs (k: v: k != "_module") x; + + # http://docs.grafana.org/administration/provisioning/#datasources + grafanaTypes.datasourceConfig = types.submodule { + options = { + name = mkOption { + type = types.str; + description = "Name of the datasource. Required."; + }; + type = mkOption { + type = types.str; + description = "Datasource type. Required."; + }; + access = mkOption { + type = types.enum ["proxy" "direct"]; + default = "proxy"; + description = "Access mode. proxy or direct (Server or Browser in the UI). Required."; + }; + orgId = mkOption { + type = types.int; + default = 1; + description = "Org id. 
will default to orgId 1 if not specified."; + }; + uid = mkOption { + type = types.nullOr types.str; + default = null; + description = "Custom UID which can be used to reference this datasource in other parts of the configuration, if not specified will be generated automatically."; + }; + url = mkOption { + type = types.str; + description = "Url of the datasource."; + }; + password = mkOption { + type = types.nullOr types.str; + default = null; + description = "Database password, if used."; + }; + user = mkOption { + type = types.nullOr types.str; + default = null; + description = "Database user, if used."; + }; + database = mkOption { + type = types.nullOr types.str; + default = null; + description = "Database name, if used."; + }; + basicAuth = mkOption { + type = types.nullOr types.bool; + default = null; + description = "Enable/disable basic auth."; + }; + basicAuthUser = mkOption { + type = types.nullOr types.str; + default = null; + description = "Basic auth username."; + }; + basicAuthPassword = mkOption { + type = types.nullOr types.str; + default = null; + description = "Basic auth password."; + }; + withCredentials = mkOption { + type = types.bool; + default = false; + description = "Enable/disable with credentials headers."; + }; + isDefault = mkOption { + type = types.bool; + default = false; + description = "Mark as default datasource. 
Max one per org."; + }; + jsonData = mkOption { + type = types.nullOr types.attrs; + default = null; + description = "Datasource specific configuration."; + }; + secureJsonData = mkOption { + type = types.nullOr types.attrs; + default = null; + description = "Datasource specific secure configuration."; + }; + version = mkOption { + type = types.int; + default = 1; + description = "Version."; + }; + editable = mkOption { + type = types.bool; + default = false; + description = "Allow users to edit datasources from the UI."; + }; + }; + }; + + # http://docs.grafana.org/administration/provisioning/#dashboards + grafanaTypes.dashboardConfig = types.submodule { + options = { + name = mkOption { + type = types.str; + default = "default"; + description = "Provider name."; + }; + orgId = mkOption { + type = types.int; + default = 1; + description = "Organization ID."; + }; + folder = mkOption { + type = types.str; + default = ""; + description = "Add dashboards to the specified folder."; + }; + type = mkOption { + type = types.str; + default = "file"; + description = "Dashboard provider type."; + }; + disableDeletion = mkOption { + type = types.bool; + default = false; + description = "Disable deletion when JSON file is removed."; + }; + updateIntervalSeconds = mkOption { + type = types.int; + default = 10; + description = "How often Grafana will scan for changed dashboards."; + }; + options = { + path = mkOption { + type = types.path; + description = "Path grafana will watch for dashboards."; + }; + }; + }; + }; + + grafanaTypes.notifierConfig = types.submodule { + options = { + name = mkOption { + type = types.str; + default = "default"; + description = "Notifier name."; + }; + type = mkOption { + type = types.enum ["dingding" "discord" "email" "googlechat" "hipchat" "kafka" "line" "teams" "opsgenie" "pagerduty" "prometheus-alertmanager" "pushover" "sensu" "sensugo" "slack" "telegram" "threema" "victorops" "webhook"]; + description = "Notifier type."; + }; + uid = 
mkOption { + type = types.str; + description = "Unique notifier identifier."; + }; + org_id = mkOption { + type = types.int; + default = 1; + description = "Organization ID."; + }; + org_name = mkOption { + type = types.str; + default = "Main Org."; + description = "Organization name."; + }; + is_default = mkOption { + type = types.bool; + description = "Is the default notifier."; + default = false; + }; + send_reminder = mkOption { + type = types.bool; + default = true; + description = "Should the notifier be sent reminder notifications while alerts continue to fire."; + }; + frequency = mkOption { + type = types.str; + default = "5m"; + description = "How frequently should the notifier be sent reminders."; + }; + disable_resolve_message = mkOption { + type = types.bool; + default = false; + description = "Turn off the message that sends when an alert returns to OK."; + }; + settings = mkOption { + type = types.nullOr types.attrs; + default = null; + description = "Settings for the notifier type."; + }; + secure_settings = mkOption { + type = types.nullOr types.attrs; + default = null; + description = "Secure settings for the notifier type."; + }; + }; + }; +in { + options.services.grafana = { + enable = mkEnableOption "grafana"; + + protocol = mkOption { + description = "Which protocol to listen."; + default = "http"; + type = types.enum ["http" "https" "socket"]; + }; + + addr = mkOption { + description = "Listening address."; + default = "127.0.0.1"; + type = types.str; + }; + + port = mkOption { + description = "Listening port."; + default = 3000; + type = types.port; + }; + + socket = mkOption { + description = "Listening socket."; + default = "/run/grafana/grafana.sock"; + type = types.str; + }; + + domain = mkOption { + description = "The public facing domain name used to access grafana from a browser."; + default = "localhost"; + type = types.str; + }; + + rootUrl = mkOption { + description = "Full public facing url."; + default = 
"%(protocol)s://%(domain)s:%(http_port)s/"; + type = types.str; + }; + + certFile = mkOption { + description = "Cert file for ssl."; + default = ""; + type = types.str; + }; + + certKey = mkOption { + description = "Cert key for ssl."; + default = ""; + type = types.str; + }; + + staticRootPath = mkOption { + description = "Root path for static assets."; + default = "${cfg.package}/share/grafana/public"; + defaultText = literalExpression ''"''${package}/share/grafana/public"''; + type = types.str; + }; + + package = mkOption { + description = "Package to use."; + default = pkgs.grafana; + defaultText = literalExpression "pkgs.grafana"; + type = types.package; + }; + + declarativePlugins = mkOption { + type = with types; nullOr (listOf path); + default = null; + description = "If non-null, then a list of packages containing Grafana plugins to install. If set, plugins cannot be manually installed."; + example = literalExpression "with pkgs.grafanaPlugins; [ grafana-piechart-panel ]"; + # Make sure each plugin is added only once; otherwise building + # the link farm fails, since the same path is added multiple + # times. + apply = x: if isList x then lib.unique x else x; + }; + + dataDir = mkOption { + description = "Data directory."; + default = "/var/lib/grafana"; + type = types.path; + }; + + database = { + type = mkOption { + description = "Database type."; + default = "sqlite3"; + type = types.enum ["mysql" "sqlite3" "postgres"]; + }; + + host = mkOption { + description = "Database host."; + default = "127.0.0.1:3306"; + type = types.str; + }; + + name = mkOption { + description = "Database name."; + default = "grafana"; + type = types.str; + }; + + user = mkOption { + description = "Database user."; + default = "root"; + type = types.str; + }; + + password = mkOption { + description = '' + Database password. + This option is mutual exclusive with the passwordFile option. 
+ ''; + default = ""; + type = types.str; + }; + + passwordFile = mkOption { + description = '' + File that containts the database password. + This option is mutual exclusive with the password option. + ''; + default = null; + type = types.nullOr types.path; + }; + + path = mkOption { + description = "Database path."; + default = "${cfg.dataDir}/data/grafana.db"; + type = types.path; + }; + + connMaxLifetime = mkOption { + description = '' + Sets the maximum amount of time (in seconds) a connection may be reused. + For MySQL this setting should be shorter than the `wait_timeout' variable. + ''; + default = "unlimited"; + example = 14400; + type = types.either types.int (types.enum [ "unlimited" ]); + }; + }; + + provision = { + enable = mkEnableOption "provision"; + datasources = mkOption { + description = "Grafana datasources configuration."; + default = []; + type = types.listOf grafanaTypes.datasourceConfig; + apply = x: map _filter x; + }; + dashboards = mkOption { + description = "Grafana dashboard configuration."; + default = []; + type = types.listOf grafanaTypes.dashboardConfig; + apply = x: map _filter x; + }; + notifiers = mkOption { + description = "Grafana notifier configuration."; + default = []; + type = types.listOf grafanaTypes.notifierConfig; + apply = x: map _filter x; + }; + }; + + security = { + adminUser = mkOption { + description = "Default admin username."; + default = "admin"; + type = types.str; + }; + + adminPassword = mkOption { + description = '' + Default admin password. + This option is mutual exclusive with the adminPasswordFile option. + ''; + default = "admin"; + type = types.str; + }; + + adminPasswordFile = mkOption { + description = '' + Default admin password. + This option is mutual exclusive with the <literal>adminPassword</literal> option. 
+ ''; + default = null; + type = types.nullOr types.path; + }; + + secretKey = mkOption { + description = "Secret key used for signing."; + default = "SW2YcwTIb9zpOOhoPsMm"; + type = types.str; + }; + + secretKeyFile = mkOption { + description = "Secret key used for signing."; + default = null; + type = types.nullOr types.path; + }; + }; + + smtp = { + enable = mkEnableOption "smtp"; + host = mkOption { + description = "Host to connect to."; + default = "localhost:25"; + type = types.str; + }; + user = mkOption { + description = "User used for authentication."; + default = ""; + type = types.str; + }; + password = mkOption { + description = '' + Password used for authentication. + This option is mutual exclusive with the passwordFile option. + ''; + default = ""; + type = types.str; + }; + passwordFile = mkOption { + description = '' + Password used for authentication. + This option is mutual exclusive with the password option. + ''; + default = null; + type = types.nullOr types.path; + }; + fromAddress = mkOption { + description = "Email address used for sending."; + default = "admin@grafana.localhost"; + type = types.str; + }; + }; + + users = { + allowSignUp = mkOption { + description = "Disable user signup / registration."; + default = false; + type = types.bool; + }; + + allowOrgCreate = mkOption { + description = "Whether user is allowed to create organizations."; + default = false; + type = types.bool; + }; + + autoAssignOrg = mkOption { + description = "Whether to automatically assign new users to default org."; + default = true; + type = types.bool; + }; + + autoAssignOrgRole = mkOption { + description = "Default role new users will be auto assigned."; + default = "Viewer"; + type = types.enum ["Viewer" "Editor"]; + }; + }; + + auth = { + anonymous = { + enable = mkOption { + description = "Whether to allow anonymous access."; + default = false; + type = types.bool; + }; + org_name = mkOption { + description = "Which organization to allow anonymous access 
to."; + default = "Main Org."; + type = types.str; + }; + org_role = mkOption { + description = "Which role anonymous users have in the organization."; + default = "Viewer"; + type = types.str; + }; + }; + google = { + enable = mkOption { + description = "Whether to allow Google OAuth2."; + default = false; + type = types.bool; + }; + allowSignUp = mkOption { + description = "Whether to allow sign up with Google OAuth2."; + default = false; + type = types.bool; + }; + clientId = mkOption { + description = "Google OAuth2 client ID."; + default = ""; + type = types.str; + }; + clientSecretFile = mkOption { + description = "File containing the Google OAuth2 client secret."; + default = null; + type = types.nullOr types.path; + }; + }; + }; + + analytics.reporting = { + enable = mkOption { + description = "Whether to allow anonymous usage reporting to stats.grafana.net."; + default = true; + type = types.bool; + }; + }; + + extraOptions = mkOption { + description = '' + Extra configuration options passed as env variables as specified in + <link xlink:href="http://docs.grafana.org/installation/configuration/">documentation</link>, + but without GF_ prefix + ''; + default = {}; + type = with types; attrsOf (either str path); + }; + }; + + config = mkIf cfg.enable { + warnings = flatten [ /* each entry fires when a secret was supplied inline instead of left at its default */ + (optional ( + cfg.database.password != opt.database.password.default || + cfg.security.adminPassword != opt.security.adminPassword.default + ) "Grafana passwords will be stored as plaintext in the Nix store!") + (optional ( + any (x: x.password != null || x.basicAuthPassword != null || x.secureJsonData != null) cfg.provision.datasources + ) "Datasource passwords will be stored as plaintext in the Nix store!") + (optional ( + any (x: x.secure_settings != null) cfg.provision.notifiers + ) "Notifier secure settings will be stored as plaintext in the Nix store!") + ]; + + environment.systemPackages = [ cfg.package ]; + + assertions = [ /* an inline secret changed from its default implies the matching file option is unset */ + { + assertion = cfg.database.password != opt.database.password.default 
-> cfg.database.passwordFile == null; + message = "Cannot set both database.password and database.passwordFile"; + } + { + assertion = cfg.security.adminPassword != opt.security.adminPassword.default -> cfg.security.adminPasswordFile == null; + message = "Cannot set both adminPassword and adminPasswordFile"; + } + { + assertion = cfg.security.secretKey != opt.security.secretKey.default -> cfg.security.secretKeyFile == null; + message = "Cannot set both secretKey and secretKeyFile"; + } + { + assertion = cfg.smtp.password != opt.smtp.password.default -> cfg.smtp.passwordFile == null; + message = "Cannot set both smtp.password and smtp.passwordFile"; + } + ]; + + systemd.services.grafana = { + description = "Grafana Service Daemon"; + wantedBy = ["multi-user.target"]; + after = ["network.target"] ++ lib.optional usePostgresql "postgresql.service" ++ lib.optional useMysql "mysql.service"; /* systemd defines network.target; the former "networking.target" is not a systemd unit, so ordering after it had no effect */ + environment = { + QT_QPA_PLATFORM = "offscreen"; + } // mapAttrs' (n: v: nameValuePair "GF_${n}" (toString v)) envOptions; + script = '' + set -o errexit -o pipefail -o nounset -o errtrace + shopt -s inherit_errexit + + ${optionalString (cfg.auth.google.clientSecretFile != null) '' + GF_AUTH_GOOGLE_CLIENT_SECRET="$(<${escapeShellArg cfg.auth.google.clientSecretFile})" + export GF_AUTH_GOOGLE_CLIENT_SECRET + ''} + ${optionalString (cfg.database.passwordFile != null) '' + GF_DATABASE_PASSWORD="$(<${escapeShellArg cfg.database.passwordFile})" + export GF_DATABASE_PASSWORD + ''} + ${optionalString (cfg.security.adminPasswordFile != null) '' + GF_SECURITY_ADMIN_PASSWORD="$(<${escapeShellArg cfg.security.adminPasswordFile})" + export GF_SECURITY_ADMIN_PASSWORD + ''} + ${optionalString (cfg.security.secretKeyFile != null) '' + GF_SECURITY_SECRET_KEY="$(<${escapeShellArg cfg.security.secretKeyFile})" + export GF_SECURITY_SECRET_KEY + ''} + ${optionalString (cfg.smtp.passwordFile != null) '' + GF_SMTP_PASSWORD="$(<${escapeShellArg cfg.smtp.passwordFile})" + export GF_SMTP_PASSWORD + ''} + ${optionalString 
cfg.provision.enable '' + export GF_PATHS_PROVISIONING=${provisionConfDir}; + ''} + exec ${cfg.package}/bin/grafana-server -homepath ${cfg.dataDir} + ''; + serviceConfig = { + WorkingDirectory = cfg.dataDir; + User = "grafana"; + RuntimeDirectory = "grafana"; + RuntimeDirectoryMode = "0755"; + # Hardening + AmbientCapabilities = lib.mkIf (cfg.port < 1024) [ "CAP_NET_BIND_SERVICE" ]; + CapabilityBoundingSet = if (cfg.port < 1024) then [ "CAP_NET_BIND_SERVICE" ] else [ "" ]; /* CAP_NET_BIND_SERVICE is kept only when cfg.port is below 1024 */ + DeviceAllow = [ "" ]; + LockPersonality = true; + NoNewPrivileges = true; + PrivateDevices = true; + PrivateTmp = true; + ProtectClock = true; + ProtectControlGroups = true; + ProtectHome = true; + ProtectHostname = true; + ProtectKernelLogs = true; + ProtectKernelModules = true; + ProtectKernelTunables = true; + ProtectProc = "invisible"; + ProtectSystem = "full"; + RemoveIPC = true; + RestrictAddressFamilies = [ "AF_INET" "AF_INET6" "AF_UNIX" ]; + RestrictNamespaces = true; + RestrictRealtime = true; + RestrictSUIDSGID = true; + SystemCallArchitectures = "native"; + # Upstream grafana is not setting SystemCallFilter for compatibility + # reasons, see https://github.com/grafana/grafana/pull/40176 + SystemCallFilter = [ "@system-service" "~@privileged" "~@resources" ]; + UMask = "0027"; + }; /* preStart below force-recreates the conf and tools symlinks in dataDir so they point at the current package */ + preStart = '' + ln -fs ${cfg.package}/share/grafana/conf ${cfg.dataDir} + ln -fs ${cfg.package}/share/grafana/tools ${cfg.dataDir} + ''; + }; + + users.users.grafana = { /* account named by User in serviceConfig; its home is the Grafana dataDir */ + uid = config.ids.uids.grafana; + description = "Grafana user"; + home = cfg.dataDir; + createHome = true; + group = "grafana"; + }; + users.groups.grafana = {}; + }; +} diff --git a/nixos/modules/monitoring/server/grafana.nix b/nixos/modules/monitoring/server/grafana.nix index 0923885f86d9bcebc4d3df590c71fbcddd8d1df8..9f8bf6b5242db919afe18b6f5031a15d09cdb539 100644 --- a/nixos/modules/monitoring/server/grafana.nix +++ b/nixos/modules/monitoring/server/grafana.nix @@ -19,6 +19,12 @@ let }; in { + + # Override Grafana module so we can 
specify datasource UIDs + # Copied from https://nixos.org/manual/nixos/stable/#sec-replace-modules + disabledModules = [ "services/monitoring/grafana.nix" ]; + imports = [ ./grafana-service.nix ]; + options.services.private-storage.monitoring.grafana = { domains = lib.mkOption { type = lib.types.listOf lib.types.str; diff --git a/nixos/modules/monitoring/server/prometheus.nix b/nixos/modules/monitoring/server/prometheus.nix index 2a78dd3e797c0b28d14fc9e9e0858811ac86ef76..fac29c29ffaae507549f78826edd3e838ccb4e6c 100644 --- a/nixos/modules/monitoring/server/prometheus.nix +++ b/nixos/modules/monitoring/server/prometheus.nix @@ -13,6 +13,7 @@ let regex = "^(.*)(?:\\.monitoringvpn):\\d+$"; target_label = "instance"; }; + logRetention = toString(config.services.private-storage.monitoring.policy.logRetentionSeconds) + "s"; in { options.services.private-storage.monitoring.prometheus = { @@ -44,6 +45,7 @@ in { services.prometheus = { enable = true; # port = 9090; # Option only in recent (20.09?) nixpkgs, 9090 default + retentionTime = logRetention; scrapeConfigs = [ { job_name = "node-exporters"; diff --git a/nixpkgs-2105.nix b/nixpkgs-2105.nix deleted file mode 100644 index e33347a21c29186826256e60bea0122fc85322bd..0000000000000000000000000000000000000000 --- a/nixpkgs-2105.nix +++ /dev/null @@ -1,6 +0,0 @@ -# This actually imports nixos-21.11 but we need to keep this file around so that -# upgrades work, as the on-node deployment script expects this file in the checkout. -# See https://whetstone.private.storage/privatestorage/PrivateStorageio/-/merge_requests/222#note_18600 -# This file can be removed once all nodes have been updated to point to the new file. - -import ./nixpkgs.nix diff --git a/shell.nix b/shell.nix index b8be3a3a6088a987468329ad29919a6957313c6a..d23d71f2a08f548f9fd95c6a95490f2de7f8b60a 100644 --- a/shell.nix +++ b/shell.nix @@ -22,6 +22,8 @@ pkgs.mkShell { inputsFrom = [tools]; buildInputs = [ tools + pkgs.cacert + pkgs.nix pkgs.morph pkgs.jp ];