diff --git a/morph/grid/local/grid.nix b/morph/grid/local/grid.nix index f977876037c7a3769852f43cab0e5d3e59fce154..5502b8faa622b8af1e967978b714d4693961c603 100644 --- a/morph/grid/local/grid.nix +++ b/morph/grid/local/grid.nix @@ -112,7 +112,10 @@ let imports = [ gridlib.monitoring (gridlib.customize-monitoring { - inherit hostsMap vpnClientIPs nodeExporterTargets paymentExporterTargets; + inherit hostsMap vpnClientIPs + nodeExporterTargets + paymentExporterTargets + blackboxExporterHttpsTargets; inherit (grid-config) letsEncryptAdminEmail; googleOAuthClientID = grid-config.monitoringGoogleOAuthClientID; enableSlackAlert = false; @@ -136,6 +139,10 @@ let vpnClientIPs = [ "172.23.23.11" "172.23.23.12" "172.23.23.13" ]; nodeExporterTargets = [ "monitoring" "payments" "storage1" "storage2" ]; paymentExporterTargets = [ "payments" ]; + blackboxExporterHttpsTargets = [ + # "https://private.storage/" + # "https://payments.private.storage/" + ]; in { network = { diff --git a/morph/grid/production/grid.nix b/morph/grid/production/grid.nix index ec0c1b37f996dd836f6011425f114471a2ccde0a..b42f4a3e40f56bf3d1b9808b3e87d3f769c8e2a7 100644 --- a/morph/grid/production/grid.nix +++ b/morph/grid/production/grid.nix @@ -45,7 +45,10 @@ let gridlib.monitoring gridlib.hardware-aws (gridlib.customize-monitoring { - inherit hostsMap vpnClientIPs nodeExporterTargets paymentExporterTargets; + inherit hostsMap vpnClientIPs + nodeExporterTargets + paymentExporterTargets + blackboxExporterHttpsTargets; inherit (grid-config) letsEncryptAdminEmail; googleOAuthClientID = grid-config.monitoringGoogleOAuthClientID; enableSlackAlert = true; @@ -127,6 +130,16 @@ let "storage005" ]; paymentExporterTargets = [ "payments" ]; + blackboxExporterHttpsTargets = [ + "https://private.storage/" + "https://www.private.storage/" + "https://privatestorage.io/" + "https://www.privatestorage.io/" + "https://payments.private.storage/" + "https://payments.privatestorage.io/" + "https://monitoring.private.storage/" + 
"https://monitoring.privatestorage.io/" + ]; in { network = { diff --git a/morph/grid/testing/grid.nix b/morph/grid/testing/grid.nix index 7a304e11cfd7ec3fbef3f8efa2417a9004132c42..ea4cd56a0b7b17c168114579a287f57e03e96914 100644 --- a/morph/grid/testing/grid.nix +++ b/morph/grid/testing/grid.nix @@ -58,7 +58,10 @@ let gridlib.monitoring gridlib.hardware-aws (gridlib.customize-monitoring { - inherit hostsMap vpnClientIPs nodeExporterTargets paymentExporterTargets; + inherit hostsMap vpnClientIPs + nodeExporterTargets + paymentExporterTargets + blackboxExporterHttpsTargets; inherit (grid-config) letsEncryptAdminEmail; googleOAuthClientID = grid-config.monitoringGoogleOAuthClientID; enableSlackAlert = true; @@ -78,6 +81,16 @@ let vpnClientIPs = [ "172.23.23.11" "172.23.23.12" ]; nodeExporterTargets = [ "monitoring" "payments" "storage001" ]; paymentExporterTargets = [ "payments" ]; + blackboxExporterHttpsTargets = [ + "https://privatestorage-staging.com/" + "https://www.privatestorage-staging.com/" + "https://extra.privatestorage-staging.com/" + "https://www.extra.privatestorage-staging.com/" + "https://payments.privatestorage-staging.com/" + "https://payments.extra.privatestorage-staging.com/" + "https://monitoring.privatestorage-staging.com/" + "https://monitoring.extra.privatestorage-staging.com/" + ]; in { network = { diff --git a/morph/lib/customize-monitoring.nix b/morph/lib/customize-monitoring.nix index d9842692481777a7e65b70208b5c0246b3209d1a..ef89119f7f54d1b644c86f7a72129ebd6a79cae6 100644 --- a/morph/lib/customize-monitoring.nix +++ b/morph/lib/customize-monitoring.nix @@ -28,6 +28,10 @@ # which nodes to scrape PaymentServer metrics from. , paymentExporterTargets ? [] + # A list of HTTPS URLs (as strings, e.g. "https://example.com/") naming the + # endpoints which the Blackbox exporter should probe for HTTP and TLS metrics. +, blackboxExporterHttpsTargets ? [] + # A string containing the GSuite OAuth2 ClientID to use to authenticate # logins to Grafana. 
, googleOAuthClientID @@ -108,6 +112,7 @@ in { inherit nodeExporterTargets; inherit nginxExporterTargets; inherit paymentExporterTargets; + inherit blackboxExporterHttpsTargets; }; services.private-storage.monitoring.grafana = { diff --git a/morph/lib/monitoring.nix b/morph/lib/monitoring.nix index bf92d1041f2bf9b9fb1ff4580a25ff7b596a9bbb..89a328e89a799b445dff7180dff552350b9629cf 100644 --- a/morph/lib/monitoring.nix +++ b/morph/lib/monitoring.nix @@ -25,6 +25,7 @@ ../../nixos/modules/monitoring/server/grafana.nix ../../nixos/modules/monitoring/server/prometheus.nix ../../nixos/modules/monitoring/exporters/node.nix + ../../nixos/modules/monitoring/exporters/blackbox.nix # Loki 0.3.0 from Nixpkgs 19.09 is too old and does not work: # ../../nixos/modules/monitoring/server/loki.nix ]; diff --git a/nixos/modules/monitoring/exporters/blackbox.nix b/nixos/modules/monitoring/exporters/blackbox.nix new file mode 100644 index 0000000000000000000000000000000000000000..c08dee2a5778b9ac037268cef9907f20537e96b2 --- /dev/null +++ b/nixos/modules/monitoring/exporters/blackbox.nix @@ -0,0 +1,32 @@ +# Prometheus Blackbox exporter configuration +# +# Scope: From the monitoring machine, probe hosts (ping, etc.) to check whether +# they are reachable, whether their certs are still valid for a while, etc. +# +# Notes: The Blackbox exporter uses the "Multi Target Exporter" pattern, +# see https://prometheus.io/docs/guides/multi-target-exporter/ . +# +# Usage: Import this on a monitoring server. + +{ config, lib, pkgs, ... }: { + config.services.prometheus.exporters.blackbox = { + enable = true; + + configFile = pkgs.writeText "blackbox-exporter.yaml" (builtins.toJSON { + modules = { + https_2xx = { + prober = "http"; + timeout = "5s"; + http = { + fail_if_not_ssl = true; + # This prober is for IPv4 only. 
+ preferred_ip_protocol = "ip4"; + ip_protocol_fallback = false; + }; + }; + }; + }); + + }; +} + diff --git a/nixos/modules/monitoring/server/grafana-dashboards/services-overview.json b/nixos/modules/monitoring/server/grafana-dashboards/services-overview.json new file mode 100644 index 0000000000000000000000000000000000000000..549e3b11e349b4cb69843dffc8506d9c250a1899 --- /dev/null +++ b/nixos/modules/monitoring/server/grafana-dashboards/services-overview.json @@ -0,0 +1,477 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "How are our user-facing services doing?", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 32, + "panels": [], + "title": "HTTPS endpoints", + "type": "row" + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 3.142 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "name": "Response times alert", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 36, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + 
"linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.10", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "probe_duration_seconds", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 3.142, + "visible": true + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Response times", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:425", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:426", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "count" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "name": "Probe fails alert", + "noDataState": "ok", + "notifications": [] + }, + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Shows all HTTP endpoints where probe_success == 0. 
This could have different reasons, likely ones being the service is down or the TLS certificate is not trusted.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 1 + }, + "hiddenSeries": false, + "id": 38, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.10", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "count by (instance) (probe_success == 0)", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 0, + "visible": true + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Probe fails", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:903", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:904", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 2419200 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + 
], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "60m", + "handler": 1, + "message": "A TLS certificate is expiring within four weeks.", + "name": "TLS certificate expiry alert", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 1 + }, + "hiddenSeries": false, + "id": 34, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.10", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "probe_ssl_earliest_cert_expiry - time()", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "lt", + "value": 2419200, + "visible": true + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TLS certificate expiry", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:179", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:180", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + 
"refresh": "30s", + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Services overview", + "uid": "ServicesOverview", + "version": 1 +} diff --git a/nixos/modules/monitoring/server/prometheus.nix b/nixos/modules/monitoring/server/prometheus.nix index 1f27f023df5b3211a81e3603226cc7cfe2c25e27..3bb00a5b95855859e455b5df8fb065b3d70bc855 100644 --- a/nixos/modules/monitoring/server/prometheus.nix +++ b/nixos/modules/monitoring/server/prometheus.nix @@ -18,19 +18,24 @@ in { options.services.private-storage.monitoring.prometheus = { nodeExporterTargets = lib.mkOption { type = with lib.types; listOf str; - example = lib.literalExample "[ node1 node2 ]"; + example = [ "node1" "node2" ]; description = "List of nodes (hostnames or IPs) to scrape."; }; nginxExporterTargets = lib.mkOption { type = with lib.types; listOf str; - example = lib.literalExample "[ node1 node2 ]"; + example = [ "node1" "node2" ]; description = "List of nodes (hostnames or IPs) to scrape."; }; paymentExporterTargets = lib.mkOption { type = with lib.types; listOf str; - example = lib.literalExample "[ node1 node2 ]"; + example = [ "node1" "node2" ]; description = "List of nodes (hostnames or IPs) to scrape."; }; + blackboxExporterHttpsTargets = lib.mkOption { + type = with lib.types; listOf str; + example = [ "https://node1.com/" "https://node2.org/" ]; + description = "List of https URLs to scrape."; + }; }; config = rec { @@ -65,6 +70,32 @@ in { }]; relabel_configs = [ dropPortNumber ]; } + { + # The Blackbox exporter is using Prometheus' "Multi-Target Exporter Pattern", + # see https://prometheus.io/docs/guides/multi-target-exporter/ + job_name = "blackboxExporterHttps"; + static_configs = [{ + targets = cfg.blackboxExporterHttpsTargets; + }]; + metrics_path = "/probe"; + params.module = [ "https_2xx" ]; + relabel_configs = [ + { + source_labels = [ 
"__address__" ]; + target_label = "__param_target"; + } + { + source_labels = [ "__param_target" ]; + target_label = "instance"; + } + { + source_labels = []; + target_label = "__address__"; + # The blackbox exporter’s real hostname:port + replacement = "monitoring:9115"; + } + ]; + } ]; }; };