diff --git a/morph/grid/local/grid.nix b/morph/grid/local/grid.nix index a762186d3aaad642fe24aaf7666853fde79986b3..0c114ccf11b9ddb963d5e2e6297b6a7c83792aba 100644 --- a/morph/grid/local/grid.nix +++ b/morph/grid/local/grid.nix @@ -7,45 +7,58 @@ import ../../lib/make-grid.nix { nodes = cfg: let sshUsers = import ./secrets/users.nix; - vpnClientIPs = [ "172.23.23.11" "172.23.23.12" "172.23.23.13" ]; # TBD: derive automatically + # Get absolute vpn key directory path, as a string: monitoringvpnKeyDir = toString ./. + "/${cfg.monitoringvpnKeyDir}"; + + # TBD: derive these automatically: + hostsMap = { + "172.23.23.1" = [ "monitoring1" "monitoring1.monitoringvpn" ]; + "172.23.23.11" = [ "payments1" "payments1.monitoringvpn" ]; + "172.23.23.12" = [ "storage1" "storage1.monitoringvpn" ]; + "172.23.23.13" = [ "storage2" "storage2.monitoringvpn" ]; + }; + vpnClientIPs = [ "172.23.23.11" "172.23.23.12" "172.23.23.13" ]; + nodeExporterTargets = [ "monitoring1" "payments1" "storage1" "storage2" ]; + in { "payments1" = import ../../lib/make-issuer.nix (cfg // rec { publicIPv4 = "192.168.67.21"; monitoringvpnIPv4 = "172.23.23.11"; - inherit monitoringvpnKeyDir; - inherit sshUsers; hardware = import ./virtual-hardware.nix ({ inherit publicIPv4; }); stateVersion = "19.03"; + inherit monitoringvpnKeyDir; + inherit sshUsers; }); "storage1" = import ../../lib/make-testing.nix (cfg // rec { publicIPv4 = "192.168.67.22"; monitoringvpnIPv4 = "172.23.23.12"; - inherit monitoringvpnKeyDir; - inherit sshUsers; hardware = import ./virtual-hardware.nix ({ inherit publicIPv4; }); stateVersion = "19.09"; + inherit monitoringvpnKeyDir; + inherit sshUsers; }); "storage2" = import ../../lib/make-testing.nix (cfg // rec { publicIPv4 = "192.168.67.23"; monitoringvpnIPv4 = "172.23.23.13"; - inherit monitoringvpnKeyDir; - inherit sshUsers; hardware = import ./virtual-hardware.nix ({ inherit publicIPv4; }); stateVersion = "19.09"; + inherit monitoringvpnKeyDir; + inherit sshUsers; }); "monitoring1" = 
import ../../lib/make-monitoring.nix (cfg // rec { publicIPv4 = "192.168.67.24"; monitoringvpnIPv4 = "172.23.23.1"; inherit vpnClientIPs; - inherit sshUsers; - inherit monitoringvpnKeyDir; + inherit hostsMap; + inherit nodeExporterTargets; hardware = import ./virtual-hardware.nix ({ inherit publicIPv4; }); stateVersion = "19.09"; + inherit monitoringvpnKeyDir; + inherit sshUsers; }); }; } diff --git a/morph/lib/make-issuer.nix b/morph/lib/make-issuer.nix index 58b8a4f20496472409c2063a2923bc29f161d68a..bbdf0cebbf770738e9ccb997daec75e58df021b5 100644 --- a/morph/lib/make-issuer.nix +++ b/morph/lib/make-issuer.nix @@ -64,6 +64,7 @@ in rec { hardware ../../nixos/modules/issuer.nix ../../nixos/modules/monitoring/vpn/client.nix + ../../nixos/modules/monitoring/exporters/node.nix ]; services.private-storage.sshUsers = sshUsers; diff --git a/morph/lib/make-monitoring.nix b/morph/lib/make-monitoring.nix index c37ea2297088fafba1b97e8d037c378505c3d84c..8eb53c6db9552e65a84e5d3e5564449db437e902 100644 --- a/morph/lib/make-monitoring.nix +++ b/morph/lib/make-monitoring.nix @@ -8,6 +8,9 @@ , monitoringvpnIPv4 ? null , monitoringvpnKeyDir ? null , vpnClientIPs ? null +, nodeExporterTargets ? [] +, nginxExporterTargets ? [] +, hostsMap ? {} , ... 
}: let enableVpn = monitoringvpnKeyDir != null && @@ -32,6 +35,7 @@ action = ["sudo" "systemctl" "restart" "wireguard-monitoringvpn.service"]; }; }; + in rec { deployment = { @@ -42,6 +46,11 @@ in rec { imports = [ hardware ../../nixos/modules/monitoring/vpn/server.nix + ../../nixos/modules/monitoring/server/grafana.nix + ../../nixos/modules/monitoring/server/prometheus.nix + ../../nixos/modules/monitoring/exporters/node.nix + # Loki 0.3.0 from Nixpkgs 19.09 is too old and does not work: + # ../../nixos/modules/monitoring/server/loki.nix ]; services.private-storage.monitoring.vpn.server = if !enableVpn then {} else { @@ -51,5 +60,18 @@ in rec { pubKeysPath = monitoringvpnKeyDir; }; + services.private-storage.monitoring.grafana = { + domain = "grafana.grid.private.storage"; + prometheusUrl = "http://localhost:9090/"; + lokiUrl = "http://localhost:3100/"; + }; + + services.private-storage.monitoring.prometheus = { + inherit nodeExporterTargets; + inherit nginxExporterTargets; + }; + system.stateVersion = stateVersion; + + networking.hosts = hostsMap; } diff --git a/morph/lib/make-testing.nix b/morph/lib/make-testing.nix index f1c1b56fc5444322a8f3a1191fe296fe23528a3e..3f6e767db5ee734a8ca2314b216d4fa602c01907 100644 --- a/morph/lib/make-testing.nix +++ b/morph/lib/make-testing.nix @@ -57,6 +57,7 @@ in rec { hardware ../../nixos/modules/private-storage.nix ../../nixos/modules/monitoring/vpn/client.nix + ../../nixos/modules/monitoring/exporters/node.nix ]; services.private-storage = diff --git a/nixos/modules/monitoring/exporters/node.nix b/nixos/modules/monitoring/exporters/node.nix new file mode 100644 index 0000000000000000000000000000000000000000..62702e82f1e0a6bd9effae871f275c5dd23a37ae --- /dev/null +++ b/nixos/modules/monitoring/exporters/node.nix @@ -0,0 +1,74 @@ +# Prometheus common node exporter config +# +# Scope: Export platform data like CPU, memory, disk space etc. 
to be
+# polled by Prometheus server
+# Usage: Import this to every server you want to include in the central
+# monitoring system
+# See https://nixos.org/manual/nixos/stable/#module-services-prometheus-exporters
+
+{ config, lib, pkgs, ... }:
+
+with lib;
+
+let
+  # TCP port the node exporter listens on; reused in the firewall filter so
+  # both settings cannot drift apart.
+  exporterPort = 9100;
+
+  # True when at least one entry of `fileSystems` has the given fsType.
+  mountsFileSystemType = fsType:
+    any (fs: fs.fsType == fsType) (attrValues config.fileSystems);
+
+  # Collectors that are enabled on every machine, containers included.
+  # Thanks https://github.com/mayflower/nixexprs/blob/master/modules/monitoring/default.nix
+  commonCollectors = [
+    "arp"
+    "bcache"
+    "conntrack"
+    "filefd"
+    "logind"
+    "netclass"
+    "netdev"
+    "netstat"
+    #"rapl" # not in nixpkgs 19.09
+    "sockstat"
+    #"softnet" # not in nixpkgs 19.09
+    "stat"
+    "systemd"
+    # "textfile"
+    # "textfile.directory /run/prometheus-node-exporter"
+    #"thermal_zone" # not in nixpkgs 19.09
+    "time"
+    #"udp_queues" # not in nixpkgs 19.09
+    "uname"
+    "vmstat"
+  ];
+
+  # Collectors that are added only when the system is not a container.
+  hostCollectors = [
+    "cpu"
+    "cpufreq"
+    "diskstats"
+    "edac"
+    "entropy"
+    "filesystem"
+    "hwmon"
+    "interrupts"
+    "ksmd"
+    "loadavg"
+    "meminfo"
+    "pressure"
+    "timex"
+  ];
+
+in {
+  config.services.prometheus.exporters.node = {
+    enable = true;
+    openFirewall = true;
+    # The opened port is restricted to traffic on the monitoringvpn interface.
+    firewallFilter = "-i monitoringvpn -p tcp -m tcp --dport ${toString exporterPort}";
+    port = exporterPort;
+    # extraFlags = [ "--collector.disable-defaults" ]; # not in nixpkgs 19.09
+    # Feature-specific collectors are appended only when the matching feature
+    # is configured on this machine.
+    enabledCollectors =
+      commonCollectors
+      ++ optionals (!config.boot.isContainer) hostCollectors
+      ++ optional config.services.nfs.server.enable "nfsd"
+      ++ optional ("" != config.boot.initrd.mdadmConf) "mdadm"
+      ++ optional ({} != config.networking.bonds) "bonding"
+      ++ optional (mountsFileSystemType "nfs") "nfs"
+      ++ optional (mountsFileSystemType "xfs") "xfs"
+      ++ optional (mountsFileSystemType "zfs" || elem "zfs" config.boot.supportedFilesystems) "zfs";
+  };
+}
+
diff --git a/nixos/modules/monitoring/server/grafana-config/resources-overview.json b/nixos/modules/monitoring/server/grafana-config/resources-overview.json
new file mode 100644
index
0000000000000000000000000000000000000000..cd171d50594d77153f4d905bd91aec12f6bafcb9 --- /dev/null +++ b/nixos/modules/monitoring/server/grafana-config/resources-overview.json @@ -0,0 +1,1286 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "USE: Usage, Saturation and Error rate for our resources", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 22, + "panels": [], + "title": "CPU & Memory", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Some of our software runs in a single thread, so this shows max CPU per core (instead of averaged over all cores)", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 28, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "1 - (max by (instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])))", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Max CPU % per core 
per node", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 1 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "name": "15 min load average alert", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "displayName": "${__field.labels.instance}" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "node_load15{instance=\"grafana:9100\", job=\"node-exporters\"}" + }, + "properties": [ + { + "id": "links" + } + ] + } + ] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 1 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + 
{ + "expr": "node_load15", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "15 min load average", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": "1", + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 0.8 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "15m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "name": "RAM filling up", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "How much RAM is in use? 
Relative to available system memory.", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 1 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "1 - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes\r\n", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 0.8, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "RAM used %", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 20, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": null, + "description": "Shows most saturated network link for every node. Baseline is the reported NIC link speed - that might not be the actual limit.", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 9 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max by (instance) (rate(node_network_transmit_bytes_total{device!~\"lo|monitoringvpn\"}[5m]) / node_network_speed_bytes)", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} out", + "refId": "A" + }, + { + "expr": "- max by (instance) (rate(node_network_receive_bytes_total{device!~\"lo|monitoringvpn\"}[5m]) / node_network_speed_bytes)", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} in", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Throughput %", + "tooltip": { + "shared": false, + "sort": 2, + "value_type": "individual" + }, + "transformations": [], + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": "-1", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + 
"aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Packet and error count. Positive values mean transmit, negative receive.", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 9 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "- rate(node_network_receive_packets_total{device!~\"lo|monitoringvpn\"}[5m])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} {{device}}", + "refId": "A" + }, + { + "expr": "- rate(node_network_receive_errs_total{device!~\"lo|monitoringvpn\"}[5m])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} {{device}}", + "refId": "B" + }, + { + "expr": "rate(node_network_transmit_packets_total{device!~\"lo|monitoringvpn\"}[5m])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} {{device}}", + "refId": "C" + }, + { + "expr": "rate(node_network_transmit_errs_total{device!~\"lo|monitoringvpn\"}[5m])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} {{device}}", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network pkt/s", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + 
"yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 10 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + }, + { + "evaluator": { + "params": [ + 10 + ], + "type": "gt" + }, + "operator": { + "type": "or" + }, + "query": { + "params": [ + "B", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + }, + { + "evaluator": { + "params": [ + 10 + ], + "type": "gt" + }, + "operator": { + "type": "or" + }, + "query": { + "params": [ + "C", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + }, + { + "evaluator": { + "params": [ + 10 + ], + "type": "gt" + }, + "operator": { + "type": "or" + }, + "query": { + "params": [ + "D", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "name": "Network errors alert", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Network errors, drops etc. 
Should all be 0.", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 9 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_network_transmit_errs_total\n", + "interval": "", + "legendFormat": "{{instance}} {{device}}", + "refId": "A" + }, + { + "expr": "node_network_transmit_drop_total", + "interval": "", + "legendFormat": "{{instance}} {{device}}", + "refId": "B" + }, + { + "expr": "- node_network_receive_drop_total", + "interval": "", + "legendFormat": "{{instance}} {{device}}", + "refId": "C" + }, + { + "expr": "- node_network_receive_errs_total", + "interval": "", + "legendFormat": "{{instance}} {{device}}", + "refId": "D" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 10 + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": 
null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 18, + "panels": [], + "title": "Storage", + "type": "row" + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 0.8 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "name": "Filesystem usage % alert", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Watch filesystems filling up. Shows only mounts over 10 % of available bytes used.", + "fieldConfig": { + "defaults": { + "custom": {}, + "unit": "percentunit" + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 17 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes) > 0.1", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{mountpoint}} ", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 0.8, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, 
+ "title": "Storage usage %", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Input Output Operations per second. Positive values mean read, negative write.", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 17 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_disk_reads_completed_total[5m]) > 0", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} R {{device}}", + "refId": "A" + }, + { + "expr": "- (rate(node_disk_writes_completed_total[5m]) > 0)", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} W {{device}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "IOPS", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": 
"graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Max average storage latency per node. Positive values mean read, negative write.", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 17 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max by (instance, device) (rate(node_disk_read_time_seconds_total[5m]) / rate(node_disk_reads_completed_total[5m]))", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} R {{device}}", + "refId": "A" + }, + { + "expr": "- max by (instance, device) (rate(node_disk_write_time_seconds_total[5m]) / rate(node_disk_writes_completed_total[5m]))", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} W {{device}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Storage latency", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + 
"type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "30s", + "schemaVersion": 20, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "utc", + "title": "Resources overview", + "uid": "ResSatUse", + "version": 1 +} diff --git a/nixos/modules/monitoring/server/grafana.nix b/nixos/modules/monitoring/server/grafana.nix new file mode 100644 index 0000000000000000000000000000000000000000..d5724e7188cab5155d7f1976420185388caf5d64 --- /dev/null +++ b/nixos/modules/monitoring/server/grafana.nix @@ -0,0 +1,79 @@ +# Grafana Server +# +# Scope: Beautiful plots of time series data retrieved from Prometheus +# See https://christine.website/blog/prometheus-grafana-loki-nixos-2020-11-20 + +{ config, lib, ... 
}:
+
+let
+  inherit (lib) mkOption literalExample;
+  inherit (lib.types) str;
+
+  cfg = config.services.private-storage.monitoring.grafana;
+
+in {
+  # Settings read by the Grafana service definition further down.
+  options.services.private-storage.monitoring.grafana = {
+    domain = mkOption {
+      type = str;
+      example = literalExample "grafana.grid.private.storage";
+      description = "The FQDN of the Grafana host";
+    };
+    prometheusUrl = mkOption {
+      type = str;
+      example = literalExample "http://prometheus:9090/";
+      default = "http://prometheus:9090/";
+      description = "The URL of the Prometheus host to access";
+    };
+    lokiUrl = mkOption {
+      type = str;
+      example = literalExample "http://loki:3100/";
+      default = "http://loki:3100/";
+      description = "The URL of the Loki host to access";
+    };
+  };
+
+  config = {
+    # networking.firewall.allowedTCPPorts = [ 80 443 ];
+
+    # Grafana itself only listens on the loopback address; it is reached
+    # through the nginx virtual host defined below.
+    services.grafana = {
+      enable = true;
+      inherit (cfg) domain;
+      port = 2342;
+      addr = "127.0.0.1";
+
+      # All three are required to forego the user/pass prompt.
+      # NOTE(review): this hands the Admin role to every visitor that can
+      # reach the nginx virtual host - confirm that this is intended.
+      auth.anonymous = {
+        enable = true;
+        org_role = "Admin";
+        org_name = "Main Org.";
+      };
+
+      provision = {
+        enable = true;
+        # See https://grafana.com/docs/grafana/latest/administration/provisioning/#datasources
+        datasources = [
+          {
+            name = "Prometheus";
+            type = "prometheus";
+            access = "proxy";
+            url = cfg.prometheusUrl;
+            isDefault = true;
+          }
+          {
+            name = "Loki";
+            type = "loki";
+            access = "proxy";
+            url = cfg.lokiUrl;
+          }
+        ];
+        # See https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards
+        dashboards = [
+          {
+            name = "provisioned";
+            # Dashboards are loaded from the directory next to this module.
+            options = { path = ./grafana-config; };
+          }
+        ];
+      };
+    };
+
+    # nginx reverse proxy
+    services.nginx = {
+      enable = true;
+      virtualHosts.${config.services.grafana.domain}.locations."/" = {
+        proxyPass = "http://127.0.0.1:${toString config.services.grafana.port}";
+        proxyWebsockets = true;
+      };
+    };
+  };
+}
+
diff --git a/nixos/modules/monitoring/server/loki.nix b/nixos/modules/monitoring/server/loki.nix
new file mode 100644
index
0000000000000000000000000000000000000000..96554523f06d0d86c620db445b2443575a1c3fd3
--- /dev/null
+++ b/nixos/modules/monitoring/server/loki.nix
@@ -0,0 +1,78 @@
+# Loki Server
+#
+# Scope: Log aggregator (single instance: in-memory ring, replication factor 1, index and chunks on the local filesystem)
+
+{
+  config.networking.firewall.allowedTCPPorts = [ 3100 ]; # same port as server.http_listen_port below
+
+  config.services.loki = {
+    enable = true;
+
+    configuration =
+    {
+      auth_enabled = false;
+
+      server = {
+        http_listen_port = 3100;
+      };
+
+      ingester = {
+        lifecycler = {
+          address = "0.0.0.0";
+          ring = {
+            kvstore = {
+              store = "inmemory";
+            };
+            replication_factor = 1;
+          };
+          final_sleep = "0s";
+        };
+        chunk_idle_period = "1h"; # Any chunk not receiving new logs in this time will be flushed
+        max_chunk_age = "1h"; # All chunks will be flushed when they hit this age, default is 1h
+        chunk_target_size = 1048576; # Target chunk size in bytes (1048576 = 1 MiB); chunks are flushed earlier if chunk_idle_period or max_chunk_age is reached first
+        chunk_retain_period = "30s"; # Must be greater than index read cache TTL if using an index cache (Default index read cache TTL is 5m)
+        max_transfer_retries = 0; # Chunk transfers disabled
+      };
+
+      schema_config = {
+        configs = [{
+          from = "2020-10-24"; # TODO: Should this be "today"?
+          store = "boltdb-shipper";
+          object_store = "filesystem";
+          schema = "v11";
+          index = {
+            prefix = "index_";
+            period = "24h";
+          };
+        }];
+      };
+
+      storage_config = {
+        boltdb_shipper = {
+          active_index_directory = "/var/lib/loki/boltdb-shipper-active";
+          cache_location = "/var/lib/loki/boltdb-shipper-cache";
+          cache_ttl = "24h"; # Can be increased for faster performance over longer query periods, uses more disk space
+          shared_store = "filesystem";
+        };
+        filesystem = {
+          directory = "/var/lib/loki/chunks";
+        };
+      };
+
+      limits_config = {
+        reject_old_samples = true;
+        reject_old_samples_max_age = "168h";
+      };
+
+      chunk_store_config = {
+        max_look_back_period = "336h"; # same duration as table_manager.retention_period
+      };
+
+      table_manager = {
+        retention_deletes_enabled = true;
+        retention_period = "336h";
+      };
+    };
+  };
+}
+
diff --git a/nixos/modules/monitoring/server/prometheus.nix b/nixos/modules/monitoring/server/prometheus.nix
new file mode 100644
index 0000000000000000000000000000000000000000..36c2ba6402559771dff8771b1369842e21f7ff7f
--- /dev/null
+++ b/nixos/modules/monitoring/server/prometheus.nix
@@ -0,0 +1,56 @@
+# Prometheus server
+#
+# Scope: Pull data from our cluster machines into TSDB
+# See https://christine.website/blog/prometheus-grafana-loki-nixos-2020-11-20
+
+{ config, lib, ...
}:
+let
+
+  # Exporter settings of the machine this module is evaluated for. Their
+  # ports are appended to every scrape target below, so all targets are
+  # assumed to run their exporters on the same ports as configured here.
+  exportersCfg = config.services.prometheus.exporters;
+  cfg = config.services.private-storage.monitoring.prometheus;
+
+  # Relabel rule: copy the target address without its ":port" suffix into the
+  # "instance" label, so series are labelled with the plain host name.
+  dropPortNumber = {
+    source_labels = [ "__address__" ];
+    regex = "^(.*):\\d+$";
+    target_label = "instance";
+  };
+
+  # Turn a list of host names into "host:port" scrape targets.
+  withPort = port: map (host: "${host}:${toString port}");
+
+in {
+  options.services.private-storage.monitoring.prometheus = {
+    nodeExporterTargets = lib.mkOption {
+      type = with lib.types; listOf str;
+      # An empty default lets the module be imported without naming targets.
+      default = [ ];
+      example = [ "node1" "node2" ];
+      description = "List of nodes (hostnames or IPs) whose node exporter is scraped.";
+    };
+    nginxExporterTargets = lib.mkOption {
+      type = with lib.types; listOf str;
+      default = [ ];
+      example = [ "node1" "node2" ];
+      description = "List of nodes (hostnames or IPs) whose nginx exporter is scraped.";
+    };
+  };
+
+  config = {
+    # networking.firewall.allowedTCPPorts = [ config.services.prometheus.port ];
+
+    services.prometheus = {
+      enable = true;
+      # port = 9090; # Option only in recent (20.09?) nixpkgs, 9090 default
+      scrapeConfigs = [
+        {
+          job_name = "node-exporters";
+          static_configs = [{
+            targets = withPort exportersCfg.node.port cfg.nodeExporterTargets;
+          }];
+          relabel_configs = [ dropPortNumber ];
+        }
+        {
+          job_name = "nginx-exporters";
+          static_configs = [{
+            targets = withPort exportersCfg.nginx.port cfg.nginxExporterTargets;
+          }];
+          relabel_configs = [ dropPortNumber ];
+        }
+      ];
+    };
+  };
+}
+