diff --git a/nixos/modules/monitoring/server/grafana-config/resources-overview.json b/nixos/modules/monitoring/server/grafana-config/resources-overview.json new file mode 100644 index 0000000000000000000000000000000000000000..70519e5bf77c9a3dc5973e1a88ca62c7030f6018 --- /dev/null +++ b/nixos/modules/monitoring/server/grafana-config/resources-overview.json @@ -0,0 +1,1294 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "USE: Usage, Saturation and Error rate for our resources", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 4, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 22, + "panels": [], + "title": "CPU & Memory", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Some of our software runs in a single thread, so this shows max CPU per core (instead of averaged over all cores)", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 28, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "1 - (max by (instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])))", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Max CPU % per core per node", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:163", + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:164", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 1 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "name": "15 min load average alert", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "displayName": "${__field.labels.instance}" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "node_load15{instance=\"grafana:9100\", job=\"node-exporters\"}" + }, + "properties": [ + { + "id": "links" + } + ] + } + ] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 1 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load15", + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1 + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "15 min load average", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:412", + "format": "short", + "label": null, + "logBase": 1, + "max": "1", + "min": null, + "show": true + }, + { + "$$hashKey": "object:413", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 0.8 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "15m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "name": "RAM filling up", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "How much RAM is in use? Relative to available system memory.", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 1 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "1 - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes\r\n", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 0.8 + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "RAM used %", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:43", + "decimals": null, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:44", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 20, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Shows most saturated network link for every node.", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 9 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max by (instance) (rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) / node_network_speed_bytes)", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} out", + "refId": "A" + }, + { + "expr": "- max by (instance) (rate(node_network_receive_bytes_total{device!=\"lo\"}[5m]) / node_network_speed_bytes)", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} in", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Throughput %", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "transformations": [], + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1100", + "decimals": null, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": "-1", + "show": true + }, + { + "$$hashKey": "object:1101", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Packet and error count", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 9 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_reveive_packets_total{device!=\"lo\"}[5m])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} {{device}}", + "refId": "A" + }, + { + "expr": "rate(node_network_reveive_errs_total{device!=\"lo\"}[5m])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} {{device}}", + "refId": "B" + }, + { + "expr": "- rate(node_network_transmit_packets_total{device!=\"lo\"}[5m])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} {{device}}", + "refId": "C" + }, + { + "expr": "- rate(node_network_transmit_errs_total{device!=\"lo\"}[5m])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} {{device}}", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network pkt/s", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:760", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:761", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 10 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + }, + { + "evaluator": { + "params": [ + 10 + ], + "type": "gt" + }, + "operator": { + "type": "or" + }, + "query": { + "params": [ + "B", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + }, + { + "evaluator": { + "params": [ + 10 + ], + "type": "gt" + }, + "operator": { + "type": "or" + }, + "query": { + "params": [ + "C", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + }, + { + "evaluator": { + "params": [ + 10 + ], + "type": "gt" + }, + "operator": { + "type": "or" + }, + "query": { + "params": [ + "D", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "name": "Network errors alert", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Network errors, drops etc. Should all be 0.", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 9 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_network_transmit_errs_total\n", + "interval": "", + "legendFormat": "", + "refId": "A" + }, + { + "expr": "node_network_transmit_drop_total", + "interval": "", + "legendFormat": "", + "refId": "B" + }, + { + "expr": "node_network_receive_drop_total", + "interval": "", + "legendFormat": "", + "refId": "C" + }, + { + "expr": "node_network_receive_errs_total", + "interval": "", + "legendFormat": "", + "refId": "D" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 10 + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:874", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:875", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 18, + "panels": [], + "title": "Storage", + "type": "row" + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 0.8 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "name": "Filesystem usage % alert", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Watch filesystems filling up.", + "fieldConfig": { + "defaults": { + "custom": {}, + "unit": "percentunit" + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 17 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes) > 0.1", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{mountpoint}} ", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 0.8 + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Storage usage %", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:165", + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:166", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Input Output Operations per second", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 17 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_disk_reads_completed_total[5m]) > 0", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} R {{device}}", + "refId": "A" + }, + { + "expr": "- (rate(node_disk_writes_completed_total[5m]) > 0)", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} W {{device}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:471", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:472", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Max average storage latency per node.", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 17 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max by (instance, device) (rate(node_disk_read_time_seconds_total[5m]) / rate(node_disk_reads_completed_total[5m])) > 0", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} R {{device}}", + "refId": "A" + }, + { + "expr": "- max by (instance, device) (rate(node_disk_write_time_seconds_total[5m]) / rate(node_disk_writes_completed_total[5m])) < 0", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{instance}} W {{device}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Storage latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:211", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:212", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "30s", + "schemaVersion": 26, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "utc", + "title": "Resources overview", + "uid": "ResSatUse", + "version": 1 +} \ No newline at end of file diff --git a/nixos/modules/monitoring/server/grafana.nix b/nixos/modules/monitoring/server/grafana.nix new file mode 100644 index 0000000000000000000000000000000000000000..3ff096209fddc46ab9a5b9a301cf4fec3ce32646 --- /dev/null +++ b/nixos/modules/monitoring/server/grafana.nix @@ -0,0 +1,79 @@ +# Grafana Server +# +# Scope: Beautiful plots of time series data retrieved from Prometheus +# See https://christine.website/blog/prometheus-grafana-loki-nixos-2020-11-20 + +{ config, lib, ... }: + +let + cfg = config.services.private-storage.monitoring.grafana; + +in { + options.services.private-storage.monitoring.grafana = { + domain = lib.mkOption + { type = lib.types.str; + example = lib.literalExample "grafana.grid.private.storage"; + description = "The FQDN of the Grafana host"; + }; + prometheusUrl = lib.mkOption + { type = lib.types.str; + example = lib.literalExample "http://prometheus:9090/"; + default = "http://prometheus:9090/"; + description = "The URL of the Prometheus host to access"; + }; + lokiUrl = lib.mkOption + { type = lib.types.str; + example = lib.literalExample "http://loki:3100/"; + default = "http://loki:3100/"; + description = "The URL of the Loki host to access"; + }; + }; + + config = { + networking.firewall.allowedTCPPorts = [ 80 443 ]; + + services.grafana = { + enable = true; + domain = cfg.domain; + port = 2342; + addr = "127.0.0.1"; + + # All three are required to forego the user/pass prompt: + auth.anonymous.enable = true; + auth.anonymous.org_role = "Admin"; + auth.anonymous.org_name = "Main Org."; + }; + + services.grafana.provision = { + enable = true; + # See https://grafana.com/docs/grafana/latest/administration/provisioning/#datasources + datasources = [{ + name = "Prometheus"; + type = "prometheus"; + access = "proxy"; + url = cfg.prometheusUrl; + isDefault = true; + } { + name = "Loki"; + type = "loki"; + access = "proxy"; + url = cfg.lokiUrl; + }]; + # See https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards + dashboards = [{ + name = "provisioned"; + options.path = ./grafana-config; + }]; + }; + + # nginx reverse proxy + services.nginx.enable = true; + services.nginx.virtualHosts.${config.services.grafana.domain} = { + locations."/" = { + proxyPass = "http://127.0.0.1:${toString config.services.grafana.port}"; + proxyWebsockets = true; + }; + }; + }; +} + diff --git a/nixos/modules/monitoring/server/loki.nix b/nixos/modules/monitoring/server/loki.nix new file mode 100644 index 0000000000000000000000000000000000000000..96554523f06d0d86c620db445b2443575a1c3fd3 --- /dev/null +++ b/nixos/modules/monitoring/server/loki.nix @@ -0,0 +1,78 @@ +# Loki Server +# +# Scope: Log aggregator + +{ + config.networking.firewall.allowedTCPPorts = [ 3100 ]; + + config.services.loki = { + enable = true; + + configuration = + { + auth_enabled = false; + + server = { + http_listen_port = 3100; + }; + + ingester = { + lifecycler = { + address = "0.0.0.0"; + ring = { + kvstore = { + store = "inmemory"; + }; + replication_factor = 1; + }; + final_sleep = "0s"; + }; + chunk_idle_period = "1h"; # Any chunk not receiving new logs in this time will be flushed + max_chunk_age = "1h"; # All chunks will be flushed when they hit this age, default is 1h + chunk_target_size = 1048576; # Loki will attempt to build chunks up to 1.5MB, flushing first if chunk_idle_period or max_chunk_age is reached first + chunk_retain_period = "30s"; # Must be greater than index read cache TTL if using an index cache (Default index read cache TTL is 5m) + max_transfer_retries = 0; # Chunk transfers disabled + }; + + schema_config = { + configs = [{ + from = "2020-10-24"; # TODO: Should this be "today"? + store = "boltdb-shipper"; + object_store = "filesystem"; + schema = "v11"; + index = { + prefix = "index_"; + period = "24h"; + }; + }]; + }; + + storage_config = { + boltdb_shipper = { + active_index_directory = "/var/lib/loki/boltdb-shipper-active"; + cache_location = "/var/lib/loki/boltdb-shipper-cache"; + cache_ttl = "24h"; # Can be increased for faster performance over longer query periods, uses more disk space + shared_store = "filesystem"; + }; + filesystem = { + directory = "/var/lib/loki/chunks"; + }; + }; + + limits_config = { + reject_old_samples = true; + reject_old_samples_max_age = "168h"; + }; + + chunk_store_config = { + max_look_back_period = "336h"; + }; + + table_manager = { + retention_deletes_enabled = true; + retention_period = "336h"; + }; + }; + }; +} + diff --git a/nixos/modules/monitoring/server/prometheus.nix b/nixos/modules/monitoring/server/prometheus.nix new file mode 100644 index 0000000000000000000000000000000000000000..cffb126ddd3a80a715799bee6037f709adf97fc2 --- /dev/null +++ b/nixos/modules/monitoring/server/prometheus.nix @@ -0,0 +1,49 @@ +# Prometheus server +# +# Scope: Pull data from our cluster machines into TSDB +# See https://christine.website/blog/prometheus-grafana-loki-nixos-2020-11-20 + +{ config, lib, ... }: +let + + exportersCfg = config.services.prometheus.exporters; + cfg = config.services.private-storage.monitoring.prometheus; + +in { + options.services.private-storage.monitoring.prometheus = { + nodeExporterTargets = lib.mkOption { + type = with lib.types; listOf str; + example = lib.literalExample "[ node1 node2 ]"; + description = "List of nodes (hostnames or IPs) to scrape."; + }; + nginxExporterTargets = lib.mkOption { + type = with lib.types; listOf str; + example = lib.literalExample "[ node1 node2 ]"; + description = "List of nodes (hostnames or IPs) to scrape."; + }; + }; + + config = rec { + networking.firewall.allowedTCPPorts = [ services.prometheus.port ]; + + services.prometheus = { + enable = true; + port = 9090; # Option only in recent (20.09?) nixpkgs, 9090 default + scrapeConfigs = [ + { + job_name = "node-exporters"; + static_configs = [{ + targets = map (x: x + ":" + (toString exportersCfg.node.port)) cfg.nodeExporterTargets; + }]; + } + { + job_name = "nginx-exporters"; + static_configs = [{ + targets = map (x: x + ":" + (toString exportersCfg.nginx.port)) cfg.nginxExporterTargets; + }]; + } + ]; + }; + }; +} +