diff --git a/morph/grid/production/grid.nix b/morph/grid/production/grid.nix index b42f4a3e40f56bf3d1b9808b3e87d3f769c8e2a7..0b433273e07148c609c1ad5ddd4f14032bd92579 100644 --- a/morph/grid/production/grid.nix +++ b/morph/grid/production/grid.nix @@ -73,6 +73,10 @@ let # Slightly awkwardly, enable some of our hardware / network / bootloader options. ../../../nixos/modules/100tb.nix + # At least some of our storage nodes utilize MegaRAID storage controllers. + # Monitor their array status. + ../../../nixos/modules/monitoring/exporters/megacli2prom.nix + # Get all of the configuration that is common across all storage nodes. gridlib.storage @@ -91,6 +95,10 @@ let # name is quoted because `1` makes `100tb` look an awful lot like a # number. "100tb".config = nodecfg; + + # Enable statistics gathering for MegaRAID cards. + # TODO would be nice to enable only on machines that have such a device. + services.private-storage.monitoring.megacli2prom.enable = true; }; # Define all of the storage nodes for this grid. diff --git a/morph/lib/default.nix b/morph/lib/default.nix index 78de2506382d023bc76b723eb866efc90f20ef7f..a820cc559b6b2da78c06bcb84282e392c3a1ebc7 100644 --- a/morph/lib/default.nix +++ b/morph/lib/default.nix @@ -26,7 +26,11 @@ # Ensure that configuration of the system where this runs # doesn't leak into what we build. # See https://github.com/NixOS/nixpkgs/issues/62513 - config = {}; + config = { pkgs }: let lib = pkgs.lib; in { + allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ + "megacli" + ]; + }; overlays = []; }; } diff --git a/nixos/modules/monitoring/exporters/megacli2prom.nix b/nixos/modules/monitoring/exporters/megacli2prom.nix new file mode 100644 index 0000000000000000000000000000000000000000..b364088aa148a2faa58a6ac34980120f8d7bf9d0 --- /dev/null +++ b/nixos/modules/monitoring/exporters/megacli2prom.nix @@ -0,0 +1,49 @@ +# MegaCli to Prometheus text format exporter +# +# Scope: Gets data from MegaRAID compatible storage controllers and mogrifies +# to Prometheus text format, saves to a temp file, to later be scraped +# by the node exporter. +# +# Usage: Import this to every server with a MegaRAID card that you want to +# include in the central monitoring system +# +# See https://nixos.org/manual/nixos/stable/#module-services-prometheus-exporters + +{ config, options, lib, ourpkgs, pkgs, ... }: + +let + cfg = config.services.private-storage.monitoring.megacli2prom; + +in { + options.services.private-storage.monitoring.megacli2prom = { + enable = lib.mkEnableOption "MegaCli2Prom metrics gathering service"; + outFile = lib.mkOption { + type = lib.types.str; + description = "Where to store the temporary file for node exporter to scrape?"; + default = "/run/prometheus-node-exporter/megacli.prom"; + }; + interval = lib.mkOption { + type = lib.types.str; + description = '' + How often to do it? + See https://www.freedesktop.org/software/systemd/man/systemd.time.html#Calendar%20Events + ''; + # Every five minutes. + default = "*:0/5"; + }; + }; + + config = + lib.mkIf cfg.enable { + environment.systemPackages = [ ourpkgs.megacli2prom ]; + systemd.services.megacli2prom = { + enable = true; + description = "MegaCli2Prom metrics gathering service"; + wantedBy = [ "multi-user.target" ]; + startAt = cfg.interval; + path = [ pkgs.megacli ]; + script = "${ourpkgs.megacli2prom}/bin/megacli2prom > ${cfg.outFile}"; + }; + }; +} + diff --git a/nixos/modules/monitoring/exporters/node.nix b/nixos/modules/monitoring/exporters/node.nix index 62702e82f1e0a6bd9effae871f275c5dd23a37ae..d854ff7398cd19ac4d4d5b3f8739073feb84834a 100644 --- a/nixos/modules/monitoring/exporters/node.nix +++ b/nixos/modules/monitoring/exporters/node.nix @@ -35,8 +35,8 @@ in { #"softnet" # not in nixpkgs 19.09 "stat" "systemd" - # "textfile" - # "textfile.directory /run/prometheus-node-exporter" + "textfile" + "textfile.directory /run/prometheus-node-exporter" #"thermal_zone" # not in nixpkgs 19.09 "time" #"udp_queues" # not in nixpkgs 19.09 diff --git a/nixos/modules/monitoring/server/grafana-dashboards/resources-overview.json b/nixos/modules/monitoring/server/grafana-dashboards/resources-overview.json index 0b22728e178ddd5295afd43a17cd0b9c20c530fd..745f7b7ff03d9a0a4e93fc50946f8066fb08e7b9 100644 --- a/nixos/modules/monitoring/server/grafana-dashboards/resources-overview.json +++ b/nixos/modules/monitoring/server/grafana-dashboards/resources-overview.json @@ -1105,7 +1105,7 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 8, + "w": 6, "x": 0, "y": 17 }, @@ -1217,8 +1217,8 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 8, - "x": 8, + "w": 6, + "x": 6, "y": 17 }, "hiddenSeries": false, @@ -1321,8 +1321,8 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 8, - "x": 16, + "w": 6, + "x": 12, "y": 17 }, "hiddenSeries": false, @@ -1407,6 +1407,148 @@ "align": false, "alignLevel": null } + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "count" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "5m", + "handler": 1, + "name": "Degraded RAID alert", + "noDataState": "ok", + "notifications": [] + }, + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 17 + }, + "hiddenSeries": false, + "id": 32, + "legend": { + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": false + }, + "percentage": false, + "pluginVersion": "7.5.10", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "megacli_drives{state=\"Degraded\"}", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": false, + "line": false, + "op": "gt", + "value": 0, + "visible": true + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Degraded RAID arrays", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:151", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:152", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "refresh": "30s", diff --git a/nixos/pkgs/default.nix b/nixos/pkgs/default.nix index 9e98f253692ed95e4be89c0753542d797db710d1..bfc30b36101c220434606832127a7e8ca0a70490 100644 --- a/nixos/pkgs/default.nix +++ b/nixos/pkgs/default.nix @@ -11,4 +11,5 @@ privatestorage = callPackage ./privatestorage {}; zkap-spending-service = callPackage ./zkap-spending-service {}; zkapissuer = callPackage ./zkapissuer {}; + megacli2prom = callPackage ./megacli2prom {}; } diff --git a/nixos/pkgs/megacli2prom/default.nix b/nixos/pkgs/megacli2prom/default.nix new file mode 100644 index 0000000000000000000000000000000000000000..942f43ff747e4e6ecaa90a7bd1d6bc3c1927cb0e --- /dev/null +++ b/nixos/pkgs/megacli2prom/default.nix @@ -0,0 +1,18 @@ +{ pkgs ? import <nixpkgs> {} }: + +let + repo-data = pkgs.lib.importJSON ./repo.json; + repo = pkgs.fetchFromGitHub (builtins.removeAttrs repo-data [ "branch" ]); + +in +pkgs.stdenv.mkDerivation { + name = "megacli2prom"; + buildInputs = [ pkgs.python3 pkgs.megacli ]; + src = repo; + installPhase = '' + mkdir -p $out/bin + cp ./megacli2prom.py $out/bin/megacli2prom + chmod +x $out/bin/megacli2prom + ''; +} + diff --git a/nixos/pkgs/megacli2prom/repo.json b/nixos/pkgs/megacli2prom/repo.json new file mode 100644 index 0000000000000000000000000000000000000000..3c8cd0af95adf95e22def4e727b8c2c5d12044aa --- /dev/null +++ b/nixos/pkgs/megacli2prom/repo.json @@ -0,0 +1,8 @@ +{ + "owner": "PrivateStorageio", + "repo": "megacli2prom", + "branch": "main", + "rev": "9536933d325c843b2662f80486660bf81d73941e", + "outputHashAlgo": "sha512", + "outputHash": "1xrsv0bkmazbhqarx84lhvmrzzdv1bm04xvr0hw1yrw1f4xb450f4pwgapnkjczy0l4c6rp3pmh64cblgbs3ki30wacbv1bqzv5745g" +} \ No newline at end of file