Compare revisions

Showing 28912 additions and 1874 deletions
{ lib, config, ...}:
let
inherit (config.grid) publicKeyPath privateKeyPath monitoringvpnEndpoint monitoringvpnIPv4;
in {
config = {
deployment = {
secrets = {
"monitoringvpn-secret-key" = {
destination = "/run/keys/monitoringvpn/client.key";
source = "${privateKeyPath}/monitoringvpn/${monitoringvpnIPv4}.key";
owner.user = "root";
owner.group = "root";
permissions = "0400";
action = ["sudo" "systemctl" "restart" "wireguard-monitoringvpn.service"];
};
"monitoringvpn-preshared-key" = {
destination = "/run/keys/monitoringvpn/preshared.key";
source = "${privateKeyPath}/monitoringvpn/preshared.key";
owner.user = "root";
owner.group = "root";
permissions = "0400";
action = ["sudo" "systemctl" "restart" "wireguard-monitoringvpn.service"];
};
};
};
services.private-storage.monitoring.vpn.client = {
enable = true;
ip = monitoringvpnIPv4;
endpoint = monitoringvpnEndpoint;
endpointPublicKeyFile = "${publicKeyPath}/monitoringvpn/server.pub";
};
};
}
......@@ -2,19 +2,12 @@
# "storage"-type system.
{ lib, config, ...} :
let
inherit (config.grid) publicKeyPath privateKeyPath monitoringvpnIPv4 monitoringvpnEndpoint;
inherit (config.grid) privateKeyPath;
in {
# Any extra NixOS modules to load on this server.
imports = [
# Bring in our module for configuring the Tahoe-LAFS service and other
# Private Storage-specific things.
../../nixos/modules/private-storage.nix
# Connect to the monitoringvpn.
../../nixos/modules/monitoring/vpn/client.nix
# Expose base system metrics over the monitoringvpn.
../../nixos/modules/monitoring/exporters/node.nix
# Collect Tahoe OpenMetrics statistics.
../../nixos/modules/monitoring/exporters/tahoe.nix
./monitoringvpn-client.nix
./borgbackup.nix
];
options.grid.storage = {
......@@ -48,26 +41,13 @@ in {
# extract it from the tahoe-lafs nixos module somehow?
action = ["sudo" "systemctl" "restart" "tahoe.storage.service"];
};
"monitoringvpn-secret-key" = {
destination = "/run/keys/monitoringvpn/client.key";
source = "${privateKeyPath}/monitoringvpn/${monitoringvpnIPv4}.key";
owner.user = "root";
owner.group = "root";
permissions = "0400";
action = ["sudo" "systemctl" "restart" "wireguard-monitoringvpn.service"];
};
"monitoringvpn-preshared-key" = {
destination = "/run/keys/monitoringvpn/preshared.key";
source = "${privateKeyPath}/monitoringvpn/preshared.key";
owner.user = "root";
owner.group = "root";
permissions = "0400";
action = ["sudo" "systemctl" "restart" "wireguard-monitoringvpn.service"];
};
};
};
services.private-storage.monitoring.tahoe.enable = true;
services.private-storage.monitoring.exporters.node.enable = true;
services.private-storage.monitoring.exporters.tahoe.enable = true;
services.private-storage.borgbackup.enable = lib.mkDefault true;
# Turn on the Private Storage (Tahoe-LAFS) service.
services.private-storage = {
......@@ -77,12 +57,5 @@ in {
ristrettoSigningKeyPath = config.deployment.secrets.ristretto-signing-key.destination;
inherit (config.grid.storage) passValue publicStoragePort;
};
services.private-storage.monitoring.vpn.client = {
enable = true;
ip = monitoringvpnIPv4;
endpoint = monitoringvpnEndpoint;
endpointPublicKeyFile = "${publicKeyPath}/monitoringvpn/server.pub";
};
};
}
# Thank you: https://gist.github.com/petabyteboy/558ffddb9aeb24e1eab2d5d6d021b5d7
with import <nixpkgs/lib>;
rec {
# FIXME: add case for negative numbers
pow = base: exponent: if exponent == 0 then 1 else fold (
x: y: y * base
) base (
range 2 exponent
);
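# e.g. pow 2 8 == 256; only non-negative exponents are handled (see the FIXME above).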
fromHexString = hex: foldl (
x: y: 16 * x + (
(
listToAttrs (
map (
x: nameValuePair (
toString x
) x
) (
range 0 9
)
) // {
"a" = 10;
"b" = 11;
"c" = 12;
"d" = 13;
"e" = 14;
"f" = 15;
}
).${y}
)
) 0 (
stringToCharacters (
removePrefix "0x" (
hex
)
)
);
ipv4 = rec {
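# Worked examples: decode "10.0.0.1" == 167772161, encode 167772161 == "10.0.0.1",
# and netmask 24 == decode "255.255.255.0".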
decode = address: foldl (
x: y: 256 * x + y
) 0 (
map toInt (
splitString "." address
)
);
encode = num: concatStringsSep "." (
map (
x: toString (mod (num / x) 256)
) (
reverseList (
genList (
x: pow 2 (x * 8)
) 4
)
)
);
netmask = prefixLength: (
foldl (
x: y: 2 * x + 1
) 0 (
range 1 prefixLength
)
) * (
pow 2 (
32 - prefixLength
)
);
reverseZone = net: (
concatStringsSep "." (
reverseList (
splitString "." net
)
)
) + ".in-addr.arpa";
eachAddress = net: prefixLength: genList (
x: decode (
x + (
decode net
)
)
) (
pow 2 (
32 - prefixLength
)
);
networkOf = address: prefixLength: encode (
bitAnd (
decode address
) (
netmask prefixLength
)
);
isInNetwork = net: address: networkOf address == net;
/* nixos-specific stuff */
findOwnAddress = config: net: head (
filter (
isInNetwork net
) (
configuredAddresses config
)
);
configuredAddresses = config: concatLists (
mapAttrsToList (
name: iface: iface.ipv4.addresses
) config.networking.interfaces
);
};
ipv6 = rec {
expand = address: (
replaceStrings ["::"] [(
concatStringsSep "0" (
genList (x: ":") (
9 - (count (x: x == ":") (stringToCharacters address))
)
)
)] address
) + (
if hasSuffix "::" address then
"0"
else
""
);
decode = address: map fromHexString (
splitString ":" (
expand address
)
);
encode = address: toLower (
concatStringsSep ":" (
map toHexString address
)
);
netmask = prefixLength: map (
x: if prefixLength > x + 16 then
(pow 2 16) - 1
else if prefixLength < x then
0
else
(
foldl (
x: y: 2 * x + 1
) 0 (
range 1 (prefixLength - x)
)
) * (
pow 2 (
16 - (prefixLength - x)
)
)
) (
genList (
x: x * 16
) 8
);
reverseZone = net: (
concatStringsSep "." (
concatLists (
reverseList (
map (
x: stringToCharacters (fixedWidthString 4 "0" x)
) (
splitString ":" (
expand net
)
)
)
)
)
) + ".ip6.arpa";
networkOf = address: prefixLength: encode (
zipListsWith bitAnd (
decode address
) (
netmask prefixLength
)
);
isInNetwork = net: address: networkOf address == (expand net);
/* nixos-specific stuff */
findOwnAddress = config: net: head (
filter (
isInNetwork net
) (
configuredAddresses config
)
);
configuredAddresses = config: concatLists (
mapAttrsToList (
name: iface: iface.ipv6.addresses
) config.networking.interfaces
);
};
}
......@@ -14,7 +14,8 @@
''
# The driver runs pyflakes on this script before letting it
# run... Convince pyflakes that there is a `test` name.
test = None
def test():
pass
with open("${testpath}") as testfile:
exec(testfile.read(), globals())
# For simple types, JSON is compatible with Python syntax!
......
......@@ -68,6 +68,7 @@ let
{ type = lib.types.str;
example = "wwn-0x5000c500936410b9";
description = "The ID of the disk on which to install grub.";
default = "nodev";
};
};
in {
......@@ -102,10 +103,11 @@ in {
# harder to deploy in the bootstrap environment.
config =
{ boot.loader.grub.enable = true;
boot.loader.grub.version = 2;
boot.loader.grub.device = "/dev/disk/by-id/${cfg.grubDeviceID}";
boot.loader.grub.device = if cfg.grubDeviceID == "nodev" then "nodev" else "/dev/disk/by-id/${cfg.grubDeviceID}";
boot.loader.timeout = 10;
# NixOS likes to fill up boot partitions with (by default) 100 old kernels.
# Keep a (for us) more reasonable number around.
boot.loader.grub.configurationLimit = 8;
networking.firewall.enable = false;
networking.hostId = cfg.hostId;
......
These are mostly modelled on upstream nixos modules.
They are generally fairly configurable (they don't tend to hard-code paths, they can be enabled or disabled).
They don't know anything about morph (e.g. ``deployment.secrets``) or how the different grids are configured (e.g. ``grid.publicKeyPath``).
Each module here tends to define one service (or group of related services) or feature.
Eventually, all of these will be imported automatically and controlled by ``services.private-storage.*.enable`` options.
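For example (a minimal sketch, not part of this repository's documentation), a node configuration that imports these modules could enable a couple of them like so::

   services.private-storage.monitoring.exporters.node.enable = true;
   services.private-storage.borgbackup.enable = lib.mkDefault true;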
......@@ -12,5 +12,11 @@
imports = [
./packages.nix
./issuer.nix
./private-storage.nix
./monitoring/policy.nix
./monitoring/vpn/client.nix
./monitoring/exporters/node.nix
./monitoring/exporters/tahoe.nix
./monitoring/exporters/promtail.nix
];
}
......@@ -35,11 +35,11 @@ in {
config = {
# Configure the system to use our binary cache so that deployment updates
# only require downloading pre-built software, not building it ourselves.
nix = {
binaryCachePublicKeys = [
nix.settings = {
trusted-public-keys = [
"saxtons.private.storage:MplOcEH8G/6mRlhlKkbA8GdeFR3dhCFsSszrspE/ZwY="
];
binaryCaches = [
substituters = [
"http://saxtons.private.storage"
];
};
......
......@@ -38,6 +38,15 @@ in {
algorithm or Ristretto for Ristretto-flavored PrivacyPass.
'';
};
services.private-storage-issuer.tokensPerVoucher = lib.mkOption {
default = null;
type = lib.types.nullOr lib.types.int;
example = 50000;
description = ''
If not null, a value to pass to PaymentServer for
``--tokens-per-voucher``.
'';
};
services.private-storage-issuer.ristrettoSigningKeyPath = lib.mkOption {
default = null;
type = lib.types.path;
......@@ -53,6 +62,13 @@ in {
and payment management.
'';
};
services.private-storage-issuer.stripeWebhookSecretKeyPath = lib.mkOption {
type = lib.types.path;
description = ''
The path to a file containing a Stripe "webhook" secret key to use for
charge and payment management.
'';
};
services.private-storage-issuer.stripeEndpointDomain = lib.mkOption {
type = lib.types.str;
description = ''
......@@ -207,11 +223,14 @@ in {
stripeArgs =
"--stripe-key-path ${cfg.stripeSecretKeyPath} " +
"--stripe-webhook-key-path ${cfg.stripeWebhookSecretKeyPath} " +
"--stripe-endpoint-domain ${cfg.stripeEndpointDomain} " +
"--stripe-endpoint-scheme ${cfg.stripeEndpointScheme} " +
"--stripe-endpoint-port ${toString cfg.stripeEndpointPort}";
redemptionConfig = lib.optionalString (cfg.tokensPerVoucher != null) "--tokens-per-voucher ${builtins.toString cfg.tokensPerVoucher}";
in
"${cfg.package.exePath} ${originArgs} ${issuerArgs} ${databaseArgs} ${httpArgs} ${stripeArgs}";
"${cfg.package.exePath} ${originArgs} ${issuerArgs} ${databaseArgs} ${httpArgs} ${stripeArgs} ${redemptionConfig}";
};
# PaymentServer runs as this user and group by default
......@@ -235,7 +254,7 @@ in {
];
# NGINX reverse proxy
security.acme.email = cfg.letsEncryptAdminEmail;
security.acme.defaults.email = cfg.letsEncryptAdminEmail;
security.acme.acceptTerms = true;
services.nginx = {
enable = true;
......@@ -254,6 +273,17 @@ in {
# we pass less scanning spam on to our backend
# Want a regex instead? try locations."~ /v\d+/"
proxyPass = "http://127.0.0.1:${internalHttpPort}";
# The redemption endpoint can intentionally delay its response for
# up to 600 seconds for a cheap kind of server-push when payment
# completes. Let that timeout control how long the connection stays
# open. PaymentServer does not accept configuration for that
# duration so we also hard-code it here.
#
# http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_read_timeout
extraConfig = ''
proxy_read_timeout 660;
'';
};
locations."/metrics" = {
# Only allow our monitoringvpn subnet
......
......@@ -12,10 +12,10 @@
{ config, options, lib, ourpkgs, pkgs, ... }:
let
cfg = config.services.private-storage.monitoring.megacli2prom;
cfg = config.services.private-storage.monitoring.exporters.megacli2prom;
in {
options.services.private-storage.monitoring.megacli2prom = {
options.services.private-storage.monitoring.exporters.megacli2prom = {
enable = lib.mkEnableOption "MegaCli2Prom metrics gathering service";
outFile = lib.mkOption {
type = lib.types.str;
......
......@@ -6,15 +6,25 @@
# monitoring system
# See https://nixos.org/manual/nixos/stable/#module-services-prometheus-exporters
{ config, lib, pkgs, ... }:
{ config, lib, pkgs, options, ... }:
with lib;
let
cfg = config.services.private-storage.monitoring.exporters.node;
mountsFileSystemType = fsType: {} != filterAttrs (n: v: v.fsType == fsType) config.fileSystems;
in {
config.services.prometheus.exporters.node = {
options.services.private-storage.monitoring.exporters.node = {
enable = lib.mkEnableOption "Base system metrics collection";
textfiles-directory = lib.mkOption {
type = lib.types.str;
description = "Directory used by the textfiles collector.";
default = "/run/prometheus-node-exporter";
};
};
config.services.prometheus.exporters.node = lib.mkIf cfg.enable {
enable = true;
openFirewall = true;
firewallFilter = "-i monitoringvpn -p tcp -m tcp --dport 9100";
......@@ -22,7 +32,7 @@ in {
# extraFlags = [ "--collector.disable-defaults" ]; # not in nixpkgs 19.09
# Thanks https://github.com/mayflower/nixexprs/blob/master/modules/monitoring/default.nix
enabledCollectors = [
"arp"
# "arp" # is broken in 1.7.0 (2024-02-07)
"bcache"
"conntrack"
"filefd"
......@@ -30,16 +40,16 @@ in {
"netclass"
"netdev"
"netstat"
#"rapl" # not in nixpkgs 19.09
"rapl"
"sockstat"
#"softnet" # not in nixpkgs 19.09
"softnet"
"stat"
"systemd"
"textfile"
"textfile.directory /run/prometheus-node-exporter"
#"thermal_zone" # not in nixpkgs 19.09
"textfile.directory ${cfg.textfiles-directory}"
"thermal_zone"
"time"
#"udp_queues" # not in nixpkgs 19.09
"udp_queues"
"uname"
"vmstat"
] ++ optionals (!config.boot.isContainer) [
......@@ -59,7 +69,7 @@ in {
] ++ (
optionals (config.services.nfs.server.enable) [ "nfsd" ]
) ++ (
optionals ("" != config.boot.initrd.mdadmConf) [ "mdadm" ]
optionals ("" != config.boot.swraid.mdadmConf) [ "mdadm" ]
) ++ (
optionals ({} != config.networking.bonds) [ "bonding" ]
) ++ (
......@@ -67,7 +77,7 @@ in {
) ++ (
optionals (mountsFileSystemType "xfs") [ "xfs" ]
) ++ (
optionals (mountsFileSystemType "zfs" || elem "zfs" config.boot.supportedFilesystems) [ "zfs" ]
optionals (mountsFileSystemType "zfs" || config.boot.supportedFilesystems.zfs or false) [ "zfs" ]
);
};
}
......
# Promtail log forwarder configuration
#
# Scope: Tail logs on the local system and send them to Loki
#
# Description: This is not strictly an "exporter" like the Prometheus
# exporters, but it is very similar in what it is doing -
# preparing local data and sending it off to a TSDB.
{ config, options, lib, ... }:
let
cfg = config.services.private-storage.monitoring.exporters.promtail;
hostName = config.networking.hostName;
logRetention = toString(config.services.private-storage.monitoring.policy.logRetentionSeconds) + "s";
in {
options.services.private-storage.monitoring.exporters.promtail = {
enable = lib.mkEnableOption "Promtail log exporter service";
lokiUrl = lib.mkOption {
type = lib.types.str;
description = ''
The server URL that logs should be pushed to.
'';
# Resolving names is hard, let's have breakfast
# If you are curious why there's a plain IP address in here, read all of
# https://whetstone.private.storage/privatestorage/PrivateStorageio/-/merge_requests/251
# https://whetstone.private.storage/privatestorage/PrivateStorageio/-/merge_requests/257
# https://whetstone.private.storage/privatestorage/PrivateStorageio/-/merge_requests/258
default = "http://172.23.23.1:3100/loki/api/v1/push";
};
};
config = lib.mkIf cfg.enable {
services.promtail.enable = true;
networking.firewall.interfaces.monitoringvpn.allowedTCPPorts = [ 9080 ];
services.journald.extraConfig = ''
# This tells journald it can discard log files that contain only log
# entries older than...
MaxRetentionSec=${logRetention}
# This tells journald to start a new log file once a day. Together with
# the MaxRetentionSec setting, this means that entries are kept for
# up to a full day longer than MaxRetentionSec.
#
# See https://www.freedesktop.org/software/systemd/man/journald.conf.html
# for further details about these options.
#
MaxFileSec=1day
# This asks journald to not use more than 500M of disk space. Due to
# journald's characteristics this might only be a week of logs, but that
# should be okay since we ship all logs to a central server that keeps
# them for a while longer.
SystemMaxUse=500M
'';
services.promtail.configuration = {
server = {
http_listen_port = 9080; # Using /metrics for health check
grpc_listen_address = "127.0.0.1"; # unused, but no option to turn it off.
grpc_listen_port = 9094; # unused, but no option to turn it off.
};
clients = [{
url = cfg.lokiUrl;
}];
scrape_configs = [{
job_name = "systemd-journal";
journal = {
labels = {
job = "systemd-journal";
host = hostName;
};
};
# The journal has many internal labels that will be dropped by default
# because of their "__" prefix. To keep them, rename them.
# https://grafana.com/docs/loki/latest/clients/promtail/scraping/#journal-scraping-linux-only
# https://www.freedesktop.org/software/systemd/man/systemd.journal-fields.html
relabel_configs = [{
source_labels = [ "__journal__systemd_unit" ];
target_label = "unit";
}];
}];
};
};
}
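For reference, a minimal hedged sketch of how a monitored node could turn this exporter on once the module is imported (as nixos/modules/default.nix does above); lokiUrl keeps its default, which points at the monitoring VPN hub:

  services.private-storage.monitoring.exporters.promtail.enable = true;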
# Tahoe Prometheus metrics collector
#
# Scope: Retrieves OpenMetrics from Tahoe and puts them
# where textfile collector can find them.
# Scope: Retrieve metrics from Tahoe and put them where Prometheus'
# node-exporter's textfile collector can find them.
#
# Usage: Import this to every server running Tahoe.
#
......@@ -10,10 +10,11 @@
{ config, options, lib, pkgs, ... }:
let
cfg = config.services.private-storage.monitoring.tahoe;
cfg = config.services.private-storage.monitoring.exporters.tahoe;
inherit (config.services.private-storage.monitoring.exporters.node) textfiles-directory;
in {
options.services.private-storage.monitoring.tahoe = {
options.services.private-storage.monitoring.exporters.tahoe = {
enable = lib.mkEnableOption "Tahoe OpenMetrics collecting service";
scrapeEndpoint = lib.mkOption {
type = lib.types.str;
......@@ -23,7 +24,7 @@ in {
outFile = lib.mkOption {
type = lib.types.str;
description = "Where to store the temporary file for node exporter to scrape?";
default = "/run/prometheus-node-exporter/tahoe.prom";
default = "${textfiles-directory}/tahoe.prom";
};
interval = lib.mkOption {
type = lib.types.str;
......@@ -38,6 +39,15 @@ in {
config =
lib.mkIf cfg.enable {
assertions = [
{
assertion = config.services.private-storage.monitoring.exporters.node.enable;
message = ''
services.private-storage.monitoring.exporters.tahoe requires services.private-storage.monitoring.exporters.node to provide the textfile prometheus collector.
'';
}
];
environment.systemPackages = [ pkgs.curl ];
systemd.services.tahoe-metrics-collector = {
......@@ -45,14 +55,22 @@ in {
description = "Tahoe metrics gathering service";
after = [ "tahoe.storage.service" ];
startAt = cfg.interval;
path = [ pkgs.curl ];
path = [ pkgs.coreutils pkgs.findutils pkgs.curl ];
restartIfChanged = false;
# Save to a temp file and then move atomically so the
# textfile collector won't read a partial file.
# See https://github.com/prometheus/node_exporter#textfile-collector
script = ''
curl --silent --show-error --fail-with-body --output "${cfg.outFile}.tmp" "${cfg.scrapeEndpoint}"
set -euo pipefail
NUM_CORRUPTION_ADVISORIES=$(find /storage/corruption-advisories/ -type f | wc -l)
echo "tahoe_corruption_advisories_total $NUM_CORRUPTION_ADVISORIES" > "${cfg.outFile}.tmp"
NUM_INCIDENT_REPORTS=$(find /var/db/tahoe-lafs/storage/logs/incidents/ -type f | wc -l)
echo "tahoe_incident_reports_total $NUM_INCIDENT_REPORTS" >> "${cfg.outFile}.tmp"
curl --silent --show-error --fail-with-body "${cfg.scrapeEndpoint}" >> "${cfg.outFile}.tmp"
mv "${cfg.outFile}.tmp" "${cfg.outFile}"
'';
};
......
# Codify our log data retention policy
#
# A maximum retention of 30 days conforms to the published log retention policy,
# see https://private.storage/privacy-policy/ .
{ options, lib, ... }: {
options.services.private-storage.monitoring.policy = {
logRetentionSeconds = lib.mkOption {
type = lib.types.int;
description = "How long do we retain logs (seconds)";
default = 29 * (24 * 60 * 60); # 29 days, to accommodate the journald log rotation (1 day).
};
};
}
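As a hedged example (not part of this changeset), a grid wanting a shorter window could override the policy from its own configuration; the promtail exporter above derives journald's MaxRetentionSec from this value, so the override propagates to the nodes' journals:

  services.private-storage.monitoring.policy.logRetentionSeconds = 7 * (24 * 60 * 60); # 7 days, still within the 30-day policy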
......@@ -3,21 +3,30 @@
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"description": "Watching the watchers",
"editable": true,
"gnetId": null,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 22,
"links": [],
"liveNow": false,
"panels": [
{
"alert": {
......@@ -48,59 +57,100 @@
}
],
"executionErrorState": "alerting",
"for": "5m",
"for": "10m",
"frequency": "1m",
"handler": 1,
"name": "Scraping down",
"noDataState": "ok",
"notifications": []
},
"aliasColors": {},
"bars": true,
"dashLength": 10,
"dashes": false,
"datasource": null,
"datasource": {
"type": "prometheus",
"uid": "LocalPrometheus"
},
"description": "Is Prometheus having problems scraping our instances? Should be zero.",
"fieldConfig": {
"defaults": {},
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "bars",
"fillOpacity": 100,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "line"
}
},
"decimals": 0,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "transparent",
"value": null
},
{
"color": "red",
"value": 0
}
]
},
"unit": "short"
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"hiddenSeries": false,
"id": 6,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": false,
"linewidth": 1,
"nullPointMode": "null as zero",
"options": {
"alertThreshold": false
},
"percentage": false,
"pluginVersion": "7.5.10",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "10.4.6",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "LocalPrometheus"
},
"exemplar": true,
"expr": "count by (job, instance) (up == 0)",
"hide": false,
......@@ -109,61 +159,152 @@
"refId": "A"
}
],
"thresholds": [
"title": "Scraping failures",
"type": "timeseries"
},
{
"colorMode": "critical",
"fill": false,
"line": false,
"op": "gt",
"value": 0,
"visible": true
"alert": {
"alertRuleTags": {},
"conditions": [
{
"evaluator": {
"params": [
600
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A",
"5m",
"now"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Scraping failures",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
"executionErrorState": "alerting",
"for": "5m",
"frequency": "1m",
"handler": 1,
"message": "A metrics text file is older than 10 minutes.",
"name": "Textcollector staleness alert",
"noDataState": "no_data",
"notifications": []
},
"datasource": {
"type": "prometheus",
"uid": "LocalPrometheus"
},
"description": "Node-Exporter's TextCollector reads in plain text files containing metrics every few minutes. Make sure we're not reporting stale text files as new data - Alert if any of the text files is not getting updated.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"decimals": 0,
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"color": "green",
"value": null
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 8,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "LocalPrometheus"
},
"exemplar": true,
"expr": "time() - node_textfile_mtime_seconds",
"interval": "",
"legendFormat": "{{instance}}/{{file}}",
"refId": "A"
}
],
"yaxis": {
"align": false,
"alignLevel": null
"thresholds": [
{
"colorMode": "critical",
"op": "gt",
"value": 600,
"visible": true
}
],
"title": "Textfile collector freshness",
"type": "timeseries"
}
],
"refresh": false,
"schemaVersion": 27,
"style": "dark",
"schemaVersion": 39,
"tags": [],
"templating": {
"list": []
......@@ -176,5 +317,6 @@
"timezone": "",
"title": "Meta monitoring",
"uid": "MetaMonitoring",
"version": 1
"version": 1,
"weekStart": ""
}