From f8dbc0584afde6cc8063218dd5fd869b04b09386 Mon Sep 17 00:00:00 2001 From: Florian Sesser <florian@private.storage> Date: Mon, 5 Sep 2022 15:38:25 +0000 Subject: [PATCH] Monitoring: Add Tahoe-LAFS corruption advisory count + rate + alert on rate > 0 --- .../server/grafana-dashboards/tahoe-lafs.json | 575 ++++++++---------- 1 file changed, 255 insertions(+), 320 deletions(-) diff --git a/nixos/modules/monitoring/server/grafana-dashboards/tahoe-lafs.json b/nixos/modules/monitoring/server/grafana-dashboards/tahoe-lafs.json index 6d7e7014..909572db 100644 --- a/nixos/modules/monitoring/server/grafana-dashboards/tahoe-lafs.json +++ b/nixos/modules/monitoring/server/grafana-dashboards/tahoe-lafs.json @@ -8,21 +8,26 @@ "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, "type": "dashboard" } ] }, "description": "", "editable": true, - "gnetId": null, + "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 41, - "iteration": 1636742282779, + "iteration": 1662390420143, "links": [], + "liveNow": false, "panels": [ { "collapsed": false, - "datasource": null, "gridPos": { "h": 1, "w": 24, @@ -39,7 +44,6 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, "description": "These stats keep track of local disk latencies for storage-server operations. All values are in seconds. These are recorded by the storage server, starting from the time the request arrives (post-deserialization) and ending when the response begins serialization. As such, they are mostly useful for measuring disk speeds.", "fieldConfig": { "defaults": { @@ -73,7 +77,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.11", + "pluginVersion": "8.4.7", "pointradius": 2, "points": false, "renderer": "flot", @@ -139,9 +143,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Latency means", "tooltip": { "shared": true, @@ -150,9 +152,7 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -160,25 +160,18 @@ { "$$hashKey": "object:1111", "format": "s", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "$$hashKey": "object:1112", "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -186,12 +179,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, "description": "This counts inbound storage-server operations.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -218,7 +206,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.11", + "pluginVersion": "8.4.7", "pointradius": 2, "points": false, "renderer": "flot", @@ -308,9 +296,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Counts/s", "tooltip": { "shared": true, @@ -319,9 +305,7 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -329,25 +313,19 @@ { "$$hashKey": "object:2483", "format": "short", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true }, { "$$hashKey": "object:2484", "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -355,11 +333,6 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -388,7 +361,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.11", + "pluginVersion": "8.4.7", "pointradius": 2, "points": false, "renderer": "flot", @@ -414,9 +387,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Bytes/s", "tooltip": { "shared": true, @@ -425,9 +396,7 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -435,30 +404,23 @@ { "$$hashKey": "object:2568", "format": "bytes", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true }, { "$$hashKey": "object:2569", "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { "collapsed": false, - "datasource": null, "gridPos": { "h": 1, "w": 24, @@ -467,12 +429,10 @@ }, "id": 19, "panels": [], - "repeat": null, "title": "Latency Histograms", "type": "row" }, { - "datasource": null, "description": "These stats keep track of local disk latencies for storage-server operations. All values are in seconds. These are recorded by the storage server, starting from the time the request arrives (post-deserialization) and ending when the response begins serialization. As such, they are mostly useful for measuring disk speeds.", "fieldConfig": { "defaults": { @@ -517,16 +477,9 @@ "showUnfilled": true, "text": {} }, - "pluginVersion": "7.5.11", + "pluginVersion": "8.4.7", "repeat": "storageserverop", "repeatDirection": "h", - "scopedVars": { - "storageserverop": { - "selected": true, - "text": "allocate", - "value": "allocate" - } - }, "targets": [ { "exemplar": true, @@ -539,147 +492,8 @@ "title": "$storageserverop", "type": "bargauge" }, - { - "datasource": null, - "description": "These stats keep track of local disk latencies for storage-server operations. All values are in seconds. These are recorded by the storage server, starting from the time the request arrives (post-deserialization) and ending when the response begins serialization. As such, they are mostly useful for measuring disk speeds.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 10 - }, - "id": 39, - "options": { - "displayMode": "gradient", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showUnfilled": true, - "text": {} - }, - "pluginVersion": "7.5.11", - "repeatDirection": "h", - "repeatIteration": 1636742282779, - "repeatPanelId": 11, - "scopedVars": { - "storageserverop": { - "selected": true, - "text": "write", - "value": "write" - } - }, - "targets": [ - { - "exemplar": true, - "expr": "tahoe_stats_storage_server_latencies_$storageserverop", - "interval": "", - "legendFormat": "{{quantile}}", - "refId": "A" - } - ], - "title": "$storageserverop", - "type": "bargauge" - }, - { - "datasource": null, - "description": "These stats keep track of local disk latencies for storage-server operations. All values are in seconds. These are recorded by the storage server, starting from the time the request arrives (post-deserialization) and ending when the response begins serialization. As such, they are mostly useful for measuring disk speeds.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 10 - }, - "id": 40, - "options": { - "displayMode": "gradient", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showUnfilled": true, - "text": {} - }, - "pluginVersion": "7.5.11", - "repeatDirection": "h", - "repeatIteration": 1636742282779, - "repeatPanelId": 11, - "scopedVars": { - "storageserverop": { - "selected": true, - "text": "readv", - "value": "readv" - } - }, - "targets": [ - { - "exemplar": true, - "expr": "tahoe_stats_storage_server_latencies_$storageserverop", - "interval": "", - "legendFormat": "{{quantile}}", - "refId": "A" - } - ], - "title": "$storageserverop", - "type": "bargauge" - }, { "collapsed": false, - "datasource": null, "gridPos": { "h": 1, "w": 24, @@ -696,12 +510,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, "description": "These all reflect disk-space usage policies and status.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -728,7 +537,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.11", + "pluginVersion": "8.4.7", "pointradius": 2, "points": false, "renderer": "flot", @@ -771,9 +580,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Bytes free", "tooltip": { "shared": true, @@ -782,9 +589,7 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -792,25 +597,18 @@ { "$$hashKey": "object:712", "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "$$hashKey": "object:713", "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -818,12 +616,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, "description": "These all reflect disk-space usage policies and status.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -850,7 +643,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.11", + "pluginVersion": "8.4.7", "pointradius": 2, "points": false, "renderer": "flot", @@ -876,9 +669,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Bytes used", "tooltip": { "shared": true, @@ -887,9 +678,7 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -897,25 +686,18 @@ { "$$hashKey": "object:712", "format": "bytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "$$hashKey": "object:713", "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -923,12 +705,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, "description": "This counts the number of ‘buckets’ (i.e. unique storage-index values) currently managed by the storage server. It indicates roughly how many files are managed by the server.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -955,7 +732,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.11", + "pluginVersion": "8.4.7", "pointradius": 2, "points": false, "renderer": "flot", @@ -973,9 +750,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Total bucket count", "tooltip": { "shared": true, @@ -984,9 +759,7 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -994,30 +767,22 @@ { "$$hashKey": "object:797", "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "$$hashKey": "object:798", "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { "collapsed": false, - "datasource": null, "gridPos": { "h": 1, "w": 24, @@ -1034,12 +799,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, "description": "Estimate of what percentage of system CPU time was consumed by the node process, over the given time interval. ", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -1066,7 +826,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.11", + "pluginVersion": "8.4.7", "pointradius": 2, "points": false, "renderer": "flot", @@ -1102,9 +862,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "CPU monitor", "tooltip": { "shared": true, @@ -1113,18 +871,14 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:62", - "decimals": null, "format": "percentunit", - "label": null, "logBase": 1, "max": "1", "min": "0", @@ -1133,16 +887,12 @@ { "$$hashKey": "object:63", "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -1150,12 +900,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, "description": "Estimate of total number of CPU seconds consumed by node since the process was started. Ticket #472 indicates that .total may sometimes be negative due to wraparound of the kernel’s counter.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -1182,7 +927,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.11", + "pluginVersion": "8.4.7", "pointradius": 2, "points": false, "renderer": "flot", @@ -1200,9 +945,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "CPU time total", "tooltip": { "shared": true, @@ -1211,36 +954,27 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "$$hashKey": "object:62", - "decimals": null, "format": "s", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true }, { "$$hashKey": "object:63", "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -1248,12 +982,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, "description": "How many seconds since the node process was started.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -1280,7 +1009,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.11", + "pluginVersion": "8.4.7", "pointradius": 2, "points": false, "renderer": "flot", @@ -1298,9 +1027,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Node uptime", "tooltip": { "shared": true, @@ -1309,9 +1036,7 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, @@ -1319,44 +1044,257 @@ { "$$hashKey": "object:386", "format": "s", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "$$hashKey": "object:387", "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 42, + "panels": [], + "title": "Corruption Advisories", + "type": "row" + }, + { + "description": "File count of /storage/corruption-advisories/", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 60, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 37 + }, + "id": 44, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "exemplar": true, + "expr": "tahoe_corruption_advisories_total", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Corruption Advisory count", + "type": "timeseries" + }, + { + "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "name": "Corruption Advisory rate alert", + "noDataState": "no_data", + "notifications": [] + }, + "description": "Rate of new files in /storage/corruption-advisories/", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 60, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 37 + }, + "id": 46, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "exemplar": true, + "expr": "rate(tahoe_corruption_advisories_total[5m])", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "op": "gt", + "value": 0, + "visible": true + } + ], + "title": "Corruption Advisory rate", + "type": "timeseries" } ], - "schemaVersion": 27, + "schemaVersion": 35, "style": "dark", "tags": [], "templating": { "list": [ { - "allValue": null, "current": { - "selected": false, + "selected": true, "text": "storage1", "value": "storage1" }, - "datasource": null, "definition": "tahoe_stats_cpu_monitor_1min_avg", "description": "Which node (instamce) to show", - "error": null, "hide": 0, "includeAll": false, "label": "Node", @@ -1372,16 +1310,13 @@ "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { - "allValue": null, "current": { "selected": true, - "tags": [], "text": [ "allocate", "write", @@ -1394,7 +1329,6 @@ ] }, "description": "Inbound storage-server operations ", - "error": null, "hide": 0, "includeAll": true, "label": "Detailed latencies for", @@ -1472,5 +1406,6 @@ "timezone": "", "title": "Tahoe-LAFS", "uid": "TahoeLAFS", - "version": 1 + "version": 1, + "weekStart": "" } -- GitLab