From eb498726f554412ccb3d21a84286d0dc435cf351 Mon Sep 17 00:00:00 2001 From: Florian Sesser <florian@leastauthority.com> Date: Tue, 16 Feb 2021 15:16:42 +0000 Subject: [PATCH] Add graphviz and text to explain dashboard order --- docs/source/ops/monitoring.rst | 20 +++++++++++-- .../ops/service-dag-to-dashboard-order.dot | 29 +++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 docs/source/ops/service-dag-to-dashboard-order.dot diff --git a/docs/source/ops/monitoring.rst b/docs/source/ops/monitoring.rst index 6c010ac6..e30831ad 100644 --- a/docs/source/ops/monitoring.rst +++ b/docs/source/ops/monitoring.rst @@ -22,9 +22,23 @@ Introduction to our dashboards We have two groups of dashboards: Requests (external view, RED method) and Resources (internal view, USE method). -Services and their dependencies can be visualized as a tree from external-facing to internal systems. -We order our dashboards like a breadth-first-search of that tree. -This makes it easier to understand dependencies and faster to trouble shoot when a high-latency problem on a low-level service bubbles up. +Resources like CPU and memory exist independently of one another (at least in theory), and their corresponding dashboards are listed in arbitrary order. + +Services, on the other hand, often directly depend on other services: +A request might cause sub-requests, which in turn might call other services. +These dependencies can be visualized as a DAG (directed acyclic graph: like a tree, except that a node may have more than one parent) from external-facing to internal systems. + +When a service fails and an Alert is triggered, the services which depend on the failing service will often fail and trigger Alerts as well. +This can cause confusion and cost valuable time, especially when the current on-call staff is not familiar with the inner workings of a particular piece of machinery. 
+ +To mitigate this problem, we order our dashboards to mirror these dependencies, following a `breadth-first search <https://en.wikipedia.org/wiki/Breadth-first_search>`_ of the service dependency DAG: + +.. graphviz:: service-dag-to-dashboard-order.dot + :caption: DAG of services to resulting order of corresponding dashboards + +This makes finding the first failing link, and thus the cause of the problem, quicker: +Problems caused by a failing service low in the DAG bubble "upwards". +Therefore, the "lowest" dashboard that indicates a problem has a high probability of highlighting the origin of the cascading failures. Meaning of our metrics diff --git a/docs/source/ops/service-dag-to-dashboard-order.dot b/docs/source/ops/service-dag-to-dashboard-order.dot new file mode 100644 index 00000000..e957a8aa --- /dev/null +++ b/docs/source/ops/service-dag-to-dashboard-order.dot @@ -0,0 +1,29 @@ +digraph { + subgraph cluster01 { + label = "DAG of service dependencies"; + + 1->2; + 1->3; + + 3->4; + 3->5; + } + + subgraph cluster02 { + label = "Resulting order of dashboards"; + node [ shape = box ]; + edge [ style = invis ]; + + d1 [ label = 1 ]; + d2 [ label = 2 ]; + d3 [ label = 3 ]; + d4 [ label = 4 ]; + d5 [ label = 5 ]; + + d1->d2; + d2->d3; + d3->d4; + d4->d5; + } +} + -- GitLab