[WIFI-11509] Add: Grafana QA debugging dashboard

Signed-off-by: Dmitry Dunaev <dmitry@opsfleet.com>
This commit is contained in:
Dmitry Dunaev
2022-11-08 14:42:25 +03:00
parent f99ce7864f
commit 22f4579d99
3 changed files with 863 additions and 2 deletions

View File

@@ -1,4 +1,3 @@
# uCentral analytics
## Collecting metrics with ELK stack
@@ -79,4 +78,27 @@ Finally we can create visualization in Kibana based on AP metrics like the follo
![kibana-viz-00.png](img/kibana-viz-00.png)
![kibana-viz-01.png](img/kibana-viz-01.png)
Exports of these Kibana visualization can be found [here](kibana/dashboard-ucentral-metrics.json).
Exports of these Kibana visualization can be found [here](kibana/dashboard-ucentral-metrics.json).
## Grafana QA debugging dashboard
![grafana-qa-debugging](img/grafana.png)
The repository includes an exported Grafana dashboard that is used for debugging purposes in TIP-managed testing environments deployed in Kubernetes. The dashboard provides all the required information about container operation, container restarts and runtime logs.
Provided information (from top left to bottom right):
1. CPU usage information (usage, container requests and limits, throttling info);
2. RAM usage information (usage, container requests and limits);
3. Total container restarts count;
4. Last restart exit code;
5. Container logs.
### Deployment requirements
1. ElasticSearch 6 (version 6.8.22 is tested) - used for logs collection;
2. Prometheus (version 2.39.1 is tested) - used for metrics collection (Operator is recommended);
3. kube-state-metrics - used for pod metrics collection (requires the forked version from https://github.com/Telecominfraproject/wlan-cloud-kube-state-metrics, which has the pod exit code metric enabled; if you need a Docker image, you may use the latest one from https://tip.jfrog.io/ui/repos/tree/General/tip-wlan-cloud-ucentral/kube-state-metrics);
4. Grafana 8 (version 8.5.13 is tested) - used for visualization.
Grafana 8 is used because support for ElasticSearch 6 is deprecated in Grafana 9.

839
grafana/qa-debugging.json Normal file
View File

@@ -0,0 +1,839 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
},
{
"name": "DS_ELASTICSEARCH",
"label": "Elasticsearch",
"description": "",
"type": "datasource",
"pluginId": "elasticsearch",
"pluginName": "Elasticsearch"
}
],
"__elements": [],
"__requires": [
{
"type": "datasource",
"id": "elasticsearch",
"name": "Elasticsearch",
"version": "1.0.0"
},
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "8.5.3"
},
{
"type": "panel",
"id": "logs",
"name": "Logs",
"version": ""
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "stat",
"name": "Stat",
"version": ""
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"enable": true,
"expr": "increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\",pod=\"$pod\"}[10m])",
"iconColor": "dark-purple",
"name": "Restarts",
"titleFormat": "Restart"
}
]
},
"description": "",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"iteration": 1667481532739,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineStyle": {
"fill": "solid"
},
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": true,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": ".* requests"
},
"properties": [
{
"id": "custom.lineStyle",
"value": {
"dash": [
10,
15
],
"fill": "dash"
}
},
{
"id": "color",
"value": {
"fixedColor": "yellow",
"mode": "fixed",
"seriesBy": "last"
}
},
{
"id": "custom.lineWidth",
"value": 2
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": ".* limits"
},
"properties": [
{
"id": "custom.lineStyle",
"value": {
"dash": [
10,
10
],
"fill": "dash"
}
},
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
},
{
"id": "custom.lineWidth",
"value": 2
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": ".* throttling"
},
"properties": [
{
"id": "custom.axisPlacement",
"value": "right"
},
{
"id": "unit",
"value": "percentunit"
},
{
"id": "color",
"value": {
"fixedColor": "dark-blue",
"mode": "fixed"
}
},
{
"id": "max",
"value": 1
},
{
"id": "custom.lineWidth",
"value": 2
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": ".* usage"
},
"properties": [
{
"id": "custom.fillOpacity",
"value": 100
}
]
}
]
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 0
},
"id": 2,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=\"$pod\", container=\"$container\"}) by (container)",
"legendFormat": "{{container}} usage",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\", container=\"$container\"}) by (container)",
"hide": false,
"legendFormat": "{{container}} requests",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=\"$container\", resource=\"cpu\"}) by (container)",
"hide": false,
"legendFormat": "{{container}} limits",
"range": true,
"refId": "C"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(increase(container_cpu_cfs_throttled_periods_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", namespace=\"$namespace\", pod=\"$pod\", container=\"$container\", cluster=\"$cluster\"}[$__rate_interval])) by (container) /sum(increase(container_cpu_cfs_periods_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", namespace=\"$namespace\", pod=\"$pod\", container=\"$container\", cluster=\"$cluster\"}[$__rate_interval])) by (container)\n",
"hide": false,
"legendFormat": "{{container}} throttling",
"range": true,
"refId": "D"
}
],
"title": "CPU Usage (in shares of cores)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineStyle": {
"fill": "solid"
},
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": true,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "bytes"
},
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": ".* requests"
},
"properties": [
{
"id": "custom.lineStyle",
"value": {
"dash": [
10,
15
],
"fill": "dash"
}
},
{
"id": "color",
"value": {
"fixedColor": "yellow",
"mode": "fixed",
"seriesBy": "last"
}
},
{
"id": "custom.lineWidth",
"value": 2
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": ".* limits"
},
"properties": [
{
"id": "custom.lineStyle",
"value": {
"dash": [
10,
10
],
"fill": "dash"
}
},
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
},
{
"id": "custom.lineWidth",
"value": 2
}
]
},
{
"matcher": {
"id": "byRegexp",
"options": ".* usage"
},
"properties": [
{
"id": "custom.fillOpacity",
"value": 100
}
]
}
]
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 0
},
"id": 3,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=\"$container\", image!=\"\"}) by (container)",
"legendFormat": "{{container}} usage",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\", container=\"$container\"}) by (container)",
"hide": false,
"legendFormat": "{{container}} requests",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=\"$container\", resource=\"memory\"}) by (container)",
"hide": false,
"legendFormat": "{{container}} limits",
"range": true,
"refId": "C"
}
],
"title": "Memory usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 3,
"x": 0,
"y": 9
},
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "8.5.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "sum(kube_pod_container_status_restarts_total{namespace=\"$namespace\",pod=\"$pod\",container=\"$container\"})",
"refId": "A"
}
],
"title": "Total restarts amount",
"type": "stat"
},
{
"datasource": {
"type": "elasticsearch",
"uid": "${DS_ELASTICSEARCH}"
},
"gridPos": {
"h": 13,
"w": 21,
"x": 3,
"y": 9
},
"id": 7,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"targets": [
{
"alias": "",
"bucketAggs": [
{
"field": "@timestamp",
"id": "2",
"settings": {
"interval": "auto"
},
"type": "date_histogram"
}
],
"datasource": {
"type": "elasticsearch",
"uid": "${DS_ELASTICSEARCH}"
},
"metrics": [
{
"id": "1",
"type": "logs"
}
],
"query": "kubernetes.namespace_name.keyword: \"$namespace\" AND kubernetes.pod_name.keyword: \"$pod\" AND kubernetes.container_name.keyword: \"$container\"",
"refId": "A",
"timeField": "@timestamp"
}
],
"title": "Logs",
"type": "logs"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 3,
"x": 0,
"y": 15
},
"id": 9,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "8.5.3",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "sum(kube_pod_container_status_last_terminated_exitcode{namespace=\"$namespace\",pod=\"$pod\",container=\"$container\"})",
"refId": "A"
}
],
"title": "Last restart exit code",
"type": "stat"
}
],
"refresh": "10s",
"schemaVersion": 36,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "default",
"value": "default"
},
"hide": 0,
"includeAll": false,
"label": "Prometheus source",
"multi": false,
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"current": {
"selected": false,
"text": "Elasticsearch",
"value": "Elasticsearch"
},
"hide": 0,
"includeAll": false,
"label": "ES source",
"multi": false,
"name": "DS_ELASTICSEARCH",
"options": [],
"query": "elasticsearch",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"current": {},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(up{job=\"kube-state-metrics\"}, cluster)",
"hide": 2,
"includeAll": false,
"multi": false,
"name": "cluster",
"options": [],
"query": {
"query": "label_values(up{job=\"kube-state-metrics\"}, cluster)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(kube_namespace_status_phase{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)",
"hide": 0,
"includeAll": false,
"label": "Namespace",
"multi": false,
"name": "namespace",
"options": [],
"query": {
"query": "label_values(kube_namespace_status_phase{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(kube_pod_info{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\"}, pod)",
"hide": 0,
"includeAll": false,
"label": "Pod",
"multi": false,
"name": "pod",
"options": [],
"query": {
"query": "label_values(kube_pod_info{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\"}, pod)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(container_memory_usage_bytes{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\"}, container)",
"hide": 0,
"includeAll": false,
"label": "Container",
"multi": false,
"name": "container",
"options": [],
"query": {
"query": "label_values(container_memory_usage_bytes{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\"}, container)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "QA debugging",
"uid": "1Ge5lwNVz",
"version": 5,
"weekStart": ""
}

BIN
img/grafana.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 370 KiB