feat(cilium): add Prometheus alerts

This commit is contained in:
JJGadgets
2024-08-19 18:43:35 +08:00
parent cd0eb86b15
commit e8ffb64265
3 changed files with 161 additions and 10 deletions

View File

@@ -2,7 +2,7 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/kashalls/kromgo/main/config.schema.json
metrics:
- name: cluster_cpu_usage
query: round((sum by(cluster) (irate(node_cpu_seconds_total{kubernetes_node!="",mode!="idle"}[1m])) / on(cluster) group_left sum by (cluster)((irate(node_cpu_seconds_total{kubernetes_node!=""}[1m])))) * 100, 0.1)
query: round((sum by(cluster) (irate(node_cpu_seconds_total{kubernetes_node!="",mode!="idle"}[1m])) / on(cluster) group_left sum by (cluster)((irate(node_cpu_seconds_total{kubernetes_node!=""}[1m])))) * 100, 0.01)
suffix: "%"
colors:
- { color: "green", min: 0, max: 35 }
@@ -10,7 +10,7 @@ metrics:
- { color: "red", min: 76, max: 1000 }
- name: cluster_memory_usage
query: round(sum(node_memory_MemTotal_bytes{kubernetes_node!=""} - node_memory_MemAvailable_bytes{kubernetes_node!=""}) / sum(node_memory_MemTotal_bytes{kubernetes_node!=""}) * 100, 0.1)
query: round(sum(node_memory_MemTotal_bytes{kubernetes_node!=""} - node_memory_MemAvailable_bytes{kubernetes_node!=""}) / sum(node_memory_MemTotal_bytes{kubernetes_node!=""}) * 100, 0.01)
suffix: "%"
colors:
- { color: "green", min: 0, max: 35 }
@@ -25,7 +25,7 @@ metrics:
- { color: "red", min: 0, max: 1 }
- name: cluster_network_transmit_usage
query: round(sum by(cluster) (irate(node_network_transmit_bytes_total{kubernetes_node!="",device!~".+\\..+|lo|lxc.+",job="node-exporter"}[1m])*8/1024/1024),0.1)
query: round(sum by(cluster) (irate(node_network_transmit_bytes_total{kubernetes_node!="",device!~".+\\..+|lo|lxc.+",job="node-exporter"}[1m])*8/1024/1024),0.01)
suffix: "Mb/s"
colors:
- { color: "red", min: 0, max: 199 }
@@ -33,7 +33,7 @@ metrics:
- { color: "green", min: 1000, max: 10000 }
- name: cluster_network_receive_usage
query: round(sum by(cluster) (irate(node_network_receive_bytes_total{kubernetes_node!="",device!~".+\\..+|lo|lxc.+",job="node-exporter"}[1m])*8/1024/1024),0.1)
query: round(sum by(cluster) (irate(node_network_receive_bytes_total{kubernetes_node!="",device!~".+\\..+|lo|lxc.+",job="node-exporter"}[1m])*8/1024/1024),0.01)
suffix: "Mb/s"
colors:
- { color: "red", min: 0, max: 199 }
@@ -55,7 +55,7 @@ metrics:
- { color: "green", min: 360, max: 99999 }
- name: cluster_uptime_days
query: round(avg((node_time_seconds - node_boot_time_seconds) / 3600 / 24), 0.1)
query: round(avg((node_time_seconds - node_boot_time_seconds) / 3600 / 24), 0.01)
suffix: " days"
colors:
- { color: "red", min: 0, max: 6 }
@@ -78,7 +78,7 @@ metrics:
- { color: "red", min: 5, max: 9999 }
# Battery charge badge for blackhawk. The instance selector uses the regex
# matcher (=~) because "blackhawk.*" is a pattern, not a literal instance value;
# with plain "=" the selector would only match an instance literally named "blackhawk.*".
- name: blackhawk_battery_percent
query: round(node_power_supply_capacity{instance="blackhawk:9100"}, 0.1)
query: round(node_power_supply_capacity{instance=~"blackhawk.*"}, 0.01)
suffix: "%"
colors:
- { color: "red", min: 0, max: 19 }
@@ -86,7 +86,7 @@ metrics:
- { color: "green", min: 51, max: 99999 }
# Battery cycle-count badge for blackhawk. The instance selector uses the regex
# matcher (=~) because "blackhawk.*" is a pattern, not a literal instance value.
- name: blackhawk_battery_cycles
query: round(node_power_supply_cyclecount{instance="blackhawk:9100"}, 0.1)
query: round(node_power_supply_cyclecount{instance=~"blackhawk.*"}, 0.01)
suffix: " cycles"
colors:
- { color: "red", min: 0, max: 199 }
@@ -94,9 +94,103 @@ metrics:
- { color: "green", min: 400, max: 99999 }
# Battery health badge for blackhawk: current full-charge energy as a percentage
# of the design full-charge energy. Both selectors use the regex matcher (=~)
# because "blackhawk.*" is a pattern, not a literal instance value.
- name: blackhawk_battery_health
query: round((node_power_supply_energy_full{instance="blackhawk:9100"} / node_power_supply_energy_full_design{instance="blackhawk:9100"} * 100), 0.1)
query: round((node_power_supply_energy_full{instance=~"blackhawk.*"} / node_power_supply_energy_full_design{instance=~"blackhawk.*"} * 100), 0.01)
suffix: "%"
colors:
- { color: "red", min: 0, max: 79 }
- { color: "orange", min: 80, max: 89 }
- { color: "green", min: 90, max: 99999 }
# CPU usage badge for instances matching "nighthawk.*".
# NOTE(review): the value is rounded to 0.01 but the colour bands have integer
# edges (35 -> 36, 75 -> 76), so e.g. 35.5 falls in no band — confirm how the
# consumer colours such values.
- name: nighthawk_cpu_usage
# Earlier query attempts based on windows_cpu_time_total, kept commented out for reference.
#query: round(sum by(instance) (100 - avg(irate(windows_cpu_time_total{instance=~"nighthawk.*",mode="idle"}[1m]))*100), 0.01)
#query: round(sum by(instance) (irate(windows_cpu_time_total{mode!~"idle|privileged"}[1m]))*10, 0.01)
# Active query: per-instance sum of the 1m irate of windows_cpu_processor_utility_total, divided by 10000000.
query: round((sum by(instance) (irate(windows_cpu_processor_utility_total{instance=~"nighthawk.*"}[1m])) / 10000000), 0.01)
suffix: "%"
colors:
- { color: "green", min: 0, max: 35 }
- { color: "orange", min: 36, max: 75 }
- { color: "red", min: 76, max: 1000 }
# Share of physical memory in use on nighthawk, as a percentage.
# The used/total ratio is multiplied by 100 so the value matches the "%" suffix
# and the percent-scaled colour bands below; without the scaling the value stays
# in 0..1 and is always green. The instance selector matches the other
# nighthawk_* metrics so only that host's series are used.
- name: nighthawk_memory_usage
query: round(((windows_cs_physical_memory_bytes{instance=~"nighthawk.*"} - windows_memory_available_bytes{instance=~"nighthawk.*"}) / windows_cs_physical_memory_bytes{instance=~"nighthawk.*"}) * 100, 0.01)
suffix: "%"
colors:
- { color: "green", min: 0, max: 35 }
- { color: "orange", min: 36, max: 75 }
- { color: "red", min: 76, max: 1000 }
# Physical memory in use on nighthawk, converted from bytes to GiB
# (divided by 1024 three times), hence the "GiB" suffix rather than "%".
# The instance selector matches the other nighthawk_* metrics.
# NOTE(review): the colour bands below look copied from a percentage metric;
# confirm the intended thresholds in GiB.
- name: nighthawk_memory_bytes
query: round((((windows_cs_physical_memory_bytes{instance=~"nighthawk.*"} - windows_memory_available_bytes{instance=~"nighthawk.*"})) / 1024 / 1024 / 1024), 0.01)
suffix: "GiB"
colors:
- { color: "green", min: 0, max: 35 }
- { color: "orange", min: 36, max: 75 }
- { color: "red", min: 76, max: 1000 }
# GPU temperature badge for instances matching "nighthawk.*", shown with a "°C" suffix.
# Bands: green up to 60, orange 61-70, red from 71.
- name: nighthawk_nvidia_temperature
query: round(nvidia_smi_temperature_gpu{instance=~"nighthawk.*"}, 0.01)
suffix: "°C"
colors:
- { color: "green", min: 0, max: 60 }
- { color: "orange", min: 61, max: 70 }
- { color: "red", min: 71, max: 99999 }
# GPU power draw badge for instances matching "nighthawk.*", shown with a "W" suffix.
# Bands: green up to 50, orange 51-80, red from 81.
- name: nighthawk_nvidia_power_draw_watts
query: round(nvidia_smi_power_draw_watts{instance=~"nighthawk.*"}, 0.01)
suffix: "W"
colors:
- { color: "green", min: 0, max: 50 }
- { color: "orange", min: 51, max: 80 }
- { color: "red", min: 81, max: 99999 }
# GPU power draw as a percentage of the enforced power limit.
# The draw/limit ratio is multiplied by 100 and suffixed "%" so the value
# matches the metric name and the percent-scaled colour bands below.
- name: nighthawk_nvidia_power_draw_percent
query: round(nvidia_smi_power_draw_watts{instance=~"nighthawk.*"} / nvidia_smi_enforced_power_limit_watts{instance=~"nighthawk.*"} * 100, 0.01)
suffix: "%"
colors:
- { color: "green", min: 0, max: 39 }
- { color: "orange", min: 40, max: 69 }
- { color: "red", min: 70, max: 99999 }
# GPU fan speed as a percentage.
# The source metric is a ratio (its name ends in _ratio), so it is multiplied
# by 100 and suffixed "%" to match the metric name and the colour bands below.
- name: nighthawk_nvidia_fan_speed_percent
query: round(nvidia_smi_fan_speed_ratio{instance=~"nighthawk.*"} * 100, 0.01)
suffix: "%"
colors:
- { color: "green", min: 0, max: 50 }
- { color: "orange", min: 51, max: 80 }
- { color: "red", min: 81, max: 99999 }
# Current GPU graphics clock, converted from Hz to MHz (divided by 1000000).
# A single green band covers the whole range, so the badge colour never changes.
- name: nighthawk_nvidia_graphics_clock
query: round(nvidia_smi_clocks_current_graphics_clock_hz{instance=~"nighthawk.*"} / 1000000, 0.01)
suffix: "MHz"
colors:
- { color: "green", min: 0, max: 99999 }
# GPU utilisation as a percentage (utilisation ratio multiplied by 100),
# hence the "%" suffix rather than "MHz".
# A single green band covers the whole range, so the badge colour never changes.
- name: nighthawk_nvidia_graphics_usage
query: round(nvidia_smi_utilization_gpu_ratio{instance=~"nighthawk.*"} * 100, 0.01)
suffix: "%"
colors:
- { color: "green", min: 0, max: 99999 }
# GPU memory in use, converted from bytes to decimal gigabytes (divided by 1000000000).
# NOTE(review): the value is rounded to 0.01 but the bands step 2 -> 2.1 and
# 5 -> 5.1, so e.g. 2.05 falls in no band — confirm how the consumer colours it.
- name: nighthawk_nvidia_memory_used
query: round(nvidia_smi_memory_used_bytes{instance=~"nighthawk.*"} / 1000000000, 0.01)
suffix: "GB"
colors:
- { color: "red", min: 0, max: 2 }
- { color: "orange", min: 2.1, max: 5 }
- { color: "green", min: 5.1, max: 99999 }
# Percentage of Cilium endpoints per cluster whose endpoint_state is not "ready".
# Both numerator and denominator are averaged over 5 minutes before dividing;
# the result is rounded to whole percent, so the integer colour bands leave no gaps.
- name: cilium_endpoints_unready
query: round((sum by(cluster) (avg_over_time(cilium_endpoint_state{endpoint_state!="ready"}[5m]))) / (sum by(cluster) (avg_over_time(cilium_endpoint_state[5m]))) * 100, 1)
suffix: "%"
colors:
- { color: "green", min: 0, max: 9 }
- { color: "orange", min: 10, max: 50 }
- { color: "red", min: 51, max: 99999 }
# Highest BPF map pressure in the cluster, as a percentage.
# Uses max rather than sum: adding the pressure of every map together gives a
# number that grows with the count of series instead of a meaningful
# percentage. max reports the single most-filled map.
- name: cilium_bpf_map_pressure
query: round(max by(cluster) (cilium_bpf_map_pressure * 100), 0.1)
suffix: "%"
colors:
- { color: "green", min: 0, max: 50 }
- { color: "orange", min: 50.1, max: 69.9 }
- { color: "red", min: 70, max: 99999 }

View File

@@ -32,9 +32,9 @@ spec:
env: &env
TZ: "${CONFIG_TZ}"
PROMETHEUS_URL: http://vmsingle-victoria.monitoring.svc.cluster.local.:8429
SERVER_HOST: "::"
SERVER_HOST: &host "[::]"
SERVER_PORT: &http 8080
HEALTH_HOST: "::"
HEALTH_HOST: *host
HEALTH_PORT: &health 8888
securityContext: &sc
readOnlyRootFilesystem: true

View File

@@ -0,0 +1,57 @@
---
# yaml-language-server: $schema=https://crds.jank.ing/monitoring.coreos.com/prometheusrule_v1.json
# PrometheusRule named "cilium" in the "monitoring" namespace; it declares a
# single rule group, "cilium.rules", whose alert rules follow.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: cilium
namespace: monitoring
spec:
groups:
- name: cilium.rules
rules:
# Fires (critical) when no `up` series with job="cilium-agent" has existed for 15 minutes.
- alert: CiliumAgentAbsent
annotations:
summary: Cilium Agent has disappeared from Prometheus target discovery.
expr: |
absent(up{job="cilium-agent"})
for: 15m
labels:
severity: critical
# Fires (critical) when, for a given `node` label, the rounded percentage of
# Cilium endpoints whose endpoint_state is not "ready" (5-minute averages)
# stays above 10 for 10 minutes. The un-averaged variant of the expression is
# kept commented out below.
# NOTE(review): assumes cilium_endpoint_state carries a `node` label — confirm against the scrape config.
- alert: CiliumEndpointsStuckNode
annotations:
summary: >-
Node `{{ $labels.node }}` has {{ $value }}% stuck Cilium endpoints, threshold 10%.
#expr: | # >10% of endpoints are un-ready # might be more noisy without averaging over 5 minutes?
# round((sum by(node) (cilium_endpoint_state{endpoint_state!="ready"})) / (sum by(node) (cilium_endpoint_state)) * 100, 1) > 10
expr: | # >10% of endpoints are un-ready, calculated after some smoothening via averaging over 5 minutes
round((sum by(node) (avg_over_time(cilium_endpoint_state{endpoint_state!="ready"}[5m]))) / (sum by(node) (avg_over_time(cilium_endpoint_state[5m]))) * 100, 1) > 10
for: 10m
labels:
severity: critical
# Cluster-wide counterpart of the per-node stuck-endpoints rule: fires
# (critical) when the rounded percentage of non-"ready" Cilium endpoints per
# `cluster` label (5-minute averages) stays above 10 for 10 minutes.
- alert: CiliumEndpointsStuckCluster
annotations:
summary: >-
Cluster `{{ $labels.cluster }}` has {{ $value }}% stuck Cilium endpoints, threshold 10%.
expr: | # >10% of endpoints across cluster are un-ready, calculated after some smoothening via averaging over 5 minutes
round((sum by(cluster) (avg_over_time(cilium_endpoint_state{endpoint_state!="ready"}[5m]))) / (sum by(cluster) (avg_over_time(cilium_endpoint_state[5m]))) * 100, 1) > 10
for: 10m
labels:
severity: critical
# Fires (critical) when, for any (node, map_name) pair, the maximum of
# cilium_bpf_map_pressure * 100 is at least 60 for 1 minute.
- alert: HighBPFMapPressureNode
annotations:
summary: >-
Node `{{ $labels.node }}` has high ({{ $value }}%) BPF map pressure for map `{{ $labels.map_name }}`, threshold 60%.
expr: |
max by (node, map_name) (cilium_bpf_map_pressure * 100) >= 60
for: 1m
labels:
severity: critical
# Fires (critical) when, for any (cluster, map_name) pair, the maximum of
# cilium_bpf_map_pressure * 100 is at least 60 for 1 minute.
# Named ...Cluster so it is distinguishable by alertname from the rule that
# aggregates by node.
- alert: HighBPFMapPressureCluster
annotations:
summary: >-
Cluster `{{ $labels.cluster }}` has high ({{ $value }}%) BPF map pressure for map `{{ $labels.map_name }}`, threshold 60%.
expr: |
max by (cluster, map_name) (cilium_bpf_map_pressure * 100) >= 60
for: 1m
labels:
severity: critical