Mirror of https://github.com/outbackdingo/Biohazard.git, synced 2026-01-27 10:18:27 +00:00
feat(cilium): add Prometheus alerts
@@ -2,7 +2,7 @@
 # yaml-language-server: $schema=https://raw.githubusercontent.com/kashalls/kromgo/main/config.schema.json
 metrics:
   - name: cluster_cpu_usage
-    query: round((sum by(cluster) (irate(node_cpu_seconds_total{kubernetes_node!="",mode!="idle"}[1m])) / on(cluster) group_left sum by (cluster)((irate(node_cpu_seconds_total{kubernetes_node!=""}[1m])))) * 100, 0.1)
+    query: round((sum by(cluster) (irate(node_cpu_seconds_total{kubernetes_node!="",mode!="idle"}[1m])) / on(cluster) group_left sum by (cluster)((irate(node_cpu_seconds_total{kubernetes_node!=""}[1m])))) * 100, 0.01)
     suffix: "%"
     colors:
       - { color: "green", min: 0, max: 35 }
@@ -10,7 +10,7 @@ metrics:
       - { color: "red", min: 76, max: 1000 }

   - name: cluster_memory_usage
-    query: round(sum(node_memory_MemTotal_bytes{kubernetes_node!=""} - node_memory_MemAvailable_bytes{kubernetes_node!=""}) / sum(node_memory_MemTotal_bytes{kubernetes_node!=""}) * 100, 0.1)
+    query: round(sum(node_memory_MemTotal_bytes{kubernetes_node!=""} - node_memory_MemAvailable_bytes{kubernetes_node!=""}) / sum(node_memory_MemTotal_bytes{kubernetes_node!=""}) * 100, 0.01)
     suffix: "%"
     colors:
       - { color: "green", min: 0, max: 35 }
@@ -25,7 +25,7 @@ metrics:
       - { color: "red", min: 0, max: 1 }

   - name: cluster_network_transmit_usage
-    query: round(sum by(cluster) (irate(node_network_transmit_bytes_total{kubernetes_node!="",device!~".+\\..+|lo|lxc.+",job="node-exporter"}[1m])*8/1024/1024),0.1)
+    query: round(sum by(cluster) (irate(node_network_transmit_bytes_total{kubernetes_node!="",device!~".+\\..+|lo|lxc.+",job="node-exporter"}[1m])*8/1024/1024),0.01)
     suffix: "Mb/s"
     colors:
       - { color: "red", min: 0, max: 199 }
@@ -33,7 +33,7 @@ metrics:
       - { color: "green", min: 1000, max: 10000 }

   - name: cluster_network_receive_usage
-    query: round(sum by(cluster) (irate(node_network_receive_bytes_total{kubernetes_node!="",device!~".+\\..+|lo|lxc.+",job="node-exporter"}[1m])*8/1024/1024),0.1)
+    query: round(sum by(cluster) (irate(node_network_receive_bytes_total{kubernetes_node!="",device!~".+\\..+|lo|lxc.+",job="node-exporter"}[1m])*8/1024/1024),0.01)
     suffix: "Mb/s"
     colors:
       - { color: "red", min: 0, max: 199 }
@@ -55,7 +55,7 @@ metrics:
       - { color: "green", min: 360, max: 99999 }

   - name: cluster_uptime_days
-    query: round(avg((node_time_seconds - node_boot_time_seconds) / 3600 / 24), 0.1)
+    query: round(avg((node_time_seconds - node_boot_time_seconds) / 3600 / 24), 0.01)
     suffix: " days"
     colors:
       - { color: "red", min: 0, max: 6 }
@@ -78,7 +78,7 @@ metrics:
       - { color: "red", min: 5, max: 9999 }

   - name: blackhawk_battery_percent
-    query: round(node_power_supply_capacity{instance="blackhawk:9100"}, 0.1)
+    query: round(node_power_supply_capacity{instance=~"blackhawk.*"}, 0.01)
     suffix: "%"
     colors:
       - { color: "red", min: 0, max: 19 }
@@ -86,7 +86,7 @@ metrics:
       - { color: "green", min: 51, max: 99999 }

   - name: blackhawk_battery_cycles
-    query: round(node_power_supply_cyclecount{instance="blackhawk:9100"}, 0.1)
+    query: round(node_power_supply_cyclecount{instance=~"blackhawk.*"}, 0.01)
     suffix: " cycles"
     colors:
       - { color: "red", min: 0, max: 199 }
@@ -94,9 +94,103 @@ metrics:
       - { color: "green", min: 400, max: 99999 }

   - name: blackhawk_battery_health
-    query: round((node_power_supply_energy_full{instance="blackhawk:9100"} / node_power_supply_energy_full_design{instance="blackhawk:9100"} * 100), 0.1)
+    query: round((node_power_supply_energy_full{instance=~"blackhawk.*"} / node_power_supply_energy_full_design{instance=~"blackhawk.*"} * 100), 0.01)
     suffix: "%"
     colors:
       - { color: "red", min: 0, max: 79 }
       - { color: "orange", min: 80, max: 89 }
       - { color: "green", min: 90, max: 99999 }
+
+  - name: nighthawk_cpu_usage
+    #query: round(sum by(instance) (100 - avg(irate(windows_cpu_time_total{instance=~"nighthawk.*",mode="idle"}[1m]))*100), 0.01)
+    #query: round(sum by(instance) (irate(windows_cpu_time_total{mode!~"idle|privileged"}[1m]))*10, 0.01)
+    query: round((sum by(instance) (irate(windows_cpu_processor_utility_total{instance=~"nighthawk.*"}[1m])) / 10000000), 0.01)
+    suffix: "%"
+    colors:
+      - { color: "green", min: 0, max: 35 }
+      - { color: "orange", min: 36, max: 75 }
+      - { color: "red", min: 76, max: 1000 }
+
+  - name: nighthawk_memory_usage
+    query: round(((windows_cs_physical_memory_bytes - windows_memory_available_bytes) / windows_cs_physical_memory_bytes) * 100, 0.01)
+    suffix: "%"
+    colors:
+      - { color: "green", min: 0, max: 35 }
+      - { color: "orange", min: 36, max: 75 }
+      - { color: "red", min: 76, max: 1000 }
+
+  - name: nighthawk_memory_bytes
+    query: round((((windows_cs_physical_memory_bytes - windows_memory_available_bytes)) / 1024 / 1024 / 1024), 0.01)
+    suffix: "GiB"
+    colors:
+      - { color: "green", min: 0, max: 35 }
+      - { color: "orange", min: 36, max: 75 }
+      - { color: "red", min: 76, max: 1000 }
+
+  - name: nighthawk_nvidia_temperature
+    query: round(nvidia_smi_temperature_gpu{instance=~"nighthawk.*"}, 0.01)
+    suffix: "°C"
+    colors:
+      - { color: "green", min: 0, max: 60 }
+      - { color: "orange", min: 61, max: 70 }
+      - { color: "red", min: 71, max: 99999 }
+
+  - name: nighthawk_nvidia_power_draw_watts
+    query: round(nvidia_smi_power_draw_watts{instance=~"nighthawk.*"}, 0.01)
+    suffix: "W"
+    colors:
+      - { color: "green", min: 0, max: 50 }
+      - { color: "orange", min: 51, max: 80 }
+      - { color: "red", min: 81, max: 99999 }
+
+  - name: nighthawk_nvidia_power_draw_percent
+    query: round(nvidia_smi_power_draw_watts{instance=~"nighthawk.*"} / nvidia_smi_enforced_power_limit_watts{instance=~"nighthawk.*"} * 100, 0.01)
+    suffix: "%"
+    colors:
+      - { color: "green", min: 0, max: 39 }
+      - { color: "orange", min: 40, max: 69 }
+      - { color: "red", min: 70, max: 99999 }
+
+  - name: nighthawk_nvidia_fan_speed_percent
+    query: round(nvidia_smi_fan_speed_ratio{instance=~"nighthawk.*"} * 100, 0.01)
+    suffix: "%"
+    colors:
+      - { color: "green", min: 0, max: 50 }
+      - { color: "orange", min: 51, max: 80 }
+      - { color: "red", min: 81, max: 99999 }
+
+  - name: nighthawk_nvidia_graphics_clock
+    query: round(nvidia_smi_clocks_current_graphics_clock_hz{instance=~"nighthawk.*"} / 1000000, 0.01)
+    suffix: "MHz"
+    colors:
+      - { color: "green", min: 0, max: 99999 }
+
+  - name: nighthawk_nvidia_graphics_usage
+    query: round(nvidia_smi_utilization_gpu_ratio{instance=~"nighthawk.*"} * 100, 0.01)
+    suffix: "%"
+    colors:
+      - { color: "green", min: 0, max: 99999 }
+
+  - name: nighthawk_nvidia_memory_used
+    query: round(nvidia_smi_memory_used_bytes{instance=~"nighthawk.*"} / 1000000000, 0.01)
+    suffix: "GB"
+    colors:
+      - { color: "red", min: 0, max: 2 }
+      - { color: "orange", min: 2.1, max: 5 }
+      - { color: "green", min: 5.1, max: 99999 }
+
+  - name: cilium_endpoints_unready
+    query: round((sum by(cluster) (avg_over_time(cilium_endpoint_state{endpoint_state!="ready"}[5m]))) / (sum by(cluster) (avg_over_time(cilium_endpoint_state[5m]))) * 100, 1)
+    suffix: "%"
+    colors:
+      - { color: "green", min: 0, max: 9 }
+      - { color: "orange", min: 10, max: 50 }
+      - { color: "red", min: 51, max: 99999 }
+
+  - name: cilium_bpf_map_pressure
+    query: round(sum by(cluster) (cilium_bpf_map_pressure * 100), 0.1)
+    suffix: "%"
+    colors:
+      - { color: "green", min: 0, max: 50 }
+      - { color: "orange", min: 50.1, max: 69.9 }
+      - { color: "red", min: 70, max: 99999 }
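For context on the precision change above: PromQL's round() takes a second to_nearest argument and snaps each sample to the nearest multiple of that value, so moving from 0.1 to 0.01 simply adds one more decimal place to the rendered badge value. A minimal illustration using PromQL's vector() literal; the numbers are made up for the example and are not from this repository:

# round() snaps each sample to the nearest multiple of its second argument
round(vector(42.857), 0.1)    # 42.9
round(vector(42.857), 0.01)   # 42.86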
@@ -32,9 +32,9 @@ spec:
     env: &env
       TZ: "${CONFIG_TZ}"
       PROMETHEUS_URL: http://vmsingle-victoria.monitoring.svc.cluster.local.:8429
-      SERVER_HOST: "::"
+      SERVER_HOST: &host "[::]"
       SERVER_PORT: &http 8080
-      HEALTH_HOST: "::"
+      HEALTH_HOST: *host
       HEALTH_PORT: &health 8888
     securityContext: &sc
       readOnlyRootFilesystem: true
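The &host/*host pair above is plain YAML anchoring: the value is declared once and every alias resolves to it at parse time, so the two host settings cannot drift apart. A minimal standalone sketch using the same values as the change above:

env:
  SERVER_HOST: &host "[::]"   # anchor: declare the value once
  HEALTH_HOST: *host          # alias: resolves to "[::]" when the YAML is parsed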
@@ -0,0 +1,57 @@
+---
+# yaml-language-server: $schema=https://crds.jank.ing/monitoring.coreos.com/prometheusrule_v1.json
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: cilium
+  namespace: monitoring
+spec:
+  groups:
+    - name: cilium.rules
+      rules:
+        - alert: CiliumAgentAbsent
+          annotations:
+            summary: Cilium Agent has disappeared from Prometheus target discovery.
+          expr: |
+            absent(up{job="cilium-agent"})
+          for: 15m
+          labels:
+            severity: critical
+        - alert: CiliumEndpointsStuckNode
+          annotations:
+            summary: >-
+              Node `{{ $labels.node }}` has {{ $value }}% stuck Cilium endpoints, threshold 10%.
+          #expr: | # >10% of endpoints are un-ready # might be noisier without averaging over 5 minutes
+          #  round((sum by(node) (cilium_endpoint_state{endpoint_state!="ready"})) / (sum by(node) (cilium_endpoint_state)) * 100, 1) > 10
+          expr: | # >10% of endpoints are un-ready, smoothed by averaging over 5 minutes
+            round((sum by(node) (avg_over_time(cilium_endpoint_state{endpoint_state!="ready"}[5m]))) / (sum by(node) (avg_over_time(cilium_endpoint_state[5m]))) * 100, 1) > 10
+          for: 10m
+          labels:
+            severity: critical
+        - alert: CiliumEndpointsStuckCluster
+          annotations:
+            summary: >-
+              Cluster `{{ $labels.cluster }}` has {{ $value }}% stuck Cilium endpoints, threshold 10%.
+          expr: | # >10% of endpoints across the cluster are un-ready, smoothed by averaging over 5 minutes
+            round((sum by(cluster) (avg_over_time(cilium_endpoint_state{endpoint_state!="ready"}[5m]))) / (sum by(cluster) (avg_over_time(cilium_endpoint_state[5m]))) * 100, 1) > 10
+          for: 10m
+          labels:
+            severity: critical
+        - alert: HighBPFMapPressureNode
+          annotations:
+            summary: >-
+              Node `{{ $labels.node }}` has high ({{ $value }}%) BPF map pressure for map `{{ $labels.map_name }}`, threshold 60%.
+          expr: |
+            max by (node, map_name) (cilium_bpf_map_pressure * 100) >= 60
+          for: 1m
+          labels:
+            severity: critical
+        - alert: HighBPFMapPressureCluster
+          annotations:
+            summary: >-
+              Cluster `{{ $labels.cluster }}` has high ({{ $value }}%) BPF map pressure for map `{{ $labels.map_name }}`, threshold 60%.
+          expr: |
+            max by (cluster, map_name) (cilium_bpf_map_pressure * 100) >= 60
+          for: 1m
+          labels:
+            severity: critical
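These rules can be exercised offline with promtool's rule unit-test format once the spec.groups section is copied into a plain Prometheus rules file. A minimal sketch for the node-level BPF map pressure alert; the file name cilium-rules.yaml and the series labels are illustrative assumptions, not part of this commit:

# promtool test rules cilium-rules-test.yaml
rule_files:
  - cilium-rules.yaml            # assumed: the groups above, extracted from the PrometheusRule
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # sustained 70% pressure on one map of one node, should fire after the 1m "for" window
      - series: 'cilium_bpf_map_pressure{node="worker-1", map_name="cilium_lxc"}'
        values: '0.7 0.7 0.7 0.7 0.7 0.7'
    alert_rule_test:
      - eval_time: 5m
        alertname: HighBPFMapPressureNode
        exp_alerts:
          - exp_labels:
              severity: critical
              node: worker-1
              map_name: cilium_lxc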