diff --git a/kube/deploy/apps/kromgo/app/config/config.yaml b/kube/deploy/apps/kromgo/app/config/config.yaml
index 2b1946f8..84baaf6b 100644
--- a/kube/deploy/apps/kromgo/app/config/config.yaml
+++ b/kube/deploy/apps/kromgo/app/config/config.yaml
@@ -2,7 +2,7 @@
 # yaml-language-server: $schema=https://raw.githubusercontent.com/kashalls/kromgo/main/config.schema.json
 metrics:
   - name: cluster_cpu_usage
-    query: round((sum by(cluster) (irate(node_cpu_seconds_total{kubernetes_node!="",mode!="idle"}[1m])) / on(cluster) group_left sum by (cluster)((irate(node_cpu_seconds_total{kubernetes_node!=""}[1m])))) * 100, 0.1)
+    query: round((sum by(cluster) (irate(node_cpu_seconds_total{kubernetes_node!="",mode!="idle"}[1m])) / on(cluster) group_left sum by (cluster)((irate(node_cpu_seconds_total{kubernetes_node!=""}[1m])))) * 100, 0.01)
     suffix: "%"
     colors:
       - { color: "green", min: 0, max: 35 }
@@ -10,7 +10,7 @@ metrics:
       - { color: "red", min: 76, max: 1000 }
 
   - name: cluster_memory_usage
-    query: round(sum(node_memory_MemTotal_bytes{kubernetes_node!=""} - node_memory_MemAvailable_bytes{kubernetes_node!=""}) / sum(node_memory_MemTotal_bytes{kubernetes_node!=""}) * 100, 0.1)
+    query: round(sum(node_memory_MemTotal_bytes{kubernetes_node!=""} - node_memory_MemAvailable_bytes{kubernetes_node!=""}) / sum(node_memory_MemTotal_bytes{kubernetes_node!=""}) * 100, 0.01)
     suffix: "%"
     colors:
       - { color: "green", min: 0, max: 35 }
@@ -25,7 +25,7 @@ metrics:
       - { color: "red", min: 0, max: 1 }
 
   - name: cluster_network_transmit_usage
-    query: round(sum by(cluster) (irate(node_network_transmit_bytes_total{kubernetes_node!="",device!~".+\\..+|lo|lxc.+",job="node-exporter"}[1m])*8/1024/1024),0.1)
+    query: round(sum by(cluster) (irate(node_network_transmit_bytes_total{kubernetes_node!="",device!~".+\\..+|lo|lxc.+",job="node-exporter"}[1m])*8/1024/1024),0.01)
     suffix: "Mb/s"
     colors:
       - { color: "red", min: 0, max: 199 }
@@ -33,7 +33,7 @@ metrics:
       - { color: "green", min: 1000, max: 10000 }
 
   - name: cluster_network_receive_usage
-    query: round(sum by(cluster) (irate(node_network_receive_bytes_total{kubernetes_node!="",device!~".+\\..+|lo|lxc.+",job="node-exporter"}[1m])*8/1024/1024),0.1)
+    query: round(sum by(cluster) (irate(node_network_receive_bytes_total{kubernetes_node!="",device!~".+\\..+|lo|lxc.+",job="node-exporter"}[1m])*8/1024/1024),0.01)
     suffix: "Mb/s"
     colors:
       - { color: "red", min: 0, max: 199 }
@@ -55,7 +55,7 @@ metrics:
       - { color: "green", min: 360, max: 99999 }
 
   - name: cluster_uptime_days
-    query: round(avg((node_time_seconds - node_boot_time_seconds) / 3600 / 24), 0.1)
+    query: round(avg((node_time_seconds - node_boot_time_seconds) / 3600 / 24), 0.01)
     suffix: " days"
     colors:
       - { color: "red", min: 0, max: 6 }
@@ -78,7 +78,7 @@ metrics:
       - { color: "red", min: 5, max: 9999 }
 
   - name: blackhawk_battery_percent
-    query: round(node_power_supply_capacity{instance="blackhawk:9100"}, 0.1)
+    query: round(node_power_supply_capacity{instance=~"blackhawk.*"}, 0.01)
     suffix: "%"
     colors:
       - { color: "red", min: 0, max: 19 }
@@ -86,7 +86,7 @@ metrics:
       - { color: "green", min: 51, max: 99999 }
 
   - name: blackhawk_battery_cycles
-    query: round(node_power_supply_cyclecount{instance="blackhawk:9100"}, 0.1)
+    query: round(node_power_supply_cyclecount{instance=~"blackhawk.*"}, 0.01)
     suffix: " cycles"
     colors:
       - { color: "red", min: 0, max: 199 }
@@ -94,9 +94,103 @@ metrics:
       - { color: "green", min: 400, max: 99999 }
 
   - name: blackhawk_battery_health
-    query: round((node_power_supply_energy_full{instance="blackhawk:9100"} / node_power_supply_energy_full_design{instance="blackhawk:9100"} * 100), 0.1)
+    query: round((node_power_supply_energy_full{instance=~"blackhawk.*"} / node_power_supply_energy_full_design{instance=~"blackhawk.*"} * 100), 0.01)
     suffix: "%"
     colors:
       - { color: "red", min: 0, max: 79 }
       - { color: "orange", min: 80, max: 89 }
       - { color: "green", min: 90, max: 99999 }
+
+  - name: nighthawk_cpu_usage
+    #query: round(sum by(instance) (100 - avg(irate(windows_cpu_time_total{instance=~"nighthawk.*",mode="idle"}[1m]))*100), 0.01)
+    #query: round(sum by(instance) (irate(windows_cpu_time_total{mode!~"idle|privileged"}[1m]))*10, 0.01)
+    query: round((sum by(instance) (irate(windows_cpu_processor_utility_total{instance=~"nighthawk.*"}[1m])) / 10000000), 0.01)
+    suffix: "%"
+    colors:
+      - { color: "green", min: 0, max: 35 }
+      - { color: "orange", min: 36, max: 75 }
+      - { color: "red", min: 76, max: 1000 }
+
+  - name: nighthawk_memory_usage
+    query: round(((windows_cs_physical_memory_bytes - windows_memory_available_bytes) / windows_cs_physical_memory_bytes) * 100, 0.01)
+    suffix: "%"
+    colors:
+      - { color: "green", min: 0, max: 35 }
+      - { color: "orange", min: 36, max: 75 }
+      - { color: "red", min: 76, max: 1000 }
+
+  - name: nighthawk_memory_bytes
+    query: round((windows_cs_physical_memory_bytes - windows_memory_available_bytes) / 1024 / 1024 / 1024, 0.01)
+    suffix: "GiB"
+    colors:
+      - { color: "green", min: 0, max: 35 }
+      - { color: "orange", min: 36, max: 75 }
+      - { color: "red", min: 76, max: 1000 }
+
+  - name: nighthawk_nvidia_temperature
+    query: round(nvidia_smi_temperature_gpu{instance=~"nighthawk.*"}, 0.01)
+    suffix: "°C"
+    colors:
+      - { color: "green", min: 0, max: 60 }
+      - { color: "orange", min: 61, max: 70 }
+      - { color: "red", min: 71, max: 99999 }
+
+  - name: nighthawk_nvidia_power_draw_watts
+    query: round(nvidia_smi_power_draw_watts{instance=~"nighthawk.*"}, 0.01)
+    suffix: "W"
+    colors:
+      - { color: "green", min: 0, max: 50 }
+      - { color: "orange", min: 51, max: 80 }
+      - { color: "red", min: 81, max: 99999 }
+
+  - name: nighthawk_nvidia_power_draw_percent
+    query: round(nvidia_smi_power_draw_watts{instance=~"nighthawk.*"} / nvidia_smi_enforced_power_limit_watts{instance=~"nighthawk.*"} * 100, 0.01)
+    suffix: "%"
+    colors:
+      - { color: "green", min: 0, max: 39 }
+      - { color: "orange", min: 40, max: 69 }
+      - { color: "red", min: 70, max: 99999 }
+
+  - name: nighthawk_nvidia_fan_speed_percent
+    query: round(nvidia_smi_fan_speed_ratio{instance=~"nighthawk.*"} * 100, 0.01)
+    suffix: "%"
+    colors:
+      - { color: "green", min: 0, max: 50 }
+      - { color: "orange", min: 51, max: 80 }
+      - { color: "red", min: 81, max: 99999 }
+
+  - name: nighthawk_nvidia_graphics_clock
+    query: round(nvidia_smi_clocks_current_graphics_clock_hz{instance=~"nighthawk.*"} / 1000000, 0.01)
+    suffix: "MHz"
+    colors:
+      - { color: "green", min: 0, max: 99999 }
+
+  - name: nighthawk_nvidia_graphics_usage
+    query: round(nvidia_smi_utilization_gpu_ratio{instance=~"nighthawk.*"} * 100, 0.01)
+    suffix: "%"
+    colors:
+      - { color: "green", min: 0, max: 99999 }
+
+  - name: nighthawk_nvidia_memory_used
+    query: round(nvidia_smi_memory_used_bytes{instance=~"nighthawk.*"} / 1000000000, 0.01)
+    suffix: "GB"
+    colors:
+      - { color: "red", min: 0, max: 2 }
+      - { color: "orange", min: 2.1, max: 5 }
+      - { color: "green", min: 5.1, max: 99999 }
+
+  - name: cilium_endpoints_unready
+    query: round((sum by(cluster) (avg_over_time(cilium_endpoint_state{endpoint_state!="ready"}[5m]))) / (sum by(cluster) (avg_over_time(cilium_endpoint_state[5m]))) * 100, 1)
suffix: "%" + colors: + - { color: "green", min: 0, max: 9 } + - { color: "orange", min: 10, max: 50 } + - { color: "red", min: 51, max: 99999 } + + - name: cilium_bpf_map_pressure + query: round(sum by(cluster) (cilium_bpf_map_pressure * 100), 0.1) + suffix: "%" + colors: + - { color: "green", min: 0, max: 50 } + - { color: "orange", min: 50.1, max: 69.9 } + - { color: "red", min: 70, max: 99999 } diff --git a/kube/deploy/apps/kromgo/app/hr.yaml b/kube/deploy/apps/kromgo/app/hr.yaml index 96c921b8..226d7e21 100644 --- a/kube/deploy/apps/kromgo/app/hr.yaml +++ b/kube/deploy/apps/kromgo/app/hr.yaml @@ -32,9 +32,9 @@ spec: env: &env TZ: "${CONFIG_TZ}" PROMETHEUS_URL: http://vmsingle-victoria.monitoring.svc.cluster.local.:8429 - SERVER_HOST: "::" + SERVER_HOST: &host "[::]" SERVER_PORT: &http 8080 - HEALTH_HOST: "::" + HEALTH_HOST: *host HEALTH_PORT: &health 8888 securityContext: &sc readOnlyRootFilesystem: true diff --git a/kube/deploy/core/_networking/cilium/app/prometheusrule-alerts.yaml b/kube/deploy/core/_networking/cilium/app/prometheusrule-alerts.yaml new file mode 100644 index 00000000..6820b776 --- /dev/null +++ b/kube/deploy/core/_networking/cilium/app/prometheusrule-alerts.yaml @@ -0,0 +1,57 @@ +--- +# yaml-language-server: $schema=https://crds.jank.ing/monitoring.coreos.com/prometheusrule_v1.json +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: cilium + namespace: monitoring +spec: + groups: + - name: cilium.rules + rules: + - alert: CiliumAgentAbsent + annotations: + summary: Cilium Agent has disappeared from Prometheus target discovery. + expr: | + absent(up{job="cilium-agent"}) + for: 15m + labels: + severity: critical + - alert: CiliumEndpointsStuckNode + annotations: + summary: >- + Node `{{ $labels.node }}` has {{ $value }}% stuck Cilium endpoints, threshold 10%. + #expr: | # >10% of endpoints are un-ready # might be more noisy without averaging over 5 minutes? + # round((sum by(node) (cilium_endpoint_state{endpoint_state!="ready"})) / (sum by(node) (cilium_endpoint_state)) * 100, 1) > 10 + expr: | # >10% of endpoints are un-ready, calculated after some smoothening via averaging over 5 minutes + round((sum by(node) (avg_over_time(cilium_endpoint_state{endpoint_state!="ready"}[5m]))) / (sum by(node) (avg_over_time(cilium_endpoint_state[5m]))) * 100, 1) > 10 + for: 10m + labels: + severity: critical + - alert: CiliumEndpointsStuckCluster + annotations: + summary: >- + Cluster `{{ $labels.cluster }}` has {{ $value }}% stuck Cilium endpoints, threshold 10%. + expr: | # >10% of endpoints across cluster are un-ready, calculated after some smoothening via averaging over 5 minutes + round((sum by(cluster) (avg_over_time(cilium_endpoint_state{endpoint_state!="ready"}[5m]))) / (sum by(cluster) (avg_over_time(cilium_endpoint_state[5m]))) * 100, 1) > 10 + for: 10m + labels: + severity: critical + - alert: HighBPFMapPressureNode + annotations: + summary: >- + Node `{{ $labels.node }}` has high ({{ $value }}%) BPF map pressure for map `{{ $labels.map_name }}`, threshold 60%. + expr: | + max by (node, map_name) (cilium_bpf_map_pressure * 100) >= 60 + for: 1m + labels: + severity: critical + - alert: HighBPFMapPressureNode + annotations: + summary: >- + Cluster `{{ $labels.cluster }}` has high ({{ $value }}%) BPF map pressure for map `{{ $labels.map_name }}`, threshold 60%. + expr: | + max by (cluster, map_name) (cilium_bpf_map_pressure * 100) >= 60 + for: 1m + labels: + severity: critical