mirror of
https://github.com/outbackdingo/cozystack.git
synced 2026-01-27 18:18:41 +00:00
add kubevirt metrics and dashboards (#573)
<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **New Features**
  - Added PrometheusRule configuration to monitor virtual machine (VM) and virtual machine instance (VMI) states.
  - Introduced ServiceMonitor resource for KubeVirt metrics monitoring.
  - Added `monitorNamespace` configuration in KubeVirt custom resource.
- **Monitoring Enhancements**
  - Implemented alerts for VMs and VMIs not running for more than 10 minutes.
  - Configured metrics endpoint with HTTPS support.
- **Version Updates**
  - Updated version mappings for several packages, reflecting new commit hashes.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

Co-authored-by: Andrei Kvapil <kvapss@gmail.com>
This commit is contained in:
5196
dashboards/kubevirt/kubevirt-control-plane.json
Normal file
5196
dashboards/kubevirt/kubevirt-control-plane.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -31,3 +31,4 @@ control-plane/control-plane-status
|
||||
control-plane/deprecated-resources
|
||||
control-plane/dns-coredns
|
||||
control-plane/kube-etcd3
|
||||
kubevirt/kubevirt-control-plane
|
||||
|
||||
43
packages/system/kubevirt-operator/alerts/PrometheusRule.yaml
Normal file
43
packages/system/kubevirt-operator/alerts/PrometheusRule.yaml
Normal file
@@ -0,0 +1,43 @@
|
||||
# Alerting rules for KubeVirt virtual machines (VM) and virtual machine
# instances (VMI) that stay out of the running state.
#
# The "{{ $labels.* }}" placeholders are Prometheus alert templates; they are
# expanded by Prometheus when an alert fires.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: vm-not-running-alert
  namespace: monitoring
spec:
  groups:
    - name: kubevirt-alerts
      rules:
        # Fires only after a VM has continuously reported a non-running status
        # for 10 minutes. The `for` clause enforces the duration; a
        # max_over_time() window would instead fire on the first non-running
        # sample and keep firing for 10 minutes after recovery.
        - alert: VMNotRunningFor10Minutes
          # The status match is case-insensitive so that both "Running" and
          # "running" label values are excluded.
          # NOTE(review): confirm the casing emitted by the deployed KubeVirt.
          expr: |
            kubevirt_vm_info{
              status!~"(?i)running",
              exported_namespace=~".+",
              name=~".+"
            } == 1
          for: 10m
          labels:
            severity: critical
            # Re-export the VM's own namespace as the alert namespace.
            namespace: "{{ $labels.exported_namespace }}"
          annotations:
            summary: "VM '{{ $labels.name }}' in namespace '{{ $labels.exported_namespace }}' is not Running for more than 10 minutes"
            description: >
              The VM '{{ $labels.name }}' in namespace '{{ $labels.exported_namespace }}' has been in a non-Running state
              for more than 10 minutes. Please investigate the issue.
        # Same check for VMIs, keyed on the `phase` label.
        - alert: VMINotRunningFor10Minutes
          expr: |
            kubevirt_vmi_info{
              phase!~"(?i)running",
              exported_namespace=~".+",
              name=~".+"
            } == 1
          for: 10m
          labels:
            severity: critical
            namespace: "{{ $labels.exported_namespace }}"
          annotations:
            summary: "VMI '{{ $labels.name }}' in namespace '{{ $labels.exported_namespace }}' is not Running for more than 10 minutes"
            description: >
              The VMI '{{ $labels.name }}' in namespace '{{ $labels.exported_namespace }}' has been in a non-Running state
              for more than 10 minutes. Please investigate the issue.
|
||||
@@ -0,0 +1,13 @@
|
||||
# ServiceMonitor for KubeVirt: selects objects labelled
# prometheus.kubevirt.io=true and scrapes their "metrics" port over HTTPS.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kubevirt
spec:
  endpoints:
    - scheme: https
      port: metrics
      tlsConfig:
        # Certificate verification is turned off for this endpoint.
        insecureSkipVerify: true
  selector:
    matchLabels:
      # Quoted so the label value stays a string rather than a boolean.
      prometheus.kubevirt.io: 'true'
|
||||
7
packages/system/kubevirt-operator/templates/alerts.yaml
Normal file
7
packages/system/kubevirt-operator/templates/alerts.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
{{- /* Render each file matching alerts/*.yaml as its own YAML document, prefixed with its source path. */ -}}
{{- range $name, $content := .Files.Glob "alerts/*.yaml" }}
---
# from: {{ $name }}
{{ $content | toString }}
{{- end -}}
|
||||
@@ -13,4 +13,5 @@ spec:
|
||||
- ExpandDisks
|
||||
customizeComponents: {}
|
||||
imagePullPolicy: IfNotPresent
|
||||
monitorNamespace: tenant-root
|
||||
workloadUpdateStrategy: {}
|
||||
|
||||
Reference in New Issue
Block a user