Compare commits

...

1 Commits

Author SHA1 Message Date
flashbo
cb4ef9f85f feat: alert rule support import promethues alert rule (#2080) 2024-08-07 15:36:58 +08:00
5 changed files with 213 additions and 0 deletions

View File

@@ -315,6 +315,8 @@ func (rt *Router) Config(r *gin.Engine) {
pages.GET("/busi-group/:id/alert-rules", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRuleGets)
pages.POST("/busi-group/:id/alert-rules", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.bgrw(), rt.alertRuleAddByFE)
pages.POST("/busi-group/:id/alert-rules/import", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.bgrw(), rt.alertRuleAddByImport)
pages.POST("/busi-group/:id/alert-rules/import-prom-rule", rt.auth(),
rt.user(), rt.perm("/alert-rules/add"), rt.bgrw(), rt.alertRuleAddByImportPromRule)
pages.DELETE("/busi-group/:id/alert-rules", rt.auth(), rt.user(), rt.perm("/alert-rules/del"), rt.bgrw(), rt.alertRuleDel)
pages.PUT("/busi-group/:id/alert-rules/fields", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.bgrw(), rt.alertRulePutFields)
pages.PUT("/busi-group/:id/alert-rule/:arid", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.alertRulePutByFE)

View File

@@ -126,6 +126,25 @@ func (rt *Router) alertRuleAddByImport(c *gin.Context) {
ginx.NewRender(c).Data(reterr, nil)
}
// alertRuleAddByImportPromRule imports alert rules from a Prometheus-style
// rule file sent as the YAML request body.
//
// The body's top-level "groups" list is decoded, converted to alert rules
// with models.DealPromGroup, and stored through rt.alertRuleAdd under the
// busi group taken from the ":id" URL parameter. A body without any group
// is rejected with http.StatusBadRequest.
func (rt *Router) alertRuleAddByImportPromRule(c *gin.Context) {
	operator := c.MustGet("username").(string)

	// Only the "groups" key of the uploaded document is read.
	var payload struct {
		Groups []models.PromRuleGroup `yaml:"groups"`
	}
	ginx.Dangerous(c.BindYAML(&payload))

	if len(payload.Groups) == 0 {
		ginx.Bomb(http.StatusBadRequest, "input yaml is empty")
	}

	rules := models.DealPromGroup(payload.Groups)
	busiGroupID := ginx.UrlParamInt64(c, "id")

	result := rt.alertRuleAdd(rules, operator, busiGroupID, c.GetHeader("X-Language"))
	ginx.NewRender(c).Data(result, nil)
}
func (rt *Router) alertRuleAddByService(c *gin.Context) {
var lst []models.AlertRule
ginx.BindJSON(c, &lst)

View File

@@ -26,6 +26,21 @@ const (
TDENGINE = "tdengine"
)
// Named values for AlertRule fields, used when a rule is built in code
// (see ConvertAlert) instead of being submitted by the front end.
const (
	// Values for AlertRule.Disabled.
	AlertRuleEnabled  = 0
	AlertRuleDisabled = 1

	// Values for AlertRule.EnableInBG.
	AlertRuleEnableInGlobalBG = 0
	AlertRuleEnableInOneBG    = 1

	// Values for AlertRule.NotifyRecovered.
	AlertRuleNotNotifyRecovered = 0
	AlertRuleNotifyRecovered    = 1

	// Default for AlertRule.NotifyRepeatStep (60 minutes, per the name).
	AlertRuleNotifyRepeatStep60Min = 60

	// Default for AlertRule.RecoverDuration (0 seconds, per the name).
	AlertRuleRecoverDuration0Sec = 0
)
type AlertRule struct {
Id int64 `json:"id" gorm:"primaryKey"`
GroupId int64 `json:"group_id"` // busi group id

93
models/prom_alert_rule.go Normal file
View File

@@ -0,0 +1,93 @@
package models
import (
	"fmt"
	"sort"
	"time"

	"github.com/toolkits/pkg/logger"
)
// PromRule is one entry of a Prometheus rule group. An entry is either an
// alerting rule (Alert set) or a recording rule (Record set).
type PromRule struct {
	// Alert is the name of the alerting rule.
	Alert string `yaml:"alert,omitempty" json:"alert,omitempty"`
	// Record is the name of the recording rule.
	Record string `yaml:"record,omitempty" json:"record,omitempty"`
	// Expr is the PromQL expression.
	Expr string `yaml:"expr,omitempty" json:"expr,omitempty"`
	// For is how long the condition must hold before the alert fires.
	For string `yaml:"for,omitempty" json:"for,omitempty"`
	// Annotations holds the rule's annotations.
	Annotations map[string]string `yaml:"annotations,omitempty" json:"annotations,omitempty"`
	// Labels holds the rule's labels.
	Labels map[string]string `yaml:"labels,omitempty" json:"labels,omitempty"`
}
// PromRuleGroup is one element of the "groups" list in a Prometheus rule
// file.
type PromRuleGroup struct {
	// Name is the group name.
	Name string `yaml:"name"`
	// Rules lists the rules that belong to the group.
	Rules []PromRule `yaml:"rules"`
	// Interval is the group's evaluation interval as a duration string;
	// it may be empty.
	Interval string `yaml:"interval,omitempty"`
}
// convertInterval converts a duration string into whole seconds.
//
// Go duration syntax ("30s", "1.5m", "1h30m") is tried first. If that fails,
// the Prometheus units "d", "w" and "y" are accepted as well ("1d", "2w",
// "1d12h"), because rule files may use them and time.ParseDuration rejects
// them. An empty string yields 0 without logging, since an omitted `for`
// clause is not an error. Any other unparsable input is logged and yields 0.
func convertInterval(interval string) int {
	if interval == "" {
		return 0
	}

	duration, err := time.ParseDuration(interval)
	if err != nil {
		duration, err = parsePromDuration(interval)
	}
	if err != nil {
		logger.Errorf("Error parsing interval `%s`,err: %v", interval, err)
		return 0
	}
	return int(duration.Seconds())
}

// parsePromDuration parses a sequence of <integer><unit> pairs where unit is
// one of ms, s, m, h, d (24h), w (7d) or y (365d).
func parsePromDuration(s string) (time.Duration, error) {
	const maxDuration = time.Duration(1<<63 - 1)

	isDigit := func(b byte) bool { return b >= '0' && b <= '9' }

	var total time.Duration
	for rest := s; rest != ""; {
		numEnd := 0
		for numEnd < len(rest) && isDigit(rest[numEnd]) {
			numEnd++
		}
		unitEnd := numEnd
		for unitEnd < len(rest) && !isDigit(rest[unitEnd]) {
			unitEnd++
		}
		if numEnd == 0 || unitEnd == numEnd {
			return 0, fmt.Errorf("invalid duration %q", s)
		}
		number, unit := rest[:numEnd], rest[numEnd:unitEnd]
		rest = rest[unitEnd:]

		// Express the Prometheus-only units as a multiple of hours so the
		// standard library still does the number parsing and range checks.
		scale := time.Duration(1)
		switch unit {
		case "d":
			scale, unit = 24, "h"
		case "w":
			scale, unit = 7*24, "h"
		case "y":
			scale, unit = 365*24, "h"
		}

		part, err := time.ParseDuration(number + unit)
		if err != nil {
			return 0, fmt.Errorf("invalid duration %q: %w", s, err)
		}
		if part > maxDuration/scale || total > maxDuration-part*scale {
			return 0, fmt.Errorf("duration %q overflows", s)
		}
		total += part * scale
	}
	return total, nil
}
// ConvertAlert builds an AlertRule from a single Prometheus alerting rule.
//
// interval is the evaluation interval of the group the rule belongs to, as a
// duration string. The "severity" label selects the rule severity
// ("critical" -> 1, "warning" -> 2, "info" -> 3; anything else keeps the
// default of 2). Every other label becomes a "key=value" append tag. The
// tags are sorted so that converting the same rule always gives the same
// result; map iteration order alone would make it vary between calls.
//
// The remaining fields are fixed defaults: enabled, active all day on every
// weekday, recovery notifications on, a repeat step of 60 and a recover
// duration of 0.
func ConvertAlert(rule PromRule, interval string) AlertRule {
	severity := 2
	appendTags := make([]string, 0, len(rule.Labels))

	for k, v := range rule.Labels {
		if k != "severity" {
			appendTags = append(appendTags, fmt.Sprintf("%s=%s", k, v))
			continue
		}
		switch v {
		case "critical":
			severity = 1
		case "warning":
			severity = 2
		case "info":
			severity = 3
		}
	}
	// Ranging over a map yields keys in random order; sort for stable output.
	sort.Strings(appendTags)

	return AlertRule{
		Name:             rule.Alert,
		Severity:         severity,
		Disabled:         AlertRuleEnabled,
		PromForDuration:  convertInterval(rule.For),
		PromQl:           rule.Expr,
		PromEvalInterval: convertInterval(interval),
		EnableStimeJSON:  "00:00",
		EnableEtimeJSON:  "23:59",
		EnableDaysOfWeekJSON: []string{
			"1", "2", "3", "4", "5", "6", "0",
		},
		EnableInBG:       AlertRuleEnableInGlobalBG,
		NotifyRecovered:  AlertRuleNotifyRecovered,
		NotifyRepeatStep: AlertRuleNotifyRepeatStep60Min,
		RecoverDuration:  AlertRuleRecoverDuration0Sec,
		AnnotationsJSON:  rule.Annotations,
		AppendTagsJSON:   appendTags,
	}
}
// DealPromGroup flattens Prometheus rule groups into a list of AlertRule.
//
// Each rule is converted with its group's interval, falling back to "15s"
// when the group does not set one. Entries with an empty Alert name (for
// example recording rules) are skipped. The result is nil when nothing was
// converted.
func DealPromGroup(promRule []PromRuleGroup) []AlertRule {
	var converted []AlertRule

	for _, grp := range promRule {
		evalEvery := grp.Interval
		if evalEvery == "" {
			evalEvery = "15s"
		}

		for _, r := range grp.Rules {
			if r.Alert == "" {
				continue
			}
			converted = append(converted, ConvertAlert(r, evalEvery))
		}
	}

	return converted
}

View File

@@ -0,0 +1,84 @@
package models_test
import (
"testing"
"github.com/ccfos/nightingale/v6/models"
"gopkg.in/yaml.v2"
)
// TestConvertAlert checks that ConvertAlert maps the `for` clause, the group
// interval and the "severity" label of a Prometheus alerting rule onto the
// matching AlertRule fields.
func TestConvertAlert(t *testing.T) {
	cases := []struct {
		name             string
		doc              string // YAML list holding exactly one rule
		interval         string
		wantEvalInterval int
		wantForDuration  int
		wantSeverity     int
	}{
		{
			name: "jobMissing",
			doc: `
- alert: PrometheusJobMissing
  expr: absent(up{job="prometheus"})
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: Prometheus job missing (instance {{ $labels.instance }})
    description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
`,
			interval:         "30s",
			wantEvalInterval: 30,
			wantForDuration:  60,
			wantSeverity:     2,
		},
		{
			name: "ruleEvaluationSlow",
			doc: `
- alert: PrometheusRuleEvaluationSlow
  expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
  for: 180s
  labels:
    severity: info
  annotations:
    summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
    description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
`,
			interval:         "1m",
			wantEvalInterval: 60,
			wantForDuration:  180,
			wantSeverity:     3,
		},
		{
			name: "targetMissing",
			doc: `
- alert: PrometheusTargetMissing
  expr: up == 0
  for: 1.5m
  labels:
    severity: critical
  annotations:
    summary: Prometheus target missing (instance {{ $labels.instance }})
    description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
`,
			interval:         "1h",
			wantEvalInterval: 3600,
			wantForDuration:  90,
			wantSeverity:     1,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			var rules []models.PromRule
			// Stop on a decode failure: indexing an empty slice below would panic.
			if err := yaml.Unmarshal([]byte(tc.doc), &rules); err != nil {
				t.Fatalf("Failed to Unmarshal, err: %s", err)
			}
			if len(rules) != 1 {
				t.Fatalf("expected exactly 1 rule, but got %d", len(rules))
			}
			t.Logf("%s: %+v", tc.name, rules[0])

			got := models.ConvertAlert(rules[0], tc.interval)
			if got.PromEvalInterval != tc.wantEvalInterval {
				t.Errorf("PromEvalInterval is expected to be %d, but got %d",
					tc.wantEvalInterval, got.PromEvalInterval)
			}
			if got.PromForDuration != tc.wantForDuration {
				t.Errorf("PromForDuration is expected to be %d, but got %d",
					tc.wantForDuration, got.PromForDuration)
			}
			if got.Severity != tc.wantSeverity {
				t.Errorf("Severity is expected to be %d, but got %d",
					tc.wantSeverity, got.Severity)
			}
		})
	}
}