Compare commits

...

1 Commits

Author SHA1 Message Date
Xu Bin
deb11752e5 refactor: webhook send event (#2239) 2024-10-28 11:14:09 +08:00
7 changed files with 315 additions and 31 deletions

View File

@@ -3,6 +3,7 @@ package eval
import (
"context"
"encoding/json"
"errors"
"fmt"
"math"
"reflect"
@@ -117,21 +118,29 @@ func (arw *AlertRuleWorker) Eval() {
arw.processor.Stats.CounterRuleEval.WithLabelValues().Inc()
typ := cachedRule.GetRuleType()
var anomalyPoints []common.AnomalyPoint
var recoverPoints []common.AnomalyPoint
var (
anomalyPoints []common.AnomalyPoint
recoverPoints []common.AnomalyPoint
err error
)
switch typ {
case models.PROMETHEUS:
anomalyPoints = arw.GetPromAnomalyPoint(cachedRule.RuleConfig)
anomalyPoints, err = arw.GetPromAnomalyPoint(cachedRule.RuleConfig)
case models.HOST:
anomalyPoints = arw.GetHostAnomalyPoint(cachedRule.RuleConfig)
anomalyPoints, err = arw.GetHostAnomalyPoint(cachedRule.RuleConfig)
case models.TDENGINE:
anomalyPoints, recoverPoints = arw.GetTdengineAnomalyPoint(cachedRule, arw.processor.DatasourceId())
anomalyPoints, recoverPoints, err = arw.GetTdengineAnomalyPoint(cachedRule, arw.processor.DatasourceId())
case models.LOKI:
anomalyPoints = arw.GetPromAnomalyPoint(cachedRule.RuleConfig)
anomalyPoints, err = arw.GetPromAnomalyPoint(cachedRule.RuleConfig)
default:
return
}
if err != nil {
logger.Errorf("rule_eval:%s get anomaly point err:%s", arw.Key(), err.Error())
return
}
if arw.processor == nil {
logger.Warningf("rule_eval:%s processor is nil", arw.Key())
return
@@ -179,7 +188,7 @@ func (arw *AlertRuleWorker) Stop() {
close(arw.quit)
}
func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) []common.AnomalyPoint {
func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) ([]common.AnomalyPoint, error) {
var lst []common.AnomalyPoint
var severity int
@@ -187,13 +196,13 @@ func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) []common.Anom
if err := json.Unmarshal([]byte(ruleConfig), &rule); err != nil {
logger.Errorf("rule_eval:%s rule_config:%s, error:%v", arw.Key(), ruleConfig, err)
arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), GET_RULE_CONFIG).Inc()
return lst
return lst, err
}
if rule == nil {
logger.Errorf("rule_eval:%s rule_config:%s, error:rule is nil", arw.Key(), ruleConfig)
arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), GET_RULE_CONFIG).Inc()
return lst
return lst, errors.New("rule is nil")
}
arw.inhibit = rule.Inhibit
@@ -224,7 +233,7 @@ func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) []common.Anom
logger.Errorf("rule_eval:%s promql:%s, error:%v", arw.Key(), promql, err)
arw.processor.Stats.CounterQueryDataErrorTotal.WithLabelValues(fmt.Sprintf("%d", arw.datasourceId)).Inc()
arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), QUERY_DATA).Inc()
continue
return lst, err
}
if len(warnings) > 0 {
@@ -241,10 +250,10 @@ func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) []common.Anom
}
lst = append(lst, points...)
}
return lst
return lst, nil
}
func (arw *AlertRuleWorker) GetTdengineAnomalyPoint(rule *models.AlertRule, dsId int64) ([]common.AnomalyPoint, []common.AnomalyPoint) {
func (arw *AlertRuleWorker) GetTdengineAnomalyPoint(rule *models.AlertRule, dsId int64) ([]common.AnomalyPoint, []common.AnomalyPoint, error) {
// 获取查询和规则判断条件
points := []common.AnomalyPoint{}
recoverPoints := []common.AnomalyPoint{}
@@ -252,7 +261,7 @@ func (arw *AlertRuleWorker) GetTdengineAnomalyPoint(rule *models.AlertRule, dsId
if ruleConfig == "" {
logger.Warningf("rule_eval:%d promql is blank", rule.Id)
arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), GET_RULE_CONFIG).Inc()
return points, recoverPoints
return points, recoverPoints, errors.New("rule config is nil")
}
var ruleQuery models.RuleQuery
@@ -261,7 +270,7 @@ func (arw *AlertRuleWorker) GetTdengineAnomalyPoint(rule *models.AlertRule, dsId
logger.Warningf("rule_eval:%d promql parse error:%s", rule.Id, err.Error())
arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId())).Inc()
arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), GET_RULE_CONFIG).Inc()
return points, recoverPoints
return points, recoverPoints, err
}
arw.inhibit = ruleQuery.Inhibit
@@ -288,7 +297,7 @@ func (arw *AlertRuleWorker) GetTdengineAnomalyPoint(rule *models.AlertRule, dsId
logger.Warningf("rule_eval rid:%d query data error: %v", rule.Id, err)
arw.processor.Stats.CounterQueryDataErrorTotal.WithLabelValues(fmt.Sprintf("%d", arw.datasourceId)).Inc()
arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), QUERY_DATA).Inc()
continue
return points, recoverPoints, err
}
// 此条日志很重要,是告警判断的现场值
logger.Debugf("rule_eval rid:%d req:%+v resp:%+v", rule.Id, query, series)
@@ -305,10 +314,10 @@ func (arw *AlertRuleWorker) GetTdengineAnomalyPoint(rule *models.AlertRule, dsId
points, recoverPoints = GetAnomalyPoint(rule.Id, ruleQuery, seriesTagIndexes, seriesStore)
}
return points, recoverPoints
return points, recoverPoints, nil
}
func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) []common.AnomalyPoint {
func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) ([]common.AnomalyPoint, error) {
var lst []common.AnomalyPoint
var severity int
@@ -316,13 +325,13 @@ func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) []common.Anom
if err := json.Unmarshal([]byte(ruleConfig), &rule); err != nil {
logger.Errorf("rule_eval:%s rule_config:%s, error:%v", arw.Key(), ruleConfig, err)
arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), GET_RULE_CONFIG).Inc()
return lst
return lst, err
}
if rule == nil {
logger.Errorf("rule_eval:%s rule_config:%s, error:rule is nil", arw.Key(), ruleConfig)
arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), GET_RULE_CONFIG).Inc()
return lst
return lst, errors.New("rule is nil")
}
arw.inhibit = rule.Inhibit
@@ -449,7 +458,7 @@ func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) []common.Anom
}
}
}
return lst
return lst, nil
}
func GetAnomalyPoint(ruleId int64, ruleQuery models.RuleQuery, seriesTagIndexes map[string]map[uint64][]uint64, seriesStore map[uint64]models.DataResp) ([]common.AnomalyPoint, []common.AnomalyPoint) {

View File

@@ -195,8 +195,8 @@ func PushCallbackEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.
if queue == nil {
queue = &WebhookQueue{
list: NewSafeListLimited(QueueMaxSize),
closeCh: make(chan struct{}),
eventQueue: NewSafeEventQueue(QueueMaxSize),
closeCh: make(chan struct{}),
}
CallbackEventQueueLock.Lock()
@@ -206,8 +206,8 @@ func PushCallbackEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.
StartConsumer(ctx, queue, webhook.Batch, webhook, stats)
}
succ := queue.list.PushFront(event)
succ := queue.eventQueue.Push(event)
if !succ {
logger.Warningf("Write channel(%s) full, current channel size: %d event:%v", webhook.Url, queue.list.Len(), event)
logger.Warningf("Write channel(%s) full, current channel size: %d event:%v", webhook.Url, queue.eventQueue.Len(), event)
}
}

View File

@@ -121,8 +121,8 @@ var EventQueueLock sync.RWMutex
const QueueMaxSize = 100000
type WebhookQueue struct {
list *SafeListLimited
closeCh chan struct{}
eventQueue *SafeEventQueue
closeCh chan struct{}
}
func PushEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) {
@@ -132,8 +132,8 @@ func PushEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.AlertCur
if queue == nil {
queue = &WebhookQueue{
list: NewSafeListLimited(QueueMaxSize),
closeCh: make(chan struct{}),
eventQueue: NewSafeEventQueue(QueueMaxSize),
closeCh: make(chan struct{}),
}
EventQueueLock.Lock()
@@ -143,10 +143,10 @@ func PushEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.AlertCur
StartConsumer(ctx, queue, webhook.Batch, webhook, stats)
}
succ := queue.list.PushFront(event)
succ := queue.eventQueue.Push(event)
if !succ {
stats.AlertNotifyErrorTotal.WithLabelValues("push_event_queue").Inc()
logger.Warningf("Write channel(%s) full, current channel size: %d event:%v", webhook.Url, queue.list.Len(), event)
logger.Warningf("Write channel(%s) full, current channel size: %d event:%v", webhook.Url, queue.eventQueue.Len(), event)
}
}
@@ -157,7 +157,7 @@ func StartConsumer(ctx *ctx.Context, queue *WebhookQueue, popSize int, webhook *
logger.Infof("event queue:%v closed", queue)
return
default:
events := queue.list.PopBack(popSize)
events := queue.eventQueue.PopN(popSize)
if len(events) == 0 {
time.Sleep(time.Millisecond * 400)
continue

View File

@@ -0,0 +1,113 @@
package sender
import (
"container/list"
"sync"
"github.com/ccfos/nightingale/v6/models"
)
type SafeEventQueue struct {
lock sync.RWMutex
maxSize int
queueHigh *list.List
queueMiddle *list.List
queueLow *list.List
}
const (
High = 1
Middle = 2
Low = 3
)
func NewSafeEventQueue(maxSize int) *SafeEventQueue {
return &SafeEventQueue{
maxSize: maxSize,
lock: sync.RWMutex{},
queueHigh: list.New(),
queueMiddle: list.New(),
queueLow: list.New(),
}
}
func (spq *SafeEventQueue) Len() int {
spq.lock.RLock()
defer spq.lock.RUnlock()
return spq.queueHigh.Len() + spq.queueMiddle.Len() + spq.queueLow.Len()
}
// len 无锁读取长度,不要在本文件外调用
func (spq *SafeEventQueue) len() int {
return spq.queueHigh.Len() + spq.queueMiddle.Len() + spq.queueLow.Len()
}
func (spq *SafeEventQueue) Push(event *models.AlertCurEvent) bool {
spq.lock.Lock()
defer spq.lock.Unlock()
switch event.Severity {
case High:
spq.queueHigh.PushBack(event)
case Middle:
spq.queueMiddle.PushBack(event)
case Low:
spq.queueLow.PushBack(event)
default:
return false
}
for spq.len() > spq.maxSize {
if spq.queueLow.Len() > 0 {
spq.queueLow.Remove(spq.queueLow.Front())
} else if spq.queueMiddle.Len() > 0 {
spq.queueMiddle.Remove(spq.queueMiddle.Front())
} else {
spq.queueHigh.Remove(spq.queueHigh.Front())
}
}
return true
}
// pop 无锁弹出事件,不要在本文件外调用
func (spq *SafeEventQueue) pop() *models.AlertCurEvent {
if spq.len() == 0 {
return nil
}
var elem interface{}
if spq.queueHigh.Len() > 0 {
elem = spq.queueHigh.Remove(spq.queueHigh.Front())
} else if spq.queueMiddle.Len() > 0 {
elem = spq.queueMiddle.Remove(spq.queueMiddle.Front())
} else {
elem = spq.queueLow.Remove(spq.queueLow.Front())
}
event, ok := elem.(*models.AlertCurEvent)
if !ok {
return nil
}
return event
}
func (spq *SafeEventQueue) Pop() *models.AlertCurEvent {
spq.lock.Lock()
defer spq.lock.Unlock()
return spq.pop()
}
func (spq *SafeEventQueue) PopN(n int) []*models.AlertCurEvent {
spq.lock.Lock()
defer spq.lock.Unlock()
events := make([]*models.AlertCurEvent, 0, n)
count := 0
for count < n && spq.len() > 0 {
event := spq.pop()
if event != nil {
events = append(events, event)
}
count++
}
return events
}

View File

@@ -0,0 +1,157 @@
package sender
import (
"sync"
"testing"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/stretchr/testify/assert"
)
func TestSafePriorityQueue_ConcurrentPushPop(t *testing.T) {
spq := NewSafeEventQueue(100000)
var wg sync.WaitGroup
numGoroutines := 100
numEvents := 1000
// 并发 Push
wg.Add(numGoroutines)
for i := 0; i < numGoroutines; i++ {
go func(goroutineID int) {
defer wg.Done()
for j := 0; j < numEvents; j++ {
event := &models.AlertCurEvent{
Severity: goroutineID%3 + 1,
TriggerTime: time.Now().UnixNano(),
}
spq.Push(event)
}
}(i)
}
wg.Wait()
// 检查队列长度是否正确
expectedLen := numGoroutines * numEvents
assert.Equal(t, expectedLen, spq.Len(), "Queue length mismatch after concurrent pushes")
// 并发 Pop
wg.Add(numGoroutines)
for i := 0; i < numGoroutines; i++ {
go func() {
defer wg.Done()
for {
event := spq.Pop()
if event == nil {
return
}
}
}()
}
wg.Wait()
// 最终队列应该为空
assert.Equal(t, 0, spq.Len(), "Queue should be empty after concurrent pops")
}
func TestSafePriorityQueue_ConcurrentPopMax(t *testing.T) {
spq := NewSafeEventQueue(100000)
// 添加初始数据
for i := 0; i < 1000; i++ {
spq.Push(&models.AlertCurEvent{
Severity: i%3 + 1,
TriggerTime: time.Now().UnixNano(),
})
}
var wg sync.WaitGroup
numGoroutines := 10
popMax := 100
// 并发 PopN
wg.Add(numGoroutines)
for i := 0; i < numGoroutines; i++ {
go func() {
defer wg.Done()
events := spq.PopN(popMax)
assert.LessOrEqual(t, len(events), popMax, "PopN exceeded maximum")
}()
}
wg.Wait()
// 检查队列长度是否正确
expectedRemaining := 1000 - (numGoroutines * popMax)
if expectedRemaining < 0 {
expectedRemaining = 0
}
assert.Equal(t, expectedRemaining, spq.Len(), "Queue length mismatch after concurrent PopN")
}
func TestSafePriorityQueue_ConcurrentPushPopWithDifferentSeverities(t *testing.T) {
spq := NewSafeEventQueue(100000)
var wg sync.WaitGroup
numGoroutines := 50
numEvents := 500
// 并发 Push 不同优先级的事件
wg.Add(numGoroutines)
for i := 0; i < numGoroutines; i++ {
go func(goroutineID int) {
defer wg.Done()
for j := 0; j < numEvents; j++ {
event := &models.AlertCurEvent{
Severity: goroutineID%3 + 1, // 模拟不同的 Severity
TriggerTime: time.Now().UnixNano(),
}
spq.Push(event)
}
}(i)
}
wg.Wait()
// 检查队列长度是否正确
expectedLen := numGoroutines * numEvents
assert.Equal(t, expectedLen, spq.Len(), "Queue length mismatch after concurrent pushes")
// 检查事件的顺序是否按照优先级排列
var lastEvent *models.AlertCurEvent
for spq.Len() > 0 {
event := spq.Pop()
if lastEvent != nil {
assert.LessOrEqual(t, lastEvent.Severity, event.Severity, "Events are not in correct priority order")
}
lastEvent = event
}
}
func TestSafePriorityQueue_ExceedMaxSize(t *testing.T) {
spq := NewSafeEventQueue(5)
// 插入超过最大容量的事件
for i := 0; i < 10; i++ {
spq.Push(&models.AlertCurEvent{
Severity: i % 3,
TriggerTime: int64(i),
})
}
// 验证队列的长度是否不超过 maxSize
assert.LessOrEqual(t, spq.Len(), spq.maxSize)
// 验证队列中剩余事件的内容
expectedEvents := 5
if spq.Len() < 5 {
expectedEvents = spq.Len()
}
// 检查最后存入的事件是否是按优先级排序
for i := 0; i < expectedEvents; i++ {
event := spq.Pop()
if event != nil {
assert.LessOrEqual(t, event.Severity, 2)
}
}
}

3
go.mod
View File

@@ -32,6 +32,7 @@ require (
github.com/rakyll/statik v0.1.7
github.com/redis/go-redis/v9 v9.0.2
github.com/spaolacci/murmur3 v1.1.0
github.com/stretchr/testify v1.8.4
github.com/tidwall/gjson v1.14.0
github.com/toolkits/pkg v1.3.8
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1
@@ -44,6 +45,8 @@ require (
gorm.io/gorm v1.25.7-0.20240204074919-46816ad31dde
)
require github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
require (
github.com/Azure/go-ntlmssp v0.0.0-20220621081337-cb9428e4ac1e // indirect
github.com/beorn7/perks v1.0.1 // indirect

2
go.sum
View File

@@ -234,6 +234,7 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pquerna/cachecontrol v0.1.0 h1:yJMy84ti9h/+OEWa752kBTKv4XC30OtVVHYv/8cTqKc=
github.com/pquerna/cachecontrol v0.1.0/go.mod h1:NrUG3Z7Rdu85UNR3vm7SOsl1nFIeSiQnrHV5K9mBcUI=
github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g=
@@ -288,6 +289,7 @@ github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/tidwall/gjson v1.14.0 h1:6aeJ0bzojgWLa82gDQHcx3S0Lr/O51I9bJ5nv6JFx5w=
github.com/tidwall/gjson v1.14.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=