refactor: webhook send event (#2239)

2026-03-02 22:19:10 +00:00 · 2024-10-28 11:14:09 +08:00
7 changed files with 315 additions and 31 deletions
--- a/alert/eval/eval.go
+++ b/alert/eval/eval.go
@@ -3,6 +3,7 @@ package eval
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"math"
 	"reflect"
@@ -117,21 +118,29 @@ func (arw *AlertRuleWorker) Eval() {
 	arw.processor.Stats.CounterRuleEval.WithLabelValues().Inc()

 	typ := cachedRule.GetRuleType()
-	var anomalyPoints []common.AnomalyPoint
-	var recoverPoints []common.AnomalyPoint
+	var (
+		anomalyPoints []common.AnomalyPoint
+		recoverPoints []common.AnomalyPoint
+		err           error
+	)
 	switch typ {
 	case models.PROMETHEUS:
-		anomalyPoints = arw.GetPromAnomalyPoint(cachedRule.RuleConfig)
+		anomalyPoints, err = arw.GetPromAnomalyPoint(cachedRule.RuleConfig)
 	case models.HOST:
-		anomalyPoints = arw.GetHostAnomalyPoint(cachedRule.RuleConfig)
+		anomalyPoints, err = arw.GetHostAnomalyPoint(cachedRule.RuleConfig)
 	case models.TDENGINE:
-		anomalyPoints, recoverPoints = arw.GetTdengineAnomalyPoint(cachedRule, arw.processor.DatasourceId())
+		anomalyPoints, recoverPoints, err = arw.GetTdengineAnomalyPoint(cachedRule, arw.processor.DatasourceId())
 	case models.LOKI:
-		anomalyPoints = arw.GetPromAnomalyPoint(cachedRule.RuleConfig)
+		anomalyPoints, err = arw.GetPromAnomalyPoint(cachedRule.RuleConfig)
 	default:
 		return
 	}

+	if err != nil {
+		logger.Errorf("rule_eval:%s get anomaly point err:%s", arw.Key(), err.Error())
+		return
+	}
+
 	if arw.processor == nil {
 		logger.Warningf("rule_eval:%s processor is nil", arw.Key())
 		return
@@ -179,7 +188,7 @@ func (arw *AlertRuleWorker) Stop() {
 	close(arw.quit)
 }

-func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) []common.AnomalyPoint {
+func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) ([]common.AnomalyPoint, error) {
 	var lst []common.AnomalyPoint
 	var severity int

@@ -187,13 +196,13 @@ func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) []common.Anom
 	if err := json.Unmarshal([]byte(ruleConfig), &rule); err != nil {
 		logger.Errorf("rule_eval:%s rule_config:%s, error:%v", arw.Key(), ruleConfig, err)
 		arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), GET_RULE_CONFIG).Inc()
-		return lst
+		return lst, err
 	}

 	if rule == nil {
 		logger.Errorf("rule_eval:%s rule_config:%s, error:rule is nil", arw.Key(), ruleConfig)
 		arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), GET_RULE_CONFIG).Inc()
-		return lst
+		return lst, errors.New("rule is nil")
 	}

 	arw.inhibit = rule.Inhibit
@@ -224,7 +233,7 @@ func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) []common.Anom
 			logger.Errorf("rule_eval:%s promql:%s, error:%v", arw.Key(), promql, err)
 			arw.processor.Stats.CounterQueryDataErrorTotal.WithLabelValues(fmt.Sprintf("%d", arw.datasourceId)).Inc()
 			arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), QUERY_DATA).Inc()
-			continue
+			return lst, err
 		}

 		if len(warnings) > 0 {
@@ -241,10 +250,10 @@ func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) []common.Anom
 		}
 		lst = append(lst, points...)
 	}
-	return lst
+	return lst, nil
 }

-func (arw *AlertRuleWorker) GetTdengineAnomalyPoint(rule *models.AlertRule, dsId int64) ([]common.AnomalyPoint, []common.AnomalyPoint) {
+func (arw *AlertRuleWorker) GetTdengineAnomalyPoint(rule *models.AlertRule, dsId int64) ([]common.AnomalyPoint, []common.AnomalyPoint, error) {
 	// 获取查询和规则判断条件
 	points := []common.AnomalyPoint{}
 	recoverPoints := []common.AnomalyPoint{}
@@ -252,7 +261,7 @@ func (arw *AlertRuleWorker) GetTdengineAnomalyPoint(rule *models.AlertRule, dsId
 	if ruleConfig == "" {
 		logger.Warningf("rule_eval:%d promql is blank", rule.Id)
 		arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), GET_RULE_CONFIG).Inc()
-		return points, recoverPoints
+		return points, recoverPoints, errors.New("rule config is nil")
 	}

 	var ruleQuery models.RuleQuery
@@ -261,7 +270,7 @@ func (arw *AlertRuleWorker) GetTdengineAnomalyPoint(rule *models.AlertRule, dsId
 		logger.Warningf("rule_eval:%d promql parse error:%s", rule.Id, err.Error())
 		arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId())).Inc()
 		arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), GET_RULE_CONFIG).Inc()
-		return points, recoverPoints
+		return points, recoverPoints, err
 	}

 	arw.inhibit = ruleQuery.Inhibit
@@ -288,7 +297,7 @@ func (arw *AlertRuleWorker) GetTdengineAnomalyPoint(rule *models.AlertRule, dsId
 				logger.Warningf("rule_eval rid:%d query data error: %v", rule.Id, err)
 				arw.processor.Stats.CounterQueryDataErrorTotal.WithLabelValues(fmt.Sprintf("%d", arw.datasourceId)).Inc()
 				arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), QUERY_DATA).Inc()
-				continue
+				return points, recoverPoints, err
 			}
 			//  此条日志很重要，是告警判断的现场值
 			logger.Debugf("rule_eval rid:%d req:%+v resp:%+v", rule.Id, query, series)
@@ -305,10 +314,10 @@ func (arw *AlertRuleWorker) GetTdengineAnomalyPoint(rule *models.AlertRule, dsId
 		points, recoverPoints = GetAnomalyPoint(rule.Id, ruleQuery, seriesTagIndexes, seriesStore)
 	}

-	return points, recoverPoints
+	return points, recoverPoints, nil
 }

-func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) []common.AnomalyPoint {
+func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) ([]common.AnomalyPoint, error) {
 	var lst []common.AnomalyPoint
 	var severity int

@@ -316,13 +325,13 @@ func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) []common.Anom
 	if err := json.Unmarshal([]byte(ruleConfig), &rule); err != nil {
 		logger.Errorf("rule_eval:%s rule_config:%s, error:%v", arw.Key(), ruleConfig, err)
 		arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), GET_RULE_CONFIG).Inc()
-		return lst
+		return lst, err
 	}

 	if rule == nil {
 		logger.Errorf("rule_eval:%s rule_config:%s, error:rule is nil", arw.Key(), ruleConfig)
 		arw.processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.processor.DatasourceId()), GET_RULE_CONFIG).Inc()
-		return lst
+		return lst, errors.New("rule is nil")
 	}

 	arw.inhibit = rule.Inhibit
@@ -449,7 +458,7 @@ func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) []common.Anom
 			}
 		}
 	}
-	return lst
+	return lst, nil
 }

 func GetAnomalyPoint(ruleId int64, ruleQuery models.RuleQuery, seriesTagIndexes map[string]map[uint64][]uint64, seriesStore map[uint64]models.DataResp) ([]common.AnomalyPoint, []common.AnomalyPoint) {
--- a/alert/sender/callback.go
+++ b/alert/sender/callback.go
@@ -195,8 +195,8 @@ func PushCallbackEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.

 	if queue == nil {
 		queue = &WebhookQueue{
-			list:    NewSafeListLimited(QueueMaxSize),
-			closeCh: make(chan struct{}),
+			eventQueue: NewSafeEventQueue(QueueMaxSize),
+			closeCh:    make(chan struct{}),
 		}

 		CallbackEventQueueLock.Lock()
@@ -206,8 +206,8 @@ func PushCallbackEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.
 		StartConsumer(ctx, queue, webhook.Batch, webhook, stats)
 	}

-	succ := queue.list.PushFront(event)
+	succ := queue.eventQueue.Push(event)
 	if !succ {
-		logger.Warningf("Write channel(%s) full, current channel size: %d event:%v", webhook.Url, queue.list.Len(), event)
+		logger.Warningf("Write channel(%s) full, current channel size: %d event:%v", webhook.Url, queue.eventQueue.Len(), event)
 	}
 }
--- a/alert/sender/webhook.go
+++ b/alert/sender/webhook.go
@@ -121,8 +121,8 @@ var EventQueueLock sync.RWMutex
 const QueueMaxSize = 100000

 type WebhookQueue struct {
-	list    *SafeListLimited
-	closeCh chan struct{}
+	eventQueue *SafeEventQueue
+	closeCh    chan struct{}
 }

 func PushEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) {
@@ -132,8 +132,8 @@ func PushEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.AlertCur

 	if queue == nil {
 		queue = &WebhookQueue{
-			list:    NewSafeListLimited(QueueMaxSize),
-			closeCh: make(chan struct{}),
+			eventQueue: NewSafeEventQueue(QueueMaxSize),
+			closeCh:    make(chan struct{}),
 		}

 		EventQueueLock.Lock()
@@ -143,10 +143,10 @@ func PushEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.AlertCur
 		StartConsumer(ctx, queue, webhook.Batch, webhook, stats)
 	}

-	succ := queue.list.PushFront(event)
+	succ := queue.eventQueue.Push(event)
 	if !succ {
 		stats.AlertNotifyErrorTotal.WithLabelValues("push_event_queue").Inc()
-		logger.Warningf("Write channel(%s) full, current channel size: %d event:%v", webhook.Url, queue.list.Len(), event)
+		logger.Warningf("Write channel(%s) full, current channel size: %d event:%v", webhook.Url, queue.eventQueue.Len(), event)
 	}
 }

@@ -157,7 +157,7 @@ func StartConsumer(ctx *ctx.Context, queue *WebhookQueue, popSize int, webhook *
 			logger.Infof("event queue:%v closed", queue)
 			return
 		default:
-			events := queue.list.PopBack(popSize)
+			events := queue.eventQueue.PopN(popSize)
 			if len(events) == 0 {
 				time.Sleep(time.Millisecond * 400)
 				continue
--- a/alert/sender/webhook_event_queue.go
+++ b/alert/sender/webhook_event_queue.go
@@ -0,0 +1,113 @@
+package sender
+
+import (
+	"container/list"
+	"sync"
+
+	"github.com/ccfos/nightingale/v6/models"
+)
+
+type SafeEventQueue struct {
+	lock        sync.RWMutex // guards the three lists below
+	maxSize     int          // Push evicts events while the total length exceeds this
+	queueHigh   *list.List   // events pushed with Severity == High
+	queueMiddle *list.List   // events pushed with Severity == Middle
+	queueLow    *list.List   // events pushed with Severity == Low
+}
+
+const (
+	High   = 1 // popped first, evicted last
+	Middle = 2 // popped after High, evicted after Low
+	Low    = 3 // popped last, evicted first
+)
+
+// NewSafeEventQueue returns an empty queue; the zero-value lock is ready to use.
+func NewSafeEventQueue(maxSize int) *SafeEventQueue {
+	return &SafeEventQueue{
+		maxSize:     maxSize,
+		queueHigh:   list.New(),
+		queueMiddle: list.New(),
+		queueLow:    list.New(),
+	}
+}
+
+func (spq *SafeEventQueue) Len() int {
+	spq.lock.RLock() // shared lock: only the list lengths are read
+	defer spq.lock.RUnlock()
+	return spq.queueHigh.Len() + spq.queueMiddle.Len() + spq.queueLow.Len()
+}
+
+// len returns the total length without taking the lock; do not call it outside this file.
+func (spq *SafeEventQueue) len() int {
+	return spq.queueHigh.Len() + spq.queueMiddle.Len() + spq.queueLow.Len()
+}
+
+func (spq *SafeEventQueue) Push(event *models.AlertCurEvent) bool {
+	spq.lock.Lock()
+	defer spq.lock.Unlock()
+	switch event.Severity {
+	case High:
+		spq.queueHigh.PushBack(event)
+	case Middle:
+		spq.queueMiddle.PushBack(event)
+	case Low:
+		spq.queueLow.PushBack(event)
+	default:
+		return false // severity is not High, Middle or Low: the event is not queued
+	}
+
+	for spq.len() > spq.maxSize { // over capacity: drop the oldest entry of the lowest non-empty priority
+		if spq.queueLow.Len() > 0 {
+			spq.queueLow.Remove(spq.queueLow.Front())
+		} else if spq.queueMiddle.Len() > 0 {
+			spq.queueMiddle.Remove(spq.queueMiddle.Front())
+		} else {
+			spq.queueHigh.Remove(spq.queueHigh.Front())
+		}
+	}
+	return true // true even when the loop above dropped an event, possibly the one just pushed
+}
+
+// pop removes and returns the next event without taking the lock; do not call it outside this file.
+func (spq *SafeEventQueue) pop() *models.AlertCurEvent {
+	if spq.len() == 0 {
+		return nil
+	}
+
+	var elem interface{}
+
+	if spq.queueHigh.Len() > 0 { // High first, then Middle, then Low
+		elem = spq.queueHigh.Remove(spq.queueHigh.Front())
+	} else if spq.queueMiddle.Len() > 0 {
+		elem = spq.queueMiddle.Remove(spq.queueMiddle.Front())
+	} else {
+		elem = spq.queueLow.Remove(spq.queueLow.Front())
+	}
+	event, ok := elem.(*models.AlertCurEvent)
+	if !ok {
+		return nil // the element has already been removed from its list and is discarded
+	}
+	return event
+}
+
+func (spq *SafeEventQueue) Pop() *models.AlertCurEvent {
+	spq.lock.Lock() // exclusive lock: pop mutates the lists
+	defer spq.lock.Unlock()
+	return spq.pop() // nil when the queue is empty
+}
+
+// PopN removes up to n events in priority order; n <= 0 returns an empty slice.
+func (spq *SafeEventQueue) PopN(n int) []*models.AlertCurEvent {
+	spq.lock.Lock()
+	defer spq.lock.Unlock()
+	if n < 0 {
+		n = 0 // make panics on a negative capacity
+	}
+	events := make([]*models.AlertCurEvent, 0, n)
+	for i := 0; i < n && spq.len() > 0; i++ {
+		if event := spq.pop(); event != nil {
+			events = append(events, event)
+		}
+	}
+	return events
+}
--- a/alert/sender/webhook_event_queue_test.go
+++ b/alert/sender/webhook_event_queue_test.go
@@ -0,0 +1,157 @@
+package sender
+
+import (
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/ccfos/nightingale/v6/models"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestSafePriorityQueue_ConcurrentPushPop(t *testing.T) {
+	spq := NewSafeEventQueue(100000)
+
+	var wg sync.WaitGroup
+	numGoroutines := 100
+	numEvents := 1000
+
+	// Concurrent Push
+	wg.Add(numGoroutines)
+	for i := 0; i < numGoroutines; i++ {
+		go func(goroutineID int) {
+			defer wg.Done()
+			for j := 0; j < numEvents; j++ {
+				event := &models.AlertCurEvent{
+					Severity:    goroutineID%3 + 1,
+					TriggerTime: time.Now().UnixNano(),
+				}
+				spq.Push(event)
+			}
+		}(i)
+	}
+	wg.Wait()
+
+	// Check that the queue length is correct
+	expectedLen := numGoroutines * numEvents
+	assert.Equal(t, expectedLen, spq.Len(), "Queue length mismatch after concurrent pushes")
+
+	// Concurrent Pop
+	wg.Add(numGoroutines)
+	for i := 0; i < numGoroutines; i++ {
+		go func() {
+			defer wg.Done()
+			for {
+				event := spq.Pop()
+				if event == nil {
+					return
+				}
+			}
+		}()
+	}
+	wg.Wait()
+
+	// The queue should be empty at the end
+	assert.Equal(t, 0, spq.Len(), "Queue should be empty after concurrent pops")
+}
+
+func TestSafePriorityQueue_ConcurrentPopMax(t *testing.T) {
+	spq := NewSafeEventQueue(100000)
+
+	// Seed the queue with initial data
+	for i := 0; i < 1000; i++ {
+		spq.Push(&models.AlertCurEvent{
+			Severity:    i%3 + 1,
+			TriggerTime: time.Now().UnixNano(),
+		})
+	}
+
+	var wg sync.WaitGroup
+	numGoroutines := 10
+	popMax := 100
+
+	// Concurrent PopN
+	wg.Add(numGoroutines)
+	for i := 0; i < numGoroutines; i++ {
+		go func() {
+			defer wg.Done()
+			events := spq.PopN(popMax)
+			assert.LessOrEqual(t, len(events), popMax, "PopN exceeded maximum")
+		}()
+	}
+	wg.Wait()
+
+	// Check that the queue length is correct
+	expectedRemaining := 1000 - (numGoroutines * popMax)
+	if expectedRemaining < 0 {
+		expectedRemaining = 0
+	}
+	assert.Equal(t, expectedRemaining, spq.Len(), "Queue length mismatch after concurrent PopN")
+}
+
+func TestSafePriorityQueue_ConcurrentPushPopWithDifferentSeverities(t *testing.T) {
+	spq := NewSafeEventQueue(100000)
+
+	var wg sync.WaitGroup
+	numGoroutines := 50
+	numEvents := 500
+
+	// Concurrently push events of different priorities
+	wg.Add(numGoroutines)
+	for i := 0; i < numGoroutines; i++ {
+		go func(goroutineID int) {
+			defer wg.Done()
+			for j := 0; j < numEvents; j++ {
+				event := &models.AlertCurEvent{
+					Severity:    goroutineID%3 + 1, // simulate different Severity values
+					TriggerTime: time.Now().UnixNano(),
+				}
+				spq.Push(event)
+			}
+		}(i)
+	}
+	wg.Wait()
+
+	// Check that the queue length is correct
+	expectedLen := numGoroutines * numEvents
+	assert.Equal(t, expectedLen, spq.Len(), "Queue length mismatch after concurrent pushes")
+
+	// Check that events come out in priority order
+	var lastEvent *models.AlertCurEvent
+	for spq.Len() > 0 {
+		event := spq.Pop()
+		if lastEvent != nil {
+			assert.LessOrEqual(t, lastEvent.Severity, event.Severity, "Events are not in correct priority order")
+		}
+		lastEvent = event
+	}
+}
+
+// TestSafePriorityQueue_ExceedMaxSize checks that Push evicts lower priorities first when over maxSize.
+func TestSafePriorityQueue_ExceedMaxSize(t *testing.T) {
+	spq := NewSafeEventQueue(5)
+	// Push more events than the maximum capacity, cycling through High, Middle and Low.
+	for i := 0; i < 10; i++ {
+		spq.Push(&models.AlertCurEvent{
+			Severity:    i%3 + 1,
+			TriggerTime: int64(i),
+		})
+	}
+
+	// Verify the queue length does not exceed maxSize.
+	assert.LessOrEqual(t, spq.Len(), spq.maxSize)
+
+	// Number of events left to inspect.
+	expectedEvents := 5
+	if spq.Len() < 5 {
+		expectedEvents = spq.Len()
+	}
+
+	// Low events are evicted first, so only High and Middle events may remain.
+	for i := 0; i < expectedEvents; i++ {
+		event := spq.Pop()
+		if event != nil {
+			assert.LessOrEqual(t, event.Severity, 2)
+		}
+	}
+}
--- a/go.mod
+++ b/go.mod
@@ -32,6 +32,7 @@ require (
 	github.com/rakyll/statik v0.1.7
 	github.com/redis/go-redis/v9 v9.0.2
 	github.com/spaolacci/murmur3 v1.1.0
+	github.com/stretchr/testify v1.8.4
 	github.com/tidwall/gjson v1.14.0
 	github.com/toolkits/pkg v1.3.8
 	golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1
@@ -44,6 +45,8 @@ require (
 	gorm.io/gorm v1.25.7-0.20240204074919-46816ad31dde
 )

+require github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
+
 require (
 	github.com/Azure/go-ntlmssp v0.0.0-20220621081337-cb9428e4ac1e // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
--- a/go.sum
+++ b/go.sum
@@ -234,6 +234,7 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/pquerna/cachecontrol v0.1.0 h1:yJMy84ti9h/+OEWa752kBTKv4XC30OtVVHYv/8cTqKc=
 github.com/pquerna/cachecontrol v0.1.0/go.mod h1:NrUG3Z7Rdu85UNR3vm7SOsl1nFIeSiQnrHV5K9mBcUI=
 github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g=
@@ -288,6 +289,7 @@ github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o
 github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
 github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
+github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
 github.com/tidwall/gjson v1.14.0 h1:6aeJ0bzojgWLa82gDQHcx3S0Lr/O51I9bJ5nv6JFx5w=
 github.com/tidwall/gjson v1.14.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
 github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=