Compare commits

...

69 Commits

Author SHA1 Message Date
ning
b057940134 add stats 2024-08-01 15:18:49 +08:00
ning
4500c4aba8 refactor callback 2024-08-01 15:12:03 +08:00
710leo
726e994d58 merge main 2024-07-30 15:58:31 +08:00
710leo
71c4c24f00 code refactor 2024-07-30 15:57:15 +08:00
710leo
d14a834149 webhook send batch 2024-07-30 15:47:25 +08:00
Yening Qin
5b2513b7a1 feat: support lark and larkcard notify channel (#2061)
* feat: support lark notify channel (#2056)

Co-authored-by: flashbo <36443248+lwb0214@users.noreply.github.com>
Co-authored-by: wenbo <1027758873@qq.com>
2024-07-27 21:21:43 +08:00
ning
7cec16eaf0 update center router init 2024-07-26 14:59:40 +08:00
ning
5f895552a9 done 2024-07-26 12:54:28 +08:00
ning
1e3f62b92f refactor 2024-07-25 17:46:19 +08:00
ning
17dbb3ec77 code refactor 2024-07-25 12:06:10 +08:00
ning
00822c8404 refactor: add ibex enable check 2024-07-25 11:39:43 +08:00
ning
55de30d6c7 refactor: update mute rule api 2024-07-24 11:37:38 +08:00
Yening Qin
8b7dbed27e refactor: modify heartbeat api (#2051) 2024-07-24 11:23:56 +08:00
Dan218
71b8fa27d0 feat: Provide optional style for buildTargetWhere (#2038) 2024-07-24 11:12:17 +08:00
ning
31174d719e refactor: event relabel 2024-07-22 11:45:17 +08:00
ning
5b5bb22ffd fix: event relable process tagsmap 2024-07-22 10:46:29 +08:00
ning
e98fe9ea2e refactor: HandleTSFunc 2024-07-21 15:28:06 +08:00
ning
32e9ded393 refactor: server-clusters api perm 2024-07-21 11:04:35 +08:00
ning
8293ca20be refactor: assets file support md 2024-07-18 15:07:47 +08:00
Yening Qin
6c4ddfc349 refactor: update languageDetector (#2043) 2024-07-18 14:13:48 +08:00
ning
cd0c478515 refactor: event relabel add default value 2024-07-17 22:48:50 +08:00
Yening Qin
2cd25ac0e5 fix: optimize event recovery inhibit (#2042) 2024-07-17 22:30:31 +08:00
ning
bb99ba3d1c update sql 2024-07-17 11:57:20 +08:00
Yening Qin
64405dca5d feat: alert event support relabel (#2041) 2024-07-17 10:30:29 +08:00
ulricqin
69ea9ca8f8 Update README.md 2024-07-17 09:39:00 +08:00
ulricqin
41d0f2fcda Update README.md 2024-07-17 09:36:30 +08:00
710leo
93df1c0fbc docs: add perm point 2024-07-16 23:44:30 +08:00
flashbo
86e952788d refactor: targets get api support backend sorting (#2034)
Co-authored-by: wenbo <bupt.lwb@gmail.com>
2024-07-16 23:38:04 +08:00
ning
e890f2616f refactor: change webhook sleep time 2024-07-13 14:38:32 +08:00
yanli
6c2ee584e5 refactor: MetricDesc defaults to Chinese (#2032) 2024-07-12 21:50:51 +08:00
Dan218
5f07fc3010 Feat: Add skip Verify Insecure ssl/tls in sendWebhook (#2030) 2024-07-12 10:38:33 +08:00
ning
20fa310ba9 refactor: sync team to duty 2024-07-08 17:54:59 +08:00
ning
0e3b08be9a feat: ldap support defaultTeams 2024-07-08 17:35:39 +08:00
ning
b7d971d7c8 refactor: add alert rule pure api 2024-07-08 17:10:31 +08:00
ning
4373ae7f0b code refactor 2024-07-05 10:40:27 +08:00
dependabot[bot]
053325a691 build(deps): bump golang.org/x/image from 0.13.0 to 0.18.0 (#2017) 2024-07-04 17:56:36 +08:00
ning
c54267aa3a refactor: webhook support retry 2024-07-04 17:49:16 +08:00
ning
74dc430886 add migrate sql 2024-07-04 15:45:22 +08:00
Yening Qin
dc79ee4687 feat: recording rule support cron pattern (#2025) 2024-07-04 11:23:48 +08:00
shardingHe
e154c946e6 docs: add dashboard for aliyun-mysql (#2020)
* add dashboard for aliyun-mysql

* Update mysql.json

---------

Co-authored-by: shardingHe <wangzihe@flashcat.cloud>
Co-authored-by: ulricqin <ulricqin@qq.com>
2024-07-03 11:41:50 +08:00
ning
08bfc0b388 refactor: add callbak log 2024-07-01 15:15:19 +08:00
ning
5338270aef feat: encrypt pass 2024-06-27 17:07:19 +08:00
Ulric Qin
00550ba2c7 add redis dashboard 2024-06-27 10:55:23 +08:00
Ulric Qin
c58bec23bf login fail count 2024-06-26 17:04:22 +08:00
ning
a5b77be0ab fix: recovered event id 2024-06-26 11:57:27 +08:00
Yening Qin
f529681c35 fix: embedded-dashboards api perm (#2012) 2024-06-25 18:13:13 +08:00
Ulric Qin
e3042dd6d5 Merge branch 'main' of github.com:ccfos/nightingale 2024-06-25 17:39:48 +08:00
Ulric Qin
1ebab4fcb0 add mysql dashboard 2024-06-25 17:39:15 +08:00
ning
ccf38b6da7 docs: update clickhouse integrations 2024-06-25 16:48:12 +08:00
Ulric Qin
9a0a687727 Merge branch 'main' of github.com:ccfos/nightingale 2024-06-25 16:42:59 +08:00
Ulric Qin
d00510978d add mysql dashboard 2024-06-25 16:42:40 +08:00
ning
9b478d98fd refactor: checkout heartbeat global label 2024-06-25 15:09:29 +08:00
ning
4845ca5bdb refactor: update compose sql 2024-06-22 00:33:45 +08:00
Yening Qin
a844d2b091 fix: use postgresql (#2008) 2024-06-21 18:13:58 +08:00
Ulric Qin
69ca7f3b93 validate heartbeat hostname 2024-06-21 17:51:44 +08:00
Ulric Qin
b9c6c33ceb refactor extractIdentFromTimeSeries 2024-06-21 17:43:43 +08:00
Ulric Qin
5099d3c040 add ignore_host querystring 2024-06-21 12:36:45 +08:00
Ulric Qin
e34f8ac701 Merge branch 'main' of github.com:ccfos/nightingale 2024-06-21 12:25:08 +08:00
Ulric Qin
ab82a6f910 modify ignore_ident logic 2024-06-21 12:24:54 +08:00
小炒肉
57f8bd3612 feat: callbackURL Parse Template (#2001)
Co-authored-by: zhihuanzhu <zhihuanzhu@deeproute.ai>
2024-06-19 14:46:06 +08:00
ning
8ab96e2cea refactor: add more mute log 2024-06-17 19:17:51 +08:00
ning
0a2e23c285 refactor: update users api 2024-06-17 17:09:33 +08:00
小炒肉
5c1d4077e2 fix: feishucard ats (#1997)
Co-authored-by: zhihuanzhu <zhihuanzhu@deeproute.ai>
2024-06-17 15:14:27 +08:00
Zoe
2a46d9f98e docs: add clickhouse alerts,dashboards,metrics (#1994) 2024-06-17 11:28:17 +08:00
Ulric Qin
ce5c213593 Merge branch 'main' of github.com:ccfos/nightingale 2024-06-14 19:03:01 +08:00
ning
771a8d121b refactor: change recovery event last_eval_time 2024-06-14 16:13:44 +08:00
Ulric Qin
af88b0e283 Merge branch 'main' of github.com:ccfos/nightingale 2024-06-14 15:27:24 +08:00
Ulric Qin
8e5d7f2a5b update dingtalk tpl 2024-06-14 15:27:06 +08:00
Yening Qin
1a22211a5d feat: oidc support default team (#1995) 2024-06-14 12:00:21 +08:00
70 changed files with 19383 additions and 685 deletions

View File

@@ -78,22 +78,12 @@
![边缘部署模式](https://download.flashcat.cloud/ulric/20240222102119.png)
## 近期计划
- [ ] 仪表盘:支持内嵌 Grafana
- [ ] 告警规则:通知时支持配置过滤标签,避免告警事件中一堆不重要的标签
- [x] 告警规则:支持配置恢复时的 Promql,告警恢复通知也可以带上恢复时的值了
- [ ] 机器管理:自定义标签拆分管理,agent 自动上报的标签和用户在页面自定义的标签分开管理,对于 agent 自动上报的标签,以 agent 为准,直接覆盖服务端 DB 中的数据
- [ ] 机器管理:机器支持角色字段,即无头标签,用于描述混部场景
- [ ] 机器管理:把业务组的 busigroup 标签迁移到机器的属性里,让机器支持挂到多个业务组
- [ ] 告警规则:增加 Host Metrics 类别,支持按照业务组、角色、标签等筛选机器,规则 promql 支持变量,支持在机器颗粒度配置变量值
- [ ] 告警通知:重构整个通知逻辑,引入事件处理的 pipeline,支持对告警事件做自定义处理和灵活分派
## 交流渠道
- 报告 Bug:优先推荐提交[夜莺GitHub Issue](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Fbug&projects=&template=bug_report.yml)
- 推荐完整浏览[夜莺文档站点](https://flashcat.cloud/docs/content/flashcat-monitor/nightingale-v7/introduction/),了解更多信息
- 推荐搜索关注夜莺公众号,第一时间获取社区动态:`夜莺监控Nightingale`
- 日常答疑、技术分享、用户之间的交流,统一使用知识星球,大伙可以免费加入交流,[入口在这里](https://download.flashcat.cloud/ulric/20240319095409.png)
- 日常问题交流推荐加入[知识星球](https://download.flashcat.cloud/ulric/20240319095409.png),也可以加我微信 `picobyte`,备注:`夜莺加群-<公司>-<姓名>` 拉入微信群,不过研发人员主要是关注 github issue 和星球,微信群关注较少
## 广受关注
[![Stargazers over time](https://api.star-history.com/svg?repos=ccfos/nightingale&type=Date)](https://star-history.com/#ccfos/nightingale&Date)

View File

@@ -32,6 +32,7 @@ type Alerting struct {
Timeout int64
TemplatesDir string
NotifyConcurrency int
WebhookBatchSend bool
}
type CallPlugin struct {

View File

@@ -100,6 +100,8 @@ func (e *Dispatch) relaodTpls() error {
models.Mm: sender.NewSender(models.Mm, tmpTpls),
models.Telegram: sender.NewSender(models.Telegram, tmpTpls),
models.FeishuCard: sender.NewSender(models.FeishuCard, tmpTpls),
models.Lark: sender.NewSender(models.Lark, tmpTpls),
models.LarkCard: sender.NewSender(models.LarkCard, tmpTpls),
}
// domain -> Callback()
@@ -110,7 +112,9 @@ func (e *Dispatch) relaodTpls() error {
models.TelegramDomain: sender.NewCallBacker(models.TelegramDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
models.FeishuCardDomain: sender.NewCallBacker(models.FeishuCardDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
models.IbexDomain: sender.NewCallBacker(models.IbexDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
models.LarkDomain: sender.NewCallBacker(models.LarkDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
models.DefaultDomain: sender.NewCallBacker(models.DefaultDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
models.LarkCardDomain: sender.NewCallBacker(models.LarkCardDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
}
e.RwLock.RLock()
@@ -261,7 +265,11 @@ func (e *Dispatch) Send(rule *models.AlertRule, event *models.AlertCurEvent, not
e.SendCallbacks(rule, notifyTarget, event)
// handle global webhooks
sender.SendWebhooks(notifyTarget.ToWebhookList(), event, e.Astats)
if e.alerting.WebhookBatchSend {
sender.BatchSendWebhooks(notifyTarget.ToWebhookList(), event, e.Astats)
} else {
sender.SingleSendWebhooks(notifyTarget.ToWebhookList(), event, e.Astats)
}
// handle plugin call
go sender.MayPluginNotify(e.genNoticeBytes(event), e.notifyConfigCache.GetNotifyScript(), e.Astats)
@@ -276,7 +284,7 @@ func (e *Dispatch) SendCallbacks(rule *models.AlertRule, notifyTarget *NotifyTar
continue
}
cbCtx := sender.BuildCallBackContext(e.ctx, urlStr, rule, []*models.AlertCurEvent{event}, uids, e.userCache, e.Astats)
cbCtx := sender.BuildCallBackContext(e.ctx, urlStr, rule, []*models.AlertCurEvent{event}, uids, e.userCache, e.alerting.WebhookBatchSend, e.Astats)
if strings.HasPrefix(urlStr, "${ibex}") {
e.CallBacks[models.IbexDomain].CallBack(cbCtx)
@@ -299,6 +307,12 @@ func (e *Dispatch) SendCallbacks(rule *models.AlertRule, notifyTarget *NotifyTar
continue
}
// process lark card
if parsedURL.Host == models.LarkDomain && parsedURL.Query().Get("card") == "1" {
e.CallBacks[models.LarkCardDomain].CallBack(cbCtx)
continue
}
callBacker, ok := e.CallBacks[parsedURL.Host]
if ok {
callBacker.CallBack(cbCtx)

View File

@@ -79,6 +79,22 @@ func (s *NotifyTarget) ToCallbackList() []string {
func (s *NotifyTarget) ToWebhookList() []*models.Webhook {
webhooks := make([]*models.Webhook, 0, len(s.webhooks))
for _, wh := range s.webhooks {
if wh.Batch == 0 {
wh.Batch = 1000
}
if wh.Timeout == 0 {
wh.Timeout = 10
}
if wh.RetryCount == 0 {
wh.RetryCount = 10
}
if wh.RetryInterval == 0 {
wh.RetryInterval = 10
}
webhooks = append(webhooks, wh)
}
return webhooks

View File

@@ -143,19 +143,22 @@ func (arw *AlertRuleWorker) Eval() {
if p.Severity > point.Severity {
hash := process.Hash(cachedRule.Id, arw.processor.DatasourceId(), p)
arw.processor.DeleteProcessEvent(hash)
models.AlertCurEventDelByHash(arw.ctx, hash)
pointsMap[tagHash] = point
}
}
now := time.Now().Unix()
for _, point := range pointsMap {
str := fmt.Sprintf("%v", point.Value)
arw.processor.RecoverSingle(process.Hash(cachedRule.Id, arw.processor.DatasourceId(), point), point.Timestamp, &str)
arw.processor.RecoverSingle(process.Hash(cachedRule.Id, arw.processor.DatasourceId(), point), now, &str)
}
} else {
now := time.Now().Unix()
for _, point := range recoverPoints {
str := fmt.Sprintf("%v", point.Value)
arw.processor.RecoverSingle(process.Hash(cachedRule.Id, arw.processor.DatasourceId(), point), point.Timestamp, &str)
arw.processor.RecoverSingle(process.Hash(cachedRule.Id, arw.processor.DatasourceId(), point), now, &str)
}
}

View File

@@ -12,28 +12,28 @@ import (
"github.com/toolkits/pkg/logger"
)
func IsMuted(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType, alertMuteCache *memsto.AlertMuteCacheType) bool {
func IsMuted(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType, alertMuteCache *memsto.AlertMuteCacheType) (bool, string) {
if rule.Disabled == 1 {
return true
return true, "rule disabled"
}
if TimeSpanMuteStrategy(rule, event) {
return true
return true, "rule is not effective for period of time"
}
if IdentNotExistsMuteStrategy(rule, event, targetCache) {
return true
return true, "ident not exists mute"
}
if BgNotMatchMuteStrategy(rule, event, targetCache) {
return true
return true, "bg not match mute"
}
if EventMuteStrategy(event, alertMuteCache) {
return true
return true, "match mute rule"
}
return false
return false, ""
}
// TimeSpanMuteStrategy 根据规则配置的告警生效时间段过滤,如果产生的告警不在规则配置的告警生效时间段内,则不告警,即被mute

View File

@@ -18,6 +18,9 @@ import (
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/ccfos/nightingale/v6/pushgw/writer"
"github.com/prometheus/prometheus/prompb"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/str"
)
@@ -145,9 +148,10 @@ func (p *Processor) Handle(anomalyPoints []common.AnomalyPoint, from string, inh
// 如果 event 被 mute 了,本质也是 fire 的状态,这里无论如何都添加到 alertingKeys 中,防止 fire 的事件自动恢复了
hash := event.Hash
alertingKeys[hash] = struct{}{}
if mute.IsMuted(cachedRule, event, p.TargetCache, p.alertMuteCache) {
isMuted, detail := mute.IsMuted(cachedRule, event, p.TargetCache, p.alertMuteCache)
if isMuted {
p.Stats.CounterMuteTotal.WithLabelValues(event.GroupName).Inc()
logger.Debugf("rule_eval:%s event:%v is muted", p.Key(), event)
logger.Debugf("rule_eval:%s event:%v is muted, detail:%s", p.Key(), event, detail)
continue
}
@@ -217,9 +221,57 @@ func (p *Processor) BuildEvent(anomalyPoint common.AnomalyPoint, from string, no
} else {
event.LastEvalTime = event.TriggerTime
}
// 生成事件之后,立马进程 relabel 处理
Relabel(p.rule, event)
return event
}
func Relabel(rule *models.AlertRule, event *models.AlertCurEvent) {
if rule == nil {
return
}
// need to keep the original label
event.OriginalTags = event.Tags
event.OriginalTagsJSON = make([]string, len(event.TagsJSON))
labels := make([]prompb.Label, len(event.TagsJSON))
for i, tag := range event.TagsJSON {
label := strings.Split(tag, "=")
if len(label) != 2 {
logger.Errorf("event%+v relabel: the label length is not 2:%v", event, label)
continue
}
event.OriginalTagsJSON[i] = tag
labels[i] = prompb.Label{Name: label[0], Value: label[1]}
}
for i := 0; i < len(rule.EventRelabelConfig); i++ {
if rule.EventRelabelConfig[i].Replacement == "" {
rule.EventRelabelConfig[i].Replacement = "$1"
}
if rule.EventRelabelConfig[i].Separator == "" {
rule.EventRelabelConfig[i].Separator = ";"
}
if rule.EventRelabelConfig[i].Regex == "" {
rule.EventRelabelConfig[i].Regex = "(.*)"
}
}
// relabel process
relabels := writer.Process(labels, rule.EventRelabelConfig...)
event.TagsJSON = make([]string, len(relabels))
event.TagsMap = make(map[string]string, len(relabels))
for i, label := range relabels {
event.TagsJSON[i] = fmt.Sprintf("%s=%s", label.Name, label.Value)
event.TagsMap[label.Name] = label.Value
}
event.Tags = strings.Join(event.TagsJSON, ",,")
}
func (p *Processor) HandleRecover(alertingKeys map[string]struct{}, now int64, inhibit bool) {
for _, hash := range p.pendings.Keys() {
if _, has := alertingKeys[hash]; has {
@@ -270,6 +322,7 @@ func (p *Processor) HandleRecoverEvent(hashArr []string, now int64, inhibit bool
// hash 对应的恢复事件的被抑制了,把之前的事件删除
p.fires.Delete(e.Hash)
p.pendings.Delete(e.Hash)
models.AlertCurEventDelByHash(p.ctx, e.Hash)
eventMap[event.Tags] = *event
}
}

View File

@@ -10,6 +10,7 @@ import (
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/prom"
"github.com/ccfos/nightingale/v6/pushgw/writer"
"github.com/robfig/cron/v3"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/str"
@@ -19,19 +20,35 @@ type RecordRuleContext struct {
datasourceId int64
quit chan struct{}
scheduler *cron.Cron
rule *models.RecordingRule
promClients *prom.PromClientMap
stats *astats.Stats
}
func NewRecordRuleContext(rule *models.RecordingRule, datasourceId int64, promClients *prom.PromClientMap, writers *writer.WritersType, stats *astats.Stats) *RecordRuleContext {
return &RecordRuleContext{
rrc := &RecordRuleContext{
datasourceId: datasourceId,
quit: make(chan struct{}),
rule: rule,
promClients: promClients,
stats: stats,
}
if rule.CronPattern == "" && rule.PromEvalInterval != 0 {
rule.CronPattern = fmt.Sprintf("@every %ds", rule.PromEvalInterval)
}
rrc.scheduler = cron.New(cron.WithSeconds())
_, err := rrc.scheduler.AddFunc(rule.CronPattern, func() {
rrc.Eval()
})
if err != nil {
logger.Errorf("add cron pattern error: %v", err)
}
return rrc
}
func (rrc *RecordRuleContext) Key() string {
@@ -39,9 +56,9 @@ func (rrc *RecordRuleContext) Key() string {
}
func (rrc *RecordRuleContext) Hash() string {
return str.MD5(fmt.Sprintf("%d_%d_%s_%d",
return str.MD5(fmt.Sprintf("%d_%s_%s_%d",
rrc.rule.Id,
rrc.rule.PromEvalInterval,
rrc.rule.CronPattern,
rrc.rule.PromQl,
rrc.datasourceId,
))
@@ -51,23 +68,7 @@ func (rrc *RecordRuleContext) Prepare() {}
func (rrc *RecordRuleContext) Start() {
logger.Infof("eval:%s started", rrc.Key())
interval := rrc.rule.PromEvalInterval
if interval <= 0 {
interval = 10
}
ticker := time.NewTicker(time.Duration(interval) * time.Second)
go func() {
defer ticker.Stop()
for {
select {
case <-rrc.quit:
return
case <-ticker.C:
rrc.Eval()
}
}
}()
rrc.scheduler.Start()
}
func (rrc *RecordRuleContext) Eval() {
@@ -109,5 +110,8 @@ func (rrc *RecordRuleContext) Eval() {
func (rrc *RecordRuleContext) Stop() {
logger.Infof("%s stopped", rrc.Key())
c := rrc.scheduler.Stop()
<-c.Done()
close(rrc.quit)
}

View File

@@ -29,21 +29,24 @@ type (
Rule *models.AlertRule
Events []*models.AlertCurEvent
Stats *astats.Stats
BatchSend bool
}
DefaultCallBacker struct{}
)
func BuildCallBackContext(ctx *ctx.Context, callBackURL string, rule *models.AlertRule, events []*models.AlertCurEvent,
uids []int64, userCache *memsto.UserCacheType, stats *astats.Stats) CallBackContext {
uids []int64, userCache *memsto.UserCacheType, batchSend bool, stats *astats.Stats) CallBackContext {
users := userCache.GetByUserIds(uids)
newCallBackUrl, _ := events[0].ParseURL(callBackURL)
return CallBackContext{
Ctx: ctx,
CallBackURL: callBackURL,
CallBackURL: newCallBackUrl,
Rule: rule,
Events: events,
Users: users,
BatchSend: batchSend,
Stats: stats,
}
}
@@ -95,6 +98,10 @@ func NewCallBacker(
// return &MmSender{tpl: tpls[models.Mm]}
case models.TelegramDomain:
return &TelegramSender{tpl: tpls[models.Telegram]}
case models.LarkDomain:
return &LarkSender{tpl: tpls[models.Lark]}
case models.LarkCardDomain:
return &LarkCardSender{tpl: tpls[models.LarkCard]}
}
return nil
@@ -107,15 +114,30 @@ func (c *DefaultCallBacker) CallBack(ctx CallBackContext) {
event := ctx.Events[0]
if ctx.BatchSend {
webhookConf := &models.Webhook{
Type: models.RuleCallback,
Enable: true,
Url: ctx.CallBackURL,
Timeout: 5,
RetryCount: 3,
RetryInterval: 10,
Batch: 1000,
}
PushCallbackEvent(webhookConf, event, ctx.Stats)
return
}
ctx.Stats.AlertNotifyTotal.WithLabelValues("rule_callback").Inc()
resp, code, err := poster.PostJSON(ctx.CallBackURL, 5*time.Second, event, 3)
if err != nil {
logger.Errorf("event_callback_fail(rule_id=%d url=%s), resp: %s, err: %v, code: %d",
event.RuleId, ctx.CallBackURL, string(resp), err, code)
logger.Errorf("event_callback_fail(rule_id=%d url=%s), event:%+v, resp: %s, err: %v, code: %d",
event.RuleId, ctx.CallBackURL, event, string(resp), err, code)
ctx.Stats.AlertNotifyErrorTotal.WithLabelValues("rule_callback").Inc()
} else {
logger.Infof("event_callback_succ(rule_id=%d url=%s), resp: %s, code: %d",
event.RuleId, ctx.CallBackURL, string(resp), code)
logger.Infof("event_callback_succ(rule_id=%d url=%s), event:%+v, resp: %s, code: %d",
event.RuleId, ctx.CallBackURL, event, string(resp), code)
}
}
@@ -135,3 +157,27 @@ type TaskCreateReply struct {
Err string `json:"err"`
Dat int64 `json:"dat"` // task.id
}
func PushCallbackEvent(webhook *models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) {
CallbackEventQueueLock.RLock()
queue := CallbackEventQueue[webhook.Url]
CallbackEventQueueLock.RUnlock()
if queue == nil {
queue = &WebhookQueue{
list: NewSafeListLimited(QueueMaxSize),
closeCh: make(chan struct{}),
}
CallbackEventQueueLock.Lock()
CallbackEventQueue[webhook.Url] = queue
CallbackEventQueueLock.Unlock()
StartConsumer(queue, webhook.Batch, webhook, stats)
}
succ := queue.list.PushFront(event)
if !succ {
logger.Warningf("Write channel(%s) full, current channel size: %d event:%v", webhook.Url, queue.list.Len(), event)
}
}

View File

@@ -97,7 +97,21 @@ func (fs *FeishuCardSender) CallBack(ctx CallBackContext) {
return
}
ats := ExtractAtsParams(ctx.CallBackURL)
message := BuildTplMessage(models.FeishuCard, fs.tpl, ctx.Events)
if len(ats) > 0 {
atTags := ""
for _, at := range ats {
if strings.Contains(at, "@") {
atTags += fmt.Sprintf("<at email=\"%s\" ></at>", at)
} else {
atTags += fmt.Sprintf("<at id=\"%s\" ></at>", at)
}
}
message = atTags + message
}
color := "red"
lowerUnicode := strings.ToLower(message)
if strings.Count(lowerUnicode, Recovered) > 0 && strings.Count(lowerUnicode, Triggered) > 0 {

64
alert/sender/lark.go Normal file
View File

@@ -0,0 +1,64 @@
package sender
import (
"html/template"
"strings"
"github.com/ccfos/nightingale/v6/models"
)
var (
_ CallBacker = (*LarkSender)(nil)
)
type LarkSender struct {
tpl *template.Template
}
func (lk *LarkSender) CallBack(ctx CallBackContext) {
if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 {
return
}
body := feishu{
Msgtype: "text",
Content: feishuContent{
Text: BuildTplMessage(models.Lark, lk.tpl, ctx.Events),
},
}
doSend(ctx.CallBackURL, body, models.Lark, ctx.Stats)
ctx.Stats.AlertNotifyTotal.WithLabelValues("rule_callback").Inc()
}
func (lk *LarkSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
urls := lk.extract(ctx.Users)
message := BuildTplMessage(models.Lark, lk.tpl, ctx.Events)
for _, url := range urls {
body := feishu{
Msgtype: "text",
Content: feishuContent{
Text: message,
},
}
doSend(url, body, models.Lark, ctx.Stats)
}
}
func (lk *LarkSender) extract(users []*models.User) []string {
urls := make([]string, 0, len(users))
for _, user := range users {
if token, has := user.ExtractToken(models.Lark); has {
url := token
if !strings.HasPrefix(token, "https://") && !strings.HasPrefix(token, "http://") {
url = "https://open.larksuite.com/open-apis/bot/v2/hook/" + token
}
urls = append(urls, url)
}
}
return urls
}

98
alert/sender/larkcard.go Normal file
View File

@@ -0,0 +1,98 @@
package sender
import (
"fmt"
"html/template"
"net/url"
"strings"
"github.com/ccfos/nightingale/v6/models"
)
type LarkCardSender struct {
tpl *template.Template
}
func (fs *LarkCardSender) CallBack(ctx CallBackContext) {
if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 {
return
}
ats := ExtractAtsParams(ctx.CallBackURL)
message := BuildTplMessage(models.LarkCard, fs.tpl, ctx.Events)
if len(ats) > 0 {
atTags := ""
for _, at := range ats {
if strings.Contains(at, "@") {
atTags += fmt.Sprintf("<at email=\"%s\" ></at>", at)
} else {
atTags += fmt.Sprintf("<at id=\"%s\" ></at>", at)
}
}
message = atTags + message
}
color := "red"
lowerUnicode := strings.ToLower(message)
if strings.Count(lowerUnicode, Recovered) > 0 && strings.Count(lowerUnicode, Triggered) > 0 {
color = "orange"
} else if strings.Count(lowerUnicode, Recovered) > 0 {
color = "green"
}
SendTitle := fmt.Sprintf("🔔 %s", ctx.Events[0].RuleName)
body.Card.Header.Title.Content = SendTitle
body.Card.Header.Template = color
body.Card.Elements[0].Text.Content = message
body.Card.Elements[2].Elements[0].Content = SendTitle
// This is to be compatible with the Larkcard interface, if with query string parameters, the request will fail
// Remove query parameters from the URL,
parsedURL, err := url.Parse(ctx.CallBackURL)
if err != nil {
return
}
parsedURL.RawQuery = ""
doSend(parsedURL.String(), body, models.LarkCard, ctx.Stats)
}
func (fs *LarkCardSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
urls, _ := fs.extract(ctx.Users)
message := BuildTplMessage(models.LarkCard, fs.tpl, ctx.Events)
color := "red"
lowerUnicode := strings.ToLower(message)
if strings.Count(lowerUnicode, Recovered) > 0 && strings.Count(lowerUnicode, Triggered) > 0 {
color = "orange"
} else if strings.Count(lowerUnicode, Recovered) > 0 {
color = "green"
}
SendTitle := fmt.Sprintf("🔔 %s", ctx.Events[0].RuleName)
body.Card.Header.Title.Content = SendTitle
body.Card.Header.Template = color
body.Card.Elements[0].Text.Content = message
body.Card.Elements[2].Elements[0].Content = SendTitle
for _, url := range urls {
doSend(url, body, models.LarkCard, ctx.Stats)
}
}
func (fs *LarkCardSender) extract(users []*models.User) ([]string, []string) {
urls := make([]string, 0, len(users))
ats := make([]string, 0)
for i := range users {
if token, has := users[i].ExtractToken(models.Lark); has {
url := token
if !strings.HasPrefix(token, "https://") && !strings.HasPrefix(token, "http://") {
url = "https://open.larksuite.com/open-apis/bot/v2/hook/" + strings.TrimSpace(token)
}
urls = append(urls, url)
}
}
return urls, ats
}

View File

@@ -41,6 +41,10 @@ func NewSender(key string, tpls map[string]*template.Template, smtp ...aconf.SMT
return &MmSender{tpl: tpls[models.Mm]}
case models.Telegram:
return &TelegramSender{tpl: tpls[models.Telegram]}
case models.Lark:
return &LarkSender{tpl: tpls[models.Lark]}
case models.LarkCard:
return &LarkCardSender{tpl: tpls[models.LarkCard]}
}
return nil
}

View File

@@ -2,9 +2,11 @@ package sender
import (
"bytes"
"crypto/tls"
"encoding/json"
"io"
"net/http"
"sync"
"time"
"github.com/ccfos/nightingale/v6/alert/astats"
@@ -13,59 +15,159 @@ import (
"github.com/toolkits/pkg/logger"
)
func SendWebhooks(webhooks []*models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) {
for _, conf := range webhooks {
if conf.Url == "" || !conf.Enable {
continue
}
bs, err := json.Marshal(event)
if err != nil {
continue
}
func sendWebhook(webhook *models.Webhook, event interface{}, stats *astats.Stats) bool {
channel := "webhook"
if webhook.Type == models.RuleCallback {
channel = "callback"
}
bf := bytes.NewBuffer(bs)
conf := webhook
if conf.Url == "" || !conf.Enable {
return false
}
bs, err := json.Marshal(event)
if err != nil {
logger.Errorf("%s alertingWebhook failed to marshal event:%+v err:%v", channel, event, err)
return false
}
req, err := http.NewRequest("POST", conf.Url, bf)
if err != nil {
logger.Warning("alertingWebhook failed to new request", err)
continue
}
bf := bytes.NewBuffer(bs)
req.Header.Set("Content-Type", "application/json")
if conf.BasicAuthUser != "" && conf.BasicAuthPass != "" {
req.SetBasicAuth(conf.BasicAuthUser, conf.BasicAuthPass)
}
req, err := http.NewRequest("POST", conf.Url, bf)
if err != nil {
logger.Warningf("%s alertingWebhook failed to new reques event:%s err:%v", channel, string(bs), err)
return true
}
if len(conf.Headers) > 0 && len(conf.Headers)%2 == 0 {
for i := 0; i < len(conf.Headers); i += 2 {
if conf.Headers[i] == "host" || conf.Headers[i] == "Host" {
req.Host = conf.Headers[i+1]
continue
}
req.Header.Set(conf.Headers[i], conf.Headers[i+1])
req.Header.Set("Content-Type", "application/json")
if conf.BasicAuthUser != "" && conf.BasicAuthPass != "" {
req.SetBasicAuth(conf.BasicAuthUser, conf.BasicAuthPass)
}
if len(conf.Headers) > 0 && len(conf.Headers)%2 == 0 {
for i := 0; i < len(conf.Headers); i += 2 {
if conf.Headers[i] == "host" || conf.Headers[i] == "Host" {
req.Host = conf.Headers[i+1]
continue
}
req.Header.Set(conf.Headers[i], conf.Headers[i+1])
}
}
insecureSkipVerify := false
if webhook != nil {
insecureSkipVerify = webhook.SkipVerify
}
client := http.Client{
Timeout: time.Duration(conf.Timeout) * time.Second,
Transport: &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: insecureSkipVerify},
},
}
// todo add skip verify
client := http.Client{
Timeout: time.Duration(conf.Timeout) * time.Second,
stats.AlertNotifyTotal.WithLabelValues(channel).Inc()
var resp *http.Response
resp, err = client.Do(req)
if err != nil {
stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc()
logger.Errorf("event_%s_fail, event:%s, url: [%s], error: [%s]", channel, string(bs), conf.Url, err)
return true
}
var body []byte
if resp.Body != nil {
defer resp.Body.Close()
body, _ = io.ReadAll(resp.Body)
}
if resp.StatusCode == 429 {
logger.Errorf("event_%s_fail, url: %s, response code: %d, body: %s event:%s", channel, conf.Url, resp.StatusCode, string(body), string(bs))
return true
}
logger.Debugf("event_%s_succ, url: %s, response code: %d, body: %s event:%s", channel, conf.Url, resp.StatusCode, string(body), string(bs))
return false
}
func SingleSendWebhooks(webhooks []*models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) {
for _, conf := range webhooks {
retryCount := 0
for retryCount < 3 {
needRetry := sendWebhook(conf, event, stats)
if !needRetry {
break
}
retryCount++
time.Sleep(time.Minute * 1 * time.Duration(retryCount))
}
}
}
func BatchSendWebhooks(webhooks []*models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) {
for _, conf := range webhooks {
logger.Infof("push event:%+v to queue:%v", event, conf)
PushEvent(conf, event, stats)
}
}
var EventQueue = make(map[string]*WebhookQueue)
var CallbackEventQueue = make(map[string]*WebhookQueue)
var CallbackEventQueueLock sync.RWMutex
var EventQueueLock sync.RWMutex
const QueueMaxSize = 100000
type WebhookQueue struct {
list *SafeListLimited
closeCh chan struct{}
}
func PushEvent(webhook *models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) {
EventQueueLock.RLock()
queue := EventQueue[webhook.Url]
EventQueueLock.RUnlock()
if queue == nil {
queue = &WebhookQueue{
list: NewSafeListLimited(QueueMaxSize),
closeCh: make(chan struct{}),
}
EventQueueLock.Lock()
EventQueue[webhook.Url] = queue
EventQueueLock.Unlock()
StartConsumer(queue, webhook.Batch, webhook, stats)
}
succ := queue.list.PushFront(event)
if !succ {
stats.AlertNotifyErrorTotal.WithLabelValues("push_event_queue").Inc()
logger.Warningf("Write channel(%s) full, current channel size: %d event:%v", webhook.Url, queue.list.Len(), event)
}
}
func StartConsumer(queue *WebhookQueue, popSize int, webhook *models.Webhook, stats *astats.Stats) {
for {
select {
case <-queue.closeCh:
logger.Infof("event queue:%v closed", queue)
return
default:
events := queue.list.PopBack(popSize)
if len(events) == 0 {
time.Sleep(time.Millisecond * 400)
continue
}
retryCount := 0
for retryCount < webhook.RetryCount {
needRetry := sendWebhook(webhook, events, stats)
if !needRetry {
break
}
retryCount++
time.Sleep(time.Second * time.Duration(webhook.RetryInterval) * time.Duration(retryCount))
}
}
stats.AlertNotifyTotal.WithLabelValues("webhook").Inc()
var resp *http.Response
resp, err = client.Do(req)
if err != nil {
stats.AlertNotifyErrorTotal.WithLabelValues("webhook").Inc()
logger.Errorf("event_webhook_fail, ruleId: [%d], eventId: [%d], url: [%s], error: [%s]", event.RuleId, event.Id, conf.Url, err)
continue
}
var body []byte
if resp.Body != nil {
defer resp.Body.Close()
body, _ = io.ReadAll(resp.Body)
}
logger.Debugf("event_webhook_succ, url: %s, response code: %d, body: %s event:%+v", conf.Url, resp.StatusCode, string(body), event)
}
}

View File

@@ -0,0 +1,111 @@
package sender
import (
"container/list"
"sync"
"github.com/ccfos/nightingale/v6/models"
)
type SafeList struct {
sync.RWMutex
L *list.List
}
func NewSafeList() *SafeList {
return &SafeList{L: list.New()}
}
func (sl *SafeList) PushFront(v interface{}) *list.Element {
sl.Lock()
e := sl.L.PushFront(v)
sl.Unlock()
return e
}
func (sl *SafeList) PushFrontBatch(vs []interface{}) {
sl.Lock()
for _, item := range vs {
sl.L.PushFront(item)
}
sl.Unlock()
}
func (sl *SafeList) PopBack(max int) []*models.AlertCurEvent {
sl.Lock()
count := sl.L.Len()
if count == 0 {
sl.Unlock()
return []*models.AlertCurEvent{}
}
if count > max {
count = max
}
items := make([]*models.AlertCurEvent, 0, count)
for i := 0; i < count; i++ {
item := sl.L.Remove(sl.L.Back())
sample, ok := item.(*models.AlertCurEvent)
if ok {
items = append(items, sample)
}
}
sl.Unlock()
return items
}
func (sl *SafeList) RemoveAll() {
sl.Lock()
sl.L.Init()
sl.Unlock()
}
func (sl *SafeList) Len() int {
sl.RLock()
size := sl.L.Len()
sl.RUnlock()
return size
}
// SafeListLimited wraps SafeList with a soft capacity bound: pushes are
// rejected once the current length has reached maxSize. The bound is
// checked once per call, so PushFrontBatch may overshoot maxSize by up to
// len(vs)-1 elements.
type SafeListLimited struct {
	maxSize int
	SL      *SafeList
}

// NewSafeListLimited returns an empty SafeListLimited bounded at maxSize.
func NewSafeListLimited(maxSize int) *SafeListLimited {
	return &SafeListLimited{
		maxSize: maxSize,
		SL:      NewSafeList(),
	}
}

// PopBack removes up to max events from the tail of the underlying list.
func (sll *SafeListLimited) PopBack(max int) []*models.AlertCurEvent {
	return sll.SL.PopBack(max)
}

// PushFront inserts v at the head; it reports false (and drops v) when the
// list is already at capacity.
func (sll *SafeListLimited) PushFront(v interface{}) bool {
	if sll.full() {
		return false
	}
	sll.SL.PushFront(v)
	return true
}

// PushFrontBatch inserts all of vs at the head; it reports false (and drops
// the whole batch) when the list is already at capacity.
func (sll *SafeListLimited) PushFrontBatch(vs []interface{}) bool {
	if sll.full() {
		return false
	}
	sll.SL.PushFrontBatch(vs)
	return true
}

// RemoveAll drops every element from the underlying list.
func (sll *SafeListLimited) RemoveAll() {
	sll.SL.RemoveAll()
}

// Len reports the current number of elements.
func (sll *SafeListLimited) Len() int {
	return sll.SL.Len()
}

// full reports whether the list has reached its configured capacity.
func (sll *SafeListLimited) full() bool {
	return sll.SL.Len() >= sll.maxSize
}

View File

@@ -18,20 +18,28 @@ var MetricDesc MetricDescType
// GetMetricDesc , if metric is not registered, empty string will be returned
func GetMetricDesc(lang, metric string) string {
var m map[string]string
if lang == "zh" {
m = MetricDesc.Zh
} else {
switch lang {
case "en":
m = MetricDesc.En
default:
m = MetricDesc.Zh
}
if m != nil {
if desc, has := m[metric]; has {
if desc, ok := m[metric]; ok {
return desc
}
}
return MetricDesc.CommonDesc[metric]
}
if MetricDesc.CommonDesc != nil {
if desc, ok := MetricDesc.CommonDesc[metric]; ok {
return desc
}
}
return ""
}
func LoadMetricsYaml(configDir, metricsYamlFile string) error {
fp := metricsYamlFile
if fp == "" {

View File

@@ -78,6 +78,7 @@ ops:
- "/dashboards/del"
- "/embedded-dashboards/put"
- "/embedded-dashboards"
- "/public-dashboards"
- name: alert
cname: 告警规则

View File

@@ -107,7 +107,7 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
go version.GetGithubVersion()
alertrtRouter := alertrt.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors)
centerRouter := centerrt.New(config.HTTP, config.Center, config.Alert, cconf.Operations, dsCache, notifyConfigCache, promClients, tdengineClients,
centerRouter := centerrt.New(config.HTTP, config.Center, config.Alert, config.Ibex, cconf.Operations, dsCache, notifyConfigCache, promClients, tdengineClients,
redis, sso, ctx, metas, idents, targetCache, userCache, userGroupCache)
pushgwRouter := pushgwrt.New(config.HTTP, config.Pushgw, config.Alert, targetCache, busiGroupCache, idents, metas, writers, ctx)

View File

@@ -13,6 +13,7 @@ import (
"github.com/ccfos/nightingale/v6/center/cstats"
"github.com/ccfos/nightingale/v6/center/metas"
"github.com/ccfos/nightingale/v6/center/sso"
"github.com/ccfos/nightingale/v6/conf"
_ "github.com/ccfos/nightingale/v6/front/statik"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/pkg/aop"
@@ -34,6 +35,7 @@ import (
type Router struct {
HTTP httpx.Config
Center cconf.Center
Ibex conf.Ibex
Alert aconf.Alert
Operations cconf.Operation
DatasourceCache *memsto.DatasourceCacheType
@@ -48,13 +50,15 @@ type Router struct {
UserCache *memsto.UserCacheType
UserGroupCache *memsto.UserGroupCacheType
Ctx *ctx.Context
HeartbeatHook HeartbeatHookFunc
}
func New(httpConfig httpx.Config, center cconf.Center, alert aconf.Alert, operations cconf.Operation, ds *memsto.DatasourceCacheType, ncc *memsto.NotifyConfigCacheType, pc *prom.PromClientMap, tdendgineClients *tdengine.TdengineClientMap, redis storage.Redis, sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set, idents *idents.Set, tc *memsto.TargetCacheType, uc *memsto.UserCacheType, ugc *memsto.UserGroupCacheType) *Router {
func New(httpConfig httpx.Config, center cconf.Center, alert aconf.Alert, ibex conf.Ibex, operations cconf.Operation, ds *memsto.DatasourceCacheType, ncc *memsto.NotifyConfigCacheType, pc *prom.PromClientMap, tdendgineClients *tdengine.TdengineClientMap, redis storage.Redis, sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set, idents *idents.Set, tc *memsto.TargetCacheType, uc *memsto.UserCacheType, ugc *memsto.UserGroupCacheType) *Router {
return &Router{
HTTP: httpConfig,
Center: center,
Alert: alert,
Ibex: ibex,
Operations: operations,
DatasourceCache: ds,
NotifyConfigCache: ncc,
@@ -68,6 +72,7 @@ func New(httpConfig httpx.Config, center cconf.Center, alert aconf.Alert, operat
UserCache: uc,
UserGroupCache: ugc,
Ctx: ctx,
HeartbeatHook: func(ident string) map[string]interface{} { return nil },
}
}
@@ -91,7 +96,9 @@ func languageDetector(i18NHeaderKey string) gin.HandlerFunc {
if headerKey != "" {
lang := c.GetHeader(headerKey)
if lang != "" {
if strings.HasPrefix(lang, "zh") {
if strings.HasPrefix(lang, "zh_HK") {
c.Request.Header.Set("X-Language", "zh_HK")
} else if strings.HasPrefix(lang, "zh") {
c.Request.Header.Set("X-Language", "zh_CN")
} else if strings.HasPrefix(lang, "en") {
c.Request.Header.Set("X-Language", "en")
@@ -112,7 +119,7 @@ func (rt *Router) configNoRoute(r *gin.Engine, fs *http.FileSystem) {
suffix := arr[len(arr)-1]
switch suffix {
case "png", "jpeg", "jpg", "svg", "ico", "gif", "css", "js", "html", "htm", "gz", "zip", "map", "ttf":
case "png", "jpeg", "jpg", "svg", "ico", "gif", "css", "js", "html", "htm", "gz", "zip", "map", "ttf", "md":
if !rt.Center.UseFileAssets {
c.FileFromFS(c.Request.URL.Path, *fs)
} else {
@@ -312,7 +319,9 @@ func (rt *Router) Config(r *gin.Engine) {
pages.PUT("/busi-group/:id/alert-rules/fields", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.bgrw(), rt.alertRulePutFields)
pages.PUT("/busi-group/:id/alert-rule/:arid", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.alertRulePutByFE)
pages.GET("/alert-rule/:arid", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRuleGet)
pages.GET("/alert-rule/:arid/pure", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRulePureGet)
pages.PUT("/busi-group/alert-rule/validate", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.alertRuleValidation)
pages.POST("/relabel-test", rt.auth(), rt.user(), rt.relabelTest)
pages.GET("/busi-groups/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules"), rt.recordingRuleGetsByGids)
pages.GET("/busi-group/:id/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules"), rt.recordingRuleGets)
@@ -372,8 +381,8 @@ func (rt *Router) Config(r *gin.Engine) {
pages.GET("/busi-group/:id/tasks", rt.auth(), rt.user(), rt.perm("/job-tasks"), rt.bgro(), rt.taskGets)
pages.POST("/busi-group/:id/tasks", rt.auth(), rt.user(), rt.perm("/job-tasks/add"), rt.bgrw(), rt.taskAdd)
pages.GET("/servers", rt.auth(), rt.user(), rt.perm("/help/servers"), rt.serversGet)
pages.GET("/server-clusters", rt.auth(), rt.user(), rt.perm("/help/servers"), rt.serverClustersGet)
pages.GET("/servers", rt.auth(), rt.user(), rt.serversGet)
pages.GET("/server-clusters", rt.auth(), rt.user(), rt.serverClustersGet)
pages.POST("/datasource/list", rt.auth(), rt.user(), rt.datasourceList)
pages.POST("/datasource/plugin/list", rt.auth(), rt.pluginList)
@@ -423,6 +432,9 @@ func (rt *Router) Config(r *gin.Engine) {
pages.PUT("/es-index-pattern", rt.auth(), rt.admin(), rt.esIndexPatternPut)
pages.DELETE("/es-index-pattern", rt.auth(), rt.admin(), rt.esIndexPatternDel)
pages.GET("/embedded-dashboards", rt.auth(), rt.user(), rt.perm("/embedded-dashboards"), rt.embeddedDashboardsGet)
pages.PUT("/embedded-dashboards", rt.auth(), rt.user(), rt.perm("/embedded-dashboards/put"), rt.embeddedDashboardsPut)
pages.GET("/user-variable-configs", rt.auth(), rt.user(), rt.perm("/help/variable-configs"), rt.userVariableConfigGets)
pages.POST("/user-variable-config", rt.auth(), rt.user(), rt.perm("/help/variable-configs"), rt.userVariableConfigAdd)
pages.PUT("/user-variable-config/:id", rt.auth(), rt.user(), rt.perm("/help/variable-configs"), rt.userVariableConfigPut)

View File

@@ -1,14 +1,18 @@
package router
import (
"fmt"
"net/http"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/ccfos/nightingale/v6/pushgw/writer"
"github.com/gin-gonic/gin"
"github.com/prometheus/prometheus/prompb"
"github.com/toolkits/pkg/ginx"
"github.com/toolkits/pkg/i18n"
"github.com/toolkits/pkg/str"
@@ -316,6 +320,20 @@ func (rt *Router) alertRuleGet(c *gin.Context) {
ginx.NewRender(c).Data(ar, err)
}
// alertRulePureGet returns the alert rule identified by the :arid URL
// parameter as stored, without any extra post-processing, responding 404
// when no such rule exists.
func (rt *Router) alertRulePureGet(c *gin.Context) {
	id := ginx.UrlParamInt64(c, "arid")

	rule, err := models.AlertRuleGetById(rt.Ctx, id)
	ginx.Dangerous(err)

	if rule == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such AlertRule")
		return
	}

	// err is necessarily nil past Dangerous, so render the rule directly.
	ginx.NewRender(c).Data(rule, nil)
}
// pre validation before save rule
func (rt *Router) alertRuleValidation(c *gin.Context) {
var f models.AlertRule //new
@@ -388,3 +406,50 @@ func (rt *Router) alertRuleCallbacks(c *gin.Context) {
ginx.NewRender(c).Data(callbacks, nil)
}
// alertRuleTestForm carries the input of the relabel-test API: a set of
// relabel configs and the tags ("key=value" strings) to run them against.
type alertRuleTestForm struct {
	Configs []*pconf.RelabelConfig `json:"configs"`
	Tags    []string               `json:"tags"`
}

// relabelTest applies the submitted relabel configs to the submitted tags
// and returns the relabeled tags, letting users preview a relabel rule
// before saving it. It bombs with 400 on an empty form or malformed tags.
func (rt *Router) relabelTest(c *gin.Context) {
	var f alertRuleTestForm
	ginx.BindJSON(c, &f)

	if len(f.Tags) == 0 || len(f.Configs) == 0 {
		ginx.Bomb(http.StatusBadRequest, "relabel config is empty")
	}

	labels := make([]prompb.Label, len(f.Tags))
	for i, tag := range f.Tags {
		// Split on the first '=' only, so label values may themselves
		// contain '=' (e.g. base64-encoded or URL-like values).
		name, value, found := strings.Cut(tag, "=")
		if !found {
			ginx.Bomb(http.StatusBadRequest, "tag:%s format error", tag)
		}
		labels[i] = prompb.Label{Name: name, Value: value}
	}

	// Fill in the same defaults Prometheus applies to relabel configs.
	for i := 0; i < len(f.Configs); i++ {
		if f.Configs[i].Replacement == "" {
			f.Configs[i].Replacement = "$1"
		}
		if f.Configs[i].Separator == "" {
			f.Configs[i].Separator = ";"
		}
		if f.Configs[i].Regex == "" {
			f.Configs[i].Regex = "(.*)"
		}
	}

	relabels := writer.Process(labels, f.Configs...)

	// A nil slice is kept intentionally so an all-dropped result renders
	// the same JSON as before.
	var tags []string
	for _, label := range relabels {
		tags = append(tags, fmt.Sprintf("%s=%s", label.Name, label.Value))
	}
	ginx.NewRender(c).Data(tags, nil)
}

View File

@@ -1,13 +1,16 @@
package router
import (
"github.com/ccfos/nightingale/v6/models"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/ginx"
)
const EMBEDDEDDASHBOARD = "embedded-dashboards"
func (rt *Router) configsGet(c *gin.Context) {
prefix := ginx.QueryStr(c, "prefix", "")
limit := ginx.QueryInt(c, "limit", 10)
@@ -33,6 +36,18 @@ func (rt *Router) configPutByKey(c *gin.Context) {
ginx.NewRender(c).Message(models.ConfigsSetWithUname(rt.Ctx, f.Ckey, f.Cval, username))
}
// embeddedDashboardsGet returns the stored embedded-dashboards config entry.
func (rt *Router) embeddedDashboardsGet(c *gin.Context) {
	render := ginx.NewRender(c)
	cfg, err := models.ConfigsGet(rt.Ctx, EMBEDDEDDASHBOARD)
	render.Data(cfg, err)
}
// embeddedDashboardsPut overwrites the embedded-dashboards config entry
// with the submitted value, recording the operator's username.
func (rt *Router) embeddedDashboardsPut(c *gin.Context) {
	var payload models.Configs
	ginx.BindJSON(c, &payload)

	operator := c.MustGet("username").(string)
	ginx.NewRender(c).Message(models.ConfigsSetWithUname(rt.Ctx, EMBEDDEDDASHBOARD, payload.Cval, operator))
}
func (rt *Router) configsDel(c *gin.Context) {
var f idsForm
ginx.BindJSON(c, &f)

View File

@@ -3,19 +3,33 @@ package router
import (
"compress/gzip"
"encoding/json"
"fmt"
"io/ioutil"
"sort"
"strings"
"time"
"github.com/ccfos/nightingale/v6/center/metas"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pushgw/idents"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/ginx"
"github.com/toolkits/pkg/logger"
)
// HeartbeatHookFunc is invoked after a heartbeat is processed; it receives
// the reporting host's ident and returns extra data to include in the
// heartbeat response.
type HeartbeatHookFunc func(ident string) map[string]interface{}

// heartbeat is the HTTP handler for agent heartbeats: it delegates the
// heavy lifting to HandleHeartbeat, then renders whatever the configured
// HeartbeatHook produces for this host.
func (rt *Router) heartbeat(c *gin.Context) {
	meta, err := HandleHeartbeat(c, rt.Ctx, rt.Alert.Heartbeat.EngineName, rt.MetaSet, rt.IdentSet, rt.TargetCache)
	ginx.Dangerous(err)

	extra := rt.HeartbeatHook(meta.Hostname)
	ginx.NewRender(c).Data(extra, err)
}
func HandleHeartbeat(c *gin.Context, ctx *ctx.Context, engineName string, metaSet *metas.Set, identSet *idents.Set, targetCache *memsto.TargetCacheType) (models.HostMeta, error) {
var bs []byte
var err error
var r *gzip.Reader
@@ -24,7 +38,7 @@ func (rt *Router) heartbeat(c *gin.Context) {
r, err = gzip.NewReader(c.Request.Body)
if err != nil {
c.String(400, err.Error())
return
return req, err
}
defer r.Close()
bs, err = ioutil.ReadAll(r)
@@ -32,11 +46,19 @@ func (rt *Router) heartbeat(c *gin.Context) {
} else {
defer c.Request.Body.Close()
bs, err = ioutil.ReadAll(c.Request.Body)
ginx.Dangerous(err)
if err != nil {
return req, err
}
}
err = json.Unmarshal(bs, &req)
ginx.Dangerous(err)
if err != nil {
return req, err
}
if req.Hostname == "" {
return req, fmt.Errorf("hostname is required", 400)
}
// maybe from pushgw
if req.Offset == 0 {
@@ -48,51 +70,65 @@ func (rt *Router) heartbeat(c *gin.Context) {
}
if req.EngineName == "" {
req.EngineName = rt.Alert.Heartbeat.EngineName
req.EngineName = engineName
}
rt.MetaSet.Set(req.Hostname, req)
metaSet.Set(req.Hostname, req)
var items = make(map[string]struct{})
items[req.Hostname] = struct{}{}
rt.IdentSet.MSet(items)
identSet.MSet(items)
if target, has := rt.TargetCache.Get(req.Hostname); has && target != nil {
if target, has := targetCache.Get(req.Hostname); has && target != nil {
gid := ginx.QueryInt64(c, "gid", 0)
hostIp := strings.TrimSpace(req.HostIp)
filed := make(map[string]interface{})
field := make(map[string]interface{})
if gid != 0 && gid != target.GroupId {
filed["group_id"] = gid
field["group_id"] = gid
}
if hostIp != "" && hostIp != target.HostIp {
filed["host_ip"] = hostIp
field["host_ip"] = hostIp
}
if len(req.GlobalLabels) > 0 {
tagsMap := target.GetTagsMap()
tagNeedUpdate := false
for k, v := range req.GlobalLabels {
if v == "" {
continue
}
if tagv, ok := tagsMap[k]; !ok || tagv != v {
tagNeedUpdate = true
tagsMap[k] = v
}
}
if tagNeedUpdate {
lst := []string{}
for k, v := range req.GlobalLabels {
for k, v := range tagsMap {
lst = append(lst, k+"="+v)
}
sort.Strings(lst)
labels := strings.Join(lst, " ")
if target.Tags != labels {
filed["tags"] = labels
}
labels := strings.Join(lst, " ") + " "
field["tags"] = labels
}
if req.EngineName != "" && req.EngineName != target.EngineName {
filed["engine_name"] = req.EngineName
field["engine_name"] = req.EngineName
}
if len(filed) > 0 {
err := target.UpdateFieldsMap(rt.Ctx, filed)
if req.AgentVersion != "" && req.AgentVersion != target.AgentVersion {
field["agent_version"] = req.AgentVersion
}
if len(field) > 0 {
err := target.UpdateFieldsMap(ctx, field)
if err != nil {
logger.Errorf("update target fields failed, err: %v", err)
}
}
logger.Debugf("heartbeat field:%+v target: %v", filed, *target)
logger.Debugf("heartbeat field:%+v target: %v", field, *target)
}
ginx.NewRender(c).Message(err)
return req, nil
}

View File

@@ -55,12 +55,12 @@ func (rt *Router) loginPost(c *gin.Context) {
var err error
lc := rt.Sso.LDAP.Copy()
if lc.Enable {
user, err = ldapx.LdapLogin(rt.Ctx, f.Username, authPassWord, lc.DefaultRoles, lc)
user, err = ldapx.LdapLogin(rt.Ctx, f.Username, authPassWord, lc.DefaultRoles, lc.DefaultTeams, lc)
if err != nil {
logger.Debugf("ldap login failed: %v username: %s", err, f.Username)
var errLoginInN9e error
// to use n9e as the minimum guarantee for login
if user, errLoginInN9e = models.PassLogin(rt.Ctx, f.Username, authPassWord); errLoginInN9e != nil {
if user, errLoginInN9e = models.PassLogin(rt.Ctx, rt.Redis, f.Username, authPassWord); errLoginInN9e != nil {
ginx.NewRender(c).Message("ldap login failed: %v; n9e login failed: %v", err, errLoginInN9e)
return
}
@@ -68,7 +68,7 @@ func (rt *Router) loginPost(c *gin.Context) {
user.RolesLst = strings.Fields(user.Roles)
}
} else {
user, err = models.PassLogin(rt.Ctx, f.Username, authPassWord)
user, err = models.PassLogin(rt.Ctx, rt.Redis, f.Username, authPassWord)
ginx.Dangerous(err)
}
@@ -262,6 +262,15 @@ func (rt *Router) loginCallback(c *gin.Context) {
user.FullSsoFields("oidc", ret.Username, ret.Nickname, ret.Phone, ret.Email, rt.Sso.OIDC.DefaultRoles)
// create user from oidc
ginx.Dangerous(user.Add(rt.Ctx))
if len(rt.Sso.OIDC.DefaultTeams) > 0 {
for _, gid := range rt.Sso.OIDC.DefaultTeams {
err = models.UserGroupMemberAdd(rt.Ctx, gid, user.Id)
if err != nil {
logger.Errorf("user:%v UserGroupMemberAdd: %s", user, err)
}
}
}
}
// set user login state

View File

@@ -95,7 +95,8 @@ func (rt *Router) alertMuteAddByService(c *gin.Context) {
var f models.AlertMute
ginx.BindJSON(c, &f)
ginx.NewRender(c).Message(f.Add(rt.Ctx))
err := f.Add(rt.Ctx)
ginx.NewRender(c).Data(f.Id, err)
}
func (rt *Router) alertMuteDel(c *gin.Context) {

View File

@@ -90,7 +90,8 @@ func (rt *Router) notifyChannelPuts(c *gin.Context) {
var notifyChannels []models.NotifyChannel
ginx.BindJSON(c, &notifyChannels)
channels := []string{models.Dingtalk, models.Wecom, models.Feishu, models.Mm, models.Telegram, models.Email}
channels := []string{models.Dingtalk, models.Wecom, models.Feishu, models.Mm, models.Telegram,
models.Email, models.Lark, models.LarkCard}
m := make(map[string]struct{})
for _, v := range notifyChannels {
@@ -126,7 +127,8 @@ func (rt *Router) notifyContactPuts(c *gin.Context) {
var notifyContacts []models.NotifyContact
ginx.BindJSON(c, &notifyContacts)
keys := []string{models.DingtalkKey, models.WecomKey, models.FeishuKey, models.MmKey, models.TelegramKey}
keys := []string{models.DingtalkKey, models.WecomKey, models.FeishuKey, models.MmKey,
models.TelegramKey, models.LarkKey}
m := make(map[string]struct{})
for _, v := range notifyContacts {

View File

@@ -4,9 +4,11 @@ import (
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/flashduty"
"github.com/ccfos/nightingale/v6/pkg/ormx"
"github.com/ccfos/nightingale/v6/pkg/secu"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/ginx"
"github.com/toolkits/pkg/logger"
)
func (rt *Router) selfProfileGet(c *gin.Context) {
@@ -58,5 +60,25 @@ func (rt *Router) selfPasswordPut(c *gin.Context) {
var f selfPasswordForm
ginx.BindJSON(c, &f)
user := c.MustGet("user").(*models.User)
ginx.NewRender(c).Message(user.ChangePassword(rt.Ctx, f.OldPass, f.NewPass))
newPassWord := f.NewPass
oldPassWord := f.OldPass
if rt.HTTP.RSA.OpenRSA {
var err error
newPassWord, err = secu.Decrypt(f.NewPass, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord)
if err != nil {
logger.Errorf("RSA Decrypt failed: %v username: %s", err, user.Username)
ginx.NewRender(c).Message(err)
return
}
oldPassWord, err = secu.Decrypt(f.OldPass, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord)
if err != nil {
logger.Errorf("RSA Decrypt failed: %v username: %s", err, user.Username)
ginx.NewRender(c).Message(err)
return
}
}
ginx.NewRender(c).Message(user.ChangePassword(rt.Ctx, oldPassWord, newPassWord))
}

View File

@@ -49,6 +49,9 @@ func (rt *Router) targetGets(c *gin.Context) {
downtime := ginx.QueryInt64(c, "downtime", 0)
dsIds := queryDatasourceIds(c)
order := ginx.QueryStr(c, "order", "ident")
desc := ginx.QueryBool(c, "desc", false)
var err error
if len(bgids) == 0 {
user := c.MustGet("user").(*models.User)
@@ -62,11 +65,17 @@ func (rt *Router) targetGets(c *gin.Context) {
bgids = append(bgids, 0)
}
}
total, err := models.TargetTotal(rt.Ctx, bgids, dsIds, query, downtime)
options := []models.BuildTargetWhereOption{
models.BuildTargetWhereWithBgids(bgids),
models.BuildTargetWhereWithDsIds(dsIds),
models.BuildTargetWhereWithQuery(query),
models.BuildTargetWhereWithDowntime(downtime),
}
total, err := models.TargetTotal(rt.Ctx, options...)
ginx.Dangerous(err)
list, err := models.TargetGets(rt.Ctx, bgids, dsIds, query, downtime, limit, ginx.Offset(c, limit))
list, err := models.TargetGets(rt.Ctx, limit,
ginx.Offset(c, limit), order, desc, options...)
ginx.Dangerous(err)
if err == nil {

View File

@@ -8,6 +8,7 @@ import (
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/ginx"
"github.com/toolkits/pkg/i18n"
"github.com/toolkits/pkg/str"
)
@@ -104,6 +105,11 @@ func (rt *Router) taskRecordAdd(c *gin.Context) {
}
func (rt *Router) taskAdd(c *gin.Context) {
if !rt.Ibex.Enable {
ginx.Bomb(400, i18n.Sprintf(c.GetHeader("X-Language"), "This functionality has not been enabled. Please contact the system administrator to activate it."))
return
}
var f models.TaskForm
ginx.BindJSON(c, &f)

View File

@@ -10,6 +10,7 @@ import (
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/ginx"
"github.com/toolkits/pkg/i18n"
"github.com/toolkits/pkg/str"
)
@@ -118,6 +119,11 @@ type taskTplForm struct {
}
func (rt *Router) taskTplAdd(c *gin.Context) {
if !rt.Ibex.Enable {
ginx.Bomb(400, i18n.Sprintf(c.GetHeader("X-Language"), "This functionality has not been enabled. Please contact the system administrator to activate it."))
return
}
var f taskTplForm
ginx.BindJSON(c, &f)

View File

@@ -7,9 +7,11 @@ import (
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/flashduty"
"github.com/ccfos/nightingale/v6/pkg/ormx"
"github.com/ccfos/nightingale/v6/pkg/secu"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/ginx"
"github.com/toolkits/pkg/logger"
)
func (rt *Router) userBusiGroupsGets(c *gin.Context) {
@@ -46,6 +48,7 @@ func (rt *Router) userGets(c *gin.Context) {
order := ginx.QueryStr(c, "order", "username")
desc := ginx.QueryBool(c, "desc", false)
rt.UserCache.UpdateUsersLastActiveTime()
total, err := models.UserTotal(rt.Ctx, query, stime, etime)
ginx.Dangerous(err)
@@ -76,7 +79,18 @@ func (rt *Router) userAddPost(c *gin.Context) {
var f userAddForm
ginx.BindJSON(c, &f)
password, err := models.CryptoPass(rt.Ctx, f.Password)
authPassWord := f.Password
if rt.HTTP.RSA.OpenRSA {
decPassWord, err := secu.Decrypt(f.Password, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord)
if err != nil {
logger.Errorf("RSA Decrypt failed: %v username: %s", err, f.Username)
ginx.NewRender(c).Message(err)
return
}
authPassWord = decPassWord
}
password, err := models.CryptoPass(rt.Ctx, authPassWord)
ginx.Dangerous(err)
if len(f.Roles) == 0 {
@@ -177,7 +191,18 @@ func (rt *Router) userPasswordPut(c *gin.Context) {
target := User(rt.Ctx, ginx.UrlParamInt64(c, "id"))
cryptoPass, err := models.CryptoPass(rt.Ctx, f.Password)
authPassWord := f.Password
if rt.HTTP.RSA.OpenRSA {
decPassWord, err := secu.Decrypt(f.Password, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord)
if err != nil {
logger.Errorf("RSA Decrypt failed: %v username: %s", err, target.Username)
ginx.NewRender(c).Message(err)
return
}
authPassWord = decPassWord
}
cryptoPass, err := models.CryptoPass(rt.Ctx, authPassWord)
ginx.Dangerous(err)
ginx.NewRender(c).Message(target.UpdatePassword(rt.Ctx, cryptoPass, c.MustGet("username").(string)))

View File

@@ -78,6 +78,6 @@ enable = true
## ibex flush interval
interval = "1000ms"
## n9e ibex server rpc address
servers = ["ibex:20090"]
servers = ["nightingale:20090"]
## temp script dir
meta_dir = "./meta"

View File

@@ -8,7 +8,7 @@ CREATE TABLE users (
portrait varchar(255) not null default '',
roles varchar(255) not null,
contacts varchar(1024),
maintainer boolean not null default false,
maintainer int not null default 0,
belong varchar(16) not null default '',
last_active_time bigint not null default 0,
create_at bigint not null default 0,
@@ -60,8 +60,8 @@ CREATE TABLE configs (
ckey varchar(191) not null,
cval text not null default '',
note varchar(1024) not null default '',
external boolean not null default false,
encrypted boolean not null default false,
external int not null default 0,
encrypted int not null default 0,
create_at bigint not null default 0,
create_by varchar(64) not null default '',
update_at bigint not null default 0,
@@ -378,7 +378,7 @@ COMMENT ON COLUMN alert_mute.disabled IS '0:enabled 1:disabled';
CREATE TABLE alert_subscribe (
id bigserial,
name varchar(255) not null default '',
disabled boolean not null default false,
disabled int not null default 0,
group_id bigint not null default 0,
prod varchar(255) not null default '',
cate varchar(128) not null,
@@ -397,7 +397,7 @@ CREATE TABLE alert_subscribe (
rule_ids VARCHAR(1024) DEFAULT '',
webhooks text not null,
extra_config text not null,
redefine_webhooks boolean default false,
redefine_webhooks int default 0,
for_duration bigint not null default 0,
create_at bigint not null default 0,
create_by varchar(64) not null default '',
@@ -744,7 +744,7 @@ CREATE TABLE datasource
status varchar(255) not null default '',
http varchar(4096) not null default '',
auth varchar(8192) not null default '',
is_default smallint not null default 0,
is_default boolean not null default false,
created_at bigint not null default 0,
created_by varchar(64) not null default '',
updated_at bigint not null default 0,
@@ -845,7 +845,7 @@ CREATE TABLE metric_filter (
update_by VARCHAR(191) NOT NULL DEFAULT ''
);
CREATE INDEX idx_name ON metric_filter (name);
CREATE INDEX idx_metric_filter_name ON metric_filter (name);
CREATE TABLE board_busigroup (
busi_group_id BIGINT NOT NULL DEFAULT 0,
@@ -870,6 +870,7 @@ CREATE INDEX idx_ident ON builtin_components (ident);
CREATE TABLE builtin_payloads (
id BIGSERIAL PRIMARY KEY,
type VARCHAR(191) NOT NULL,
uuid BIGINT NOT NULL DEFAULT 0,
component VARCHAR(191) NOT NULL,
cate VARCHAR(191) NOT NULL,
name VARCHAR(191) NOT NULL,
@@ -882,6 +883,6 @@ CREATE TABLE builtin_payloads (
);
CREATE INDEX idx_component ON builtin_payloads (component);
CREATE INDEX idx_name ON builtin_payloads (name);
CREATE INDEX idx_builtin_payloads_name ON builtin_payloads (name);
CREATE INDEX idx_cate ON builtin_payloads (cate);
CREATE INDEX idx_type ON builtin_payloads (type);

View File

@@ -1,11 +1,25 @@
#### {{if .IsRecovered}}<font color="#008800">S{{.Severity}} - Recovered - {{.RuleName}}</font>{{else}}<font color="#FF0000">S{{.Severity}} - Triggered - {{.RuleName}}</font>{{end}}
#### {{if .IsRecovered}}<font color="#008800">💚{{.RuleName}}</font>{{else}}<font color="#FF0000">💔{{.RuleName}}</font>{{end}}
---
- **规则标题**: {{.RuleName}}{{if .RuleNote}}
- **规则备注**: {{.RuleNote}}{{end}}
- **监控指标**: {{.TagsJSON}}
- {{if .IsRecovered}}**恢复时间**{{timeformat .LastEvalTime}}{{else}}**触发时间**: {{timeformat .TriggerTime}}
- **触发时值**: {{.TriggerValue}}{{end}}
- **发送时间**: {{timestamp}}
{{$time_duration := sub now.Unix .FirstTriggerTime }}{{if .IsRecovered}}{{$time_duration = sub .LastEvalTime .FirstTriggerTime }}{{end}}
- **告警级别**: {{.Severity}}
{{- if .RuleNote}}
- **规则备注**: {{.RuleNote}}
{{- end}}
{{- if not .IsRecovered}}
- **当次触发时值**: {{.TriggerValue}}
- **当次触发时间**: {{timeformat .TriggerTime}}
- **告警持续时长**: {{humanizeDurationInterface $time_duration}}
{{- else}}
{{- if .AnnotationsJSON.recovery_value}}
- **恢复时值**: {{formatDecimal .AnnotationsJSON.recovery_value 4}}
{{- end}}
- **恢复时间**: {{timeformat .LastEvalTime}}
- **告警持续时长**: {{humanizeDurationInterface $time_duration}}
{{- end}}
- **告警事件标签**:
{{- range $key, $val := .TagsMap}}
{{- if ne $key "rulename" }}
- `{{$key}}`: `{{$val}}`
{{- end}}
{{- end}}

View File

@@ -397,6 +397,7 @@ CREATE TABLE `recording_rule` (
`disabled` tinyint(1) not null default 0 comment '0:enabled 1:disabled',
`prom_ql` varchar(8192) not null comment 'promql',
`prom_eval_interval` int not null comment 'evaluate interval',
`cron_pattern` varchar(255) default '' comment 'cron pattern',
`append_tags` varchar(255) default '' comment 'split by space: service=n9e mod=api',
`query_configs` text not null comment 'query configs',
`create_at` bigint default '0',
@@ -440,7 +441,7 @@ CREATE TABLE `alert_cur_event` (
`prom_for_duration` int not null comment 'prometheus for, unit:s',
`prom_ql` varchar(8192) not null comment 'promql',
`prom_eval_interval` int not null comment 'evaluate interval',
`callbacks` varchar(255) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
`callbacks` varchar(2048) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
`runbook_url` varchar(255),
`notify_recovered` tinyint(1) not null comment 'whether notify when recovery',
`notify_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
@@ -455,6 +456,7 @@ CREATE TABLE `alert_cur_event` (
`annotations` text not null comment 'annotations',
`rule_config` text not null comment 'annotations',
`tags` varchar(1024) not null default '' comment 'merge data_tags rule_tags, split by ,,',
`original_tags` text comment 'labels key=val,,k2=v2',
PRIMARY KEY (`id`),
KEY (`hash`),
KEY (`rule_id`),
@@ -480,7 +482,7 @@ CREATE TABLE `alert_his_event` (
`prom_for_duration` int not null comment 'prometheus for, unit:s',
`prom_ql` varchar(8192) not null comment 'promql',
`prom_eval_interval` int not null comment 'evaluate interval',
`callbacks` varchar(255) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
`callbacks` varchar(2048) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
`runbook_url` varchar(255),
`notify_recovered` tinyint(1) not null comment 'whether notify when recovery',
`notify_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
@@ -494,6 +496,7 @@ CREATE TABLE `alert_his_event` (
`recover_time` bigint not null default 0,
`last_eval_time` bigint not null default 0 comment 'for time filter',
`tags` varchar(1024) not null default '' comment 'merge data_tags rule_tags, split by ,,',
`original_tags` text comment 'labels key=val,,k2=v2',
`annotations` text not null comment 'annotations',
`rule_config` text not null comment 'annotations',
PRIMARY KEY (`id`),
@@ -524,6 +527,7 @@ CREATE TABLE `builtin_components` (
CREATE TABLE `builtin_payloads` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '''unique identifier''',
`uuid` bigint(20) NOT NULL COMMENT '''uuid of payload''',
`type` varchar(191) NOT NULL COMMENT '''type of payload''',
`component` varchar(191) NOT NULL COMMENT '''component of payload''',
`cate` varchar(191) NOT NULL COMMENT '''category of payload''',
@@ -538,6 +542,7 @@ CREATE TABLE `builtin_payloads` (
KEY `idx_component` (`component`),
KEY `idx_name` (`name`),
KEY `idx_cate` (`cate`),
KEY `idx_uuid` (`uuid`),
KEY `idx_type` (`type`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

View File

@@ -56,6 +56,7 @@ CREATE TABLE `builtin_components` (
CREATE TABLE `builtin_payloads` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '''unique identifier''',
`uuid` bigint(20) NOT NULL COMMENT '''uuid of payload''',
`type` varchar(191) NOT NULL COMMENT '''type of payload''',
`component` varchar(191) NOT NULL COMMENT '''component of payload''',
`cate` varchar(191) NOT NULL COMMENT '''category of payload''',
@@ -70,8 +71,16 @@ CREATE TABLE `builtin_payloads` (
KEY `idx_component` (`component`),
KEY `idx_name` (`name`),
KEY `idx_cate` (`cate`),
KEY `idx_uuid` (`uuid`),
KEY `idx_type` (`type`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
/* v7.0.0-beta.7 */
ALTER TABLE users ADD COLUMN last_active_time BIGINT NOT NULL DEFAULT 0;
ALTER TABLE users ADD COLUMN last_active_time BIGINT NOT NULL DEFAULT 0;
/* v7.0.0-beta.13 */
ALTER TABLE recording_rule ADD COLUMN cron_pattern VARCHAR(255) DEFAULT '' COMMENT 'cron pattern';
/* v7.0.0-beta.14 */
ALTER TABLE alert_cur_event ADD COLUMN original_tags TEXT COMMENT 'labels key=val,,k2=v2';
ALTER TABLE alert_his_event ADD COLUMN original_tags TEXT COMMENT 'labels key=val,,k2=v2';

7
go.mod
View File

@@ -8,7 +8,7 @@ require (
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc
github.com/dgrijalva/jwt-go v3.2.0+incompatible
github.com/expr-lang/expr v1.16.1
github.com/flashcatcloud/ibex v1.3.4
github.com/flashcatcloud/ibex v1.3.5
github.com/gin-contrib/pprof v1.4.0
github.com/gin-gonic/gin v1.9.1
github.com/go-ldap/ldap/v3 v3.4.4
@@ -85,6 +85,7 @@ require (
github.com/pquerna/cachecontrol v0.1.0 // indirect
github.com/prometheus/client_model v0.4.0 // indirect
github.com/prometheus/procfs v0.11.0 // indirect
github.com/robfig/cron/v3 v3.0.1
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.0 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
@@ -93,10 +94,10 @@ require (
go.uber.org/automaxprocs v1.5.2 // indirect
golang.org/x/arch v0.3.0 // indirect
golang.org/x/crypto v0.21.0 // indirect
golang.org/x/image v0.13.0 // indirect
golang.org/x/image v0.18.0 // indirect
golang.org/x/net v0.23.0 // indirect
golang.org/x/sys v0.18.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/text v0.16.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.33.0 // indirect
gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect

15
go.sum
View File

@@ -47,8 +47,8 @@ github.com/fatih/camelcase v1.0.0 h1:hxNvNX/xYBp0ovncs8WyWZrOrpBNub/JfaMvbURyft8
github.com/fatih/camelcase v1.0.0/go.mod h1:yN2Sb0lFhZJUdVvtELVWefmrXpuZESvPmqwoZc+/fpc=
github.com/fatih/structs v1.1.0 h1:Q7juDM0QtcnhCpeyLGQKyg4TOIghuNXrkL32pHAUMxo=
github.com/fatih/structs v1.1.0/go.mod h1:9NiDSp5zOcgEDl+j00MP/WkGVPOlPRLejGD8Ga6PJ7M=
github.com/flashcatcloud/ibex v1.3.4 h1:s5MgQmDIYR18liBKPNl96kC/h1jOTZjIOlUWeSx0710=
github.com/flashcatcloud/ibex v1.3.4/go.mod h1:T8hbMUySK2q6cXUaYp0AUVeKkU9Od2LjzwmB5lmTRBM=
github.com/flashcatcloud/ibex v1.3.5 h1:8GOOf5+aJT0TP/MC6izz7CO5JKJSdKVFBwL0vQp93Nc=
github.com/flashcatcloud/ibex v1.3.5/go.mod h1:T8hbMUySK2q6cXUaYp0AUVeKkU9Od2LjzwmB5lmTRBM=
github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
github.com/garyburd/redigo v1.6.2/go.mod h1:NR3MbYisc3/PwhQ00EMzDiPmrwpPxAn5GI05/YaO1SY=
@@ -250,6 +250,8 @@ github.com/rakyll/statik v0.1.7 h1:OF3QCZUuyPxuGEP7B4ypUa7sB/iHtqOTDYZXGM8KOdQ=
github.com/rakyll/statik v0.1.7/go.mod h1:AlZONWzMtEnMs7W4e/1LURLiI49pIMmp6V9Unghqrcc=
github.com/redis/go-redis/v9 v9.0.2 h1:BA426Zqe/7r56kCcvxYLWe1mkaz71LKF77GwgFzSxfE=
github.com/redis/go-redis/v9 v9.0.2/go.mod h1:/xDTe9EF1LM61hek62Poq2nzQSGj0xSrEtEHbBQevps=
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
github.com/robfig/go-cache v0.0.0-20130306151617-9fc39e0dbf62/go.mod h1:65XQgovT59RWatovFwnwocoUxiI/eENTnOY5GK3STuY=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
@@ -340,8 +342,9 @@ golang.org/x/crypto v0.21.0 h1:X31++rzVUdKhX5sWmSOFZxx8UW/ldWx55cbf08iNAMA=
golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs=
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 h1:MGwJjxBy0HJshjDNfLsYO8xppfqWlA5ZT9OhtUUhTNw=
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc=
golang.org/x/image v0.13.0 h1:3cge/F/QTkNLauhf2QoE9zp+7sr+ZcL4HnoZmdwg9sg=
golang.org/x/image v0.13.0/go.mod h1:6mmbMOeV28HuMTgA6OSRkdXKYw/t5W9Uwn2Yv1r3Yxk=
golang.org/x/image v0.18.0 h1:jGzIakQa/ZXI1I0Fxvaa9W7yP25TqT6cHIHn+6CqvSQ=
golang.org/x/image v0.18.0/go.mod h1:4yyo5vMFQjVjUcVk4jEQcU9MGy/rulF5WvUILseCM2E=
golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc=
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
@@ -372,7 +375,7 @@ golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -414,8 +417,8 @@ golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190425163242-31fd60d6bfdc/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=

View File

@@ -0,0 +1,971 @@
{
"name": "阿里云MySQL",
"tags": "阿里云 mysql",
"ident": "",
"configs": {
"panels": [
{
"type": "row",
"id": "1cb8caf3-ef35-4572-9ecc-71b9f063a685",
"name": "关键指标",
"collapsed": true,
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 0,
"i": "1cb8caf3-ef35-4572-9ecc-71b9f063a685",
"isResizable": false
},
"panels": []
},
{
"type": "timeseries",
"id": "5aad17df-354e-40de-a643-61da6668939b",
"layout": {
"h": 5,
"w": 24,
"x": 0,
"y": 1,
"i": "fcf9515d-3a56-4596-8b3a-d7d8631aa218",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_SlowQueries{instanceName=\"$instance\"}",
"legend": "{{instanceName}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "每秒慢查询数量(countS)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "row",
"id": "2b3a816e-94e2-4c9d-9bb8-770c458033db",
"name": "基础指标",
"collapsed": true,
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 6,
"i": "2b3a816e-94e2-4c9d-9bb8-770c458033db",
"isResizable": false
},
"panels": []
},
{
"type": "timeseries",
"id": "12d4a674-6d09-4b02-aa4f-d767531bd368",
"layout": {
"h": 4,
"w": 8,
"x": 0,
"y": 7,
"i": "baba4778-b950-4224-9dac-9ecda041f93b",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_CpuUsage{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CPU使用率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "55b17951-a4ae-46a7-a2d7-57db1414f6ff",
"layout": {
"h": 4,
"w": 8,
"x": 8,
"y": 7,
"i": "c4c248bd-21fb-4485-8235-f50640116e65",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MemoryUsage{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "内存使用率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "02c6af68-0e59-4f62-b0e8-80a9a9d0df82",
"layout": {
"h": 4,
"w": 8,
"x": 16,
"y": 7,
"i": "51cf9211-5e76-4176-b1ec-42929ccc6803",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_DiskUsage{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "磁盘使用率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "b72c5032-1ea0-4c87-9cfd-d21b374680f1",
"layout": {
"h": 4,
"w": 8,
"x": 0,
"y": 11,
"i": "b72c5032-1ea0-4c87-9cfd-d21b374680f1",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_ActiveSessions{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "活跃连接数",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "b518c9c4-f0e8-4712-ab67-be4521eeff0c",
"layout": {
"h": 4,
"w": 8,
"x": 8,
"y": 11,
"i": "ff589719-6072-488d-819d-6e080a6f3c60",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_ConnectionUsage{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "连接数使用率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "86c1f728-ac1e-402b-bea6-2e3979f472c3",
"layout": {
"h": 4,
"w": 8,
"x": 16,
"y": 11,
"i": "5d673c5d-1fbb-4df4-9ece-c991d053ca34",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_IOPSUsage{instanceName=\"$instance\"} ",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "IOPS使用率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off",
"standardOptions": {
"util": "percent"
}
}
}
]
},
{
"type": "timeseries",
"id": "dc874418-8d11-409c-96e8-e48fac2f6e20",
"layout": {
"h": 4,
"w": 8,
"x": 0,
"y": 15,
"i": "86915dd4-990c-41ba-b048-3da301d97327",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_NetworkInNew{instanceName=\"$instance\"}/ 8",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "网络流入带宽",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "bytesSecIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "b979878a-81a6-4c0d-960d-22a736d00655",
"layout": {
"h": 4,
"w": 8,
"x": 8,
"y": 15,
"i": "86f9e07f-85dc-44e0-8245-ca0a9b0dfa81",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_NetworkOutNew{instanceName=\"$instance\"}/ 8",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "网络流出带宽",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "bytesSecIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "row",
"id": "6d896a20-bf04-4dc7-94da-1394ef109848",
"name": "性能指标",
"collapsed": true,
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 19,
"i": "6d896a20-bf04-4dc7-94da-1394ef109848",
"isResizable": false
},
"panels": []
},
{
"type": "timeseries",
"id": "2e545b2b-130b-4829-a2d2-ee5305c302aa",
"layout": {
"h": 4,
"w": 8,
"x": 0,
"y": 20,
"i": "13dceb72-9e9d-483d-86d2-b192debdcece",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_QPS{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "QPS",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "reqps"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "0299da4b-d779-4ed7-9cd5-096f43181b2e",
"layout": {
"h": 4,
"w": 8,
"x": 8,
"y": 20,
"i": "2b23c24e-b6f9-44f5-8151-2d5a7585c31a",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_TPS{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "TPS",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "reqps"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "56a0e345-1d4d-4051-a3cf-738bea220f96",
"layout": {
"h": 4,
"w": 8,
"x": 16,
"y": 20,
"i": "d1752ed4-f4a1-4c4b-854f-1c2ef01b34a4",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_IbufUseRatio{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "BP利用率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
}
],
"var": [
{
"name": "datasource",
"label": "datasource",
"type": "datasource",
"hide": false,
"definition": "prometheus"
},
{
"name": "instance",
"label": "",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(AliyunRds_MySQL_SlowQueries, instanceName)"
}
],
"version": "3.0.0"
},
"uuid": 1717556327098444000
}

View File

@@ -0,0 +1,361 @@
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Categraf ZooKeeper故障",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "avg(clickhouse_metrics_zoo_keeper_session ) != 1",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153856411000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Categraf 内存使用",
"note": "内存使用报警",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1,
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "clickhouse_metrics_memory_tracking / clickhouse_asynchronous_metrics_os_memory_total * 100 \u003e 90",
"severity": 1
},
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "clickhouse_metrics_memory_tracking/ clickhouse_asynchronous_metrics_os_memory_total * 100 \u003e 80",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153858877000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Categraf 磁盘使用",
"note": "磁盘使用报警",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1,
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "clickhouse_asynchronous_metrics_disk_available_default / (clickhouse_asynchronous_metrics_disk_available_default + clickhouse_asynchronous_metrics_disk_used_default) * 100 \u003c 10",
"severity": 1
},
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "clickhouse_asynchronous_metrics_disk_available_default / (clickhouse_asynchronous_metrics_disk_available_default + clickhouse_asynchronous_metrics_disk_used_default) * 100 \u003c 20",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153860224000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Categraf 网络故障",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "clickhouse_metrics_network_send \u003e 250 or clickhouse_metrics_network_receive \u003e 250",
"severity": 2
},
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "clickhouse_metrics_network_send \u003e 250 or clickhouse_metrics_network_receive \u003e 250",
"severity": 3
},
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "increase(clickhouse_metrics_interserver_connection[5m]) \u003e 0",
"severity": 3
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153861525000
}
]

View File

@@ -0,0 +1,521 @@
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Exporter 认证错误",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2,
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) \u003e 0",
"severity": 2
},
{
"prom_ql": "increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) \u003e 0",
"severity": 3
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153863782000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Exporter ZooKeeper故障",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "avg(ClickHouseMetrics_ZooKeeperSession) != 1",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153865298000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Exporter 内存使用",
"note": "内存使用报警",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1,
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "ClickHouseMetrics_MemoryTracking / ClickHouseAsyncMetrics_OSMemoryTotal * 100 \u003e 90",
"severity": 1
},
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "ClickHouseMetrics_MemoryTracking / ClickHouseAsyncMetrics_OSMemoryTotal * 100 \u003e 80",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153866296000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Exporter 副本错误",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1,
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1",
"severity": 1
},
{
"prom_ql": " ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1",
"severity": 1
},
{
"prom_ql": " ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1",
"severity": 3
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153867268000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Exporter 磁盘使用",
"note": "磁盘使用报警",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1,
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 \u003c 10",
"severity": 1
},
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 \u003c 20",
"severity": 2
},
{
"prom_ql": "ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 \u003c 20",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153868363000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Exporter 网络故障",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2,
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "ClickHouseMetrics_NetworkSend \u003e 250 or ClickHouseMetrics_NetworkReceive \u003e 250",
"severity": 2
},
{
"prom_ql": "ClickHouseMetrics_NetworkSend \u003e 250 or ClickHouseMetrics_NetworkReceive \u003e 250",
"severity": 3
},
{
"prom_ql": "increase(ClickHouseMetrics_InterserverConnection[5m]) \u003e 0",
"severity": 3
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153869486000
}
]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,422 @@
[
{
"id": 0,
"uuid": 1719305153888541000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse HTTP 连接数",
"unit": "sishort",
"note": "通过HTTP协议连接到ClickHouse服务器的客户端数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_http_connection",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153889950000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse INSERT查询平均时间",
"unit": "sishort",
"note": "插入查询执行的平均时间(微秒)。",
"lang": "zh_CN",
"expression": "clickhouse_events_insert_query_time_microseconds_microseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153890963000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse SELECT 查询数",
"unit": "none",
"note": "执行的选择SELECT查询的数量",
"lang": "zh_CN",
"expression": "clickhouse_events_select_query",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153892134000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse SELECT查询平均时间",
"unit": "sishort",
"note": "选择查询执行的平均时间(微秒)。",
"lang": "zh_CN",
"expression": "clickhouse_events_select_query_time_microseconds_microseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153893317000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse TCP 连接数",
"unit": "sishort",
"note": "通过TCP协议连接到ClickHouse服务器的客户端数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_tcp_connection",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153894646000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 临时数据量",
"unit": "sishort",
"note": "临时数据部分的数量,这些部分当前正在生成。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_temporary",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153896151000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 分布式表连接数",
"unit": "sishort",
"note": "发送到分布式表的远程服务器的数据连接数。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_distributed_send",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153897491000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 宽数据量",
"unit": "sishort",
"note": "宽数据部分的数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_wide",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153899026000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 待插入分布式表文件数",
"unit": "sishort",
"note": "等待异步插入到分布式表的文件数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_distributed_files_to_insert",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153900278000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 提交前数据量",
"unit": "sishort",
"note": "提交前的数据部分数量这些部分在data_parts列表中但不用于SELECT查询。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_pre_committed",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153901527000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 提交后数据量",
"unit": "sishort",
"note": "提交后的数据部分数量这些部分在data_parts列表中并且用于SELECT查询。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_committed",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153902727000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 插入未压缩",
"unit": "sishort",
"note": " 插入操作写入的未压缩字节数。",
"lang": "zh_CN",
"expression": "clickhouse_events_inserted_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153904402000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 插入行数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "clickhouse_events_inserted_rows",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153905722000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 查询优先级",
"unit": "sishort",
"note": "由于优先级设置,被停止并等待的查询数量。\n",
"lang": "zh_CN",
"expression": "clickhouse_metrics_query_preempted",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153906824000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 查询总数",
"unit": "none",
"note": "ClickHouse执行的查询总数。",
"lang": "zh_CN",
"expression": "clickhouse_events_query",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153907953000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 查询总时间",
"unit": "microseconds",
"note": "查询执行的总时间(微秒)。",
"lang": "zh_CN",
"expression": "clickhouse_events_query_time_microseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153909480000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 正被删除数据量",
"unit": "sishort",
"note": "正在被删除的数据部分数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_deleting",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153911177000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 移动池活动任务数",
"unit": "sishort",
"note": "后台移动池中的活动任务数,用于处理数据移动。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_background_move_pool_task",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153912274000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 紧凑数据量",
"unit": "sishort",
"note": "紧凑数据部分的数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_compact",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153913312000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 缓冲区活动任务数",
"unit": "sishort",
"note": "后台缓冲区冲洗调度池中的活动任务数,用于定期缓冲区冲洗。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_background_buffer_flush_schedule_pool_task",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153914788000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 跨磁盘量",
"unit": "sishort",
"note": "移动到另一个磁盘并应在析构函数中删除的数据部分数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_delete_on_destroy",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153916159000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 过时数据量",
"unit": "sishort",
"note": " 过时的数据部分数量这些部分不是活动数据部分但当前SELECT查询可能使用它们。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_outdated",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153917507000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse中内存使用情况",
"unit": "sishort",
"note": "ClickHouse服务器使用的总内存量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_memory_tracking",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153918455000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse中数据库数量",
"unit": "none",
"note": "ClickHouse数据库数量",
"lang": "zh_CN",
"expression": "clickhouse_asynchronous_metrics_number_of_databases",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153919709000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse中表的数量",
"unit": "none",
"note": "ClickHouse表数量",
"lang": "zh_CN",
"expression": "clickhouse_asynchronous_metrics_number_of_tables",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153920898000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse修订",
"unit": "none",
"note": "ClickHouse服务器的修订号通常是一个用于标识特定构建的数字。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_revision",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153921934000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse服务器运行时间",
"unit": "sishort",
"note": "ClickHouse服务器自启动以来的运行时间。",
"lang": "zh_CN",
"expression": "clickhouse_asynchronous_metrics_uptime",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153923130000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse版本号",
"unit": "none",
"note": "ClickHouse服务器的版本号以整数形式表示。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_version_integer",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
}
]

View File

@@ -0,0 +1,797 @@
[
{
"id": 0,
"uuid": 1719305153924793000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "ClickHouse Tcp 连接数",
"unit": "none",
"note": "tcp连接数",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_TCPConnection",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153926074000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "ClickHouse 内存",
"unit": "bytesIEC",
"note": "分配的内存总量",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_MemoryTracking",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153927130000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "INSERT查询平均延迟",
"unit": "microseconds",
"note": "INSERT查询平均延迟",
"lang": "zh_CN",
"expression": "increase(ClickHouseProfileEvents_InsertQueryTimeMicroseconds[1m]) / (increase(ClickHouseProfileEvents_InsertQuery[1m]) + 0.001)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153928310000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "INSERT查询数",
"unit": "queries",
"note": "与查询数相同但仅限于INSERT查询",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_InsertQuery[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153929755000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "lseek函数调用次数",
"unit": "times",
"note": "'lseek'函数被调用的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_Seek[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153931299000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "MergeTree表写入的压缩字节数",
"unit": "bytes",
"note": "",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MergeTreeDataWriterCompressedBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153932255000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "MergeTree表插入的数据块数",
"unit": "blocks",
"note": "插入到MergeTree表的数据块数。每个块形成一个数据部分",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MergeTreeDataWriterBlocks[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153933664000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "MergeTree表插入的未压缩字节数",
"unit": "bytes",
"note": "插入到MergeTree表的未压缩字节数列以它们在内存中存储的形式\n\n在ClickHouse数据库中当数据被插入到MergeTree系列表包括ReplicatedMergeTree等在数据实际被写入磁盘并经过压缩处理之前在内存中以原始格式暂存时所占用的字节数量。这里的“未压缩”意味着数据还未经过ClickHouse为了节省存储空间而在存储阶段执行的列式存储压缩算法处理",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MergeTreeDataWriterUncompressedBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153934869000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "MergeTree表插入的行数",
"unit": "rows",
"note": "插入到MergeTree表的行数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MergeTreeDataWriterRows[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153935835000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "SELECT查询平均延迟",
"unit": "microseconds",
"note": "SELECT查询平均延迟",
"lang": "zh_CN",
"expression": "increase(ClickHouseProfileEvents_SelectQueryTimeMicroseconds[1m]) / (increase(ClickHouseProfileEvents_SelectQuery[1m]) + 0.001)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153937045000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "SELECT查询数",
"unit": "queries",
"note": "SELECT查询的数量",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_SelectQuery[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153938270000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "SELECT查询的字节数",
"unit": "bytes",
"note": "从所有表SELECT的字节数未压缩列以它们在内存中存储的形式",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_SelectedBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153939642000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "SELECT查询的行数",
"unit": "rows",
"note": "从所有表SELECT的行数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_SelectedRows[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153940852000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "TCP连接数",
"unit": "connections",
"note": "与 TCP 服务器(带本地接口的客户端)的连接数,也包括服务器-服务器连接",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_TCPConnection",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153941862000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "临时部分数",
"unit": "parts",
"note": "目前正在生成的部分,不在数据部分列表中",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_PartsTemporary",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153942864000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "从文件描述符读取失败次数",
"unit": "times",
"note": "从文件描述符读取read/pread失败的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_ReadBufferFromFileDescriptorReadFailed[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153943822000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "从文件描述符读取次数",
"unit": "reads",
"note": "从文件描述符进行读取read/pread的次数不包括套接字",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_ReadBufferFromFileDescriptorRead[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153944918000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "从文件描述符读取的字节数",
"unit": "bytes",
"note": "从文件描述符读取的字节数。如果文件是压缩的,这将显示压缩后的数据大小",
"lang": "zh_CN",
"expression": "irate(ClickHouseProfileEvents_ReadBufferFromFileDescriptorReadBytes[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153946307000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "保留空间",
"unit": "bytes",
"note": "为当前运行的后台合并保留的磁盘空间。它略大于当前合并部分的总大小",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_DiskSpaceReservedForMerge",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153947296000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "内存占用",
"unit": "bytes",
"note": "分配的内存总量",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_MemoryTracking",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153948199000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "写入文件描述符次数",
"unit": "writes",
"note": "写入文件描述符write/pwrite的次数不包括套接字",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_WriteBufferFromFileDescriptorWrite[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153949391000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "写入文件描述符的字节数",
"unit": "bytes",
"note": "写入文件描述符的字节数。如果文件是压缩的,这将显示压缩后的数据大小",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_WriteBufferFromFileDescriptorWriteBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153950577000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "合并平均持续时间",
"unit": "milliseconds",
"note": "合并的平均持续时间",
"lang": "zh_CN",
"expression": "increase(ClickHouseProfileEvents_MergesTimeMilliseconds[1m]) / increase(ClickHouseProfileEvents_Merge[1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153953146000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "合并读取的未压缩字节数",
"unit": "bytes",
"note": "后台合并读取的未压缩字节数(列以它们在内存中存储的形式)。这是合并前的字节数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MergedUncompressedBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153954853000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "合并读取的行数",
"unit": "rows",
"note": "后台合并读取的行数。这是合并前的行数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MergedRows[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153956608000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "后台合并次数",
"unit": "times",
"note": "启动的后台合并次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_Merge[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153957668000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "复制部分合并次数",
"unit": "times",
"note": "ReplicatedMergeTree表的数据部分成功合并的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_ReplicatedPartMerges[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153958797000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "复制部分数据丢失次数",
"unit": "times",
"note": "数据在任何副本上都不存在的次数(即使是现在离线的副本)。这些数据部分肯定丢失了。由于异步复制(如果未启用配额插入),当写入数据部分的副本失败并且在故障后重新联机时不包含该数据部分,这是正常现象",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_ReplicatedDataLoss[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153959861000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "失败的INSERT查询数",
"unit": "times",
"note": "与失败的查询相同但仅限于INSERT查询",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_FailedInsertQuery[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153961190000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "失败的SELECT查询数",
"unit": "queries",
"note": "与失败的查询相同但仅限于SELECT查询",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_FailedSelectQuery[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153962249000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "失败的查询数",
"unit": "queries",
"note": "失败的查询数量",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_FailedQuery[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153963287000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "延迟插入次数",
"unit": "times",
"note": "由于分区的活动数据部分数量过多INSERT到MergeTree表的块被限制的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_DelayedInserts[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153964822000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "延迟插入阻塞的平均等待时间",
"unit": "milliseconds",
"note": "由于分区的活动数据部分数量过多INSERT到MergeTree表的块被限制时的总等待时间毫秒",
"lang": "zh_CN",
"expression": "increase(ClickHouseProfileEvents_DelayedInsertsMilliseconds[1m]) / (increase(ClickHouseProfileEvents_DelayedInserts[1m]) + 0.01)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153966130000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "慢查询次数",
"unit": "times",
"note": "从文件中进行慢查询读取的次数,这表明系统过载",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_SlowRead[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153967132000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "打开的文件数",
"unit": "files",
"note": "打开的文件数量",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_FileOpen[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153968376000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "拒绝插入次数",
"unit": "times",
"note": "由于分区的活动数据部分数量过多INSERT到MergeTree表的块被拒绝的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_RejectedInserts[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153969972000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "提交部分数",
"unit": "parts",
"note": "已经提交的数据部分的数量",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_PartsCommitted",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153971113000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "插入字节数",
"unit": "bytes",
"note": "所有表INSERT的字节数未压缩列以它们在内存中存储的形式",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_InsertedBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153972182000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "插入行数",
"unit": "rows",
"note": "所有表INSERT的行数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_InsertedRows[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153973527000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "未压缩缓存命中次数",
"unit": "times",
"note": "未压缩缓存命中的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_UncompressedCacheHits[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153974747000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "未压缩缓存未命中次数",
"unit": "times",
"note": "未压缩缓存未命中的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_UncompressedCacheMisses[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153976184000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "查询内存限制超标次数",
"unit": "times",
"note": "查询内存限制超标的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_QueryMemoryLimitExceeded[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153977623000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "查询处理线程降低次数",
"unit": "times",
"note": "由于慢查询读取,降低查询处理线程数的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_ReadBackoff[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153978786000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "查询平均延迟",
"unit": "microseconds",
"note": "查询平均延迟",
"lang": "zh_CN",
"expression": "increase(ClickHouseProfileEvents_QueryTimeMicroseconds[1m]) / (increase(ClickHouseProfileEvents_Query[1m]) + 0.001)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153980379000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "查询总数",
"unit": "queries",
"note": "需要解释和可能执行的查询数量,不包括失败的查询",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_Query[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153981570000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "标记缓存命中次数",
"unit": "times",
"note": "标记缓存命中的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MarkCacheHits[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153983200000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "标记缓存未命中次数",
"unit": "times",
"note": "标记缓存未命中的次数",
"lang": "zh_CN",
"expression": "rate(ClickHouseProfileEvents_MarkCacheMisses[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153984657000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "读取的压缩块数",
"unit": "blocks",
"note": "从压缩源(文件,网络)读取的压缩块数(独立压缩的数据块)",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_CompressedReadBufferBlocks[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153985923000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "读取的数据部分数",
"unit": "parts",
"note": "从MergeTree表读取的数据部分数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_SelectedParts[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153987437000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "读取的未压缩字节数",
"unit": "bytes",
"note": "从压缩源(文件,网络)读取的未压缩字节数(解压后的字节数)",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_CompressedReadBufferBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153988925000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "读取的标记数",
"unit": "marks",
"note": "从MergeTree表读取的标记数索引粒度",
"lang": "zh_CN",
"expression": "irate(ClickHouseProfileEvents_SelectedMarks[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153990107000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "读取的范围数",
"unit": "ranges",
"note": "从MergeTree表读取的所有数据部分中非相邻的范围数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_SelectedRanges[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
},
{
"id": 0,
"uuid": 1719305153991749000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "预提交部分数",
"unit": "parts",
"note": "在数据部分中但不用于SELECT查询的部分",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_PartsPreCommitted",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": ""
}
]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -215,13 +215,13 @@ func (uc *UserCacheType) loopUpdateLastActiveTime() {
duration := 5 * time.Minute
for {
time.Sleep(duration)
if err := uc.updateUsersLastActiveTime(); err != nil {
if err := uc.UpdateUsersLastActiveTime(); err != nil {
logger.Warningf("failed to update users' last active time: %v", err)
}
}
}
func (uc *UserCacheType) updateUsersLastActiveTime() error {
func (uc *UserCacheType) UpdateUsersLastActiveTime() error {
// read the full list of users from the database
users, err := models.UserGetAll(uc.ctx)
if err != nil {

View File

@@ -52,6 +52,8 @@ type AlertCurEvent struct {
Tags string `json:"-"` // for db
TagsJSON []string `json:"tags" gorm:"-"` // for fe
TagsMap map[string]string `json:"tags_map" gorm:"-"` // for internal usage
OriginalTags string `json:"-"` // for db
OriginalTagsJSON []string `json:"original_tags" gorm:"-"` // for fe
Annotations string `json:"-"` //
AnnotationsJSON map[string]string `json:"annotations" gorm:"-"` // for fe
IsRecovered bool `json:"is_recovered" gorm:"-"` // for notify.py
@@ -160,6 +162,35 @@ func (e *AlertCurEvent) ParseRule(field string) error {
return nil
}
// ParseURL renders a callback url as a Go text/template against the event,
// exposing $labels, $value and $annotations as template shorthands.
// Rendering is best-effort: on an empty input, a malformed template, or an
// execution failure the original url is returned unchanged, and the error
// return is always nil.
func (e *AlertCurEvent) ParseURL(url string) (string, error) {
	trimmed := strings.TrimSpace(url)
	if trimmed == "" {
		return url, nil
	}

	// Prepend variable definitions so templates can use the familiar
	// $labels / $value / $annotations names instead of the raw fields.
	defs := []string{
		"{{$labels := .TagsMap}}",
		"{{$value := .TriggerValue}}",
		"{{$annotations := .AnnotationsJSON}}",
	}
	text := strings.Join(append(defs, trimmed), "")

	tpl, err := template.New("callbackUrl" + fmt.Sprint(e.RuleId)).
		Funcs(template.FuncMap(tplx.TemplateFuncMap)).
		Parse(text)
	if err != nil {
		// Malformed template: fall back to the raw url.
		return url, nil
	}

	var rendered bytes.Buffer
	if err := tpl.Execute(&rendered, e); err != nil {
		// Rendering failed (e.g. a func errored): fall back to the raw url.
		return url, nil
	}
	return rendered.String(), nil
}
func (e *AlertCurEvent) GenCardTitle(rules []*AggrRule) string {
arr := make([]string, len(rules))
for i := 0; i < len(rules); i++ {
@@ -260,6 +291,7 @@ func (e *AlertCurEvent) ToHis(ctx *ctx.Context) *AlertHisEvent {
TriggerTime: e.TriggerTime,
TriggerValue: e.TriggerValue,
Tags: e.Tags,
OriginalTags: e.OriginalTags,
RecoverTime: recoverTime,
LastEvalTime: e.LastEvalTime,
NotifyCurNumber: e.NotifyCurNumber,
@@ -272,6 +304,7 @@ func (e *AlertCurEvent) DB2FE() error {
e.NotifyGroupsJSON = strings.Fields(e.NotifyGroups)
e.CallbacksJSON = strings.Fields(e.Callbacks)
e.TagsJSON = strings.Split(e.Tags, ",,")
e.OriginalTagsJSON = strings.Split(e.OriginalTags, ",,")
json.Unmarshal([]byte(e.Annotations), &e.AnnotationsJSON)
json.Unmarshal([]byte(e.RuleConfig), &e.RuleConfigJson)
return nil
@@ -282,6 +315,7 @@ func (e *AlertCurEvent) FE2DB() {
e.NotifyGroups = strings.Join(e.NotifyGroupsJSON, " ")
e.Callbacks = strings.Join(e.CallbacksJSON, " ")
e.Tags = strings.Join(e.TagsJSON, ",,")
e.OriginalTags = strings.Join(e.OriginalTagsJSON, ",,")
b, _ := json.Marshal(e.AnnotationsJSON)
e.Annotations = string(b)

View File

@@ -48,6 +48,8 @@ type AlertHisEvent struct {
LastEvalTime int64 `json:"last_eval_time"`
Tags string `json:"-"`
TagsJSON []string `json:"tags" gorm:"-"`
OriginalTags string `json:"-"` // for db
OriginalTagsJSON []string `json:"original_tags" gorm:"-"` // for fe
Annotations string `json:"-"`
AnnotationsJSON map[string]string `json:"annotations" gorm:"-"` // for fe
NotifyCurNumber int `json:"notify_cur_number"` // notify: current number
@@ -68,6 +70,7 @@ func (e *AlertHisEvent) DB2FE() {
e.NotifyGroupsJSON = strings.Fields(e.NotifyGroups)
e.CallbacksJSON = strings.Fields(e.Callbacks)
e.TagsJSON = strings.Split(e.Tags, ",,")
e.OriginalTagsJSON = strings.Split(e.OriginalTags, ",,")
if len(e.Annotations) > 0 {
err := json.Unmarshal([]byte(e.Annotations), &e.AnnotationsJSON)
@@ -301,16 +304,19 @@ func EventPersist(ctx *ctx.Context, event *AlertCurEvent) error {
}
}
// use his id as cur id
event.Id = his.Id
return nil
}
// use his id as cur id
event.Id = his.Id
if event.IsRecovered {
// alert_cur_event表里没有数据表示之前没告警结果现在报了恢复神奇....理论上不应该出现的
return nil
}
// use his id as cur id
event.Id = his.Id
if event.Id > 0 {
if err := event.Add(ctx); err != nil {
return fmt.Errorf("add cur event error:%v", err)

View File

@@ -9,6 +9,7 @@ import (
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
@@ -26,60 +27,61 @@ const (
)
type AlertRule struct {
Id int64 `json:"id" gorm:"primaryKey"`
GroupId int64 `json:"group_id"` // busi group id
Cate string `json:"cate"` // alert rule cate (prometheus|elasticsearch)
DatasourceIds string `json:"-" gorm:"datasource_ids"` // datasource ids
DatasourceIdsJson []int64 `json:"datasource_ids" gorm:"-"` // for fe
Cluster string `json:"cluster"` // take effect by clusters, seperated by space
Name string `json:"name"` // rule name
Note string `json:"note"` // will sent in notify
Prod string `json:"prod"` // product empty means n9e
Algorithm string `json:"algorithm"` // algorithm (''|holtwinters), empty means threshold
AlgoParams string `json:"-" gorm:"algo_params"` // params algorithm need
AlgoParamsJson interface{} `json:"algo_params" gorm:"-"` // for fe
Delay int `json:"delay"` // Time (in seconds) to delay evaluation
Severity int `json:"severity"` // 1: Emergency 2: Warning 3: Notice
Severities []int `json:"severities" gorm:"-"` // 1: Emergency 2: Warning 3: Notice
Disabled int `json:"disabled"` // 0: enabled, 1: disabled
PromForDuration int `json:"prom_for_duration"` // prometheus for, unit:s
PromQl string `json:"prom_ql"` // just one ql
RuleConfig string `json:"-" gorm:"rule_config"` // rule config
RuleConfigJson interface{} `json:"rule_config" gorm:"-"` // rule config for fe
PromEvalInterval int `json:"prom_eval_interval"` // unit:s
EnableStime string `json:"-"` // split by space: "00:00 10:00 12:00"
EnableStimeJSON string `json:"enable_stime" gorm:"-"` // for fe
EnableStimesJSON []string `json:"enable_stimes" gorm:"-"` // for fe
EnableEtime string `json:"-"` // split by space: "00:00 10:00 12:00"
EnableEtimeJSON string `json:"enable_etime" gorm:"-"` // for fe
EnableEtimesJSON []string `json:"enable_etimes" gorm:"-"` // for fe
EnableDaysOfWeek string `json:"-"` // eg: "0 1 2 3 4 5 6 ; 0 1 2"
EnableDaysOfWeekJSON []string `json:"enable_days_of_week" gorm:"-"` // for fe
EnableDaysOfWeeksJSON [][]string `json:"enable_days_of_weeks" gorm:"-"` // for fe
EnableInBG int `json:"enable_in_bg"` // 0: global 1: enable one busi-group
NotifyRecovered int `json:"notify_recovered"` // whether notify when recovery
NotifyChannels string `json:"-"` // split by space: sms voice email dingtalk wecom
NotifyChannelsJSON []string `json:"notify_channels" gorm:"-"` // for fe
NotifyGroups string `json:"-"` // split by space: 233 43
NotifyGroupsObj []UserGroup `json:"notify_groups_obj" gorm:"-"` // for fe
NotifyGroupsJSON []string `json:"notify_groups" gorm:"-"` // for fe
NotifyRepeatStep int `json:"notify_repeat_step"` // notify repeat interval, unit: min
NotifyMaxNumber int `json:"notify_max_number"` // notify: max number
RecoverDuration int64 `json:"recover_duration"` // unit: s
Callbacks string `json:"-"` // split by space: http://a.com/api/x http://a.com/api/y'
CallbacksJSON []string `json:"callbacks" gorm:"-"` // for fe
RunbookUrl string `json:"runbook_url"` // sop url
AppendTags string `json:"-"` // split by space: service=n9e mod=api
AppendTagsJSON []string `json:"append_tags" gorm:"-"` // for fe
Annotations string `json:"-"` //
AnnotationsJSON map[string]string `json:"annotations" gorm:"-"` // for fe
ExtraConfig string `json:"-" gorm:"extra_config"` // extra config
ExtraConfigJSON interface{} `json:"extra_config" gorm:"-"` // for fe
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
UUID int64 `json:"uuid" gorm:"-"` // tpl identifier
Id int64 `json:"id" gorm:"primaryKey"`
GroupId int64 `json:"group_id"` // busi group id
Cate string `json:"cate"` // alert rule cate (prometheus|elasticsearch)
DatasourceIds string `json:"-" gorm:"datasource_ids"` // datasource ids
DatasourceIdsJson []int64 `json:"datasource_ids" gorm:"-"` // for fe
Cluster string `json:"cluster"` // take effect by clusters, seperated by space
Name string `json:"name"` // rule name
Note string `json:"note"` // will sent in notify
Prod string `json:"prod"` // product empty means n9e
Algorithm string `json:"algorithm"` // algorithm (''|holtwinters), empty means threshold
AlgoParams string `json:"-" gorm:"algo_params"` // params algorithm need
AlgoParamsJson interface{} `json:"algo_params" gorm:"-"` // for fe
Delay int `json:"delay"` // Time (in seconds) to delay evaluation
Severity int `json:"severity"` // 1: Emergency 2: Warning 3: Notice
Severities []int `json:"severities" gorm:"-"` // 1: Emergency 2: Warning 3: Notice
Disabled int `json:"disabled"` // 0: enabled, 1: disabled
PromForDuration int `json:"prom_for_duration"` // prometheus for, unit:s
PromQl string `json:"prom_ql"` // just one ql
RuleConfig string `json:"-" gorm:"rule_config"` // rule config
RuleConfigJson interface{} `json:"rule_config" gorm:"-"` // rule config for fe
EventRelabelConfig []*pconf.RelabelConfig `json:"event_relabel_config" gorm:"-"` // event relabel config
PromEvalInterval int `json:"prom_eval_interval"` // unit:s
EnableStime string `json:"-"` // split by space: "00:00 10:00 12:00"
EnableStimeJSON string `json:"enable_stime" gorm:"-"` // for fe
EnableStimesJSON []string `json:"enable_stimes" gorm:"-"` // for fe
EnableEtime string `json:"-"` // split by space: "00:00 10:00 12:00"
EnableEtimeJSON string `json:"enable_etime" gorm:"-"` // for fe
EnableEtimesJSON []string `json:"enable_etimes" gorm:"-"` // for fe
EnableDaysOfWeek string `json:"-"` // eg: "0 1 2 3 4 5 6 ; 0 1 2"
EnableDaysOfWeekJSON []string `json:"enable_days_of_week" gorm:"-"` // for fe
EnableDaysOfWeeksJSON [][]string `json:"enable_days_of_weeks" gorm:"-"` // for fe
EnableInBG int `json:"enable_in_bg"` // 0: global 1: enable one busi-group
NotifyRecovered int `json:"notify_recovered"` // whether notify when recovery
NotifyChannels string `json:"-"` // split by space: sms voice email dingtalk wecom
NotifyChannelsJSON []string `json:"notify_channels" gorm:"-"` // for fe
NotifyGroups string `json:"-"` // split by space: 233 43
NotifyGroupsObj []UserGroup `json:"notify_groups_obj" gorm:"-"` // for fe
NotifyGroupsJSON []string `json:"notify_groups" gorm:"-"` // for fe
NotifyRepeatStep int `json:"notify_repeat_step"` // notify repeat interval, unit: min
NotifyMaxNumber int `json:"notify_max_number"` // notify: max number
RecoverDuration int64 `json:"recover_duration"` // unit: s
Callbacks string `json:"-"` // split by space: http://a.com/api/x http://a.com/api/y'
CallbacksJSON []string `json:"callbacks" gorm:"-"` // for fe
RunbookUrl string `json:"runbook_url"` // sop url
AppendTags string `json:"-"` // split by space: service=n9e mod=api
AppendTagsJSON []string `json:"append_tags" gorm:"-"` // for fe
Annotations string `json:"-"` //
AnnotationsJSON map[string]string `json:"annotations" gorm:"-"` // for fe
ExtraConfig string `json:"-" gorm:"extra_config"` // extra config
ExtraConfigJSON interface{} `json:"extra_config" gorm:"-"` // for fe
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
UUID int64 `json:"uuid" gorm:"-"` // tpl identifier
}
type PromRuleConfig struct {
@@ -622,6 +624,13 @@ func (ar *AlertRule) DB2FE() error {
json.Unmarshal([]byte(ar.Annotations), &ar.AnnotationsJSON)
json.Unmarshal([]byte(ar.ExtraConfig), &ar.ExtraConfigJSON)
// 解析 RuleConfig 字段
var ruleConfig struct {
EventRelabelConfig []*pconf.RelabelConfig `json:"event_relabel_config"`
}
json.Unmarshal([]byte(ar.RuleConfig), &ruleConfig)
ar.EventRelabelConfig = ruleConfig.EventRelabelConfig
err := ar.FillDatasourceIds()
return err
}

View File

@@ -9,7 +9,6 @@ import (
imodels "github.com/flashcatcloud/ibex/src/models"
"github.com/toolkits/pkg/logger"
"gorm.io/driver/mysql"
"gorm.io/driver/postgres"
"gorm.io/gorm"
)
@@ -23,9 +22,8 @@ func MigrateIbexTables(db *gorm.DB) {
switch db.Dialector.(type) {
case *mysql.Dialector:
tableOptions = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
case *postgres.Dialector:
tableOptions = "ENCODING='UTF8'"
}
if tableOptions != "" {
db = db.Set("gorm:table_options", tableOptions)
}
@@ -52,8 +50,6 @@ func MigrateTables(db *gorm.DB) error {
switch db.Dialector.(type) {
case *mysql.Dialector:
tableOptions = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
case *postgres.Dialector:
tableOptions = "ENCODING='UTF8'"
}
if tableOptions != "" {
db = db.Set("gorm:table_options", tableOptions)
@@ -62,10 +58,31 @@ func MigrateTables(db *gorm.DB) error {
dts := []interface{}{&RecordingRule{}, &AlertRule{}, &AlertSubscribe{}, &AlertMute{},
&TaskRecord{}, &ChartShare{}, &Target{}, &Configs{}, &Datasource{}, &NotifyTpl{},
&Board{}, &BoardBusigroup{}, &Users{}, &SsoConfig{}, &models.BuiltinMetric{},
&models.MetricFilter{}, &models.BuiltinComponent{}, &models.BuiltinPayload{}}
&models.MetricFilter{}, &models.BuiltinComponent{}}
if !columnHasIndex(db, &AlertHisEvent{}, "last_eval_time") {
dts = append(dts, &AlertHisEvent{})
if !columnHasIndex(db, &AlertHisEvent{}, "original_tags") ||
!columnHasIndex(db, &AlertCurEvent{}, "original_tags") {
asyncDts := []interface{}{&AlertHisEvent{}, &AlertCurEvent{}}
go func() {
defer func() {
if r := recover(); r != nil {
logger.Errorf("panic to migrate table: %v", r)
}
}()
for _, dt := range asyncDts {
if err := db.AutoMigrate(dt); err != nil {
logger.Errorf("failed to migrate table: %v", err)
}
}
}()
}
if !db.Migrator().HasTable(&models.BuiltinPayload{}) {
dts = append(dts, &models.BuiltinPayload{})
} else {
dts = append(dts, &BuiltinPayloads{})
}
for _, dt := range dts {
@@ -187,6 +204,7 @@ type AlertMute struct {
type RecordingRule struct {
QueryConfigs string `gorm:"type:text;not null;column:query_configs"` // query_configs
DatasourceIds string `gorm:"column:datasource_ids;type:varchar(255);default:'';comment:datasource ids"`
CronPattern string `gorm:"column:cron_pattern;type:varchar(255);default:'';comment:cron pattern"`
}
type AlertingEngines struct {
@@ -200,8 +218,14 @@ type TaskRecord struct {
EventId int64 `gorm:"column:event_id;bigint(20);not null;default:0;comment:event id;index:idx_event_id"`
}
type AlertHisEvent struct {
LastEvalTime int64 `gorm:"column:last_eval_time;bigint(20);not null;default:0;comment:for time filter;index:idx_last_eval_time"`
LastEvalTime int64 `gorm:"column:last_eval_time;bigint(20);not null;default:0;comment:for time filter;index:idx_last_eval_time"`
OriginalTags string `gorm:"column:original_tags;type:text;comment:labels key=val,,k2=v2"`
}
type AlertCurEvent struct {
OriginalTags string `gorm:"column:original_tags;type:text;comment:labels key=val,,k2=v2"`
}
type Target struct {
HostIp string `gorm:"column:host_ip;varchar(15);default:'';comment:IPv4 string;index:idx_host_ip"`
AgentVersion string `gorm:"column:agent_version;varchar(255);default:'';comment:agent version;index:idx_agent_version"`
@@ -209,7 +233,7 @@ type Target struct {
}
type Datasource struct {
IsDefault bool `gorm:"column:is_default;int;not null;default:0;comment:is default datasource"`
IsDefault bool `gorm:"column:is_default;type:boolean;not null;comment:is default datasource"`
}
type Configs struct {
@@ -248,3 +272,7 @@ type Users struct {
type SsoConfig struct {
UpdateAt int64 `gorm:"column:update_at;type:int;default:0;comment:update_at"`
}
type BuiltinPayloads struct {
UUID int64 `json:"uuid" gorm:"type:bigint;not null;index:idx_uuid;comment:'uuid of payload'"`
}

View File

@@ -7,7 +7,11 @@ const NOTIFYCONTACT = "notify_contact"
// Config-table keys for the SMTP and ibex-server settings.
const SMTP = "smtp_config"
const IBEX = "ibex_server"

// Values for Webhook.Type, distinguishing where a webhook was configured:
// GlobalCallback marks a globally configured webhook, RuleCallback marks a
// callback attached to an individual alert rule.
// NOTE(review): declared as vars rather than consts — confirm nothing
// mutates them; if not, consider a typed const block.
var GlobalCallback = 0
var RuleCallback = 1
type Webhook struct {
Type int `json:"type"`
Enable bool `json:"enable"`
Url string `json:"url"`
BasicAuthUser string `json:"basic_auth_user"`
@@ -17,6 +21,9 @@ type Webhook struct {
Headers []string `json:"headers_str"`
SkipVerify bool `json:"skip_verify"`
Note string `json:"note"`
RetryCount int `json:"retry_count"`
RetryInterval int `json:"retry_interval"`
Batch int `json:"batch"`
}
type NotifyScript struct {

View File

@@ -171,8 +171,7 @@ func InitNotifyConfig(c *ctx.Context, tplDir string) {
if cval == "" {
var notifyContacts []NotifyContact
contacts := []string{DingtalkKey, WecomKey, FeishuKey, MmKey, TelegramKey}
for _, contact := range contacts {
for _, contact := range DefaultContacts {
notifyContacts = append(notifyContacts, NotifyContact{Ident: contact, Name: contact, BuiltIn: true})
}
@@ -182,6 +181,35 @@ func InitNotifyConfig(c *ctx.Context, tplDir string) {
logger.Errorf("failed to set notify contact config: %v", err)
return
}
} else {
var contacts []NotifyContact
if err = json.Unmarshal([]byte(cval), &contacts); err != nil {
logger.Errorf("failed to unmarshal notify channel config: %v", err)
return
}
contactMap := make(map[string]struct{})
for _, contact := range contacts {
contactMap[contact.Ident] = struct{}{}
}
var newContacts []NotifyContact
for _, contact := range DefaultContacts {
if _, ok := contactMap[contact]; !ok {
newContacts = append(newContacts, NotifyContact{Ident: contact, Name: contact, BuiltIn: true})
}
}
if len(newContacts) > 0 {
contacts = append(contacts, newContacts...)
data, err := json.Marshal(contacts)
if err != nil {
logger.Errorf("failed to marshal contacts: %v", err)
return
}
if err = ConfigsSet(c, NOTIFYCONTACT, string(data)); err != nil {
logger.Errorf("failed to set notify contact config: %v", err)
return
}
}
}
// init notify tpl
@@ -229,18 +257,33 @@ func getNotifyTpl(tplDir string) map[string]string {
}
var TplMap = map[string]string{
Dingtalk: `#### {{if .IsRecovered}}<font color="#008800">S{{.Severity}} - Recovered - {{.RuleName}}</font>{{else}}<font color="#FF0000">S{{.Severity}} - Triggered - {{.RuleName}}</font>{{end}}
Dingtalk: `#### {{if .IsRecovered}}<font color="#008800">💚{{.RuleName}}</font>{{else}}<font color="#FF0000">💔{{.RuleName}}</font>{{end}}
---
- **规则标题**: {{.RuleName}}{{if .RuleNote}}
- **规则备注**: {{.RuleNote}}{{end}}
{{if not .IsRecovered}}- **触发时值**: {{.TriggerValue}}{{end}}
{{if .TargetIdent}}- **监控对象**: {{.TargetIdent}}{{end}}
- **监控指标**: {{.TagsJSON}}
- {{if .IsRecovered}}**恢复时间**: {{timeformat .LastEvalTime}}{{else}}**触发时**: {{timeformat .TriggerTime}}{{end}}
- **发时间**: {{timestamp}}
`,
{{$time_duration := sub now.Unix .FirstTriggerTime }}{{if .IsRecovered}}{{$time_duration = sub .LastEvalTime .FirstTriggerTime }}{{end}}
- **告警级别**: {{.Severity}}
{{- if .RuleNote}}
- **规则备注**: {{.RuleNote}}
{{- end}}
{{- if not .IsRecovered}}
- **当次触发时**: {{.TriggerValue}}
- **当次触发时间**: {{timeformat .TriggerTime}}
- **告警持续时长**: {{humanizeDurationInterface $time_duration}}
{{- else}}
{{- if .AnnotationsJSON.recovery_value}}
- **恢复时值**: {{formatDecimal .AnnotationsJSON.recovery_value 4}}
{{- end}}
- **恢复时间**: {{timeformat .LastEvalTime}}
- **告警持续时长**: {{humanizeDurationInterface $time_duration}}
{{- end}}
- **告警事件标签**:
{{- range $key, $val := .TagsMap}}
{{- if ne $key "rulename" }}
- {{$key}}: {{$val}}
{{- end}}
{{- end}}
{{$domain := "http://请联系管理员修改通知模板将域名替换为实际的域名" }}
[事件详情]({{$domain}}/alert-his-events/{{.Id}})|[屏蔽1小时]({{$domain}}/alert-mutes/add?busiGroup={{.GroupId}}&cate={{.Cate}}&datasource_ids={{.DatasourceId}}&prod={{.RuleProd}}{{range $key, $value := .TagsMap}}&tags={{$key}}%3D{{$value}}{{end}})|[查看曲线]({{$domain}}/metric/explorer?data_source_id={{.DatasourceId}}&data_source_name=prometheus&mode=graph&prom_ql={{.PromQl}})`,
Email: `<!DOCTYPE html>
<html lang="en">
<head>
@@ -464,7 +507,10 @@ var TplMap = map[string]string{
监控指标: {{.TagsJSON}}
{{if .IsRecovered}}恢复时间:{{timeformat .LastEvalTime}}{{else}}触发时间: {{timeformat .TriggerTime}}
触发时值: {{.TriggerValue}}{{end}}
发送时间: {{timestamp}}`,
发送时间: {{timestamp}}
{{$domain := "http://请联系管理员修改通知模板将域名替换为实际的域名" }}
事件详情: {{$domain}}/alert-his-events/{{.Id}}
屏蔽1小时: {{$domain}}/alert-mutes/add?busiGroup={{.GroupId}}&cate={{.Cate}}&datasource_ids={{.DatasourceId}}&prod={{.RuleProd}}{{range $key, $value := .TagsMap}}&tags={{$key}}%3D{{$value}}{{end}}`,
FeishuCard: `{{ if .IsRecovered }}
{{- if ne .Cate "host"}}
**告警集群:** {{.Cluster}}{{end}}
@@ -481,7 +527,9 @@ var TplMap = map[string]string{
**发送时间:** {{timestamp}}
**触发时值:** {{.TriggerValue}}
{{if .RuleNote }}**告警描述:** **{{.RuleNote}}**{{end}}
{{- end -}}`,
{{- end -}}
{{$domain := "http://请联系管理员修改通知模板将域名替换为实际的域名" }}
[事件详情]({{$domain}}/alert-his-events/{{.Id}})|[屏蔽1小时]({{$domain}}/alert-mutes/add?busiGroup={{.GroupId}}&cate={{.Cate}}&datasource_ids={{.DatasourceId}}&prod={{.RuleProd}}{{range $key, $value := .TagsMap}}&tags={{$key}}%3D{{$value}}{{end}})|[查看曲线]({{$domain}}/metric/explorer?data_source_id={{.DatasourceId}}&data_source_name=prometheus&mode=graph&prom_ql={{.PromQl}})`,
EmailSubject: `{{if .IsRecovered}}Recovered{{else}}Triggered{{end}}: {{.RuleName}} {{.TagsJSON}}`,
Mm: `级别状态: S{{.Severity}} {{if .IsRecovered}}Recovered{{else}}Triggered{{end}}
规则名称: {{.RuleName}}{{if .RuleNote}}
@@ -507,5 +555,38 @@ var TplMap = map[string]string{
**触发时值**: {{.TriggerValue}}{{end}}
{{if .IsRecovered}}**恢复时间**: {{timeformat .LastEvalTime}}{{else}}**首次触发时间**: {{timeformat .FirstTriggerTime}}{{end}}
{{$time_duration := sub now.Unix .FirstTriggerTime }}{{if .IsRecovered}}{{$time_duration = sub .LastEvalTime .FirstTriggerTime }}{{end}}**距离首次告警**: {{humanizeDurationInterface $time_duration}}
**发送时间**: {{timestamp}}`,
**发送时间**: {{timestamp}}
{{$domain := "http://请联系管理员修改通知模板将域名替换为实际的域名" }}
[事件详情]({{$domain}}/alert-his-events/{{.Id}})|[屏蔽1小时]({{$domain}}/alert-mutes/add?busiGroup={{.GroupId}}&cate={{.Cate}}&datasource_ids={{.DatasourceId}}&prod={{.RuleProd}}{{range $key, $value := .TagsMap}}&tags={{$key}}%3D{{$value}}{{end}})|[查看曲线]({{$domain}}/metric/explorer?data_source_id={{.DatasourceId}}&data_source_name=prometheus&mode=graph&prom_ql={{.PromQl}})`,
Lark: `级别状态: S{{.Severity}} {{if .IsRecovered}}Recovered{{else}}Triggered{{end}}
规则名称: {{.RuleName}}{{if .RuleNote}}
规则备注: {{.RuleNote}}{{end}}
监控指标: {{.TagsJSON}}
{{if .IsRecovered}}恢复时间:{{timeformat .LastEvalTime}}{{else}}触发时间: {{timeformat .TriggerTime}}
触发时值: {{.TriggerValue}}{{end}}
发送时间: {{timestamp}}
{{$domain := "http://请联系管理员修改通知模板将域名替换为实际的域名" }}
事件详情: {{$domain}}/alert-his-events/{{.Id}}
屏蔽1小时: {{$domain}}/alert-mutes/add?busiGroup={{.GroupId}}&cate={{.Cate}}&datasource_ids={{.DatasourceId}}&prod={{.RuleProd}}{{range $key, $value := .TagsMap}}&tags={{$key}}%3D{{$value}}{{end}}`,
LarkCard: `{{ if .IsRecovered }}
{{- if ne .Cate "host"}}
**告警集群:** {{.Cluster}}{{end}}
**级别状态:** S{{.Severity}} Recovered
**告警名称:** {{.RuleName}}
**恢复时间:** {{timeformat .LastEvalTime}}
{{$time_duration := sub now.Unix .FirstTriggerTime }}{{if .IsRecovered}}{{$time_duration = sub .LastEvalTime .FirstTriggerTime }}{{end}}**持续时长**: {{humanizeDurationInterface $time_duration}}
**告警描述:** **服务已恢复**
{{- else }}
{{- if ne .Cate "host"}}
**告警集群:** {{.Cluster}}{{end}}
**级别状态:** S{{.Severity}} Triggered
**告警名称:** {{.RuleName}}
**触发时间:** {{timeformat .TriggerTime}}
**发送时间:** {{timestamp}}
**触发时值:** {{.TriggerValue}}
{{$time_duration := sub now.Unix .FirstTriggerTime }}{{if .IsRecovered}}{{$time_duration = sub .LastEvalTime .FirstTriggerTime }}{{end}}**持续时长**: {{humanizeDurationInterface $time_duration}}
{{if .RuleNote }}**告警描述:** **{{.RuleNote}}**{{end}}
{{- end -}}
{{$domain := "http://请联系管理员修改通知模板将域名替换为实际的域名" }}
[事件详情]({{$domain}}/alert-his-events/{{.Id}})|[屏蔽1小时]({{$domain}}/alert-mutes/add?busiGroup={{.GroupId}}&cate={{.Cate}}&datasource_ids={{.DatasourceId}}&prod={{.RuleProd}}{{range $key, $value := .TagsMap}}&tags={{$key}}%3D{{$value}}{{end}})|[查看曲线]({{$domain}}/metric/explorer?data_source_id={{.DatasourceId}}&data_source_name=prometheus&mode=graph&prom_ql={{.PromQl}})`,
}

View File

@@ -27,9 +27,10 @@ type RecordingRule struct {
QueryConfigs string `json:"-" gorm:"query_configs"` // query_configs
QueryConfigsJson []QueryConfig `json:"query_configs" gorm:"-"` // query_configs for fe
PromEvalInterval int `json:"prom_eval_interval"` // unit:s
AppendTags string `json:"-"` // split by space: service=n9e mod=api
AppendTagsJSON []string `json:"append_tags" gorm:"-"` // for fe
Note string `json:"note"` // note
CronPattern string `json:"cron_pattern"`
AppendTags string `json:"-"` // split by space: service=n9e mod=api
AppendTagsJSON []string `json:"append_tags" gorm:"-"` // for fe
Note string `json:"note"` // note
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
@@ -68,8 +69,12 @@ func (re *RecordingRule) DB2FE() error {
json.Unmarshal([]byte(re.DatasourceIds), &re.DatasourceIdsJson)
json.Unmarshal([]byte(re.QueryConfigs), &re.QueryConfigsJson)
return nil
if re.CronPattern == "" && re.PromEvalInterval != 0 {
re.CronPattern = fmt.Sprintf("@every %ds", re.PromEvalInterval)
}
return nil
}
func (re *RecordingRule) Verify() error {
@@ -99,6 +104,10 @@ func (re *RecordingRule) Verify() error {
re.PromEvalInterval = 60
}
if re.CronPattern == "" {
re.CronPattern = "@every 60s"
}
re.AppendTags = strings.TrimSpace(re.AppendTags)
rer := strings.Fields(re.AppendTags)
for i := 0; i < len(rer); i++ {

View File

@@ -85,43 +85,68 @@ func TargetDel(ctx *ctx.Context, idents []string) error {
return DB(ctx).Where("ident in ?", idents).Delete(new(Target)).Error
}
func buildTargetWhere(ctx *ctx.Context, bgids []int64, dsIds []int64, query string, downtime int64) *gorm.DB {
session := DB(ctx).Model(&Target{})
type BuildTargetWhereOption func(session *gorm.DB) *gorm.DB
if len(bgids) > 0 {
session = session.Where("group_id in (?)", bgids)
}
if len(dsIds) > 0 {
session = session.Where("datasource_id in (?)", dsIds)
}
if downtime > 0 {
session = session.Where("update_at < ?", time.Now().Unix()-downtime)
}
if query != "" {
arr := strings.Fields(query)
for i := 0; i < len(arr); i++ {
q := "%" + arr[i] + "%"
session = session.Where("ident like ? or note like ? or tags like ?", q, q, q)
// BuildTargetWhereWithBgids returns an option that restricts the target
// query to the given busi-group ids; an empty list adds no condition.
func BuildTargetWhereWithBgids(bgids []int64) BuildTargetWhereOption {
	return func(session *gorm.DB) *gorm.DB {
		if len(bgids) == 0 {
			return session
		}
		return session.Where("group_id in (?)", bgids)
	}
}
// BuildTargetWhereWithDsIds returns an option that restricts the target
// query to the given datasource ids; an empty list adds no condition.
func BuildTargetWhereWithDsIds(dsIds []int64) BuildTargetWhereOption {
	return func(session *gorm.DB) *gorm.DB {
		if len(dsIds) == 0 {
			return session
		}
		return session.Where("datasource_id in (?)", dsIds)
	}
}
// BuildTargetWhereWithQuery returns an option that applies a fuzzy search:
// each whitespace-separated word of query must LIKE-match ident, note or
// tags. An empty query adds no condition.
func BuildTargetWhereWithQuery(query string) BuildTargetWhereOption {
	return func(session *gorm.DB) *gorm.DB {
		if query == "" {
			return session
		}
		for _, word := range strings.Fields(query) {
			like := "%" + word + "%"
			session = session.Where("ident like ? or note like ? or tags like ?", like, like, like)
		}
		return session
	}
}
// BuildTargetWhereWithDowntime returns an option that keeps only targets
// whose update_at is more than `downtime` seconds in the past; a
// non-positive downtime adds no condition.
func BuildTargetWhereWithDowntime(downtime int64) BuildTargetWhereOption {
	return func(session *gorm.DB) *gorm.DB {
		if downtime <= 0 {
			return session
		}
		return session.Where("update_at < ?", time.Now().Unix()-downtime)
	}
}
// buildTargetWhere starts a query over the target table and applies each
// filter option to it in order, returning the composed session.
func buildTargetWhere(ctx *ctx.Context, options ...BuildTargetWhereOption) *gorm.DB {
	session := DB(ctx).Model(&Target{})
	for _, applyOption := range options {
		session = applyOption(session)
	}
	return session
}
func TargetTotalCount(ctx *ctx.Context) (int64, error) {
return Count(DB(ctx).Model(new(Target)))
// TargetTotal counts the targets matching the given filter options.
func TargetTotal(ctx *ctx.Context, options ...BuildTargetWhereOption) (int64, error) {
	session := buildTargetWhere(ctx, options...)
	return Count(session)
}
func TargetTotal(ctx *ctx.Context, bgids []int64, dsIds []int64, query string, downtime int64) (int64, error) {
return Count(buildTargetWhere(ctx, bgids, dsIds, query, downtime))
}
func TargetGets(ctx *ctx.Context, bgids []int64, dsIds []int64, query string, downtime int64, limit, offset int) ([]*Target, error) {
func TargetGets(ctx *ctx.Context, limit, offset int, order string, desc bool, options ...BuildTargetWhereOption) ([]*Target, error) {
var lst []*Target
err := buildTargetWhere(ctx, bgids, dsIds, query, downtime).Order("ident").Limit(limit).Offset(offset).Find(&lst).Error
if desc {
order += " desc"
} else {
order += " asc"
}
err := buildTargetWhere(ctx, options...).Order(order).Limit(limit).Offset(offset).Find(&lst).Error
if err == nil {
for i := 0; i < len(lst); i++ {
lst[i].TagsJSON = strings.Fields(lst[i].Tags)

View File

@@ -187,6 +187,11 @@ func (t *TaskTpl) CleanFields() error {
return nil
}
// TaskTplHost is a row of the task_tpl_host relation table, binding one
// host to a task template: Id is the owning TaskTpl's id, Host is the
// bound host. TaskTpl.Save inserts these records.
type TaskTplHost struct {
Id int64 `json:"id"`
Host string `json:"host"`
}
func (t *TaskTpl) Save(ctx *ctx.Context, hosts []string) error {
if err := t.CleanFields(); err != nil {
return err
@@ -212,10 +217,12 @@ func (t *TaskTpl) Save(ctx *ctx.Context, hosts []string) error {
continue
}
err := tx.Table("task_tpl_host").Create(map[string]interface{}{
"id": t.Id,
"host": host,
}).Error
taskTplHost := TaskTplHost{
Id: t.Id,
Host: host,
}
err := tx.Table("task_tpl_host").Create(&taskTplHost).Error
if err != nil {
return err

View File

@@ -1,6 +1,10 @@
package models
import (
"bytes"
"fmt"
"strconv"
"github.com/prometheus/common/model"
)
@@ -12,6 +16,26 @@ type DataResp struct {
Query string `json:"query"`
}
// String renders the response on one line for logging/debugging: ref,
// metric, labels, the raw value rows, and the originating query.
// NOTE(review): each cell is truncated via int64() before formatting —
// if Values carries float samples the fractional part is dropped here;
// confirm that is intended.
func (d *DataResp) String() string {
	var sb bytes.Buffer
	fmt.Fprintf(&sb, "Ref: %s ", d.Ref)
	fmt.Fprintf(&sb, "Metric: %+v ", d.Metric)
	fmt.Fprintf(&sb, "Labels: %s ", d.Labels)
	sb.WriteString("Values: ")
	for _, row := range d.Values {
		sb.WriteString(" [")
		for i, cell := range row {
			if i > 0 {
				sb.WriteString(", ")
			}
			sb.WriteString(strconv.FormatInt(int64(cell), 10))
		}
		sb.WriteString("] ")
	}
	fmt.Fprintf(&sb, "Query: %s ", d.Query)
	return sb.String()
}
func (d *DataResp) Last() (float64, float64, bool) {
if len(d.Values) == 0 {
return 0, 0, false

View File

@@ -3,12 +3,15 @@ package models
import (
"fmt"
"os"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/ormx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/storage"
"github.com/redis/go-redis/v9"
"github.com/pkg/errors"
"github.com/tidwall/gjson"
@@ -27,26 +30,32 @@ const (
Telegram = "telegram"
Email = "email"
EmailSubject = "mailsubject"
Lark = "lark"
LarkCard = "larkcard"
DingtalkKey = "dingtalk_robot_token"
WecomKey = "wecom_robot_token"
FeishuKey = "feishu_robot_token"
MmKey = "mm_webhook_url"
TelegramKey = "telegram_robot_token"
LarkKey = "lark_robot_token"
DingtalkDomain = "oapi.dingtalk.com"
WecomDomain = "qyapi.weixin.qq.com"
FeishuDomain = "open.feishu.cn"
LarkDomain = "open.larksuite.com"
// FeishuCardDomain The domain name of the feishu card is the same as the feishu,distinguished by the parameter
FeishuCardDomain = "open.feishu.cn?card=1"
LarkCardDomain = "open.larksuite.com?card=1"
TelegramDomain = "api.telegram.org"
IbexDomain = "ibex"
DefaultDomain = "default"
)
var (
DefaultChannels = []string{Dingtalk, Wecom, Feishu, Mm, Telegram, Email, FeishuCard}
DefaultChannels = []string{Dingtalk, Wecom, Feishu, Mm, Telegram, Email, FeishuCard, Lark, LarkCard}
DefaultContacts = []string{DingtalkKey, WecomKey, FeishuKey, MmKey, TelegramKey, LarkKey}
)
type User struct {
@@ -205,8 +214,10 @@ func (u *User) Add(ctx *ctx.Context) error {
}
func (u *User) Update(ctx *ctx.Context, selectField interface{}, selectFields ...interface{}) error {
if err := u.Verify(); err != nil {
return err
if u.Belong == "" {
if err := u.Verify(); err != nil {
return err
}
}
return DB(ctx).Model(u).Select(selectField, selectFields...).Updates(u).Error
@@ -324,13 +335,106 @@ func InitRoot(ctx *ctx.Context) {
fmt.Println("root password init done")
}
func PassLogin(ctx *ctx.Context, username, pass string) (*User, error) {
// reachLoginFailCount reports whether username has accumulated at least
// `count` recorded login failures in redis. A missing key means no
// recorded failures, so it reports false without error.
func reachLoginFailCount(ctx *ctx.Context, redisObj storage.Redis, username string, count int64) (bool, error) {
	key := "/userlogin/errorcount/" + username

	val, err := redisObj.Get(ctx.GetContext(), key).Result()
	if err != nil {
		if err == redis.Nil {
			return false, nil
		}
		return false, err
	}

	failures, err := strconv.ParseInt(val, 10, 64)
	if err != nil {
		return false, err
	}
	return failures >= count, nil
}
// incrLoginFailCount bumps the login-failure counter for username in
// redis and (re)arms its expiry to `seconds`. Any read or parse problem
// is logged and the counter restarts from "1" — the caller never fails
// because of bookkeeping trouble.
func incrLoginFailCount(ctx *ctx.Context, redisObj storage.Redis, username string, seconds int64) {
	key := "/userlogin/errorcount/" + username
	ttl := time.Duration(seconds) * time.Second

	// startOver resets the counter to one failure with a fresh TTL.
	startOver := func() {
		redisObj.Set(ctx.GetContext(), key, "1", ttl)
	}

	val, err := redisObj.Get(ctx.GetContext(), key).Result()
	if err == redis.Nil {
		startOver()
		return
	}
	if err != nil {
		logger.Warningf("login_fail_count: failed to get redis value. key:%s, error:%s", key, err)
		startOver()
		return
	}

	prev, err := strconv.ParseInt(val, 10, 64)
	if err != nil {
		logger.Warningf("login_fail_count: failed to parse int64. key:%s, error:%s", key, err)
		startOver()
		return
	}

	redisObj.Set(ctx.GetContext(), key, strconv.FormatInt(prev+1, 10), ttl)
}
func PassLogin(ctx *ctx.Context, redis storage.Redis, username, pass string) (*User, error) {
// 300 5 meaning: 300 seconds, 5 times
val, err := ConfigsGet(ctx, "login_fail_count")
if err != nil {
return nil, err
}
var (
needCheck = val != "" // DB 里有配置,说明启用了这个 feature
seconds int64
count int64
)
if needCheck {
pair := strings.Fields(val)
if len(pair) != 2 {
logger.Warningf("login_fail_count config invalid: %s", val)
needCheck = false
} else {
seconds, err = strconv.ParseInt(pair[0], 10, 64)
if err != nil {
logger.Warningf("login_fail_count seconds invalid: %s", pair[0])
needCheck = false
}
count, err = strconv.ParseInt(pair[1], 10, 64)
if err != nil {
logger.Warningf("login_fail_count count invalid: %s", pair[1])
needCheck = false
}
}
}
if needCheck {
reach, err := reachLoginFailCount(ctx, redis, username, count)
if err != nil {
return nil, err
}
if reach {
return nil, fmt.Errorf("reach login fail count")
}
}
user, err := UserGetByUsername(ctx, username)
if err != nil {
return nil, err
}
if user == nil {
if needCheck {
incrLoginFailCount(ctx, redis, username, seconds)
}
return nil, fmt.Errorf("Username or password invalid")
}
@@ -340,6 +444,9 @@ func PassLogin(ctx *ctx.Context, username, pass string) (*User, error) {
}
if loginPass != user.Password {
if needCheck {
incrLoginFailCount(ctx, redis, username, seconds)
}
return nil, fmt.Errorf("Username or password invalid")
}
@@ -724,6 +831,9 @@ func (u *User) ExtractToken(key string) (string, bool) {
return ret.String(), ret.Exists()
case Email:
return u.Email, u.Email != ""
case Lark, LarkCard:
ret := gjson.GetBytes(bs, LarkKey)
return ret.String(), ret.Exists()
default:
return "", false
}

View File

@@ -70,7 +70,11 @@ func (ugs *UserGroupSyncer) syncTeamMember() error {
if err != nil {
return err
}
err = ugs.addMemberToFDTeam(users)
toDutyErr := ugs.addMemberToFDTeam(users)
if toDutyErr != nil {
logger.Warningf("failed to sync user group %s %v to flashduty's team: %v", ugs.ug.Name, users, toDutyErr)
}
return err
}

View File

@@ -49,7 +49,8 @@ var I18N = `
"url path invalid":"url非法",
"no such server":"无此实例",
"admin role can not be modified":"管理员角色不允许修改",
"builtin payload already exists":"内置模板已存在"
"builtin payload already exists":"内置模板已存在",
"This functionality has not been enabled. Please contact the system administrator to activate it.":"此功能尚未启用。请联系系统管理员启用"
},
"zh_CN": {
"Username or password invalid": "用户名或密码错误",
@@ -101,7 +102,8 @@ var I18N = `
"admin role can not be modified":"管理员角色不允许修改",
"builtin payload already exists":"内置模板已存在",
"builtin metric already exists":"内置指标已存在",
"AlertRule already exists":"告警规则已存在"
"AlertRule already exists":"告警规则已存在",
"This functionality has not been enabled. Please contact the system administrator to activate it.":"此功能尚未启用。请联系系统管理员启用"
}
}
`

View File

@@ -34,6 +34,7 @@ type Config struct {
TLS bool
StartTLS bool
DefaultRoles []string
DefaultTeams []int64
RoleTeamMapping []RoleTeamMapping
}
@@ -55,6 +56,7 @@ type SsoClient struct {
TLS bool
StartTLS bool
DefaultRoles []string
DefaultTeams []int64
RoleTeamMapping map[string]RoleTeamMapping
Ticker *time.Ticker
@@ -109,6 +111,7 @@ func (s *SsoClient) Reload(cf Config) {
s.TLS = cf.TLS
s.StartTLS = cf.StartTLS
s.DefaultRoles = cf.DefaultRoles
s.DefaultTeams = cf.DefaultTeams
s.SyncAdd = cf.SyncAddUsers
s.SyncDel = cf.SyncDelUsers
s.SyncInterval = cf.SyncInterval
@@ -135,8 +138,11 @@ func (s *SsoClient) Copy() *SsoClient {
newRoles := make([]string, len(s.DefaultRoles))
copy(newRoles, s.DefaultRoles)
newTeams := make([]int64, len(s.DefaultTeams))
copy(newTeams, s.DefaultTeams)
lc := *s
lc.DefaultRoles = newRoles
lc.DefaultTeams = newTeams
s.RUnlock()
@@ -291,7 +297,7 @@ func (s *SsoClient) genLdapAttributeSearchList() []string {
return ldapAttributes
}
func LdapLogin(ctx *ctx.Context, username, pass string, defaultRoles []string, ldap *SsoClient) (*models.User, error) {
func LdapLogin(ctx *ctx.Context, username, pass string, defaultRoles []string, defaultTeams []int64, ldap *SsoClient) (*models.User, error) {
sr, err := ldap.LoginCheck(username, pass)
if err != nil {
return nil, err
@@ -331,6 +337,10 @@ func LdapLogin(ctx *ctx.Context, username, pass string, defaultRoles []string, l
}
}
if len(roleTeamMapping.Teams) == 0 {
roleTeamMapping.Teams = defaultTeams
}
// Synchronize group information
if err = models.UserGroupMemberSync(ctx, roleTeamMapping.Teams, user.Id, coverTeams); err != nil {
logger.Errorf("ldap.error: failed to update user(%s) group member err: %+v", user, err)
@@ -347,6 +357,15 @@ func LdapLogin(ctx *ctx.Context, username, pass string, defaultRoles []string, l
return nil, errors.WithMessage(err, "failed to add user")
}
if len(roleTeamMapping.Teams) == 0 {
for _, gid := range defaultTeams {
err = models.UserGroupMemberAdd(ctx, gid, user.Id)
if err != nil {
logger.Errorf("user:%v gid:%d UserGroupMemberAdd: %s", user, gid, err)
}
}
}
if err = models.UserGroupMemberSync(ctx, roleTeamMapping.Teams, user.Id, false); err != nil {
logger.Errorf("ldap.error: failed to update user(%s) group member err: %+v", user, err)
}

View File

@@ -32,6 +32,7 @@ type SsoClient struct {
Email string
}
DefaultRoles []string
DefaultTeams []int64
Ctx context.Context
Provider *oidc.Provider
@@ -55,6 +56,7 @@ type Config struct {
Email string
}
DefaultRoles []string
DefaultTeams []int64
Scopes []string
}
@@ -90,6 +92,7 @@ func (s *SsoClient) Reload(cf Config) error {
s.Attributes.Email = cf.Attributes.Email
s.DisplayName = cf.DisplayName
s.DefaultRoles = cf.DefaultRoles
s.DefaultTeams = cf.DefaultTeams
s.Ctx = context.Background()
if cf.SkipTlsVerify {

View File

@@ -52,14 +52,16 @@ type WriterOptions struct {
}
type RelabelConfig struct {
SourceLabels model.LabelNames
Separator string
Regex string
SourceLabels model.LabelNames `json:"source_labels"`
Separator string `json:"separator"`
Regex string `json:"regex"`
RegexCompiled *regexp.Regexp
Modulus uint64
TargetLabel string
Replacement string
Action string
If string `json:"if"`
IfRegex *regexp.Regexp
Modulus uint64 `json:"modulus"`
TargetLabel string `json:"target_label"`
Replacement string `json:"replacement"`
Action string `json:"action"`
}
func (p *Pushgw) PreCheck() {

View File

@@ -136,7 +136,7 @@ func matchSample(filterMap, sampleMap map[string]string) bool {
}
func (rt *Router) ForwardByIdent(clientIP string, ident string, v *prompb.TimeSeries) {
rt.BeforePush(clientIP, v)
v = rt.BeforePush(clientIP, v)
if v == nil {
return
}
@@ -157,7 +157,7 @@ func (rt *Router) ForwardByIdent(clientIP string, ident string, v *prompb.TimeSe
}
func (rt *Router) ForwardByMetric(clientIP string, metric string, v *prompb.TimeSeries) {
rt.BeforePush(clientIP, v)
v = rt.BeforePush(clientIP, v)
if v == nil {
return
}
@@ -177,7 +177,7 @@ func (rt *Router) ForwardByMetric(clientIP string, metric string, v *prompb.Time
rt.Writers.PushSample(hashkey, *v)
}
func (rt *Router) BeforePush(clientIP string, v *prompb.TimeSeries) {
rt.HandleTS(v)
// BeforePush runs the pre-write pipeline for one time series: it first
// records a debug sample for the reporting client, then applies the
// configured HandleTS hook. The hook's result is returned to the caller;
// a nil result means the series was relabeled away and must be skipped.
func (rt *Router) BeforePush(clientIP string, v *prompb.TimeSeries) *prompb.TimeSeries {
	rt.debugSample(clientIP, v)
	processed := rt.HandleTS(v)
	return processed
}

View File

@@ -14,7 +14,7 @@ import (
"github.com/ccfos/nightingale/v6/pushgw/writer"
)
type HandleTSFunc func(pt *prompb.TimeSeries)
type HandleTSFunc func(pt *prompb.TimeSeries) *prompb.TimeSeries
type Router struct {
HTTP httpx.Config
@@ -27,6 +27,7 @@ type Router struct {
Writers *writer.WritersType
Ctx *ctx.Context
HandleTS HandleTSFunc
HeartbeartApi string
}
func New(httpConfig httpx.Config, pushgw pconf.Pushgw, aconf aconf.Alert, tc *memsto.TargetCacheType, bg *memsto.BusiGroupCacheType,
@@ -42,7 +43,7 @@ func New(httpConfig httpx.Config, pushgw pconf.Pushgw, aconf aconf.Alert, tc *me
BusiGroupCache: bg,
IdentSet: idents,
MetaSet: metas,
HandleTS: func(pt *prompb.TimeSeries) {},
HandleTS: func(pt *prompb.TimeSeries) *prompb.TimeSeries { return pt },
}
}

View File

@@ -6,14 +6,32 @@ import (
"io"
"time"
"github.com/ccfos/nightingale/v6/center/metas"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/ginx"
"github.com/toolkits/pkg/logger"
)
// heartbeat Forward heartbeat request to the center.
func (rt *Router) heartbeat(c *gin.Context) {
gid := ginx.QueryStr(c, "gid", "")
req, err := HandleHeartbeat(c, rt.Aconf.Heartbeat.EngineName, rt.MetaSet)
if err != nil {
logger.Warningf("req:%v heartbeat failed to handle heartbeat err:%v", req, err)
ginx.Dangerous(err)
}
api := "/v1/n9e/heartbeat"
if rt.HeartbeartApi != "" {
api = rt.HeartbeartApi
}
ret, err := poster.PostByUrlsWithResp[map[string]interface{}](rt.Ctx, api+"?gid="+gid, req)
ginx.NewRender(c).Data(ret, err)
}
func HandleHeartbeat(c *gin.Context, engineName string, metaSet *metas.Set) (models.HostMeta, error) {
var bs []byte
var err error
var r *gzip.Reader
@@ -21,28 +39,35 @@ func (rt *Router) heartbeat(c *gin.Context) {
if c.GetHeader("Content-Encoding") == "gzip" {
r, err = gzip.NewReader(c.Request.Body)
if err != nil {
c.String(400, err.Error())
return
return req, err
}
defer r.Close()
bs, err = io.ReadAll(r)
ginx.Dangerous(err)
if err != nil {
return req, err
}
} else {
defer c.Request.Body.Close()
bs, err = io.ReadAll(c.Request.Body)
ginx.Dangerous(err)
if err != nil {
return req, err
}
}
err = json.Unmarshal(bs, &req)
ginx.Dangerous(err)
if err != nil {
return req, err
}
if req.Hostname == "" {
ginx.Dangerous("hostname is required", 400)
}
req.Offset = (time.Now().UnixMilli() - req.UnixTime)
req.RemoteAddr = c.ClientIP()
gid := ginx.QueryStr(c, "gid", "")
req.EngineName = engineName
metaSet.Set(req.Hostname, req)
req.EngineName = rt.Aconf.Heartbeat.EngineName
rt.MetaSet.Set(req.Hostname, req)
ginx.NewRender(c).Message(poster.PostByUrls(rt.Ctx, "/v1/n9e/heartbeat?gid="+gid, req))
return req, nil
}

View File

@@ -21,9 +21,10 @@ func extractMetricFromTimeSeries(s *prompb.TimeSeries) string {
return ""
}
func extractIdentFromTimeSeries(s *prompb.TimeSeries, ignoreIdent bool, identMetrics []string) (string, string) {
// 返回的第二个参数bool表示是否需要把 ident 写入 target 表
func extractIdentFromTimeSeries(s *prompb.TimeSeries, ignoreIdent, ignoreHost bool, identMetrics []string) (string, bool) {
if s == nil {
return "", ""
return "", false
}
labelMap := make(map[string]int)
@@ -32,14 +33,24 @@ func extractIdentFromTimeSeries(s *prompb.TimeSeries, ignoreIdent bool, identMet
}
var ident string
var heartbeatIdent string
// agent_hostname for grafana-agent and categraf
if idx, ok := labelMap["agent_hostname"]; ok {
s.Labels[idx].Name = "ident"
// 如果标签中有ident则直接使用
if idx, ok := labelMap["ident"]; ok {
ident = s.Labels[idx].Value
}
if !ignoreIdent && ident == "" {
if ident == "" {
// 没有 ident 标签,尝试使用 agent_hostname 作为 ident
// agent_hostname for grafana-agent and categraf
if idx, ok := labelMap["agent_hostname"]; ok {
s.Labels[idx].Name = "ident"
ident = s.Labels[idx].Value
}
}
if !ignoreHost && ident == "" {
// agent_hostname 没有,那就使用 host 作为 ident用于 telegraf 的场景
// 但是,有的时候 nginx 采集的指标中带有 host 标签表示域名,这个时候就不能用 host 作为 ident此时需要在 url 中设置 ignore_host=true
// telegraf, output plugin: http, format: prometheusremotewrite
if idx, ok := labelMap["host"]; ok {
s.Labels[idx].Name = "ident"
@@ -47,11 +58,11 @@ func extractIdentFromTimeSeries(s *prompb.TimeSeries, ignoreIdent bool, identMet
}
}
if idx, ok := labelMap["ident"]; ok {
ident = s.Labels[idx].Value
if ident == "" {
// 上报的监控数据中并没有 ident 信息
return "", false
}
heartbeatIdent = ident
if len(identMetrics) > 0 {
metricFound := false
for _, identMetric := range identMetrics {
@@ -62,11 +73,11 @@ func extractIdentFromTimeSeries(s *prompb.TimeSeries, ignoreIdent bool, identMet
}
if !metricFound {
heartbeatIdent = ""
return ident, false
}
}
return ident, heartbeatIdent
return ident, !ignoreIdent
}
func duplicateLabelKey(series *prompb.TimeSeries) bool {
@@ -101,14 +112,18 @@ func (rt *Router) remoteWrite(c *gin.Context) {
return
}
var ids = make(map[string]struct{})
var (
ignoreIdent = ginx.QueryBool(c, "ignore_ident", false)
ignoreHost = ginx.QueryBool(c, "ignore_host", false)
ids = make(map[string]struct{})
)
for i := 0; i < count; i++ {
if duplicateLabelKey(&req.Timeseries[i]) {
continue
}
ident, heartbeatIdent := extractIdentFromTimeSeries(&req.Timeseries[i], ginx.QueryBool(c, "ignore_ident", false), rt.Pushgw.IdentMetrics)
ident, insertTarget := extractIdentFromTimeSeries(&req.Timeseries[i], ignoreIdent, ignoreHost, rt.Pushgw.IdentMetrics)
if len(ident) > 0 {
// enrich host labels
target, has := rt.TargetCache.Get(ident)
@@ -117,7 +132,7 @@ func (rt *Router) remoteWrite(c *gin.Context) {
}
}
if len(heartbeatIdent) > 0 {
if insertTarget {
// has ident tag or agent_hostname tag
// register host in table target
ids[ident] = struct{}{}

View File

@@ -3,24 +3,28 @@ package writer
import (
"crypto/md5"
"fmt"
"regexp"
"sort"
"strings"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/toolkits/pkg/logger"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/prompb"
)
const (
Replace string = "replace"
Keep string = "keep"
Drop string = "drop"
HashMod string = "hashmod"
LabelMap string = "labelmap"
LabelDrop string = "labeldrop"
LabelKeep string = "labelkeep"
Lowercase string = "lowercase"
Uppercase string = "uppercase"
Replace string = "replace"
Keep string = "keep"
Drop string = "drop"
HashMod string = "hashmod"
LabelMap string = "labelmap"
LabelDrop string = "labeldrop"
LabelKeep string = "labelkeep"
Lowercase string = "lowercase"
Uppercase string = "uppercase"
DropIfEqual string = "drop_if_equal"
)
func Process(labels []prompb.Label, cfgs ...*pconf.RelabelConfig) []prompb.Label {
@@ -55,10 +59,6 @@ func newBuilder(ls []prompb.Label) *LabelBuilder {
}
func (l *LabelBuilder) set(k, v string) *LabelBuilder {
if v == "" {
return l.del(k)
}
l.LabelSet[k] = v
return l
}
@@ -96,9 +96,17 @@ func relabel(lset []prompb.Label, cfg *pconf.RelabelConfig) []prompb.Label {
}
regx := cfg.RegexCompiled
if regx == nil {
regx = compileRegex(cfg.Regex)
}
if regx == nil {
return lset
}
val := strings.Join(values, cfg.Separator)
lb := newBuilder(lset)
switch cfg.Action {
case Drop:
if regx.MatchString(val) {
@@ -109,21 +117,7 @@ func relabel(lset []prompb.Label, cfg *pconf.RelabelConfig) []prompb.Label {
return nil
}
case Replace:
indexes := regx.FindStringSubmatchIndex(val)
if indexes == nil {
break
}
target := model.LabelName(regx.ExpandString([]byte{}, cfg.TargetLabel, val, indexes))
if !target.IsValid() {
lb.del(cfg.TargetLabel)
break
}
res := regx.ExpandString([]byte{}, cfg.Replacement, val, indexes)
if len(res) == 0 {
lb.del(cfg.TargetLabel)
break
}
lb.set(string(target), string(res))
return handleReplace(lb, regx, cfg, val, lset)
case Lowercase:
lb.set(cfg.TargetLabel, strings.ToLower(val))
case Uppercase:
@@ -150,13 +144,84 @@ func relabel(lset []prompb.Label, cfg *pconf.RelabelConfig) []prompb.Label {
lb.del(l.Name)
}
}
case DropIfEqual:
return handleDropIfEqual(lb, cfg, lset)
default:
panic(fmt.Errorf("relabel: unknown relabel action type %q", cfg.Action))
logger.Errorf("relabel: unknown relabel action type %q", cfg.Action)
}
return lb.labels()
}
// handleReplace implements the "replace" relabel action.
//
// Behavior, in order of precedence:
//  1. No source_labels: unconditionally set target_label to Replacement
//     (adds a brand-new label).
//  2. Empty Replacement with multiple source_labels: store the joined
//     source values (val, already joined with cfg.Separator) in
//     target_label — i.e. build a new label from existing ones.
//  3. A compiled regex: expand capture references ($1, ...) from val into
//     both the target label name and the replacement value, mirroring
//     Prometheus relabel semantics; a non-match leaves the set untouched,
//     and an empty expansion deletes the target label.
//  4. Fallback (no regex): set target_label to Replacement verbatim.
//
// lset is unused here but kept for signature parity with the other
// action handlers. The (possibly modified) label set is returned via
// lb.labels().
func handleReplace(lb *LabelBuilder, regx *regexp.Regexp, cfg *pconf.RelabelConfig, val string, lset []prompb.Label) []prompb.Label {
	// No source_labels: simply add/overwrite the target label.
	if len(cfg.SourceLabels) == 0 {
		lb.set(cfg.TargetLabel, cfg.Replacement)
		return lb.labels()
	}

	// Empty Replacement with several source labels: construct the new
	// label from the joined source values.
	if cfg.Replacement == "" && len(cfg.SourceLabels) > 1 {
		lb.set(cfg.TargetLabel, val)
		return lb.labels()
	}

	// Regex-driven replacement (rewrite a label value via capture groups).
	if regx != nil {
		indexes := regx.FindStringSubmatchIndex(val)
		if indexes == nil {
			// No match: leave the label set untouched.
			return lb.labels()
		}

		// Expand capture references in target_label as well, matching
		// Prometheus relabel semantics (supports target_label: "$1").
		target := model.LabelName(regx.ExpandString([]byte{}, cfg.TargetLabel, val, indexes))
		if !target.IsValid() {
			lb.del(cfg.TargetLabel)
			return lb.labels()
		}

		res := regx.ExpandString([]byte{}, cfg.Replacement, val, indexes)
		if len(res) == 0 {
			// An empty expansion removes the target label entirely.
			lb.del(cfg.TargetLabel)
		} else {
			lb.set(string(target), string(res))
		}
		return lb.labels()
	}

	// Default: set the target label to the literal replacement.
	lb.set(cfg.TargetLabel, cfg.Replacement)
	return lb.labels()
}
// handleDropIfEqual implements the "drop_if_equal" relabel action: the
// whole series is dropped (nil is returned) when every configured source
// label carries the same value. Fewer than two source labels makes the
// comparison meaningless, so the labels are returned unchanged.
func handleDropIfEqual(lb *LabelBuilder, cfg *pconf.RelabelConfig, lset []prompb.Label) []prompb.Label {
	if len(cfg.SourceLabels) < 2 {
		return lb.labels()
	}

	ref := getValue(lset, cfg.SourceLabels[0])
	for _, name := range cfg.SourceLabels[1:] {
		if getValue(lset, name) != ref {
			// At least one value differs: keep the series as-is.
			return lb.labels()
		}
	}

	// All source label values matched: drop the series.
	return nil
}
// compileRegex compiles expr into a *regexp.Regexp. On failure it logs
// the error and returns nil so callers can treat an invalid expression
// as "no regex configured" instead of failing the whole pipeline.
func compileRegex(expr string) *regexp.Regexp {
	re, err := regexp.Compile(expr)
	if err != nil {
		logger.Error("failed to compile regexp:", expr, "error:", err)
		return nil
	}
	return re
}
func sum64(hash [md5.Size]byte) uint64 {
var s uint64

View File

@@ -0,0 +1,406 @@
// @Author: Ciusyan 6/19/24
package writer
import (
"reflect"
"sort"
"testing"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/prompb"
)
// TestProcess drives the relabel pipeline (Process) through a table of
// Prometheus-style relabel configurations covering the supported
// actions: replace (add / update / rewrite labels), drop, labeldrop,
// labelkeep, labelmap, drop_if_equal and conditional relabeling via
// If/IfRegex. A nil expected slice means the whole series is dropped.
func TestProcess(t *testing.T) {
	tests := []struct {
		name     string
		labels   []prompb.Label
		cfgs     []*pconf.RelabelConfig
		expected []prompb.Label
	}{
		// 1. Adding a new label
		{
			name:   "Adding new label",
			labels: []prompb.Label{{Name: "job", Value: "aa"}},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:      "replace",
					TargetLabel: "foo",
					Replacement: "bar",
				},
			},
			expected: []prompb.Label{{Name: "job", Value: "aa"}, {Name: "foo", Value: "bar"}},
		},
		// 2. Updating an existing label
		{
			name:   "Updating existing label",
			labels: []prompb.Label{{Name: "foo", Value: "aaaa"}},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:      "replace",
					TargetLabel: "foo",
					Replacement: "bar",
				},
			},
			expected: []prompb.Label{{Name: "foo", Value: "bar"}},
		},
		// 3. Rewriting an existing label
		{
			name:   "Rewriting existing label",
			labels: []prompb.Label{{Name: "instance", Value: "bar:123"}},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:       "replace",
					SourceLabels: model.LabelNames{"instance"},
					Regex:        "([^:]+):.+",
					TargetLabel:  "instance",
					Replacement:  "$1",
				},
			},
			expected: []prompb.Label{{Name: "instance", Value: "bar"}},
		},
		{
			name:   "Rewriting existing label",
			labels: []prompb.Label{{Name: "instance", Value: "bar:123"}},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:       "replace",
					SourceLabels: model.LabelNames{"instance"},
					Regex:        ":([0-9]+)$",
					TargetLabel:  "port",
					Replacement:  "$1",
				},
			},
			expected: []prompb.Label{{Name: "port", Value: "123"}, {Name: "instance", Value: "bar:123"}},
		},
		// 4. Updating the metric name
		{
			name:   "Updating metric name",
			labels: []prompb.Label{{Name: "__name__", Value: "foo_suffix"}},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:       "replace",
					SourceLabels: model.LabelNames{"__name__"},
					Regex:        "(.+)_suffix",
					TargetLabel:  "__name__",
					Replacement:  "prefix_$1",
				},
			},
			expected: []prompb.Label{{Name: "__name__", Value: "prefix_foo"}},
		},
		// 5. Removing unneeded labels / keeping needed ones
		{
			name: "Removing unneeded labels",
			labels: []prompb.Label{
				{Name: "job", Value: "a"},
				{Name: "instance", Value: "xyz"},
				{Name: "foobar", Value: "baz"},
				{Name: "foox", Value: "aaa"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action: "labeldrop",
					Regex:  "foo.+",
				},
			},
			expected: []prompb.Label{
				{Name: "job", Value: "a"},
				{Name: "instance", Value: "xyz"},
			},
		},
		{
			name: "keep needed labels",
			labels: []prompb.Label{
				{Name: "job", Value: "a"},
				{Name: "instance", Value: "xyz"},
				{Name: "foobar", Value: "baz"},
				{Name: "foox", Value: "aaa"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action: "labelkeep",
					Regex:  "foo.+",
				},
			},
			expected: []prompb.Label{
				{Name: "foobar", Value: "baz"},
				{Name: "foox", Value: "aaa"},
			},
		},
		// 6. Removing a specific label value (empty replacement deletes the label)
		{
			name: "Removing the specific label value",
			labels: []prompb.Label{
				{Name: "foo", Value: "bar"},
				{Name: "baz", Value: "x"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:       "replace",
					SourceLabels: model.LabelNames{"foo"},
					Regex:        "bar",
					TargetLabel:  "foo",
					Replacement:  "",
				},
			},
			expected: []prompb.Label{
				{Name: "baz", Value: "x"},
			},
		},
		// 7. Removing unneeded metrics (drop action; nil expected == series dropped)
		{
			name: "Removing unneeded metrics",
			labels: []prompb.Label{
				{Name: "instance", Value: "foobar1"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:       "drop",
					SourceLabels: model.LabelNames{"instance"},
					Regex:        "foobar.+",
				},
			},
			expected: nil,
		},
		{
			name: "Removing unneeded metrics 2",
			labels: []prompb.Label{
				{Name: "instance", Value: "foobar2"},
				{Name: "job", Value: "xxx"},
				{Name: "aaa", Value: "bb"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:       "drop",
					SourceLabels: model.LabelNames{"instance"},
					Regex:        "foobar.+",
				},
			},
			expected: nil,
		},
		{
			name: "Removing unneeded metrics 3",
			labels: []prompb.Label{
				{Name: "instance", Value: "xxx"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:       "drop",
					SourceLabels: model.LabelNames{"instance"},
					Regex:        "foobar.+",
				},
			},
			expected: []prompb.Label{
				{Name: "instance", Value: "xxx"},
			},
		},
		{
			name: "Removing unneeded metrics 4",
			labels: []prompb.Label{
				{Name: "instance", Value: "abc"},
				{Name: "job", Value: "xyz"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:       "drop",
					SourceLabels: model.LabelNames{"instance"},
					Regex:        "foobar.+",
				},
			},
			expected: []prompb.Label{
				{Name: "instance", Value: "abc"},
				{Name: "job", Value: "xyz"},
			},
		},
		{
			name: "Removing unneeded metrics with multiple labels",
			labels: []prompb.Label{
				{Name: "job", Value: "foo"},
				{Name: "instance", Value: "bar"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:       "drop",
					SourceLabels: model.LabelNames{"job", "instance"},
					Regex:        "foo;bar",
					Separator:    ";",
				},
			},
			expected: nil,
		},
		// 8. Dropping metrics on a certain condition (drop_if_equal)
		{
			name: "Dropping metrics on certain condition",
			labels: []prompb.Label{
				{Name: "real_port", Value: "123"},
				{Name: "needed_port", Value: "123"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:       "drop_if_equal",
					SourceLabels: model.LabelNames{"real_port", "needed_port"},
				},
			},
			expected: nil,
		},
		{
			name: "Dropping metrics on certain condition 2",
			labels: []prompb.Label{
				{Name: "real_port", Value: "123"},
				{Name: "needed_port", Value: "456"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:       "drop_if_equal",
					SourceLabels: model.LabelNames{"real_port", "needed_port"},
				},
			},
			expected: []prompb.Label{
				{Name: "real_port", Value: "123"},
				{Name: "needed_port", Value: "456"},
			},
		},
		// 9. Modifying label names (labelmap keeps the original label too)
		{
			name: "Modifying label names",
			labels: []prompb.Label{
				{Name: "foo_xx", Value: "bb"},
				{Name: "job", Value: "qq"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:      "labelmap",
					Regex:       "foo_(.+)",
					Replacement: "bar_$1",
				},
			},
			expected: []prompb.Label{
				{Name: "foo_xx", Value: "bb"},
				{Name: "bar_xx", Value: "bb"},
				{Name: "job", Value: "qq"},
			},
		},
		// 10. Constructing a label from multiple existing labels
		{
			name: "Constructing a label from multiple existing labels",
			labels: []prompb.Label{
				{Name: "host", Value: "hostname"},
				{Name: "port", Value: "9090"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:       "replace",
					SourceLabels: model.LabelNames{"host", "port"},
					Separator:    ":",
					TargetLabel:  "address",
				},
			},
			expected: []prompb.Label{
				{Name: "host", Value: "hostname"},
				{Name: "port", Value: "9090"},
				{Name: "address", Value: "hostname:9090"},
			},
		},
		// 11. Chaining relabeling rules (configs applied in order)
		{
			name: "Chaining relabeling rules",
			labels: []prompb.Label{
				{Name: "instance", Value: "hostname:9090"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:      "replace",
					TargetLabel: "foo",
					Replacement: "bar",
				},
				{
					Action:       "replace",
					SourceLabels: model.LabelNames{"instance"},
					Regex:        "([^:]+):.*",
					TargetLabel:  "instance",
					Replacement:  "$1",
				},
			},
			expected: []prompb.Label{
				{Name: "instance", Value: "hostname"},
				{Name: "foo", Value: "bar"},
			},
		},
		// 12. Conditional relabeling (rule applies only when IfRegex matches)
		{
			name: "Conditional relabeling matches",
			labels: []prompb.Label{
				{Name: "label", Value: "x"},
				{Name: "foo", Value: "aaa"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:      "replace",
					If:          `label="x|y"`,
					TargetLabel: "foo",
					Replacement: "bar",
					IfRegex:     compileRegex(`label="x|y"`),
				},
			},
			expected: []prompb.Label{
				{Name: "label", Value: "x"},
				{Name: "foo", Value: "bar"},
			},
		},
		{
			name: "Conditional relabeling matches alternative",
			labels: []prompb.Label{
				{Name: "label", Value: "y"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:      "replace",
					If:          `label="x|y"`,
					TargetLabel: "foo",
					Replacement: "bar",
					IfRegex:     compileRegex(`label="x|y"`),
				},
			},
			expected: []prompb.Label{
				{Name: "label", Value: "y"},
				{Name: "foo", Value: "bar"},
			},
		},
		{
			name: "Conditional relabeling does not match",
			labels: []prompb.Label{
				{Name: "label", Value: "z"},
			},
			cfgs: []*pconf.RelabelConfig{
				{
					Action:      "replace",
					If:          `label="x|y"`,
					TargetLabel: "foo",
					Replacement: "bar",
					IfRegex:     compileRegex(`label="x|y"`),
				},
			},
			expected: []prompb.Label{
				{Name: "label", Value: "z"},
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := Process(tt.labels, tt.cfgs...)

			// Sort the slices before comparison so the check is
			// independent of label ordering.
			sort.Slice(got, func(i, j int) bool {
				return got[i].Name < got[j].Name
			})
			sort.Slice(tt.expected, func(i, j int) bool {
				return tt.expected[i].Name < tt.expected[j].Name
			})

			if !reflect.DeepEqual(got, tt.expected) {
				t.Errorf("Process() = %v, want %v", got, tt.expected)
			}
		})
	}
}