Compare commits

...

45 Commits

Author SHA1 Message Date
kongfei
b873bd161e install requests lib for python3 2023-07-25 11:18:28 +08:00
yimiaoxiehou
60b76b9ccc add static ttf file route (#1641)
Co-authored-by: chenzebin <chenzebin@ut.cn>
2023-07-20 19:40:03 +08:00
ning
ef39ee2f66 Merge branch 'main' of github.com:ccfos/nightingale 2023-07-20 18:01:21 +08:00
ning
6c83c2ef9b fix: panic when query data get cli is nil 2023-07-20 18:01:09 +08:00
李明
9495ec67ab feat: support index pattern datasource_id param (#1640)
* feat:support index pattern datasource id param
2023-07-20 14:09:21 +08:00
青牛踏雪
bb5680f6c4 fix windows dashboards label_values (#1639) 2023-07-20 11:49:20 +08:00
李明
acbe49f518 index pattern basic op (#1635)
* index pattern basic op
2023-07-20 11:21:51 +08:00
青牛踏雪
9dd55938c2 add Gitlab dashboard and alert rules based on categraf acquisition (#1636) 2023-07-20 11:17:21 +08:00
shardingHe
5433e6e27e AlertAggrView update verify (#1637)
Co-authored-by: shardingHe <wangzihe@flashcat.cloud>
2023-07-19 20:33:56 +08:00
ning
2dd6eb5f0f fix: get targets 2023-07-19 15:56:25 +08:00
Yening Qin
1731713dbb fix: get all target by guest user (#1634)
* fix: targets api get all

* code refactor
2023-07-19 15:08:10 +08:00
Ulric Qin
327ddb7bad code refactor 2023-07-17 19:47:58 +08:00
Ulric Qin
9e4adc1fa2 code refactor 2023-07-17 17:47:14 +08:00
Ulric Qin
bce7fdb470 code refactor 2023-07-17 17:13:56 +08:00
Ulric Qin
b79422962c code refactor 2023-07-17 17:13:39 +08:00
Ulric Qin
e5989ae5c2 rename integration Mongo to MongoDB 2023-07-17 17:11:27 +08:00
Ulric Qin
64feafa3a6 code refactor 2023-07-17 12:18:15 +08:00
Ulric Qin
52e4fa4d0d rename obs to dumper 2023-07-17 07:04:01 +08:00
Ulric Qin
6462c02861 rename obs to dumper 2023-07-17 07:01:19 +08:00
Ulric Qin
c657182659 refactor forward series 2023-07-16 11:35:56 +08:00
Ulric Qin
04d93eff34 refactor observe functions 2023-07-16 10:29:55 +08:00
Ulric Qin
40d60aeb4a add observe 2023-07-16 10:06:41 +08:00
Ulric Qin
ac875fa1b9 fix logger format output 2023-07-16 06:38:57 +08:00
shardingHe
b7c3e8a4f5 add interface of validation rule (#1606)
* add interface of validation rule

---------

Co-authored-by: shardingHe <wangzihe@flashcat.cloud>
Co-authored-by: Yening Qin <710leo@gmail.com>
2023-07-14 14:16:35 +08:00
ning
2524e15947 Merge branch 'main' of github.com:ccfos/nightingale 2023-07-14 11:45:07 +08:00
ning
995c579403 docs: update built-in alert rule 2023-07-14 11:44:55 +08:00
Ulric Qin
848b7ac1ae Merge branch 'main' of github.com:ccfos/nightingale 2023-07-14 11:37:50 +08:00
Ulric Qin
9476b5ba7c code refactor 2023-07-14 11:37:39 +08:00
ning
7b58696bdc Merge branch 'main' of github.com:ccfos/nightingale 2023-07-14 11:16:19 +08:00
ning
6159178d99 set alert_rule.promql empty 2023-07-14 11:16:06 +08:00
青牛踏雪
99e5e0c117 add MinIO dashboard and alert rules based on categraf acquisition (#1625)
* add MinIO  dashboard and alert rules based on categraf acquisition

* add MinIO dashboard and alert rules based on categraf acquisition

* add MinIO dashboard and alert rules based on categraf acquisition

* add MinIO dashboard and alert rules based on categraf acquisition
2023-07-14 10:37:35 +08:00
Yening Qin
be1a3c1d8b sub and mute rule by severity (#1621)
* sub severity

* mute by severity
2023-07-13 11:16:32 +08:00
Yening Qin
f6378b055c docs: optimize the name of the integrations directory 2023-07-12 18:09:05 +08:00
Yening Qin
2574bb19cd rename integration ceph 2023-07-12 17:59:59 +08:00
Yening Qin
aa9d43cc69 rename integration 2023-07-12 17:58:20 +08:00
李明
d7f18ebec1 add mute hook (#1617) 2023-07-12 16:39:43 +08:00
Ulric Qin
b40f6976bb code refactor 2023-07-12 15:31:09 +08:00
Ulric Qin
cd1db57b7c code refactor 2023-07-12 15:05:58 +08:00
青牛踏雪
5a6ca42c75 add ceph dashboard and alert rules based on categraf acquisition (#1619) 2023-07-11 15:08:00 +08:00
Ulric Qin
80874a743c refactor logic: do not extract ident when ignore_ident exists 2023-07-11 14:56:53 +08:00
ulricqin
6cc612564f fix alert mute compute (#1618) 2023-07-11 10:12:09 +08:00
Yening Qin
909bbb5e66 refactor alert eval (#1616) 2023-07-10 18:49:15 +08:00
青牛踏雪
ff3ea7de58 update postgresql dashboard and alert rules based on categraf acquisition (#1613) 2023-07-07 15:09:27 +08:00
kongfei605
dd316e6ce1 alerts rule and dashboards for pg (#1612) 2023-07-07 14:04:55 +08:00
kongfei605
ba893e77cd update title of tidb alerts (#1611) 2023-07-07 14:04:15 +08:00
233 changed files with 21538 additions and 3894 deletions

3
.gitignore vendored
View File

@@ -31,6 +31,9 @@ _test
/etc/*.local.yml
/etc/*.local.conf
/etc/plugins/*.local.yml
/etc/script/rules.yaml
/etc/script/alert-rules.json
/etc/script/record-rules.json
/data*
/tarball
/run

View File

@@ -15,6 +15,7 @@ import (
"github.com/ccfos/nightingale/v6/alert/router"
"github.com/ccfos/nightingale/v6/alert/sender"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
@@ -47,16 +48,19 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
notifyConfigCache := memsto.NewNotifyConfigCache(ctx)
dsCache := memsto.NewDatasourceCache(ctx, syncStats)
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
promClients := prom.NewPromClient(ctx, config.Alert.Heartbeat)
externalProcessors := process.NewExternalProcessors()
Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients)
Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients, userCache, userGroupCache)
r := httpx.GinEngine(config.Global.RunMode, config.HTTP)
rt := router.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors)
rt.Config(r)
dumper.ConfigRouter(r)
httpClean := httpx.Init(config.HTTP, r)
@@ -67,9 +71,7 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
}
func Start(alertc aconf.Alert, pushgwc pconf.Pushgw, syncStats *memsto.Stats, alertStats *astats.Stats, externalProcessors *process.ExternalProcessorsType, targetCache *memsto.TargetCacheType, busiGroupCache *memsto.BusiGroupCacheType,
alertMuteCache *memsto.AlertMuteCacheType, alertRuleCache *memsto.AlertRuleCacheType, notifyConfigCache *memsto.NotifyConfigCacheType, datasourceCache *memsto.DatasourceCacheType, ctx *ctx.Context, promClients *prom.PromClientMap) {
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
alertMuteCache *memsto.AlertMuteCacheType, alertRuleCache *memsto.AlertRuleCacheType, notifyConfigCache *memsto.NotifyConfigCacheType, datasourceCache *memsto.DatasourceCacheType, ctx *ctx.Context, promClients *prom.PromClientMap, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType) {
alertSubscribeCache := memsto.NewAlertSubscribeCache(ctx, syncStats)
recordingRuleCache := memsto.NewRecordingRuleCache(ctx, syncStats)

View File

@@ -15,6 +15,7 @@ type AnomalyPoint struct {
Value float64 `json:"value"`
Severity int `json:"severity"`
Triggered bool `json:"triggered"`
Query string `json:"query"`
}
func NewAnomalyPoint(key string, labels map[string]string, ts int64, value float64, severity int) AnomalyPoint {

View File

@@ -83,6 +83,10 @@ func (e *Consumer) consumeOne(event *models.AlertCurEvent) {
}
func (e *Consumer) persist(event *models.AlertCurEvent) {
if event.Status != 0 {
return
}
if !e.ctx.IsCenter {
event.DB2FE()
err := poster.PostByUrls(e.ctx, "/v1/n9e/event-persist", event)

View File

@@ -28,9 +28,10 @@ type Dispatch struct {
alerting aconf.Alerting
senders map[string]sender.Sender
tpls map[string]*template.Template
ExtraSenders map[string]sender.Sender
Senders map[string]sender.Sender
tpls map[string]*template.Template
ExtraSenders map[string]sender.Sender
BeforeSenderHook func(*models.AlertCurEvent) bool
ctx *ctx.Context
@@ -51,9 +52,10 @@ func NewDispatch(alertRuleCache *memsto.AlertRuleCacheType, userCache *memsto.Us
alerting: alerting,
senders: make(map[string]sender.Sender),
tpls: make(map[string]*template.Template),
ExtraSenders: make(map[string]sender.Sender),
Senders: make(map[string]sender.Sender),
tpls: make(map[string]*template.Template),
ExtraSenders: make(map[string]sender.Sender),
BeforeSenderHook: func(*models.AlertCurEvent) bool { return true },
ctx: ctx,
}
@@ -63,7 +65,7 @@ func NewDispatch(alertRuleCache *memsto.AlertRuleCacheType, userCache *memsto.Us
func (e *Dispatch) ReloadTpls() error {
err := e.relaodTpls()
if err != nil {
logger.Error("failed to reload tpls: %v", err)
logger.Errorf("failed to reload tpls: %v", err)
}
duration := time.Duration(9000) * time.Millisecond
@@ -100,7 +102,7 @@ func (e *Dispatch) relaodTpls() error {
e.RwLock.Lock()
e.tpls = tmpTpls
e.senders = senders
e.Senders = senders
e.RwLock.Unlock()
return nil
}
@@ -141,7 +143,7 @@ func (e *Dispatch) HandleEventNotify(event *models.AlertCurEvent, isSubscribe bo
}
// 处理事件发送,这里用一个goroutine处理一个event的所有发送事件
go e.Send(rule, event, notifyTarget, isSubscribe)
go e.Send(rule, event, notifyTarget)
// 如果是不是订阅规则出现的event, 则需要处理订阅规则的event
if !isSubscribe {
@@ -177,26 +179,41 @@ func (e *Dispatch) handleSub(sub *models.AlertSubscribe, event models.AlertCurEv
if sub.ForDuration > (event.TriggerTime - event.FirstTriggerTime) {
return
}
if len(sub.SeveritiesJson) != 0 {
match := false
for _, s := range sub.SeveritiesJson {
if s == event.Severity || s == 0 {
match = true
break
}
}
if !match {
return
}
}
sub.ModifyEvent(&event)
LogEvent(&event, "subscribe")
event.SubRuleId = sub.Id
e.HandleEventNotify(&event, true)
}
func (e *Dispatch) Send(rule *models.AlertRule, event *models.AlertCurEvent, notifyTarget *NotifyTarget, isSubscribe bool) {
for channel, uids := range notifyTarget.ToChannelUserMap() {
ctx := sender.BuildMessageContext(rule, event, uids, e.userCache)
e.RwLock.RLock()
s := e.senders[channel]
e.RwLock.RUnlock()
if s == nil {
logger.Debugf("no sender for channel: %s", channel)
continue
func (e *Dispatch) Send(rule *models.AlertRule, event *models.AlertCurEvent, notifyTarget *NotifyTarget) {
needSend := e.BeforeSenderHook(event)
if needSend {
for channel, uids := range notifyTarget.ToChannelUserMap() {
ctx := sender.BuildMessageContext(rule, []*models.AlertCurEvent{event}, uids, e.userCache)
e.RwLock.RLock()
s := e.Senders[channel]
e.RwLock.RUnlock()
if s == nil {
logger.Debugf("no sender for channel: %s", channel)
continue
}
s.Send(ctx)
}
logger.Debugf("send event: %s, channel: %s", event.Hash, channel)
for i := 0; i < len(ctx.Users); i++ {
logger.Debug("send event to user: ", ctx.Users[i])
}
s.Send(ctx)
}
// handle event callbacks

View File

@@ -69,14 +69,16 @@ func (arw *AlertRuleWorker) Start() {
if interval <= 0 {
interval = 10
}
ticker := time.NewTicker(time.Duration(interval) * time.Second)
go func() {
defer ticker.Stop()
for {
select {
case <-arw.quit:
return
default:
case <-ticker.C:
arw.Eval()
time.Sleep(time.Duration(interval) * time.Second)
}
}
}()
@@ -163,6 +165,7 @@ func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) []common.Anom
points := common.ConvertAnomalyPoints(value)
for i := 0; i < len(points); i++ {
points[i].Severity = query.Severity
points[i].Query = promql
}
lst = append(lst, points...)
}

View File

@@ -13,7 +13,11 @@ import (
)
func IsMuted(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType, alertMuteCache *memsto.AlertMuteCacheType) bool {
if TimeNonEffectiveMuteStrategy(rule, event) {
if rule.Disabled == 1 {
return true
}
if TimeSpanMuteStrategy(rule, event) {
return true
}
@@ -32,12 +36,9 @@ func IsMuted(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *m
return false
}
// TimeNonEffectiveMuteStrategy 根据规则配置的告警时间过滤,如果产生的告警不在规则配置的告警时间内,则不告警
func TimeNonEffectiveMuteStrategy(rule *models.AlertRule, event *models.AlertCurEvent) bool {
if rule.Disabled == 1 {
return true
}
// TimeSpanMuteStrategy 根据规则配置的告警生效时间过滤,如果产生的告警不在规则配置的告警生效时间内,则不告警,即被mute
// 时间范围左闭右开默认范围00:00-24:00
func TimeSpanMuteStrategy(rule *models.AlertRule, event *models.AlertCurEvent) bool {
tm := time.Unix(event.TriggerTime, 0)
triggerTime := tm.Format("15:04")
triggerWeek := strconv.Itoa(int(tm.Weekday()))
@@ -52,18 +53,33 @@ func TimeNonEffectiveMuteStrategy(rule *models.AlertRule, event *models.AlertCur
if !strings.Contains(enableDaysOfWeek[i], triggerWeek) {
continue
}
if enableStime[i] <= enableEtime[i] {
if triggerTime < enableStime[i] || triggerTime > enableEtime[i] {
continue
if enableStime[i] < enableEtime[i] {
if enableEtime[i] == "23:59" {
// 02:00-23:59这种情况做个特殊处理相当于左闭右闭区间了
if triggerTime < enableStime[i] {
// mute, 即没生效
continue
}
} else {
// 02:00-04:00 或者 02:00-24:00
if triggerTime < enableStime[i] || triggerTime >= enableEtime[i] {
// mute, 即没生效
continue
}
}
} else {
if triggerTime < enableStime[i] && triggerTime > enableEtime[i] {
} else if enableStime[i] > enableEtime[i] {
// 21:00-09:00
if triggerTime < enableStime[i] && triggerTime >= enableEtime[i] {
// mute, 即没生效
continue
}
}
// 到这里说明当前时刻在告警规则的某组生效时间范围内,直接返回 false
// 到这里说明当前时刻在告警规则的某组生效时间范围内,即没有 mute直接返回 false
return false
}
return true
}
@@ -165,7 +181,7 @@ func matchMute(event *models.AlertCurEvent, mute *models.AlertMute, clock ...int
break
}
} else {
if triggerTime < mute.PeriodicMutesJson[i].EnableStime || triggerTime >= mute.PeriodicMutesJson[i].EnableEtime {
if triggerTime >= mute.PeriodicMutesJson[i].EnableStime || triggerTime < mute.PeriodicMutesJson[i].EnableEtime {
matchTime = true
break
}
@@ -177,5 +193,21 @@ func matchMute(event *models.AlertCurEvent, mute *models.AlertMute, clock ...int
return false
}
var matchSeverity bool
if len(mute.SeveritiesJson) > 0 {
for _, s := range mute.SeveritiesJson {
if event.Severity == s || s == 0 {
matchSeverity = true
break
}
}
} else {
matchSeverity = true
}
if !matchSeverity {
return false
}
return common.MatchTags(event.TagsMap, mute.ITags)
}

View File

@@ -167,3 +167,13 @@ func (n *Naming) ActiveServers(datasourceId int64) ([]string, error) {
// 30秒内有心跳就认为是活的
return models.AlertingEngineGetsInstances(n.ctx, "datasource_id = ? and clock > ?", datasourceId, time.Now().Unix()-30)
}
// ActiveServersByEngineName returns the alerting-engine instances that are
// registered under this node's configured heartbeat engine name.
//
// When the context is not the center, the list is obtained through
// poster.GetByUrls from the /v1/n9e/servers-active endpoint. Otherwise it is
// read via models.AlertingEngineGetsInstances, filtered by engine_cluster and
// by a heartbeat clock newer than 30 seconds ago.
func (n *Naming) ActiveServersByEngineName() ([]string, error) {
	engineName := n.heartbeatConfig.EngineName

	if n.ctx.IsCenter {
		// An instance counts as alive when it has sent a heartbeat within the last 30 seconds.
		aliveSince := time.Now().Unix() - 30
		return models.AlertingEngineGetsInstances(n.ctx, "engine_cluster = ? and clock > ?", engineName, aliveSince)
	}

	servers, err := poster.GetByUrls[[]string](n.ctx, "/v1/n9e/servers-active?engine_name="+engineName)
	return servers, err
}

View File

@@ -23,6 +23,8 @@ import (
"github.com/toolkits/pkg/str"
)
// EventMuteHookFunc is the signature of the Processor.EventMuteHook callback.
// Processor.Handle invokes it with a built event and skips the event when the
// hook returns true.
type EventMuteHookFunc func(event *models.AlertCurEvent) bool
type ExternalProcessorsType struct {
ExternalLock sync.RWMutex
Processors map[string]*Processor
@@ -43,6 +45,8 @@ func (e *ExternalProcessorsType) GetExternalAlertRule(datasourceId, id int64) (*
return processor, has
}
// HandleEventFunc is the signature shared by the Processor hooks
// HandleFireEventHook and HandleRecoverEventHook; each is called with the
// event that is being fired or recovered.
type HandleEventFunc func(event *models.AlertCurEvent)
type Processor struct {
datasourceId int64
@@ -66,6 +70,10 @@ type Processor struct {
promClients *prom.PromClientMap
ctx *ctx.Context
stats *astats.Stats
HandleFireEventHook HandleEventFunc
HandleRecoverEventHook HandleEventFunc
EventMuteHook EventMuteHookFunc
}
func (p *Processor) Key() string {
@@ -102,6 +110,10 @@ func NewProcessor(rule *models.AlertRule, datasourceId int64, atertRuleCache *me
promClients: promClients,
ctx: ctx,
stats: stats,
HandleFireEventHook: func(event *models.AlertCurEvent) {},
HandleRecoverEventHook: func(event *models.AlertCurEvent) {},
EventMuteHook: func(event *models.AlertCurEvent) bool { return false },
}
p.mayHandleGroup()
@@ -133,6 +145,11 @@ func (p *Processor) Handle(anomalyPoints []common.AnomalyPoint, from string, inh
logger.Debugf("rule_eval:%s event:%v is muted", p.Key(), event)
continue
}
if p.EventMuteHook(event) {
continue
}
tagHash := TagHash(anomalyPoint)
eventsMap[tagHash] = append(eventsMap[tagHash], event)
}
@@ -174,6 +191,8 @@ func (p *Processor) BuildEvent(anomalyPoint common.AnomalyPoint, from string, no
event.RuleConfig = p.rule.RuleConfig
event.RuleConfigJson = p.rule.RuleConfigJson
event.Severity = anomalyPoint.Severity
event.ExtraConfig = p.rule.ExtraConfigJSON
event.PromQl = anomalyPoint.Query
if from == "inner" {
event.LastEvalTime = now
@@ -227,6 +246,8 @@ func (p *Processor) RecoverSingle(hash string, now int64, value *string) {
cachedRule.UpdateEvent(event)
event.IsRecovered = true
event.LastEvalTime = now
p.HandleRecoverEventHook(event)
p.pushEventToQueue(event)
}
@@ -284,9 +305,12 @@ func (p *Processor) fireEvent(event *models.AlertCurEvent) {
if cachedRule == nil {
return
}
logger.Debugf("rule_eval:%s event:%+v fire", p.Key(), event)
if fired, has := p.fires.Get(event.Hash); has {
p.fires.UpdateLastEvalTime(event.Hash, event.LastEvalTime)
event.FirstTriggerTime = fired.FirstTriggerTime
p.HandleFireEventHook(event)
if cachedRule.NotifyRepeatStep == 0 {
logger.Debugf("rule_eval:%s event:%+v repeat is zero nothing to do", p.Key(), event)
@@ -296,11 +320,10 @@ func (p *Processor) fireEvent(event *models.AlertCurEvent) {
}
// 之前发送过告警了,这次是否要继续发送,要看是否过了通道静默时间
if event.LastEvalTime > fired.LastSentTime+int64(cachedRule.NotifyRepeatStep)*60 {
if event.LastEvalTime >= fired.LastSentTime+int64(cachedRule.NotifyRepeatStep)*60 {
if cachedRule.NotifyMaxNumber == 0 {
// 最大可以发送次数如果是0表示不想限制最大发送次数一直发即可
event.NotifyCurNumber = fired.NotifyCurNumber + 1
event.FirstTriggerTime = fired.FirstTriggerTime
p.pushEventToQueue(event)
} else {
// 有最大发送次数的限制,就要看已经发了几次了,是否达到了最大发送次数
@@ -309,7 +332,6 @@ func (p *Processor) fireEvent(event *models.AlertCurEvent) {
return
} else {
event.NotifyCurNumber = fired.NotifyCurNumber + 1
event.FirstTriggerTime = fired.FirstTriggerTime
p.pushEventToQueue(event)
}
}
@@ -317,6 +339,7 @@ func (p *Processor) fireEvent(event *models.AlertCurEvent) {
} else {
event.NotifyCurNumber = 1
event.FirstTriggerTime = event.TriggerTime
p.HandleFireEventHook(event)
p.pushEventToQueue(event)
}
}
@@ -432,7 +455,7 @@ func labelMapToArr(m map[string]string) []string {
}
func Hash(ruleId, datasourceId int64, vector common.AnomalyPoint) string {
return str.MD5(fmt.Sprintf("%d_%s_%d_%d", ruleId, vector.Labels.String(), datasourceId, vector.Severity))
return str.MD5(fmt.Sprintf("%d_%s_%d_%d_%s", ruleId, vector.Labels.String(), datasourceId, vector.Severity, vector.Query))
}
func TagHash(vector common.AnomalyPoint) string {

View File

@@ -54,14 +54,16 @@ func (rrc *RecordRuleContext) Start() {
if interval <= 0 {
interval = 10
}
ticker := time.NewTicker(time.Duration(interval) * time.Second)
go func() {
defer ticker.Stop()
for {
select {
case <-rrc.quit:
return
default:
case <-ticker.C:
rrc.Eval()
time.Sleep(time.Duration(interval) * time.Second)
}
}
}()

View File

@@ -32,7 +32,7 @@ type DingtalkSender struct {
}
func (ds *DingtalkSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
@@ -40,7 +40,7 @@ func (ds *DingtalkSender) Send(ctx MessageContext) {
if len(urls) == 0 {
return
}
message := BuildTplMessage(ds.tpl, ctx.Event)
message := BuildTplMessage(ds.tpl, ctx.Events)
for _, url := range urls {
var body dingtalk
@@ -49,7 +49,7 @@ func (ds *DingtalkSender) Send(ctx MessageContext) {
body = dingtalk{
Msgtype: "markdown",
Markdown: dingtalkMarkdown{
Title: ctx.Event.RuleName,
Title: ctx.Events[0].RuleName,
Text: message,
},
}
@@ -57,7 +57,7 @@ func (ds *DingtalkSender) Send(ctx MessageContext) {
body = dingtalk{
Msgtype: "markdown",
Markdown: dingtalkMarkdown{
Title: ctx.Event.RuleName,
Title: ctx.Events[0].RuleName,
Text: message + "\n" + strings.Join(ats, " "),
},
At: dingtalkAt{

View File

@@ -22,18 +22,18 @@ type EmailSender struct {
}
func (es *EmailSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
tos := extract(ctx.Users)
var subject string
if es.subjectTpl != nil {
subject = BuildTplMessage(es.subjectTpl, ctx.Event)
subject = BuildTplMessage(es.subjectTpl, []*models.AlertCurEvent{ctx.Events[0]})
} else {
subject = ctx.Event.RuleName
subject = ctx.Events[0].RuleName
}
content := BuildTplMessage(es.contentTpl, ctx.Event)
content := BuildTplMessage(es.contentTpl, ctx.Events)
es.WriteEmail(subject, content, tos)
}

View File

@@ -31,11 +31,11 @@ type FeishuSender struct {
}
func (fs *FeishuSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
urls, ats := fs.extract(ctx.Users)
message := BuildTplMessage(fs.tpl, ctx.Event)
message := BuildTplMessage(fs.tpl, ctx.Events)
for _, url := range urls {
body := feishu{
Msgtype: "text",

View File

@@ -96,11 +96,11 @@ var (
)
func (fs *FeishuCardSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
urls, _ := fs.extract(ctx.Users)
message := BuildTplMessage(fs.tpl, ctx.Event)
message := BuildTplMessage(fs.tpl, ctx.Events)
color := "red"
lowerUnicode := strings.ToLower(message)
if strings.Count(lowerUnicode, Recovered) > 0 && strings.Count(lowerUnicode, Triggered) > 0 {
@@ -109,7 +109,7 @@ func (fs *FeishuCardSender) Send(ctx MessageContext) {
color = "green"
}
SendTitle := fmt.Sprintf("🔔 %s", ctx.Event.RuleName)
SendTitle := fmt.Sprintf("🔔 %s", ctx.Events[0].RuleName)
body.Card.Header.Title.Content = SendTitle
body.Card.Header.Template = color
body.Card.Elements[0].Text.Content = message

View File

@@ -28,7 +28,7 @@ type MmSender struct {
}
func (ms *MmSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
@@ -36,7 +36,7 @@ func (ms *MmSender) Send(ctx MessageContext) {
if len(urls) == 0 {
return
}
message := BuildTplMessage(ms.tpl, ctx.Event)
message := BuildTplMessage(ms.tpl, ctx.Events)
SendMM(MatterMostMessage{
Text: message,

View File

@@ -17,9 +17,9 @@ type (
// MessageContext 一个event所生成的告警通知的上下文
MessageContext struct {
Users []*models.User
Rule *models.AlertRule
Event *models.AlertCurEvent
Users []*models.User
Rule *models.AlertRule
Events []*models.AlertCurEvent
}
)
@@ -43,23 +43,32 @@ func NewSender(key string, tpls map[string]*template.Template, smtp aconf.SMTPCo
return nil
}
func BuildMessageContext(rule *models.AlertRule, event *models.AlertCurEvent, uids []int64, userCache *memsto.UserCacheType) MessageContext {
func BuildMessageContext(rule *models.AlertRule, events []*models.AlertCurEvent, uids []int64, userCache *memsto.UserCacheType) MessageContext {
users := userCache.GetByUserIds(uids)
return MessageContext{
Rule: rule,
Event: event,
Users: users,
Rule: rule,
Events: events,
Users: users,
}
}
func BuildTplMessage(tpl *template.Template, event *models.AlertCurEvent) string {
type BuildTplMessageFunc func(tpl *template.Template, events []*models.AlertCurEvent) string
var BuildTplMessage BuildTplMessageFunc = buildTplMessage
func buildTplMessage(tpl *template.Template, events []*models.AlertCurEvent) string {
if tpl == nil {
return "tpl for current sender not found, please check configuration"
}
var body bytes.Buffer
if err := tpl.Execute(&body, event); err != nil {
return err.Error()
var content string
for _, event := range events {
var body bytes.Buffer
if err := tpl.Execute(&body, event); err != nil {
return err.Error()
}
content += body.String() + "\n\n"
}
return body.String()
return content
}

View File

@@ -26,11 +26,11 @@ type TelegramSender struct {
}
func (ts *TelegramSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
tokens := ts.extract(ctx.Users)
message := BuildTplMessage(ts.tpl, ctx.Event)
message := BuildTplMessage(ts.tpl, ctx.Events)
SendTelegram(TelegramMessage{
Text: message,

View File

@@ -25,11 +25,11 @@ type WecomSender struct {
}
func (ws *WecomSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
urls := ws.extract(ctx.Users)
message := BuildTplMessage(ws.tpl, ctx.Event)
message := BuildTplMessage(ws.tpl, ctx.Events)
for _, url := range urls {
body := wecom{
Msgtype: "markdown",

View File

@@ -11,6 +11,7 @@ import (
"github.com/ccfos/nightingale/v6/center/metas"
"github.com/ccfos/nightingale/v6/center/sso"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/models/migrate"
@@ -71,18 +72,20 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
alertMuteCache := memsto.NewAlertMuteCache(ctx, syncStats)
alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
notifyConfigCache := memsto.NewNotifyConfigCache(ctx)
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
promClients := prom.NewPromClient(ctx, config.Alert.Heartbeat)
externalProcessors := process.NewExternalProcessors()
alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients)
alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients, userCache, userGroupCache)
writers := writer.NewWriters(config.Pushgw)
httpx.InitRSAConfig(&config.HTTP.RSA)
alertrtRouter := alertrt.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors)
centerRouter := centerrt.New(config.HTTP, config.Center, cconf.Operations, dsCache, notifyConfigCache, promClients, redis, sso, ctx, metas, idents, targetCache)
centerRouter := centerrt.New(config.HTTP, config.Center, cconf.Operations, dsCache, notifyConfigCache, promClients, redis, sso, ctx, metas, idents, targetCache, userCache, userGroupCache)
pushgwRouter := pushgwrt.New(config.HTTP, config.Pushgw, targetCache, busiGroupCache, idents, writers, ctx)
r := httpx.GinEngine(config.Global.RunMode, config.HTTP)
@@ -90,6 +93,7 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
centerRouter.Config(r)
alertrtRouter.Config(r)
pushgwRouter.Config(r)
dumper.ConfigRouter(r)
httpClean := httpx.Init(config.HTTP, r)

View File

@@ -39,11 +39,14 @@ type Router struct {
IdentSet *idents.Set
TargetCache *memsto.TargetCacheType
Sso *sso.SsoClient
UserCache *memsto.UserCacheType
UserGroupCache *memsto.UserGroupCacheType
Ctx *ctx.Context
}
func New(httpConfig httpx.Config, center cconf.Center, operations cconf.Operation, ds *memsto.DatasourceCacheType, ncc *memsto.NotifyConfigCacheType,
pc *prom.PromClientMap, redis storage.Redis, sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set, idents *idents.Set, tc *memsto.TargetCacheType) *Router {
pc *prom.PromClientMap, redis storage.Redis, sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set, idents *idents.Set, tc *memsto.TargetCacheType,
uc *memsto.UserCacheType, ugc *memsto.UserGroupCacheType) *Router {
return &Router{
HTTP: httpConfig,
Center: center,
@@ -56,6 +59,8 @@ func New(httpConfig httpx.Config, center cconf.Center, operations cconf.Operatio
IdentSet: idents,
TargetCache: tc,
Sso: sso,
UserCache: uc,
UserGroupCache: ugc,
Ctx: ctx,
}
}
@@ -101,7 +106,7 @@ func (rt *Router) configNoRoute(r *gin.Engine, fs *http.FileSystem) {
suffix := arr[len(arr)-1]
switch suffix {
case "png", "jpeg", "jpg", "svg", "ico", "gif", "css", "js", "html", "htm", "gz", "zip", "map":
case "png", "jpeg", "jpg", "svg", "ico", "gif", "css", "js", "html", "htm", "gz", "zip", "map", "ttf":
if !rt.Center.UseFileAssets {
c.FileFromFS(c.Request.URL.Path, *fs)
} else {
@@ -263,6 +268,7 @@ func (rt *Router) Config(r *gin.Engine) {
pages.PUT("/busi-group/:id/alert-rules/fields", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.bgrw(), rt.alertRulePutFields)
pages.PUT("/busi-group/:id/alert-rule/:arid", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.alertRulePutByFE)
pages.GET("/alert-rule/:arid", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRuleGet)
pages.PUT("/busi-group/:id/alert-rule/:arid/validate", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.alertRuleValidation)
pages.GET("/busi-group/:id/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules"), rt.recordingRuleGets)
pages.POST("/busi-group/:id/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules/add"), rt.bgrw(), rt.recordingRuleAddByFE)
@@ -359,6 +365,12 @@ func (rt *Router) Config(r *gin.Engine) {
pages.GET("/notify-config", rt.auth(), rt.admin(), rt.notifyConfigGet)
pages.PUT("/notify-config", rt.auth(), rt.admin(), rt.notifyConfigPut)
pages.GET("/es-index-pattern", rt.auth(), rt.esIndexPatternGet)
pages.GET("/es-index-pattern-list", rt.auth(), rt.esIndexPatternGetList)
pages.POST("/es-index-pattern", rt.auth(), rt.admin(), rt.esIndexPatternAdd)
pages.PUT("/es-index-pattern", rt.auth(), rt.admin(), rt.esIndexPatternPut)
pages.DELETE("/es-index-pattern", rt.auth(), rt.admin(), rt.esIndexPatternDel)
}
if rt.HTTP.APIForService.Enable {

View File

@@ -69,6 +69,11 @@ func (rt *Router) alertAggrViewPut(c *gin.Context) {
return
}
}
ginx.NewRender(c).Message(view.Update(rt.Ctx, f.Name, f.Rule, f.Cate, me.Id))
view.Name = f.Name
view.Rule = f.Rule
view.Cate = f.Cate
if view.CreateBy == 0 {
view.CreateBy = me.Id
}
ginx.NewRender(c).Message(view.Update(rt.Ctx))
}

View File

@@ -2,6 +2,7 @@ package router
import (
"net/http"
"strconv"
"strings"
"time"
@@ -271,3 +272,56 @@ func (rt *Router) alertRuleGet(c *gin.Context) {
ginx.NewRender(c).Data(ar, err)
}
// alertRuleValidation pre-validates an alert rule before it is saved.
//
// It binds the submitted rule from the JSON request body, loads the stored
// rule identified by the "arid" URL parameter (answering 404 when it does not
// exist) and runs the business-group read/write check against the stored
// rule's group.
//
// When the submitted rule names both notify channels and notify groups, each
// channel must have a token configured on at least one user belonging to
// those groups. Channels for which no user has a token are reported in the
// response message; an empty message means nothing was found to complain
// about.
func (rt *Router) alertRuleValidation(c *gin.Context) {
	var f models.AlertRule // the rule as submitted by the client
	ginx.BindJSON(c, &f)

	arid := ginx.UrlParamInt64(c, "arid")
	ar, err := models.AlertRuleGetById(rt.Ctx, arid)
	ginx.Dangerous(err)

	if ar == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such AlertRule")
		return
	}

	rt.bgrwCheck(c, ar.GroupId)

	if len(f.NotifyChannelsJSON) > 0 && len(f.NotifyGroupsJSON) > 0 {
		// Collect the numeric ids of the notify groups. The slice is sized by
		// the number of groups, which is what it actually holds.
		ngids := make([]int64, 0, len(f.NotifyGroupsJSON))
		for _, raw := range f.NotifyGroupsJSON {
			id, perr := strconv.ParseInt(raw, 10, 64)
			if perr != nil {
				// A non-numeric entry cannot name a user group; skip it
				// instead of looking up the zero id.
				continue
			}
			ngids = append(ngids, id)
		}

		// Flatten the members of all referenced groups into one user list.
		userGroups := rt.UserGroupCache.GetByUserGroupIds(ngids)
		uids := make([]int64, 0)
		for i := range userGroups {
			uids = append(uids, userGroups[i].UserIds...)
		}
		users := rt.UserCache.GetByUserIds(uids)

		// A channel is fine as soon as any user carries a token for it;
		// otherwise it is recorded as absent.
		ancs := make([]string, 0, len(f.NotifyChannelsJSON)) // absent notify channels
		for _, channel := range f.NotifyChannelsJSON {
			hasToken := false
			for ui := range users {
				if _, ok := users[ui].ExtractToken(channel); ok {
					hasToken = true
					break
				}
			}
			if !hasToken {
				ancs = append(ancs, channel)
			}
		}

		if len(ancs) > 0 {
			ginx.NewRender(c).Message(i18n.Sprintf(c.GetHeader("X-Language"), "All users are missing notify channel configurations. Please check for missing tokens (each channel should be configured with at least one user). %s", ancs))
			return
		}
	}

	ginx.NewRender(c).Message("")
}

View File

@@ -99,6 +99,8 @@ func (rt *Router) alertSubscribePut(c *gin.Context) {
"webhooks",
"for_duration",
"redefine_webhooks",
"severities",
"extra_config",
))
}

View File

@@ -91,6 +91,9 @@ func (rt *Router) builtinBoardCateGets(c *gin.Context) {
boardCate.Name = dir
files, err := file.FilesUnder(fp + "/" + dir + "/dashboards")
ginx.Dangerous(err)
if len(files) == 0 {
continue
}
var boards []Payload
for _, f := range files {
@@ -243,6 +246,9 @@ func (rt *Router) builtinAlertRules(c *gin.Context) {
alertCate.Name = dir
files, err := file.FilesUnder(fp + "/" + dir + "/alerts")
ginx.Dangerous(err)
if len(files) == 0 {
continue
}
alertRules := make(map[string][]models.AlertRule)
for _, f := range files {

View File

@@ -105,6 +105,10 @@ func DatasourceCheck(ds models.Datasource) error {
return fmt.Errorf("url is empty")
}
if !strings.HasPrefix(ds.HTTPJson.Url, "http") {
return fmt.Errorf("url must start with http or https")
}
client := &http.Client{
Transport: &http.Transport{
TLSClientConfig: &tls.Config{

View File

@@ -0,0 +1,80 @@
package router
import (
"net/http"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/ginx"
)
// esIndexPatternAdd creates an ES index pattern from the JSON request body.
// The create/update audit fields are stamped with the current Unix time and
// the "username" value stored on the gin context before the record is added.
func (rt *Router) esIndexPatternAdd(c *gin.Context) {
	var pattern models.EsIndexPattern
	ginx.BindJSON(c, &pattern)

	operator := c.MustGet("username").(string)
	ts := time.Now().Unix()

	// A freshly created record has identical create and update metadata.
	pattern.CreateAt, pattern.UpdateAt = ts, ts
	pattern.CreateBy, pattern.UpdateBy = operator, operator

	addErr := pattern.Add(rt.Ctx)
	ginx.NewRender(c).Message(addErr)
}
// esIndexPatternPut updates the ES index pattern selected by the "id" query
// parameter with the fields bound from the JSON request body. It responds with
// 404 when no such pattern exists.
func (rt *Router) esIndexPatternPut(c *gin.Context) {
	var changes models.EsIndexPattern
	ginx.BindJSON(c, &changes)

	patternID := ginx.QueryInt64(c, "id")
	existing, lookupErr := models.EsIndexPatternGetById(rt.Ctx, patternID)
	ginx.Dangerous(lookupErr)

	if existing == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such EsIndexPattern")
		return
	}

	// Record who performed the update before handing the changes to the model.
	changes.UpdateBy = c.MustGet("username").(string)
	ginx.NewRender(c).Message(existing.Update(rt.Ctx, changes))
}
// esIndexPatternDel deletes the ES index patterns whose ids are listed in the
// JSON request body. An empty id list is rejected with 400 Bad Request.
func (rt *Router) esIndexPatternDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)

	if len(form.Ids) < 1 {
		ginx.Bomb(http.StatusBadRequest, "ids empty")
	}

	ginx.NewRender(c).Message(models.EsIndexPatternDel(rt.Ctx, form.Ids))
}
// esIndexPatternGetList lists ES index patterns. When the optional
// "datasource_id" query parameter is non-zero the list is restricted to that
// datasource; otherwise the query is issued with an empty where clause.
func (rt *Router) esIndexPatternGetList(c *gin.Context) {
	var (
		patterns []*models.EsIndexPattern
		err      error
	)

	switch dsID := ginx.QueryInt64(c, "datasource_id", 0); dsID {
	case 0:
		patterns, err = models.EsIndexPatternGets(rt.Ctx, "")
	default:
		patterns, err = models.EsIndexPatternGets(rt.Ctx, "datasource_id = ?", dsID)
	}

	ginx.NewRender(c).Data(patterns, err)
}
// esIndexPatternGet returns the single ES index pattern selected by the "id"
// query parameter.
func (rt *Router) esIndexPatternGet(c *gin.Context) {
	patternID := ginx.QueryInt64(c, "id")

	pattern, err := models.EsIndexPatternGet(rt.Ctx, "id=?", patternID)
	ginx.NewRender(c).Data(pattern, err)
}

View File

@@ -15,6 +15,7 @@ import (
"github.com/gin-gonic/gin"
"github.com/prometheus/common/model"
"github.com/toolkits/pkg/ginx"
"github.com/toolkits/pkg/logger"
)
type queryFormItem struct {
@@ -32,10 +33,14 @@ type batchQueryForm struct {
func (rt *Router) promBatchQueryRange(c *gin.Context) {
var f batchQueryForm
ginx.Dangerous(c.BindJSON(&f))
var lst []model.Value
cli := rt.PromClients.GetCli(f.DatasourceId)
var lst []model.Value
if cli == nil {
logger.Warningf("no such datasource id: %d", f.DatasourceId)
ginx.NewRender(c).Data(lst, nil)
return
}
for _, item := range f.Queries {
r := pkgprom.Range{
@@ -67,10 +72,15 @@ func (rt *Router) promBatchQueryInstant(c *gin.Context) {
var f batchInstantForm
ginx.Dangerous(c.BindJSON(&f))
cli := rt.PromClients.GetCli(f.DatasourceId)
var lst []model.Value
cli := rt.PromClients.GetCli(f.DatasourceId)
if cli == nil {
logger.Warningf("no such datasource id: %d", f.DatasourceId)
ginx.NewRender(c).Data(lst, nil)
return
}
for _, item := range f.Queries {
resp, _, err := cli.Query(context.Background(), item.Query, time.Unix(item.Time, 0))
ginx.Dangerous(err)

View File

@@ -28,6 +28,12 @@ func (rt *Router) serverHeartbeat(c *gin.Context) {
func (rt *Router) serversActive(c *gin.Context) {
datasourceId := ginx.QueryInt64(c, "dsid")
engineName := ginx.QueryStr(c, "engine_name", "")
if engineName != "" {
servers, err := models.AlertingEngineGetsInstances(rt.Ctx, "engine_cluster = ? and clock > ?", engineName, time.Now().Unix()-30)
ginx.NewRender(c).Data(servers, err)
return
}
servers, err := models.AlertingEngineGetsInstances(rt.Ctx, "datasource_id = ? and clock > ?", datasourceId, time.Now().Unix()-30)
ginx.NewRender(c).Data(servers, err)

View File

@@ -47,10 +47,27 @@ func (rt *Router) targetGets(c *gin.Context) {
limit := ginx.QueryInt(c, "limit", 30)
dsIds := queryDatasourceIds(c)
total, err := models.TargetTotal(rt.Ctx, bgid, dsIds, query)
var bgids []int64
var err error
if bgid == -1 {
// 全部对象的情况,找到用户有权限的业务组
user := c.MustGet("user").(*models.User)
userGroupIds, err := models.MyGroupIds(rt.Ctx, user.Id)
ginx.Dangerous(err)
bgids, err = models.BusiGroupIds(rt.Ctx, userGroupIds)
ginx.Dangerous(err)
// 将未分配业务组的对象也加入到列表中
bgids = append(bgids, 0)
} else {
bgids = append(bgids, bgid)
}
total, err := models.TargetTotal(rt.Ctx, bgids, dsIds, query)
ginx.Dangerous(err)
list, err := models.TargetGets(rt.Ctx, bgid, dsIds, query, limit, ginx.Offset(c, limit))
list, err := models.TargetGets(rt.Ctx, bgids, dsIds, query, limit, ginx.Offset(c, limit))
ginx.Dangerous(err)
if err == nil {

View File

@@ -8,6 +8,7 @@ import (
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/alert/process"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/httpx"
@@ -49,17 +50,20 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
alertMuteCache := memsto.NewAlertMuteCache(ctx, syncStats)
alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
notifyConfigCache := memsto.NewNotifyConfigCache(ctx)
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
promClients := prom.NewPromClient(ctx, config.Alert.Heartbeat)
externalProcessors := process.NewExternalProcessors()
alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients)
alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients, userCache, userGroupCache)
alertrtRouter := alertrt.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors)
alertrtRouter.Config(r)
}
dumper.ConfigRouter(r)
httpClean := httpx.Init(config.HTTP, r)
return func() {

View File

@@ -6,7 +6,7 @@ ADD n9e /app/
ADD etc /app/
ADD integrations /app/integrations/
ADD --chmod=755 https://github.com/ufoscout/docker-compose-wait/releases/download/2.11.0/wait_x86_64 /wait
RUN chmod +x /wait
RUN chmod +x /wait && pip install requests
EXPOSE 17000

View File

@@ -281,6 +281,7 @@ CREATE TABLE `alert_rule` (
`runbook_url` varchar(255),
`append_tags` varchar(255) not null default '' comment 'split by space: service=n9e mod=api',
`annotations` text not null comment 'annotations',
`extra_config` text not null comment 'extra_config',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
@@ -305,6 +306,7 @@ CREATE TABLE `alert_mute` (
`disabled` tinyint(1) not null default 0 comment '0:enabled 1:disabled',
`mute_time_type` tinyint(1) not null default 0,
`periodic_mutes` varchar(4096) not null default '',
`severities` varchar(32) not null default '',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
@@ -324,6 +326,7 @@ CREATE TABLE `alert_subscribe` (
`datasource_ids` varchar(255) not null default '' comment 'datasource ids',
`cluster` varchar(128) not null,
`rule_id` bigint not null default 0,
`severities` varchar(32) not null default '',
`tags` varchar(4096) not null default '' comment 'json,map,tagkey->regexp|value',
`redefine_severity` tinyint(1) default 0 comment 'is redefine severity?',
`new_severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
@@ -331,6 +334,7 @@ CREATE TABLE `alert_subscribe` (
`new_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
`user_group_ids` varchar(250) not null comment 'split by space 1 34 5, notify cc to user_group_ids',
`webhooks` text not null,
`extra_config` text not null comment 'extra_config',
`redefine_webhooks` tinyint(1) default 0,
`for_duration` bigint not null default 0,
`create_at` bigint not null default 0,
@@ -614,4 +618,19 @@ CREATE TABLE `sso_config` (
`content` text not null,
PRIMARY KEY (`id`),
UNIQUE KEY (`name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- es_index_pattern stores index patterns per datasource.
-- A pattern name is unique within a datasource (see the unique key on
-- datasource_id + name). The create_*/update_* columns are audit fields.
-- NOTE(review): unlike the other tables in this schema, the audit columns here
-- are nullable and the bigint defaults are quoted; confirm this is intended.
CREATE TABLE `es_index_pattern` (
`id` bigint unsigned not null auto_increment,
`datasource_id` bigint not null default 0 comment 'datasource id',
`name` varchar(191) not null,
`time_field` varchar(128) not null default '@timestamp',
`allow_hide_system_indices` tinyint(1) not null default 0,
`fields_format` varchar(4096) not null default '',
`create_at` bigint default '0',
`create_by` varchar(64) default '',
`update_at` bigint default '0',
`update_by` varchar(64) default '',
PRIMARY KEY (`id`),
UNIQUE KEY (`datasource_id`, `name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

8
dumper/dumper.go Normal file
View File

@@ -0,0 +1,8 @@
package dumper
import "github.com/gin-gonic/gin"
// ConfigRouter is the package-level entry point for wiring the dumper into an
// HTTP engine: it delegates to the ConfigRouter method of the package's
// syncDumper instance.
func ConfigRouter(r *gin.Engine) {
syncDumper.ConfigRouter(r)
}

110
dumper/sync.go Normal file
View File

@@ -0,0 +1,110 @@
package dumper
import (
"fmt"
"strings"
"sync"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/time"
)
// SyncRecord is a single entry recorded through SyncDumper.Put.
type SyncRecord struct {
// Timestamp is rendered through time.Format by String.
// NOTE(review): presumably Unix seconds — confirm against callers.
Timestamp int64
// Mills is printed with an "ms" suffix by String.
// NOTE(review): presumably the elapsed time of the sync in milliseconds — confirm.
Mills int64
// Count is printed as-is by String.
Count int
// Message is free-form text appended at the end of String's output.
Message string
}
// String renders the record on one line in the form
// "timestamp: <formatted>, mills: <n>ms, count: <n>, message: <text>".
func (sr *SyncRecord) String() string {
	return fmt.Sprintf(
		"timestamp: %s, mills: %dms, count: %d, message: %s",
		time.Format(sr.Timestamp),
		sr.Mills,
		sr.Count,
		sr.Message,
	)
}
// SyncRecords keeps the two most recent records stored under one key.
type SyncRecords struct {
// Current is the record most recently passed to Put for the key.
Current *SyncRecord
// Last is the record that Current replaced; it stays nil until a second
// record has been put for the key.
Last *SyncRecord
}
// SyncDumper collects the latest sync records per key and can print them.
// The embedded RWMutex guards records: Put takes the write lock and Sprint
// takes the read lock.
type SyncDumper struct {
sync.RWMutex
// records maps a key to its current and previous record.
records map[string]*SyncRecords
}
// NewSyncDumper returns a SyncDumper with an initialised, empty record map.
func NewSyncDumper() *SyncDumper {
	dumper := new(SyncDumper)
	dumper.records = map[string]*SyncRecords{}
	return dumper
}
// syncDumper is the package-wide instance used by the package-level helpers
// ConfigRouter and PutSyncRecord.
var syncDumper = NewSyncDumper()
// Put stores a new record under key. The first record for a key becomes its
// Current entry; each later record shifts the existing Current into Last and
// takes its place, so only the two most recent records are retained.
func (sd *SyncDumper) Put(key string, timestamp, mills int64, count int, message string) {
	incoming := &SyncRecord{
		Timestamp: timestamp,
		Mills:     mills,
		Count:     count,
		Message:   message,
	}

	sd.Lock()
	defer sd.Unlock()

	entry, seen := sd.records[key]
	if !seen {
		sd.records[key] = &SyncRecords{Current: incoming}
		return
	}

	// Rotate: the previous current record becomes the last one.
	entry.Last, entry.Current = entry.Current, incoming
}
// Sprint renders every key with its previous and current record, e.g.
//
//	busi_groups:
//	last: timestamp, mills, count
//	curr: timestamp, mills, count
//
// The "last" line is omitted for keys that have only received one record.
// Sections are emitted in map iteration order, so their order is not stable
// between calls.
func (sd *SyncDumper) Sprint() string {
	sd.RLock()
	defer sd.RUnlock()

	var out strings.Builder
	out.WriteString("\n")

	for name, recs := range sd.records {
		out.WriteString(name + ":\n")

		if recs.Last != nil {
			out.WriteString("last: " + recs.Last.String() + "\n")
		}

		out.WriteString("curr: " + recs.Current.String() + "\n\n")
	}

	return out.String()
}
// ConfigRouter registers GET /dumper/sync on r. The endpoint returns the
// Sprint output with status 200 to loopback clients ("127.0.0.1" or "::1") and
// answers 403 "forbidden" to everyone else.
//
// NOTE(review): the decision relies on gin's ClientIP, which may honour
// forwarded-IP headers depending on the engine's trusted-proxy settings;
// confirm those settings so the loopback check cannot be bypassed.
func (sd *SyncDumper) ConfigRouter(r *gin.Engine) {
	handler := func(c *gin.Context) {
		switch c.ClientIP() {
		case "127.0.0.1", "::1":
			c.String(200, sd.Sprint())
		default:
			c.String(403, "forbidden")
		}
	}

	r.GET("/dumper/sync", handler)
}
// PutSyncRecord stores a record under key in the package-wide syncDumper; it
// forwards all arguments unchanged to SyncDumper.Put.
func PutSyncRecord(key string, timestamp, mills int64, count int, message string) {
syncDumper.Put(key, timestamp, mills, count, message)
}

View File

Before

Width:  |  Height:  |  Size: 3.1 KiB

After

Width:  |  Height:  |  Size: 3.1 KiB

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

View File

@@ -0,0 +1,31 @@
### Ceph Dashboard & Alerts
开启 Ceph 默认的 Prometheus 支持
```bash
ceph mgr module enable prometheus
```
### 采集配置
在categraf中的prometheus插件中加入采集配置
```toml
cat /opt/categraf/conf/input.prometheus/prometheus.toml
[[instances]]
urls = [
"http://192.168.11.181:9283/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="ceph",cluster="ceph"}
```
Dashboard:
[dashboard](../dashboards/ceph_by_categraf.json)
![ceph](./ceph.png)
Alerts:
[alerts](../alerts/ceph_by_categraf.json)
![alert](./alerts.png)

Binary file not shown.

After

Width:  |  Height:  |  Size: 194 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 187 KiB

View File

@@ -0,0 +1,951 @@
[
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elastic Cluster Red status",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": " elasticsearch_cluster_health_status{color=\"red\"} == 1",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterRed"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elastic Cluster Yellow status",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_status{color=\"yellow\"} == 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterYellow"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch disk out of space of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskOutOfSpace"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch disk space low of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskSpaceLow"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Heap Usage Too High of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 > 90",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageTooHigh"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Heap Usage warning of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageWarning"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch initializing shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_initializing_shards > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchInitializingShards"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch no new documents of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(elasticsearch_indices_docs{es_data_node=\"true\"}[5m]) == 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchNoNewDocuments"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch pending tasks of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_number_of_pending_tasks > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchPendingTasks"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch relocation shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_relocating_shards > 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchRelocationShards"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch unassigned shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_unassigned_shards > 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchUnassignedShards"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Unhealthy Data Nodes",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_number_of_data_nodes < number_of_data_nodes",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyDataNodes"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Unhealthy Nodes",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": " elasticsearch_cluster_health_number_of_nodes < number_of_nodes",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyNodes"
],
"annotations": null,
"extra_config": null
}
]

View File

@@ -0,0 +1,951 @@
[
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elastic Cluster Red status",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": " elasticsearch_cluster_health_status{color=\"red\"} == 1",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterRed"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elastic Cluster Yellow status",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_status{color=\"yellow\"} == 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterYellow"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch disk out of space of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskOutOfSpace"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch disk space low of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskSpaceLow"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Heap Usage Too High of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 > 90",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageTooHigh"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Heap Usage warning of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageWarning"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch initializing shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_initializing_shards > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchInitializingShards"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch no new documents of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(elasticsearch_indices_docs{es_data_node=\"true\"}[5m]) == 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchNoNewDocuments"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch pending tasks of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_number_of_pending_tasks > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchPendingTasks"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch relocation shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_relocating_shards > 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchRelocationShards"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch unassigned shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_unassigned_shards > 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchUnassignedShards"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Unhealthy Data Nodes",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_number_of_data_nodes < number_of_data_nodes",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyDataNodes"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Unhealthy Nodes",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_number_of_nodes < number_of_nodes",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyNodes"
],
"annotations": null,
"extra_config": null
}
]

View File

Before

Width:  |  Height:  |  Size: 2.0 KiB

After

Width:  |  Height:  |  Size: 2.0 KiB

View File

Before

Width:  |  Height:  |  Size: 377 KiB

After

Width:  |  Height:  |  Size: 377 KiB

View File

@@ -0,0 +1,702 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "HighgRPCResourceExhaustedRate",
"note": "High gRPC ResourceExhausted error rate",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 3600,
"prom_ql": "sum without (grpc_code, grpc_method, grpc_service, grpc_type) (\n rate(grpc_server_handled_total{grpc_code=\"ResourceExhausted\"}[5m])\n) / sum without (grpc_code, grpc_method, grpc_service, grpc_type) (\n rate(grpc_server_handled_total[5m])\n) * 100 > 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum without (grpc_code, grpc_method, grpc_service, grpc_type) (\n rate(grpc_server_handled_total{grpc_code=\"ResourceExhausted\"}[5m])\n) / sum without (grpc_code, grpc_method, grpc_service, grpc_type) (\n rate(grpc_server_handled_total[5m])\n) * 100 > 1\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "HighPumaUtilization",
"note": "Puma instance {{ $labels.instance }} has more than 90% thread utilization ({{ $value | printf \"%.1f\" }}%) over the last 60 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 3600,
"prom_ql": "instance:puma_utilization:ratio * 100 > 90",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "instance:puma_utilization:ratio * 100 > 90",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "PostgresDatabaseDeadlockCancels",
"note": "Postgres database has queries canceled due to deadlocks",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "increase(pg_stat_database_conflicts_confl_deadlock[5m]) > 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(pg_stat_database_conflicts_confl_deadlock[5m]) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "PostgresDatabaseDeadlocks",
"note": "Postgres database has deadlocks",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "increase(pg_stat_database_deadlocks[5m]) > 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(pg_stat_database_deadlocks[5m]) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "PostgresDown",
"note": "The Postgres service {{ $labels.job }} instance {{ $labels.instance }} is not responding for more than 50% of the time for 5 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "avg_over_time(pg_up[5m]) * 100 < 50",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(pg_up[5m]) * 100 < 50",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "PumaQueueing",
"note": "Puma instance {{ $labels.instance }} is queueing requests with an average of {{ $value | printf \"%.1f\" }} over the last 30 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "avg_over_time(puma_queued_connections[30m]) > 1",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(puma_queued_connections[30m]) > 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "RedisDown",
"note": "The Redis service {{ $labels.job }} instance {{ $labels.instance }} is not responding for more than 50% of the time for 5 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "avg_over_time(redis_up[5m]) * 100 < 50",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(redis_up[5m]) * 100 < 50",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "ServiceDown",
"note": "The service {{ $labels.job }} instance {{ $labels.instance }} is not responding for more than 50% of the time for 5 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "avg_over_time(up[5m]) * 100 < 50",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(up[5m]) * 100 < 50",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "SidekiqJobsQueuing",
"note": "Sidekiq has jobs queued",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 3600,
"prom_ql": "sum by (name) (sidekiq_queue_size) > 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum by (name) (sidekiq_queue_size) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "WorkhorseHighErrorRate",
"note": "Workhorse has high error rates",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "(\n sum without (job, code) (\n job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~\"5..\"}\n ) /\n sum without (job,code) (\n job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m\n ) < 10\n) * 100 > 50\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(\n sum without (job, code) (\n job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~\"5..\"}\n ) /\n sum without (job,code) (\n job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m\n ) < 10\n) * 100 > 50\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
}
]

View File

@@ -0,0 +1,712 @@
{
"name": "GitLab - Machine Performance",
"tags": "",
"ident": "",
"configs": {
"version": "3.0.0",
"links": [],
"var": [
{
"name": "DS_PROMETHEUS",
"type": "datasource",
"definition": "prometheus",
"defaultValue": 5
},
{
"name": "instance",
"type": "query",
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "query_result(node_boot_time_seconds)",
"reg": "/instance=\"(?<text>[^\"]*)/",
"multi": false
}
],
"panels": [
{
"type": "stat",
"id": "a4389d18-8aed-4207-8fa9-4b25da036d6a",
"layout": {
"h": 3,
"w": 5,
"x": 0,
"y": 0,
"i": "a4389d18-8aed-4207-8fa9-4b25da036d6a",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "B",
"expr": "(time() - node_boot_time_seconds{instance=~\"$instance\"}) / 60 / 60 / 24",
"legend": "Uptime"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Uptime",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [],
"standardOptions": {
"util": "none",
"decimals": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
}
]
}
}
},
{
"type": "stat",
"id": "fe9d5645-e775-446e-876b-35852b751961",
"layout": {
"h": 3,
"w": 2,
"x": 5,
"y": 0,
"i": "fe9d5645-e775-446e-876b-35852b751961",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "B",
"expr": "node_boot_time_seconds{instance=~\"$instance\"} / 60 / 60 /24 / 30 / 365",
"legend": "Uptime"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Running",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "service",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [],
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
}
]
}
}
},
{
"type": "barGauge",
"id": "3190bf4b-c7a3-4044-955c-f488b31a200b",
"layout": {
"h": 3,
"w": 5,
"x": 7,
"y": 0,
"i": "3190bf4b-c7a3-4044-955c-f488b31a200b",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "node_timex_sync_status{instance=\"$instance\"}",
"legend": "NTP Enabled"
},
{
"refId": "B",
"expr": "node_timex_offset_seconds{instance=\"$instance\"}",
"legend": "NTP Offset"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "NTP",
"links": [],
"description": "",
"maxPerRow": 4,
"custom": {
"calc": "avg",
"baseColor": "#9470FF",
"serieWidth": 20,
"sortOrder": "desc"
},
"options": {
"valueMappings": [],
"standardOptions": {
"util": "milliseconds"
}
}
},
{
"type": "hexbin",
"id": "f1c435fa-8c9d-40cc-96ab-54889f32a178",
"layout": {
"h": 3,
"w": 12,
"x": 12,
"y": 0,
"i": "f1c435fa-8c9d-40cc-96ab-54889f32a178",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "count(count(node_cpu_seconds_total{instance=~\"$instance\"}) by (cpu))",
"legend": "Core"
},
{
"refId": "B",
"expr": "node_memory_MemTotal_bytes{instance=~\"$instance\"}",
"legend": "Mem"
},
{
"refId": "C",
"expr": "node_memory_SwapTotal_bytes{instance=~\"$instance\"}",
"legend": "Swap"
},
{
"refId": "D",
"expr": "sum(node_filesystem_size_bytes{instance=~\"$instance\"}) ",
"legend": "Disk"
},
{
"refId": "E",
"expr": "node_filefd_allocated{instance=~\"$instance\"}",
"legend": "Openfiles"
},
{
"refId": "F",
"expr": "rate(node_context_switches_total{instance=~\"$instance\"}[1m])",
"legend": "ContextSwitch"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "System Info",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"calc": "lastNotNull",
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"reverseColorOrder": false,
"colorDomainAuto": true
},
"options": {
"standardOptions": {}
}
},
{
"type": "timeseries",
"id": "d0add8e0-c815-4442-8a3b-d86d3bc3d8fd",
"layout": {
"h": 7,
"w": 9,
"x": 0,
"y": 3,
"i": "d0add8e0-c815-4442-8a3b-d86d3bc3d8fd",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "max(node_load1{instance=~\"$instance\"}) / max(count(node_cpu_seconds_total{instance=~\"$instance\", mode=~\"system\"}))",
"legend": "1m"
},
{
"refId": "B",
"expr": "max(node_load5{instance=~\"$instance\"}) / max(count(node_cpu_seconds_total{instance=~\"$instance\", mode=~\"system\"}))",
"legend": "5m"
},
{
"refId": "C",
"expr": "max(node_load15{instance=~\"$instance\"}) / max(count(node_cpu_seconds_total{instance=~\"$instance\", mode=~\"system\"}))",
"legend": "15m"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Load Average",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "83f3134f-1fdb-4bd6-bc2e-7d32222c2660",
"layout": {
"h": 7,
"w": 15,
"x": 9,
"y": 3,
"i": "83f3134f-1fdb-4bd6-bc2e-7d32222c2660",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "1 - (avg(irate(node_cpu_seconds_total{instance=~\"$instance\",mode=~\"idle\"}[2m])))",
"legend": "CPU"
},
{
"refId": "B",
"expr": "(node_memory_MemTotal_bytes{instance=~\"$instance\"} - (node_memory_MemFree_bytes{instance=~\"$instance\"} + node_memory_Buffers_bytes{instance=~\"$instance\"} + node_memory_Cached_bytes{instance=~\"$instance\"})) / node_memory_MemTotal_bytes{instance=~\"$instance\"}",
"legend": "MEM"
},
{
"refId": "C",
"expr": "avg(irate(node_cpu_seconds_total{instance=~\"$instance\",mode=~\"iowait\"}[2m]))",
"legend": "IOWAIT"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Resource Usage",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "2a57ae69-c0e1-447c-825d-bcee2bcb6d3f",
"layout": {
"h": 5,
"w": 24,
"x": 0,
"y": 10,
"i": "2a57ae69-c0e1-447c-825d-bcee2bcb6d3f",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "node_memory_MemAvailable_bytes{instance=~\"$instance\"}",
"legend": "Available"
},
{
"refId": "B",
"expr": "node_memory_Buffers_bytes{instance=~\"$instance\"} + node_memory_Cached_bytes{instance=~\"$instance\"}",
"legend": "Buffers/Cached"
},
{
"refId": "C",
"expr": "node_memory_MemTotal_bytes{instance=~\"$instance\"} - node_memory_MemAvailable_bytes{instance=~\"$instance\"}",
"legend": "Used"
},
{
"refId": "D",
"expr": "node_memory_MemFree_bytes{instance=~\"$instance\"}",
"legend": "Free"
},
{
"refId": "E",
"expr": "node_memory_MemTotal_bytes{instance=~\"$instance\"}",
"legend": "Total"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Memory Usage",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "f291673f-3b9d-4b4f-a2b1-2ed7131e57f0",
"layout": {
"h": 9,
"w": 15,
"x": 0,
"y": 15,
"i": "f291673f-3b9d-4b4f-a2b1-2ed7131e57f0",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum by (instance) (irate(node_disk_reads_completed_total{instance=~\"$instance\"}[1m]))",
"legend": "{{instance}}-Reads"
},
{
"refId": "B",
"expr": "sum by (instance) (irate(node_disk_writes_completed_total{instance=~\"$instance\"}[1m]))",
"legend": "{{instance}}-Writes"
},
{
"refId": "C",
"expr": "sum by (instance) (node_disk_io_now{instance=~\"$instance\"})",
"legend": "{{instance}}-CurrentIO"
},
{
"refId": "D",
"expr": "sum by (instance) (irate(node_disk_read_time_seconds_total{instance=~\"$instance\"}[5m])) / sum by (instance) (irate(node_disk_reads_completed_total{instance=~\"$instance\"}[5m]))",
"legend": "{{instance}}-Read-consuming"
},
{
"refId": "E",
"expr": "sum by (instance) (irate(node_disk_write_time_seconds_total{instance=~\"$instance\"}[5m])) / sum by (instance) (irate(node_disk_writes_completed_total{instance=~\"$instance\"}[5m]))",
"legend": "{{instance}}-Write-consuming"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Disk IO",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "8cb012a2-113c-4d06-8bea-da7d14a870d2",
"layout": {
"h": 9,
"w": 9,
"x": 15,
"y": 15,
"i": "8cb012a2-113c-4d06-8bea-da7d14a870d2",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "node_filesystem_avail_bytes{instance=~\"$instance\",fstype=~\"ext4|xfs|nfs\"}",
"legend": "{{device}}",
"instant": false
},
{
"refId": "B",
"expr": "1-(node_filesystem_free_bytes{instance=~\"$instance\",fstype=~\"ext4|xfs|nfs\"} / node_filesystem_size_bytes{instance=~\"$instance\",fstype=~\"ext4|xfs|nfs\"})",
"legend": "{{device}}"
},
{
"refId": "C",
"expr": "sum(node_filesystem_size_bytes{instance=~\"$instance\",fstype=~\"ext4|xfs|nfs\"})",
"legend": "{{device}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Disk Usage",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "2f11831d-6bdd-4cfa-9b3f-3bc5b1722185",
"layout": {
"h": 10,
"w": 24,
"x": 0,
"y": 24,
"i": "2f11831d-6bdd-4cfa-9b3f-3bc5b1722185",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum by (instance) (irate(node_network_receive_bytes_total{instance=~\"$instance\",device!~\"tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|nointernet.*\"}[5m]))",
"legend": "{{instance}}-{{device}}-receive"
},
{
"refId": "B",
"expr": "sum by (instance) (irate(node_network_transmit_bytes_total{instance=~\"$instance\",device!~\"tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|nointernet.*\"}[5m]))",
"legend": "{{instance}}-{{device}}-transmit"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Network Traffic",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
}
]
}
}

View File

@@ -0,0 +1,501 @@
{
"name": "GitLab - NGINX VTS",
"tags": "",
"ident": "",
"configs": {
"version": "3.0.0",
"links": [],
"var": [
{
"name": "DS_PROMETHEUS",
"type": "datasource",
"definition": "prometheus",
"defaultValue": 5
},
{
"name": "instance",
"type": "query",
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(up{job=\"nginx\"}, instance)",
"reg": "",
"multi": true,
"allOption": true
}
],
"panels": [
{
"type": "timeseries",
"id": "ad321a25-c895-4cf0-b5ce-2a60c7f035ac",
"layout": {
"h": 4,
"w": 7,
"x": 0,
"y": 0,
"i": "ad321a25-c895-4cf0-b5ce-2a60c7f035ac"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "B",
"expr": "topk(1, count by (hostname) (nginx_vts_info{instance=~\"$instance\"}))",
"legend": "hostname: {{hostname}}"
},
{
"refId": "C",
"expr": "topk(1, count by (version) (nginx_vts_info{instance=~\"$instance\"}))",
"legend": "nginx: {{version}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Nginx Info",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "f6643654-98f5-4d4a-8dd2-943a2f4f8e68",
"layout": {
"h": 12,
"w": 17,
"x": 7,
"y": 0,
"i": "f6643654-98f5-4d4a-8dd2-943a2f4f8e68"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum by (instance) (\n rate(nginx_vts_upstream_request_seconds_total{instance=~\"$instance\"}[5m])\n) /\nsum by (instance) (\n rate(nginx_vts_upstream_requests_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{instance}}",
"step": 15,
"instant": false,
"time": {
"start": "now-6h",
"end": "now"
}
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Latency",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "seconds",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "cb24fbce-8bb5-425e-89f3-c32e22532b5d",
"layout": {
"h": 8,
"w": 7,
"x": 0,
"y": 4,
"i": "cb24fbce-8bb5-425e-89f3-c32e22532b5d"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum by (shared) (nginx_vts_main_shm_usage_bytes{instance=~\"$instance\", job=~\"nginx\"})",
"legend": "{{shared}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Shared Memory Usage",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "54c5264a-a596-486f-a4a8-4ab451d91870",
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 13,
"i": "54c5264a-a596-486f-a4a8-4ab451d91870"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum without (instance) (\n rate(nginx_vts_server_bytes_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{direction}}-{{host}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Server - Current total of incoming / outgoing bytes",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.2,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "d0efa52f-862e-49b0-aa8c-7a9ba4f2af0b",
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 13,
"i": "d0efa52f-862e-49b0-aa8c-7a9ba4f2af0b"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum without (instance) (\n rate(nginx_vts_upstream_bytes_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{direction}}-{{backend}}-{{upstream}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Upstream - Current total of incoming / outgoing bytes",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "712b82cd-633a-4088-8695-fa2d75ab37ca",
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 22,
"i": "712b82cd-633a-4088-8695-fa2d75ab37ca"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum without (instance) (\n rate(nginx_vts_server_requests_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{code}} {{host}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Server - Requests by HTTP code",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.2,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "09ca96b6-4e23-4db2-bc14-79c5f965d16f",
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 22,
"i": "09ca96b6-4e23-4db2-bc14-79c5f965d16f"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum without (instance) (\n rate(nginx_vts_upstream_requests_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{code}} {{backend}} {{upstream}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Upstream - Requests by HTTP code",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.2,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
}
]
}
}

View File

@@ -0,0 +1,517 @@
{
"name": "GitLab - Overview",
"tags": "",
"ident": "",
"configs": {
"version": "3.0.0",
"links": [
{
"title": "GitLab Docs",
"url": "https://docs.gitlab.com/"
}
],
"var": [
{
"name": "DS_PROMETHEUS",
"type": "datasource",
"definition": "prometheus",
"defaultValue": 5
}
],
"panels": [
{
"type": "text",
"id": "ffcfeb75-3a21-40b1-8fe7-313aa3e5f4e3",
"layout": {
"h": 3,
"w": 24,
"x": 0,
"y": 0,
"i": "ffcfeb75-3a21-40b1-8fe7-313aa3e5f4e3",
"isResizable": true
},
"version": "3.0.0",
"name": "",
"links": [],
"maxPerRow": 4,
"custom": {
"textColor": "#000000",
"bgColor": "#FFFFFF",
"textSize": 12,
"justifyContent": "center",
"alignItems": "center",
"content": "<br><h1><i><font color=#5991A7><b>GitLab Service Status</b></font></i></h1>\n"
}
},
{
"type": "text",
"id": "9991440f-1e01-4807-8911-2619329af244",
"layout": {
"h": 3,
"w": 2,
"x": 0,
"y": 3,
"i": "9991440f-1e01-4807-8911-2619329af244",
"isResizable": true
},
"version": "3.0.0",
"name": "Gitlab",
"links": [],
"maxPerRow": 4,
"custom": {
"textColor": "#000000",
"bgColor": "#FFFFFF",
"textSize": 12,
"justifyContent": "center",
"alignItems": "center",
"content": "<p style=\"text-align:center;\"><img src=\"https://www.cloudfoundry.org/wp-content/uploads/2017/10/icon_gitlab_cf@2x.png\" width=80px/></p>"
}
},
{
"type": "stat",
"id": "f3fcab1f-d198-48c6-bf1d-44e481a9fa7a",
"layout": {
"h": 3,
"w": 5,
"x": 2,
"y": 3,
"i": "f3fcab1f-d198-48c6-bf1d-44e481a9fa7a",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "topk(1, count by (version) (gitlab_build_info{job=~\"gitlab-workhorse\"}))",
"legend": ""
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "GitLab Version",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "first",
"valueField": "version",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
}
},
{
"type": "hexbin",
"id": "7a4c7be6-56de-4c76-8c2f-cef5a80e84b5",
"layout": {
"h": 3,
"w": 17,
"x": 7,
"y": 3,
"i": "7a4c7be6-56de-4c76-8c2f-cef5a80e84b5",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(up{job!~\"gitlab_exporter.*|node\",instance!~\".*:9168\",service=~\"gitlab\"}) by (job) / count(up{job!~\"gitlab_exporter.*|node\",instance!~\".*:9168\",service=~\"gitlab\"}) by (job) * 100",
"legend": "{{job}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Service Status",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"calc": "lastNotNull",
"colorRange": [
"#f0ee6e",
"#6ba261",
"#306d52"
],
"reverseColorOrder": false,
"colorDomainAuto": true
},
"options": {
"standardOptions": {}
}
},
{
"version": "3.0.0",
"id": "9efa19b0-18fd-4f4d-abee-cebe09a36803",
"type": "text",
"links": [],
"layout": {
"h": 2,
"w": 24,
"x": 0,
"y": 6,
"i": "9efa19b0-18fd-4f4d-abee-cebe09a36803",
"isResizable": true
},
"targets": [],
"options": {
"thresholds": {
"style": "line",
"steps": []
},
"standardOptions": {
"util": "none"
},
"legend": {
"displayMode": "list"
},
"tooltip": {
"mode": "multi"
}
},
"custom": {
"version": "3.0.0",
"content": "<br><h1><i><font color=#5991A7><b>Sidekiq Statistics</b></font></i></h1>\n"
},
"maxPerRow": 4
},
{
"type": "hexbin",
"id": "fd0e96e0-7577-4475-8287-143d9d7e0bc7",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 8,
"i": "fd0e96e0-7577-4475-8287-143d9d7e0bc7",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "max(sidekiq_jobs_processed_total)",
"legend": "Processed"
},
{
"refId": "B",
"expr": "max(sidekiq_jobs_failed_total)",
"legend": "Failed"
},
{
"refId": "C",
"expr": "max(sidekiq_jobs_enqueued_size)",
"legend": "Enqueued"
},
{
"refId": "D",
"expr": "max(sidekiq_jobs_scheduled_size)",
"legend": "Scheduled"
},
{
"refId": "E",
"expr": "max(sidekiq_jobs_retry_size)",
"legend": "Retry"
},
{
"refId": "F",
"expr": "max(sidekiq_jobs_dead_size)",
"legend": "Dead"
},
{
"refId": "G",
"expr": "max(sidekiq_processes_size)",
"legend": "Processes"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Background Jobs",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"calc": "lastNotNull",
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"reverseColorOrder": false,
"colorDomainAuto": true
},
"options": {
"standardOptions": {}
}
},
{
"type": "timeseries",
"id": "1645b734-ef79-4ff5-8be4-a24747cf4e6e",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 8,
"i": "1645b734-ef79-4ff5-8be4-a24747cf4e6e",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "B",
"expr": "topk(10, avg_over_time(sidekiq_queue_size{}[5m]))",
"legend": "{{name}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Queue Size Top 10",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"version": "3.0.0",
"id": "e25e6cfb-020b-4743-8de9-ef6240144a94",
"type": "text",
"links": [],
"layout": {
"h": 2,
"w": 24,
"x": 0,
"y": 12,
"i": "e25e6cfb-020b-4743-8de9-ef6240144a94",
"isResizable": true
},
"targets": [],
"options": {
"thresholds": {
"style": "line",
"steps": []
},
"standardOptions": {
"util": "none"
},
"legend": {
"displayMode": "list"
},
"tooltip": {
"mode": "multi"
}
},
"custom": {
"version": "3.0.0",
"content": "<br><h1><i><font color=#5991A7><b>Workhorse Statistics</b></font></i></h1>"
},
"maxPerRow": 4
},
{
"type": "timeseries",
"id": "dd030993-481d-43e4-887a-3c0dee279ea0",
"layout": {
"h": 9,
"w": 12,
"x": 0,
"y": 14,
"i": "dd030993-481d-43e4-887a-3c0dee279ea0",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "B",
"expr": "sum by (code) (\n rate(gitlab_workhorse_http_requests_total{instance=~\".*:9229\"}[5m])\n)",
"legend": "{{code}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Http Request Total",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "77343eca-60d3-4be9-b80d-6db471399339",
"layout": {
"h": 9,
"w": 12,
"x": 12,
"y": 14,
"i": "77343eca-60d3-4be9-b80d-6db471399339",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "B",
"expr": "sum by (le) (\n rate(gitlab_workhorse_http_request_duration_seconds_bucket{instance=~\".*:9229\"}[5m])\n)",
"legend": "{{le}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Workhorse Latency",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
}
]
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,739 @@
{
"name": "GitLab - Redis",
"tags": "",
"ident": "",
"configs": {
"version": "3.0.0",
"links": [],
"var": [
{
"name": "DS_PROMETHEUS",
"type": "datasource",
"definition": "prometheus",
"defaultValue": 5
},
{
"name": "instance",
"type": "query",
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(up{job=\"redis\"}, instance)",
"reg": "",
"multi": true,
"allOption": true
}
],
"panels": [
{
"type": "stat",
"id": "4ff9943b-6edd-4b6d-960c-0b992b8e1fd3",
"layout": {
"h": 3,
"w": 4,
"x": 0,
"y": 0,
"i": "4ff9943b-6edd-4b6d-960c-0b992b8e1fd3"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "avg(time() - redis_start_time_seconds{instance=~\"$instance\"})",
"legend": ""
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Uptime",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"standardOptions": {
"util": "seconds",
"decimals": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
}
]
}
}
},
{
"type": "stat",
"id": "38a6c015-4dae-4765-8f5f-9961ca302e18",
"layout": {
"h": 3,
"w": 4,
"x": 4,
"y": 0,
"i": "38a6c015-4dae-4765-8f5f-9961ca302e18"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(\n avg_over_time(redis_connected_clients{instance=~\"$instance\"}[5m])\n)",
"legend": "Clients"
},
{
"refId": "B",
"expr": "sum(\n avg_over_time(redis_connected_slaves{instance=~\"$instance\"}[5m])\n)",
"legend": "Slaves"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Clients",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "avg",
"valueField": "Value",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"standardOptions": {
"util": "none",
"decimals": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
}
},
{
"type": "timeseries",
"id": "0739f047-3fcd-4bf0-ac63-09c0a4170452",
"layout": {
"h": 6,
"w": 8,
"x": 8,
"y": 0,
"i": "0739f047-3fcd-4bf0-ac63-09c0a4170452"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(\n rate(redis_commands_processed_total{instance=~\"$instance\"}[5m])\n)",
"legend": ""
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Commands Executed",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "7b23fa98-74eb-49b5-a29b-cb68fbd6d4d9",
"layout": {
"h": 6,
"w": 8,
"x": 16,
"y": 0,
"i": "7b23fa98-74eb-49b5-a29b-cb68fbd6d4d9"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(\n rate(redis_keyspace_hits_total{instance=~\"$instance\"}[5m])\n)",
"legend": "hits"
},
{
"refId": "B",
"expr": "sum(\n rate(redis_keyspace_misses_total{instance=~\"$instance\"}[5m])\n)",
"legend": "misses"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Hits, Misses per Second",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "8a29cc91-697c-48d7-a613-14e81aeb6772",
"layout": {
"h": 10,
"w": 8,
"x": 0,
"y": 3,
"i": "8a29cc91-697c-48d7-a613-14e81aeb6772"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "redis_memory_used_bytes{instance=~\"$instance\"}",
"legend": "used - {{instance}}"
},
{
"refId": "B",
"expr": "redis_config_maxmemory{instance=~\"$instance\"} > 0",
"legend": "max - {{instance}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Memory Usage",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesIEC",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "34e0f67f-acf1-49bb-b13c-ceebd0b17a7e",
"layout": {
"h": 7,
"w": 8,
"x": 8,
"y": 6,
"i": "34e0f67f-acf1-49bb-b13c-ceebd0b17a7e"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(rate(redis_expired_keys_total{instance=~\"$instance\"}[5m]))",
"legend": "expired"
},
{
"refId": "B",
"expr": "sum(rate(redis_evicted_keys_total{instance=~\"$instance\"}[5m]))",
"legend": "evicted"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Expired / Evicted",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "634f53c0-31d8-4a9f-9cfc-a7c241797359",
"layout": {
"h": 7,
"w": 8,
"x": 16,
"y": 6,
"i": "634f53c0-31d8-4a9f-9cfc-a7c241797359"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(\n rate(redis_net_input_bytes_total{instance=~\"$instance\"}[5m])\n)",
"legend": "In"
},
{
"refId": "B",
"expr": "sum(\n rate(redis_net_output_bytes_total{instance=~\"$instance\"}[5m])\n)",
"legend": "Out"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Network I/O",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "9267bb6a-edc7-4088-913e-5902e81cf736",
"layout": {
"h": 7,
"w": 16,
"x": 0,
"y": 13,
"i": "9267bb6a-edc7-4088-913e-5902e81cf736"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum without (instance) (\n rate(redis_commands_total{instance=~\"$instance\"}[5m])\n) > 0",
"legend": "{{ cmd }}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Command Calls / sec",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.8,
"gradientMode": "none",
"stack": "normal",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "51d28f91-c2f3-4f22-859e-bdc88f25ead2",
"layout": {
"h": 14,
"w": 8,
"x": 16,
"y": 13,
"i": "51d28f91-c2f3-4f22-859e-bdc88f25ead2"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(redis_db_keys{instance=~\"$instance\"} - redis_db_keys_expiring{instance=~\"$instance\"}) ",
"legend": "not expiring"
},
{
"refId": "B",
"expr": "sum(redis_db_keys_expiring{instance=~\"$instance\"})",
"legend": "expiring"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Expiring vs Not-Expiring Keys",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.7,
"gradientMode": "none",
"stack": "normal",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "1c0cd7d9-5899-49c0-8939-a361b76f60ac",
"layout": {
"h": 7,
"w": 16,
"x": 0,
"y": 20,
"i": "1c0cd7d9-5899-49c0-8939-a361b76f60ac"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum by (db) (\n redis_db_keys{instance=~\"$instance\"}\n)",
"legend": "{{ db }} "
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Items per DB",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.7,
"gradientMode": "none",
"stack": "normal",
"scaleDistribution": {
"type": "linear"
}
}
}
]
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 263 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 203 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 141 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 264 KiB

View File

@@ -0,0 +1,127 @@
### GitLab Dashboard & Alerts
使用 [categraf](https://github.com/flashcatcloud/categraf) 的 [inputs.prometheus](https://github.com/flashcatcloud/categraf/tree/main/inputs/prometheus) 插件采集 [GitLab](https://docs.gitlab.com/) 各服务组件暴露的指标数据。
采集前需要先开启 GitLab 内置的 Prometheus 监控支持，参考：
[Monitoring GitLab with Prometheus](https://docs.gitlab.com/ee/administration/monitoring/prometheus/)
### 采集配置
在 categraf 的 prometheus 插件配置文件中加入如下采集配置（请将示例中的 IP 和端口替换为实际的 GitLab 各组件地址）：
```toml
# /opt/categraf/conf/input.prometheus/prometheus.toml
# # collect interval
# interval = 15
[[instances]]
urls = [
"http://192.168.11.77:9236/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="gitaly"}
[[instances]]
urls = [
"http://192.168.11.77:9168/sidekiq"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="gitlab_exporter_sidekiq"}
[[instances]]
urls = [
"http://192.168.11.77:9168/database"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="gitlab_exporter_database"}
[[instances]]
urls = [
"http://192.168.11.77:8082/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="gitlab-sidekiq"}
[[instances]]
urls = [
"http://192.168.11.77:8082/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="gitlab-sidekiq"}
[[instances]]
urls = [
"http://192.168.11.77:9229/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="gitlab-workhorse"}
[[instances]]
urls = [
"http://192.168.11.77:9100/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="node"}
[[instances]]
urls = [
"http://192.168.11.77:9187/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="postgres"}
[[instances]]
urls = [
"http://192.168.11.77:9121/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="redis"}
[[instances]]
urls = [
"http://192.168.11.77:9999/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="nginx"}
```
Dashboards:
[MachinePerformance](../dashboards/MachinePerformance.json)
![MachinePerformance](./MachinePerformance.png)
[NGINXVTS](../dashboards/NGINXVTS.json)
![NGINXVTS](./NGINXVTS.png)
[Overview](../dashboards/Overview.json)
![Overview](./Overview.png)
[PostgreSQL](../dashboards/PostgreSQL.json)
![PostgreSQL](./PostgreSQL.png)
[Redis](../dashboards/Redis.json)
![Redis](./Redis.png)
Alerts:
[alerts](../alerts/gitlab_by_categraf.json)
![alert](./alerts.png)

Binary file not shown.

After

Width:  |  Height:  |  Size: 312 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 144 KiB

View File

@@ -0,0 +1,73 @@
[
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "http detect failed",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "http_response_result_code != 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
}
]

View File

Before

Width:  |  Height:  |  Size: 975 B

After

Width:  |  Height:  |  Size: 975 B

View File

Before

Width:  |  Height:  |  Size: 112 KiB

After

Width:  |  Height:  |  Size: 112 KiB

View File

Before

Width:  |  Height:  |  Size: 182 KiB

After

Width:  |  Height:  |  Size: 182 KiB

View File

Before

Width:  |  Height:  |  Size: 3.6 KiB

After

Width:  |  Height:  |  Size: 3.6 KiB

View File

@@ -1,16 +1,34 @@
[
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "kafka 数据有丢失风险-副本数小于3",
"note": "",
"prod": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3",
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
@@ -51,20 +69,40 @@
"append_tags": [
"service=kafka",
"type=categraf"
]
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "kafka 服务宕机",
"note": "",
"prod": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "kafka_broker_info{service=~\"kafka\"} < 1",
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kafka_broker_info{service=~\"kafka\"} < 1",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 60,
"enable_stime": "00:00",
"enable_stimes": [
@@ -105,20 +143,40 @@
"append_tags": [
"type=categraf",
"service=kafka"
]
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "kafka 消费能力不足-延迟超过5分钟",
"note": "",
"prod": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "max(kafka_consumer_lag_millis) by (topic, consumergroup) / 1000 > 300",
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "max(kafka_consumer_lag_millis) by (topic, consumergroup) / 1000 > 300",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
@@ -159,6 +217,8 @@
"append_tags": [
"service=kafka",
"type=categraf"
]
],
"annotations": null,
"extra_config": null
}
]

View File

@@ -0,0 +1,148 @@
[
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Insufficient consumption ability - delay exceeds 5 minutes - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kafka_consumer_lag_millis / 1000 > 300",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Risk of data loss - number of replicas less than 3 - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka"
],
"annotations": null,
"extra_config": null
}
]

View File

Before

Width:  |  Height:  |  Size: 1.4 KiB

After

Width:  |  Height:  |  Size: 1.4 KiB

View File

Before

Width:  |  Height:  |  Size: 78 KiB

After

Width:  |  Height:  |  Size: 78 KiB

View File

Before

Width:  |  Height:  |  Size: 145 KiB

After

Width:  |  Height:  |  Size: 145 KiB

View File

Before

Width:  |  Height:  |  Size: 20 KiB

After

Width:  |  Height:  |  Size: 20 KiB

View File

@@ -0,0 +1,860 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "A certain disk is unable to read/write normally",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(node_filesystem_device_error{instance=\"$node\",mountpoint!~\"/var/lib/.*\",mountpoint!~\"/run.*\"}) > 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Disk needs to be cleaned - utilization has reached 92% - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(100 - ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes) ) > 92 ",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Insufficient computational resources - average load per core of the machine is greater than 10 - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg (node_load1) by (instance)/count(count(node_cpu_seconds_total) by (cpu,instance)) by (instance) >10",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Insufficient file handles - usage exceeds 90% - node exporter",
"note": "You can increase the file handle limit or expand the capacity",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(node_filefd_allocated{instance=\"$node\"}/node_filefd_maximum{instance=\"$node\"}*100) > 90",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Insufficient inode resources - usage exceeds 90% - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(100 - ((node_filesystem_files_free * 100) / node_filesystem_files))>90",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Insufficient memory resources - utilization is greater than 75% - node exporter",
"note": "Expansion or upgrading of configuration is required",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(node_memory_MemTotal_bytes - node_memory_MemFree_bytes - (node_memory_Cached_bytes + node_memory_Buffers_bytes))/node_memory_MemTotal_bytes*100 > 75",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Insufficient memory resources - utilization is greater than 95% - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(node_memory_MemTotal_bytes - node_memory_MemFree_bytes - (node_memory_Cached_bytes + node_memory_Buffers_bytes))/node_memory_MemTotal_bytes*100 > 95",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Packet loss in the inbound direction of the network card - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(node_network_receive_drop_total{device=~\"e.*\"}[1m]) > 3",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Packet loss in the outbound direction of the network card - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(node_network_transmit_drop_total{device=~\"e.*\"}[1m]) > 3",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "System conntrack needs to be adjusted - usage exceeds 80% - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "node_nf_conntrack_entries / node_nf_conntrack_entries_limit*100 > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "System experiences OOM - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(node_vmstat_oom_kill[1m]) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Too many running processes - exceeding 3000 - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "node_procs_running > 3000",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
}
]

View File

@@ -0,0 +1,979 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Address is unreachable by PING, please pay attention - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ping_result_code != 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Hard disk - expected to be written full in 4 hours - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "predict_linear(disk_free[1h], 4*3600) < 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Hard disk - IO is very busy - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(diskio_io_time[1m])/10 > 99",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "host",
"datasource_ids": null,
"name": "Lost connection with monitoring target - telegraf",
"note": "",
"prod": "host",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"key": "all_hosts",
"op": "==",
"values": []
}
],
"triggers": [
{
"duration": 60,
"severity": 2,
"type": "target_miss"
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {},
"notify_aggregation": {}
}
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Machine load - high CPU, please pay attention - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "cpu_usage_idle{cpu=\"cpu-total\"} < 25",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Machine load - high memory, please pay attention - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mem_available_percent < 25",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Network connection - number of TIME_WAIT exceeds 20,000 - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "netstat_tcp_time_wait > 20000",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "packet loss in the inbound direction - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(net_drop_in[1m]) > 0",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "packet loss in the outbound direction - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(net_drop_out[1m]) > 0",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Port detection failed, please pay attention - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "net_response_result_code != 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "process handle limit is too small - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_rlimit_num_fds_soft < 2048",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Process monitoring - lookup failure - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_lookup_result_code != 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "there are processes with 0 count, a certain process may have crashed - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_lookup_running == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
}
]

View File

@@ -0,0 +1,73 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "NTP time offset is too large",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ntp_offset_ms > 1000 or ntp_offset_ms < -1000",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
}
]

Some files were not shown because too many files have changed in this diff Show More