Compare commits

..

1 Commits

Author SHA1 Message Date
kongfei
e884145b9d alerts rule and dashboards for pg 2023-07-07 13:50:51 +08:00
234 changed files with 3924 additions and 20542 deletions

3
.gitignore vendored
View File

@@ -31,9 +31,6 @@ _test
/etc/*.local.yml
/etc/*.local.conf
/etc/plugins/*.local.yml
/etc/script/rules.yaml
/etc/script/alert-rules.json
/etc/script/record-rules.json
/data*
/tarball
/run

View File

@@ -15,7 +15,6 @@ import (
"github.com/ccfos/nightingale/v6/alert/router"
"github.com/ccfos/nightingale/v6/alert/sender"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
@@ -48,19 +47,16 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
notifyConfigCache := memsto.NewNotifyConfigCache(ctx)
dsCache := memsto.NewDatasourceCache(ctx, syncStats)
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
promClients := prom.NewPromClient(ctx, config.Alert.Heartbeat)
externalProcessors := process.NewExternalProcessors()
Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients, userCache, userGroupCache)
Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients)
r := httpx.GinEngine(config.Global.RunMode, config.HTTP)
rt := router.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors)
rt.Config(r)
dumper.ConfigRouter(r)
httpClean := httpx.Init(config.HTTP, r)
@@ -71,7 +67,9 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
}
func Start(alertc aconf.Alert, pushgwc pconf.Pushgw, syncStats *memsto.Stats, alertStats *astats.Stats, externalProcessors *process.ExternalProcessorsType, targetCache *memsto.TargetCacheType, busiGroupCache *memsto.BusiGroupCacheType,
alertMuteCache *memsto.AlertMuteCacheType, alertRuleCache *memsto.AlertRuleCacheType, notifyConfigCache *memsto.NotifyConfigCacheType, datasourceCache *memsto.DatasourceCacheType, ctx *ctx.Context, promClients *prom.PromClientMap, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType) {
alertMuteCache *memsto.AlertMuteCacheType, alertRuleCache *memsto.AlertRuleCacheType, notifyConfigCache *memsto.NotifyConfigCacheType, datasourceCache *memsto.DatasourceCacheType, ctx *ctx.Context, promClients *prom.PromClientMap) {
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
alertSubscribeCache := memsto.NewAlertSubscribeCache(ctx, syncStats)
recordingRuleCache := memsto.NewRecordingRuleCache(ctx, syncStats)

View File

@@ -15,7 +15,6 @@ type AnomalyPoint struct {
Value float64 `json:"value"`
Severity int `json:"severity"`
Triggered bool `json:"triggered"`
Query string `json:"query"`
}
func NewAnomalyPoint(key string, labels map[string]string, ts int64, value float64, severity int) AnomalyPoint {

View File

@@ -83,10 +83,6 @@ func (e *Consumer) consumeOne(event *models.AlertCurEvent) {
}
func (e *Consumer) persist(event *models.AlertCurEvent) {
if event.Status != 0 {
return
}
if !e.ctx.IsCenter {
event.DB2FE()
err := poster.PostByUrls(e.ctx, "/v1/n9e/event-persist", event)

View File

@@ -28,10 +28,9 @@ type Dispatch struct {
alerting aconf.Alerting
Senders map[string]sender.Sender
tpls map[string]*template.Template
ExtraSenders map[string]sender.Sender
BeforeSenderHook func(*models.AlertCurEvent) bool
senders map[string]sender.Sender
tpls map[string]*template.Template
ExtraSenders map[string]sender.Sender
ctx *ctx.Context
@@ -52,10 +51,9 @@ func NewDispatch(alertRuleCache *memsto.AlertRuleCacheType, userCache *memsto.Us
alerting: alerting,
Senders: make(map[string]sender.Sender),
tpls: make(map[string]*template.Template),
ExtraSenders: make(map[string]sender.Sender),
BeforeSenderHook: func(*models.AlertCurEvent) bool { return true },
senders: make(map[string]sender.Sender),
tpls: make(map[string]*template.Template),
ExtraSenders: make(map[string]sender.Sender),
ctx: ctx,
}
@@ -65,7 +63,7 @@ func NewDispatch(alertRuleCache *memsto.AlertRuleCacheType, userCache *memsto.Us
func (e *Dispatch) ReloadTpls() error {
err := e.relaodTpls()
if err != nil {
logger.Errorf("failed to reload tpls: %v", err)
logger.Error("failed to reload tpls: %v", err)
}
duration := time.Duration(9000) * time.Millisecond
@@ -102,7 +100,7 @@ func (e *Dispatch) relaodTpls() error {
e.RwLock.Lock()
e.tpls = tmpTpls
e.Senders = senders
e.senders = senders
e.RwLock.Unlock()
return nil
}
@@ -143,7 +141,7 @@ func (e *Dispatch) HandleEventNotify(event *models.AlertCurEvent, isSubscribe bo
}
// 处理事件发送,这里用一个goroutine处理一个event的所有发送事件
go e.Send(rule, event, notifyTarget)
go e.Send(rule, event, notifyTarget, isSubscribe)
// 如果是不是订阅规则出现的event, 则需要处理订阅规则的event
if !isSubscribe {
@@ -179,41 +177,26 @@ func (e *Dispatch) handleSub(sub *models.AlertSubscribe, event models.AlertCurEv
if sub.ForDuration > (event.TriggerTime - event.FirstTriggerTime) {
return
}
if len(sub.SeveritiesJson) != 0 {
match := false
for _, s := range sub.SeveritiesJson {
if s == event.Severity || s == 0 {
match = true
break
}
}
if !match {
return
}
}
sub.ModifyEvent(&event)
LogEvent(&event, "subscribe")
event.SubRuleId = sub.Id
e.HandleEventNotify(&event, true)
}
func (e *Dispatch) Send(rule *models.AlertRule, event *models.AlertCurEvent, notifyTarget *NotifyTarget) {
needSend := e.BeforeSenderHook(event)
if needSend {
for channel, uids := range notifyTarget.ToChannelUserMap() {
ctx := sender.BuildMessageContext(rule, []*models.AlertCurEvent{event}, uids, e.userCache)
e.RwLock.RLock()
s := e.Senders[channel]
e.RwLock.RUnlock()
if s == nil {
logger.Debugf("no sender for channel: %s", channel)
continue
}
s.Send(ctx)
func (e *Dispatch) Send(rule *models.AlertRule, event *models.AlertCurEvent, notifyTarget *NotifyTarget, isSubscribe bool) {
for channel, uids := range notifyTarget.ToChannelUserMap() {
ctx := sender.BuildMessageContext(rule, event, uids, e.userCache)
e.RwLock.RLock()
s := e.senders[channel]
e.RwLock.RUnlock()
if s == nil {
logger.Debugf("no sender for channel: %s", channel)
continue
}
logger.Debugf("send event: %s, channel: %s", event.Hash, channel)
for i := 0; i < len(ctx.Users); i++ {
logger.Debug("send event to user: ", ctx.Users[i])
}
s.Send(ctx)
}
// handle event callbacks

View File

@@ -69,16 +69,14 @@ func (arw *AlertRuleWorker) Start() {
if interval <= 0 {
interval = 10
}
ticker := time.NewTicker(time.Duration(interval) * time.Second)
go func() {
defer ticker.Stop()
for {
select {
case <-arw.quit:
return
case <-ticker.C:
default:
arw.Eval()
time.Sleep(time.Duration(interval) * time.Second)
}
}
}()
@@ -165,7 +163,6 @@ func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) []common.Anom
points := common.ConvertAnomalyPoints(value)
for i := 0; i < len(points); i++ {
points[i].Severity = query.Severity
points[i].Query = promql
}
lst = append(lst, points...)
}

View File

@@ -13,11 +13,7 @@ import (
)
func IsMuted(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType, alertMuteCache *memsto.AlertMuteCacheType) bool {
if rule.Disabled == 1 {
return true
}
if TimeSpanMuteStrategy(rule, event) {
if TimeNonEffectiveMuteStrategy(rule, event) {
return true
}
@@ -36,9 +32,12 @@ func IsMuted(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *m
return false
}
// TimeSpanMuteStrategy 根据规则配置的告警生效时间过滤,如果产生的告警不在规则配置的告警生效时间内,则不告警,即被mute
// 时间范围左闭右开默认范围00:00-24:00
func TimeSpanMuteStrategy(rule *models.AlertRule, event *models.AlertCurEvent) bool {
// TimeNonEffectiveMuteStrategy 根据规则配置的告警时间过滤,如果产生的告警不在规则配置的告警时间内,则不告警
func TimeNonEffectiveMuteStrategy(rule *models.AlertRule, event *models.AlertCurEvent) bool {
if rule.Disabled == 1 {
return true
}
tm := time.Unix(event.TriggerTime, 0)
triggerTime := tm.Format("15:04")
triggerWeek := strconv.Itoa(int(tm.Weekday()))
@@ -53,33 +52,18 @@ func TimeSpanMuteStrategy(rule *models.AlertRule, event *models.AlertCurEvent) b
if !strings.Contains(enableDaysOfWeek[i], triggerWeek) {
continue
}
if enableStime[i] < enableEtime[i] {
if enableEtime[i] == "23:59" {
// 02:00-23:59这种情况做个特殊处理相当于左闭右闭区间了
if triggerTime < enableStime[i] {
// mute, 即没生效
continue
}
} else {
// 02:00-04:00 或者 02:00-24:00
if triggerTime < enableStime[i] || triggerTime >= enableEtime[i] {
// mute, 即没生效
continue
}
if enableStime[i] <= enableEtime[i] {
if triggerTime < enableStime[i] || triggerTime > enableEtime[i] {
continue
}
} else if enableStime[i] > enableEtime[i] {
// 21:00-09:00
if triggerTime < enableStime[i] && triggerTime >= enableEtime[i] {
// mute, 即没生效
} else {
if triggerTime < enableStime[i] && triggerTime > enableEtime[i] {
continue
}
}
// 到这里说明当前时刻在告警规则的某组生效时间范围内,即没有 mute直接返回 false
// 到这里说明当前时刻在告警规则的某组生效时间范围内,直接返回 false
return false
}
return true
}
@@ -181,7 +165,7 @@ func matchMute(event *models.AlertCurEvent, mute *models.AlertMute, clock ...int
break
}
} else {
if triggerTime >= mute.PeriodicMutesJson[i].EnableStime || triggerTime < mute.PeriodicMutesJson[i].EnableEtime {
if triggerTime < mute.PeriodicMutesJson[i].EnableStime || triggerTime >= mute.PeriodicMutesJson[i].EnableEtime {
matchTime = true
break
}
@@ -193,21 +177,5 @@ func matchMute(event *models.AlertCurEvent, mute *models.AlertMute, clock ...int
return false
}
var matchSeverity bool
if len(mute.SeveritiesJson) > 0 {
for _, s := range mute.SeveritiesJson {
if event.Severity == s || s == 0 {
matchSeverity = true
break
}
}
} else {
matchSeverity = true
}
if !matchSeverity {
return false
}
return common.MatchTags(event.TagsMap, mute.ITags)
}

View File

@@ -167,13 +167,3 @@ func (n *Naming) ActiveServers(datasourceId int64) ([]string, error) {
// 30秒内有心跳就认为是活的
return models.AlertingEngineGetsInstances(n.ctx, "datasource_id = ? and clock > ?", datasourceId, time.Now().Unix()-30)
}
// ActiveServersByEngineName lists the alerting engine instances registered
// under this node's configured engine name.
//
// On the center node the instances are read through
// models.AlertingEngineGetsInstances; on any other node the lookup is
// delegated to poster.GetByUrls against the servers-active endpoint.
func (n *Naming) ActiveServersByEngineName() ([]string, error) {
	engineName := n.heartbeatConfig.EngineName

	if n.ctx.IsCenter {
		// An instance counts as alive when its heartbeat clock is within the last 30 seconds.
		aliveSince := time.Now().Unix() - 30
		return models.AlertingEngineGetsInstances(n.ctx, "engine_cluster = ? and clock > ?", engineName, aliveSince)
	}

	servers, err := poster.GetByUrls[[]string](n.ctx, "/v1/n9e/servers-active?engine_name="+engineName)
	return servers, err
}

View File

@@ -23,8 +23,6 @@ import (
"github.com/toolkits/pkg/str"
)
type EventMuteHookFunc func(event *models.AlertCurEvent) bool
type ExternalProcessorsType struct {
ExternalLock sync.RWMutex
Processors map[string]*Processor
@@ -45,8 +43,6 @@ func (e *ExternalProcessorsType) GetExternalAlertRule(datasourceId, id int64) (*
return processor, has
}
type HandleEventFunc func(event *models.AlertCurEvent)
type Processor struct {
datasourceId int64
@@ -70,10 +66,6 @@ type Processor struct {
promClients *prom.PromClientMap
ctx *ctx.Context
stats *astats.Stats
HandleFireEventHook HandleEventFunc
HandleRecoverEventHook HandleEventFunc
EventMuteHook EventMuteHookFunc
}
func (p *Processor) Key() string {
@@ -110,10 +102,6 @@ func NewProcessor(rule *models.AlertRule, datasourceId int64, atertRuleCache *me
promClients: promClients,
ctx: ctx,
stats: stats,
HandleFireEventHook: func(event *models.AlertCurEvent) {},
HandleRecoverEventHook: func(event *models.AlertCurEvent) {},
EventMuteHook: func(event *models.AlertCurEvent) bool { return false },
}
p.mayHandleGroup()
@@ -145,11 +133,6 @@ func (p *Processor) Handle(anomalyPoints []common.AnomalyPoint, from string, inh
logger.Debugf("rule_eval:%s event:%v is muted", p.Key(), event)
continue
}
if p.EventMuteHook(event) {
continue
}
tagHash := TagHash(anomalyPoint)
eventsMap[tagHash] = append(eventsMap[tagHash], event)
}
@@ -191,8 +174,6 @@ func (p *Processor) BuildEvent(anomalyPoint common.AnomalyPoint, from string, no
event.RuleConfig = p.rule.RuleConfig
event.RuleConfigJson = p.rule.RuleConfigJson
event.Severity = anomalyPoint.Severity
event.ExtraConfig = p.rule.ExtraConfigJSON
event.PromQl = anomalyPoint.Query
if from == "inner" {
event.LastEvalTime = now
@@ -246,8 +227,6 @@ func (p *Processor) RecoverSingle(hash string, now int64, value *string) {
cachedRule.UpdateEvent(event)
event.IsRecovered = true
event.LastEvalTime = now
p.HandleRecoverEventHook(event)
p.pushEventToQueue(event)
}
@@ -305,12 +284,9 @@ func (p *Processor) fireEvent(event *models.AlertCurEvent) {
if cachedRule == nil {
return
}
logger.Debugf("rule_eval:%s event:%+v fire", p.Key(), event)
if fired, has := p.fires.Get(event.Hash); has {
p.fires.UpdateLastEvalTime(event.Hash, event.LastEvalTime)
event.FirstTriggerTime = fired.FirstTriggerTime
p.HandleFireEventHook(event)
if cachedRule.NotifyRepeatStep == 0 {
logger.Debugf("rule_eval:%s event:%+v repeat is zero nothing to do", p.Key(), event)
@@ -320,10 +296,11 @@ func (p *Processor) fireEvent(event *models.AlertCurEvent) {
}
// 之前发送过告警了,这次是否要继续发送,要看是否过了通道静默时间
if event.LastEvalTime >= fired.LastSentTime+int64(cachedRule.NotifyRepeatStep)*60 {
if event.LastEvalTime > fired.LastSentTime+int64(cachedRule.NotifyRepeatStep)*60 {
if cachedRule.NotifyMaxNumber == 0 {
// 最大可以发送次数如果是0表示不想限制最大发送次数一直发即可
event.NotifyCurNumber = fired.NotifyCurNumber + 1
event.FirstTriggerTime = fired.FirstTriggerTime
p.pushEventToQueue(event)
} else {
// 有最大发送次数的限制,就要看已经发了几次了,是否达到了最大发送次数
@@ -332,6 +309,7 @@ func (p *Processor) fireEvent(event *models.AlertCurEvent) {
return
} else {
event.NotifyCurNumber = fired.NotifyCurNumber + 1
event.FirstTriggerTime = fired.FirstTriggerTime
p.pushEventToQueue(event)
}
}
@@ -339,7 +317,6 @@ func (p *Processor) fireEvent(event *models.AlertCurEvent) {
} else {
event.NotifyCurNumber = 1
event.FirstTriggerTime = event.TriggerTime
p.HandleFireEventHook(event)
p.pushEventToQueue(event)
}
}
@@ -455,7 +432,7 @@ func labelMapToArr(m map[string]string) []string {
}
func Hash(ruleId, datasourceId int64, vector common.AnomalyPoint) string {
return str.MD5(fmt.Sprintf("%d_%s_%d_%d_%s", ruleId, vector.Labels.String(), datasourceId, vector.Severity, vector.Query))
return str.MD5(fmt.Sprintf("%d_%s_%d_%d", ruleId, vector.Labels.String(), datasourceId, vector.Severity))
}
func TagHash(vector common.AnomalyPoint) string {

View File

@@ -54,16 +54,14 @@ func (rrc *RecordRuleContext) Start() {
if interval <= 0 {
interval = 10
}
ticker := time.NewTicker(time.Duration(interval) * time.Second)
go func() {
defer ticker.Stop()
for {
select {
case <-rrc.quit:
return
case <-ticker.C:
default:
rrc.Eval()
time.Sleep(time.Duration(interval) * time.Second)
}
}
}()

View File

@@ -32,7 +32,7 @@ type DingtalkSender struct {
}
func (ds *DingtalkSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
return
}
@@ -40,7 +40,7 @@ func (ds *DingtalkSender) Send(ctx MessageContext) {
if len(urls) == 0 {
return
}
message := BuildTplMessage(ds.tpl, ctx.Events)
message := BuildTplMessage(ds.tpl, ctx.Event)
for _, url := range urls {
var body dingtalk
@@ -49,7 +49,7 @@ func (ds *DingtalkSender) Send(ctx MessageContext) {
body = dingtalk{
Msgtype: "markdown",
Markdown: dingtalkMarkdown{
Title: ctx.Events[0].RuleName,
Title: ctx.Event.RuleName,
Text: message,
},
}
@@ -57,7 +57,7 @@ func (ds *DingtalkSender) Send(ctx MessageContext) {
body = dingtalk{
Msgtype: "markdown",
Markdown: dingtalkMarkdown{
Title: ctx.Events[0].RuleName,
Title: ctx.Event.RuleName,
Text: message + "\n" + strings.Join(ats, " "),
},
At: dingtalkAt{

View File

@@ -22,18 +22,18 @@ type EmailSender struct {
}
func (es *EmailSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
return
}
tos := extract(ctx.Users)
var subject string
if es.subjectTpl != nil {
subject = BuildTplMessage(es.subjectTpl, []*models.AlertCurEvent{ctx.Events[0]})
subject = BuildTplMessage(es.subjectTpl, ctx.Event)
} else {
subject = ctx.Events[0].RuleName
subject = ctx.Event.RuleName
}
content := BuildTplMessage(es.contentTpl, ctx.Events)
content := BuildTplMessage(es.contentTpl, ctx.Event)
es.WriteEmail(subject, content, tos)
}

View File

@@ -31,11 +31,11 @@ type FeishuSender struct {
}
func (fs *FeishuSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
return
}
urls, ats := fs.extract(ctx.Users)
message := BuildTplMessage(fs.tpl, ctx.Events)
message := BuildTplMessage(fs.tpl, ctx.Event)
for _, url := range urls {
body := feishu{
Msgtype: "text",

View File

@@ -96,11 +96,11 @@ var (
)
func (fs *FeishuCardSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
return
}
urls, _ := fs.extract(ctx.Users)
message := BuildTplMessage(fs.tpl, ctx.Events)
message := BuildTplMessage(fs.tpl, ctx.Event)
color := "red"
lowerUnicode := strings.ToLower(message)
if strings.Count(lowerUnicode, Recovered) > 0 && strings.Count(lowerUnicode, Triggered) > 0 {
@@ -109,7 +109,7 @@ func (fs *FeishuCardSender) Send(ctx MessageContext) {
color = "green"
}
SendTitle := fmt.Sprintf("🔔 %s", ctx.Events[0].RuleName)
SendTitle := fmt.Sprintf("🔔 %s", ctx.Event.RuleName)
body.Card.Header.Title.Content = SendTitle
body.Card.Header.Template = color
body.Card.Elements[0].Text.Content = message

View File

@@ -28,7 +28,7 @@ type MmSender struct {
}
func (ms *MmSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
return
}
@@ -36,7 +36,7 @@ func (ms *MmSender) Send(ctx MessageContext) {
if len(urls) == 0 {
return
}
message := BuildTplMessage(ms.tpl, ctx.Events)
message := BuildTplMessage(ms.tpl, ctx.Event)
SendMM(MatterMostMessage{
Text: message,

View File

@@ -17,9 +17,9 @@ type (
// MessageContext 一个event所生成的告警通知的上下文
MessageContext struct {
Users []*models.User
Rule *models.AlertRule
Events []*models.AlertCurEvent
Users []*models.User
Rule *models.AlertRule
Event *models.AlertCurEvent
}
)
@@ -43,32 +43,23 @@ func NewSender(key string, tpls map[string]*template.Template, smtp aconf.SMTPCo
return nil
}
func BuildMessageContext(rule *models.AlertRule, events []*models.AlertCurEvent, uids []int64, userCache *memsto.UserCacheType) MessageContext {
func BuildMessageContext(rule *models.AlertRule, event *models.AlertCurEvent, uids []int64, userCache *memsto.UserCacheType) MessageContext {
users := userCache.GetByUserIds(uids)
return MessageContext{
Rule: rule,
Events: events,
Users: users,
Rule: rule,
Event: event,
Users: users,
}
}
type BuildTplMessageFunc func(tpl *template.Template, events []*models.AlertCurEvent) string
var BuildTplMessage BuildTplMessageFunc = buildTplMessage
func buildTplMessage(tpl *template.Template, events []*models.AlertCurEvent) string {
func BuildTplMessage(tpl *template.Template, event *models.AlertCurEvent) string {
if tpl == nil {
return "tpl for current sender not found, please check configuration"
}
var content string
for _, event := range events {
var body bytes.Buffer
if err := tpl.Execute(&body, event); err != nil {
return err.Error()
}
content += body.String() + "\n\n"
var body bytes.Buffer
if err := tpl.Execute(&body, event); err != nil {
return err.Error()
}
return content
return body.String()
}

View File

@@ -26,11 +26,11 @@ type TelegramSender struct {
}
func (ts *TelegramSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
return
}
tokens := ts.extract(ctx.Users)
message := BuildTplMessage(ts.tpl, ctx.Events)
message := BuildTplMessage(ts.tpl, ctx.Event)
SendTelegram(TelegramMessage{
Text: message,

View File

@@ -25,11 +25,11 @@ type WecomSender struct {
}
func (ws *WecomSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
return
}
urls := ws.extract(ctx.Users)
message := BuildTplMessage(ws.tpl, ctx.Events)
message := BuildTplMessage(ws.tpl, ctx.Event)
for _, url := range urls {
body := wecom{
Msgtype: "markdown",

View File

@@ -11,7 +11,6 @@ import (
"github.com/ccfos/nightingale/v6/center/metas"
"github.com/ccfos/nightingale/v6/center/sso"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/models/migrate"
@@ -72,20 +71,18 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
alertMuteCache := memsto.NewAlertMuteCache(ctx, syncStats)
alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
notifyConfigCache := memsto.NewNotifyConfigCache(ctx)
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
promClients := prom.NewPromClient(ctx, config.Alert.Heartbeat)
externalProcessors := process.NewExternalProcessors()
alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients, userCache, userGroupCache)
alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients)
writers := writer.NewWriters(config.Pushgw)
httpx.InitRSAConfig(&config.HTTP.RSA)
alertrtRouter := alertrt.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors)
centerRouter := centerrt.New(config.HTTP, config.Center, cconf.Operations, dsCache, notifyConfigCache, promClients, redis, sso, ctx, metas, idents, targetCache, userCache, userGroupCache)
centerRouter := centerrt.New(config.HTTP, config.Center, cconf.Operations, dsCache, notifyConfigCache, promClients, redis, sso, ctx, metas, idents, targetCache)
pushgwRouter := pushgwrt.New(config.HTTP, config.Pushgw, targetCache, busiGroupCache, idents, writers, ctx)
r := httpx.GinEngine(config.Global.RunMode, config.HTTP)
@@ -93,7 +90,6 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
centerRouter.Config(r)
alertrtRouter.Config(r)
pushgwRouter.Config(r)
dumper.ConfigRouter(r)
httpClean := httpx.Init(config.HTTP, r)

View File

@@ -39,14 +39,11 @@ type Router struct {
IdentSet *idents.Set
TargetCache *memsto.TargetCacheType
Sso *sso.SsoClient
UserCache *memsto.UserCacheType
UserGroupCache *memsto.UserGroupCacheType
Ctx *ctx.Context
}
func New(httpConfig httpx.Config, center cconf.Center, operations cconf.Operation, ds *memsto.DatasourceCacheType, ncc *memsto.NotifyConfigCacheType,
pc *prom.PromClientMap, redis storage.Redis, sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set, idents *idents.Set, tc *memsto.TargetCacheType,
uc *memsto.UserCacheType, ugc *memsto.UserGroupCacheType) *Router {
pc *prom.PromClientMap, redis storage.Redis, sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set, idents *idents.Set, tc *memsto.TargetCacheType) *Router {
return &Router{
HTTP: httpConfig,
Center: center,
@@ -59,8 +56,6 @@ func New(httpConfig httpx.Config, center cconf.Center, operations cconf.Operatio
IdentSet: idents,
TargetCache: tc,
Sso: sso,
UserCache: uc,
UserGroupCache: ugc,
Ctx: ctx,
}
}
@@ -106,7 +101,7 @@ func (rt *Router) configNoRoute(r *gin.Engine, fs *http.FileSystem) {
suffix := arr[len(arr)-1]
switch suffix {
case "png", "jpeg", "jpg", "svg", "ico", "gif", "css", "js", "html", "htm", "gz", "zip", "map", "ttf":
case "png", "jpeg", "jpg", "svg", "ico", "gif", "css", "js", "html", "htm", "gz", "zip", "map":
if !rt.Center.UseFileAssets {
c.FileFromFS(c.Request.URL.Path, *fs)
} else {
@@ -268,7 +263,6 @@ func (rt *Router) Config(r *gin.Engine) {
pages.PUT("/busi-group/:id/alert-rules/fields", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.bgrw(), rt.alertRulePutFields)
pages.PUT("/busi-group/:id/alert-rule/:arid", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.alertRulePutByFE)
pages.GET("/alert-rule/:arid", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRuleGet)
pages.PUT("/busi-group/:id/alert-rule/:arid/validate", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.alertRuleValidation)
pages.GET("/busi-group/:id/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules"), rt.recordingRuleGets)
pages.POST("/busi-group/:id/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules/add"), rt.bgrw(), rt.recordingRuleAddByFE)
@@ -365,12 +359,6 @@ func (rt *Router) Config(r *gin.Engine) {
pages.GET("/notify-config", rt.auth(), rt.admin(), rt.notifyConfigGet)
pages.PUT("/notify-config", rt.auth(), rt.admin(), rt.notifyConfigPut)
pages.GET("/es-index-pattern", rt.auth(), rt.esIndexPatternGet)
pages.GET("/es-index-pattern-list", rt.auth(), rt.esIndexPatternGetList)
pages.POST("/es-index-pattern", rt.auth(), rt.admin(), rt.esIndexPatternAdd)
pages.PUT("/es-index-pattern", rt.auth(), rt.admin(), rt.esIndexPatternPut)
pages.DELETE("/es-index-pattern", rt.auth(), rt.admin(), rt.esIndexPatternDel)
}
if rt.HTTP.APIForService.Enable {

View File

@@ -69,11 +69,6 @@ func (rt *Router) alertAggrViewPut(c *gin.Context) {
return
}
}
view.Name = f.Name
view.Rule = f.Rule
view.Cate = f.Cate
if view.CreateBy == 0 {
view.CreateBy = me.Id
}
ginx.NewRender(c).Message(view.Update(rt.Ctx))
ginx.NewRender(c).Message(view.Update(rt.Ctx, f.Name, f.Rule, f.Cate, me.Id))
}

View File

@@ -2,7 +2,6 @@ package router
import (
"net/http"
"strconv"
"strings"
"time"
@@ -272,56 +271,3 @@ func (rt *Router) alertRuleGet(c *gin.Context) {
ginx.NewRender(c).Data(ar, err)
}
// alertRuleValidation pre-validates an alert rule before it is saved.
//
// It binds the submitted rule from the JSON request body, loads the stored
// rule identified by the "arid" URL parameter (responding with 404 when it
// does not exist) and runs rt.bgrwCheck against the stored rule's GroupId
// (presumably a read-write permission check on the business group — confirm).
//
// When the submitted rule names both notify channels and notify groups, each
// channel must have a token on at least one user belonging to those groups.
// Channels for which no user yields a token are reported in the response
// message; otherwise an empty message is rendered.
func (rt *Router) alertRuleValidation(c *gin.Context) {
	var f models.AlertRule // the rule as submitted by the client
	ginx.BindJSON(c, &f)

	arid := ginx.UrlParamInt64(c, "arid")
	ar, err := models.AlertRuleGetById(rt.Ctx, arid)
	ginx.Dangerous(err)

	if ar == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such AlertRule")
		return
	}

	rt.bgrwCheck(c, ar.GroupId)

	// Channel validation only applies when both channels and groups are given.
	if len(f.NotifyChannelsJSON) > 0 && len(f.NotifyGroupsJSON) > 0 {
		// Capacity follows the number of group ids being collected.
		ngids := make([]int64, 0, len(f.NotifyGroupsJSON))
		for _, raw := range f.NotifyGroupsJSON {
			id, perr := strconv.ParseInt(raw, 10, 64)
			if perr != nil {
				// A malformed group id cannot match any user group; skip it
				// instead of looking up id 0.
				continue
			}
			ngids = append(ngids, id)
		}

		userGroups := rt.UserGroupCache.GetByUserGroupIds(ngids)
		uids := make([]int64, 0)
		for i := range userGroups {
			uids = append(uids, userGroups[i].UserIds...)
		}
		users := rt.UserCache.GetByUserIds(uids)

		// A channel is fine as soon as any user has a token for it; the rest
		// are collected as absent notify channels.
		ancs := make([]string, 0, len(f.NotifyChannelsJSON))
		for _, channel := range f.NotifyChannelsJSON {
			covered := false
			for ui := range users {
				if _, ok := users[ui].ExtractToken(channel); ok {
					covered = true
					break
				}
			}
			if !covered {
				ancs = append(ancs, channel)
			}
		}

		if len(ancs) > 0 {
			ginx.NewRender(c).Message(i18n.Sprintf(c.GetHeader("X-Language"), "All users are missing notify channel configurations. Please check for missing tokens (each channel should be configured with at least one user). %s", ancs))
			return
		}
	}

	ginx.NewRender(c).Message("")
}

View File

@@ -99,8 +99,6 @@ func (rt *Router) alertSubscribePut(c *gin.Context) {
"webhooks",
"for_duration",
"redefine_webhooks",
"severities",
"extra_config",
))
}

View File

@@ -91,9 +91,6 @@ func (rt *Router) builtinBoardCateGets(c *gin.Context) {
boardCate.Name = dir
files, err := file.FilesUnder(fp + "/" + dir + "/dashboards")
ginx.Dangerous(err)
if len(files) == 0 {
continue
}
var boards []Payload
for _, f := range files {
@@ -246,9 +243,6 @@ func (rt *Router) builtinAlertRules(c *gin.Context) {
alertCate.Name = dir
files, err := file.FilesUnder(fp + "/" + dir + "/alerts")
ginx.Dangerous(err)
if len(files) == 0 {
continue
}
alertRules := make(map[string][]models.AlertRule)
for _, f := range files {

View File

@@ -105,10 +105,6 @@ func DatasourceCheck(ds models.Datasource) error {
return fmt.Errorf("url is empty")
}
if !strings.HasPrefix(ds.HTTPJson.Url, "http") {
return fmt.Errorf("url must start with http or https")
}
client := &http.Client{
Transport: &http.Transport{
TLSClientConfig: &tls.Config{

View File

@@ -1,80 +0,0 @@
package router
import (
"net/http"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/ginx"
)
// esIndexPatternAdd creates an ES index pattern from the JSON request body.
// The create/update timestamps are set to the current Unix time and the
// create/update authors to the "username" value stored in the gin context.
func (rt *Router) esIndexPatternAdd(c *gin.Context) {
	var pattern models.EsIndexPattern
	ginx.BindJSON(c, &pattern)

	operator := c.MustGet("username").(string)
	ts := time.Now().Unix()

	pattern.CreateAt, pattern.UpdateAt = ts, ts
	pattern.CreateBy, pattern.UpdateBy = operator, operator

	ginx.NewRender(c).Message(pattern.Add(rt.Ctx))
}
// esIndexPatternPut updates the ES index pattern selected by the "id" query
// parameter using the values bound from the JSON request body. It responds
// with 404 when no pattern with that id is found.
func (rt *Router) esIndexPatternPut(c *gin.Context) {
	var changes models.EsIndexPattern
	ginx.BindJSON(c, &changes)

	current, err := models.EsIndexPatternGetById(rt.Ctx, ginx.QueryInt64(c, "id"))
	ginx.Dangerous(err)
	if current == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such EsIndexPattern")
		return
	}

	// Record the requesting user as the last modifier before persisting.
	changes.UpdateBy = c.MustGet("username").(string)
	ginx.NewRender(c).Message(current.Update(rt.Ctx, changes))
}
// esIndexPatternDel deletes the ES index patterns whose ids are listed in the
// JSON request body. An empty id list is rejected through ginx.Bomb with
// http.StatusBadRequest.
func (rt *Router) esIndexPatternDel(c *gin.Context) {
	var req idsForm
	ginx.BindJSON(c, &req)

	if len(req.Ids) < 1 {
		ginx.Bomb(http.StatusBadRequest, "ids empty")
	}

	ginx.NewRender(c).Message(models.EsIndexPatternDel(rt.Ctx, req.Ids))
}
// esIndexPatternGetList renders the list of ES index patterns. When the
// optional "datasource_id" query parameter is non-zero the list is filtered
// by that datasource; otherwise an empty filter is used.
func (rt *Router) esIndexPatternGetList(c *gin.Context) {
	var (
		patterns []*models.EsIndexPattern
		err      error
	)

	switch dsID := ginx.QueryInt64(c, "datasource_id", 0); dsID {
	case 0:
		patterns, err = models.EsIndexPatternGets(rt.Ctx, "")
	default:
		patterns, err = models.EsIndexPatternGets(rt.Ctx, "datasource_id = ?", dsID)
	}

	ginx.NewRender(c).Data(patterns, err)
}
// esIndexPatternGet renders the single ES index pattern selected by the "id"
// query parameter, together with any lookup error.
func (rt *Router) esIndexPatternGet(c *gin.Context) {
	pattern, err := models.EsIndexPatternGet(rt.Ctx, "id=?", ginx.QueryInt64(c, "id"))
	ginx.NewRender(c).Data(pattern, err)
}

View File

@@ -15,7 +15,6 @@ import (
"github.com/gin-gonic/gin"
"github.com/prometheus/common/model"
"github.com/toolkits/pkg/ginx"
"github.com/toolkits/pkg/logger"
)
type queryFormItem struct {
@@ -33,14 +32,10 @@ type batchQueryForm struct {
func (rt *Router) promBatchQueryRange(c *gin.Context) {
var f batchQueryForm
ginx.Dangerous(c.BindJSON(&f))
var lst []model.Value
cli := rt.PromClients.GetCli(f.DatasourceId)
if cli == nil {
logger.Warningf("no such datasource id: %d", f.DatasourceId)
ginx.NewRender(c).Data(lst, nil)
return
}
var lst []model.Value
for _, item := range f.Queries {
r := pkgprom.Range{
@@ -72,14 +67,9 @@ func (rt *Router) promBatchQueryInstant(c *gin.Context) {
var f batchInstantForm
ginx.Dangerous(c.BindJSON(&f))
var lst []model.Value
cli := rt.PromClients.GetCli(f.DatasourceId)
if cli == nil {
logger.Warningf("no such datasource id: %d", f.DatasourceId)
ginx.NewRender(c).Data(lst, nil)
return
}
var lst []model.Value
for _, item := range f.Queries {
resp, _, err := cli.Query(context.Background(), item.Query, time.Unix(item.Time, 0))

View File

@@ -28,12 +28,6 @@ func (rt *Router) serverHeartbeat(c *gin.Context) {
func (rt *Router) serversActive(c *gin.Context) {
datasourceId := ginx.QueryInt64(c, "dsid")
engineName := ginx.QueryStr(c, "engine_name", "")
if engineName != "" {
servers, err := models.AlertingEngineGetsInstances(rt.Ctx, "engine_cluster = ? and clock > ?", engineName, time.Now().Unix()-30)
ginx.NewRender(c).Data(servers, err)
return
}
servers, err := models.AlertingEngineGetsInstances(rt.Ctx, "datasource_id = ? and clock > ?", datasourceId, time.Now().Unix()-30)
ginx.NewRender(c).Data(servers, err)

View File

@@ -47,27 +47,10 @@ func (rt *Router) targetGets(c *gin.Context) {
limit := ginx.QueryInt(c, "limit", 30)
dsIds := queryDatasourceIds(c)
var bgids []int64
var err error
if bgid == -1 {
// 全部对象的情况,找到用户有权限的业务组
user := c.MustGet("user").(*models.User)
userGroupIds, err := models.MyGroupIds(rt.Ctx, user.Id)
ginx.Dangerous(err)
bgids, err = models.BusiGroupIds(rt.Ctx, userGroupIds)
ginx.Dangerous(err)
// 将未分配业务组的对象也加入到列表中
bgids = append(bgids, 0)
} else {
bgids = append(bgids, bgid)
}
total, err := models.TargetTotal(rt.Ctx, bgids, dsIds, query)
total, err := models.TargetTotal(rt.Ctx, bgid, dsIds, query)
ginx.Dangerous(err)
list, err := models.TargetGets(rt.Ctx, bgids, dsIds, query, limit, ginx.Offset(c, limit))
list, err := models.TargetGets(rt.Ctx, bgid, dsIds, query, limit, ginx.Offset(c, limit))
ginx.Dangerous(err)
if err == nil {

View File

@@ -8,7 +8,6 @@ import (
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/alert/process"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/httpx"
@@ -50,20 +49,17 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
alertMuteCache := memsto.NewAlertMuteCache(ctx, syncStats)
alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
notifyConfigCache := memsto.NewNotifyConfigCache(ctx)
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
promClients := prom.NewPromClient(ctx, config.Alert.Heartbeat)
externalProcessors := process.NewExternalProcessors()
alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients, userCache, userGroupCache)
alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients)
alertrtRouter := alertrt.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors)
alertrtRouter.Config(r)
}
dumper.ConfigRouter(r)
httpClean := httpx.Init(config.HTTP, r)
return func() {

View File

@@ -6,7 +6,7 @@ ADD n9e /app/
ADD etc /app/
ADD integrations /app/integrations/
ADD --chmod=755 https://github.com/ufoscout/docker-compose-wait/releases/download/2.11.0/wait_x86_64 /wait
RUN chmod +x /wait && pip install requests
RUN chmod +x /wait
EXPOSE 17000

View File

@@ -281,7 +281,6 @@ CREATE TABLE `alert_rule` (
`runbook_url` varchar(255),
`append_tags` varchar(255) not null default '' comment 'split by space: service=n9e mod=api',
`annotations` text not null comment 'annotations',
`extra_config` text not null comment 'extra_config',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
@@ -306,7 +305,6 @@ CREATE TABLE `alert_mute` (
`disabled` tinyint(1) not null default 0 comment '0:enabled 1:disabled',
`mute_time_type` tinyint(1) not null default 0,
`periodic_mutes` varchar(4096) not null default '',
`severities` varchar(32) not null default '',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
@@ -326,7 +324,6 @@ CREATE TABLE `alert_subscribe` (
`datasource_ids` varchar(255) not null default '' comment 'datasource ids',
`cluster` varchar(128) not null,
`rule_id` bigint not null default 0,
`severities` varchar(32) not null default '',
`tags` varchar(4096) not null default '' comment 'json,map,tagkey->regexp|value',
`redefine_severity` tinyint(1) default 0 comment 'is redefine severity?',
`new_severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
@@ -334,7 +331,6 @@ CREATE TABLE `alert_subscribe` (
`new_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
`user_group_ids` varchar(250) not null comment 'split by space 1 34 5, notify cc to user_group_ids',
`webhooks` text not null,
`extra_config` text not null comment 'extra_config',
`redefine_webhooks` tinyint(1) default 0,
`for_duration` bigint not null default 0,
`create_at` bigint not null default 0,
@@ -618,19 +614,4 @@ CREATE TABLE `sso_config` (
`content` text not null,
PRIMARY KEY (`id`),
UNIQUE KEY (`name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- es_index_pattern: one row per ES index pattern, scoped to a datasource.
-- The (datasource_id, name) pair is unique, so the same pattern name may be
-- reused under different datasources but not twice within one.
-- time_field defaults to '@timestamp'.
CREATE TABLE `es_index_pattern` (
`id` bigint unsigned not null auto_increment,
`datasource_id` bigint not null default 0 comment 'datasource id',
`name` varchar(191) not null,
`time_field` varchar(128) not null default '@timestamp',
`allow_hide_system_indices` tinyint(1) not null default 0,
`fields_format` varchar(4096) not null default '',
`create_at` bigint default '0',
`create_by` varchar(64) default '',
`update_at` bigint default '0',
`update_by` varchar(64) default '',
PRIMARY KEY (`id`),
UNIQUE KEY (`datasource_id`, `name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

View File

@@ -1,8 +0,0 @@
package dumper
import "github.com/gin-gonic/gin"
// ConfigRouter is the package-level entry point for route registration: it
// forwards r to the ConfigRouter method of the package's syncDumper instance.
func ConfigRouter(r *gin.Engine) {
syncDumper.ConfigRouter(r)
}

View File

@@ -1,110 +0,0 @@
package dumper
import (
"fmt"
"strings"
"sync"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/time"
)
// SyncRecord is one entry stored by SyncDumper; String renders all four
// fields on a single line.
type SyncRecord struct {
// Timestamp is formatted via time.Format in String.
// NOTE(review): presumably Unix seconds — confirm against callers.
Timestamp int64
// Mills is printed with an "ms" suffix in String.
Mills int64
// Count is printed as the "count" field in String.
// NOTE(review): presumably the number of items handled — confirm against callers.
Count int
// Message is free-form text appended at the end of the String output.
Message string
}
// String renders the record on one line in the form
// "timestamp: <formatted>, mills: <n>ms, count: <n>, message: <text>".
func (sr *SyncRecord) String() string {
	return "timestamp: " + time.Format(sr.Timestamp) +
		", mills: " + fmt.Sprint(sr.Mills, "ms") +
		", count: " + fmt.Sprint(sr.Count) +
		", message: " + sr.Message
}
// SyncRecords pairs the newest record for a key with the one it replaced.
type SyncRecords struct {
// Current is the most recently stored record for the key.
Current *SyncRecord
// Last is the record that Current replaced; it stays nil until a second
// record has been stored for the same key.
Last *SyncRecord
}
// SyncDumper keeps, per key, the current and previous SyncRecord.
// The embedded RWMutex is taken by Put (write lock) and Sprint (read lock)
// around every access to records.
type SyncDumper struct {
sync.RWMutex
// records maps a key to its current/last record pair.
records map[string]*SyncRecords
}
// NewSyncDumper returns a SyncDumper whose record map is already allocated,
// so Put can be called on it immediately.
func NewSyncDumper() *SyncDumper {
	return &SyncDumper{records: map[string]*SyncRecords{}}
}

// syncDumper is the package-wide instance used by the package-level helpers.
var syncDumper = NewSyncDumper()
// Put stores a new record under key. The first record for a key becomes its
// Current entry; every later one moves the existing Current to Last and takes
// its place. The write lock is held while the map is touched.
func (sd *SyncDumper) Put(key string, timestamp, mills int64, count int, message string) {
	// Build the record before taking the lock to keep the critical section short.
	fresh := &SyncRecord{
		Timestamp: timestamp,
		Mills:     mills,
		Count:     count,
		Message:   message,
	}

	sd.Lock()
	defer sd.Unlock()

	pair, found := sd.records[key]
	if !found {
		sd.records[key] = &SyncRecords{Current: fresh}
		return
	}

	// Rotate: the previous Current becomes Last.
	pair.Last, pair.Current = pair.Current, fresh
}
// Sprint renders every key's records as text under the read lock. The output
// starts with a newline, followed by one stanza per key, e.g.:
//
//	busi_groups:
//	last: timestamp, mills, count
//	curr: timestamp, mills, count
//
// The "last" line is left out for keys that have no previous record. Stanzas
// appear in map iteration order, which Go does not define.
func (sd *SyncDumper) Sprint() string {
	sd.RLock()
	defer sd.RUnlock()

	var out strings.Builder
	out.WriteString("\n")

	for name, pair := range sd.records {
		out.WriteString(name + ":\n")

		if prev := pair.Last; prev != nil {
			out.WriteString("last: " + prev.String() + "\n")
		}

		out.WriteString("curr: " + pair.Current.String() + "\n\n")
	}

	return out.String()
}
// ConfigRouter registers GET /dumper/sync on r. The handler answers with the
// Sprint output when the client IP is a loopback address ("127.0.0.1" or
// "::1") and with 403 "forbidden" for every other client.
//
// NOTE(review): c.ClientIP() may take forwarding headers into account
// depending on the engine's proxy settings — confirm that the loopback check
// cannot be satisfied by a spoofed header.
func (sd *SyncDumper) ConfigRouter(r *gin.Engine) {
	r.GET("/dumper/sync", func(c *gin.Context) {
		switch c.ClientIP() {
		case "127.0.0.1", "::1":
			c.String(200, sd.Sprint())
		default:
			c.String(403, "forbidden")
		}
	})
}
// PutSyncRecord stores a record under key on the package-level syncDumper.
// All arguments are forwarded unchanged to SyncDumper.Put.
func PutSyncRecord(key string, timestamp, mills int64, count int, message string) {
syncDumper.Put(key, timestamp, mills, count, message)
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

Before

Width:  |  Height:  |  Size: 20 KiB

View File

@@ -1,31 +0,0 @@
### Ceph Dashboard & Alerts
开启ceph 默认Prometheus支持
```bash
ceph mgr module enable prometheus
```
### 采集配置
在categraf中的prometheus插件中加入采集配置
```toml
cat /opt/categraf/conf/input.prometheus/prometheus.toml
[[instances]]
urls = [
"http://192.168.11.181:9283/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="ceph",cluster="ceph"}
```
Dashboard:
[dashboard](../dashboards/ceph_by_categraf.json)
![ceph](./ceph.png)
Alerts:
[alerts](../alerts/ceph_by_categraf.json)
![alert](./alerts.png)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 194 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 187 KiB

View File

@@ -1,951 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elastic Cluster Red status",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": " elasticsearch_cluster_health_status{color=\"red\"} == 1",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterRed"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elastic Cluster Yellow status",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_status{color=\"yellow\"} == 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterYellow"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch disk out of space of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskOutOfSpace"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch disk space low of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskSpaceLow"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Heap Usage Too High of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 > 90",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageTooHigh"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Heap Usage warning of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageWarning"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch initializing shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_initializing_shards > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchInitializingShards"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch no new documents of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(elasticsearch_indices_docs{es_data_node=\"true\"}[5m]) == 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchNoNewDocuments"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch pending tasks of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_number_of_pending_tasks > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchPendingTasks"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch relocation shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_relocating_shards > 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchRelocationShards"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch unassigned shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_unassigned_shards > 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchUnassignedShards"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Unhealthy Data Nodes",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_number_of_data_nodes < number_of_data_nodes",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyDataNodes"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Unhealthy Nodes",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": " elasticsearch_cluster_health_number_of_nodes < number_of_nodes",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyNodes"
],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,951 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elastic Cluster Red status",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": " elasticsearch_cluster_health_status{color=\"red\"} == 1",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterRed"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elastic Cluster Yellow status",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_status{color=\"yellow\"} == 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterYellow"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch disk out of space of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskOutOfSpace"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch disk space low of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskSpaceLow"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Heap Usage Too High of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 > 90",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageTooHigh"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Heap Usage warning of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageWarning"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch initializing shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_initializing_shards > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchInitializingShards"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch no new documents of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(elasticsearch_indices_docs{es_data_node=\"true\"}[5m]) == 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchNoNewDocuments"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch pending tasks of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_number_of_pending_tasks > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchPendingTasks"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch relocation shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_relocating_shards > 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchRelocationShards"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch unassigned shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_unassigned_shards > 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchUnassignedShards"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Unhealthy Data Nodes",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_number_of_data_nodes < number_of_data_nodes",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyDataNodes"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Elasticsearch Unhealthy Nodes",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": " elasticsearch_cluster_health_number_of_nodes < number_of_nodes",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyNodes"
],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,702 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "HighgRPCResourceExhaustedRate",
"note": "High gRPC ResourceExhausted error rate",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 3600,
"prom_ql": "sum without (grpc_code, grpc_method, grpc_service, grpc_type) (\n rate(grpc_server_handled_total{grpc_code=\"ResourceExhausted\"}[5m])\n) / sum without (grpc_code, grpc_method, grpc_service, grpc_type) (\n rate(grpc_server_handled_total[5m])\n) * 100 > 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum without (grpc_code, grpc_method, grpc_service, grpc_type) (\n rate(grpc_server_handled_total{grpc_code=\"ResourceExhausted\"}[5m])\n) / sum without (grpc_code, grpc_method, grpc_service, grpc_type) (\n rate(grpc_server_handled_total[5m])\n) * 100 > 1\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "HighPumaUtilization",
"note": "Puma instance {{ $labels.instance }} has more than 90% thread utilization ({{ $value | printf \"%.1f\" }}%) over the last 60 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 3600,
"prom_ql": "instance:puma_utilization:ratio * 100 > 90",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "instance:puma_utilization:ratio * 100 > 90",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "PostgresDatabaseDeadlockCancels",
"note": "Postgres database has queries canceled due to deadlocks",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "increase(pg_stat_database_deadlocks[5m]) > 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(pg_stat_database_deadlocks[5m]) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "PostgresDatabaseDeadlocks",
"note": "Postgres database has deadlocks",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "increase(pg_stat_database_deadlocks[5m]) > 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(pg_stat_database_deadlocks[5m]) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "PostgresDown",
"note": "The Postgres service {{ $labels.job }} instance {{ $labels.instance }} is not responding for more than 50% of the time for 5 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "avg_over_time(pg_up[5m]) * 100 < 50",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(pg_up[5m]) * 100 < 50",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "PumaQueueing",
"note": "Puma instance {{ $labels.instance }} is queueing requests with an average of {{ $value | printf \"%.1f\" }} over the last 30 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "avg_over_time(puma_queued_connections[30m]) > 1",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(puma_queued_connections[30m]) > 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "RedisDown",
"note": "The Redis service {{ $labels.job }} instance {{ $labels.instance }} is not responding for more than 50% of the time for 5 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "avg_over_time(redis_up[5m]) * 100 < 50",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(redis_up[5m]) * 100 < 50",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "ServiceDown",
"note": "The service {{ $labels.job }} instance {{ $labels.instance }} is not responding for more than 50% of the time for 5 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "avg_over_time(up[5m]) * 100 < 50",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(up[5m]) * 100 < 50",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "SidekiqJobsQueuing",
"note": "Sidekiq has jobs queued",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 3600,
"prom_ql": "sum by (name) (sidekiq_queue_size) > 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum by (name) (sidekiq_queue_size) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "WorkhorseHighErrorRate",
"note": "Workhorse has high error rates",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "(\n sum without (job, code) (\n job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~\"5..\"}\n ) /\n sum without (job,code) (\n job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m\n ) < 10\n) * 100 > 50\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(\n sum without (job, code) (\n job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~\"5..\"}\n ) /\n sum without (job,code) (\n job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m\n ) < 10\n) * 100 > 50\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
}
]

View File

@@ -1,712 +0,0 @@
{
"name": "GitLab - Machine Performance",
"tags": "",
"ident": "",
"configs": {
"version": "3.0.0",
"links": [],
"var": [
{
"name": "DS_PROMETHEUS",
"type": "datasource",
"definition": "prometheus",
"defaultValue": 5
},
{
"name": "instance",
"type": "query",
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "query_result(node_boot_time_seconds)",
"reg": "/instance=\"(?<text>[^\"]*)/",
"multi": false
}
],
"panels": [
{
"type": "stat",
"id": "a4389d18-8aed-4207-8fa9-4b25da036d6a",
"layout": {
"h": 3,
"w": 5,
"x": 0,
"y": 0,
"i": "a4389d18-8aed-4207-8fa9-4b25da036d6a",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "B",
"expr": "node_boot_time_seconds{instance=~\"$instance\"} / 60 / 60 /24 / 30 / 365",
"legend": "Uptime"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Uptime",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [],
"standardOptions": {
"util": "none",
"decimals": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
}
]
}
}
},
{
"type": "stat",
"id": "fe9d5645-e775-446e-876b-35852b751961",
"layout": {
"h": 3,
"w": 2,
"x": 5,
"y": 0,
"i": "fe9d5645-e775-446e-876b-35852b751961",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "B",
"expr": "node_boot_time_seconds{instance=~\"$instance\"} / 60 / 60 /24 / 30 / 365",
"legend": "Uptime"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Running",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "service",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [],
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
}
]
}
}
},
{
"type": "barGauge",
"id": "3190bf4b-c7a3-4044-955c-f488b31a200b",
"layout": {
"h": 3,
"w": 5,
"x": 7,
"y": 0,
"i": "3190bf4b-c7a3-4044-955c-f488b31a200b",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "node_timex_sync_status{instance=\"$instance\"}",
"legend": "NTP Enabled"
},
{
"refId": "B",
"expr": "node_timex_offset_seconds{instance=\"$instance\"}",
"legend": "NTP Offset"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "NTP",
"links": [],
"description": "",
"maxPerRow": 4,
"custom": {
"calc": "avg",
"baseColor": "#9470FF",
"serieWidth": 20,
"sortOrder": "desc"
},
"options": {
"valueMappings": [],
"standardOptions": {
"util": "milliseconds"
}
}
},
{
"type": "hexbin",
"id": "f1c435fa-8c9d-40cc-96ab-54889f32a178",
"layout": {
"h": 3,
"w": 12,
"x": 12,
"y": 0,
"i": "f1c435fa-8c9d-40cc-96ab-54889f32a178",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "count(count(node_cpu_seconds_total{instance=~\"$instance\"}) by (cpu))",
"legend": "Core"
},
{
"refId": "B",
"expr": "node_memory_MemTotal_bytes{instance=~\"$instance\"}",
"legend": "Mem"
},
{
"refId": "C",
"expr": "node_memory_SwapTotal_bytes{instance=~\"$instance\"}",
"legend": "Swap"
},
{
"refId": "D",
"expr": "sum(node_filesystem_size_bytes{instance=~\"$instance\"}) ",
"legend": "Disk"
},
{
"refId": "E",
"expr": "node_filefd_allocated{instance=~\"$instance\"}",
"legend": "Openfiles"
},
{
"refId": "F",
"expr": "rate(node_context_switches_total{instance=~\"$instance\"}[1m])",
"legend": "ContextSwitch"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "System Info",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"calc": "lastNotNull",
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"reverseColorOrder": false,
"colorDomainAuto": true
},
"options": {
"standardOptions": {}
}
},
{
"type": "timeseries",
"id": "d0add8e0-c815-4442-8a3b-d86d3bc3d8fd",
"layout": {
"h": 7,
"w": 9,
"x": 0,
"y": 3,
"i": "d0add8e0-c815-4442-8a3b-d86d3bc3d8fd",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "max(node_load1{instance=~\"$instance\"}) / max(count(node_cpu_seconds_total{instance=~\"$instance\", mode=~\"system\"}))",
"legend": "1m"
},
{
"refId": "B",
"expr": "max(node_load5{instance=~\"$instance\"}) / max(count(node_cpu_seconds_total{instance=~\"$instance\", mode=~\"system\"}))",
"legend": "5m"
},
{
"refId": "C",
"expr": "max(node_load15{instance=~\"$instance\"}) / max(count(node_cpu_seconds_total{instance=~\"$instance\", mode=~\"system\"}))",
"legend": "15m"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Load Average",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "83f3134f-1fdb-4bd6-bc2e-7d32222c2660",
"layout": {
"h": 7,
"w": 15,
"x": 9,
"y": 3,
"i": "83f3134f-1fdb-4bd6-bc2e-7d32222c2660",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "1 - (avg(irate(node_cpu_seconds_total{instance=~\"$instance\",mode=~\"idle\"}[2m])))",
"legend": "CPU"
},
{
"refId": "B",
"expr": "(node_memory_MemTotal_bytes{instance=~\"$instance\"} - (node_memory_MemFree_bytes{instance=~\"$instance\"} + node_memory_Buffers_bytes{instance=~\"$instance\"} + node_memory_Cached_bytes{instance=~\"$instance\"})) / node_memory_MemTotal_bytes{instance=~\"$instance\"}",
"legend": "MEM"
},
{
"refId": "C",
"expr": "avg(irate(node_cpu_seconds_total{instance=~\"$instance\",mode=~\"iowait\"}[2m]))",
"legend": "IOWAIT"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Resource Usage",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "2a57ae69-c0e1-447c-825d-bcee2bcb6d3f",
"layout": {
"h": 5,
"w": 24,
"x": 0,
"y": 10,
"i": "2a57ae69-c0e1-447c-825d-bcee2bcb6d3f",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "node_memory_MemAvailable_bytes{instance=~\"$instance\"}",
"legend": "Available"
},
{
"refId": "B",
"expr": "node_memory_Buffers_bytes{instance=~\"$instance\"} + node_memory_Cached_bytes{instance=~\"$instance\"}",
"legend": "Buffers/Cached"
},
{
"refId": "C",
"expr": "node_memory_MemTotal_bytes{instance=~\"$instance\"} - node_memory_MemAvailable_bytes{instance=~\"$instance\"}",
"legend": "Used"
},
{
"refId": "D",
"expr": "node_memory_MemFree_bytes{instance=~\"$instance\"}",
"legend": "Free"
},
{
"refId": "E",
"expr": "node_memory_MemTotal_bytes{instance=~\"$instance\"}",
"legend": "Total"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Memory Usage",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "f291673f-3b9d-4b4f-a2b1-2ed7131e57f0",
"layout": {
"h": 9,
"w": 15,
"x": 0,
"y": 15,
"i": "f291673f-3b9d-4b4f-a2b1-2ed7131e57f0",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum by (instance) (irate(node_disk_reads_completed_total{instance=~\"$instance\"}[1m]))",
"legend": "{{instance}}-Reads"
},
{
"refId": "B",
"expr": "sum by (instance) (irate(node_disk_writes_completed_total{instance=~\"$instance\"}[1m]))",
"legend": "{{instance}}-Writes"
},
{
"refId": "C",
"expr": "sum by (instance) (node_disk_io_now{instance=~\"$instance\"})",
"legend": "{{instance}}-CurrentIO"
},
{
"refId": "D",
"expr": "sum by (instance) (irate(node_disk_read_time_seconds_total{instance=~\"$instance\"}[5m])) / sum by (instance) (irate(node_disk_reads_completed_total{instance=~\"$instance\"}[5m]))",
"legend": "{{instance}}-Read-consuming"
},
{
"refId": "E",
"expr": "sum by (instance) (irate(node_disk_write_time_seconds_total{instance=~\"$instance\"}[5m])) / sum by (instance) (irate(node_disk_writes_completed_total{instance=~\"$instance\"}[5m]))",
"legend": "{{instance}}-Write-consuming"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Disk IO",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "8cb012a2-113c-4d06-8bea-da7d14a870d2",
"layout": {
"h": 9,
"w": 9,
"x": 15,
"y": 15,
"i": "8cb012a2-113c-4d06-8bea-da7d14a870d2",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "node_filesystem_avail_bytes{instance=~\"$instance\",fstype=~\"ext4|xfs|nfs\"}",
"legend": "{{device}}",
"instant": false
},
{
"refId": "B",
"expr": "1-(node_filesystem_free_bytes{instance=~\"$instance\",fstype=~\"ext4|xfs|nfs\"} / node_filesystem_size_bytes{instance=~\"$instance\",fstype=~\"ext4|xfs|nfs\"})",
"legend": "{{device}}"
},
{
"refId": "C",
"expr": "sum(node_filesystem_size_bytes{instance=~\"$instance\",fstype=~\"ext4|xfs|nfs\"})",
"legend": "{{device}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Disk Usage",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "2f11831d-6bdd-4cfa-9b3f-3bc5b1722185",
"layout": {
"h": 10,
"w": 24,
"x": 0,
"y": 24,
"i": "2f11831d-6bdd-4cfa-9b3f-3bc5b1722185",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum by (instance) (irate(node_network_receive_bytes_total{instance=~\"$instance\",device!~\"tap.*|veth.*|br.*|docker.*|virbr*|lo*|nointernet*\"}[5m]))",
"legend": "{{instance}}-{{device}}-receive"
},
{
"refId": "B",
"expr": "sum by (instance) (irate(node_network_transmit_bytes_total{instance=~\"$instance\",device!~\"tap.*|veth.*|br.*|docker.*|virbr*|lo*|nointernet*\"}[5m]))",
"legend": "{{instance}}-{{device}}-transmit"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Network Traffic",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
}
]
}
}

View File

@@ -1,501 +0,0 @@
{
"name": "GitLab - NGINX VTS",
"tags": "",
"ident": "",
"configs": {
"version": "3.0.0",
"links": [],
"var": [
{
"name": "DS_PROMETHEUS",
"type": "datasource",
"definition": "prometheus",
"defaultValue": 5
},
{
"name": "instance",
"type": "query",
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(up{job=\"nginx\"}, instance)",
"reg": "",
"multi": true,
"allOption": true
}
],
"panels": [
{
"type": "timeseries",
"id": "ad321a25-c895-4cf0-b5ce-2a60c7f035ac",
"layout": {
"h": 4,
"w": 7,
"x": 0,
"y": 0,
"i": "ad321a25-c895-4cf0-b5ce-2a60c7f035ac"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "B",
"expr": "topk(1, count by (hostname) (nginx_vts_info{instance=~\"$instance\"}))",
"legend": "hostname: {{hostname}}"
},
{
"refId": "C",
"expr": "topk(1, count by (version) (nginx_vts_info{instance=~\"$instance\"}))",
"legend": "nginx: {{version}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Nginx Info",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "f6643654-98f5-4d4a-8dd2-943a2f4f8e68",
"layout": {
"h": 12,
"w": 17,
"x": 7,
"y": 0,
"i": "f6643654-98f5-4d4a-8dd2-943a2f4f8e68"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum by (instance) (\n rate(nginx_vts_upstream_request_seconds_total{instance=~\"$instance\"}[5m])\n) /\nsum by (instance) (\n rate(nginx_vts_upstream_requests_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{instance}}",
"step": 15,
"instant": false,
"time": {
"start": "now-6h",
"end": "now"
}
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Latency",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "seconds",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "cb24fbce-8bb5-425e-89f3-c32e22532b5d",
"layout": {
"h": 8,
"w": 7,
"x": 0,
"y": 4,
"i": "cb24fbce-8bb5-425e-89f3-c32e22532b5d"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum by (shared) (nginx_vts_main_shm_usage_bytes{instance=~\"$instance\", job=~\"nginx\"})",
"legend": "{{shared}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Shared Memory Usage",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "54c5264a-a596-486f-a4a8-4ab451d91870",
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 13,
"i": "54c5264a-a596-486f-a4a8-4ab451d91870"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum without (instance) (\n rate(nginx_vts_server_bytes_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{direction}}-{{host}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Server - Current total of incoming / outgoing bytes",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.2,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "d0efa52f-862e-49b0-aa8c-7a9ba4f2af0b",
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 13,
"i": "d0efa52f-862e-49b0-aa8c-7a9ba4f2af0b"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum without (instance) (\n rate(nginx_vts_upstream_bytes_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{direction}}-{{backend}}-{{upstream}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Upstream - Current total of incoming / outgoing bytes",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "712b82cd-633a-4088-8695-fa2d75ab37ca",
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 22,
"i": "712b82cd-633a-4088-8695-fa2d75ab37ca"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum without (instance) (\n rate(nginx_vts_server_requests_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{code}} {{host}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Server - Requests by HTTP code",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.2,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "09ca96b6-4e23-4db2-bc14-79c5f965d16f",
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 22,
"i": "09ca96b6-4e23-4db2-bc14-79c5f965d16f"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum without (instance) (\n rate(nginx_vts_upstream_requests_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{code}} {{backend}} {{upstream}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Upstream - Requests by HTTP code",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.2,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
}
]
}
}

View File

@@ -1,517 +0,0 @@
{
"name": "GitLab - Overview",
"tags": "",
"ident": "",
"configs": {
"version": "3.0.0",
"links": [
{
"title": "GitLab Docs",
"url": "https://docs.gitlab.com/"
}
],
"var": [
{
"name": "DS_PROMETHEUS",
"type": "datasource",
"definition": "prometheus",
"defaultValue": 5
}
],
"panels": [
{
"type": "text",
"id": "ffcfeb75-3a21-40b1-8fe7-313aa3e5f4e3",
"layout": {
"h": 3,
"w": 24,
"x": 0,
"y": 0,
"i": "ffcfeb75-3a21-40b1-8fe7-313aa3e5f4e3",
"isResizable": true
},
"version": "3.0.0",
"name": "",
"links": [],
"maxPerRow": 4,
"custom": {
"textColor": "#000000",
"bgColor": "#FFFFFF",
"textSize": 12,
"justifyContent": "center",
"alignItems": "center",
"content": "<br><h1><i><font color=#5991A7><b>GitLab Service Status</b></font></i></h1>\n"
}
},
{
"type": "text",
"id": "9991440f-1e01-4807-8911-2619329af244",
"layout": {
"h": 3,
"w": 2,
"x": 0,
"y": 3,
"i": "9991440f-1e01-4807-8911-2619329af244",
"isResizable": true
},
"version": "3.0.0",
"name": "Gitlab",
"links": [],
"maxPerRow": 4,
"custom": {
"textColor": "#000000",
"bgColor": "#FFFFFF",
"textSize": 12,
"justifyContent": "center",
"alignItems": "center",
"content": "<p style=\"text-align:center;\"><img src=\"https://www.cloudfoundry.org/wp-content/uploads/2017/10/icon_gitlab_cf@2x.png\" width=80px/></p>"
}
},
{
"type": "stat",
"id": "f3fcab1f-d198-48c6-bf1d-44e481a9fa7a",
"layout": {
"h": 3,
"w": 5,
"x": 2,
"y": 3,
"i": "f3fcab1f-d198-48c6-bf1d-44e481a9fa7a",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "topk(1, count by (version) (gitlab_build_info{job=~\"gitlab-workhorse\"}))",
"legend": ""
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "GitLab Version",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "first",
"valueField": "version",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
}
},
{
"type": "hexbin",
"id": "7a4c7be6-56de-4c76-8c2f-cef5a80e84b5",
"layout": {
"h": 3,
"w": 17,
"x": 7,
"y": 3,
"i": "7a4c7be6-56de-4c76-8c2f-cef5a80e84b5",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(up{job!~\"gitlab_exporter.*|node\",instance!~\".*:9168\",service=~\"gitlab\"}) by (job) / count(up{job!~\"gitlab_exporter.*|node\",instance!~\".*:9168\",service=~\"gitlab\"}) by (job) * 100",
"legend": "{{job}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Service Status",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"calc": "lastNotNull",
"colorRange": [
"#f0ee6e",
"#6ba261",
"#306d52"
],
"reverseColorOrder": false,
"colorDomainAuto": true
},
"options": {
"standardOptions": {}
}
},
{
"version": "3.0.0",
"id": "9efa19b0-18fd-4f4d-abee-cebe09a36803",
"type": "text",
"links": [],
"layout": {
"h": 2,
"w": 24,
"x": 0,
"y": 6,
"i": "9efa19b0-18fd-4f4d-abee-cebe09a36803",
"isResizable": true
},
"targets": [],
"options": {
"thresholds": {
"style": "line",
"steps": []
},
"standardOptions": {
"util": "none"
},
"legend": {
"displayMode": "list"
},
"tooltip": {
"mode": "multi"
}
},
"custom": {
"version": "3.0.0",
"content": "<br><h1><i><font color=#5991A7><b>Sidekiq Statistics</b></font></i></h1>\n"
},
"maxPerRow": 4
},
{
"type": "hexbin",
"id": "fd0e96e0-7577-4475-8287-143d9d7e0bc7",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 8,
"i": "fd0e96e0-7577-4475-8287-143d9d7e0bc7",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "max(sidekiq_jobs_processed_total)",
"legend": "Processed"
},
{
"refId": "B",
"expr": "max(sidekiq_jobs_failed_total)",
"legend": "Failed"
},
{
"refId": "C",
"expr": "max(sidekiq_jobs_enqueued_size)",
"legend": "Enqueued"
},
{
"refId": "D",
"expr": "max(sidekiq_jobs_scheduled_size)",
"legend": "Scheduled"
},
{
"refId": "E",
"expr": "max(sidekiq_jobs_retry_size)",
"legend": "Retry"
},
{
"refId": "F",
"expr": "max(sidekiq_jobs_dead_size)",
"legend": "Dead"
},
{
"refId": "G",
"expr": "max(sidekiq_processes_size)",
"legend": "Processes"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Background Jobs",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"calc": "lastNotNull",
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"reverseColorOrder": false,
"colorDomainAuto": true
},
"options": {
"standardOptions": {}
}
},
{
"type": "timeseries",
"id": "1645b734-ef79-4ff5-8be4-a24747cf4e6e",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 8,
"i": "1645b734-ef79-4ff5-8be4-a24747cf4e6e",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "B",
"expr": "topk(10, avg_over_time(sidekiq_queue_size{}[5m]))",
"legend": "{{name}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Queue Size Top 10",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"version": "3.0.0",
"id": "e25e6cfb-020b-4743-8de9-ef6240144a94",
"type": "text",
"links": [],
"layout": {
"h": 2,
"w": 24,
"x": 0,
"y": 12,
"i": "e25e6cfb-020b-4743-8de9-ef6240144a94",
"isResizable": true
},
"targets": [],
"options": {
"thresholds": {
"style": "line",
"steps": []
},
"standardOptions": {
"util": "none"
},
"legend": {
"displayMode": "list"
},
"tooltip": {
"mode": "multi"
}
},
"custom": {
"version": "3.0.0",
"content": "<br><h1><i><font color=#5991A7><b>Workhorse Statistics</b></font></i></h1>"
},
"maxPerRow": 4
},
{
"type": "timeseries",
"id": "dd030993-481d-43e4-887a-3c0dee279ea0",
"layout": {
"h": 9,
"w": 12,
"x": 0,
"y": 14,
"i": "dd030993-481d-43e4-887a-3c0dee279ea0",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "B",
"expr": "sum by (code) (\n rate(gitlab_workhorse_http_requests_total{instance=~\".*:9229\"}[5m])\n)",
"legend": "{{code}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Http Request Total",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "77343eca-60d3-4be9-b80d-6db471399339",
"layout": {
"h": 9,
"w": 12,
"x": 12,
"y": 14,
"i": "77343eca-60d3-4be9-b80d-6db471399339",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "B",
"expr": "sum by (le) (\n rate(gitlab_workhorse_http_request_duration_seconds_bucket{instance=~\".*:9229\"}[5m])\n)",
"legend": "{{le}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Workhorse Latency",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
}
]
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,739 +0,0 @@
{
"name": "GitLab - Redis",
"tags": "",
"ident": "",
"configs": {
"version": "3.0.0",
"links": [],
"var": [
{
"name": "DS_PROMETHEUS",
"type": "datasource",
"definition": "prometheus",
"defaultValue": 5
},
{
"name": "instance",
"type": "query",
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(up{job=\"redis\"}, instance)",
"reg": "",
"multi": true,
"allOption": true
}
],
"panels": [
{
"type": "stat",
"id": "4ff9943b-6edd-4b6d-960c-0b992b8e1fd3",
"layout": {
"h": 3,
"w": 4,
"x": 0,
"y": 0,
"i": "4ff9943b-6edd-4b6d-960c-0b992b8e1fd3"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "avg(time() - redis_start_time_seconds{instance=~\"$instance\"})",
"legend": ""
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Uptime",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"standardOptions": {
"util": "seconds",
"decimals": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
}
]
}
}
},
{
"type": "stat",
"id": "38a6c015-4dae-4765-8f5f-9961ca302e18",
"layout": {
"h": 3,
"w": 4,
"x": 4,
"y": 0,
"i": "38a6c015-4dae-4765-8f5f-9961ca302e18"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(\n avg_over_time(redis_connected_clients{instance=~\"$instance\"}[5m])\n)",
"legend": "Clients"
},
{
"refId": "B",
"expr": "sum(\n avg_over_time(redis_connected_slaves{instance=~\"$instance\"}[5m])\n)",
"legend": "Slaves"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Clients",
"links": [],
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "avg",
"valueField": "Value",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"standardOptions": {
"util": "none",
"decimals": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
}
},
{
"type": "timeseries",
"id": "0739f047-3fcd-4bf0-ac63-09c0a4170452",
"layout": {
"h": 6,
"w": 8,
"x": 8,
"y": 0,
"i": "0739f047-3fcd-4bf0-ac63-09c0a4170452"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(\n rate(redis_commands_processed_total{instance=~\"$instance\"}[5m])\n)",
"legend": ""
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Commands Executed",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "7b23fa98-74eb-49b5-a29b-cb68fbd6d4d9",
"layout": {
"h": 6,
"w": 8,
"x": 16,
"y": 0,
"i": "7b23fa98-74eb-49b5-a29b-cb68fbd6d4d9"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(\n rate(redis_keyspace_hits_total{instance=~\"$instance\"}[5m])\n)",
"legend": "hits"
},
{
"refId": "B",
"expr": "sum(\n rate(redis_keyspace_misses_total{instance=~\"$instance\"}[5m])\n)",
"legend": "misses"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Hits, Misses per Second",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "8a29cc91-697c-48d7-a613-14e81aeb6772",
"layout": {
"h": 10,
"w": 8,
"x": 0,
"y": 3,
"i": "8a29cc91-697c-48d7-a613-14e81aeb6772"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "redis_memory_used_bytes{instance=~\"$instance\"}",
"legend": "used - {{instance}}"
},
{
"refId": "B",
"expr": "redis_config_maxmemory{instance=~\"$instance\"} > 0",
"legend": "max - {{instance}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Memory Usage",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesIEC",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "34e0f67f-acf1-49bb-b13c-ceebd0b17a7e",
"layout": {
"h": 7,
"w": 8,
"x": 8,
"y": 6,
"i": "34e0f67f-acf1-49bb-b13c-ceebd0b17a7e"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(rate(redis_expired_keys_total{instance=~\"$instance\"}[5m]))",
"legend": "expired"
},
{
"refId": "B",
"expr": "sum(rate(redis_evicted_keys_total{instance=~\"$instance\"}[5m]))",
"legend": "evicted"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Expired / Evicted",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "634f53c0-31d8-4a9f-9cfc-a7c241797359",
"layout": {
"h": 7,
"w": 8,
"x": 16,
"y": 6,
"i": "634f53c0-31d8-4a9f-9cfc-a7c241797359"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(\n rate(redis_net_input_bytes_total{instance=~\"$instance\"}[5m])\n)",
"legend": "In"
},
{
"refId": "B",
"expr": "sum(\n rate(redis_net_output_bytes_total{instance=~\"$instance\"}[5m])\n)",
"legend": "Out"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Network I/O",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.1,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "9267bb6a-edc7-4088-913e-5902e81cf736",
"layout": {
"h": 7,
"w": 16,
"x": 0,
"y": 13,
"i": "9267bb6a-edc7-4088-913e-5902e81cf736"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum without (instance) (\n rate(redis_commands_total{instance=~\"$instance\"}[5m])\n) > 0",
"legend": "{{ cmd }}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Command Calls / sec",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.8,
"gradientMode": "none",
"stack": "normal",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "51d28f91-c2f3-4f22-859e-bdc88f25ead2",
"layout": {
"h": 14,
"w": 8,
"x": 16,
"y": 13,
"i": "51d28f91-c2f3-4f22-859e-bdc88f25ead2"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum(redis_db_keys{instance=~\"$instance\"} - redis_db_keys_expiring{instance=~\"$instance\"}) ",
"legend": "not expiring"
},
{
"refId": "B",
"expr": "sum(redis_db_keys_expiring{instance=~\"$instance\"})",
"legend": "expiring"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Expiring vs Not-Expiring Keys",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.7,
"gradientMode": "none",
"stack": "normal",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "1c0cd7d9-5899-49c0-8939-a361b76f60ac",
"layout": {
"h": 7,
"w": 16,
"x": 0,
"y": 20,
"i": "1c0cd7d9-5899-49c0-8939-a361b76f60ac"
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"refId": "A",
"expr": "sum by (db) (\n redis_db_keys{instance=~\"$instance\"}\n)",
"legend": "{{ db }} "
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Items per DB",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"util": "none",
"min": 0
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.7,
"gradientMode": "none",
"stack": "normal",
"scaleDistribution": {
"type": "linear"
}
}
}
]
}
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 263 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 203 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 141 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 264 KiB

View File

@@ -1,127 +0,0 @@
### Gitlab Dashboard & Alerts
使用[categraf](https://github.com/flashcatcloud/categraf)中[inputs.prometheus](https://github.com/flashcatcloud/categraf/tree/main/inputs/prometheus)插件采集[Gitlab](https://docs.gitlab.com/)服务组件暴露的指标数据:
开启Gitlab默认Prometheus支持:
[Monitoring GitLab with Prometheus](https://docs.gitlab.com/ee/administration/monitoring/prometheus/)
### 采集配置
在 categraf 的 prometheus 插件配置文件 `/opt/categraf/conf/input.prometheus/prometheus.toml` 中加入如下采集配置:
```toml
# /opt/categraf/conf/input.prometheus/prometheus.toml
# # collect interval
# interval = 15
[[instances]]
urls = [
"http://192.168.11.77:9236/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="gitaly"}
[[instances]]
urls = [
"http://192.168.11.77:9168/sidekiq"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="gitlab_exporter_sidekiq"}
[[instances]]
urls = [
"http://192.168.11.77:9168/database"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="gitlab_exporter_database"}
[[instances]]
urls = [
"http://192.168.11.77:8082/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="gitlab-sidekiq"}
[[instances]]
urls = [
"http://192.168.11.77:8082/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="gitlab-sidekiq"}
[[instances]]
urls = [
"http://192.168.11.77:9229/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="gitlab-workhorse"}
[[instances]]
urls = [
"http://192.168.11.77:9100/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="node"}
[[instances]]
urls = [
"http://192.168.11.77:9187/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="postgres"}
[[instances]]
urls = [
"http://192.168.11.77:9121/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="redis"}
[[instances]]
urls = [
"http://192.168.11.77:9999/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="gitlab",job="nginx"}
```
Dashboards:
[MachinePerformance](../dashboards/MachinePerformance.json)
![MachinePerformance](./MachinePerformance.png)
[NGINXVTS](../dashboards/NGINXVTS.json)
![NGINXVTS](./NGINXVTS.png)
[Overview](../dashboards/Overview.json)
![Overview](./Overview.png)
[PostgreSQL](../dashboards/PostgreSQL.json)
![PostgreSQL](./PostgreSQL.png)
[Redis](../dashboards/Redis.json)
![Redis](./Redis.png)
Alerts:
[alerts](../alerts/gitlab_by_categraf.json)
![alert](./alerts.png)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 312 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 144 KiB

View File

@@ -1,73 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "http detect failed",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "http_response_result_code != 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,148 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Insufficient consumption ability - delay exceeds 5 minutes - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kafka_consumer_lag_millis / 1000 > 300",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
52
],
"name": "Risk of data loss - number of replicas less than 3 - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka"
],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,860 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "A certain disk is unable to read/write normally",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(node_filesystem_device_error{mountpoint!~\"/var/lib/.*\",mountpoint!~\"/run.*\"}) > 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Disk needs to be cleaned - utilization has reached 92% - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(100 - ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes) ) > 92 ",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Insufficient computational resources - average load per core of the machine is greater than 10 - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg (node_load1) by (instance)/count(count(node_cpu_seconds_total) by (cpu,instance)) by (instance) >10",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Insufficient file handles - usage exceeds 90% - node exporter",
"note": "You can increase the file handle limit or expand the capacity",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(node_filefd_allocated/node_filefd_maximum*100) > 90",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Insufficient inode resources - usage exceeds 90% - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(100 - ((node_filesystem_files_free * 100) / node_filesystem_files))>90",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Insufficient memory resources - utilization is greater than 75% - node exporter",
"note": "Expansion or upgrading of configuration is required",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(node_memory_MemTotal_bytes - node_memory_MemFree_bytes - (node_memory_Cached_bytes + node_memory_Buffers_bytes))/node_memory_MemTotal_bytes*100 > 75",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Insufficient memory resources - utilization is greater than 95% - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(node_memory_MemTotal_bytes - node_memory_MemFree_bytes - (node_memory_Cached_bytes + node_memory_Buffers_bytes))/node_memory_MemTotal_bytes*100 > 95",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Packet loss in the inbound direction of the network card - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(node_network_receive_drop_total{device=~\"e.*\"}[1m]) > 3",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Packet loss in the outbound direction of the network card - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(node_network_transmit_drop_total{device=~\"e.*\"}[1m]) > 3",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "System conntrack needs to be adjusted - usage exceeds 80% - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "node_nf_conntrack_entries / node_nf_conntrack_entries_limit*100 > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "System experiences OOM - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(node_vmstat_oom_kill[1m]) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Too many running processes - exceeding 3000 - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "node_procs_running > 3000",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,979 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Address is unreachable by PING, please pay attention - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ping_result_code != 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Hard disk - expected to be written full in 4 hours - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "predict_linear(disk_free[1h], 4*3600) < 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Hard disk - IO is very busy - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(diskio_io_time[1m])/10 > 99",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "host",
"datasource_ids": null,
"name": "Lost connection with monitoring target - telegraf",
"note": "",
"prod": "host",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"key": "all_hosts",
"op": "==",
"values": []
}
],
"triggers": [
{
"duration": 60,
"severity": 2,
"type": "target_miss"
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {},
"notify_aggregation": {}
}
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Machine load - high CPU, please pay attention - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "cpu_usage_idle{cpu=\"cpu-total\"} < 25",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Machine load - high memory, please pay attention - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mem_available_percent < 25",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Network connection - number of TIME_WAIT exceeds 20,000 - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "netstat_tcp_time_wait > 20000",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "packet loss in the inbound direction - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(net_drop_in[1m]) > 0",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "packet loss in the outbound direction - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(net_drop_out[1m]) > 0",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Port detection failed, please pay attention - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "net_response_result_code != 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "process handle limit is too small - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_rlimit_num_fds_soft < 2048",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Process monitoring - lookup failure - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_lookup_result_code != 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "there are processes with 0 count, a certain process may have crashed - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_lookup_running == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,73 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "NTP time offset is too large",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ntp_offset_ms > 1000 or ntp_offset_ms < -1000",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,142 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "DisksOffline",
"note": "Disks down in MinIO deployment",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "avg_over_time(minio_cluster_disk_offline_total{}[5m]) > 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(minio_cluster_disk_offline_total{}[5m]) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "NodesOffline",
"note": "Node down in MinIO deployment",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "avg_over_time(minio_cluster_nodes_offline_total{}[5m]) > 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(minio_cluster_nodes_offline_total{}[5m]) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null
}
]

File diff suppressed because it is too large Load Diff

Binary file not shown.

Before

Width:  |  Height:  |  Size: 12 KiB

View File

@@ -1,31 +0,0 @@
### MinIO Dashboard & Alerts
使用[categraf](https://github.com/flashcatcloud/categraf)中[inputs.prometheus](https://github.com/flashcatcloud/categraf/tree/main/inputs/prometheus)插件采集[MinIO](https://min.io/docs/minio/linux/operations/monitoring/collect-minio-metrics-using-prometheus.html?ref=docs-redirect#minio-metrics-collect-using-prometheus)服务指标数据:
开启 MinIO Prometheus访问
```bash
# 启动MinIO服务的时候加入下面的变量
MINIO_PROMETHEUS_AUTH_TYPE=public
```
### 采集配置
在categraf中的prometheus插件中加入采集配置
```toml
# /opt/categraf/conf/input.prometheus/prometheus.toml
[[instances]]
urls = [
"http://192.168.1.188:9000/minio/v2/metrics/cluster"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {job="minio-cluster"}
```
[Dashboard](../dashboards/minio_by_categraf.json)
![MinIO](./minio.png)
[Alerts](../alerts/minio_by_categraf.json)
![alert](./alerts.png)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 49 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 158 KiB

View File

@@ -1,586 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Average MongoDB operation time exceeds 250 milliseconds - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(mongodb_mongod_op_latencies_latency_total[5m]) / rate(mongodb_mongod_op_latencies_ops_total[5m]) > 250000",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoOperationHighLatency"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Mongo connection number has exceeded 80% - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mongodb_connections{state=\"current\"}) / avg by (instance) (mongodb_connections{state=\"available\"}) * 100 > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoTooManyConnections(>80%)"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Mongo has encountered an Assert error - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 1800,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(mongodb_asserts_total{type=~\"regular|message\"}[5m]) > 0",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoAssertsDetected"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Mongo has encountered cursor timeout - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 1800,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(mongodb_mongod_metrics_cursor_timed_out_total[5m]) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoRecurrentCursorTimeout"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Mongo has encountered page fault interrupt - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 1800,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(mongodb_extra_info_page_faults_total[5m]) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoRecurrentMemoryPageFaults"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Mongo has just restarted, please pay attention - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mongodb_instance_uptime_seconds < 60",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoRestarted"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Mongo instance has crashed - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mongodb_up == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoServerDown"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Mongo replica set master-slave delay exceeds 30 seconds - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mongodb_mongod_replset_member_replication_lag > 30",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoSlaveReplicationLag(>30s)"
],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,732 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "A slow query has occurred in Mysql within the last minute - categraf",
"note": "MySQL server mysql has some new slow query",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(mysql_global_status_slow_queries[1m]) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlowQueries"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "More than 60% of the connections in Mysql are in a running state - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mysql_global_status_threads_running) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlHighThreadsRunning"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Mysql has just restarted. Please be advised - categraf",
"note": "MySQL has just been restarted, less than one minute ago",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_global_status_uptime < 60",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlRestarted"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Mysql has opened a large number of file handles. Please be aware - categraf",
"note": "More than 80% of MySQL files open",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mysql_global_status_open_files) / avg by (instance)(mysql_global_variables_open_files_limit) * 100 > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlHighOpenFiles"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Mysql instance has crashed - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_up == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlDown"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "MysqlInnodbLogWaits - categraf",
"note": "MySQL innodb log writes stalling",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(mysql_global_status_innodb_log_waits[15m]) > 10",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlInnodbLogWaits"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "MysqlSlaveIoThreadNotRunning - categraf",
"note": "MySQL Slave IO thread not running",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlaveIoThreadNotRunning"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "MysqlSlaveReplicationLag - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 30",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlaveReplicationLag"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "MysqlSlaveSqlThreadNotRunning - categraf",
"note": "MySQL Slave SQL thread not running",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlaveSqlThreadNotRunning"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "The number of connections in Mysql has exceeded 80% - categraf",
"note": "More than 80% of MySQL connections are in use",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mysql_global_status_threads_connected) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlTooManyConnections"
],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,732 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "More than 60% of the connections in MySQL are in a running state",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mysql_global_status_threads_running) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlHighThreadsRunning"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "More than 80% of MySQL files open",
"note": "More than 80% of MySQL files open",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mysql_global_status_innodb_num_open_files) / avg by (instance)(mysql_global_variables_open_files_limit) * 100 > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlHighOpenFiles"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "MySQL connection count has exceeded 80%",
"note": "More than 80% of MySQL connections are in use",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mysql_global_status_threads_connected) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlTooManyConnections"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Mysql has just restarted. Please be advised - exporter",
"note": "MySQL has just been restarted, less than one minute ago",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_global_status_uptime < 60",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlRestarted"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "MySQL server mysql has some new slow query",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(mysql_global_status_slow_queries[1m]) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlowQueries"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "MysqlInnodbLogWaits - exporter",
"note": "MySQL innodb log writes stalling",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(mysql_global_status_innodb_log_waits[15m]) > 10",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlInnodbLogWaits"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "MysqlSlaveIoThreadNotRunning - exporter",
"note": "MySQL Slave IO thread not running",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlaveIoThreadNotRunning"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "MysqlSlaveReplicationLag - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 30",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlaveReplicationLag"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "MysqlSlaveSqlThreadNotRunning - exporter",
"note": "MySQL Slave SQL thread not running",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlaveSqlThreadNotRunning"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "The MySQL instance is down",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_up == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlDown"
],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,54 +1,15 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Network address probe failed",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "net_response_result_code != 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
{
"name": "Network address probe failed",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "net_response_result_code != 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
@@ -56,18 +17,14 @@
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
}
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
}
]

View File

@@ -1,73 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "PING address detection failed",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ping_result_code != 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
}
]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 39 KiB

View File

@@ -1,68 +0,0 @@
## PostgreSQL Dashboard & Alerts
使用 [categraf](https://github.com/flashcatcloud/categraf) 中的 [inputs.postgresql](https://github.com/flashcatcloud/categraf/tree/main/inputs/postgresql) 插件采集 [PostgreSQL](https://www.postgresql.org/) 服务的监控指标数据。
### 配置文件示例:
如果需要监控多个 PostgreSQL 实例，可以配置多个 `[[instances]]` 段：
```toml
[[instances]]
address = "host=192.168.11.181 port=5432 user=postgres password=123456789 sslmode=disable"
## specify address via a url matching:
## postgres://[pqgotest[:password]]@localhost[/dbname]?sslmode=[disable|verify-ca|verify-full]
## or a simple string:
## host=localhost user=pqgotest password=... sslmode=... dbname=app_production
##
## All connection parameters are optional.
##
## Without the dbname parameter, the driver will default to a database
## with the same name as the user. This dbname is just for instantiating a
## connection with the server and doesn't restrict the databases we are trying
## to grab metrics for.
##
# address = "host=localhost user=postgres sslmode=disable"
## A custom name for the database that will be used as the "server" tag in the
## measurement output. If not specified, a default one generated from
## the connection address is used.
# outputaddress = "db01"
## connection configuration.
## max_lifetime - specify the maximum lifetime of a connection.
## default is forever (0s)
# max_lifetime = "0s"
## A list of databases to explicitly ignore. If not specified, metrics for all
## databases are gathered. Do NOT use with the 'databases' option.
# ignored_databases = ["postgres", "template0", "template1"]
## A list of databases to pull metrics about. If not specified, metrics for all
## databases are gathered. Do NOT use with the 'ignored_databases' option.
# databases = ["app_production", "testing"]
## Whether to use prepared statements when connecting to the database.
## This should be set to false when connecting through a PgBouncer instance
## with pool_mode set to transaction.
#prepared_statements = true
# [[instances.metrics]]
# mesurement = "sessions"
# label_fields = [ "status", "type" ]
# metric_fields = [ "value" ]
# timeout = "3s"
# request = '''
# SELECT status, type, COUNT(*) as value FROM v$session GROUP BY status, type
# '''
```
### 告警规则
![alert](./alerts.png)
[alerts](../alerts/postgresql_by_categraf.json)
### 仪表盘:
![dashboard](./postgresql.png)
[dashboard](../dashboards/postgresql_by_categraf.json)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 82 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 169 KiB

View File

@@ -1,221 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Process X high number of open files - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (namedprocess_namegroup_worst_fd_ratio{groupname=\"X\"}) * 100 > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ProcessHighOpenFiles"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Process X is down - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum by (instance) (namedprocess_namegroup_num_procs{groupname=\"X\"}) == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ProcessNotRunning"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Process X is restarted - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "namedprocess_namegroup_oldest_start_time_seconds{groupname=\"X\"} > time() - 60 ",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ProcessRestarted"
],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,152 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "process handle limit is too low",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_rlimit_num_fds_soft < 2048",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "there is a process count of 0, indicating that a certain process may have crashed",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_lookup_count == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,152 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "process handle limit is too low",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_rlimit_num_fds_soft < 2048",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "there is a process count of 0, indicating that a certain process may have crashed",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_lookup_count == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,34 +1,16 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"cate": "",
"name": "[RabbitMQ] Cluster Operator Unavailable Replicas",
"note": "There are pods that are either running but not yet available or pods that still have not been created.",
"prod": "metric",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_deployment_status_replicas_unavailable > 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_ql": "kube_deployment_status_replicas_unavailable > 0\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
@@ -66,40 +48,20 @@
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
"append_tags": []
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"cate": "",
"name": "[RabbitMQ] Container Restarts",
"note": "Over the last 10 minutes a rabbitmq container was restarted",
"prod": "metric",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(kube__container_status_restarts_total[10m]) >= 1\n",
"severity": 2
}
],
"severity": 0
},
"prom_ql": "increase(kube__container_status_restarts_total[10m]) >= 1\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
@@ -137,40 +99,20 @@
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
"append_tags": []
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"cate": "",
"name": "[RabbitMQ] File Descriptors Near Limit",
"note": "The file descriptors are near to the limit",
"prod": "metric",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"disabled": 0,
"prom_for_duration": 600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum (max_over_time(rabbitmq_process_open_fds[5m]))/sum (rabbitmq_process_max_tcp_sockets)> 0.8\n",
"severity": 2
}
],
"severity": 0
},
"prom_ql": "sum (max_over_time(rabbitmq_process_open_fds[5m]))/sum (rabbitmq_process_max_tcp_sockets)> 0.8\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
@@ -208,40 +150,20 @@
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
"append_tags": []
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"cate": "",
"name": "[RabbitMQ] High Connection Churn",
"note": "There is a high connection churn",
"prod": "metric",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"disabled": 0,
"prom_for_duration": 600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "( sum(rate(rabbitmq_connections_closed_total[5m]) ) + sum(rate(rabbitmq_connections_opened_total[5m]) ) )/sum (rabbitmq_connections) > 0.1 unless sum (rabbitmq_connections) < 100\n",
"severity": 2
}
],
"severity": 0
},
"prom_ql": "( sum(rate(rabbitmq_connections_closed_total[5m]) ) + sum(rate(rabbitmq_connections_opened_total[5m]) ) )/sum (rabbitmq_connections) > 0.1 unless sum (rabbitmq_connections) < 100\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
@@ -279,40 +201,20 @@
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
"append_tags": []
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"cate": "",
"name": "[RabbitMQ] Insufficient Established Erlang Distribution Links",
"note": "Insufficient established Erlang distribution links",
"prod": "metric",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"disabled": 0,
"prom_for_duration": 600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "count (erlang_vm_dist_node_state) == 3< count (rabbitmq_build_info ) *(count (rabbitmq_build_info - 1 ))\n",
"severity": 2
}
],
"severity": 0
},
"prom_ql": "count (erlang_vm_dist_node_state) == 3< count (rabbitmq_build_info ) *(count (rabbitmq_build_info - 1 ))\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
@@ -350,40 +252,20 @@
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
"append_tags": []
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"cate": "",
"name": "[RabbitMQ] Low Disk Watermark Predicted",
"note": "The predicted free disk space in 24 hours from now is low",
"prod": "metric",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"disabled": 0,
"prom_for_duration": 3600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "( predict_linear(rabbitmq_disk_space_available_bytes[24h], 60*60*24) < rabbitmq_disk_space_available_limit_bytes )and( count_over_time(rabbitmq_disk_space_available_limit_bytes[2h] offset 22h) > 0)\n",
"severity": 2
}
],
"severity": 0
},
"prom_ql": "( predict_linear(rabbitmq_disk_space_available_bytes[24h], 60*60*24) < rabbitmq_disk_space_available_limit_bytes )and( count_over_time(rabbitmq_disk_space_available_limit_bytes[2h] offset 22h) > 0)\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
@@ -421,40 +303,20 @@
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
"append_tags": []
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"cate": "",
"name": "[RabbitMQ] No MajorityOfNodesReady",
"note": "There are so many nodes not ready",
"prod": "metric",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_statefulset_status_replicas_ready<=kube_statefulset_replicas/ 2 unless kube_statefulset_replicas== 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_ql": "kube_statefulset_status_replicas_ready<=kube_statefulset_replicas/ 2 unless kube_statefulset_replicas== 0\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
@@ -492,40 +354,20 @@
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
"append_tags": []
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"cate": "",
"name": "[RabbitMQ] Persistent Volume Missing",
"note": "There is at least one pvc not bound",
"prod": "metric",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"disabled": 0,
"prom_for_duration": 600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_persistentvolumeclaim_status_phase{phase=\"Bound\"} == 0\n",
"severity": 1
}
],
"severity": 0
},
"prom_ql": "kube_persistentvolumeclaim_status_phase{phase=\"Bound\"} == 0\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
@@ -563,40 +405,20 @@
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
"append_tags": []
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"cate": "",
"name": "[RabbitMQ] TCP Sockets Near Limit",
"note": "The TCP sockets are near to the limit",
"prod": "metric",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"disabled": 0,
"prom_for_duration": 600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum (max_over_time(rabbitmq_process_open_tcp_sockets[5m]))/sum (rabbitmq_process_max_tcp_sockets)> 0.8\n",
"severity": 2
}
],
"severity": 0
},
"prom_ql": "sum (max_over_time(rabbitmq_process_open_tcp_sockets[5m]))/sum (rabbitmq_process_max_tcp_sockets)> 0.8\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
@@ -634,40 +456,20 @@
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
"append_tags": []
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"cate": "",
"name": "[RabbitMQ] Unroutable Messages",
"note": "There were unroutable messages within the last 5 minutes in the RabbitMQ cluster",
"prod": "metric",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum (increase(rabbitmq_channel_messages_unroutable_dropped_total[5m])) >= 1 or sum (increase(rabbitmq_channel_messages_unroutable_returned_total[5m])) >= 1\n",
"severity": 2
}
],
"severity": 0
},
"prom_ql": "sum (increase(rabbitmq_channel_messages_unroutable_dropped_total[5m])) >= 1 or sum (increase(rabbitmq_channel_messages_unroutable_returned_total[5m])) >= 1\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
@@ -705,8 +507,6 @@
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
"append_tags": []
}
]

View File

@@ -1,440 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "High Redis eviction rate",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(sum(rate(redis_evicted_keys[5m])) / sum(redis_keyspace_keys)) > 0.1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisHighKeysEvictionRatio"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "High Redis memory usage rate",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "redis_maxmemory > 0 and (redis_used_memory / redis_maxmemory) > 0.85",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisHighMemoryUsage"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "High Redis Ping latency (above 100 milliseconds)",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "redis_ping_use_seconds > 0.1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=HighPingLatency"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Low Redis hit rate",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(redis_keyspace_hits[5m])\n/\n(rate(redis_keyspace_misses[5m]) + rate(redis_keyspace_hits[5m]))\n< 0.9",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisLowHitRatio"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Redis connection refused",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(rate(redis_rejected_connections[5m])) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisRejectedConnHigh"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Redis has just been restarted, please be aware",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "redis_uptime_in_seconds < 600",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisLowUptime"
],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,513 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "High Redis client connection count - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(redis_connected_clients / redis_config_maxclients) > 0.85",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisHighClientsUsage"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "High Redis eviction rate - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(sum(rate(redis_evicted_keys_total[5m])) / sum(redis_db_keys)) > 0.1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisHighKeysEvictionRatio"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "High Redis latency - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(rate(redis_commands_duration_seconds_total[5m])) / sum(rate(redis_commands_processed_total[5m])) > 0.25",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisHighResponseTime"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "High Redis memory usage rate - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "redis_memory_max_bytes > 0 and (redis_memory_used_bytes / redis_memory_max_bytes) > 0.85",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisHighMemoryUsage"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Low Redis hit rate - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(redis_keyspace_hits_total[5m])\n/\n(rate(redis_keyspace_misses_total[5m]) + rate(redis_keyspace_hits_total[5m]))\n< 0.9",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisLowHitRatio"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Redis connection refused - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(rate(redis_rejected_connections_total[5m])) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisRejectedConnHigh"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Redis has just been restarted, please be aware - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "redis_uptime_in_seconds < 600",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisLowUptime"
],
"annotations": null,
"extra_config": null
}
]

View File

@@ -4,8 +4,8 @@
"datasource_ids": [
7
],
"name": "TiDB-CPU 使用率大于 80%",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-CPU 使用率大于 80%",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -72,8 +72,8 @@
"datasource_ids": [
7
],
"name": "TiDB-Node_exporter_is_down",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-Node_exporter_is_down",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -140,8 +140,8 @@
"datasource_ids": [
7
],
"name": "TiDB-PD 写盘延迟大于 1s leader 写盘慢会导致 leader 切换",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-PD 写盘延迟大于 1s leader 写盘慢会导致 leader 切换",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -208,8 +208,8 @@
"datasource_ids": [
7
],
"name": "TiDB-PD 无 leader ",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-PD 无 leader ",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -278,8 +278,8 @@
"datasource_ids": [
7
],
"name": "TiDB-PD 节点之间网络延迟大于 1s",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-PD 节点之间网络延迟大于 1s",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -346,8 +346,8 @@
"datasource_ids": [
7
],
"name": "TiDB-TCP 连接数大于 50000 ",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-TCP 连接数大于 50000 ",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -416,8 +416,8 @@
"datasource_ids": [
7
],
"name": "TiDB-TiDB_server_is_down",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-TiDB_server_is_down",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -484,8 +484,8 @@
"datasource_ids": [
7
],
"name": "TiDB-blackbox_is_down",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-blackbox_is_down",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -552,8 +552,8 @@
"datasource_ids": [
7
],
"name": "TiDB-drainer_server_is_down",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-drainer_server_is_down",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -620,8 +620,8 @@
"datasource_ids": [
7
],
"name": "TiDB-grafana_is_down",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-grafana_is_down",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -688,8 +688,8 @@
"datasource_ids": [
7
],
"name": "TiDB-pd_server_is_down",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-pd_server_is_down",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -756,8 +756,8 @@
"datasource_ids": [
7
],
"name": "TiDB-pump_server_is_down",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-pump_server_is_down",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -824,8 +824,8 @@
"datasource_ids": [
7
],
"name": "TiDB-tidb CPU 使用率超过可用 CPU的 80%",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-tidb CPU 使用率超过可用 CPU的 80%",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -892,8 +892,8 @@
"datasource_ids": [
7
],
"name": "TiDB-tidb 最近 15 分钟加载 schema 出现错误",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-tidb 最近 15 分钟加载 schema 出现错误",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -960,8 +960,8 @@
"datasource_ids": [
7
],
"name": "TiDB-tidb 最近五分钟出现跳过 binlog",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-tidb 最近五分钟出现跳过 binlog",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1028,8 +1028,8 @@
"datasource_ids": [
7
],
"name": "TiDB-tidb 最近十分钟加载 schema 失败大于 10 次",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-tidb 最近十分钟加载 schema 失败大于 10 次",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1096,8 +1096,8 @@
"datasource_ids": [
7
],
"name": "TiDB-tiflash_server_is_down",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-tiflash_server_is_down",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1164,8 +1164,8 @@
"datasource_ids": [
7
],
"name": "TiDB-tikv 5分钟内内存增长大于 5G",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-tikv 5分钟内内存增长大于 5G",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1234,8 +1234,8 @@
"datasource_ids": [
7
],
"name": "TiDB-tikv 发生重启",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-tikv 发生重启",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1302,8 +1302,8 @@
"datasource_ids": [
7
],
"name": "TiDB-tikv 可用空间低于 20%",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-tikv 可用空间低于 20%",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1372,8 +1372,8 @@
"datasource_ids": [
7
],
"name": "TiDB-tikv_server_is_down",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-tikv_server_is_down",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1440,8 +1440,8 @@
"datasource_ids": [
7
],
"name": "TiDB-内存使用大于 80%",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-内存使用大于 80%",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1508,8 +1508,8 @@
"datasource_ids": [
7
],
"name": "TiDB-挂载点状态变为只读",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-挂载点状态变为只读",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1576,8 +1576,8 @@
"datasource_ids": [
7
],
"name": "TiDB-文件系统 inode 使用率大于 80%",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-文件系统 inode 使用率大于 80%",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1644,8 +1644,8 @@
"datasource_ids": [
7
],
"name": "TiDB-磁盘使用率大于 80%",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-磁盘使用率大于 80%",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1712,8 +1712,8 @@
"datasource_ids": [
7
],
"name": "TiDB-网卡-入向有丢包",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-网卡-入向有丢包",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1780,8 +1780,8 @@
"datasource_ids": [
7
],
"name": "TiDB-网卡-出向有丢包",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-网卡-出向有丢包",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1848,8 +1848,8 @@
"datasource_ids": [
7
],
"name": "TiDB-集群中出现状态为 down 的 tikv 数量大于 0",
"note": "TiDB(中控)",
"name": "CTIE-TIDB-BJ天玑两地三中心-DEV-集群中出现状态为 down 的 tikv 数量大于 0",
"note": "CTIE-TIDB-BJ天玑两地三中心-DEV,10.100.2.15 tidb(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
@@ -1911,4 +1911,4 @@
"append_tags": [],
"annotations": {}
}
]
]

View File

@@ -1,440 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "High CPU utilization - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "100 * sum by (instance) (rate(windows_cpu_time_total{mode != 'idle'}[5m])) / count by (instance) (windows_cpu_core_frequency_mhz) > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=HighCPUUsage"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "High memory usage rate - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "100 * (windows_cs_physical_memory_bytes - windows_os_physical_memory_free_bytes) / windows_cs_physical_memory_bytes > 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=HighPhysicalMemoryUsage"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "machine reboot in the last 15 minutes - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "time() - windows_system_system_up_time < 900",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=UpTimeLessThan15Min"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "packet loss issue in the inbound direction - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "100 * rate(windows_net_packets_received_errors[5m]) / (rate(windows_net_packets_received_errors[5m]) + rate(windows_net_packets_received_total[5m])>0) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=HighInboundErrorRate"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "packet loss issue in the outbound direction - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "100 * rate(windows_net_packets_outbound_errors[5m]) / (rate(windows_net_packets_outbound_errors[5m]) + rate(windows_net_packets_sent_total[5m])>0) > 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=HighOutboundErrorRate"
],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "The hard disk is almost full - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "100 * (windows_logical_disk_size_bytes - windows_logical_disk_free_bytes) / windows_logical_disk_size_bytes > 90",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=LogicalDiskFull"
],
"annotations": null,
"extra_config": null
}
]

View File

@@ -1,286 +0,0 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "More than 1 Zookeeper leader - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(zk_server_leader) > 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "There is no Zookeeper leader available - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(zk_server_leader) == 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Zookeeper has crashed - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "zk_up == 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
},
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Zookeeper instance is running abnormally - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "zk_ruok == 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null
}
]

View File

Before

Width:  |  Height:  |  Size: 3.1 KiB

After

Width:  |  Height:  |  Size: 3.1 KiB

View File

@@ -0,0 +1,392 @@
[
{
"name": "Elastic Cluster Red status",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": " elasticsearch_cluster_health_status{color=\"red\"} == 1",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterRed"
]
},
{
"name": "Elastic Cluster Yellow status",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "elasticsearch_cluster_health_status{color=\"yellow\"} == 1",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterYellow"
]
},
{
"name": "Elasticsearch disk out of space of the instance",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskOutOfSpace"
]
},
{
"name": "Elasticsearch disk space low of the instance",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskSpaceLow"
]
},
{
"name": "Elasticsearch Heap Usage Too High of the instance",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 > 90",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageTooHigh"
]
},
{
"name": "Elasticsearch Heap Usage warning of the instance",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 > 80",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageWarning"
]
},
{
"name": "Elasticsearch initializing shards of the instance",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "elasticsearch_cluster_health_initializing_shards > 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchInitializingShards"
]
},
{
"name": "Elasticsearch no new documents of the instance",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "rate(elasticsearch_indices_docs{es_data_node=\"true\"}[5m]) == 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchNoNewDocuments"
]
},
{
"name": "Elasticsearch pending tasks of the instance",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "elasticsearch_cluster_health_number_of_pending_tasks > 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchPendingTasks"
]
},
{
"name": "Elasticsearch relocation shards of the instance",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "elasticsearch_cluster_health_relocating_shards > 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchRelocationShards"
]
},
{
"name": "Elasticsearch unassigned shards of the instance",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "elasticsearch_cluster_health_unassigned_shards > 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchUnassignedShards"
]
},
{
"name": "Elasticsearch Unhealthy Data Nodes",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "elasticsearch_cluster_health_number_of_data_nodes < number_of_data_nodes",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyDataNodes"
]
},
{
"name": "Elasticsearch Unhealthy Nodes",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": " elasticsearch_cluster_health_number_of_nodes < number_of_nodes",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyNodes"
]
}
]

View File

@@ -0,0 +1,392 @@
[
{
"name": "Elastic Cluster Red status",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": " elasticsearch_cluster_health_status{color=\"red\"} == 1",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterRed"
]
},
{
"name": "Elastic Cluster Yellow status",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "elasticsearch_cluster_health_status{color=\"yellow\"} == 1",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterYellow"
]
},
{
"name": "Elasticsearch disk out of space of the instance",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskOutOfSpace"
]
},
{
"name": "Elasticsearch disk space low of the instance",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskSpaceLow"
]
},
{
"name": "Elasticsearch Heap Usage Too High of the instance",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 > 90",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageTooHigh"
]
},
{
"name": "Elasticsearch Heap Usage warning of the instance",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 > 80",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageWarning"
]
},
{
"name": "Elasticsearch initializing shards of the instance",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "elasticsearch_cluster_health_initializing_shards > 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchInitializingShards"
]
},
{
"name": "Elasticsearch no new documents of the instance",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "rate(elasticsearch_indices_docs{es_data_node=\"true\"}[5m]) == 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchNoNewDocuments"
]
},
{
"name": "Elasticsearch pending tasks of the instance",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "elasticsearch_cluster_health_number_of_pending_tasks > 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchPendingTasks"
]
},
{
"name": "Elasticsearch relocation shards of the instance",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "elasticsearch_cluster_health_relocating_shards > 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchRelocationShards"
]
},
{
"name": "Elasticsearch unassigned shards of the instance",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "elasticsearch_cluster_health_unassigned_shards > 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchUnassignedShards"
]
},
{
"name": "Elasticsearch Unhealthy Data Nodes",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "elasticsearch_cluster_health_number_of_data_nodes < number_of_data_nodes",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyDataNodes"
]
},
{
"name": "Elasticsearch Unhealthy Nodes",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": " elasticsearch_cluster_health_number_of_nodes < number_of_nodes",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyNodes"
]
}
]

View File

Before

Width:  |  Height:  |  Size: 2.0 KiB

After

Width:  |  Height:  |  Size: 2.0 KiB

View File

Before

Width:  |  Height:  |  Size: 377 KiB

After

Width:  |  Height:  |  Size: 377 KiB

View File

@@ -0,0 +1,30 @@
[
{
"name": "http detect failed",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "http_response_result_code != 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
}
]

View File

Before

Width:  |  Height:  |  Size: 975 B

After

Width:  |  Height:  |  Size: 975 B

View File

Before

Width:  |  Height:  |  Size: 112 KiB

After

Width:  |  Height:  |  Size: 112 KiB

Some files were not shown because too many files have changed in this diff Show More