Compare commits

...

22 Commits

Author SHA1 Message Date
ning
6743b125a3 add some sql 2024-05-30 15:40:06 +08:00
ning
928388f637 code refactor 2024-05-29 20:49:21 +08:00
ning
664c98e6a6 code refactor 2024-05-29 20:30:26 +08:00
ning
cd1be178e7 code refactor 2024-05-29 19:56:59 +08:00
ning
a7034917b5 code refactor 2024-05-29 17:17:04 +08:00
ning
b872329508 code refactor 2024-05-29 17:06:11 +08:00
ning
b5ede2f2d8 code refactor 2024-05-29 16:58:46 +08:00
ning
209e82787f code refactor 2024-05-29 16:48:28 +08:00
ning
dd94719952 change integration 2024-05-28 18:18:47 +08:00
ning
0991967c61 Merge branch 'main' of github.com:ccfos/nightingale into integration-tpl 2024-05-28 17:24:56 +08:00
ning
ee649d0b4b modity import alert rule 2024-05-28 11:37:56 +08:00
ning
ca1094b263 code refactor 2024-05-24 17:45:14 +08:00
ning
e2b6b1f9ba code refactor 2024-05-24 17:21:13 +08:00
ning
be1b20685e code refactor 2024-05-24 17:16:28 +08:00
ning
335d551bd9 code refactor 2024-05-24 16:48:04 +08:00
ning
5b3e2e9764 code refactor 2024-05-24 16:09:54 +08:00
ning
cc67908ac3 code refactor 2024-05-24 16:02:01 +08:00
ning
8b297d7df1 code refactor 2024-05-24 14:57:22 +08:00
ning
0d1114b988 code refactor 2024-05-24 14:45:27 +08:00
ning
b800fcd47f code refactor 2024-05-24 14:25:43 +08:00
ning
3f865e2d19 code refactor 2024-05-22 10:47:37 +08:00
ning
e3ea65d098 code refactor 2024-05-22 10:34:34 +08:00
101 changed files with 822 additions and 650 deletions

View File

@@ -345,7 +345,7 @@ func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) []common.Anom
missTargets = append(missTargets, ident)
}
}
logger.Debugf("rule_eval:%s missTargets:%v", arw.Key(), missTargets)
targets := arw.processor.TargetCache.Gets(missTargets)
for _, target := range targets {
m := make(map[string]string)
@@ -383,7 +383,6 @@ func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) []common.Anom
// means this target is not collect by categraf, do not check offset
continue
}
if target, exists := targetMap[ident]; exists {
if now-target.UpdateAt > 120 {
// means this target is not a active host, do not check offset
@@ -397,6 +396,7 @@ func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) []common.Anom
}
}
logger.Debugf("rule_eval:%s offsetIdents:%v", arw.Key(), offsetIdents)
for host, offset := range offsetIdents {
m := make(map[string]string)
target, exists := arw.processor.TargetCache.Get(host)
@@ -431,7 +431,7 @@ func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) []common.Anom
missTargets = append(missTargets, ident)
}
}
logger.Debugf("rule_eval:%s missTargets:%v", arw.Key(), missTargets)
pct := float64(len(missTargets)) / float64(len(idents)) * 100
if pct >= float64(trigger.Percent) {
lst = append(lst, common.NewAnomalyPoint(trigger.Type, nil, now, pct, trigger.Severity))

View File

@@ -76,7 +76,6 @@ ops:
- "/dashboards/add"
- "/dashboards/put"
- "/dashboards/del"
- "/dashboards-built-in"
- name: alert
cname: 告警规则
@@ -85,7 +84,7 @@ ops:
- "/alert-rules/add"
- "/alert-rules/put"
- "/alert-rules/del"
- "/alert-rules-built-in"
- name: alert-mutes
cname: 告警静默管理
ops:
@@ -180,6 +179,14 @@ ops:
- "/builtin-metrics/put"
- "/builtin-metrics/del"
- name: built-in-components
cname: 模版中心
ops:
- "/built-in-components"
- "/built-in-components/add"
- "/built-in-components/put"
- "/built-in-components/del"
- name: system
cname: 系统信息
ops:

View File

@@ -3,6 +3,7 @@ package integration
import (
"encoding/json"
"path"
"strings"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
@@ -31,145 +32,148 @@ func Init(ctx *ctx.Context, builtinIntegrationsDir string) {
// get logo name
// /api/n9e/integrations/icon/AliYun/aliyun.png
// files, err := file.FilesUnder(componentDir + "/icon")
// if err == nil && len(files) > 0 {
// component.Logo = "/api/n9e/integrations/icon/" + component.Ident + "/" + files[0]
// } else if err != nil {
// logger.Warningf("read builtin component icon dir fail %s %v", component.Ident, err)
// }
files, err := file.FilesUnder(componentDir + "/icon")
if err == nil && len(files) > 0 {
component.Logo = "/api/n9e/integrations/icon/" + component.Ident + "/" + files[0]
} else if err != nil {
logger.Warningf("read builtin component icon dir fail %s %v", component.Ident, err)
}
// // get description
// files, err = file.FilesUnder(componentDir + "/markdown")
// if err == nil && len(files) > 0 {
// var readmeFile string
// for _, file := range files {
// if strings.HasSuffix(strings.ToLower(file), "md") {
// readmeFile = file
// }
// }
// if readmeFile != "" {
// component.Readme, _ = file.ReadString(readmeFile)
// }
// } else if err != nil {
// logger.Warningf("read builtin component markdown dir fail %s %v", component.Ident, err)
// }
// get description
files, err = file.FilesUnder(componentDir + "/markdown")
if err == nil && len(files) > 0 {
var readmeFile string
for _, file := range files {
if strings.HasSuffix(strings.ToLower(file), "md") {
readmeFile = componentDir + "/markdown/" + file
break
}
}
if readmeFile != "" {
component.Readme, _ = file.ReadString(readmeFile)
}
} else if err != nil {
logger.Warningf("read builtin component markdown dir fail %s %v", component.Ident, err)
}
// exists, _ := models.BuiltinComponentExists(ctx, &component)
// if !exists {
// err = component.Add(ctx, "system")
// if err != nil {
// logger.Warning("add builtin component fail ", component, err)
// continue
// }
// }
exists, _ := models.BuiltinComponentExists(ctx, &component)
if !exists {
err = component.Add(ctx, "system")
if err != nil {
logger.Warning("add builtin component fail ", component, err)
continue
}
}
// // alerts
// files, err = file.FilesUnder(componentDir + "/alerts")
// if err == nil && len(files) > 0 {
// for _, f := range files {
// fp := componentDir + "/alerts/" + f
// bs, err := file.ReadBytes(fp)
// if err != nil {
// logger.Warning("read builtin component alerts file fail ", f, err)
// continue
// }
// alerts
files, err = file.FilesUnder(componentDir + "/alerts")
if err == nil && len(files) > 0 {
for _, f := range files {
fp := componentDir + "/alerts/" + f
bs, err := file.ReadBytes(fp)
if err != nil {
logger.Warning("read builtin component alerts file fail ", f, err)
continue
}
// alerts := []models.AlertRule{}
// err = json.Unmarshal(bs, &alerts)
// if err != nil {
// logger.Warning("parse builtin component alerts file fail ", f, err)
// continue
// }
alerts := []models.AlertRule{}
err = json.Unmarshal(bs, &alerts)
if err != nil {
logger.Warning("parse builtin component alerts file fail ", f, err)
continue
}
// for _, alert := range alerts {
// content, err := json.Marshal(alert)
// if err != nil {
// logger.Warning("marshal builtin alert fail ", alert, err)
// continue
// }
for _, alert := range alerts {
content, err := json.Marshal(alert)
if err != nil {
logger.Warning("marshal builtin alert fail ", alert, err)
continue
}
// cate := strings.Replace(f, ".json", "", -1)
// builtinAlert := models.BuiltinPayload{
// Component: component.Ident,
// Type: "alert",
// Cate: cate,
// Name: alert.Name,
// Content: string(content),
// }
cate := strings.Replace(f, ".json", "", -1)
builtinAlert := models.BuiltinPayload{
Component: component.Ident,
Type: "alert",
Cate: cate,
Name: alert.Name,
Tags: alert.AppendTags,
Content: string(content),
}
// exists, err := models.BuiltinPayloadExists(ctx, &builtinAlert)
// if err != nil {
// logger.Warning("check builtin alert exists fail ", builtinAlert, err)
// continue
// }
exists, err := models.BuiltinPayloadExists(ctx, &builtinAlert)
if err != nil {
logger.Warning("check builtin alert exists fail ", builtinAlert, err)
continue
}
// if exists {
// continue
// }
if exists {
continue
}
// err = builtinAlert.Add(ctx, "system")
// if err != nil {
// logger.Warningf("add builtin alert:%+v fail %v", builtinAlert, err)
// continue
// }
// }
// }
// }
err = builtinAlert.Add(ctx, "system")
if err != nil {
logger.Warningf("add builtin alert:%+v fail %v", builtinAlert, err)
continue
}
}
}
}
// // dashboards
// files, err = file.FilesUnder(componentDir + "/dashboards")
// if err == nil && len(files) > 0 {
// for _, f := range files {
// fp := componentDir + "/dashboards/" + f
// bs, err := file.ReadBytes(fp)
// if err != nil {
// logger.Warning("read builtin component dashboards file fail ", f, err)
// continue
// }
// dashboards
files, err = file.FilesUnder(componentDir + "/dashboards")
if err == nil && len(files) > 0 {
for _, f := range files {
fp := componentDir + "/dashboards/" + f
bs, err := file.ReadBytes(fp)
if err != nil {
logger.Warning("read builtin component dashboards file fail ", f, err)
continue
}
// dashboard := BuiltinBoard{}
// err = json.Unmarshal(bs, &dashboard)
// if err != nil {
// logger.Warning("parse builtin component dashboards file fail ", f, err)
// continue
// }
dashboard := BuiltinBoard{}
err = json.Unmarshal(bs, &dashboard)
if err != nil {
logger.Warning("parse builtin component dashboards file fail ", f, err)
continue
}
// content, err := json.Marshal(dashboard)
// if err != nil {
// logger.Warning("marshal builtin dashboard fail ", dashboard, err)
// continue
// }
content, err := json.Marshal(dashboard)
if err != nil {
logger.Warning("marshal builtin dashboard fail ", dashboard, err)
continue
}
// builtinDashboard := models.BuiltinPayload{
// Component: component.Ident,
// Type: "dashboard",
// Cate: "",
// Name: dashboard.Name,
// Content: string(content),
// }
builtinDashboard := models.BuiltinPayload{
Component: component.Ident,
Type: "dashboard",
Cate: "",
Name: dashboard.Name,
Tags: dashboard.Tags,
Content: string(content),
}
// exists, err := models.BuiltinPayloadExists(ctx, &builtinDashboard)
// if err != nil {
// logger.Warning("check builtin dashboard exists fail ", builtinDashboard, err)
// continue
// }
exists, err := models.BuiltinPayloadExists(ctx, &builtinDashboard)
if err != nil {
logger.Warning("check builtin dashboard exists fail ", builtinDashboard, err)
continue
}
// if exists {
// continue
// }
if exists {
continue
}
// err = builtinDashboard.Add(ctx, "system")
// if err != nil {
// logger.Warning("add builtin dashboard fail ", builtinDashboard, err)
// continue
// }
// }
// } else if err != nil {
// logger.Warningf("read builtin component dash dir fail %s %v", component.Ident, err)
// }
err = builtinDashboard.Add(ctx, "system")
if err != nil {
logger.Warning("add builtin dashboard fail ", builtinDashboard, err)
continue
}
}
} else if err != nil {
logger.Warningf("read builtin component dash dir fail %s %v", component.Ident, err)
}
// metrics
files, err := file.FilesUnder(componentDir + "/metrics")
files, err = file.FilesUnder(componentDir + "/metrics")
if err == nil && len(files) > 0 {
for _, f := range files {
fp := componentDir + "/metrics/" + f

View File

@@ -274,13 +274,14 @@ func (rt *Router) Config(r *gin.Engine) {
pages.POST("/builtin-cate-favorite", rt.auth(), rt.user(), rt.builtinCateFavoriteAdd)
pages.DELETE("/builtin-cate-favorite/:name", rt.auth(), rt.user(), rt.builtinCateFavoriteDel)
pages.GET("/builtin-boards", rt.builtinBoardGets)
pages.GET("/builtin-board/:name", rt.builtinBoardGet)
pages.GET("/dashboards/builtin/list", rt.builtinBoardGets)
pages.GET("/builtin-boards-cates", rt.auth(), rt.user(), rt.builtinBoardCateGets)
pages.POST("/builtin-boards-detail", rt.auth(), rt.user(), rt.builtinBoardDetailGets)
pages.GET("/integrations/icon/:cate/:name", rt.builtinIcon)
pages.GET("/integrations/makedown/:cate", rt.builtinMarkdown)
// pages.GET("/builtin-boards", rt.builtinBoardGets)
// pages.GET("/builtin-board/:name", rt.builtinBoardGet)
// pages.GET("/dashboards/builtin/list", rt.builtinBoardGets)
// pages.GET("/builtin-boards-cates", rt.auth(), rt.user(), rt.builtinBoardCateGets)
// pages.POST("/builtin-boards-detail", rt.auth(), rt.user(), rt.builtinBoardDetailGets)
// pages.GET("/integrations/makedown/:cate", rt.builtinMarkdown)
pages.GET("/busi-groups/public-boards", rt.auth(), rt.user(), rt.perm("/dashboards"), rt.publicBoardGets)
pages.GET("/busi-groups/boards", rt.auth(), rt.user(), rt.perm("/dashboards"), rt.boardGetsByGids)
@@ -299,8 +300,8 @@ func (rt *Router) Config(r *gin.Engine) {
pages.GET("/share-charts", rt.chartShareGets)
pages.POST("/share-charts", rt.auth(), rt.chartShareAdd)
pages.GET("/alert-rules/builtin/alerts-cates", rt.auth(), rt.user(), rt.builtinAlertCateGets)
pages.GET("/alert-rules/builtin/list", rt.auth(), rt.user(), rt.builtinAlertRules)
// pages.GET("/alert-rules/builtin/alerts-cates", rt.auth(), rt.user(), rt.builtinAlertCateGets)
// pages.GET("/alert-rules/builtin/list", rt.auth(), rt.user(), rt.builtinAlertRules)
pages.GET("/alert-rules/callbacks", rt.auth(), rt.user(), rt.alertRuleCallbacks)
pages.GET("/busi-groups/alert-rules", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRuleGetsByGids)
@@ -434,16 +435,17 @@ func (rt *Router) Config(r *gin.Engine) {
// for admin api
pages.GET("/user/busi-groups", rt.auth(), rt.admin(), rt.userBusiGroupsGets)
pages.POST("/builtin-components", rt.auth(), rt.user(), rt.perm("/builtin-components/add"), rt.builtinComponentsAdd)
pages.GET("/builtin-components", rt.auth(), rt.user(), rt.perm("/builtin-components"), rt.builtinComponentsGets)
pages.PUT("/builtin-components", rt.auth(), rt.user(), rt.perm("/builtin-components/put"), rt.builtinComponentsPut)
pages.DELETE("/builtin-components", rt.auth(), rt.user(), rt.perm("/builtin-components/del"), rt.builtinComponentsDel)
pages.GET("/builtin-components", rt.auth(), rt.user(), rt.builtinComponentsGets)
pages.POST("/builtin-components", rt.auth(), rt.user(), rt.perm("/built-in-components/add"), rt.builtinComponentsAdd)
pages.PUT("/builtin-components", rt.auth(), rt.user(), rt.perm("/built-in-components/put"), rt.builtinComponentsPut)
pages.DELETE("/builtin-components", rt.auth(), rt.user(), rt.perm("/built-in-components/del"), rt.builtinComponentsDel)
pages.POST("/builtin-payloads", rt.auth(), rt.user(), rt.perm("/builtin-payloads/add"), rt.builtinPayloadsAdd)
pages.GET("/builtin-payloads", rt.auth(), rt.user(), rt.perm("/builtin-payloads"), rt.builtinPayloadsGets)
pages.GET("/builtin-payload/:id", rt.auth(), rt.user(), rt.perm("/builtin-payloads"), rt.builtinPayloadGet)
pages.PUT("/builtin-payloads", rt.auth(), rt.user(), rt.perm("/builtin-payloads/put"), rt.builtinPayloadsPut)
pages.DELETE("/builtin-payloads", rt.auth(), rt.user(), rt.perm("/builtin-payloads/del"), rt.builtinPayloadsDel)
pages.GET("/builtin-payloads", rt.auth(), rt.user(), rt.builtinPayloadsGets)
pages.GET("/builtin-payloads/cates", rt.auth(), rt.user(), rt.builtinPayloadcatesGet)
pages.POST("/builtin-payloads", rt.auth(), rt.user(), rt.perm("/built-in-components/add"), rt.builtinPayloadsAdd)
pages.GET("/builtin-payload/:id", rt.auth(), rt.user(), rt.perm("/built-in-components"), rt.builtinPayloadGet)
pages.PUT("/builtin-payloads", rt.auth(), rt.user(), rt.perm("/built-in-components/put"), rt.builtinPayloadsPut)
pages.DELETE("/builtin-payloads", rt.auth(), rt.user(), rt.perm("/built-in-components/del"), rt.builtinPayloadsDel)
}
r.GET("/api/n9e/versions", func(c *gin.Context) {

View File

@@ -31,18 +31,11 @@ func (rt *Router) builtinComponentsAdd(c *gin.Context) {
func (rt *Router) builtinComponentsGets(c *gin.Context) {
query := ginx.QueryStr(c, "query", "")
limit := ginx.QueryInt(c, "limit", 20)
bc, err := models.BuiltinComponentGets(rt.Ctx, query, limit, ginx.Offset(c, limit))
bc, err := models.BuiltinComponentGets(rt.Ctx, query)
ginx.Dangerous(err)
total, err := models.BuiltinComponentCount(rt.Ctx, query)
ginx.Dangerous(err)
ginx.NewRender(c).Data(gin.H{
"list": bc,
"total": total,
}, nil)
ginx.NewRender(c).Data(bc, nil)
}
func (rt *Router) builtinComponentsPut(c *gin.Context) {

View File

@@ -1,13 +1,21 @@
package router
import (
"encoding/json"
"net/http"
"strings"
"github.com/ccfos/nightingale/v6/models"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/ginx"
)
type Board struct {
Name string `json:"name"`
Tags string `json:"tags"`
Configs interface{} `json:"configs"`
}
func (rt *Router) builtinPayloadsAdd(c *gin.Context) {
var lst []models.BuiltinPayload
ginx.BindJSON(c, &lst)
@@ -21,9 +29,118 @@ func (rt *Router) builtinPayloadsAdd(c *gin.Context) {
reterr := make(map[string]string)
for i := 0; i < count; i++ {
if err := lst[i].Add(rt.Ctx, username); err != nil {
reterr[lst[i].Name] = err.Error()
if lst[i].Type == "alert" {
if strings.HasPrefix(strings.TrimSpace(lst[i].Content), "[") {
// 处理多个告警规则模板的情况
alertRules := []models.AlertRule{}
if err := json.Unmarshal([]byte(lst[i].Content), &alertRules); err != nil {
reterr[lst[i].Name] = err.Error()
}
for _, rule := range alertRules {
contentBytes, err := json.Marshal(rule)
if err != nil {
reterr[rule.Name] = err.Error()
continue
}
bp := models.BuiltinPayload{
Type: lst[i].Type,
Component: lst[i].Component,
Cate: lst[i].Cate,
Name: rule.Name,
Tags: rule.AppendTags,
Content: string(contentBytes),
CreatedBy: username,
UpdatedBy: username,
}
if err := bp.Add(rt.Ctx, username); err != nil {
reterr[bp.Name] = err.Error()
}
}
continue
}
alertRule := models.AlertRule{}
if err := json.Unmarshal([]byte(lst[i].Content), &alertRule); err != nil {
reterr[lst[i].Name] = err.Error()
continue
}
bp := models.BuiltinPayload{
Type: lst[i].Type,
Component: lst[i].Component,
Cate: lst[i].Cate,
Name: alertRule.Name,
Tags: alertRule.AppendTags,
Content: lst[i].Content,
CreatedBy: username,
UpdatedBy: username,
}
if err := bp.Add(rt.Ctx, username); err != nil {
reterr[bp.Name] = err.Error()
}
} else if lst[i].Type == "dashboard" {
if strings.HasPrefix(strings.TrimSpace(lst[i].Content), "[") {
// 处理多个仪表盘模板的情况
dashboards := []Board{}
if err := json.Unmarshal([]byte(lst[i].Content), &dashboards); err != nil {
reterr[lst[i].Name] = err.Error()
}
for _, dashboard := range dashboards {
contentBytes, err := json.Marshal(dashboard)
if err != nil {
reterr[dashboard.Name] = err.Error()
continue
}
bp := models.BuiltinPayload{
Type: lst[i].Type,
Component: lst[i].Component,
Cate: lst[i].Cate,
Name: dashboard.Name,
Tags: dashboard.Tags,
Content: string(contentBytes),
CreatedBy: username,
UpdatedBy: username,
}
if err := bp.Add(rt.Ctx, username); err != nil {
reterr[bp.Name] = err.Error()
}
}
continue
}
dashboard := Board{}
if err := json.Unmarshal([]byte(lst[i].Content), &dashboard); err != nil {
reterr[lst[i].Name] = err.Error()
continue
}
bp := models.BuiltinPayload{
Type: lst[i].Type,
Component: lst[i].Component,
Cate: lst[i].Cate,
Name: dashboard.Name,
Tags: dashboard.Tags,
Content: lst[i].Content,
CreatedBy: username,
UpdatedBy: username,
}
if err := bp.Add(rt.Ctx, username); err != nil {
reterr[bp.Name] = err.Error()
}
} else {
if err := lst[i].Add(rt.Ctx, username); err != nil {
reterr[lst[i].Name] = err.Error()
}
}
}
ginx.NewRender(c).Data(reterr, nil)
@@ -33,20 +150,18 @@ func (rt *Router) builtinPayloadsGets(c *gin.Context) {
typ := ginx.QueryStr(c, "type", "")
component := ginx.QueryStr(c, "component", "")
cate := ginx.QueryStr(c, "cate", "")
name := ginx.QueryStr(c, "name", "")
limit := ginx.QueryInt(c, "limit", 20)
query := ginx.QueryStr(c, "query", "")
lst, err := models.BuiltinPayloadGets(rt.Ctx, typ, component, cate, name, limit, ginx.Offset(c, limit))
ginx.Dangerous(err)
m := make(map[string][]*models.BuiltinPayload)
for _, v := range lst {
if _, ok := m[v.Cate]; !ok {
m[v.Cate] = make([]*models.BuiltinPayload, 0)
}
m[v.Cate] = append(m[v.Cate], v)
}
lst, err := models.BuiltinPayloadGets(rt.Ctx, typ, component, cate, query)
ginx.NewRender(c).Data(lst, err)
}
ginx.NewRender(c).Data(m, nil)
func (rt *Router) builtinPayloadcatesGet(c *gin.Context) {
typ := ginx.QueryStr(c, "type", "")
component := ginx.QueryStr(c, "component", "")
cates, err := models.BuiltinPayloadCates(rt.Ctx, typ, component)
ginx.NewRender(c).Data(cates, err)
}
func (rt *Router) builtinPayloadGet(c *gin.Context) {
@@ -75,6 +190,24 @@ func (rt *Router) builtinPayloadsPut(c *gin.Context) {
return
}
if req.Type == "alert" {
alertRule := models.AlertRule{}
if err := json.Unmarshal([]byte(req.Content), &alertRule); err != nil {
ginx.Bomb(http.StatusBadRequest, err.Error())
}
req.Name = alertRule.Name
req.Tags = alertRule.AppendTags
} else if req.Type == "dashboard" {
dashboard := Board{}
if err := json.Unmarshal([]byte(req.Content), &dashboard); err != nil {
ginx.Bomb(http.StatusBadRequest, err.Error())
}
req.Name = dashboard.Name
req.Tags = dashboard.Tags
}
username := Username(c)
req.UpdatedBy = username

View File

@@ -508,6 +508,38 @@ CREATE TABLE `board_busigroup` (
PRIMARY KEY (`busi_group_id`, `board_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
Create Table: CREATE TABLE `builtin_components` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '''unique identifier''',
`ident` varchar(191) NOT NULL COMMENT '''identifier of component''',
`logo` varchar(191) NOT NULL COMMENT '''logo of component''',
`readme` text NOT NULL COMMENT '''readme of component''',
`created_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''create time''',
`created_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''',
`updated_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''update time''',
`updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''',
PRIMARY KEY (`id`),
KEY `idx_ident` (`ident`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
Create Table: CREATE TABLE `builtin_payloads` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '''unique identifier''',
`type` varchar(191) NOT NULL COMMENT '''type of payload''',
`component` varchar(191) NOT NULL COMMENT '''component of payload''',
`cate` varchar(191) NOT NULL COMMENT '''category of payload''',
`name` varchar(191) NOT NULL COMMENT '''name of payload''',
`tags` varchar(191) NOT NULL DEFAULT '' COMMENT '''tags of payload''',
`content` longtext NOT NULL COMMENT '''content of payload''',
`created_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''create time''',
`created_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''',
`updated_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''update time''',
`updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''',
PRIMARY KEY (`id`),
KEY `idx_component` (`component`),
KEY `idx_name` (`name`),
KEY `idx_cate` (`cate`),
KEY `idx_type` (`type`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
CREATE TABLE `task_tpl`
(
`id` int unsigned NOT NULL AUTO_INCREMENT,

View File

@@ -38,4 +38,38 @@ CREATE TABLE `board_busigroup` (
`busi_group_id` bigint(20) NOT NULL DEFAULT '0' COMMENT 'busi group id',
`board_id` bigint(20) NOT NULL DEFAULT '0' COMMENT 'board id',
PRIMARY KEY (`busi_group_id`, `board_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
/* beta.5 */
Create Table: CREATE TABLE `builtin_components` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '''unique identifier''',
`ident` varchar(191) NOT NULL COMMENT '''identifier of component''',
`logo` varchar(191) NOT NULL COMMENT '''logo of component''',
`readme` text NOT NULL COMMENT '''readme of component''',
`created_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''create time''',
`created_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''',
`updated_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''update time''',
`updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''',
PRIMARY KEY (`id`),
KEY `idx_ident` (`ident`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
Create Table: CREATE TABLE `builtin_payloads` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '''unique identifier''',
`type` varchar(191) NOT NULL COMMENT '''type of payload''',
`component` varchar(191) NOT NULL COMMENT '''component of payload''',
`cate` varchar(191) NOT NULL COMMENT '''category of payload''',
`name` varchar(191) NOT NULL COMMENT '''name of payload''',
`tags` varchar(191) NOT NULL DEFAULT '' COMMENT '''tags of payload''',
`content` longtext NOT NULL COMMENT '''content of payload''',
`created_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''create time''',
`created_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''',
`updated_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''update time''',
`updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''',
PRIMARY KEY (`id`),
KEY `idx_component` (`component`),
KEY `idx_name` (`name`),
KEY `idx_cate` (`cate`),
KEY `idx_type` (`type`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.0 KiB

View File

@@ -1,30 +0,0 @@
# 调整间隔时间
如有诉求对此插件本身的采集间隔时间调整的话就启用,单位为秒
interval = 15
# 获取被监控端设备的网卡名称
可用以下命令获取网卡名称列表
```
ip addr | grep '^[0-9]' |awk -F':' '{print $2}'
lo
eth0
br-153e7f4f0c83
br-2f302c2a8faa
br-5ae0cdb82efc
br-68cba8773a8c
br-c50ca3122079
docker0
br-fd769e4347bd
veth944ac75@if52
```
# 在数组instances中启用eth_device
将以上获取的网卡列表根据自己的诉求填入如eth0
```
eth_device="eth0"
```
# 测试是否能获取到值
```
./categraf --test --inputs arp_packet
```

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.9 KiB

View File

@@ -1,126 +0,0 @@
# kernel_vmstat
该监控插件采集的是 `/proc/vmstat` 的指标数据,需要较高版本的 kernel。`/proc/vmstat` 内容较多,配置文件中给了一个白名单的配置,大家按需启用,只有启用了才会采集。
```ini
[white_list]
oom_kill = 1
nr_free_pages = 0
nr_alloc_batch = 0
nr_inactive_anon = 0
nr_active_anon = 0
nr_inactive_file = 0
nr_active_file = 0
nr_unevictable = 0
nr_mlock = 0
nr_anon_pages = 0
nr_mapped = 0
nr_file_pages = 0
nr_dirty = 0
nr_writeback = 0
nr_slab_reclaimable = 0
nr_slab_unreclaimable = 0
nr_page_table_pages = 0
nr_kernel_stack = 0
nr_unstable = 0
nr_bounce = 0
nr_vmscan_write = 0
nr_vmscan_immediate_reclaim = 0
nr_writeback_temp = 0
nr_isolated_anon = 0
nr_isolated_file = 0
nr_shmem = 0
nr_dirtied = 0
nr_written = 0
numa_hit = 0
numa_miss = 0
numa_foreign = 0
numa_interleave = 0
numa_local = 0
numa_other = 0
workingset_refault = 0
workingset_activate = 0
workingset_nodereclaim = 0
nr_anon_transparent_hugepages = 0
nr_free_cma = 0
nr_dirty_threshold = 0
nr_dirty_background_threshold = 0
pgpgin = 0
pgpgout = 0
pswpin = 0
pswpout = 0
pgalloc_dma = 0
pgalloc_dma32 = 0
pgalloc_normal = 0
pgalloc_movable = 0
pgfree = 0
pgactivate = 0
pgdeactivate = 0
pgfault = 0
pgmajfault = 0
pglazyfreed = 0
pgrefill_dma = 0
pgrefill_dma32 = 0
pgrefill_normal = 0
pgrefill_movable = 0
pgsteal_kswapd_dma = 0
pgsteal_kswapd_dma32 = 0
pgsteal_kswapd_normal = 0
pgsteal_kswapd_movable = 0
pgsteal_direct_dma = 0
pgsteal_direct_dma32 = 0
pgsteal_direct_normal = 0
pgsteal_direct_movable = 0
pgscan_kswapd_dma = 0
pgscan_kswapd_dma32 = 0
pgscan_kswapd_normal = 0
pgscan_kswapd_movable = 0
pgscan_direct_dma = 0
pgscan_direct_dma32 = 0
pgscan_direct_normal = 0
pgscan_direct_movable = 0
pgscan_direct_throttle = 0
zone_reclaim_failed = 0
pginodesteal = 0
slabs_scanned = 0
kswapd_inodesteal = 0
kswapd_low_wmark_hit_quickly = 0
kswapd_high_wmark_hit_quickly = 0
pageoutrun = 0
allocstall = 0
pgrotated = 0
drop_pagecache = 0
drop_slab = 0
numa_pte_updates = 0
numa_huge_pte_updates = 0
numa_hint_faults = 0
numa_hint_faults_local = 0
numa_pages_migrated = 0
pgmigrate_success = 0
pgmigrate_fail = 0
compact_migrate_scanned = 0
compact_free_scanned = 0
compact_isolated = 0
compact_stall = 0
compact_fail = 0
compact_success = 0
htlb_buddy_alloc_success = 0
htlb_buddy_alloc_fail = 0
unevictable_pgs_culled = 0
unevictable_pgs_scanned = 0
unevictable_pgs_rescued = 0
unevictable_pgs_mlocked = 0
unevictable_pgs_munlocked = 0
unevictable_pgs_cleared = 0
unevictable_pgs_stranded = 0
thp_fault_alloc = 0
thp_fault_fallback = 0
thp_collapse_alloc = 0
thp_collapse_alloc_failed = 0
thp_split = 0
thp_zero_page_alloc = 0
thp_zero_page_alloc_failed = 0
balloon_inflate = 0
balloon_deflate = 0
balloon_migrate = 0
```

View File

@@ -9,3 +9,193 @@ categraf 部署之后,就会自动采集 CPU、内存、磁盘、IO、网路
## 内置告警规则
夜莺内置了告警规则,文件名是 `_categraf` 的表示是使用 categraf 作为采集器。文件名是 `_exporter` 的表示是使用 node-exporter 作为采集器。
下面是一个可自己配置开启的插件
## arp packet
### 调整间隔时间
如有诉求对此插件本身的采集间隔时间调整的话就启用,单位为秒
interval = 15
### 获取被监控端设备的网卡名称
可用以下命令获取网卡名称列表
```
ip addr | grep '^[0-9]' |awk -F':' '{print $2}'
lo
eth0
br-153e7f4f0c83
br-2f302c2a8faa
br-5ae0cdb82efc
br-68cba8773a8c
br-c50ca3122079
docker0
br-fd769e4347bd
veth944ac75@if52
```
### 在数组instances中启用eth_device
将以上获取的网卡列表根据自己的诉求填入如eth0
```
eth_device="eth0"
```
### 测试是否能获取到值
```
./categraf --test --inputs arp_packet
```
## netstat
该插件采集网络连接情况,比如有多少 time_wait 连接,多少 established 连接
## kernel_vmstat
该监控插件采集的是 `/proc/vmstat` 的指标数据,需要较高版本的 kernel。`/proc/vmstat` 内容较多,配置文件中给了一个白名单的配置,大家按需启用,只有启用了才会采集。
```ini
[white_list]
oom_kill = 1
nr_free_pages = 0
nr_alloc_batch = 0
nr_inactive_anon = 0
nr_active_anon = 0
nr_inactive_file = 0
nr_active_file = 0
nr_unevictable = 0
nr_mlock = 0
nr_anon_pages = 0
nr_mapped = 0
nr_file_pages = 0
nr_dirty = 0
nr_writeback = 0
nr_slab_reclaimable = 0
nr_slab_unreclaimable = 0
nr_page_table_pages = 0
nr_kernel_stack = 0
nr_unstable = 0
nr_bounce = 0
nr_vmscan_write = 0
nr_vmscan_immediate_reclaim = 0
nr_writeback_temp = 0
nr_isolated_anon = 0
nr_isolated_file = 0
nr_shmem = 0
nr_dirtied = 0
nr_written = 0
numa_hit = 0
numa_miss = 0
numa_foreign = 0
numa_interleave = 0
numa_local = 0
numa_other = 0
workingset_refault = 0
workingset_activate = 0
workingset_nodereclaim = 0
nr_anon_transparent_hugepages = 0
nr_free_cma = 0
nr_dirty_threshold = 0
nr_dirty_background_threshold = 0
pgpgin = 0
pgpgout = 0
pswpin = 0
pswpout = 0
pgalloc_dma = 0
pgalloc_dma32 = 0
pgalloc_normal = 0
pgalloc_movable = 0
pgfree = 0
pgactivate = 0
pgdeactivate = 0
pgfault = 0
pgmajfault = 0
pglazyfreed = 0
pgrefill_dma = 0
pgrefill_dma32 = 0
pgrefill_normal = 0
pgrefill_movable = 0
pgsteal_kswapd_dma = 0
pgsteal_kswapd_dma32 = 0
pgsteal_kswapd_normal = 0
pgsteal_kswapd_movable = 0
pgsteal_direct_dma = 0
pgsteal_direct_dma32 = 0
pgsteal_direct_normal = 0
pgsteal_direct_movable = 0
pgscan_kswapd_dma = 0
pgscan_kswapd_dma32 = 0
pgscan_kswapd_normal = 0
pgscan_kswapd_movable = 0
pgscan_direct_dma = 0
pgscan_direct_dma32 = 0
pgscan_direct_normal = 0
pgscan_direct_movable = 0
pgscan_direct_throttle = 0
zone_reclaim_failed = 0
pginodesteal = 0
slabs_scanned = 0
kswapd_inodesteal = 0
kswapd_low_wmark_hit_quickly = 0
kswapd_high_wmark_hit_quickly = 0
pageoutrun = 0
allocstall = 0
pgrotated = 0
drop_pagecache = 0
drop_slab = 0
numa_pte_updates = 0
numa_huge_pte_updates = 0
numa_hint_faults = 0
numa_hint_faults_local = 0
numa_pages_migrated = 0
pgmigrate_success = 0
pgmigrate_fail = 0
compact_migrate_scanned = 0
compact_free_scanned = 0
compact_isolated = 0
compact_stall = 0
compact_fail = 0
compact_success = 0
htlb_buddy_alloc_success = 0
htlb_buddy_alloc_fail = 0
unevictable_pgs_culled = 0
unevictable_pgs_scanned = 0
unevictable_pgs_rescued = 0
unevictable_pgs_mlocked = 0
unevictable_pgs_munlocked = 0
unevictable_pgs_cleared = 0
unevictable_pgs_stranded = 0
thp_fault_alloc = 0
thp_fault_fallback = 0
thp_collapse_alloc = 0
thp_collapse_alloc_failed = 0
thp_split = 0
thp_zero_page_alloc = 0
thp_zero_page_alloc_failed = 0
balloon_inflate = 0
balloon_deflate = 0
balloon_migrate = 0
```
# processes
如果进程总量太多,比如超过了 CPU core 的 3 倍,就需要关注了。
## 配置说明
configuration file: `conf/input.processes/processes.toml`
默认配置如下(一般维持默认不用动):
```toml
# # collect interval
# interval = 15
# # force use ps command to gather
# force_ps = false
# # force use /proc to gather
# force_proc = false
```
有两种采集方式,使用 ps 命令,或者直接读取 `/proc` 目录,默认是后者。如果想强制使用 ps 命令才采集,开启 force_ps 即可:
```toml
force_ps = true
```

View File

Before

Width:  |  Height:  |  Size: 3.6 KiB

After

Width:  |  Height:  |  Size: 3.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.8 KiB

View File

@@ -1,7 +0,0 @@
# netstat
该插件采集网络连接情况,比如有多少 time_wait 连接,多少 established 连接
# 监控大盘
该插件没有单独的监控大盘,OS 的监控大盘统一放到 system 下面了

View File

@@ -102,6 +102,227 @@ urls = [
labels = {job="nginx-vts"}
```
## 仪表盘
# nginx_upstream_check 插件
### 应用场景
一般用于业务系统做对内或对外路由映射时使用代理服务,是运维最常见且最重要的代理工具。
夜莺内置了相关仪表盘,克隆到自己的业务组即可使用。
### 部署场景
需要在装有nginx服务的虚拟机启用此插件。
### 采集原理
- 该采集插件是读取 [nginx_upstream_check](https://github.com/yaoweibin/nginx_upstream_check_module) 的状态输出。[nginx_upstream_check](https://github.com/yaoweibin/nginx_upstream_check_module) 可以周期性检查 upstream 中的各个 server 是否存活,如果检查失败,就会标记为 `down`,如果检查成功,就标记为 `up`
### 注意事项
- 由于 TSDB 通常无法处理字符串,所以 Categraf 会做转换,将 `down` 转换为 2 `up` 转换为 1其他状态转换为 0使用 `nginx_upstream_check_status_code` 这个指标来表示,所以,我们可能需要这样的告警规则:
### 前置条件
#### 条件1nginx服务需要启用nginx_upstream_check_module模块
```
推荐源码编译方式安装模块,如不清楚要安装哪些模块,可参考:
cd /opt/nginx-1.20.1 && ./configure \
--prefix=/usr/share/nginx \
--sbin-path=/usr/sbin/nginx \
--modules-path=/usr/lib64/nginx/modules \
--conf-path=/etc/nginx/nginx.conf \
--error-log-path=/var/log/nginx/error.log \
--http-log-path=/var/log/nginx/access.log \
--http-client-body-temp-path=/var/lib/nginx/tmp/client_body \
--http-proxy-temp-path=/var/lib/nginx/tmp/proxy \
--http-fastcgi-temp-path=/var/lib/nginx/tmp/fastcgi \
--http-uwsgi-temp-path=/var/lib/nginx/tmp/uwsgi \
--http-scgi-temp-path=/var/lib/nginx/tmp/scgi \
--pid-path=/var/run/nginx.pid \
--lock-path=/run/lock/subsys/nginx \
--user=nginx \
--group=nginx \
--with-compat \
--with-threads \
--with-http_addition_module \
--with-http_auth_request_module \
--with-http_dav_module \
--with-http_flv_module \
--with-http_gunzip_module \
--with-http_gzip_static_module \
--with-http_mp4_module \
--with-http_random_index_module \
--with-http_realip_module \
--with-http_secure_link_module \
--with-http_slice_module \
--with-http_ssl_module \
--with-http_stub_status_module \
--with-http_sub_module \
--with-http_v2_module \
--with-mail \
--with-mail_ssl_module \
--with-stream \
--with-stream_realip_module \
--with-stream_ssl_module \
--with-stream_ssl_preread_module \
--with-select_module \
--with-poll_module \
--with-file-aio \
--with-http_xslt_module=dynamic \
--with-http_image_filter_module=dynamic \
--with-http_perl_module=dynamic \
--with-stream=dynamic \
--with-mail=dynamic \
--with-http_xslt_module=dynamic \
--add-module=/etc/nginx/third-modules/nginx_upstream_check_module \
--add-module=/etc/nginx/third-modules/ngx_devel_kit-0.3.0 \
--add-module=/etc/nginx/third-modules/lua-nginx-module-0.10.13 \
--add-module=/etc/nginx/third-modules/nginx-module-vts \
--add-module=/etc/nginx/third-modules/ngx-fancyindex-0.5.2
# 根据cpu核数
make -j2
make install
注意第三方模块nginx_upstream_check_module lua-nginx-module nginx-module-vts 都是相关插件所必备的依赖。
```
#### 条件2nginx启用check_status配置
```
[root@aliyun categraf]# cat /etc/nginx/conf.d/nginx-upstream.domains.com.conf
server {
listen 80;
listen 443 ssl;
server_name nginx-upstream.domains.com;
include /etc/nginx/ssl_conf/domains.com.conf;
location / {
check_status;
include /etc/nginx/ip_whitelist.conf;
}
access_log /var/log/nginx/nginx-upstream.domains.com.access.log main;
error_log /var/log/nginx/nginx-upstream.domains.com.error.log warn;
}
```
浏览器访问https://nginx-upstream.domains.com?format=json出现
![image](https://user-images.githubusercontent.com/12181410/220912157-57f485de-6b4e-4ca4-869d-871244aabde1.png)
浏览器访问https://nginx-upstream.domains.com出现
![image](https://user-images.githubusercontent.com/12181410/220909354-fc8ba53d-2384-41d3-8def-4447a104fb3c.png)
#### 条件3在需要启用upstream监控的域名配置下进行配置
例如:
```
[root@aliyun upstream_conf]# cat upstream_n9e.conf
upstream n9e {
server 127.0.0.1:18000 weight=10 max_fails=2 fail_timeout=5s;
check interval=3000 rise=2 fall=5 timeout=1000 type=tcp default_down=false port=18000;
check_http_send "HEAD / HTTP/1.0\r\n\r\n";
check_http_expect_alive http_2xx http_3xx;
}
[root@aliyun upstream_conf]# cat upstream_n9e_server_api.conf
upstream n9e-server-api {
server 127.0.0.1:19000 weight=10 max_fails=2 fail_timeout=5s;
check interval=3000 rise=2 fall=5 timeout=1000 type=tcp default_down=false port=19000;
check_http_send "HEAD / HTTP/1.0\r\n\r\n";
check_http_expect_alive http_2xx http_3xx;
}
[root@aliyun upstream_conf]# cat upstream_vm.conf
upstream vm {
server 127.0.0.1:8428 weight=10 max_fails=2 fail_timeout=5s;
keepalive 20;
check interval=3000 rise=2 fall=5 timeout=1000 type=tcp default_down=false port=8428;
check_http_send "HEAD / HTTP/1.0\r\n\r\n";
check_http_expect_alive http_2xx http_3xx;
}
```
### 配置场景
```
本配置启用或数据定义如下功能:
增加自定义标签,可通过自定义标签筛选数据及更加精确的告警推送。
响应超时时间为5秒。
urls字段填写条件2所定义好的域名。
```
### 修改nginx.toml文件配置
```
[root@aliyun conf]# cat input.nginx_upstream_check/nginx_upstream_check.toml
# # collect interval
# interval = 15
[[instances]]
# 这个配置最关键,是要给出获取 status 信息的接口地址
targets = [
"https://nginx-upstream.domains.com/?format=json"
]
# 标签这个配置请注意
# 如果 Categraf 和 Nginx 是在一台机器上target 可能配置的是 127.0.0.1
# 如果 Nginx 有多台机器,每台机器都有 Categraf 来采集本机的 Nginx 的 Status 信息
# 可能会导致时序数据标签相同不易区分当然Categraf 会自带 ident 标签,该标签标识本机机器名
# 如果大家觉得 ident 标签不够用,可以用下面 labels 配置,附加 instance、region 之类的标签
# # append some labels for series
labels = { cloud="my-cloud", region="my-region",azone="az1", product="my-product" }
# # interval = global.interval * interval_times
# interval_times = 1
### Set http_proxy (categraf uses the system wide proxy settings if it is not set)
# http_proxy = "http://localhost:8888"
### Interface to use when dialing an address
# interface = "eth0"
### HTTP Request Method
# method = "GET"
### Set timeout (default 5 seconds)
# timeout = "5s"
### Whether to follow redirects from the server (defaults to false)
# follow_redirects = false
### Optional HTTP Basic Auth Credentials
# username = "username"
# password = "pa$$word"
### Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]
### Optional TLS Config
# use_tls = false
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
### Use TLS but skip chain & host verification
# insecure_skip_verify = false
```
### 测试配置
```
./categraf --test --inputs nginx_upstream_check
```
### 重启服务
```
重启categraf服务生效
systemctl daemon-reload && systemctl restart categraf && systemctl status categraf
查看启动日志是否有错误
journalctl -f -n 500 -u categraf | grep "E\!" | grep "W\!"
```
### 检查数据呈现
等待1-2分钟后数据就会在图表中展示出来如图
![image](https://user-images.githubusercontent.com/12181410/220914337-f97f6fd5-4763-4174-b64c-131aecf6664f.png)
### 监控告警规则配置
```
一般查看后端是否异常为关键检查对象nginx_upstream_check_status_code 返回 1 代表正常,返回 2 代表异常,实际测试可从上图看出:
nginx_upstream_check_status_code!=1则视为异常需立即告警级别为一级告警执行频率为60秒持续时长为60秒留观时长2分钟重复发送频率5分钟最大发送次数0次使用企业微信应用及电话语音通道将告警内容发送给系统运维组此规则运用到周一到周日全天。
```

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.7 KiB

View File

@@ -1,231 +0,0 @@
# 应用场景
一般用于业务系统做对内或对外路由映射时使用代理服务,是运维最常见且最重要的代理工具。
# 部署场景
需要在装有nginx服务的虚拟机启用此插件。
# 采集原理
- 该采集插件是读取 [nginx_upstream_check](https://github.com/yaoweibin/nginx_upstream_check_module) 的状态输出。[nginx_upstream_check](https://github.com/yaoweibin/nginx_upstream_check_module) 可以周期性检查 upstream 中的各个 server 是否存活,如果检查失败,就会标记为 `down`,如果检查成功,就标记为 `up`
# 注意事项
- 由于 TSDB 通常无法处理字符串,所以 Categraf 会做转换,将 `down` 转换为 2 `up` 转换为 1其他状态转换为 0使用 `nginx_upstream_check_status_code` 这个指标来表示,所以,我们可能需要这样的告警规则:
# 前置条件
## 条件1nginx服务需要启用nginx_upstream_check_module模块
```
推荐源码编译方式安装模块,如不清楚要安装哪些模块,可参考:
cd /opt/nginx-1.20.1 && ./configure \
--prefix=/usr/share/nginx \
--sbin-path=/usr/sbin/nginx \
--modules-path=/usr/lib64/nginx/modules \
--conf-path=/etc/nginx/nginx.conf \
--error-log-path=/var/log/nginx/error.log \
--http-log-path=/var/log/nginx/access.log \
--http-client-body-temp-path=/var/lib/nginx/tmp/client_body \
--http-proxy-temp-path=/var/lib/nginx/tmp/proxy \
--http-fastcgi-temp-path=/var/lib/nginx/tmp/fastcgi \
--http-uwsgi-temp-path=/var/lib/nginx/tmp/uwsgi \
--http-scgi-temp-path=/var/lib/nginx/tmp/scgi \
--pid-path=/var/run/nginx.pid \
--lock-path=/run/lock/subsys/nginx \
--user=nginx \
--group=nginx \
--with-compat \
--with-threads \
--with-http_addition_module \
--with-http_auth_request_module \
--with-http_dav_module \
--with-http_flv_module \
--with-http_gunzip_module \
--with-http_gzip_static_module \
--with-http_mp4_module \
--with-http_random_index_module \
--with-http_realip_module \
--with-http_secure_link_module \
--with-http_slice_module \
--with-http_ssl_module \
--with-http_stub_status_module \
--with-http_sub_module \
--with-http_v2_module \
--with-mail \
--with-mail_ssl_module \
--with-stream \
--with-stream_realip_module \
--with-stream_ssl_module \
--with-stream_ssl_preread_module \
--with-select_module \
--with-poll_module \
--with-file-aio \
--with-http_xslt_module=dynamic \
--with-http_image_filter_module=dynamic \
--with-http_perl_module=dynamic \
--with-stream=dynamic \
--with-mail=dynamic \
--with-http_xslt_module=dynamic \
--add-module=/etc/nginx/third-modules/nginx_upstream_check_module \
--add-module=/etc/nginx/third-modules/ngx_devel_kit-0.3.0 \
--add-module=/etc/nginx/third-modules/lua-nginx-module-0.10.13 \
--add-module=/etc/nginx/third-modules/nginx-module-vts \
--add-module=/etc/nginx/third-modules/ngx-fancyindex-0.5.2
# 根据cpu核数
make -j2
make install
注意第三方模块nginx_upstream_check_module lua-nginx-module nginx-module-vts 都是相关插件所必备的依赖。
```
## 条件2nginx启用check_status配置
```
[root@aliyun categraf]# cat /etc/nginx/conf.d/nginx-upstream.domains.com.conf
server {
listen 80;
listen 443 ssl;
server_name nginx-upstream.domains.com;
include /etc/nginx/ssl_conf/domains.com.conf;
location / {
check_status;
include /etc/nginx/ip_whitelist.conf;
}
access_log /var/log/nginx/nginx-upstream.domains.com.access.log main;
error_log /var/log/nginx/nginx-upstream.domains.com.error.log warn;
}
```
浏览器访问https://nginx-upstream.domains.com?format=json出现
![image](https://user-images.githubusercontent.com/12181410/220912157-57f485de-6b4e-4ca4-869d-871244aabde1.png)
浏览器访问https://nginx-upstream.domains.com出现
![image](https://user-images.githubusercontent.com/12181410/220909354-fc8ba53d-2384-41d3-8def-4447a104fb3c.png)
## 条件3在需要启用upstream监控的域名配置下进行配置
例如:
```
[root@aliyun upstream_conf]# cat upstream_n9e.conf
upstream n9e {
server 127.0.0.1:18000 weight=10 max_fails=2 fail_timeout=5s;
check interval=3000 rise=2 fall=5 timeout=1000 type=tcp default_down=false port=18000;
check_http_send "HEAD / HTTP/1.0\r\n\r\n";
check_http_expect_alive http_2xx http_3xx;
}
[root@aliyun upstream_conf]# cat upstream_n9e_server_api.conf
upstream n9e-server-api {
server 127.0.0.1:19000 weight=10 max_fails=2 fail_timeout=5s;
check interval=3000 rise=2 fall=5 timeout=1000 type=tcp default_down=false port=19000;
check_http_send "HEAD / HTTP/1.0\r\n\r\n";
check_http_expect_alive http_2xx http_3xx;
}
[root@aliyun upstream_conf]# cat upstream_vm.conf
upstream vm {
server 127.0.0.1:8428 weight=10 max_fails=2 fail_timeout=5s;
keepalive 20;
check interval=3000 rise=2 fall=5 timeout=1000 type=tcp default_down=false port=8428;
check_http_send "HEAD / HTTP/1.0\r\n\r\n";
check_http_expect_alive http_2xx http_3xx;
}
```
# 配置场景
```
本配置启用或数据定义如下功能:
增加自定义标签,可通过自定义标签筛选数据及更加精确的告警推送。
响应超时时间为5秒。
urls字段填写条件2所定义好的域名。
```
# 修改nginx.toml文件配置
```
[root@aliyun conf]# cat input.nginx_upstream_check/nginx_upstream_check.toml
# # collect interval
# interval = 15
[[instances]]
# 这个配置最关键,是要给出获取 status 信息的接口地址
targets = [
"https://nginx-upstream.domains.com/?format=json"
]
# 标签这个配置请注意
# 如果 Categraf 和 Nginx 是在一台机器上target 可能配置的是 127.0.0.1
# 如果 Nginx 有多台机器,每台机器都有 Categraf 来采集本机的 Nginx 的 Status 信息
# 可能会导致时序数据标签相同不易区分当然Categraf 会自带 ident 标签,该标签标识本机机器名
# 如果大家觉得 ident 标签不够用,可以用下面 labels 配置,附加 instance、region 之类的标签
# # append some labels for series
labels = { cloud="my-cloud", region="my-region",azone="az1", product="my-product" }
# # interval = global.interval * interval_times
# interval_times = 1
## Set http_proxy (categraf uses the system wide proxy settings if it is not set)
# http_proxy = "http://localhost:8888"
## Interface to use when dialing an address
# interface = "eth0"
## HTTP Request Method
# method = "GET"
## Set timeout (default 5 seconds)
# timeout = "5s"
## Whether to follow redirects from the server (defaults to false)
# follow_redirects = false
## Optional HTTP Basic Auth Credentials
# username = "username"
# password = "pa$$word"
## Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]
## Optional TLS Config
# use_tls = false
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
```
# 测试配置
```
./categraf --test --inputs nginx_upstream_check
```
# 重启服务
```
重启categraf服务生效
systemctl daemon-reload && systemctl restart categraf && systemctl status categraf
查看启动日志是否有错误
journalctl -f -n 500 -u categraf | grep "E\!" | grep "W\!"
```
# 检查数据呈现
等待1-2分钟后数据就会在图表中展示出来如图
![image](https://user-images.githubusercontent.com/12181410/220914337-f97f6fd5-4763-4174-b64c-131aecf6664f.png)
# 监控告警规则配置
```
一般查看后端是否异常为关键检查对象nginx_upstream_check_status_code 返回 1 代表正常,返回 2 代表异常,实际测试可从上图看出:
nginx_upstream_check_status_code!=1则视为异常需立即告警级别为一级告警执行频率为60秒持续时长为60秒留观时长2分钟重复发送频率5分钟最大发送次数0次使用企业微信应用及电话语音通道将告警内容发送给系统运维组此规则运用到周一到周日全天。
```
# 监控图表配置
https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx_upstream_check/dashboards.json
# 故障自愈配置
```
先略过
```

View File

@@ -1,29 +0,0 @@
[
{
"collector": "Categraf",
"typ": "Nginx_Upstream_Check",
"name": "Nginx upstream_check 最近一分钟检测失败次数",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx_upstream_check/README.md)",
"lang": "zh_CN",
"expression": "increase(nginx_upstream_check_fall[1m])"
},
{
"collector": "Categraf",
"typ": "Nginx_Upstream_Check",
"name": "Nginx upstream_check 最近一分钟检测次数",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx_upstream_check/README.md)",
"lang": "zh_CN",
"expression": "increase(nginx_upstream_check_rise[1m])"
},
{
"collector": "Categraf",
"typ": "Nginx_Upstream_Check",
"name": "Nginx upstream_check 检测结果状态码",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx_upstream_check/README.md)\n\n- 0unknown\n- 1up\n- 2down\n",
"lang": "zh_CN",
"expression": "nginx_upstream_check_status_code"
}
]

View File

Before

Width:  |  Height:  |  Size: 3.6 KiB

After

Width:  |  Height:  |  Size: 3.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.2 KiB

View File

@@ -1,35 +0,0 @@
# 进程总量统计
如果进程总量太多,比如超过了 CPU core 的 3 倍,就需要关注了。
## 配置说明
configuration file: `conf/input.processes/processes.toml`
默认配置如下(一般维持默认不用动):
```toml
# # collect interval
# interval = 15
# # force use ps command to gather
# force_ps = false
# # force use /proc to gather
# force_proc = false
```
有两种采集方式,使用 ps 命令,或者直接读取 `/proc` 目录,默认是后者。如果想强制使用 ps 命令才采集,开启 force_ps 即可:
```toml
force_ps = true
```
## 仪表盘
夜莺内置了进程总量的仪表盘,克隆到自己的业务组下即可使用。
## 告警规则
夜莺内置了进程总量的告警规则,克隆到自己的业务组下即可使用。

View File

@@ -43,3 +43,7 @@ labels = { instance="n9e-10.23.25.3:6379" }
如果一个 redis 集群有 3 个实例,对于业务应用来讲,发起一个请求,可能随机请求到某一个实例上去了,这个是没问题的,但是对于监控 client 而言,显然是希望到所有实例上获取数据的。
当然,如果多个 redis 实例组成了集群,我们希望有个标识来标识这个集群,这个时候,可以通过 labels 来实现,比如给每个实例增加一个 redis_clus 的标签,值为集群名字即可。
# redis_sentinel
forked from [telegraf/redis_sentinel](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/redis_sentinel)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.8 KiB

View File

@@ -1,3 +0,0 @@
# redis_sentinel
forked from [telegraf/redis_sentinel](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/redis_sentinel)

View File

@@ -86,7 +86,7 @@ func BuiltinComponentDels(ctx *ctx.Context, ids []int64) error {
return DB(ctx).Where("id in ?", ids).Delete(new(BuiltinComponent)).Error
}
func BuiltinComponentGets(ctx *ctx.Context, query string, limit, offset int) ([]*BuiltinComponent, error) {
func BuiltinComponentGets(ctx *ctx.Context, query string) ([]*BuiltinComponent, error) {
session := DB(ctx)
if query != "" {
queryPattern := "%" + query + "%"
@@ -95,24 +95,11 @@ func BuiltinComponentGets(ctx *ctx.Context, query string, limit, offset int) ([]
var lst []*BuiltinComponent
err := session.Limit(limit).Offset(offset).Find(&lst).Error
err := session.Order("ident ASC").Find(&lst).Error
return lst, err
}
func BuiltinComponentCount(ctx *ctx.Context, query string) (int64, error) {
session := DB(ctx).Model(&BuiltinComponent{})
if query != "" {
queryPattern := "%" + query + "%"
session = session.Where("ident LIKE ?", queryPattern)
}
var cnt int64
err := session.Count(&cnt).Error
return cnt, err
}
func BuiltinComponentGet(ctx *ctx.Context, where string, args ...interface{}) (*BuiltinComponent, error) {
var lst []*BuiltinComponent
err := DB(ctx).Where(where, args...).Find(&lst).Error

View File

@@ -14,6 +14,7 @@ type BuiltinPayload struct {
Component string `json:"component" gorm:"type:varchar(191);not null;index:idx_component,sort:asc;comment:'component of payload'"` // Host MySQL Redis
Cate string `json:"cate" gorm:"type:varchar(191);not null;comment:'category of payload'"` // categraf_v1 telegraf_v1
Name string `json:"name" gorm:"type:varchar(191);not null;index:idx_name,sort:asc;comment:'name of payload'"` //
Tags string `json:"tags" gorm:"type:varchar(191);not null;default:'';comment:'tags of payload'"` // {"host":"
Content string `json:"content" gorm:"type:longtext;not null;comment:'content of payload'"`
CreatedAt int64 `json:"created_at" gorm:"type:bigint;not null;default:0;comment:'create time'"`
CreatedBy string `json:"created_by" gorm:"type:varchar(191);not null;default:'';comment:'creator'"`
@@ -105,7 +106,7 @@ func BuiltinPayloadGet(ctx *ctx.Context, where string, args ...interface{}) (*Bu
return &bp, nil
}
func BuiltinPayloadGets(ctx *ctx.Context, typ, component, cate, name string, limit, offset int) ([]*BuiltinPayload, error) {
func BuiltinPayloadGets(ctx *ctx.Context, typ, component, cate, query string) ([]*BuiltinPayload, error) {
session := DB(ctx)
if typ != "" {
session = session.Where("type = ?", typ)
@@ -118,11 +119,36 @@ func BuiltinPayloadGets(ctx *ctx.Context, typ, component, cate, name string, lim
session = session.Where("cate = ?", cate)
}
if name != "" {
session = session.Where("name like ?", "%"+name+"%")
if query != "" {
arr := strings.Fields(query)
for i := 0; i < len(arr); i++ {
qarg := "%" + arr[i] + "%"
session = session.Where("name like ? or tags like ?", qarg, qarg)
}
}
var lst []*BuiltinPayload
err := session.Limit(limit).Offset(offset).Find(&lst).Error
err := session.Find(&lst).Error
return lst, err
}
// BuiltinPayloadCates returns the distinct cate values of BuiltinPayload
// rows matching the given type and component.
func BuiltinPayloadCates(ctx *ctx.Context, typ, component string) ([]string, error) {
	var cates []string
	// Distinct+Pluck collects the unique "cate" column values into the slice.
	err := DB(ctx).Model(new(BuiltinPayload)).Where("type = ? and component = ?", typ, component).Distinct("cate").Pluck("cate", &cates).Error
	return cates, err
}
// BuiltinPayloadComponents returns one component of BuiltinPayload matching
// the given type and cate, or the empty string when none exists.
func BuiltinPayloadComponents(ctx *ctx.Context, typ, cate string) (string, error) {
	var components []string
	// Distinct+Pluck collects the unique "component" column values.
	err := DB(ctx).
		Model(new(BuiltinPayload)).
		Where("type = ? and cate = ?", typ, cate).
		Distinct("component").
		Pluck("component", &components).Error
	switch {
	case err != nil:
		return "", err
	case len(components) == 0:
		return "", nil
	default:
		// Multiple distinct components are possible; only the first is returned.
		return components[0], nil
	}
}

Some files were not shown because too many files have changed in this diff Show More