Compare commits

...

22 Commits

Author SHA1 Message Date
kongfei
3a075e7681 update dockerfile for github-action 2023-04-27 18:06:49 +08:00
ulricqin
4ec5612d78 add processes dashboards and alerts (#1513) 2023-04-27 16:11:21 +08:00
Yening Qin
817ed0ab1b fix get engine cluster list (#1512)
* fix: get engine cluster list
2023-04-27 15:33:14 +08:00
Yening Qin
63aa615761 compatible with TDSQL-C Mysql (#1511) 2023-04-27 14:28:59 +08:00
ning
2a36902760 fix: alert rule batch update severity 2023-04-27 11:54:14 +08:00
ning
bca9331182 compatible with TDSQL-C Mysql 2023-04-27 10:45:41 +08:00
alick-liming
199a23e385 refactor: get ClientIP (#1502)
* 调整ClientIP获取
2023-04-27 10:26:34 +08:00
ning
c733f16cc7 auto change n9e version in docker-compose.yaml 2023-04-26 17:32:07 +08:00
ning
81585649aa Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-04-26 17:15:18 +08:00
ning
2c4422d657 auto change n9e version in docker-compose.yaml 2023-04-26 17:15:05 +08:00
青牛踏雪
aaf66cb386 docs: add apiserver kubelet node alerts template to k8s (#1508)
* add apiserver kubelet node alerts to k8s

* modify node name to node-exporter
2023-04-26 14:18:13 +08:00
Ulric Qin
cfed4d8318 Merge branch 'main' of github.com:ccfos/nightingale 2023-04-25 15:02:26 +08:00
Ulric Qin
606cd538ec update dingtalk title 2023-04-25 15:02:13 +08:00
kongfei605
bafb3b2546 Merge pull request #1506 from ccfos/docker_release
update dockerfile
2023-04-25 11:56:32 +08:00
kongfei
9a0224697f typo 2023-04-25 11:55:58 +08:00
kongfei
23156552db update dockerfile 2023-04-25 11:54:29 +08:00
青牛踏雪
36bca795fa add kubernetes dashboard based on categraf collection. (#1503)
* add taoskeeper 3.x dashboard

* modify directory name taos to TDEngine

* add kubernetes dashboard based on categraf collection.
2023-04-24 19:58:12 +08:00
Ulric Qin
b5503ae93e update static files router 2023-04-24 19:42:02 +08:00
青牛踏雪
3c102e47ed add taoskeeper 3.x dashboard (#1501)
* add taoskeeper 3.x dashboard

* modify directory name taos to TDEngine
2023-04-24 19:28:17 +08:00
xtan
60bf8139b1 feat: add eventid to ibex task_record (#1497) 2023-04-24 18:01:48 +08:00
alick-liming
fc0d077c9f feat:1.verify notify template 2.heartbeat add remote_addr 3. gid auto busi group (#1498)
* 1.通知模版校验 2.对象列表remote_addr

* 1.bgid参数调整 2.语句优化

* 代码优化

* 代码调整
2023-04-24 16:02:45 +08:00
kongfei605
3a610f7ea0 fix standard output option for dashboards (#1500) 2023-04-24 13:18:06 +08:00
38 changed files with 9570 additions and 57 deletions

View File

@@ -115,7 +115,7 @@ dockers:
goarch: arm64
ids:
- build
dockerfile: docker/Dockerfile.goreleaser
dockerfile: docker/Dockerfile.goreleaser.arm64
extra_files:
- pub
- etc

View File

@@ -154,6 +154,7 @@ func handleIbex(ctx *ctx.Context, url string, event *models.AlertCurEvent, targe
// write db
record := models.TaskRecord{
Id: res.Dat,
EventId: event.Id,
GroupId: tpl.GroupId,
IbexAddress: ibexConf.Address,
IbexAuthUser: ibexConf.BasicAuthUser,

View File

@@ -49,7 +49,7 @@ func (ds *DingtalkSender) Send(ctx MessageContext) {
body = dingtalk{
Msgtype: "markdown",
Markdown: dingtalkMarkdown{
Title: ctx.Rule.Name,
Title: ctx.Event.RuleName,
Text: message,
},
}
@@ -57,7 +57,7 @@ func (ds *DingtalkSender) Send(ctx MessageContext) {
body = dingtalk{
Msgtype: "markdown",
Markdown: dingtalkMarkdown{
Title: ctx.Rule.Name,
Title: ctx.Event.RuleName,
Text: message + "\n" + strings.Join(ats, " "),
},
At: dingtalkAt{

View File

@@ -31,7 +31,7 @@ func (es *EmailSender) Send(ctx MessageContext) {
if es.subjectTpl != nil {
subject = BuildTplMessage(es.subjectTpl, ctx.Event)
} else {
subject = ctx.Rule.Name
subject = ctx.Event.RuleName
}
content := BuildTplMessage(es.contentTpl, ctx.Event)
es.WriteEmail(subject, content, tos)

View File

@@ -78,7 +78,7 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
writers := writer.NewWriters(config.Pushgw)
alertrtRouter := alertrt.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors)
centerRouter := centerrt.New(config.HTTP, config.Center, cconf.Operations, dsCache, notifyConfigCache, promClients, redis, sso, ctx, metas)
centerRouter := centerrt.New(config.HTTP, config.Center, cconf.Operations, dsCache, notifyConfigCache, promClients, redis, sso, ctx, metas, targetCache)
pushgwRouter := pushgwrt.New(config.HTTP, config.Pushgw, targetCache, busiGroupCache, idents, writers, ctx)
r := httpx.GinEngine(config.Global.RunMode, config.HTTP)

View File

@@ -4,6 +4,7 @@ import (
"fmt"
"net/http"
"path"
"runtime"
"strings"
"time"
@@ -31,12 +32,13 @@ type Router struct {
PromClients *prom.PromClientMap
Redis storage.Redis
MetaSet *metas.Set
TargetCache *memsto.TargetCacheType
Sso *sso.SsoClient
Ctx *ctx.Context
}
func New(httpConfig httpx.Config, center cconf.Center, operations cconf.Operation, ds *memsto.DatasourceCacheType, ncc *memsto.NotifyConfigCacheType,
pc *prom.PromClientMap, redis storage.Redis, sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set) *Router {
pc *prom.PromClientMap, redis storage.Redis, sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set, tc *memsto.TargetCacheType) *Router {
return &Router{
HTTP: httpConfig,
Center: center,
@@ -46,6 +48,7 @@ func New(httpConfig httpx.Config, center cconf.Center, operations cconf.Operatio
PromClients: pc,
Redis: redis,
MetaSet: metaSet,
TargetCache: tc,
Sso: sso,
Ctx: ctx,
}
@@ -93,12 +96,18 @@ func (rt *Router) configNoRoute(r *gin.Engine) {
switch suffix {
case "png", "jpeg", "jpg", "svg", "ico", "gif", "css", "js", "html", "htm", "gz", "zip", "map":
cwdarr := []string{"/"}
if runtime.GOOS == "windows" {
cwdarr[0] = ""
}
cwdarr = append(cwdarr, strings.Split(runner.Cwd, "/")...)
cwdarr = append(cwdarr, "pub")
cwdarr = append(cwdarr, strings.Split(c.Request.URL.Path, "/")...)
c.File(path.Join(cwdarr...))
default:
cwdarr := []string{"/"}
if runtime.GOOS == "windows" {
cwdarr[0] = ""
}
cwdarr = append(cwdarr, strings.Split(runner.Cwd, "/")...)
cwdarr = append(cwdarr, "pub")
cwdarr = append(cwdarr, "index.html")

View File

@@ -36,6 +36,17 @@ func (rt *Router) heartbeat(c *gin.Context) {
ginx.Dangerous(err)
req.Offset = (time.Now().UnixMilli() - req.UnixTime)
req.RemoteAddr = c.ClientIP()
rt.MetaSet.Set(req.Hostname, req)
ginx.NewRender(c).Message(nil)
gid := ginx.QueryInt64(c, "gid", 0)
if gid != 0 {
target, has := rt.TargetCache.Get(req.Hostname)
if has && target.GroupId != gid {
err = models.TargetUpdateBgid(rt.Ctx, []string{req.Hostname}, gid, false)
}
}
ginx.NewRender(c).Message(err)
}

View File

@@ -3,6 +3,7 @@ package router
import (
"bytes"
"encoding/json"
"fmt"
"html/template"
"github.com/ccfos/nightingale/v6/center/cconf"
@@ -22,6 +23,11 @@ func (rt *Router) notifyTplUpdateContent(c *gin.Context) {
var f models.NotifyTpl
ginx.BindJSON(c, &f)
if err := templateValidate(f); err != nil {
ginx.NewRender(c).Message(err.Error())
return
}
ginx.NewRender(c).Message(f.UpdateContent(rt.Ctx))
}
@@ -29,9 +35,25 @@ func (rt *Router) notifyTplUpdate(c *gin.Context) {
var f models.NotifyTpl
ginx.BindJSON(c, &f)
if err := templateValidate(f); err != nil {
ginx.NewRender(c).Message(err.Error())
return
}
ginx.NewRender(c).Message(f.Update(rt.Ctx))
}
// templateValidate checks that the content of a notify template can be parsed
// by html/template with tplx.TemplateFuncMap registered.
//
// Empty content is accepted without parsing. On a parse failure the returned
// error wraps the parser's error (message text unchanged), so callers can
// still inspect the underlying cause with errors.Is / errors.As.
func templateValidate(f models.NotifyTpl) error {
	if f.Content == "" {
		return nil
	}

	// Only the parse result matters here; the parsed template is discarded.
	if _, err := template.New(f.Channel).Funcs(tplx.TemplateFuncMap).Parse(f.Content); err != nil {
		return fmt.Errorf("notify template verify illegal:%w", err)
	}

	return nil
}
func (rt *Router) notifyTplPreview(c *gin.Context) {
var event models.AlertCurEvent
err := json.Unmarshal([]byte(cconf.EVENT_EXAMPLE), &event)

View File

@@ -13,6 +13,7 @@ alter table `board` add built_in tinyint(1) not null default 0 comment '0:false
alter table `board` add hide tinyint(1) not null default 0 comment '0:false 1:true';
alter table `chart_share` add datasource_id bigint unsigned not null default 0;
alter table `chart_share` drop dashboard_id;
alter table `alert_rule` add datasource_ids varchar(255) not null default '';
alter table `alert_rule` add rule_config text not null comment 'rule_config';
@@ -41,6 +42,9 @@ alter table `alert_his_event` add annotations text not null comment 'annotations
alter table `alert_his_event` add rule_config text not null comment 'rule_config';
alter table `alerting_engines` add datasource_id bigint unsigned not null default 0;
alter table `alerting_engines` change cluster engine_cluster varchar(128) not null default '' comment 'n9e engine cluster';
alter table `task_record` add event_id bigint not null comment 'event id' default 0;
CREATE TABLE `datasource`
(

View File

@@ -1,15 +1,12 @@
FROM --platform=$BUILDPLATFORM python:3-slim
FROM --platform=$TARGETPLATFORM python:3-slim
WORKDIR /app
ADD n9e /app
ADD etc /app
RUN mkdir -p /app/integrations
ADD n9e /app/
ADD etc /app/
ADD integrations /app/integrations/
ADD http://download.flashcat.cloud/wait /wait
RUN mkdir -p /app/pub && chmod +x /wait
ADD --chmod=755 https://github.com/ufoscout/docker-compose-wait/releases/download/2.11.0/wait_x86_64 /wait
ADD pub /app/pub/
RUN chmod +x n9e
EXPOSE 17000

View File

@@ -0,0 +1,13 @@
FROM --platform=$TARGETPLATFORM python:3-slim
WORKDIR /app
ADD n9e /app/
ADD etc /app/
ADD integrations /app/integrations/
ADD --chmod=755 https://github.com/ufoscout/docker-compose-wait/releases/download/2.11.0/wait_aarch64 /wait
ADD pub /app/pub/
EXPOSE 17000
CMD ["/app/n9e", "-h"]

View File

@@ -651,6 +651,7 @@ COMMENT ON COLUMN task_tpl_host.host IS 'ip or hostname';
CREATE TABLE task_record
(
id bigint not null ,
event_id bigint not null default 0,
group_id bigint not null ,
ibex_address varchar(128) not null,
ibex_auth_user varchar(128) not null default '',
@@ -669,22 +670,23 @@ CREATE TABLE task_record
) ;
CREATE INDEX task_record_cg_idx ON task_record (create_at, group_id);
CREATE INDEX task_record_create_by_idx ON task_record (create_by);
CREATE INDEX task_record_event_id_idx ON task_record (event_id);
COMMENT ON COLUMN task_record.id IS 'ibex task id';
COMMENT ON COLUMN task_record.group_id IS 'busi group id';
COMMENT ON COLUMN task_record.event_id IS 'event id';
CREATE TABLE alerting_engines
(
id serial,
instance varchar(128) not null default '' ,
datasource_id bigint not null default 0 ,
cluster varchar(128) not null default '' ,
engine_cluster varchar(128) not null default '' ,
clock bigint not null,
PRIMARY KEY (id)
) ;
COMMENT ON COLUMN alerting_engines.instance IS 'instance identification, e.g. 10.9.0.9:9090';
COMMENT ON COLUMN alerting_engines.datasource_id IS 'datasource id';
COMMENT ON COLUMN alerting_engines.cluster IS 'target reader cluster';
COMMENT ON COLUMN alerting_engines.engine_cluster IS 'target reader cluster';
CREATE TABLE datasource

View File

@@ -531,6 +531,7 @@ CREATE TABLE `task_tpl_host`
CREATE TABLE `task_record`
(
`id` bigint unsigned not null comment 'ibex task id',
`event_id` bigint not null comment 'event id' default 0,
`group_id` bigint not null comment 'busi group id',
`ibex_address` varchar(128) not null,
`ibex_auth_user` varchar(128) not null default '',
@@ -547,7 +548,8 @@ CREATE TABLE `task_record`
`create_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
KEY (`create_at`, `group_id`),
KEY (`create_by`)
KEY (`create_by`),
KEY (`event_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `alerting_engines`
@@ -555,12 +557,11 @@ CREATE TABLE `alerting_engines`
`id` int unsigned NOT NULL AUTO_INCREMENT,
`instance` varchar(128) not null default '' comment 'instance identification, e.g. 10.9.0.9:9090',
`datasource_id` bigint not null default 0 comment 'datasource id',
`cluster` varchar(128) not null default '' comment 'n9e-alert cluster',
`engine_cluster` varchar(128) not null default '' comment 'n9e-alert cluster',
`clock` bigint not null,
PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `datasource`
(
`id` int unsigned NOT NULL AUTO_INCREMENT,

7
fe.sh
View File

@@ -7,4 +7,9 @@ curl -o n9e-fe-${VERSION}.tar.gz -L https://github.com/n9e/fe/releases/download/
tar zxvf n9e-fe-${VERSION}.tar.gz
cp ./docker/initsql/a-n9e.sql n9e.sql
cp ./docker/initsql/a-n9e.sql n9e.sql
TAG=$(curl -sX GET https://api.github.com/repos/ccfos/nightingale/releases/latest | awk '/tag_name/{print $4;exit}' FS='[""]')
VERSION=$(echo $TAG)
VERSION=${VERSION/v/}
sed -i "s/nightingale:latest/nightingale:${VERSION}/g" ./docker/docker-compose.yaml

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

View File

@@ -0,0 +1,266 @@
[
{
"name": "KubeClientCertificateExpiration-S2",
"note": "A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "KubeClientCertificateExpiration-S1",
"note": "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "AggregatedAPIErrors",
"note": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "AggregatedAPIDown",
"note": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "KubeAPIDown",
"note": "KubeAPI has disappeared from Prometheus target discovery.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "absent(up{job=\"apiserver\"} == 1)\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "KubeAPIErrorBudgetBurn-S1-120秒",
"note": "The API server is burning too much error budget.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)\nand\nsum(apiserver_request:burnrate5m) > (14.40 * 0.01000)\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"long=1h",
"short=5m"
]
},
{
"name": "KubeAPIErrorBudgetBurn-S1-900秒",
"note": "The API server is burning too much error budget.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)\nand\nsum(apiserver_request:burnrate30m) > (6.00 * 0.01000)\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"long=6h",
"short=30m"
]
},
{
"name": "KubeAPIErrorBudgetBurn-S2-3600秒",
"note": "The API server is burning too much error budget.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 3600,
"prom_ql": "sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)\nand\nsum(apiserver_request:burnrate2h) > (3.00 * 0.01000)\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"long=1d",
"short=2h"
]
},
{
"name": "KubeAPIErrorBudgetBurn-S2-10800秒",
"note": "The API server is burning too much error budget.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 10800,
"prom_ql": "sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)\nand\nsum(apiserver_request:burnrate6h) > (1.00 * 0.01000)\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"long=3d",
"short=6h"
]
}
]

View File

@@ -0,0 +1,366 @@
[
{
"name": "Node状态异常",
"note": "{{ $labels.node }} has been unready for more than 15 minutes.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "Node不可达",
"note": "{{ $labels.node }} is unreachable and some workloads may be rescheduled.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "Node运行太多Pod",
"note": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "count by(node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(node) (\n kube_node_status_capacity_pods{job=\"kube-state-metrics\"} != 1\n) > 0.95\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "Node状态抖动",
"note": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (node) > 2\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "PLEG耗时高",
"note": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} >= 10\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "Pod启动耗时高",
"note": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\", metrics_path=\"/metrics\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"} > 60\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "客户端证书过期-S2",
"note": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_client_ttl_seconds < 604800\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "客户端证书过期-S1",
"note": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_client_ttl_seconds < 86400\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "服务端证书过期-S2",
"note": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_server_ttl_seconds < 604800\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "服务端证书过期-S1",
"note": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_server_ttl_seconds < 86400\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "客户端证书续签错误",
"note": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "服务证书续签错误",
"note": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "increase(kubelet_server_expiration_renew_errors[5m]) > 0\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "kubelet故障",
"note": "Kubelet has disappeared from Prometheus target discovery.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "absent(up{job=\"kubelet\"} == 1)\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
}
]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,642 @@
{
"name": "Kubernetes / API Server",
"tags": "Categraf",
"ident": "",
"configs": {
"version": "2.0.0",
"links": [],
"var": [],
"panels": [
{
"targets": [
{
"refId": "A",
"expr": "up{job=\"apiserver\"}",
"legend": "{{ instance }}"
}
],
"name": "API Server - Health Status",
"links": [],
"description": "apiserver的实例健康状态,0表示down,1表示up",
"custom": {
"showHeader": true,
"colorMode": "value",
"calc": "lastNotNull",
"displayMode": "seriesToRows"
},
"options": {
"valueMappings": [
{
"options": {
"0": {
"text": "DOWN"
},
"1": {
"text": "UP"
}
},
"type": "value"
},
{
"type": "special",
"result": {
"color": "#3fc453",
"text": "UP"
},
"match": {
"special": 1
}
},
{
"type": "special",
"result": {
"color": "#f80202",
"text": "DOWN"
},
"match": {
"special": 0
}
}
],
"standardOptions": {
"util": "none"
}
},
"overrides": [
{}
],
"version": "2.0.0",
"type": "table",
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 0,
"i": "98f46bc1-c078-40f2-915c-f0836957bf2f",
"isResizable": true
},
"id": "98f46bc1-c078-40f2-915c-f0836957bf2f"
},
{
"targets": [
{
"refId": "A",
"expr": "apiserver_requested_deprecated_apis{job=\"apiserver\"}",
"legend": ""
}
],
"name": "Deprecated Kubernetes Resources",
"links": [],
"description": "当前版本apiserver使用未来版本中要移除的资源",
"custom": {
"showHeader": true,
"colorMode": "value",
"calc": "lastNotNull",
"displayMode": "labelsOfSeriesToRows",
"columns": [
"group",
"version",
"resource",
"removed_release"
],
"sortOrder": "ascend"
},
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"version": "2.0.0",
"type": "table",
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 0,
"i": "73beb13a-bd10-4a68-bb9e-5b9ab63da154",
"isResizable": true
},
"id": "73beb13a-bd10-4a68-bb9e-5b9ab63da154"
},
{
"targets": [
{
"refId": "A",
"expr": "sum by (instance,code) (rate(apiserver_request_total{job=\"apiserver\"}[5m]))",
"legend": "{{ instance }} {{ code }}"
}
],
"name": "API Server - HTTP Requests by code",
"links": [],
"description": "按照返回码分类统计apiserver请求数",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"lineWidth": 1,
"fillOpacity": 0.25,
"gradientMode": "none",
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 8,
"i": "1cfa42b1-9dcf-471c-90ff-8ffe656d4b11",
"isResizable": true
},
"id": "1cfa42b1-9dcf-471c-90ff-8ffe656d4b11"
},
{
"targets": [
{
"refId": "A",
"expr": "sum by (instance,verb) (rate(apiserver_request_total{job=\"apiserver\"}[5m]))",
"legend": "{{ instance }} {{ verb }}"
}
],
"name": "API Server - HTTP Requests by verb",
"links": [],
"description": "按照请求动作分类统计apiserver的请求数",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"lineWidth": 1,
"fillOpacity": 0.25,
"gradientMode": "none",
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 8,
"i": "94def0cb-0b86-42f7-a4b2-dde714bbb918",
"isResizable": true
},
"id": "94def0cb-0b86-42f7-a4b2-dde714bbb918"
},
{
"targets": [
{
"refId": "A",
"expr": "apiserver_current_inflight_requests{job=\"apiserver\"}",
"legend": "{{ instance }} {{ request_kind }}"
}
],
"name": "API Server - Current Inflight Requests by kind",
"links": [],
"description": "当前并发请求apiserver的数量",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"lineWidth": 1,
"fillOpacity": 0.25,
"gradientMode": "none",
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 16,
"i": "ce5a15ad-11c6-44a2-a071-be57009162e1",
"isResizable": true
},
"id": "fb6266a3-3da0-4310-bfe8-c64a53db5db3"
},
{
"targets": [
{
"refId": "A",
"expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\"}[5m])) by (instance,verb,le))*1000",
"legend": "{{ instance }} {{ verb }}"
}
],
"name": "API Server - HTTP Requests Latency by verb",
"links": [],
"description": "apiserver的响应延迟按请求动作分类统计",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"lineWidth": 1,
"fillOpacity": 0.25,
"gradientMode": "none",
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 16,
"i": "045dca2d-d69b-47a7-b25e-656adb357e11",
"isResizable": true
},
"id": "045dca2d-d69b-47a7-b25e-656adb357e11"
},
{
"targets": [
{
"refId": "A",
"expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket {job=\"apiserver\",verb!=\"WATCH\"}[5m])) by (instance,le))*1000",
"legend": "{{ instance }}"
}
],
"name": "API Server - HTTP Requests Latency by instance",
"links": [],
"description": "apiserver的响应延迟(非watch请求)",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"lineWidth": 1,
"fillOpacity": 0.25,
"gradientMode": "none",
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 24,
"i": "1e775704-9ee4-45ce-9d24-b49af89fb5c7",
"isResizable": true
},
"id": "1e775704-9ee4-45ce-9d24-b49af89fb5c7"
},
{
"targets": [
{
"refId": "A",
"expr": "sum by(instance,verb) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\"}[5m]))\n / sum by(instance,verb) (rate(apiserver_request_total{job=\"apiserver\"}[5m]))",
"legend": "{{ instance }} {{ verb }}"
}
],
"name": "API Server - Errors by verb",
"links": [],
"description": "apiserver的5xx错误率按请求动作分类统计",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"lineWidth": 1,
"fillOpacity": 0.25,
"gradientMode": "none",
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 24,
"i": "1ca62e0b-72df-47d1-93ba-048ed49e9cb5",
"isResizable": true
},
"id": "1ca62e0b-72df-47d1-93ba-048ed49e9cb5"
},
{
"targets": [
{
"refId": "A",
"expr": "sum by(instance) (rate(apiserver_request_total{code=~\"5..\", job=\"apiserver\"}[5m]))\n / sum by(instance) (rate(apiserver_request_total{job=\"apiserver\"}[5m]))",
"legend": "{{ instance }}"
}
],
"name": "API Server - Errors by Instance",
"links": [],
"description": "apiserver的5xx 错误率(5xx请求数/总请求数)",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"lineWidth": 1,
"fillOpacity": 0.25,
"gradientMode": "none",
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 32,
"i": "92a209a1-7d30-4627-9ae1-55ded5095ed7",
"isResizable": true
},
"id": "92a209a1-7d30-4627-9ae1-55ded5095ed7"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(rate(workqueue_depth{job=\"apiserver\"}[5m])) by (instance,name)",
"legend": "{{ instance }} {{ name }}"
}
],
"name": "API Server - Work Queue by instance",
"links": [],
"description": "apiserver工作队列深度,越接近0越好",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"lineWidth": 1,
"fillOpacity": 0.25,
"gradientMode": "none",
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 32,
"i": "83f22cf4-9c65-4ad3-900b-fa6fc914dd88",
"isResizable": true
},
"id": "83f22cf4-9c65-4ad3-900b-fa6fc914dd88"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) by (instance)",
"legend": "{{ instance }}"
}
],
"name": "API Server - HTTP Requests by instance",
"links": [],
"description": "5分钟内apiserver的请求数统计",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"lineWidth": 1,
"fillOpacity": 0.25,
"gradientMode": "none",
"stack": "normal"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 40,
"i": "3e9f9df7-d9fb-4791-b3b2-2c52678f060f",
"isResizable": true
},
"id": "3e9f9df7-d9fb-4791-b3b2-2c52678f060f"
},
{
"targets": [
{
"refId": "A",
"expr": "rate(process_cpu_seconds_total{job=\"apiserver\"}[5m])",
"legend": "{{ instance }}"
}
],
"name": "API Server - CPU Usage by instance",
"links": [],
"description": "apiserver的cpu使用率",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "percent",
"decimals": 2
},
"thresholds": {
"steps": []
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"lineWidth": 1,
"fillOpacity": 0.25,
"gradientMode": "none",
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 40,
"i": "3d5c1ae5-e640-4986-9202-78258169bffb",
"isResizable": true
},
"id": "3d5c1ae5-e640-4986-9202-78258169bffb"
},
{
"targets": [
{
"refId": "A",
"expr": "process_resident_memory_bytes{job=\"apiserver\"}",
"legend": "{{ instance }}"
}
],
"name": "API Server - Memory Usage by instance",
"links": [],
"description": "apiserver的内存使用量",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": []
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"lineWidth": 1,
"fillOpacity": 0.25,
"gradientMode": "none",
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 48,
"i": "1550a2d5-c808-4174-865a-a41b2c16b486",
"isResizable": true
},
"id": "1550a2d5-c808-4174-865a-a41b2c16b486"
}
]
}
}

View File

@@ -0,0 +1,271 @@
{
"name": "Cadvisor",
"tags": "",
"ident": "",
"configs": {
"version": "2.0.0",
"links": [],
"var": [
{
"type": "query",
"name": "host",
"definition": "label_values({__name__=~\"container.*\"},instance)",
"allValue": ".*",
"allOption": true,
"multi": false,
"reg": ""
},
{
"type": "query",
"name": "container",
"definition": "label_values({__name__=~\"container.*\", instance=~\"$host\"},name)",
"allValue": ".*",
"allOption": true,
"multi": false,
"reg": ""
}
],
"panels": [
{
"version": "2.0.0",
"id": "eeb56afe-8a3e-46d6-8923-aeb3d0f124ea",
"type": "timeseries",
"name": "CPU Usage",
"links": [],
"layout": {
"h": 7,
"w": 24,
"x": 0,
"y": 0,
"i": "eeb56afe-8a3e-46d6-8923-aeb3d0f124ea",
"isResizable": true
},
"targets": [
{
"refId": "A",
"expr": "sum(rate(container_cpu_usage_seconds_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name) *100",
"legend": "{{name}}"
}
],
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"version": "2.0.0",
"drawStyle": "lines",
"lineInterpolation": "linear",
"fillOpacity": 0.5,
"stack": "off"
},
"maxPerRow": 4
},
{
"version": "2.0.0",
"id": "6690fff4-c159-40e5-b340-65a3ba85e37e",
"type": "timeseries",
"name": "Memory Usage",
"links": [],
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 7,
"i": "6690fff4-c159-40e5-b340-65a3ba85e37e",
"isResizable": true
},
"targets": [
{
"refId": "A",
"expr": "sum(container_memory_rss{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
"legend": "{{name}}"
}
],
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"version": "2.0.0",
"drawStyle": "lines",
"lineInterpolation": "linear",
"fillOpacity": 0.5,
"stack": "off"
},
"maxPerRow": 4
},
{
"version": "2.0.0",
"id": "3c798af5-cfae-4962-9b70-85736df44bb1",
"type": "timeseries",
"name": "Memory Cached",
"links": [],
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 7,
"i": "3c798af5-cfae-4962-9b70-85736df44bb1",
"isResizable": true
},
"targets": [
{
"refId": "A",
"expr": "sum(container_memory_cache{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
"legend": "{{name}}"
}
],
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"version": "2.0.0",
"drawStyle": "lines",
"lineInterpolation": "linear",
"fillOpacity": 0.5,
"stack": "off"
},
"maxPerRow": 4
},
{
"version": "2.0.0",
"id": "111835e1-cfb5-40db-bb52-1aca74cf1a00",
"type": "timeseries",
"name": "Received Network Traffic",
"links": [],
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 15,
"i": "111835e1-cfb5-40db-bb52-1aca74cf1a00",
"isResizable": true
},
"targets": [
{
"refId": "A",
"expr": "sum(rate(container_network_receive_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
"legend": "{{name}}"
}
],
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"version": "2.0.0",
"drawStyle": "lines",
"lineInterpolation": "linear",
"fillOpacity": 0.5,
"stack": "off"
},
"maxPerRow": 4
},
{
"version": "2.0.0",
"id": "b8050f8f-aee7-4fa5-888d-b6025df14aa1",
"type": "timeseries",
"name": "Sent Network Traffic",
"links": [],
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 15,
"i": "b8050f8f-aee7-4fa5-888d-b6025df14aa1",
"isResizable": true
},
"targets": [
{
"refId": "A",
"expr": "sum(rate(container_network_transmit_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
"legend": "{{name}}"
}
],
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"version": "2.0.0",
"drawStyle": "lines",
"lineInterpolation": "linear",
"fillOpacity": 0.5,
"stack": "off"
},
"maxPerRow": 4
}
]
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,438 @@
{
"name": "Kubernetes / Kubelet Metrics",
"tags": "Categraf",
"ident": "",
"configs": {
"var": [
{
"name": "cluster",
"definition": "label_values(kubelet_running_pods, cluster)",
"multi": true,
"allOption": true
},
{
"name": "instance",
"definition": "label_values(kubelet_running_pods{cluster=~\"$cluster\"}, instance)",
"multi": true,
"allOption": true
}
],
"panels": [
{
"targets": [
{
"refId": "A",
"expr": "sum(up{source=\"kubelet\", cluster=~\"$cluster\"})"
}
],
"name": "Kubelet UP",
"custom": {
"textMode": "value",
"colorMode": "background",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 4,
"x": 0,
"y": 0,
"i": "d3caf396-b3a1-449b-acec-f550967889e6",
"isResizable": true
},
"id": "d3caf396-b3a1-449b-acec-f550967889e6"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(kubelet_running_pods{cluster=~\"$cluster\", instance=~\"$instance\"})"
}
],
"name": "Running Pods",
"custom": {
"textMode": "valueAndName",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 4,
"x": 4,
"y": 0,
"i": "38c38b23-a7e3-4177-8c41-3ce955ea0434",
"isResizable": true
},
"id": "38c38b23-a7e3-4177-8c41-3ce955ea0434"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(kubelet_running_containers{cluster=~\"$cluster\", instance=~\"$instance\", container_state=\"running\"})"
}
],
"name": "Running Containers",
"custom": {
"textMode": "valueAndName",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 4,
"x": 8,
"y": 0,
"i": "26bf2320-fcff-48f8-a6fc-aa9076bb9329",
"isResizable": true
},
"id": "525859b9-91d7-4180-b363-bf8ceec977d8"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(volume_manager_total_volumes{cluster=~\"$cluster\", instance=~\"$instance\", state=\"desired_state_of_world\"})"
}
],
"name": "Desired Volumes",
"custom": {
"textMode": "valueAndName",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 4,
"x": 12,
"y": 0,
"i": "54ae4ab3-e932-418c-a637-f2f515cce1b9",
"isResizable": true
},
"id": "84af4617-2ae0-4b30-a82a-6e8586342224"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(volume_manager_total_volumes{cluster=~\"$cluster\", instance=~\"$instance\", state=\"actual_state_of_world\"})"
}
],
"name": "Actual Volumes",
"custom": {
"textMode": "valueAndName",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 4,
"x": 16,
"y": 0,
"i": "d9de76d7-2203-40e7-a792-9888ec869e82",
"isResizable": true
},
"id": "d431f4bd-9115-41d2-a494-1d680bdd1e0f"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(increase(kubelet_runtime_operations_errors_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]))"
}
],
"name": "OP Errors in 5min",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [
{
"type": "range",
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
}
},
{
"type": "range",
"match": {
"to": 1
},
"result": {
"color": "#417505"
}
}
],
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 4,
"x": 20,
"y": 0,
"i": "bf2bbd15-347d-404c-9b8f-e524875befe2",
"isResizable": true
},
"id": "54de62bc-8af3-4c27-8b8e-1af567b363fc"
},
{
"type": "row",
"id": "730d4a9b-791f-4aaf-a042-668f66e73814",
"name": "Operations",
"collapsed": true,
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 3,
"i": "730d4a9b-791f-4aaf-a042-668f66e73814",
"isResizable": false
},
"panels": []
},
{
"targets": [
{
"refId": "A",
"expr": "increase(kubelet_runtime_operations_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])"
}
],
"name": "Operations in 5min",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 4,
"i": "d26e6818-6704-492a-8cbf-58473dd85716",
"isResizable": true
},
"id": "d26e6818-6704-492a-8cbf-58473dd85716"
},
{
"targets": [
{
"refId": "A",
"expr": "increase(kubelet_runtime_operations_errors_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])"
}
],
"name": "Operation Errors in 5min",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 4,
"i": "4e585d2f-c61c-4350-86ec-dca7ddc34ceb",
"isResizable": true
},
"id": "09a6ad5b-8c0e-4f17-b17f-3ebc514f7d20"
},
{
"targets": [
{
"refId": "A",
"expr": "increase(kubelet_runtime_operations_duration_seconds_sum{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])/increase(kubelet_runtime_operations_duration_seconds_count{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])"
}
],
"name": "Average Operation duration in 1 hour (Unit: Second)",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 24,
"x": 0,
"y": 8,
"i": "b5e56f3e-fa20-4c19-8578-c0610fa0a7e7",
"isResizable": true
},
"id": "b5e56f3e-fa20-4c19-8578-c0610fa0a7e7"
},
{
"type": "row",
"id": "dd7e84c5-03ce-467c-871a-aa110fe051f4",
"name": "PLEG relist",
"collapsed": true,
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 12,
"i": "dd7e84c5-03ce-467c-871a-aa110fe051f4",
"isResizable": false
}
},
{
"targets": [
{
"refId": "A",
"expr": "rate(kubelet_pleg_relist_duration_seconds_count{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])"
}
],
"name": "relist rate",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 13,
"i": "f3822da8-a9c9-4db1-ba12-465d3ece823e",
"isResizable": true
},
"id": "f3822da8-a9c9-4db1-ba12-465d3ece823e"
},
{
"targets": [
{
"refId": "A",
"expr": "increase(kubelet_pleg_relist_duration_seconds_sum{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])/increase(kubelet_pleg_relist_duration_seconds_count{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])"
}
],
"name": "relist duration (Unit: Second)",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 13,
"i": "2b4ada76-6c30-42cd-9bd3-c939b4c0139c",
"isResizable": true
},
"id": "a6e4c914-bfca-4419-a264-f5b1cbab261a"
}
],
"version": "2.0.0"
}
}

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

View File

@@ -1437,7 +1437,7 @@
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC",
"util": "bitsIEC",
"decimals": 0
},
"thresholds": {
@@ -1732,4 +1732,4 @@
],
"version": "3.0.0"
}
}
}

View File

@@ -0,0 +1,68 @@
[
{
"cate": "prometheus",
"datasource_ids": [
0
],
"name": "Too many running processes",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "processes_running > (system_n_cpus * 3)",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 1,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 10,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {}
}
]

View File

@@ -0,0 +1,234 @@
{
"name": "Processes by UlricQin",
"tags": "Categraf Linux OS",
"ident": "",
"configs": {
"var": [
{
"name": "Datasource",
"label": "",
"type": "datasource",
"definition": "prometheus",
"defaultValue": 37
},
{
"name": "ident",
"label": "Host",
"type": "query",
"datasource": {
"cate": "prometheus",
"value": "${Datasource}"
},
"definition": "label_values(processes_running, ident)",
"multi": true,
"allOption": true
}
],
"panels": [
{
"type": "barGauge",
"id": "adc3f1d3-6d0d-4c1e-80ca-5b6d8103bac5",
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 0,
"i": "adc3f1d3-6d0d-4c1e-80ca-5b6d8103bac5",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "processes_running{ident=~\"$ident\"}",
"legend": "{{ident}}",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Running Processes",
"custom": {
"calc": "lastNotNull",
"baseColor": "#9470FF",
"serieWidth": 20,
"sortOrder": "desc"
},
"options": {
"valueMappings": [
{
"type": "range",
"result": {
"color": "#f10808"
},
"match": {
"from": 50
}
}
],
"standardOptions": {}
}
},
{
"type": "barGauge",
"id": "659f5f75-24ca-493c-97cb-3d99abd52172",
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 0,
"i": "df457bf0-17c8-4d05-a527-cfaf0f2b844c",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "processes_total{ident=~\"$ident\"}",
"legend": "{{ident}}",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Total Processes",
"custom": {
"calc": "lastNotNull",
"baseColor": "#9470FF",
"serieWidth": 20,
"sortOrder": "desc"
},
"options": {
"valueMappings": [
{
"type": "range",
"result": {
"color": "#f10808"
},
"match": {
"from": 600
}
}
],
"standardOptions": {}
}
},
{
"type": "barGauge",
"id": "5e849509-1c41-44c7-85ee-d8c0adf7c623",
"layout": {
"h": 8,
"w": 12,
"x": 0,
"y": 8,
"i": "62291285-be84-470a-9ccc-53be7a8733fd",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "processes_total_threads{ident=~\"$ident\"}",
"legend": "{{ident}}",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Total Threads",
"custom": {
"calc": "lastNotNull",
"baseColor": "#9470FF",
"serieWidth": 20,
"sortOrder": "desc"
},
"options": {
"valueMappings": [
{
"type": "range",
"result": {
"color": "#ff8286"
},
"match": {
"from": 2000
}
},
{
"type": "range",
"result": {
"color": "#f30909"
},
"match": {
"from": 4000
}
}
],
"standardOptions": {}
}
},
{
"type": "table",
"id": "b2850506-6cdd-48cc-9223-70acff9212b0",
"layout": {
"h": 8,
"w": 12,
"x": 12,
"y": 8,
"i": "b2850506-6cdd-48cc-9223-70acff9212b0",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "sum({__name__=~\"processes_sleeping|processes_dead|processes_paging|processes_total_threads|processes_total|processes_idle|processes_running|processes_zombies|processes_stopped|processes_unknown|processes_blocked\", ident=~\"$ident\"}) by (__name__)",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "SUM by Process state",
"custom": {
"showHeader": true,
"colorMode": "value",
"calc": "lastNotNull",
"displayMode": "labelsOfSeriesToRows",
"sortColumn": "value",
"sortOrder": "descend",
"columns": []
},
"options": {
"standardOptions": {}
},
"overrides": [
{}
]
}
],
"version": "3.0.0"
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.7 KiB

View File

@@ -0,0 +1,22 @@
## Categraf as collector
configuration file: `conf/input.processes/processes.toml`
默认配置如下(一般维持默认不用动):
```toml
# # collect interval
# interval = 15
# # force use ps command to gather
# force_ps = false
# # force use /proc to gather
# force_proc = false
```
有两种采集方式,使用 ps 命令,或者直接读取 `/proc` 目录,默认是后者。如果想强制使用 ps 命令来采集,开启 force_ps 即可:
```toml
force_ps = true
```

View File

@@ -425,18 +425,6 @@ func AlertNumbers(ctx *ctx.Context, bgids []int64) (map[int64]int64, error) {
return ret, nil
}
// AlertCurEventGetAll loads current alert events from the database.
// When cluster is non-empty, only rows whose "cluster" column equals it are
// returned; an empty cluster applies no filter. The slice populated by the
// query is returned together with any error reported by the query.
func AlertCurEventGetAll(ctx *ctx.Context, cluster string) ([]*AlertCurEvent, error) {
	var events []*AlertCurEvent

	query := DB(ctx).Model(&AlertCurEvent{})
	if cluster != "" {
		// Narrow the result set to the requested cluster only.
		query = query.Where("cluster = ?", cluster)
	}

	if err := query.Find(&events).Error; err != nil {
		return events, err
	}
	return events, nil
}
func AlertCurEventGetByIds(ctx *ctx.Context, ids []int64) ([]*AlertCurEvent, error) {
var lst []*AlertCurEvent

View File

@@ -278,6 +278,10 @@ func (ar *AlertRule) Update(ctx *ctx.Context, arf AlertRule) error {
}
func (ar *AlertRule) UpdateColumn(ctx *ctx.Context, column string, value interface{}) error {
if value == nil {
return nil
}
if column == "datasource_ids" {
b, err := json.Marshal(value)
if err != nil {
@@ -286,6 +290,68 @@ func (ar *AlertRule) UpdateColumn(ctx *ctx.Context, column string, value interfa
return DB(ctx).Model(ar).UpdateColumn(column, string(b)).Error
}
if column == "severity" {
severity := int(value.(float64))
if ar.Cate == PROMETHEUS {
var ruleConfig PromRuleConfig
err := json.Unmarshal([]byte(ar.RuleConfig), &ruleConfig)
if err != nil {
return err
}
if len(ruleConfig.Queries) != 1 {
return nil
}
ruleConfig.Queries[0].Severity = severity
b, err := json.Marshal(ruleConfig)
if err != nil {
return err
}
return DB(ctx).Model(ar).UpdateColumn("rule_config", string(b)).Error
} else if ar.Cate == HOST {
var ruleConfig HostRuleConfig
err := json.Unmarshal([]byte(ar.RuleConfig), &ruleConfig)
if err != nil {
return err
}
if len(ruleConfig.Triggers) != 1 {
return nil
}
ruleConfig.Triggers[0].Severity = severity
b, err := json.Marshal(ruleConfig)
if err != nil {
return err
}
return DB(ctx).Model(ar).UpdateColumn("rule_config", string(b)).Error
}
}
if column == "runbook_url" {
url := value.(string)
err := json.Unmarshal([]byte(ar.Annotations), &ar.AnnotationsJSON)
if err != nil {
return err
}
if ar.AnnotationsJSON == nil {
ar.AnnotationsJSON = make(map[string]string)
}
ar.AnnotationsJSON["runbook_url"] = url
b, err := json.Marshal(ar.AnnotationsJSON)
if err != nil {
return err
}
return DB(ctx).Model(ar).UpdateColumn("annotations", string(b)).Error
}
return DB(ctx).Model(ar).UpdateColumn(column, value).Error
}

View File

@@ -8,11 +8,11 @@ import (
)
type AlertingEngines struct {
Id int64 `json:"id" gorm:"primaryKey"`
Instance string `json:"instance"`
Cluster string `json:"cluster"`
DatasourceId int64 `json:"datasource_id"`
Clock int64 `json:"clock"`
Id int64 `json:"id" gorm:"primaryKey"`
Instance string `json:"instance"`
EngineCluster string `json:"cluster" gorm:"engine_cluster"`
DatasourceId int64 `json:"datasource_id"`
Clock int64 `json:"clock"`
}
func (e *AlertingEngines) TableName() string {
@@ -107,11 +107,11 @@ func AlertingEngineGet(ctx *ctx.Context, where string, args ...interface{}) (*Al
func AlertingEngineGetsClusters(ctx *ctx.Context, where string, args ...interface{}) ([]string, error) {
var arr []string
var err error
session := DB(ctx).Model(new(AlertingEngines)).Where("cluster != ''").Order("cluster").Distinct("cluster")
session := DB(ctx).Model(new(AlertingEngines)).Where("engine_cluster != ''").Order("engine_cluster").Distinct("engine_cluster")
if where == "" {
err = session.Pluck("cluster", &arr).Error
err = session.Pluck("engine_cluster", &arr).Error
} else {
err = session.Where(where, args...).Pluck("cluster", &arr).Error
err = session.Where(where, args...).Pluck("engine_cluster", &arr).Error
}
return arr, err
}
@@ -130,7 +130,7 @@ func AlertingEngineGetsInstances(ctx *ctx.Context, where string, args ...interfa
func AlertingEngineHeartbeatWithCluster(ctx *ctx.Context, instance, cluster string, datasourceId int64) error {
var total int64
err := DB(ctx).Model(new(AlertingEngines)).Where("instance=? and cluster = ? and datasource_id=?", instance, cluster, datasourceId).Count(&total).Error
err := DB(ctx).Model(new(AlertingEngines)).Where("instance=? and engine_cluster = ? and datasource_id=?", instance, cluster, datasourceId).Count(&total).Error
if err != nil {
return err
}
@@ -138,15 +138,15 @@ func AlertingEngineHeartbeatWithCluster(ctx *ctx.Context, instance, cluster stri
if total == 0 {
// insert
err = DB(ctx).Create(&AlertingEngines{
Instance: instance,
DatasourceId: datasourceId,
Cluster: cluster,
Clock: time.Now().Unix(),
Instance: instance,
DatasourceId: datasourceId,
EngineCluster: cluster,
Clock: time.Now().Unix(),
}).Error
} else {
// updates
fields := map[string]interface{}{"clock": time.Now().Unix()}
err = DB(ctx).Model(new(AlertingEngines)).Where("instance=? and cluster = ? and datasource_id=?", instance, cluster, datasourceId).Updates(fields).Error
err = DB(ctx).Model(new(AlertingEngines)).Where("instance=? and engine_cluster = ? and datasource_id=?", instance, cluster, datasourceId).Updates(fields).Error
}
return err

View File

@@ -12,6 +12,7 @@ type HostMeta struct {
MemUtil float64 `json:"mem_util"`
Offset int64 `json:"offset"`
UnixTime int64 `json:"unixtime"`
RemoteAddr string `json:"remote_addr"`
}
func (h HostMeta) MarshalBinary() ([]byte, error) {

View File

@@ -22,14 +22,15 @@ type Target struct {
TagsMap map[string]string `json:"-" gorm:"-"` // internal use, append tags to series
UpdateAt int64 `json:"update_at"`
UnixTime int64 `json:"unixtime" gorm:"-"`
Offset int64 `json:"offset" gorm:"-"`
TargetUp float64 `json:"target_up" gorm:"-"`
MemUtil float64 `json:"mem_util" gorm:"-"`
CpuNum int `json:"cpu_num" gorm:"-"`
CpuUtil float64 `json:"cpu_util" gorm:"-"`
OS string `json:"os" gorm:"-"`
Arch string `json:"arch" gorm:"-"`
UnixTime int64 `json:"unixtime" gorm:"-"`
Offset int64 `json:"offset" gorm:"-"`
TargetUp float64 `json:"target_up" gorm:"-"`
MemUtil float64 `json:"mem_util" gorm:"-"`
CpuNum int `json:"cpu_num" gorm:"-"`
CpuUtil float64 `json:"cpu_util" gorm:"-"`
OS string `json:"os" gorm:"-"`
Arch string `json:"arch" gorm:"-"`
RemoteAddr string `json:"remote_addr" gorm:"-"`
}
func (t *Target) TableName() string {
@@ -296,6 +297,7 @@ func (t *Target) FillMeta(meta *HostMeta) {
t.Offset = meta.Offset
t.OS = meta.OS
t.Arch = meta.Arch
t.RemoteAddr = meta.RemoteAddr
}
func TargetIdents(ctx *ctx.Context, ids []int64) ([]string, error) {

View File

@@ -4,6 +4,7 @@ import "github.com/ccfos/nightingale/v6/pkg/ctx"
type TaskRecord struct {
Id int64 `json:"id" gorm:"primaryKey"`
EventId int64 `json:"event_id"`
GroupId int64 `json:"group_id"`
IbexAddress string `json:"ibex_address"`
IbexAuthUser string `json:"ibex_auth_user"`