mirror of
https://github.com/ccfos/nightingale.git
synced 2026-03-08 00:49:00 +00:00
Compare commits
22 Commits
dashboard_
...
docker_rel
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3a075e7681 | ||
|
|
4ec5612d78 | ||
|
|
817ed0ab1b | ||
|
|
63aa615761 | ||
|
|
2a36902760 | ||
|
|
bca9331182 | ||
|
|
199a23e385 | ||
|
|
c733f16cc7 | ||
|
|
81585649aa | ||
|
|
2c4422d657 | ||
|
|
aaf66cb386 | ||
|
|
cfed4d8318 | ||
|
|
606cd538ec | ||
|
|
bafb3b2546 | ||
|
|
9a0224697f | ||
|
|
23156552db | ||
|
|
36bca795fa | ||
|
|
b5503ae93e | ||
|
|
3c102e47ed | ||
|
|
60bf8139b1 | ||
|
|
fc0d077c9f | ||
|
|
3a610f7ea0 |
@@ -115,7 +115,7 @@ dockers:
|
||||
goarch: arm64
|
||||
ids:
|
||||
- build
|
||||
dockerfile: docker/Dockerfile.goreleaser
|
||||
dockerfile: docker/Dockerfile.goreleaser.arm64
|
||||
extra_files:
|
||||
- pub
|
||||
- etc
|
||||
|
||||
@@ -154,6 +154,7 @@ func handleIbex(ctx *ctx.Context, url string, event *models.AlertCurEvent, targe
|
||||
// write db
|
||||
record := models.TaskRecord{
|
||||
Id: res.Dat,
|
||||
EventId: event.Id,
|
||||
GroupId: tpl.GroupId,
|
||||
IbexAddress: ibexConf.Address,
|
||||
IbexAuthUser: ibexConf.BasicAuthUser,
|
||||
|
||||
@@ -49,7 +49,7 @@ func (ds *DingtalkSender) Send(ctx MessageContext) {
|
||||
body = dingtalk{
|
||||
Msgtype: "markdown",
|
||||
Markdown: dingtalkMarkdown{
|
||||
Title: ctx.Rule.Name,
|
||||
Title: ctx.Event.RuleName,
|
||||
Text: message,
|
||||
},
|
||||
}
|
||||
@@ -57,7 +57,7 @@ func (ds *DingtalkSender) Send(ctx MessageContext) {
|
||||
body = dingtalk{
|
||||
Msgtype: "markdown",
|
||||
Markdown: dingtalkMarkdown{
|
||||
Title: ctx.Rule.Name,
|
||||
Title: ctx.Event.RuleName,
|
||||
Text: message + "\n" + strings.Join(ats, " "),
|
||||
},
|
||||
At: dingtalkAt{
|
||||
|
||||
@@ -31,7 +31,7 @@ func (es *EmailSender) Send(ctx MessageContext) {
|
||||
if es.subjectTpl != nil {
|
||||
subject = BuildTplMessage(es.subjectTpl, ctx.Event)
|
||||
} else {
|
||||
subject = ctx.Rule.Name
|
||||
subject = ctx.Event.RuleName
|
||||
}
|
||||
content := BuildTplMessage(es.contentTpl, ctx.Event)
|
||||
es.WriteEmail(subject, content, tos)
|
||||
|
||||
@@ -78,7 +78,7 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
|
||||
writers := writer.NewWriters(config.Pushgw)
|
||||
|
||||
alertrtRouter := alertrt.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors)
|
||||
centerRouter := centerrt.New(config.HTTP, config.Center, cconf.Operations, dsCache, notifyConfigCache, promClients, redis, sso, ctx, metas)
|
||||
centerRouter := centerrt.New(config.HTTP, config.Center, cconf.Operations, dsCache, notifyConfigCache, promClients, redis, sso, ctx, metas, targetCache)
|
||||
pushgwRouter := pushgwrt.New(config.HTTP, config.Pushgw, targetCache, busiGroupCache, idents, writers, ctx)
|
||||
|
||||
r := httpx.GinEngine(config.Global.RunMode, config.HTTP)
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"path"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -31,12 +32,13 @@ type Router struct {
|
||||
PromClients *prom.PromClientMap
|
||||
Redis storage.Redis
|
||||
MetaSet *metas.Set
|
||||
TargetCache *memsto.TargetCacheType
|
||||
Sso *sso.SsoClient
|
||||
Ctx *ctx.Context
|
||||
}
|
||||
|
||||
func New(httpConfig httpx.Config, center cconf.Center, operations cconf.Operation, ds *memsto.DatasourceCacheType, ncc *memsto.NotifyConfigCacheType,
|
||||
pc *prom.PromClientMap, redis storage.Redis, sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set) *Router {
|
||||
pc *prom.PromClientMap, redis storage.Redis, sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set, tc *memsto.TargetCacheType) *Router {
|
||||
return &Router{
|
||||
HTTP: httpConfig,
|
||||
Center: center,
|
||||
@@ -46,6 +48,7 @@ func New(httpConfig httpx.Config, center cconf.Center, operations cconf.Operatio
|
||||
PromClients: pc,
|
||||
Redis: redis,
|
||||
MetaSet: metaSet,
|
||||
TargetCache: tc,
|
||||
Sso: sso,
|
||||
Ctx: ctx,
|
||||
}
|
||||
@@ -93,12 +96,18 @@ func (rt *Router) configNoRoute(r *gin.Engine) {
|
||||
switch suffix {
|
||||
case "png", "jpeg", "jpg", "svg", "ico", "gif", "css", "js", "html", "htm", "gz", "zip", "map":
|
||||
cwdarr := []string{"/"}
|
||||
if runtime.GOOS == "windows" {
|
||||
cwdarr[0] = ""
|
||||
}
|
||||
cwdarr = append(cwdarr, strings.Split(runner.Cwd, "/")...)
|
||||
cwdarr = append(cwdarr, "pub")
|
||||
cwdarr = append(cwdarr, strings.Split(c.Request.URL.Path, "/")...)
|
||||
c.File(path.Join(cwdarr...))
|
||||
default:
|
||||
cwdarr := []string{"/"}
|
||||
if runtime.GOOS == "windows" {
|
||||
cwdarr[0] = ""
|
||||
}
|
||||
cwdarr = append(cwdarr, strings.Split(runner.Cwd, "/")...)
|
||||
cwdarr = append(cwdarr, "pub")
|
||||
cwdarr = append(cwdarr, "index.html")
|
||||
|
||||
@@ -36,6 +36,17 @@ func (rt *Router) heartbeat(c *gin.Context) {
|
||||
ginx.Dangerous(err)
|
||||
|
||||
req.Offset = (time.Now().UnixMilli() - req.UnixTime)
|
||||
req.RemoteAddr = c.ClientIP()
|
||||
rt.MetaSet.Set(req.Hostname, req)
|
||||
ginx.NewRender(c).Message(nil)
|
||||
|
||||
gid := ginx.QueryInt64(c, "gid", 0)
|
||||
|
||||
if gid != 0 {
|
||||
target, has := rt.TargetCache.Get(req.Hostname)
|
||||
if has && target.GroupId != gid {
|
||||
err = models.TargetUpdateBgid(rt.Ctx, []string{req.Hostname}, gid, false)
|
||||
}
|
||||
}
|
||||
|
||||
ginx.NewRender(c).Message(err)
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package router
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html/template"
|
||||
|
||||
"github.com/ccfos/nightingale/v6/center/cconf"
|
||||
@@ -22,6 +23,11 @@ func (rt *Router) notifyTplUpdateContent(c *gin.Context) {
|
||||
var f models.NotifyTpl
|
||||
ginx.BindJSON(c, &f)
|
||||
|
||||
if err := templateValidate(f); err != nil {
|
||||
ginx.NewRender(c).Message(err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
ginx.NewRender(c).Message(f.UpdateContent(rt.Ctx))
|
||||
}
|
||||
|
||||
@@ -29,9 +35,25 @@ func (rt *Router) notifyTplUpdate(c *gin.Context) {
|
||||
var f models.NotifyTpl
|
||||
ginx.BindJSON(c, &f)
|
||||
|
||||
if err := templateValidate(f); err != nil {
|
||||
ginx.NewRender(c).Message(err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
ginx.NewRender(c).Message(f.Update(rt.Ctx))
|
||||
}
|
||||
|
||||
func templateValidate(f models.NotifyTpl) error {
|
||||
if f.Content == "" {
|
||||
return nil
|
||||
}
|
||||
if _, err := template.New(f.Channel).Funcs(tplx.TemplateFuncMap).Parse(f.Content); err != nil {
|
||||
return fmt.Errorf("notify template verify illegal:%s", err.Error())
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (rt *Router) notifyTplPreview(c *gin.Context) {
|
||||
var event models.AlertCurEvent
|
||||
err := json.Unmarshal([]byte(cconf.EVENT_EXAMPLE), &event)
|
||||
|
||||
@@ -13,6 +13,7 @@ alter table `board` add built_in tinyint(1) not null default 0 comment '0:false
|
||||
alter table `board` add hide tinyint(1) not null default 0 comment '0:false 1:true';
|
||||
|
||||
alter table `chart_share` add datasource_id bigint unsigned not null default 0;
|
||||
alter table `chart_share` drop dashboard_id;
|
||||
|
||||
alter table `alert_rule` add datasource_ids varchar(255) not null default '';
|
||||
alter table `alert_rule` add rule_config text not null comment 'rule_config';
|
||||
@@ -41,6 +42,9 @@ alter table `alert_his_event` add annotations text not null comment 'annotations
|
||||
alter table `alert_his_event` add rule_config text not null comment 'rule_config';
|
||||
|
||||
alter table `alerting_engines` add datasource_id bigint unsigned not null default 0;
|
||||
alter table `alerting_engines` change cluster engine_cluster varchar(128) not null default '' comment 'n9e engine cluster';
|
||||
|
||||
alter table `task_record` add event_id bigint not null comment 'event id' default 0;
|
||||
|
||||
CREATE TABLE `datasource`
|
||||
(
|
||||
|
||||
@@ -1,15 +1,12 @@
|
||||
FROM --platform=$BUILDPLATFORM python:3-slim
|
||||
FROM --platform=$TARGETPLATFORM python:3-slim
|
||||
|
||||
|
||||
WORKDIR /app
|
||||
ADD n9e /app
|
||||
ADD etc /app
|
||||
RUN mkdir -p /app/integrations
|
||||
ADD n9e /app/
|
||||
ADD etc /app/
|
||||
ADD integrations /app/integrations/
|
||||
ADD http://download.flashcat.cloud/wait /wait
|
||||
RUN mkdir -p /app/pub && chmod +x /wait
|
||||
ADD --chmod=755 https://github.com/ufoscout/docker-compose-wait/releases/download/2.11.0/wait_x86_64 /wait
|
||||
ADD pub /app/pub/
|
||||
RUN chmod +x n9e
|
||||
|
||||
EXPOSE 17000
|
||||
|
||||
|
||||
13
docker/Dockerfile.goreleaser.arm64
Normal file
13
docker/Dockerfile.goreleaser.arm64
Normal file
@@ -0,0 +1,13 @@
|
||||
FROM --platform=$TARGETPLATFORM python:3-slim
|
||||
|
||||
|
||||
WORKDIR /app
|
||||
ADD n9e /app/
|
||||
ADD etc /app/
|
||||
ADD integrations /app/integrations/
|
||||
ADD --chmod=755 https://github.com/ufoscout/docker-compose-wait/releases/download/2.11.0/wait_aarch64 /wait
|
||||
ADD pub /app/pub/
|
||||
|
||||
EXPOSE 17000
|
||||
|
||||
CMD ["/app/n9e", "-h"]
|
||||
@@ -651,6 +651,7 @@ COMMENT ON COLUMN task_tpl_host.host IS 'ip or hostname';
|
||||
CREATE TABLE task_record
|
||||
(
|
||||
id bigint not null ,
|
||||
event_id bigint not null default 0,
|
||||
group_id bigint not null ,
|
||||
ibex_address varchar(128) not null,
|
||||
ibex_auth_user varchar(128) not null default '',
|
||||
@@ -669,22 +670,23 @@ CREATE TABLE task_record
|
||||
) ;
|
||||
CREATE INDEX task_record_cg_idx ON task_record (create_at, group_id);
|
||||
CREATE INDEX task_record_create_by_idx ON task_record (create_by);
|
||||
CREATE INDEX task_record_event_id_idx ON task_record (event_id);
|
||||
COMMENT ON COLUMN task_record.id IS 'ibex task id';
|
||||
COMMENT ON COLUMN task_record.group_id IS 'busi group id';
|
||||
|
||||
COMMENT ON COLUMN task_record.event_id IS 'event id';
|
||||
|
||||
CREATE TABLE alerting_engines
|
||||
(
|
||||
id serial,
|
||||
instance varchar(128) not null default '' ,
|
||||
datasource_id bigint not null default 0 ,
|
||||
cluster varchar(128) not null default '' ,
|
||||
engine_cluster varchar(128) not null default '' ,
|
||||
clock bigint not null,
|
||||
PRIMARY KEY (id)
|
||||
) ;
|
||||
COMMENT ON COLUMN alerting_engines.instance IS 'instance identification, e.g. 10.9.0.9:9090';
|
||||
COMMENT ON COLUMN alerting_engines.datasource_id IS 'datasource id';
|
||||
COMMENT ON COLUMN alerting_engines.cluster IS 'target reader cluster';
|
||||
COMMENT ON COLUMN alerting_engines.engine_cluster IS 'target reader cluster';
|
||||
|
||||
|
||||
CREATE TABLE datasource
|
||||
|
||||
@@ -531,6 +531,7 @@ CREATE TABLE `task_tpl_host`
|
||||
CREATE TABLE `task_record`
|
||||
(
|
||||
`id` bigint unsigned not null comment 'ibex task id',
|
||||
`event_id` bigint not null comment 'event id' default 0,
|
||||
`group_id` bigint not null comment 'busi group id',
|
||||
`ibex_address` varchar(128) not null,
|
||||
`ibex_auth_user` varchar(128) not null default '',
|
||||
@@ -547,7 +548,8 @@ CREATE TABLE `task_record`
|
||||
`create_by` varchar(64) not null default '',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY (`create_at`, `group_id`),
|
||||
KEY (`create_by`)
|
||||
KEY (`create_by`),
|
||||
KEY (`event_id`)
|
||||
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
|
||||
|
||||
CREATE TABLE `alerting_engines`
|
||||
@@ -555,12 +557,11 @@ CREATE TABLE `alerting_engines`
|
||||
`id` int unsigned NOT NULL AUTO_INCREMENT,
|
||||
`instance` varchar(128) not null default '' comment 'instance identification, e.g. 10.9.0.9:9090',
|
||||
`datasource_id` bigint not null default 0 comment 'datasource id',
|
||||
`cluster` varchar(128) not null default '' comment 'n9e-alert cluster',
|
||||
`engine_cluster` varchar(128) not null default '' comment 'n9e-alert cluster',
|
||||
`clock` bigint not null,
|
||||
PRIMARY KEY (`id`)
|
||||
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
|
||||
|
||||
|
||||
CREATE TABLE `datasource`
|
||||
(
|
||||
`id` int unsigned NOT NULL AUTO_INCREMENT,
|
||||
|
||||
7
fe.sh
7
fe.sh
@@ -7,4 +7,9 @@ curl -o n9e-fe-${VERSION}.tar.gz -L https://github.com/n9e/fe/releases/download/
|
||||
|
||||
tar zxvf n9e-fe-${VERSION}.tar.gz
|
||||
|
||||
cp ./docker/initsql/a-n9e.sql n9e.sql
|
||||
cp ./docker/initsql/a-n9e.sql n9e.sql
|
||||
|
||||
TAG=$(curl -sX GET https://api.github.com/repos/ccfos/nightingale/releases/latest | awk '/tag_name/{print $4;exit}' FS='[""]')
|
||||
VERSION=$(echo $TAG)
|
||||
VERSION=${VERSION/v/}
|
||||
sed -i "s/nightingale:latest/nightingale:${VERSION}/g" ./docker/docker-compose.yaml
|
||||
1115
integrations/TDEngine/dashboards/tasokeeper3.x.json
Normal file
1115
integrations/TDEngine/dashboards/tasokeeper3.x.json
Normal file
File diff suppressed because it is too large
Load Diff
BIN
integrations/TDEngine/icon/taos.png
Normal file
BIN
integrations/TDEngine/icon/taos.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 14 KiB |
266
integrations/kubernetes/alerts/apiserver.json
Normal file
266
integrations/kubernetes/alerts/apiserver.json
Normal file
@@ -0,0 +1,266 @@
|
||||
[
|
||||
{
|
||||
"name": "KubeClientCertificateExpiration-S2",
|
||||
"note": "A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 0,
|
||||
"prom_ql": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "KubeClientCertificateExpiration-S1",
|
||||
"note": "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.",
|
||||
"severity": 1,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 0,
|
||||
"prom_ql": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "AggregatedAPIErrors",
|
||||
"note": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 0,
|
||||
"prom_ql": "sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "AggregatedAPIDown",
|
||||
"note": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 300,
|
||||
"prom_ql": "(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "KubeAPIDown",
|
||||
"note": "KubeAPI has disappeared from Prometheus target discovery.",
|
||||
"severity": 1,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 900,
|
||||
"prom_ql": "absent(up{job=\"apiserver\"} == 1)\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "KubeAPIErrorBudgetBurn-S1-120秒",
|
||||
"note": "The API server is burning too much error budget.",
|
||||
"severity": 1,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 120,
|
||||
"prom_ql": "sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)\nand\nsum(apiserver_request:burnrate5m) > (14.40 * 0.01000)\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": [
|
||||
"long=1h",
|
||||
"short=5m"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "KubeAPIErrorBudgetBurn-S1-900秒",
|
||||
"note": "The API server is burning too much error budget.",
|
||||
"severity": 1,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 900,
|
||||
"prom_ql": "sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)\nand\nsum(apiserver_request:burnrate30m) > (6.00 * 0.01000)\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": [
|
||||
"long=6h",
|
||||
"short=30m"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "KubeAPIErrorBudgetBurn-S2-3600秒",
|
||||
"note": "The API server is burning too much error budget.",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 3600,
|
||||
"prom_ql": "sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)\nand\nsum(apiserver_request:burnrate2h) > (3.00 * 0.01000)\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": [
|
||||
"long=1d",
|
||||
"short=2h"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "KubeAPIErrorBudgetBurn-S2-10800秒",
|
||||
"note": "The API server is burning too much error budget.",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 10800,
|
||||
"prom_ql": "sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)\nand\nsum(apiserver_request:burnrate6h) > (1.00 * 0.01000)\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": [
|
||||
"long=3d",
|
||||
"short=6h"
|
||||
]
|
||||
}
|
||||
]
|
||||
366
integrations/kubernetes/alerts/kubelet.json
Normal file
366
integrations/kubernetes/alerts/kubelet.json
Normal file
@@ -0,0 +1,366 @@
|
||||
[
|
||||
{
|
||||
"name": "Node状态异常",
|
||||
"note": "{{ $labels.node }} has been unready for more than 15 minutes.",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 900,
|
||||
"prom_ql": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "Node不可达",
|
||||
"note": "{{ $labels.node }} is unreachable and some workloads may be rescheduled.",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 900,
|
||||
"prom_ql": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "Node运行太多Pod",
|
||||
"note": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 900,
|
||||
"prom_ql": "count by(node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(node) (\n kube_node_status_capacity_pods{job=\"kube-state-metrics\"} != 1\n) > 0.95\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "Node状态抖动",
|
||||
"note": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 900,
|
||||
"prom_ql": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (node) > 2\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "PLEG耗时高",
|
||||
"note": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 300,
|
||||
"prom_ql": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} >= 10\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "Pod启动耗时高",
|
||||
"note": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 900,
|
||||
"prom_ql": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\", metrics_path=\"/metrics\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"} > 60\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "客户端证书过期-S2",
|
||||
"note": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 0,
|
||||
"prom_ql": "kubelet_certificate_manager_client_ttl_seconds < 604800\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "客户端证书过期-S1",
|
||||
"note": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
|
||||
"severity": 1,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 0,
|
||||
"prom_ql": "kubelet_certificate_manager_client_ttl_seconds < 86400\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "服务端证书过期-S2",
|
||||
"note": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 0,
|
||||
"prom_ql": "kubelet_certificate_manager_server_ttl_seconds < 604800\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "服务端证书过期-S1",
|
||||
"note": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
|
||||
"severity": 1,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 0,
|
||||
"prom_ql": "kubelet_certificate_manager_server_ttl_seconds < 86400\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "客户端证书续签错误",
|
||||
"note": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 900,
|
||||
"prom_ql": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "服务证书续签错误",
|
||||
"note": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).",
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 900,
|
||||
"prom_ql": "increase(kubelet_server_expiration_renew_errors[5m]) > 0\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "kubelet故障",
|
||||
"note": "Kubelet has disappeared from Prometheus target discovery.",
|
||||
"severity": 1,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 900,
|
||||
"prom_ql": "absent(up{job=\"kubelet\"} == 1)\n",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
}
|
||||
]
|
||||
1010
integrations/kubernetes/alerts/node-exporter.json
Normal file
1010
integrations/kubernetes/alerts/node-exporter.json
Normal file
File diff suppressed because it is too large
Load Diff
642
integrations/kubernetes/dashboards/APIServer.json
Normal file
642
integrations/kubernetes/dashboards/APIServer.json
Normal file
@@ -0,0 +1,642 @@
|
||||
{
|
||||
"name": "Kubernetes / API Server",
|
||||
"tags": "Categraf",
|
||||
"ident": "",
|
||||
"configs": {
|
||||
"version": "2.0.0",
|
||||
"links": [],
|
||||
"var": [],
|
||||
"panels": [
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "up{job=\"apiserver\"}",
|
||||
"legend": "{{ instance }}"
|
||||
}
|
||||
],
|
||||
"name": "API Server - Health Status",
|
||||
"links": [],
|
||||
"description": "apiserver的实例健康状态,0表示down,1表示up",
|
||||
"custom": {
|
||||
"showHeader": true,
|
||||
"colorMode": "value",
|
||||
"calc": "lastNotNull",
|
||||
"displayMode": "seriesToRows"
|
||||
},
|
||||
"options": {
|
||||
"valueMappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"text": "DOWN"
|
||||
},
|
||||
"1": {
|
||||
"text": "UP"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
},
|
||||
{
|
||||
"type": "special",
|
||||
"result": {
|
||||
"color": "#3fc453",
|
||||
"text": "UP"
|
||||
},
|
||||
"match": {
|
||||
"special": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "special",
|
||||
"result": {
|
||||
"color": "#f80202",
|
||||
"text": "DOWN"
|
||||
},
|
||||
"match": {
|
||||
"special": 0
|
||||
}
|
||||
}
|
||||
],
|
||||
"standardOptions": {
|
||||
"util": "none"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{}
|
||||
],
|
||||
"version": "2.0.0",
|
||||
"type": "table",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"i": "98f46bc1-c078-40f2-915c-f0836957bf2f",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "98f46bc1-c078-40f2-915c-f0836957bf2f"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "apiserver_requested_deprecated_apis{job=\"apiserver\"}",
|
||||
"legend": ""
|
||||
}
|
||||
],
|
||||
"name": "Deprecated Kubernetes Resources",
|
||||
"links": [],
|
||||
"description": "当前版本apiserver使用,未来版本中要移除的资源",
|
||||
"custom": {
|
||||
"showHeader": true,
|
||||
"colorMode": "value",
|
||||
"calc": "lastNotNull",
|
||||
"displayMode": "labelsOfSeriesToRows",
|
||||
"columns": [
|
||||
"group",
|
||||
"version",
|
||||
"resource",
|
||||
"removed_release"
|
||||
],
|
||||
"sortOrder": "ascend"
|
||||
},
|
||||
"options": {
|
||||
"standardOptions": {}
|
||||
},
|
||||
"overrides": [
|
||||
{}
|
||||
],
|
||||
"version": "2.0.0",
|
||||
"type": "table",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0,
|
||||
"i": "73beb13a-bd10-4a68-bb9e-5b9ab63da154",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "73beb13a-bd10-4a68-bb9e-5b9ab63da154"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum by (instance,code) (rate(apiserver_request_total{job=\"apiserver\"}[5m]))",
|
||||
"legend": "{{ instance }} {{ code }}"
|
||||
}
|
||||
],
|
||||
"name": "API Server - HTTP Requests by code",
|
||||
"links": [],
|
||||
"description": "按照返回码分类统计apiserver请求数",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "none"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"standardOptions": {
|
||||
"util": "none"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": []
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0.25,
|
||||
"gradientMode": "none",
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8,
|
||||
"i": "1cfa42b1-9dcf-471c-90ff-8ffe656d4b11",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "1cfa42b1-9dcf-471c-90ff-8ffe656d4b11"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum by (instance,verb) (rate(apiserver_request_total{job=\"apiserver\"}[5m]))",
|
||||
"legend": "{{ instance }} {{ verb }}"
|
||||
}
|
||||
],
|
||||
"name": "API Server - HTTP Requests by verb",
|
||||
"links": [],
|
||||
"description": "按照请求动作分类统计apiserver的请求数",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"standardOptions": {
|
||||
"util": "none"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": []
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0.25,
|
||||
"gradientMode": "none",
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8,
|
||||
"i": "94def0cb-0b86-42f7-a4b2-dde714bbb918",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "94def0cb-0b86-42f7-a4b2-dde714bbb918"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "apiserver_current_inflight_requests{job=\"apiserver\"}",
|
||||
"legend": "{{ instance }} {{ request_kind }}"
|
||||
}
|
||||
],
|
||||
"name": "API Server - Current Inflight Requests by kind",
|
||||
"links": [],
|
||||
"description": "当前并发请求apiserver的数量",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"standardOptions": {
|
||||
"util": "none"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": []
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0.25,
|
||||
"gradientMode": "none",
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 16,
|
||||
"i": "ce5a15ad-11c6-44a2-a071-be57009162e1",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "fb6266a3-3da0-4310-bfe8-c64a53db5db3"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\"}[5m])) by (instance,verb,le))*1000",
|
||||
"legend": "{{ instance }} {{ verb }}"
|
||||
}
|
||||
],
|
||||
"name": "API Server - HTTP Requets Latency by verb",
|
||||
"links": [],
|
||||
"description": "apiserver的响应延迟,按请求动作分类统计",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "none"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"standardOptions": {
|
||||
"util": "milliseconds"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": []
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0.25,
|
||||
"gradientMode": "none",
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 16,
|
||||
"i": "045dca2d-d69b-47a7-b25e-656adb357e11",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "045dca2d-d69b-47a7-b25e-656adb357e11"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket {job=\"apiserver\",verb!=\"WATCH\"}[5m])) by (instance,le))*1000",
|
||||
"legend": "{{ instance }}"
|
||||
}
|
||||
],
|
||||
"name": "API Server - HTTP Requets Latency by instance",
|
||||
"links": [],
|
||||
"description": "apiserver的响应延迟(非watch请求)",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"standardOptions": {
|
||||
"util": "milliseconds"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": []
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0.25,
|
||||
"gradientMode": "none",
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 24,
|
||||
"i": "1e775704-9ee4-45ce-9d24-b49af89fb5c7",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "1e775704-9ee4-45ce-9d24-b49af89fb5c7"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum by(instance,verb) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\"}[5m]))\n / sum by(instance,verb) (rate(apiserver_request_total{job=\"apiserver\"}[5m]))",
|
||||
"legend": "{{ instance }} {{ verb }}"
|
||||
}
|
||||
],
|
||||
"name": "API Server - Errors by verb",
|
||||
"links": [],
|
||||
"description": "apiserver的5xx错误率,按请求动作分类统计",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"standardOptions": {
|
||||
"util": "none"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": []
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0.25,
|
||||
"gradientMode": "none",
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 24,
|
||||
"i": "1ca62e0b-72df-47d1-93ba-048ed49e9cb5",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "1ca62e0b-72df-47d1-93ba-048ed49e9cb5"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum by(instance) (rate(apiserver_request_total{code=~\"5..\", job=\"apiserver\"}[5m]))\n / sum by(instance) (rate(apiserver_request_total{job=\"apiserver\"}[5m]))",
|
||||
"legend": "{{ instance }}"
|
||||
}
|
||||
],
|
||||
"name": "API Server - Errors by Instance",
|
||||
"links": [],
|
||||
"description": "apiserver的5xx 错误率(5xx请求数/总请求数)",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"standardOptions": {
|
||||
"util": "none"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": []
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0.25,
|
||||
"gradientMode": "none",
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 32,
|
||||
"i": "92a209a1-7d30-4627-9ae1-55ded5095ed7",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "92a209a1-7d30-4627-9ae1-55ded5095ed7"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(rate(workqueue_depth{job=\"apiserver\"}[5m])) by (instance,name)",
|
||||
"legend": "{{ instance }} {{ name }}"
|
||||
}
|
||||
],
|
||||
"name": "API Server - Work Queue by instance",
|
||||
"links": [],
|
||||
"description": "apiserver工作队列深度,越接近0越好",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"standardOptions": {
|
||||
"util": "none"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": []
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0.25,
|
||||
"gradientMode": "none",
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 32,
|
||||
"i": "83f22cf4-9c65-4ad3-900b-fa6fc914dd88",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "83f22cf4-9c65-4ad3-900b-fa6fc914dd88"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) by (instance)",
|
||||
"legend": "{{ instance }}"
|
||||
}
|
||||
],
|
||||
"name": "API Server - HTTP Requests by instance",
|
||||
"links": [],
|
||||
"description": "5分钟内apiserver的请求数统计",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"standardOptions": {
|
||||
"util": "none"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": []
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0.25,
|
||||
"gradientMode": "none",
|
||||
"stack": "normal"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 40,
|
||||
"i": "3e9f9df7-d9fb-4791-b3b2-2c52678f060f",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "3e9f9df7-d9fb-4791-b3b2-2c52678f060f"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "rate(process_cpu_seconds_total{job=\"apiserver\"}[5m])",
|
||||
"legend": "{{ instance }}"
|
||||
}
|
||||
],
|
||||
"name": "API Server - CPU Usage by instance",
|
||||
"links": [],
|
||||
"description": "apiserver的cpu使用率",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"standardOptions": {
|
||||
"util": "percent",
|
||||
"decimals": 2
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": []
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0.25,
|
||||
"gradientMode": "none",
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 40,
|
||||
"i": "3d5c1ae5-e640-4986-9202-78258169bffb",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "3d5c1ae5-e640-4986-9202-78258169bffb"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "process_resident_memory_bytes{job=\"apiserver\"}",
|
||||
"legend": "{{ instance }}"
|
||||
}
|
||||
],
|
||||
"name": "API Server - Memory Usage by instance",
|
||||
"links": [],
|
||||
"description": "apiserver的内存使用量",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"standardOptions": {
|
||||
"util": "bytesIEC"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": []
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0.25,
|
||||
"gradientMode": "none",
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 48,
|
||||
"i": "1550a2d5-c808-4174-865a-a41b2c16b486",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "1550a2d5-c808-4174-865a-a41b2c16b486"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
271
integrations/kubernetes/dashboards/Cadvisor.json
Normal file
271
integrations/kubernetes/dashboards/Cadvisor.json
Normal file
@@ -0,0 +1,271 @@
|
||||
{
|
||||
"name": "Cadvisor",
|
||||
"tags": "",
|
||||
"ident": "",
|
||||
"configs": {
|
||||
"version": "2.0.0",
|
||||
"links": [],
|
||||
"var": [
|
||||
{
|
||||
"type": "query",
|
||||
"name": "host",
|
||||
"definition": "label_values({__name__=~\"container.*\"},instance)",
|
||||
"allValue": ".*",
|
||||
"allOption": true,
|
||||
"multi": false,
|
||||
"reg": ""
|
||||
},
|
||||
{
|
||||
"type": "query",
|
||||
"name": "container",
|
||||
"definition": "label_values({__name__=~\"container.*\", instance=~\"$host\"},name)",
|
||||
"allValue": ".*",
|
||||
"allOption": true,
|
||||
"multi": false,
|
||||
"reg": ""
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"version": "2.0.0",
|
||||
"id": "eeb56afe-8a3e-46d6-8923-aeb3d0f124ea",
|
||||
"type": "timeseries",
|
||||
"name": "CPU Usage",
|
||||
"links": [],
|
||||
"layout": {
|
||||
"h": 7,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"i": "eeb56afe-8a3e-46d6-8923-aeb3d0f124ea",
|
||||
"isResizable": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name) *100",
|
||||
"legend": "{{name}}"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "none"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "#634CD9",
|
||||
"value": null,
|
||||
"type": "base"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"version": "2.0.0",
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "linear",
|
||||
"fillOpacity": 0.5,
|
||||
"stack": "off"
|
||||
},
|
||||
"maxPerRow": 4
|
||||
},
|
||||
{
|
||||
"version": "2.0.0",
|
||||
"id": "6690fff4-c159-40e5-b340-65a3ba85e37e",
|
||||
"type": "timeseries",
|
||||
"name": "Memory Usage",
|
||||
"links": [],
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 7,
|
||||
"i": "6690fff4-c159-40e5-b340-65a3ba85e37e",
|
||||
"isResizable": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(container_memory_rss{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
|
||||
"legend": "{{name}}"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "none"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "#634CD9",
|
||||
"value": null,
|
||||
"type": "base"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"version": "2.0.0",
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "linear",
|
||||
"fillOpacity": 0.5,
|
||||
"stack": "off"
|
||||
},
|
||||
"maxPerRow": 4
|
||||
},
|
||||
{
|
||||
"version": "2.0.0",
|
||||
"id": "3c798af5-cfae-4962-9b70-85736df44bb1",
|
||||
"type": "timeseries",
|
||||
"name": "Memory Cached",
|
||||
"links": [],
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 7,
|
||||
"i": "3c798af5-cfae-4962-9b70-85736df44bb1",
|
||||
"isResizable": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(container_memory_cache{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
|
||||
"legend": "{{name}}"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "none"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "#634CD9",
|
||||
"value": null,
|
||||
"type": "base"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"version": "2.0.0",
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "linear",
|
||||
"fillOpacity": 0.5,
|
||||
"stack": "off"
|
||||
},
|
||||
"maxPerRow": 4
|
||||
},
|
||||
{
|
||||
"version": "2.0.0",
|
||||
"id": "111835e1-cfb5-40db-bb52-1aca74cf1a00",
|
||||
"type": "timeseries",
|
||||
"name": "Received Network Traffic",
|
||||
"links": [],
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 15,
|
||||
"i": "111835e1-cfb5-40db-bb52-1aca74cf1a00",
|
||||
"isResizable": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
|
||||
"legend": "{{name}}"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "none"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "#634CD9",
|
||||
"value": null,
|
||||
"type": "base"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"version": "2.0.0",
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "linear",
|
||||
"fillOpacity": 0.5,
|
||||
"stack": "off"
|
||||
},
|
||||
"maxPerRow": 4
|
||||
},
|
||||
{
|
||||
"version": "2.0.0",
|
||||
"id": "b8050f8f-aee7-4fa5-888d-b6025df14aa1",
|
||||
"type": "timeseries",
|
||||
"name": "Sent Network Traffic",
|
||||
"links": [],
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 15,
|
||||
"i": "b8050f8f-aee7-4fa5-888d-b6025df14aa1",
|
||||
"isResizable": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
|
||||
"legend": "{{name}}"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "none"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "#634CD9",
|
||||
"value": null,
|
||||
"type": "base"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"custom": {
|
||||
"version": "2.0.0",
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "linear",
|
||||
"fillOpacity": 0.5,
|
||||
"stack": "off"
|
||||
},
|
||||
"maxPerRow": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
1605
integrations/kubernetes/dashboards/Container.json
Normal file
1605
integrations/kubernetes/dashboards/Container.json
Normal file
File diff suppressed because it is too large
Load Diff
1110
integrations/kubernetes/dashboards/ControllerManager.json
Normal file
1110
integrations/kubernetes/dashboards/ControllerManager.json
Normal file
File diff suppressed because it is too large
Load Diff
1238
integrations/kubernetes/dashboards/KubeStateMetrics.json
Normal file
1238
integrations/kubernetes/dashboards/KubeStateMetrics.json
Normal file
File diff suppressed because it is too large
Load Diff
438
integrations/kubernetes/dashboards/KubeletMetrics.json
Normal file
438
integrations/kubernetes/dashboards/KubeletMetrics.json
Normal file
@@ -0,0 +1,438 @@
|
||||
{
|
||||
"name": "Kubernetes / Kubelet Metrics",
|
||||
"tags": "Categraf",
|
||||
"ident": "",
|
||||
"configs": {
|
||||
"var": [
|
||||
{
|
||||
"name": "cluster",
|
||||
"definition": "label_values(kubelet_running_pods, cluster)",
|
||||
"multi": true,
|
||||
"allOption": true
|
||||
},
|
||||
{
|
||||
"name": "instance",
|
||||
"definition": "label_values(kubelet_running_pods{cluster=~\"$cluster\"}, instance)",
|
||||
"multi": true,
|
||||
"allOption": true
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(up{source=\"kubelet\", cluster=~\"$cluster\"})"
|
||||
}
|
||||
],
|
||||
"name": "Kubelet UP",
|
||||
"custom": {
|
||||
"textMode": "value",
|
||||
"colorMode": "background",
|
||||
"calc": "lastNotNull",
|
||||
"colSpan": 1,
|
||||
"textSize": {}
|
||||
},
|
||||
"options": {
|
||||
"standardOptions": {}
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "stat",
|
||||
"layout": {
|
||||
"h": 3,
|
||||
"w": 4,
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"i": "d3caf396-b3a1-449b-acec-f550967889e6",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "d3caf396-b3a1-449b-acec-f550967889e6"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(kubelet_running_pods{cluster=~\"$cluster\", instance=~\"$instance\"})"
|
||||
}
|
||||
],
|
||||
"name": "Running Pods",
|
||||
"custom": {
|
||||
"textMode": "valueAndName",
|
||||
"colorMode": "value",
|
||||
"calc": "lastNotNull",
|
||||
"colSpan": 1,
|
||||
"textSize": {}
|
||||
},
|
||||
"options": {
|
||||
"standardOptions": {}
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "stat",
|
||||
"layout": {
|
||||
"h": 3,
|
||||
"w": 4,
|
||||
"x": 4,
|
||||
"y": 0,
|
||||
"i": "38c38b23-a7e3-4177-8c41-3ce955ea0434",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "38c38b23-a7e3-4177-8c41-3ce955ea0434"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(kubelet_running_containers{cluster=~\"$cluster\", instance=~\"$instance\", container_state=\"running\"})"
|
||||
}
|
||||
],
|
||||
"name": "Running Containers",
|
||||
"custom": {
|
||||
"textMode": "valueAndName",
|
||||
"colorMode": "value",
|
||||
"calc": "lastNotNull",
|
||||
"colSpan": 1,
|
||||
"textSize": {}
|
||||
},
|
||||
"options": {
|
||||
"standardOptions": {}
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "stat",
|
||||
"layout": {
|
||||
"h": 3,
|
||||
"w": 4,
|
||||
"x": 8,
|
||||
"y": 0,
|
||||
"i": "26bf2320-fcff-48f8-a6fc-aa9076bb9329",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "525859b9-91d7-4180-b363-bf8ceec977d8"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(volume_manager_total_volumes{cluster=~\"$cluster\", instance=~\"$instance\", state=\"desired_state_of_world\"})"
|
||||
}
|
||||
],
|
||||
"name": "Desired Volumes",
|
||||
"custom": {
|
||||
"textMode": "valueAndName",
|
||||
"colorMode": "value",
|
||||
"calc": "lastNotNull",
|
||||
"colSpan": 1,
|
||||
"textSize": {}
|
||||
},
|
||||
"options": {
|
||||
"standardOptions": {}
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "stat",
|
||||
"layout": {
|
||||
"h": 3,
|
||||
"w": 4,
|
||||
"x": 12,
|
||||
"y": 0,
|
||||
"i": "54ae4ab3-e932-418c-a637-f2f515cce1b9",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "84af4617-2ae0-4b30-a82a-6e8586342224"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(volume_manager_total_volumes{cluster=~\"$cluster\", instance=~\"$instance\", state=\"actual_state_of_world\"})"
|
||||
}
|
||||
],
|
||||
"name": "Actual Volumes",
|
||||
"custom": {
|
||||
"textMode": "valueAndName",
|
||||
"colorMode": "value",
|
||||
"calc": "lastNotNull",
|
||||
"colSpan": 1,
|
||||
"textSize": {}
|
||||
},
|
||||
"options": {
|
||||
"standardOptions": {}
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "stat",
|
||||
"layout": {
|
||||
"h": 3,
|
||||
"w": 4,
|
||||
"x": 16,
|
||||
"y": 0,
|
||||
"i": "d9de76d7-2203-40e7-a792-9888ec869e82",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "d431f4bd-9115-41d2-a494-1d680bdd1e0f"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(increase(kubelet_runtime_operations_errors_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]))"
|
||||
}
|
||||
],
|
||||
"name": "OP Errors in 5min",
|
||||
"custom": {
|
||||
"textMode": "value",
|
||||
"colorMode": "value",
|
||||
"calc": "lastNotNull",
|
||||
"colSpan": 1,
|
||||
"textSize": {}
|
||||
},
|
||||
"options": {
|
||||
"valueMappings": [
|
||||
{
|
||||
"type": "range",
|
||||
"match": {
|
||||
"from": 1
|
||||
},
|
||||
"result": {
|
||||
"color": "#d0021b"
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "range",
|
||||
"match": {
|
||||
"to": 1
|
||||
},
|
||||
"result": {
|
||||
"color": "#417505"
|
||||
}
|
||||
}
|
||||
],
|
||||
"standardOptions": {}
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "stat",
|
||||
"layout": {
|
||||
"h": 3,
|
||||
"w": 4,
|
||||
"x": 20,
|
||||
"y": 0,
|
||||
"i": "bf2bbd15-347d-404c-9b8f-e524875befe2",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "54de62bc-8af3-4c27-8b8e-1af567b363fc"
|
||||
},
|
||||
{
|
||||
"type": "row",
|
||||
"id": "730d4a9b-791f-4aaf-a042-668f66e73814",
|
||||
"name": "Operations",
|
||||
"collapsed": true,
|
||||
"layout": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 3,
|
||||
"i": "730d4a9b-791f-4aaf-a042-668f66e73814",
|
||||
"isResizable": false
|
||||
},
|
||||
"panels": []
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "increase(kubelet_runtime_operations_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])"
|
||||
}
|
||||
],
|
||||
"name": "Operations in 5min",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "none"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"standardOptions": {},
|
||||
"thresholds": {}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 0.5,
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 4,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 4,
|
||||
"i": "d26e6818-6704-492a-8cbf-58473dd85716",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "d26e6818-6704-492a-8cbf-58473dd85716"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "increase(kubelet_runtime_operations_errors_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])"
|
||||
}
|
||||
],
|
||||
"name": "Operation Errors in 5min",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "none"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"standardOptions": {},
|
||||
"thresholds": {}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 0.5,
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 4,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 4,
|
||||
"i": "4e585d2f-c61c-4350-86ec-dca7ddc34ceb",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "09a6ad5b-8c0e-4f17-b17f-3ebc514f7d20"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "increase(kubelet_runtime_operations_duration_seconds_sum{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])/increase(kubelet_runtime_operations_duration_seconds_count{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])"
|
||||
}
|
||||
],
|
||||
"name": "Average Operation duration in 1 hour (Unit: Second)",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "none"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"standardOptions": {},
|
||||
"thresholds": {}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 0.5,
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 4,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 8,
|
||||
"i": "b5e56f3e-fa20-4c19-8578-c0610fa0a7e7",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "b5e56f3e-fa20-4c19-8578-c0610fa0a7e7"
|
||||
},
|
||||
{
|
||||
"type": "row",
|
||||
"id": "dd7e84c5-03ce-467c-871a-aa110fe051f4",
|
||||
"name": "PLEG relist",
|
||||
"collapsed": true,
|
||||
"layout": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 12,
|
||||
"i": "dd7e84c5-03ce-467c-871a-aa110fe051f4",
|
||||
"isResizable": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "rate(kubelet_pleg_relist_duration_seconds_count{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])"
|
||||
}
|
||||
],
|
||||
"name": "relist rate",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "none"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"standardOptions": {},
|
||||
"thresholds": {}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 0.5,
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 4,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 13,
|
||||
"i": "f3822da8-a9c9-4db1-ba12-465d3ece823e",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "f3822da8-a9c9-4db1-ba12-465d3ece823e"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "increase(kubelet_pleg_relist_duration_seconds_sum{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])/increase(kubelet_pleg_relist_duration_seconds_count{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])"
|
||||
}
|
||||
],
|
||||
"name": "relist duration (Unit: Second)",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "none"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"standardOptions": {},
|
||||
"thresholds": {}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 0.5,
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 4,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 13,
|
||||
"i": "2b4ada76-6c30-42cd-9bd3-c939b4c0139c",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "a6e4c914-bfca-4419-a264-f5b1cbab261a"
|
||||
}
|
||||
],
|
||||
"version": "2.0.0"
|
||||
}
|
||||
}
|
||||
1005
integrations/kubernetes/dashboards/Scheduler.json
Normal file
1005
integrations/kubernetes/dashboards/Scheduler.json
Normal file
File diff suppressed because it is too large
Load Diff
BIN
integrations/kubernetes/icon/kubernetes.png
Normal file
BIN
integrations/kubernetes/icon/kubernetes.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 20 KiB |
@@ -1437,7 +1437,7 @@
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"standardOptions": {
|
||||
"util": "bytesIEC",
|
||||
"util": "bitsIEC",
|
||||
"decimals": 0
|
||||
},
|
||||
"thresholds": {
|
||||
@@ -1732,4 +1732,4 @@
|
||||
],
|
||||
"version": "3.0.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
68
integrations/processes/alerts/categraf-processes.json
Normal file
68
integrations/processes/alerts/categraf-processes.json
Normal file
@@ -0,0 +1,68 @@
|
||||
[
|
||||
{
|
||||
"cate": "prometheus",
|
||||
"datasource_ids": [
|
||||
0
|
||||
],
|
||||
"name": "Too many running processes",
|
||||
"note": "",
|
||||
"prod": "metric",
|
||||
"algorithm": "",
|
||||
"algo_params": null,
|
||||
"delay": 0,
|
||||
"severity": 0,
|
||||
"severities": [
|
||||
2
|
||||
],
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 60,
|
||||
"prom_ql": "",
|
||||
"rule_config": {
|
||||
"queries": [
|
||||
{
|
||||
"prom_ql": "processes_running > (system_n_cpus * 3)",
|
||||
"severity": 2
|
||||
}
|
||||
]
|
||||
},
|
||||
"prom_eval_interval": 30,
|
||||
"enable_stime": "00:00",
|
||||
"enable_stimes": [
|
||||
"00:00"
|
||||
],
|
||||
"enable_etime": "23:59",
|
||||
"enable_etimes": [
|
||||
"23:59"
|
||||
],
|
||||
"enable_days_of_week": [
|
||||
"0",
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6"
|
||||
],
|
||||
"enable_days_of_weeks": [
|
||||
[
|
||||
"0",
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6"
|
||||
]
|
||||
],
|
||||
"enable_in_bg": 1,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"notify_max_number": 10,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": [],
|
||||
"annotations": {}
|
||||
}
|
||||
]
|
||||
234
integrations/processes/dashboards/categraf-processes.json
Normal file
234
integrations/processes/dashboards/categraf-processes.json
Normal file
@@ -0,0 +1,234 @@
|
||||
{
|
||||
"name": "Processes by UlricQin",
|
||||
"tags": "Categraf Linux OS",
|
||||
"ident": "",
|
||||
"configs": {
|
||||
"var": [
|
||||
{
|
||||
"name": "Datasource",
|
||||
"label": "",
|
||||
"type": "datasource",
|
||||
"definition": "prometheus",
|
||||
"defaultValue": 37
|
||||
},
|
||||
{
|
||||
"name": "ident",
|
||||
"label": "Host",
|
||||
"type": "query",
|
||||
"datasource": {
|
||||
"cate": "prometheus",
|
||||
"value": "${Datasource}"
|
||||
},
|
||||
"definition": "label_values(processes_running, ident)",
|
||||
"multi": true,
|
||||
"allOption": true
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"type": "barGauge",
|
||||
"id": "adc3f1d3-6d0d-4c1e-80ca-5b6d8103bac5",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"i": "adc3f1d3-6d0d-4c1e-80ca-5b6d8103bac5",
|
||||
"isResizable": true
|
||||
},
|
||||
"version": "3.0.0",
|
||||
"datasourceCate": "prometheus",
|
||||
"datasourceValue": "${Datasource}",
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "processes_running{ident=~\"$ident\"}",
|
||||
"legend": "{{ident}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {}
|
||||
}
|
||||
],
|
||||
"name": "Running Processes",
|
||||
"custom": {
|
||||
"calc": "lastNotNull",
|
||||
"baseColor": "#9470FF",
|
||||
"serieWidth": 20,
|
||||
"sortOrder": "desc"
|
||||
},
|
||||
"options": {
|
||||
"valueMappings": [
|
||||
{
|
||||
"type": "range",
|
||||
"result": {
|
||||
"color": "#f10808"
|
||||
},
|
||||
"match": {
|
||||
"from": 50
|
||||
}
|
||||
}
|
||||
],
|
||||
"standardOptions": {}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "barGauge",
|
||||
"id": "659f5f75-24ca-493c-97cb-3d99abd52172",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0,
|
||||
"i": "df457bf0-17c8-4d05-a527-cfaf0f2b844c",
|
||||
"isResizable": true
|
||||
},
|
||||
"version": "3.0.0",
|
||||
"datasourceCate": "prometheus",
|
||||
"datasourceValue": "${Datasource}",
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "processes_total{ident=~\"$ident\"}",
|
||||
"legend": "{{ident}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {}
|
||||
}
|
||||
],
|
||||
"name": "Total Processes",
|
||||
"custom": {
|
||||
"calc": "lastNotNull",
|
||||
"baseColor": "#9470FF",
|
||||
"serieWidth": 20,
|
||||
"sortOrder": "desc"
|
||||
},
|
||||
"options": {
|
||||
"valueMappings": [
|
||||
{
|
||||
"type": "range",
|
||||
"result": {
|
||||
"color": "#f10808"
|
||||
},
|
||||
"match": {
|
||||
"from": 600
|
||||
}
|
||||
}
|
||||
],
|
||||
"standardOptions": {}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "barGauge",
|
||||
"id": "5e849509-1c41-44c7-85ee-d8c0adf7c623",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8,
|
||||
"i": "62291285-be84-470a-9ccc-53be7a8733fd",
|
||||
"isResizable": true
|
||||
},
|
||||
"version": "3.0.0",
|
||||
"datasourceCate": "prometheus",
|
||||
"datasourceValue": "${Datasource}",
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "processes_total_threads{ident=~\"$ident\"}",
|
||||
"legend": "{{ident}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {}
|
||||
}
|
||||
],
|
||||
"name": "Total Threads",
|
||||
"custom": {
|
||||
"calc": "lastNotNull",
|
||||
"baseColor": "#9470FF",
|
||||
"serieWidth": 20,
|
||||
"sortOrder": "desc"
|
||||
},
|
||||
"options": {
|
||||
"valueMappings": [
|
||||
{
|
||||
"type": "range",
|
||||
"result": {
|
||||
"color": "#ff8286"
|
||||
},
|
||||
"match": {
|
||||
"from": 2000
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "range",
|
||||
"result": {
|
||||
"color": "#f30909"
|
||||
},
|
||||
"match": {
|
||||
"from": 4000
|
||||
}
|
||||
}
|
||||
],
|
||||
"standardOptions": {}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "table",
|
||||
"id": "b2850506-6cdd-48cc-9223-70acff9212b0",
|
||||
"layout": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8,
|
||||
"i": "b2850506-6cdd-48cc-9223-70acff9212b0",
|
||||
"isResizable": true
|
||||
},
|
||||
"version": "3.0.0",
|
||||
"datasourceCate": "prometheus",
|
||||
"datasourceValue": "${Datasource}",
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum({__name__=~\"processes_sleeping|processes_dead|processes_paging|processes_total_threads|processes_total|processes_idle|processes_running|processes_zombies|processes_stopped|processes_unknown|processes_blocked\", ident=~\"$ident\"}) by (__name__)",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {}
|
||||
}
|
||||
],
|
||||
"name": "SUM by Process state",
|
||||
"custom": {
|
||||
"showHeader": true,
|
||||
"colorMode": "value",
|
||||
"calc": "lastNotNull",
|
||||
"displayMode": "labelsOfSeriesToRows",
|
||||
"sortColumn": "value",
|
||||
"sortOrder": "descend",
|
||||
"columns": []
|
||||
},
|
||||
"options": {
|
||||
"standardOptions": {}
|
||||
},
|
||||
"overrides": [
|
||||
{}
|
||||
]
|
||||
}
|
||||
],
|
||||
"version": "3.0.0"
|
||||
}
|
||||
}
|
||||
BIN
integrations/processes/icon/linux.png
Normal file
BIN
integrations/processes/icon/linux.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 7.7 KiB |
22
integrations/processes/markdown/readme.md
Normal file
22
integrations/processes/markdown/readme.md
Normal file
@@ -0,0 +1,22 @@
|
||||
## Categraf as collector
|
||||
|
||||
configuration file: `conf/input.processes/processes.toml`
|
||||
|
||||
默认配置如下(一般维持默认不用动):
|
||||
|
||||
```toml
|
||||
# # collect interval
|
||||
# interval = 15
|
||||
|
||||
# # force use ps command to gather
|
||||
# force_ps = false
|
||||
|
||||
# # force use /proc to gather
|
||||
# force_proc = false
|
||||
```
|
||||
|
||||
有两种采集方式,使用 ps 命令,或者直接读取 `/proc` 目录,默认是后者。如果想强制使用 ps 命令来采集,开启 force_ps 即可:
|
||||
|
||||
```
|
||||
force_ps = true
|
||||
```
|
||||
@@ -425,18 +425,6 @@ func AlertNumbers(ctx *ctx.Context, bgids []int64) (map[int64]int64, error) {
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
func AlertCurEventGetAll(ctx *ctx.Context, cluster string) ([]*AlertCurEvent, error) {
|
||||
session := DB(ctx).Model(&AlertCurEvent{})
|
||||
|
||||
if cluster != "" {
|
||||
session = session.Where("cluster = ?", cluster)
|
||||
}
|
||||
|
||||
var lst []*AlertCurEvent
|
||||
err := session.Find(&lst).Error
|
||||
return lst, err
|
||||
}
|
||||
|
||||
func AlertCurEventGetByIds(ctx *ctx.Context, ids []int64) ([]*AlertCurEvent, error) {
|
||||
var lst []*AlertCurEvent
|
||||
|
||||
|
||||
@@ -278,6 +278,10 @@ func (ar *AlertRule) Update(ctx *ctx.Context, arf AlertRule) error {
|
||||
}
|
||||
|
||||
func (ar *AlertRule) UpdateColumn(ctx *ctx.Context, column string, value interface{}) error {
|
||||
if value == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if column == "datasource_ids" {
|
||||
b, err := json.Marshal(value)
|
||||
if err != nil {
|
||||
@@ -286,6 +290,68 @@ func (ar *AlertRule) UpdateColumn(ctx *ctx.Context, column string, value interfa
|
||||
return DB(ctx).Model(ar).UpdateColumn(column, string(b)).Error
|
||||
}
|
||||
|
||||
if column == "severity" {
|
||||
severity := int(value.(float64))
|
||||
if ar.Cate == PROMETHEUS {
|
||||
var ruleConfig PromRuleConfig
|
||||
err := json.Unmarshal([]byte(ar.RuleConfig), &ruleConfig)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(ruleConfig.Queries) != 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
ruleConfig.Queries[0].Severity = severity
|
||||
b, err := json.Marshal(ruleConfig)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return DB(ctx).Model(ar).UpdateColumn("rule_config", string(b)).Error
|
||||
} else if ar.Cate == HOST {
|
||||
var ruleConfig HostRuleConfig
|
||||
err := json.Unmarshal([]byte(ar.RuleConfig), &ruleConfig)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(ruleConfig.Triggers) != 1 {
|
||||
return nil
|
||||
}
|
||||
|
||||
ruleConfig.Triggers[0].Severity = severity
|
||||
|
||||
b, err := json.Marshal(ruleConfig)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return DB(ctx).Model(ar).UpdateColumn("rule_config", string(b)).Error
|
||||
}
|
||||
}
|
||||
|
||||
if column == "runbook_url" {
|
||||
url := value.(string)
|
||||
|
||||
err := json.Unmarshal([]byte(ar.Annotations), &ar.AnnotationsJSON)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if ar.AnnotationsJSON == nil {
|
||||
ar.AnnotationsJSON = make(map[string]string)
|
||||
}
|
||||
|
||||
ar.AnnotationsJSON["runbook_url"] = url
|
||||
|
||||
b, err := json.Marshal(ar.AnnotationsJSON)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return DB(ctx).Model(ar).UpdateColumn("annotations", string(b)).Error
|
||||
}
|
||||
|
||||
return DB(ctx).Model(ar).UpdateColumn(column, value).Error
|
||||
}
|
||||
|
||||
|
||||
@@ -8,11 +8,11 @@ import (
|
||||
)
|
||||
|
||||
type AlertingEngines struct {
|
||||
Id int64 `json:"id" gorm:"primaryKey"`
|
||||
Instance string `json:"instance"`
|
||||
Cluster string `json:"cluster"`
|
||||
DatasourceId int64 `json:"datasource_id"`
|
||||
Clock int64 `json:"clock"`
|
||||
Id int64 `json:"id" gorm:"primaryKey"`
|
||||
Instance string `json:"instance"`
|
||||
EngineCluster string `json:"cluster" gorm:"engine_cluster"`
|
||||
DatasourceId int64 `json:"datasource_id"`
|
||||
Clock int64 `json:"clock"`
|
||||
}
|
||||
|
||||
func (e *AlertingEngines) TableName() string {
|
||||
@@ -107,11 +107,11 @@ func AlertingEngineGet(ctx *ctx.Context, where string, args ...interface{}) (*Al
|
||||
func AlertingEngineGetsClusters(ctx *ctx.Context, where string, args ...interface{}) ([]string, error) {
|
||||
var arr []string
|
||||
var err error
|
||||
session := DB(ctx).Model(new(AlertingEngines)).Where("cluster != ''").Order("cluster").Distinct("cluster")
|
||||
session := DB(ctx).Model(new(AlertingEngines)).Where("engine_cluster != ''").Order("engine_cluster").Distinct("engine_cluster")
|
||||
if where == "" {
|
||||
err = session.Pluck("cluster", &arr).Error
|
||||
err = session.Pluck("engine_cluster", &arr).Error
|
||||
} else {
|
||||
err = session.Where(where, args...).Pluck("cluster", &arr).Error
|
||||
err = session.Where(where, args...).Pluck("engine_cluster", &arr).Error
|
||||
}
|
||||
return arr, err
|
||||
}
|
||||
@@ -130,7 +130,7 @@ func AlertingEngineGetsInstances(ctx *ctx.Context, where string, args ...interfa
|
||||
|
||||
func AlertingEngineHeartbeatWithCluster(ctx *ctx.Context, instance, cluster string, datasourceId int64) error {
|
||||
var total int64
|
||||
err := DB(ctx).Model(new(AlertingEngines)).Where("instance=? and cluster = ? and datasource_id=?", instance, cluster, datasourceId).Count(&total).Error
|
||||
err := DB(ctx).Model(new(AlertingEngines)).Where("instance=? and engine_cluster = ? and datasource_id=?", instance, cluster, datasourceId).Count(&total).Error
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -138,15 +138,15 @@ func AlertingEngineHeartbeatWithCluster(ctx *ctx.Context, instance, cluster stri
|
||||
if total == 0 {
|
||||
// insert
|
||||
err = DB(ctx).Create(&AlertingEngines{
|
||||
Instance: instance,
|
||||
DatasourceId: datasourceId,
|
||||
Cluster: cluster,
|
||||
Clock: time.Now().Unix(),
|
||||
Instance: instance,
|
||||
DatasourceId: datasourceId,
|
||||
EngineCluster: cluster,
|
||||
Clock: time.Now().Unix(),
|
||||
}).Error
|
||||
} else {
|
||||
// updates
|
||||
fields := map[string]interface{}{"clock": time.Now().Unix()}
|
||||
err = DB(ctx).Model(new(AlertingEngines)).Where("instance=? and cluster = ? and datasource_id=?", instance, cluster, datasourceId).Updates(fields).Error
|
||||
err = DB(ctx).Model(new(AlertingEngines)).Where("instance=? and engine_cluster = ? and datasource_id=?", instance, cluster, datasourceId).Updates(fields).Error
|
||||
}
|
||||
|
||||
return err
|
||||
|
||||
@@ -12,6 +12,7 @@ type HostMeta struct {
|
||||
MemUtil float64 `json:"mem_util"`
|
||||
Offset int64 `json:"offset"`
|
||||
UnixTime int64 `json:"unixtime"`
|
||||
RemoteAddr string `json:"remote_addr"`
|
||||
}
|
||||
|
||||
func (h HostMeta) MarshalBinary() ([]byte, error) {
|
||||
|
||||
@@ -22,14 +22,15 @@ type Target struct {
|
||||
TagsMap map[string]string `json:"-" gorm:"-"` // internal use, append tags to series
|
||||
UpdateAt int64 `json:"update_at"`
|
||||
|
||||
UnixTime int64 `json:"unixtime" gorm:"-"`
|
||||
Offset int64 `json:"offset" gorm:"-"`
|
||||
TargetUp float64 `json:"target_up" gorm:"-"`
|
||||
MemUtil float64 `json:"mem_util" gorm:"-"`
|
||||
CpuNum int `json:"cpu_num" gorm:"-"`
|
||||
CpuUtil float64 `json:"cpu_util" gorm:"-"`
|
||||
OS string `json:"os" gorm:"-"`
|
||||
Arch string `json:"arch" gorm:"-"`
|
||||
UnixTime int64 `json:"unixtime" gorm:"-"`
|
||||
Offset int64 `json:"offset" gorm:"-"`
|
||||
TargetUp float64 `json:"target_up" gorm:"-"`
|
||||
MemUtil float64 `json:"mem_util" gorm:"-"`
|
||||
CpuNum int `json:"cpu_num" gorm:"-"`
|
||||
CpuUtil float64 `json:"cpu_util" gorm:"-"`
|
||||
OS string `json:"os" gorm:"-"`
|
||||
Arch string `json:"arch" gorm:"-"`
|
||||
RemoteAddr string `json:"remote_addr" gorm:"-"`
|
||||
}
|
||||
|
||||
func (t *Target) TableName() string {
|
||||
@@ -296,6 +297,7 @@ func (t *Target) FillMeta(meta *HostMeta) {
|
||||
t.Offset = meta.Offset
|
||||
t.OS = meta.OS
|
||||
t.Arch = meta.Arch
|
||||
t.RemoteAddr = meta.RemoteAddr
|
||||
}
|
||||
|
||||
func TargetIdents(ctx *ctx.Context, ids []int64) ([]string, error) {
|
||||
|
||||
@@ -4,6 +4,7 @@ import "github.com/ccfos/nightingale/v6/pkg/ctx"
|
||||
|
||||
type TaskRecord struct {
|
||||
Id int64 `json:"id" gorm:"primaryKey"`
|
||||
EventId int64 `json:"event_id"`
|
||||
GroupId int64 `json:"group_id"`
|
||||
IbexAddress string `json:"ibex_address"`
|
||||
IbexAuthUser string `json:"ibex_auth_user"`
|
||||
|
||||
Reference in New Issue
Block a user