Compare commits

..

18 Commits

Author SHA1 Message Date
kongfei
dbc9a4fe58 chmod 755 wait for aarch64 2023-04-28 16:27:16 +08:00
青牛踏雪
0f4478318e update victoriametrics url links. @cyancow (#1516)
* add taoskeeper 3.x dashboard

* modify directory name taos to TDEngine

* add kubernetes dashboard based on categraf collection.

* add apiserver kubelet node alerts to k8s

* modify node name to node-exporter

* add victoriametrics dashboard based on categraf collection.

* up victoriametrics url links.

* Update README.md

---------

Co-authored-by: ulricqin <ulricqin@qq.com>
2023-04-27 21:28:04 +08:00
Ulric Qin
c0d0eb0e69 code refactor 2023-04-27 21:22:48 +08:00
Ulric Qin
b62762b2e6 Merge branch 'main' of github.com:ccfos/nightingale 2023-04-27 21:20:39 +08:00
ning
810ca0e469 Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-04-27 21:19:58 +08:00
青牛踏雪
33e3b224b9 add victoriametrics dashboard based on categraf collection. (#1515)
* add taoskeeper 3.x dashboard

* modify directory name taos to TDEngine

* add kubernetes dashboard based on categraf collection.

* add apiserver kubelet node alerts to k8s

* modify node name to node-exporter

* add victoriametrics dashboard based on categraf collection.
2023-04-27 21:19:46 +08:00
ning
24d7b2b1bf update dockerfile 2023-04-27 21:19:42 +08:00
Ulric Qin
1d5ff1b28d code refactor 2023-04-27 21:18:41 +08:00
ning
ed5c8c5758 fix Dockerfile 2023-04-27 20:23:28 +08:00
Ulric Qin
01f7860900 code refactor 2023-04-27 20:22:23 +08:00
Ulric Qin
a6bb03c8ba update http dash 2023-04-27 20:20:39 +08:00
Ulric Qin
e9150b2ae0 rename dir of net_response 2023-04-27 20:19:17 +08:00
Ulric Qin
30d1ebd808 update http icon and n9e icon 2023-04-27 19:51:58 +08:00
Ulric Qin
2f69d92055 add markdown readme of procstat 2023-04-27 19:46:47 +08:00
Ulric Qin
deeb40b4a0 Merge branch 'main' of github.com:ccfos/nightingale 2023-04-27 19:35:10 +08:00
Ulric Qin
37f68fd52b add procstat integrations 2023-04-27 19:34:57 +08:00
ning
73828e50b5 update fe.sh 2023-04-27 18:23:40 +08:00
kongfei605
7e73850117 Merge pull request #1514 from ccfos/docker_release
update dockerfile for github-action
2023-04-27 18:08:00 +08:00
20 changed files with 3506 additions and 75 deletions

View File

@@ -6,6 +6,7 @@ ADD n9e /app/
ADD etc /app/
ADD integrations /app/integrations/
ADD --chmod=755 https://github.com/ufoscout/docker-compose-wait/releases/download/2.11.0/wait_x86_64 /wait
RUN chmod +x /wait
ADD pub /app/pub/
EXPOSE 17000

View File

@@ -1,3 +1,4 @@
FROM flashcatcloud/toolbox:v0.0.1 as toolbox
FROM --platform=$TARGETPLATFORM python:3-slim
@@ -5,8 +6,8 @@ WORKDIR /app
ADD n9e /app/
ADD etc /app/
ADD integrations /app/integrations/
ADD --chmod=755 https://github.com/ufoscout/docker-compose-wait/releases/download/2.11.0/wait_aarch64 /wait
ADD pub /app/pub/
COPY --chmod=755 --from=toolbox /toolbox/wait_aarch64 /wait
EXPOSE 17000

5
fe.sh
View File

@@ -8,8 +8,3 @@ curl -o n9e-fe-${VERSION}.tar.gz -L https://github.com/n9e/fe/releases/download/
tar zxvf n9e-fe-${VERSION}.tar.gz
cp ./docker/initsql/a-n9e.sql n9e.sql
TAG=$(curl -sX GET https://api.github.com/repos/ccfos/nightingale/releases/latest | awk '/tag_name/{print $4;exit}' FS='[""]')
VERSION=$(echo $TAG)
VERSION=${VERSION/v/}
sed -i "s/nightingale:latest/nightingale:${VERSION}/g" ./docker/docker-compose.yaml

Binary file not shown.

After

Width:  |  Height:  |  Size: 888 B

View File

@@ -1,41 +1,68 @@
{
"name": "http detect",
"name": "HTTP detect by UlricQin",
"tags": "",
"ident": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "0cd7c8aa-456c-4522-97ef-0b1710e7af8a",
"layout": {
"h": 1,
"i": "0cd7c8aa-456c-4522-97ef-0b1710e7af8a",
"w": 24,
"x": 0,
"y": 0
},
"name": "Default chart group",
"type": "row"
},
{
"custom": {
"aggrDimension": "target",
"calc": "lastNotNull",
"displayMode": "labelValuesToRows",
"showHeader": true
},
"type": "table",
"id": "3674dbfa-243a-49f6-baa5-b7f887c1afb0",
"layout": {
"h": 15,
"i": "3674dbfa-243a-49f6-baa5-b7f887c1afb0",
"w": 24,
"x": 0,
"y": 1
"y": 0,
"i": "3674dbfa-243a-49f6-baa5-b7f887c1afb0",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"expr": "max(http_response_result_code) by (target)",
"legend": "UP?",
"refId": "A",
"instant": true
},
{
"expr": "max(http_response_response_code) by (target)",
"legend": "status code",
"refId": "B",
"instant": true
},
{
"expr": "max(http_response_response_time) by (target) *1000",
"legend": "latency",
"refId": "C",
"instant": true
},
{
"expr": "max(http_response_cert_expire_timestamp) by (target) - time()",
"legend": "cert expire",
"refId": "D",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "URL Details",
"custom": {
"showHeader": true,
"colorMode": "background",
"calc": "lastNotNull",
"displayMode": "labelValuesToRows",
"aggrDimension": "target",
"sortColumn": "target",
"sortOrder": "ascend"
},
"options": {
"standardOptions": {},
"valueMappings": []
"valueMappings": [],
"standardOptions": {}
},
"overrides": [
{
@@ -76,43 +103,115 @@
"properties": {
"standardOptions": {
"util": "humantimeSeconds"
}
},
"valueMappings": [
{
"type": "range",
"result": {
"color": "#f60c0c"
},
"match": {
"to": 604800
}
},
{
"type": "range",
"result": {
"color": "#ffae39"
},
"match": {
"to": 2592000
}
}
]
},
"type": "special"
},
{
"type": "special",
"matcher": {
"value": "B"
},
"properties": {
"valueMappings": [
{
"type": "range",
"result": {
"color": "#2c9d3d"
},
"match": {
"to": 399
}
},
{
"type": "range",
"result": {
"color": "#ff656b"
},
"match": {
"to": 499
}
},
{
"type": "range",
"result": {
"color": "#f10808"
},
"match": {
"from": 500
}
}
],
"standardOptions": {}
}
},
{
"type": "special",
"matcher": {
"value": "C"
},
"properties": {
"valueMappings": [
{
"type": "range",
"result": {
"color": "#2c9d3d"
},
"match": {
"to": 400
}
},
{
"type": "range",
"result": {
"color": "#ff656b"
},
"match": {
"from": 400
}
},
{
"type": "range",
"result": {
"color": "#f11313"
},
"match": {
"from": 2000
}
}
],
"standardOptions": {
"util": "milliseconds"
}
}
}
],
"targets": [
{
"expr": "max(http_response_result_code) by (target)",
"legend": "UP?",
"refId": "A"
},
{
"expr": "max(http_response_response_code) by (target)",
"legend": "status code",
"refId": "B"
},
{
"expr": "max(http_response_response_time) by (target)",
"legend": "latency(s)",
"refId": "C"
},
{
"expr": "max(http_response_cert_expire_timestamp) by (target) - time()",
"legend": "cert expire",
"refId": "D"
}
],
"type": "table",
"version": "2.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}"
]
}
],
"version": "3.0.0",
"var": [
{
"name": "prom",
"name": "Datasource",
"type": "datasource",
"definition": "prometheus"
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 KiB

After

Width:  |  Height:  |  Size: 975 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 23 KiB

View File

@@ -20,3 +20,4 @@ configuration file: `conf/input.processes/processes.toml`
```
force_ps = true
```

View File

@@ -0,0 +1,62 @@
[
{
"name": "there is a process count of 0, indicating that a certain process may have crashed",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "procstat_lookup_count == 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "process handle limit is too low",
"note": "",
"severity": 3,
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "procstat_rlimit_num_fds_soft < 2048",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"callbacks": [],
"runbook_url": "",
"append_tags": []
}
]

View File

@@ -0,0 +1,642 @@
{
"name": "Procstat by UlricQin",
"tags": "Categraf OS",
"ident": "",
"configs": {
"var": [
{
"name": "Datasource",
"type": "datasource",
"definition": "prometheus",
"defaultValue": 37
},
{
"name": "ident",
"label": "Host",
"type": "query",
"datasource": {
"cate": "prometheus",
"value": "${Datasource}"
},
"definition": "label_values(system_load_norm_1, ident)",
"multi": true,
"allOption": true
},
{
"name": "search_string",
"label": "Proc",
"type": "query",
"datasource": {
"cate": "prometheus",
"value": "${Datasource}"
},
"definition": "label_values(procstat_lookup_count{ident=~\"$ident\"}, search_string)",
"multi": true,
"allOption": true
}
],
"panels": [
{
"type": "stat",
"id": "be9aac6c-4401-4c61-8c43-574cf314ffef",
"layout": {
"h": 4,
"w": 5,
"x": 0,
"y": 0,
"i": "be9aac6c-4401-4c61-8c43-574cf314ffef",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "procstat_lookup_count{ident=~\"$ident\", search_string=~\"$search_string\"}",
"legend": "{{ident}} {{search_string}}",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Proc Count Now",
"custom": {
"textMode": "valueAndName",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 1,
"textSize": {}
},
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
}
},
{
"type": "table",
"id": "da621e2c-ae2b-4375-9a66-2bec7832490b",
"layout": {
"h": 4,
"w": 9,
"x": 5,
"y": 0,
"i": "79db82d9-5f46-4c45-bb9f-c23f94d99e0a",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "procstat_lookup_count{ident=~\"$ident\", search_string=~\"$search_string\"}",
"legend": "{{ident}} {{search_string}}",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Proc Count Table",
"custom": {
"showHeader": true,
"colorMode": "background",
"calc": "lastNotNull",
"displayMode": "labelsOfSeriesToRows",
"columns": [
"ident",
"search_string",
"value"
],
"sortColumn": "ident",
"sortOrder": "ascend"
},
"options": {
"valueMappings": [
{
"type": "special",
"result": {
"color": "#fa0c0c"
},
"match": {
"special": 0
}
},
{
"type": "range",
"result": {
"color": "#2c9d3d"
},
"match": {
"from": 1
}
}
],
"standardOptions": {}
},
"overrides": [
{}
]
},
{
"type": "timeseries",
"id": "881c04fd-8804-432e-9b34-b4761590de20",
"layout": {
"h": 4,
"w": 10,
"x": 14,
"y": 0,
"i": "24b55362-d900-43c0-98d5-f2e994bf22a6",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "procstat_lookup_count{ident=~\"$ident\", search_string=~\"$search_string\"}",
"legend": "{{ident}} {{search_string}}",
"instant": false
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Proc Count Trend",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "stat",
"id": "ffeb0fc6-ee02-4fdd-a8e3-ec2b9db9c23c",
"layout": {
"h": 4,
"w": 5,
"x": 0,
"y": 4,
"i": "acd6e7b5-99f5-4d9b-9124-8072c14e5fea",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "procstat_uptime_minimum{ident=~\"$ident\", search_string=~\"$search_string\"}",
"legend": "{{ident}} {{search_string}}",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Proc Uptime",
"custom": {
"textMode": "valueAndName",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 1,
"textSize": {}
},
"options": {
"standardOptions": {
"util": "humantimeSeconds"
},
"thresholds": {
"steps": [
{
"color": "#2c9d3d",
"value": null,
"type": "base"
}
]
}
}
},
{
"type": "timeseries",
"id": "c642a30a-da86-402c-87bf-c2f98616bf95",
"layout": {
"h": 4,
"w": 9,
"x": 5,
"y": 4,
"i": "c642a30a-da86-402c-87bf-c2f98616bf95",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "procstat_cpu_usage_total{ident=~\"$ident\", search_string=~\"$search_string\"}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CPU Util",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "cbc2444e-49c7-45e1-b64e-cd1282b5a419",
"layout": {
"h": 4,
"w": 10,
"x": 14,
"y": 4,
"i": "198846a2-4794-4ba9-9c2d-137bce22f266",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "procstat_mem_usage_total{ident=~\"$ident\", search_string=~\"$search_string\"}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Mem Util",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "d2bff162-5801-4d85-94d7-d63145d5b935",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 8,
"i": "a208e192-cf74-468b-9bcb-cb81c8d78d24",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "procstat_num_fds_total{ident=~\"$ident\", search_string=~\"$search_string\"}/procstat_rlimit_num_fds_soft_minimum{ident=~\"$ident\", search_string=~\"$search_string\"}*100"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "FD soft Util",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "46a71143-84b5-4dde-87db-2f0403df6519",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 8,
"i": "22dfb5e4-1d17-4e06-a9b4-b25cb33d1c20",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "procstat_num_fds_total{ident=~\"$ident\", search_string=~\"$search_string\"}/procstat_rlimit_num_fds_hard_minimum{ident=~\"$ident\", search_string=~\"$search_string\"}*100"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "FD hard Util",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "3dda4eb5-a27f-4d54-9547-ae8f0ac9bb96",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 12,
"i": "3dda4eb5-a27f-4d54-9547-ae8f0ac9bb96",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "procstat_read_bytes_total{ident=~\"$ident\", search_string=~\"$search_string\"}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Read bytes",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "c97403f4-618d-4037-8ea7-5deb32eb8d56",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 12,
"i": "ae0dc449-8263-4f38-8c52-d50b3cb3f1b4",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"refId": "A",
"expr": "procstat_write_bytes_total{ident=~\"$ident\", search_string=~\"$search_string\"}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Write bytes",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
}
],
"version": "3.0.0"
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 848 B

View File

@@ -0,0 +1,80 @@
## Categraf as collector
configuration file: `conf/input.procstat/procstat.toml`
进程监控插件有两个核心作用:监控进程是否存活,以及监控进程使用了多少资源(CPU、内存、文件句柄等)。
### 存活监控
如果进程监听了端口,就直接用 net_response 来做存活性监控即可,无需使用 procstat 来做,因为:端口在监听,说明进程一定活着,反之则不一定。
### 进程筛选
机器上进程很多,我们要做进程监控,就要想办法告诉 Categraf 要监控哪些进程,通过 search 打头的那几个配置,可以做进程过滤筛选:
```toml
[[instances]]
# # executable name (ie, pgrep <search_exec_substring>)
search_exec_substring = "nginx"
# # pattern as argument for pgrep (ie, pgrep -f <search_cmdline_substring>)
# search_cmdline_substring = "n9e server"
# # windows service name
# search_win_service = ""
```
上面三个 search 相关的配置,每个采集目标选用其中一个。另有一个额外的配置 search_user,配合 search_exec_substring 或者 search_cmdline_substring 使用,表示匹配指定 username 的特定进程。如果不需要指定 username,保持该配置注释即可。
```toml
# # search process with specific user, option with exec_substring or cmdline_substring
# search_user = ""
```
默认的进程监控的配置,`[[instances]]` 是注释掉的,记得打开。
### mode
mode 配置有两个值供选择,一个是 solaris,一个是 irix,默认是 irix。用这个配置来决定使用哪种 cpu 使用率的计算方法:
```go
// gatherCPU pushes one "cpu_usage" sample into slist for every process in
// procs, labelled with that process's pid, and, when ins.GatherTotal is set,
// one additional "cpu_usage_total" sample holding the sum of the reported values.
//
// When solarisMode is true each per-process percentage is divided by
// runtime.NumCPU() before being reported and summed; otherwise the value
// returned by Percent is used unchanged.
func (ins *Instance) gatherCPU(slist *types.SampleList, procs map[PID]Process, tags map[string]string, solarisMode bool) {
// value accumulates the per-process figures that back cpu_usage_total.
var value float64
for pid := range procs {
v, err := procs[pid].Percent(time.Duration(0))
// A process whose Percent call fails is skipped: no sample is pushed
// for it and it does not contribute to the total.
if err == nil {
if solarisMode {
// Report the percentage divided by the number of CPUs.
value += v / float64(runtime.NumCPU())
slist.PushFront(types.NewSample("cpu_usage", v/float64(runtime.NumCPU()), map[string]string{"pid": fmt.Sprint(pid)}, tags))
} else {
// Report the percentage exactly as Percent returned it.
value += v
slist.PushFront(types.NewSample("cpu_usage", v, map[string]string{"pid": fmt.Sprint(pid)}, tags))
}
}
}
// The aggregate sample is emitted only when total gathering is enabled.
if ins.GatherTotal {
slist.PushFront(types.NewSample("cpu_usage_total", value, tags))
}
}
```
### gather_total
比如进程名字是 mysql 的进程,同时可能运行了多个,我们想知道这个机器上的所有 mysql 的进程占用的总的 cpu、mem、fd 等,就设置 gather_total = true。当然,对于 uptime 和 limit 的采集,gather_total 的时候取的是多个进程的最小值。
### gather_per_pid
还是拿 mysql 举例,一个机器上可能同时运行了多个,我们可能想知道每个 mysql 进程的资源占用情况,此时就要启用 gather_per_pid 的配置,设置为 true,此时会采集每个进程的资源占用情况,并附上 pid 作为标签来区分。
### gather_more_metrics
默认 procstat 插件只是采集进程数量,如果想采集进程占用的资源,就要启用 gather_more_metrics 中的项,启用哪个就额外采集哪个
### jvm
gather_more_metrics 中有个 jvm,如果是 Java 的进程可以选择开启,非 Java 的进程就不要开启了。需要注意的是,这个监控需要依赖机器上的 jstat 命令,这是社区小伙伴贡献的采集代码,感谢 [@lsy1990](https://github.com/lsy1990)。
### One more thing
要监控什么进程就去目标机器修改 Categraf 的配置 `conf/input.procstat/procstat.toml` ,如果嫌麻烦,可以联系我们采购专业版,专业版支持在服务端 WEB 上统一做配置,不需要登录目标机器修改 Categraf 的配置。

View File

@@ -0,0 +1,55 @@
[
{
"cate": "prometheus",
"name": "VictoriaMetrics服务宕机",
"note": "当前采集所有实例存活状态",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "up{service=~\"vm.+\"} < 1",
"prom_eval_interval": 60,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
}
]

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

View File

@@ -0,0 +1,47 @@
## VictoriaMetrics Dashboard & Alerts
使用[categraf](https://github.com/flashcatcloud/categraf)中[inputs.prometheus](https://github.com/flashcatcloud/categraf/tree/main/inputs/prometheus)插件采集[VictoriaMetrics](https://docs.victoriametrics.com/)三个服务组件默认暴露的指标数据:
写入模块: `vminsert` 端口:`8480` URI:`metrics`
查询模块: `vmselect` 端口:`8481` URI:`metrics`
存储模块: `vmstorage` 端口:`8482` URI:`metrics`
### 配置文件示例:
其中label_key: `instance` label: `service` 为[dashboard](../dashboard/victoriametrics.json)中选择变量,如果有特殊需求,可自行修改或者添加;
```toml
# vmstorage
[[instances]]
urls = [
"http://127.0.0.1:8482/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="vmstorage"}
# vmselect
[[instances]]
urls = [
"http://127.0.0.1:8481/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="vmselect"}
# vminsert
[[instances]]
urls = [
"http://127.0.0.1:8480/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = {service="vminsert"}
```
### 告警规则
[alerts](../alerts/alerts.json)