Compare commits

..

290 Commits

Author SHA1 Message Date
ning
ff40fe3518 refactor stats 2023-07-20 16:36:40 +08:00
青牛踏雪
bb5680f6c4 fix windows dashboards label_values (#1639) 2023-07-20 11:49:20 +08:00
李明
acbe49f518 index pattern basic op (#1635)
* index pattern basic op
2023-07-20 11:21:51 +08:00
青牛踏雪
9dd55938c2 add Gitlab dashboard and alert rules based on categraf acquisition (#1636) 2023-07-20 11:17:21 +08:00
shardingHe
5433e6e27e AlertAggrView update verify (#1637)
Co-authored-by: shardingHe <wangzihe@flashcat.cloud>
2023-07-19 20:33:56 +08:00
ning
2dd6eb5f0f fix: get targets 2023-07-19 15:56:25 +08:00
Yening Qin
1731713dbb fix: get all target by guest user (#1634)
* fix: targets api get all

* code refactor
2023-07-19 15:08:10 +08:00
Ulric Qin
327ddb7bad code refactor 2023-07-17 19:47:58 +08:00
Ulric Qin
9e4adc1fa2 code refactor 2023-07-17 17:47:14 +08:00
Ulric Qin
bce7fdb470 code refactor 2023-07-17 17:13:56 +08:00
Ulric Qin
b79422962c code refactor 2023-07-17 17:13:39 +08:00
Ulric Qin
e5989ae5c2 rename integration Mongo to MongoDB 2023-07-17 17:11:27 +08:00
Ulric Qin
64feafa3a6 code refactor 2023-07-17 12:18:15 +08:00
Ulric Qin
52e4fa4d0d rename obs to dumper 2023-07-17 07:04:01 +08:00
Ulric Qin
6462c02861 rename obs to dumper 2023-07-17 07:01:19 +08:00
Ulric Qin
c657182659 refactor forward series 2023-07-16 11:35:56 +08:00
Ulric Qin
04d93eff34 refactor observe functions 2023-07-16 10:29:55 +08:00
Ulric Qin
40d60aeb4a add observe 2023-07-16 10:06:41 +08:00
Ulric Qin
ac875fa1b9 fix logger format output 2023-07-16 06:38:57 +08:00
shardingHe
b7c3e8a4f5 add interface of validation rule (#1606)
* add interface of validation rule

---------

Co-authored-by: shardingHe <wangzihe@flashcat.cloud>
Co-authored-by: Yening Qin <710leo@gmail.com>
2023-07-14 14:16:35 +08:00
ning
2524e15947 Merge branch 'main' of github.com:ccfos/nightingale 2023-07-14 11:45:07 +08:00
ning
995c579403 docs: update built-in alert rule 2023-07-14 11:44:55 +08:00
Ulric Qin
848b7ac1ae Merge branch 'main' of github.com:ccfos/nightingale 2023-07-14 11:37:50 +08:00
Ulric Qin
9476b5ba7c code refactor 2023-07-14 11:37:39 +08:00
ning
7b58696bdc Merge branch 'main' of github.com:ccfos/nightingale 2023-07-14 11:16:19 +08:00
ning
6159178d99 set alert_rule.promql empty 2023-07-14 11:16:06 +08:00
青牛踏雪
99e5e0c117 add MinIO dashboard and alert rules based on categraf acquisition (#1625)
* add MinIO  dashboard and alert rules based on categraf acquisition

* add MinIO dashboard and alert rules based on categraf acquisition

* add MinIO dashboard and alert rules based on categraf acquisition

* add MinIO dashboard and alert rules based on categraf acquisition
2023-07-14 10:37:35 +08:00
Yening Qin
be1a3c1d8b sub and mute rule by severity (#1621)
* sub severity

* mute by severity
2023-07-13 11:16:32 +08:00
Yening Qin
f6378b055c docs: optimize the name of the integrations directory 2023-07-12 18:09:05 +08:00
Yening Qin
2574bb19cd rename integration ceph 2023-07-12 17:59:59 +08:00
Yening Qin
aa9d43cc69 rename integration 2023-07-12 17:58:20 +08:00
李明
d7f18ebec1 add mute hook (#1617) 2023-07-12 16:39:43 +08:00
Ulric Qin
b40f6976bb code refactor 2023-07-12 15:31:09 +08:00
Ulric Qin
cd1db57b7c code refactor 2023-07-12 15:05:58 +08:00
青牛踏雪
5a6ca42c75 add ceph dashboard and alert rules based on categraf acquisition (#1619) 2023-07-11 15:08:00 +08:00
Ulric Qin
80874a743c refactor logic: do not extract ident when ignore_ident exists 2023-07-11 14:56:53 +08:00
ulricqin
6cc612564f fix alert mute compute (#1618) 2023-07-11 10:12:09 +08:00
Yening Qin
909bbb5e66 refactor alert eval (#1616) 2023-07-10 18:49:15 +08:00
青牛踏雪
ff3ea7de58 update postgresql dashboard and alert rules based on categraf acquisition (#1613) 2023-07-07 15:09:27 +08:00
kongfei605
dd316e6ce1 alerts rule and dashboards for pg (#1612) 2023-07-07 14:04:55 +08:00
kongfei605
ba893e77cd update title of tidb alerts (#1611) 2023-07-07 14:04:15 +08:00
青牛踏雪
21904f1e39 add kafka dashboard and alert rules based on categraf acquisition (#1607)
* add kafka dashboard and alert rules based on categraf acquisition

* add kafka dashboard and alert rules based on categraf acquisition
2023-07-06 19:54:37 +08:00
kongfei605
b5d5ecbab2 Merge pull request #1605 from longzhuquan/main
添加TiDB大盘,告警规则
2023-07-06 16:11:19 +08:00
Talon
ee612908ac feat(Login): add rsa to password (#1604) 2023-07-06 16:09:04 +08:00
Yong Wang (IT)
2ee04dffac 添加TiDB大盘,告警规则 2023-07-06 15:54:39 +08:00
青牛踏雪
be25adf990 add dashboard and alert rules based on categraf acquisition (#1603) 2023-07-06 15:50:53 +08:00
dependabot[bot]
ab72b6e1ba build(deps): bump google.golang.org/grpc from 1.51.0 to 1.53.0 (#1602) 2023-07-06 15:50:04 +08:00
laiwei
a4718e7a45 use star-history 2023-07-04 11:26:01 +08:00
青牛踏雪
f948d50d8b add springboot actuator 2.0 dashboard (#1601) 2023-07-03 19:38:40 +08:00
Ulric Qin
cb797d5913 Merge branch 'main' of github.com:ccfos/nightingale 2023-07-03 19:13:34 +08:00
Ulric Qin
8941c192de code refactor 2023-07-03 19:13:23 +08:00
alick-liming
5b726c1e61 optimize i18n format (#1600) 2023-07-01 15:45:59 +08:00
xtan
03871a0bf0 feat: provide alert info to ibex via stdin (#1599)
* feat: provide alert info to ibex via stdin

* refactor: rename tags to stdin

* refactor: format json to ibex
2023-06-30 19:03:56 +08:00
青牛踏雪
e002e9cb8f add VictoriaMetrics New Alerts Rule & add VictoriaMetrics Images. (#1598) 2023-06-29 15:47:48 +08:00
qifenggang
d414831c79 heartbeat update target table update_at field (#1595)
Co-authored-by: qifenggang <qifenggang@sina.com>
2023-06-29 15:47:11 +08:00
alick-liming
89807ada94 i18n const -> var (#1594) 2023-06-28 22:24:53 +08:00
青牛踏雪
351a31b079 fix ipmi readme.md (#1592)
* fix ipmi readme.md

* fix ipmi readme.md
2023-06-28 14:25:12 +08:00
青牛踏雪
af0127c905 add the ipmi dashboards & alerts rules (#1588) 2023-06-27 21:07:10 +08:00
青牛踏雪
95612e7140 add the kube-state-metrics, prometheus, kube-controller-plane alarm & record rules (#1586) 2023-06-26 16:45:45 +08:00
ning
a338b5233c code refactor 2023-06-25 20:30:58 +08:00
ning
ad26225f63 refactor: recording rule model 2023-06-25 19:14:19 +08:00
ning
16db570f18 refactor datasource 2023-06-25 17:04:02 +08:00
ning
97c68360a1 Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-06-25 10:23:16 +08:00
ning
00192b9d0f code refactor 2023-06-25 10:23:04 +08:00
Ulric Qin
e745253d08 refactor integrations and add configuration: UseFileAssets 2023-06-22 18:27:25 +08:00
ning
76905c55d5 refactor: loki datasource check 2023-06-21 21:36:21 +08:00
kongfei605
d4bce5456b snmp & smart dashboards (#1581)
* snmp & smart dashboards

* update

* update snmp
2023-06-21 20:42:50 +08:00
Ulric Qin
58136d30e6 code refactor 2023-06-21 14:59:38 +08:00
Ulric Qin
563fb0330a code refactor 2023-06-21 14:35:36 +08:00
Ulric Qin
c2ab3b4240 Merge branch 'main' of github.com:ccfos/nightingale 2023-06-21 14:05:22 +08:00
Ulric Qin
f5dde6e4d6 fix wrong descriptions 2023-06-21 14:05:08 +08:00
青牛踏雪
a9779703dd add AliYun monitor dashboard & readme.md (#1579) 2023-06-20 15:37:08 +08:00
青牛踏雪
9f4a9e77ae add vmware RabbitMQ monitor dashboard & alerts & readme.md (#1578) 2023-06-20 13:45:11 +08:00
Ulric Qin
df37071c3d refactor pushgw 2023-06-20 10:34:14 +08:00
ning
fa164ac5d2 Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-06-16 18:17:21 +08:00
ning
f5de4c3f22 refactor db2fe 2023-06-16 18:17:14 +08:00
ning
dd9099af0a refactor db2fe 2023-06-16 18:13:28 +08:00
dependabot[bot]
5bdb63a818 build(deps): bump golang.org/x/image (#1575)
Bumps [golang.org/x/image](https://github.com/golang/image) from 0.0.0-20190501045829-6d32002ffd75 to 0.5.0.
- [Commits](https://github.com/golang/image/commits/v0.5.0)

---
updated-dependencies:
- dependency-name: golang.org/x/image
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-16 17:58:27 +08:00
ning
8a4c709e87 refactor models 2023-06-16 17:44:26 +08:00
xtan
75f6e07c40 feat: add verification code for login (#1566)
* feat: add verification code for login

* feat: 支持图形验证码开关
2023-06-16 14:47:11 +08:00
Yening Qin
de9b11a049 recording rule add query configs (#1574)
* add query config

* migrate table
2023-06-16 14:40:54 +08:00
ning
067b3f91a7 refactor: change default notify tpl 2023-06-15 19:21:18 +08:00
ning
5d215a89b6 refactor: optimize alert mute when time is 23:59:xx 2023-06-15 18:02:25 +08:00
ning
63679c15dd Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-06-15 17:01:29 +08:00
ning
38229a43dc refactor: notify tpl 2023-06-15 17:01:17 +08:00
青牛踏雪
1d1ae238d4 add elasticsearch_by_categraf monitor dashboard & alerts & markdown (#1573) 2023-06-15 16:30:06 +08:00
ning
c2d300c0f1 Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-06-15 16:17:38 +08:00
ning
bcb89017a0 refactor: remove default notify template file 2023-06-15 16:17:26 +08:00
Ulric Qin
e04a3eed5f Merge branch 'main' of github.com:ccfos/nightingale 2023-06-15 15:26:13 +08:00
Ulric Qin
e77cf40938 add n9e v6 dashboard 2023-06-15 15:25:58 +08:00
ning
cb66b19d70 refactor: change configs.cval length 2023-06-15 14:35:21 +08:00
ning
9edf05c19a Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-06-15 11:22:39 +08:00
ning
6a6b4a2283 update ops 2023-06-15 11:22:26 +08:00
青牛踏雪
0473bb3925 add springboot actuator monitor dashboard & alerts & markdown (#1571) 2023-06-15 08:04:00 +08:00
ning
4afc3a60a4 Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-06-14 13:56:13 +08:00
shardingHe
e9c9a3ac58 feat: notify tpl support add and delete (#1567)
* notifyTpl add and delete

* notifyTpl add and delete

* optimization notifyTpl

* optimization notifyTpl

* optimization notifyTpl

* optimization notifyTpl

---------

Co-authored-by: shardingHe <wangzihe@flashcat.cloud>
2023-06-14 13:51:53 +08:00
ning
98260e239e Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-06-14 13:26:18 +08:00
ning
f751b2034d fix: recovery event tags being lost after promql modification 2023-06-14 13:26:01 +08:00
青牛踏雪
9ce22a33f0 add vmware vsphere monitor dashboard & alerts & readme.md (#1565) 2023-06-13 07:18:04 +08:00
laiwei
3da64ca0fe refine readme 2023-06-12 20:49:25 +08:00
ning
9a883dc02c refactor: feishu_card sender 2023-06-12 12:21:49 +08:00
Ulric Qin
5ab6fe7e56 code refactor 2023-06-12 10:22:51 +08:00
shardingHe
c730eaa860 Move feishucard (#1563)
* Fix an exception situation where the prod and cate fields cannot be updated.

* add feishucard.tpl

* move feishucard to v6

---------

Co-authored-by: shardingHe <wangzihe@flashcat.cloud>
2023-06-11 21:04:02 +08:00
ning
5ba2d6bc8e fix: concurrent map writes 2023-06-09 17:49:34 +08:00
ning
64feee79ff Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-06-09 10:07:54 +08:00
ning
c490ab09ad fix: cli upgrade 2023-06-09 10:07:42 +08:00
shardingHe
61762e894c Fix: an exception situation where the prod and cate fields cannot be updated. (#1561)
Co-authored-by: shardingHe <wangzihe@flashcat.cloud>
2023-06-08 15:05:29 +08:00
ning
ac4ff33dff refactor: remove phone space 2023-06-07 10:22:44 +08:00
ning
72abeea51f add user login log 2023-06-06 13:41:22 +08:00
ning
6ec2b42669 code refactor 2023-06-05 15:30:10 +08:00
ning
a93e967d30 refactor: update target_up 2023-06-05 14:59:56 +08:00
ning
b5984b7871 Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-06-05 14:42:40 +08:00
ning
70ccbbc929 update target_up 2023-06-05 14:42:28 +08:00
dependabot[bot]
79d4fc508c build(deps): bump github.com/gin-gonic/gin from 1.9.0 to 1.9.1 (#1559)
Bumps [github.com/gin-gonic/gin](https://github.com/gin-gonic/gin) from 1.9.0 to 1.9.1.
- [Release notes](https://github.com/gin-gonic/gin/releases)
- [Changelog](https://github.com/gin-gonic/gin/blob/master/CHANGELOG.md)
- [Commits](https://github.com/gin-gonic/gin/compare/v1.9.0...v1.9.1)

---
updated-dependencies:
- dependency-name: github.com/gin-gonic/gin
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-03 20:38:05 +08:00
ning
794f0f874f change HostDatasourceId 2023-06-02 13:12:53 +08:00
Ulric Qin
aff53e8be3 Merge branch 'main' of github.com:ccfos/nightingale 2023-06-02 12:11:47 +08:00
Ulric Qin
2de6847323 refactor fe.sh 2023-06-02 12:11:35 +08:00
ning
eed037a3a1 change heartbeat api 2023-06-02 11:57:15 +08:00
ning
4099c467bb code refactor 2023-06-02 11:42:40 +08:00
ning
6b51adbc9a code refactor 2023-06-02 11:29:30 +08:00
ning
307be1dda2 fix: datasource bind to alert engine where update 2023-06-02 11:22:05 +08:00
ning
7da6145ec6 fix: promClients hit 2023-06-02 10:19:42 +08:00
Ulric Qin
0e4298a592 use standard http client instead of beego client 2023-06-02 09:58:10 +08:00
Ulric Qin
037fab74eb code refactor 2023-06-02 09:20:24 +08:00
Ulric Qin
fb849928c9 code refactor 2023-06-02 08:33:41 +08:00
Ulric Qin
7833aae0a1 code refactor 2023-06-02 08:13:08 +08:00
Ulric Qin
6edd71b1f0 code refactor 2023-06-02 08:06:40 +08:00
ulricqin
2f2f310a40 add hearbeat api for pushgw (#1560) 2023-06-02 08:06:23 +08:00
Ulric Qin
14bfdaa2ee code refactor 2023-06-02 07:39:04 +08:00
Ulric Qin
ffd0a69e43 fix: leaking connections 2023-06-02 07:31:21 +08:00
Ulric Qin
5b79d0ef46 code refactor 2023-06-01 21:20:21 +08:00
Ulric Qin
8f2a885a7d code refactor 2023-06-01 21:16:51 +08:00
Ulric Qin
31f6300c16 code refactor 2023-06-01 21:09:04 +08:00
Ulric Qin
54710c22f0 code refactor 2023-06-01 21:03:18 +08:00
Ulric Qin
352aa2b6b1 code refactor 2023-06-01 20:57:36 +08:00
Ulric Qin
624e5b5e62 debug 2023-06-01 20:45:51 +08:00
ning
65e3b5c8f1 fix: goreleaser 2023-06-01 20:28:28 +08:00
ning
750732f203 docs: update makefile 2023-06-01 20:16:23 +08:00
ning
9957711643 add n9e-edge 2023-06-01 20:01:29 +08:00
Ulric Qin
8f4fb0d28b code refactor for fe.sh 2023-06-01 19:28:54 +08:00
Ulric Qin
5d63f23cfc code refactor 2023-06-01 18:27:00 +08:00
Ulric Qin
c0fb8d22db code refactor 2023-06-01 18:02:01 +08:00
ulricqin
1732b297b1 refactor basic auth configurations: merge HTTP.Pushgw and HTTP.Heartbeat to HTTP.APIForAgent; merge HTTP.Alert and HTTP.Service to HTTP.APIForService (#1558) 2023-06-01 16:23:19 +08:00
ning
f1a5c2065c change alert.toml.example 2023-06-01 14:35:47 +08:00
Yening Qin
6b9ceda9c1 fix: host filter (#1557)
* fix host filter
2023-06-01 14:16:47 +08:00
ning
7390d42e62 refactor: change Makefile 2023-05-31 14:39:31 +08:00
ning
a35f879dc0 refactor: change event notify log 2023-05-31 14:19:41 +08:00
xtan
3fd4ea4853 feat: embed front-end files into n9e executable (#1556)
* feat: embed front-end files into n9e executable
2023-05-31 10:30:01 +08:00
ning
20f0a9d16d fix: webhook update note 2023-05-26 15:41:16 +08:00
ning
5d4151983a refactor: init alert 2023-05-25 14:42:18 +08:00
Yening Qin
83b5f12474 refactor: n9e-alert and n9e-pushgw sync config by http api (#1545)
* get alert mute by api

* add service api

* fix sync datasource

* change event persist

* add hearbeat

* change pushgw update target

* code refactor

* fix get user members

* refactor get alert rules

* update AlertCurEventGetByRuleIdAndDsId

* refactor get from api

* add role perm list and change get datasource

* refactor: get ops and metrics

* change some logs

* change get datasource
2023-05-23 20:53:04 +08:00
ning
8c7bfb4f4a Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-05-23 13:48:39 +08:00
ning
4ccf887920 fix panic where atertRuleCache.Get is nil 2023-05-23 13:48:26 +08:00
Ulric Qin
546d9cb2cc code refactor 2023-05-18 09:42:33 +08:00
Ulric Qin
391b42a399 code refactor 2023-05-18 09:40:18 +08:00
ning
a916a0fc6b refactor: set default script timeout 2023-05-17 15:25:00 +08:00
ning
da9f5fbb12 fix: hashring use lock 2023-05-17 14:45:28 +08:00
Yening Qin
ad3cf58bf3 feat: add ExtraSenders (#1536)
* refactor-sender

* update  upgrade.sql
2023-05-16 19:44:12 +08:00
ning
a77dc15e36 fix: ts fill tags 2023-05-16 10:19:16 +08:00
ning
9ad51aeeff refactor: rule prod check 2023-05-15 13:10:33 +08:00
ning
2c7f030ea5 fix ident extract 2023-05-12 14:01:50 +08:00
ning
039be7fc6c rename es dashbaord name 2023-05-11 18:54:07 +08:00
ning
9bff2509a8 Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-05-11 16:39:23 +08:00
ning
35b3cbb697 feat: add get datasource ids api 2023-05-11 16:39:11 +08:00
kongfei605
d81275b9c8 Merge pull request #1534 from dreamking02/patch-1
Update config.toml
2023-05-10 20:34:05 +08:00
dreamking02
e29dd58823 Update config.toml 2023-05-10 18:46:40 +08:00
ning
b64aa03ccf refactor: FillSeverities 2023-05-10 16:14:36 +08:00
ning
3893cb00a5 refactor: FillSeverities 2023-05-10 15:12:26 +08:00
ning
4b6985c8af Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-05-09 20:49:00 +08:00
MoonStrider
7cc9470823 Update alert_rule.go (#1528)
fix: rule.FillSeverities
2023-05-09 20:48:34 +08:00
ning
b97dfce0ad refactor: get node debug log 2023-05-09 20:19:48 +08:00
ning
357d3dff78 refactor: get node debug log 2023-05-09 19:43:59 +08:00
ning
d0604f0c97 refactor: alert rule sync 2023-05-09 19:21:36 +08:00
ning
8fafa0075b fix: filter host by tags 2023-05-09 15:14:28 +08:00
ning
caa23fbba1 refactor: oidc attributes username assignable 2023-05-09 10:18:11 +08:00
ning
4b9fea3cb2 refactor: ident extract 2023-05-09 10:08:49 +08:00
ning
f61a04f43f refactor: cas login 2023-05-09 10:08:09 +08:00
Ulric Qin
ef3588ff46 add host_table_view_demo.json 2023-05-06 17:41:40 +08:00
ning
3e3210bb81 Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-05-06 14:24:14 +08:00
ning
da7ef5a92e refactor: set heartbeat ip 2023-05-06 14:24:00 +08:00
Ulric Qin
82b91164fe Merge branch 'main' of github.com:ccfos/nightingale 2023-05-06 11:58:34 +08:00
Ulric Qin
033d45309f add snmp markdown 2023-05-06 11:58:21 +08:00
ning
60e9fb21f1 docs: update upgrade.sql 2023-05-06 10:41:19 +08:00
ning
508006ad01 refactor: notify template 2023-05-05 19:51:22 +08:00
Ulric Qin
97d7b0574a code refactor 2023-05-05 18:21:16 +08:00
Ulric Qin
c44aebd404 code refactor 2023-05-05 16:23:36 +08:00
Ulric Qin
2afa921a5d code refactor 2023-05-05 16:13:29 +08:00
Ulric Qin
313c820f1f code refactor 2023-05-05 16:09:35 +08:00
Ulric Qin
02f0b4579b update net_response dashboard 2023-05-05 16:05:54 +08:00
0x0034
36eb308ef6 fix: 修正添加loki 数据源校验问题 (#1524)
Co-authored-by: 若尘 <ruochen@ruochendeMacBook-Pro.local>
2023-05-05 14:45:47 +08:00
dependabot[bot]
cd2db571cf build(deps): bump github.com/gin-gonic/gin from 1.8.2 to 1.9.0 (#1523)
Bumps [github.com/gin-gonic/gin](https://github.com/gin-gonic/gin) from 1.8.2 to 1.9.0.
- [Release notes](https://github.com/gin-gonic/gin/releases)
- [Changelog](https://github.com/gin-gonic/gin/blob/master/CHANGELOG.md)
- [Commits](https://github.com/gin-gonic/gin/compare/v1.8.2...v1.9.0)

---
updated-dependencies:
- dependency-name: github.com/gin-gonic/gin
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-05-05 11:14:01 +08:00
kongfei605
a0cf12b171 Merge pull request #1522 from ccfos/dashboard
add canal dashboards
2023-05-05 10:52:11 +08:00
kongfei
8358ab4b81 add canal dashboards 2023-05-05 10:46:53 +08:00
青牛踏雪
0fc6cb8ef2 fix vm old dashboard to new (#1521) 2023-05-04 20:13:01 +08:00
xiechenglong
e1ab013c45 fix:Unknown column 'rw' in 'field list' (#1519)
Co-authored-by: xiechenglong <xiechenglong@inspur.com>
2023-05-04 15:17:15 +08:00
xtan
d984ad8bf4 docs: pg sql script and gitignore (#1518) 2023-05-04 08:59:17 +08:00
kongfei605
86fe3c7c43 chmod 755 wait for aarch64 (#1517) 2023-04-28 16:39:57 +08:00
青牛踏雪
0f4478318e update victoriametrics url links. @cyancow (#1516)
* add taoskeeper 3.x dashboard

* modify directory name taos to TDEngine

* add kubernetes dashboard based on categraf collection.

* add apiserver kubelet node alerts to k8s

* modify node name to node-exporter

* add victoriametrics dashboard based on categraf collection.

* up victoriametrics url links.

* Update README.md

---------

Co-authored-by: ulricqin <ulricqin@qq.com>
2023-04-27 21:28:04 +08:00
Ulric Qin
c0d0eb0e69 code refactor 2023-04-27 21:22:48 +08:00
Ulric Qin
b62762b2e6 Merge branch 'main' of github.com:ccfos/nightingale 2023-04-27 21:20:39 +08:00
ning
810ca0e469 Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-04-27 21:19:58 +08:00
青牛踏雪
33e3b224b9 add victoriametrics dashboard based on categraf collection. (#1515)
* add taoskeeper 3.x dashboard

* modify directory name taos to TDEngine

* add kubernetes dashboard based on categraf collection.

* add apiserver kubelet node alerts to k8s

* modify node name to node-exporter

* add victoriametrics dashboard based on categraf collection.
2023-04-27 21:19:46 +08:00
ning
24d7b2b1bf update dockerfile 2023-04-27 21:19:42 +08:00
Ulric Qin
1d5ff1b28d code refactor 2023-04-27 21:18:41 +08:00
ning
ed5c8c5758 fix Dockerfile 2023-04-27 20:23:28 +08:00
Ulric Qin
01f7860900 code refactor 2023-04-27 20:22:23 +08:00
Ulric Qin
a6bb03c8ba update http dash 2023-04-27 20:20:39 +08:00
Ulric Qin
e9150b2ae0 rename dir of net_response 2023-04-27 20:19:17 +08:00
Ulric Qin
30d1ebd808 update http icon and n9e icon 2023-04-27 19:51:58 +08:00
Ulric Qin
2f69d92055 add markdown readme of procstat 2023-04-27 19:46:47 +08:00
Ulric Qin
deeb40b4a0 Merge branch 'main' of github.com:ccfos/nightingale 2023-04-27 19:35:10 +08:00
Ulric Qin
37f68fd52b add procstat integrations 2023-04-27 19:34:57 +08:00
ning
73828e50b5 update fe.sh 2023-04-27 18:23:40 +08:00
kongfei605
7e73850117 Merge pull request #1514 from ccfos/docker_release
update dockerfile for github-action
2023-04-27 18:08:00 +08:00
kongfei
3a075e7681 update dockerfile for github-action 2023-04-27 18:06:49 +08:00
ulricqin
4ec5612d78 add processes dashboards and alerts (#1513) 2023-04-27 16:11:21 +08:00
Yening Qin
817ed0ab1b fix get engine cluster list (#1512)
* fix: get engine cluster list
2023-04-27 15:33:14 +08:00
Yening Qin
63aa615761 compatible with TDSQL-C Mysql (#1511) 2023-04-27 14:28:59 +08:00
ning
2a36902760 fix: alert rule batch update severity 2023-04-27 11:54:14 +08:00
ning
bca9331182 compatible with TDSQL-C Mysql 2023-04-27 10:45:41 +08:00
alick-liming
199a23e385 refactor: get ClientIP (#1502)
* 调整ClientIP获取
2023-04-27 10:26:34 +08:00
ning
c733f16cc7 auto change n9e version in docker-compose.yaml 2023-04-26 17:32:07 +08:00
ning
81585649aa Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-04-26 17:15:18 +08:00
ning
2c4422d657 auto change n9e version in docker-compose.yaml 2023-04-26 17:15:05 +08:00
青牛踏雪
aaf66cb386 docs: add apiserver kubelet node alerts template to k8s (#1508)
* add apiserver kubelet node alerts to k8s

* modify node name to node-exporter
2023-04-26 14:18:13 +08:00
Ulric Qin
cfed4d8318 Merge branch 'main' of github.com:ccfos/nightingale 2023-04-25 15:02:26 +08:00
Ulric Qin
606cd538ec update dingtalk title 2023-04-25 15:02:13 +08:00
kongfei605
bafb3b2546 Merge pull request #1506 from ccfos/docker_release
update dockerfile
2023-04-25 11:56:32 +08:00
kongfei
9a0224697f typo 2023-04-25 11:55:58 +08:00
kongfei
23156552db update dockerfile 2023-04-25 11:54:29 +08:00
青牛踏雪
36bca795fa add kubernetes dashboard based on categraf collection. (#1503)
* add taoskeeper 3.x dashboard

* modify directory name taos to TDEngine

* add kubernetes dashboard based on categraf collection.
2023-04-24 19:58:12 +08:00
Ulric Qin
b5503ae93e update static files router 2023-04-24 19:42:02 +08:00
青牛踏雪
3c102e47ed add taoskeeper 3.x dashboard (#1501)
* add taoskeeper 3.x dashboard

* modify directory name taos to TDEngine
2023-04-24 19:28:17 +08:00
xtan
60bf8139b1 feat: add eventid to ibex task_record (#1497) 2023-04-24 18:01:48 +08:00
alick-liming
fc0d077c9f feat:1.verify notify template 2.heartbeat add remote_addr 3. gid auto busi group (#1498)
* 1.通知模版校验 2.对象列表remote_addr

* 1.bgid参数调整 2.语句优化

* 代码优化

* 代码调整
2023-04-24 16:02:45 +08:00
kongfei605
3a610f7ea0 fix standard output option for dashboards (#1500) 2023-04-24 13:18:06 +08:00
xtan
f8990ee85e fix: fix alert mute error for pg (#1496) 2023-04-21 14:05:26 +08:00
ning
88040bf277 modify fe.sh 2023-04-20 19:28:36 +08:00
ning
1e15dc1f30 fix batch update recording rule datasource 2023-04-20 18:02:50 +08:00
ning
9880b466db add /datasource/brief 2023-04-20 17:23:18 +08:00
ning
b7780ebbdb update n9e.sql 2023-04-20 16:50:59 +08:00
ning
1fa524b710 fix: set default ibex conf 2023-04-20 16:40:58 +08:00
ning
aa2c0cffce refactor docker-compose 2023-04-20 16:35:12 +08:00
ning
ed1c89fb7e refactor: heartbeat cluster name to engine name 2023-04-20 15:08:21 +08:00
ning
988327dead refactor built in board 2023-04-20 15:02:36 +08:00
xtan
5db168224e docs: docker-compose versions based on pg and vm (#1488) 2023-04-19 11:23:41 +08:00
idcdog
7622eba87f Adjust data source validation logic to support victoria-metrics clusters (#1487)
* fix: 调整数据源校验逻辑以便支持victoria-metrics集群
2023-04-19 11:06:53 +08:00
xtan
1cb58fedf7 docs: n9e and ibex init sql for postgresql (#1485) 2023-04-18 16:18:35 +08:00
ning
7dcaec0a7b update readme 2023-04-17 19:42:29 +08:00
ning
4f315cb6d5 host event append busigroup label 2023-04-17 17:33:54 +08:00
ning
9a2d898214 refactor: datasource check 2023-04-17 16:57:28 +08:00
ning
530561c038 refactor: datasource check 2023-04-17 16:19:19 +08:00
ning
fc68d2d598 update goreleaser 2023-04-14 19:02:14 +08:00
ning
1b40c38a7a modify docker-compose.yaml 2023-04-14 18:21:30 +08:00
Yening Qin
d39d4cb91d change builtin board (#1481) 2023-04-14 12:24:50 +08:00
lwangrabbit
e415538ffd fix: sendmm run ok with illegal token (#1476)
Co-authored-by: wanglipeng <wanglipeng@huayun.com>
2023-04-13 17:02:04 +08:00
Yening Qin
05c767a803 datasource check (#1479) 2023-04-13 16:59:25 +08:00
Ulric Qin
923cff1c19 Merge branch 'main' of github.com:ccfos/nightingale 2023-04-13 10:14:09 +08:00
Ulric Qin
ef18d2a95f fix pub static files router 2023-04-13 10:13:43 +08:00
laiwei
3abc4d0bfd update readme for v6 2023-04-12 20:13:33 +08:00
monch
a3ec69fe4a refactor: 优化钉钉通知被@时的排版 (#1475) 2023-04-11 18:10:04 +08:00
ning
403466f872 Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-04-11 16:30:47 +08:00
ning
81abd2f02a fix: datasource update 2023-04-11 16:30:35 +08:00
Ulric Qin
263c77cbbf update discord to slack 2023-04-07 17:09:32 +08:00
710leo
ef42a78e59 update n9e.sql 2023-04-06 22:13:57 +08:00
ning
4c7746b3b4 refactor: target miss add append tags 2023-04-06 17:15:02 +08:00
ning
b142a5726e target miss add append tags 2023-04-06 17:12:46 +08:00
ning
cc68b75489 fix: get builtin icon 2023-04-06 14:42:34 +08:00
ning
1ce79e29d5 fix: panic when template is nil 2023-04-03 17:25:52 +08:00
ning
ee167ce0ba update readme 2023-04-03 12:12:20 +08:00
idcdog
544cd02ef1 fix: the issue of the 'skip ssl validation' request in elasticsearch not taking effect (#1457) 2023-04-01 20:54:41 +08:00
ning
34ad6bc220 fix push data 2023-04-01 11:46:09 +08:00
ning
c7c694e70b refactor: ignore redis is nil 2023-03-31 16:20:33 +08:00
ning
dc26bb78d8 fix: redis get nil 2023-03-31 10:17:46 +08:00
ning
a0c635b830 update Dockerfile.goreleaser 2023-03-30 17:22:47 +08:00
ning
0e95c29b7d fix: goreleaser 2023-03-30 16:57:12 +08:00
ning
cab9fed700 fix: dockerfile 2023-03-30 16:39:05 +08:00
Yening Qin
4ad47fb8f4 refactor: push series (#1455) 2023-03-30 14:50:53 +08:00
ning
50345cb823 update initsql 2023-03-29 16:14:19 +08:00
ning
95bb67e66d Merge branch 'main' of ssh://github.com/ccfos/nightingale 2023-03-29 11:51:51 +08:00
ning
90fbd9f16a fix: busigroup append tag 2023-03-29 11:51:39 +08:00
kongfei605
5c8411eba1 update docker image (#1449) 2023-03-29 11:16:19 +08:00
ning
03edb84d09 fix: annotations panic 2023-03-28 20:16:09 +08:00
ning
958a8c3ed1 fix: ldap user roles set 2023-03-28 16:25:49 +08:00
Yening Qin
a2a0b41909 refactor: redis mset and mget (#1446)
* refactor redis mset
2023-03-28 15:39:43 +08:00
Tripitakav
64e1085766 fix nil pointer (#1443) 2023-03-27 19:20:31 +08:00
ning
5c97986908 update upgrade.sql 2023-03-27 15:51:52 +08:00
ning
66e291e3c3 fix: target_up show 2023-03-27 12:07:25 +08:00
ning
365fcd5dd7 update upgrade.sql 2023-03-24 23:08:37 +08:00
ning
63690ba084 fix: cli upgrade alert_mute 2023-03-24 21:01:34 +08:00
472 changed files with 92178 additions and 50095 deletions

9
.gitignore vendored
View File

@@ -31,6 +31,9 @@ _test
/etc/*.local.yml
/etc/*.local.conf
/etc/plugins/*.local.yml
/etc/script/rules.yaml
/etc/script/alert-rules.json
/etc/script/record-rules.json
/data*
/tarball
/run
@@ -41,7 +44,9 @@ _test
/docker/pub
/docker/n9e
/docker/mysqldata
/etc.local
/docker/experience_pg_vm/pgdata
/etc.local*
/front/statik/statik.go
.alerts
.idea
@@ -53,4 +58,4 @@ _test
queries.active
/n9e-*
n9e.sql

View File

@@ -2,6 +2,7 @@ before:
hooks:
# You may remove this if you don't use go modules.
- go mod tidy
- go install github.com/rakyll/statik
snapshot:
name_template: '{{ .Tag }}'
@@ -14,7 +15,8 @@ builds:
- id: build
hooks:
pre:
- ./fe.sh
- cmd: sh -x ./fe.sh
output: true
main: ./cmd/center/
binary: n9e
env:
@@ -40,12 +42,26 @@ builds:
ldflags:
- -s -w
- -X github.com/ccfos/nightingale/v6/pkg/version.Version={{ .Tag }}-{{.Commit}}
- id: build-edge
main: ./cmd/edge/
binary: n9e-edge
env:
- CGO_ENABLED=0
goos:
- linux
goarch:
- amd64
- arm64
ldflags:
- -s -w
- -X github.com/ccfos/nightingale/v6/pkg/version.Version={{ .Tag }}-{{.Commit}}
archives:
- id: n9e
builds:
- build
- build-cli
- build-edge
format: tar.gz
format_overrides:
- goos: windows
@@ -55,7 +71,6 @@ archives:
files:
- docker/*
- etc/*
- pub/*
- integrations/*
- cli/*
- n9e.sql
@@ -75,8 +90,8 @@ dockers:
- build
dockerfile: docker/Dockerfile.goreleaser
extra_files:
- pub
- etc
- integrations
use: buildx
build_flag_templates:
- "--platform=linux/amd64"
@@ -86,10 +101,10 @@ dockers:
goarch: arm64
ids:
- build
dockerfile: docker/Dockerfile.goreleaser
dockerfile: docker/Dockerfile.goreleaser.arm64
extra_files:
- pub
- etc
- integrations
use: buildx
build_flag_templates:
- "--platform=linux/arm64/v8"

View File

@@ -1,4 +1,4 @@
.PHONY: start build
.PHONY: prebuild build
ROOT:=$(shell pwd -P)
GIT_COMMIT:=$(shell git --work-tree ${ROOT} rev-parse 'HEAD^{commit}')
@@ -6,27 +6,35 @@ _GIT_VERSION:=$(shell git --work-tree ${ROOT} describe --tags --abbrev=14 "${GIT
TAG=$(shell echo "${_GIT_VERSION}" | awk -F"-" '{print $$1}')
RELEASE_VERSION:="$(TAG)-$(GIT_COMMIT)"
all: build
all: prebuild build
prebuild:
echo "begin download and embed the front-end file..."
sh fe.sh
echo "front-end file download and embedding completed."
build:
go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e ./cmd/center/main.go
build-edge:
go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e-edge ./cmd/edge/
build-alert:
go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e-alert ./cmd/alert/main.go
build-pushgw:
go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e-pushgw ./cmd/pushgw/main.go
build-cli:
build-cli:
go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e-cli ./cmd/cli/main.go
run:
nohup ./n9e > n9e.log 2>&1 &
run_alert:
run-alert:
nohup ./n9e-alert > n9e-alert.log 2>&1 &
run_pushgw:
run-pushgw:
nohup ./n9e-pushgw > n9e-pushgw.log 2>&1 &
release:

106
README.md
View File

@@ -4,99 +4,71 @@
</p>
<p align="center">
<img alt="GitHub latest release" src="https://img.shields.io/github/v/release/ccfos/nightingale"/>
<a href="https://n9e.github.io">
<a href="https://flashcat.cloud/docs/">
<img alt="Docs" src="https://img.shields.io/badge/docs-get%20started-brightgreen"/></a>
<a href="https://hub.docker.com/u/flashcatcloud">
<img alt="Docker pulls" src="https://img.shields.io/docker/pulls/flashcatcloud/nightingale"/></a>
<img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/ccfos/nightingale">
<img alt="GitHub Repo issues" src="https://img.shields.io/github/issues/ccfos/nightingale">
<img alt="GitHub Repo issues closed" src="https://img.shields.io/github/issues-closed/ccfos/nightingale">
<img alt="GitHub forks" src="https://img.shields.io/github/forks/ccfos/nightingale">
<a href="https://github.com/ccfos/nightingale/graphs/contributors">
<img alt="GitHub contributors" src="https://img.shields.io/github/contributors-anon/ccfos/nightingale"/></a>
<img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/ccfos/nightingale">
<br/><img alt="GitHub Repo issues" src="https://img.shields.io/github/issues/ccfos/nightingale">
<img alt="GitHub Repo issues closed" src="https://img.shields.io/github/issues-closed/ccfos/nightingale">
<img alt="GitHub forks" src="https://img.shields.io/github/forks/ccfos/nightingale">
<img alt="GitHub latest release" src="https://img.shields.io/github/v/release/ccfos/nightingale"/>
<img alt="License" src="https://img.shields.io/badge/license-Apache--2.0-blue"/>
<a href="https://n9e-talk.slack.com/">
<img alt="GitHub contributors" src="https://img.shields.io/badge/join%20slack-%23n9e-brightgreen.svg"/></a>
</p>
<p align="center">
An open-source cloud-native monitoring system that is <b>all-in-one</b> <br/>
<b>Out-of-the-box</b>, it integrates data collection, visualization, and monitoring alert <br/>
We recommend upgrading your <b>Prometheus + AlertManager + Grafana</b> combination to Nightingale!
告警管理专家,一体化的开源可观测平台
</p>
[English](./README.md) | [中文](./README_ZH.md)
[English](./README_en.md) | [中文](./README.md)
夜莺Nightingale是中国计算机学会托管的开源云原生可观测工具最早由滴滴于 2020 年孵化并开源,并于 2022 年正式捐赠予中国计算机学会。夜莺采用 All-in-One 的设计理念,集数据采集、可视化、监控告警、数据分析于一体,与云原生生态紧密集成,融入了顶级互联网公司可观测性最佳实践,沉淀了众多社区专家经验,开箱即用。
## 资料
- 文档:[flashcat.cloud/docs](https://flashcat.cloud/docs/)
- 提问:[answer.flashcat.cloud](https://answer.flashcat.cloud/)
- 报Bug[github.com/ccfos/nightingale/issues](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Fbug&projects=&template=bug_report.yml)
## Highlighted Features
## 功能和特点
- **Out-of-the-box**
- Supports multiple deployment methods such as **Docker, Helm Chart, and cloud services**, integrates data collection, monitoring, and alerting into one system, and comes with various monitoring dashboards, quick views, and alert rule templates. **It greatly reduces the construction cost, learning cost, and usage cost of cloud-native monitoring systems**.
- **Professional Alerting**
- Provides visual alert configuration and management, supports various alert rules, offers the ability to configure silence and subscription rules, supports multiple alert delivery channels, and has features such as alert self-healing and event management.
- **Cloud-Native**
- Quickly builds an enterprise-level cloud-native monitoring system through a turnkey approach, supports multiple collectors such as [Categraf](https://github.com/flashcatcloud/categraf), Telegraf, and Grafana-agent, supports multiple data sources such as Prometheus, VictoriaMetrics, M3DB, ElasticSearch, and Jaeger, and is compatible with importing Grafana dashboards. **It seamlessly integrates with the cloud-native ecosystem**.
- **High Performance and High Availability**
- Due to the multi-data-source management engine of Nightingale and its excellent architecture design, and utilizing a high-performance time-series database, it can handle data collection, storage, and alert analysis scenarios with billions of time-series data, saving a lot of costs.
- Nightingale components can be horizontally scaled with no single point of failure. It has been deployed in thousands of enterprises and tested in harsh production practices. Many leading Internet companies have used Nightingale for cluster machines with hundreds of nodes, processing billions of time-series data.
- **Flexible Extension and Centralized Management**
- Nightingale can be deployed on a 1-core 1G cloud host, deployed in a cluster of hundreds of machines, or run in Kubernetes. Time-series databases, alert engines, and other components can also be decentralized to various data centers and regions, balancing edge deployment with centralized management. **It solves the problem of data fragmentation and lack of unified views**.
- 统一接入各种时序库:支持对接 Prometheus、VictoriaMetrics、Thanos、Mimir、M3DB 等多种时序库,实现统一告警管理
- 专业告警能力:内置支持多种告警规则,可以扩展支持所有通知媒介,支持告警屏蔽、告警抑制、告警自愈、告警事件管理
- 高性能可视化引擎支持多种图表样式内置众多Dashboard模版也可导入Grafana模版开箱即用开源协议商业友好
- 无缝搭配 [Flashduty](https://flashcat.cloud/product/flashcat-duty/)实现告警聚合收敛、认领、升级、排班、IM集成确保告警处理不遗漏减少打扰更好协同
- 支持所有常见采集器:支持 [Categraf](https://flashcat.cloud/product/categraf)、telegraf、grafana-agent、datadog-agent、各种 exporter 作为采集器,没有什么数据是不能监控的
- 一体化观测平台:从 v6 版本开始,支持接入 ElasticSearch、Jaeger 数据源,实现日志、链路、指标多维度的统一可观测
#### If you are using Prometheus and have one or more of the following requirement scenarios, it is recommended that you upgrade to Nightingale:
## 产品演示
- Multiple systems such as Prometheus, Alertmanager, Grafana, etc. are fragmented and lack a unified view and cannot be used out of the box;
- The way to manage Prometheus and Alertmanager by modifying configuration files has a big learning curve and is difficult to collaborate;
- Too much data to scale-up your Prometheus cluster;
- Multiple Prometheus clusters running in production environments, which faced high management and usage costs;
![演示](doc/img/n9e-screenshot-gif-v6.gif)
#### If you are using Zabbix and have the following scenarios, it is recommended that you upgrade to Nightingale:
## 部署架构
- Monitoring too much data and wanting a better scalable solution;
- A high learning curve and a desire for better efficiency of collaborative use in a multi-person, multi-team model;
- Microservice and cloud-native architectures with variable monitoring data lifecycles and high monitoring data dimension bases, which are not easily adaptable to the Zabbix data model;
![架构](doc/img/n9e-arch-latest.png)
## 加入交流群
#### If you are using [open-falcon](https://github.com/open-falcon/falcon-plus), we recommend you to upgrade to Nightingale
- For more information about open-falcon and Nightingale, please refer to read [Ten features and trends of cloud-native monitoring](https://mp.weixin.qq.com/s?__biz=MzkzNjI5OTM5Nw==&mid=2247483738&idx=1&sn=e8bdbb974a2cd003c1abcc2b5405dd18&chksm=c2a19fb0f5d616a63185cd79277a79a6b80118ef2185890d0683d2bb20451bd9303c78d083c5#rd)。
## Getting Started
[English Doc](https://n9e.github.io/) | [中文文档](http://n9e.flashcat.cloud/)
## Screenshots
https://user-images.githubusercontent.com/792850/216888712-2565fcea-9df5-47bd-a49e-d60af9bd76e8.mp4
## Architecture
<img src="doc/img/arch-product.png" width="600">
Nightingale monitoring can receive monitoring data reported by various collectors (such as [Categraf](https://github.com/flashcatcloud/categraf) , telegraf, grafana-agent, Prometheus, etc.) and write them to various popular time-series databases (such as Prometheus, M3DB, VictoriaMetrics, Thanos, TDEngine, etc.). It provides configuration capabilities for alert rules, silence rules, and subscription rules, as well as the ability to view monitoring data. It also provides automatic alarm self-healing mechanisms (such as automatically calling back to a webhook address or executing a script after an alarm is triggered), and the ability to store and manage historical alarm events and view them in groups.
If the performance of a standalone time-series database (such as Prometheus) has bottlenecks or poor disaster recovery, we recommend using [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics). The VictoriaMetrics architecture is relatively simple, has excellent performance, and is easy to deploy and maintain. The architecture diagram is as shown above. For more detailed documentation on VictoriaMetrics, please refer to its [official website](https://victoriametrics.com/).
**We welcome you to participate in the Nightingale open-source project and community in various ways, including but not limited to**
- Adding and improving documentation => [n9e.github.io](https://n9e.github.io/)
- Sharing your best practices and experience in using Nightingale monitoring => [Article sharing]((https://n9e.github.io/docs/prologue/share/))
- Submitting product suggestions => [github issue](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Ffeature&template=enhancement.md)
- Submitting code to make Nightingale monitoring faster, more stable, and easier to use => [github pull request](https://github.com/didi/nightingale/pulls)
**Respecting, recognizing, and recording the work of every contributor** is the first guiding principle of the Nightingale open-source community. We advocate effective questioning, which not only respects the developer's time but also contributes to the accumulation of knowledge in the entire community
- Before asking a question, please first refer to the [FAQ](https://www.gitlink.org.cn/ccfos/nightingale/wiki/faq)
- We use [GitHub Discussions](https://github.com/ccfos/nightingale/discussions) as the communication forum. You can search and ask questions here.
- We also recommend that you join ours [discard](https://discord.gg/qsRmtAuPw2) to exchange experiences with other Nightingale users.
## Who is using Nightingale
You can register your usage and share your experience by posting on **[Who is Using Nightingale](https://github.com/ccfos/nightingale/issues/897)**.
欢迎加入 QQ 交流群群号479290895QQ 群适合群友互助,夜莺研发人员通常不在群里。如果要报 bug 请到[这里](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Fbug&projects=&template=bug_report.yml),提问到[这里](https://answer.flashcat.cloud/)。
## Stargazers over time
[![Stargazers over time](https://starchart.cc/ccfos/nightingale.svg)](https://starchart.cc/ccfos/nightingale)
[![Stargazers over time](https://api.star-history.com/svg?repos=ccfos/nightingale&type=Date)](https://star-history.com/#ccfos/nightingale&Date)
## Contributors
<a href="https://github.com/ccfos/nightingale/graphs/contributors">
<img src="https://contrib.rocks/image?repo=ccfos/nightingale" />
</a>
## 社区治理
[夜莺开源项目和社区治理架构(草案)](./doc/community-governance.md)
## License
[Apache License V2.0](https://github.com/didi/nightingale/blob/main/LICENSE)
[Apache License V2.0](https://github.com/didi/nightingale/blob/main/LICENSE)

104
README_en.md Normal file
View File

@@ -0,0 +1,104 @@
<p align="center">
<a href="https://github.com/ccfos/nightingale">
<img src="doc/img/nightingale_logo_h.png" alt="nightingale - cloud native monitoring" width="240" /></a>
</p>
<p align="center">
<img alt="GitHub latest release" src="https://img.shields.io/github/v/release/ccfos/nightingale"/>
<a href="https://n9e.github.io">
<img alt="Docs" src="https://img.shields.io/badge/docs-get%20started-brightgreen"/></a>
<a href="https://hub.docker.com/u/flashcatcloud">
<img alt="Docker pulls" src="https://img.shields.io/docker/pulls/flashcatcloud/nightingale"/></a>
<img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/ccfos/nightingale">
<img alt="GitHub Repo issues" src="https://img.shields.io/github/issues/ccfos/nightingale">
<img alt="GitHub Repo issues closed" src="https://img.shields.io/github/issues-closed/ccfos/nightingale">
<img alt="GitHub forks" src="https://img.shields.io/github/forks/ccfos/nightingale">
<a href="https://github.com/ccfos/nightingale/graphs/contributors">
<img alt="GitHub contributors" src="https://img.shields.io/github/contributors-anon/ccfos/nightingale"/></a>
<a href="https://n9e-talk.slack.com/">
<img alt="GitHub contributors" src="https://img.shields.io/badge/join%20slack-%23n9e-brightgreen.svg"/></a>
<img alt="License" src="https://img.shields.io/badge/license-Apache--2.0-blue"/>
</p>
<p align="center">
An open-source cloud-native monitoring system that is <b>all-in-one</b> <br/>
<b>Out-of-the-box</b>, it integrates data collection, visualization, and monitoring alert <br/>
We recommend upgrading your <b>Prometheus + AlertManager + Grafana</b> combination to Nightingale!
</p>
[English](./README.md) | [中文](./README_ZH.md)
## Highlighted Features
- **Out-of-the-box**
- Supports multiple deployment methods such as **Docker, Helm Chart, and cloud services**, integrates data collection, monitoring, and alerting into one system, and comes with various monitoring dashboards, quick views, and alert rule templates. **It greatly reduces the construction cost, learning cost, and usage cost of cloud-native monitoring systems**.
- **Professional Alerting**
- Provides visual alert configuration and management, supports various alert rules, offers the ability to configure silence and subscription rules, supports multiple alert delivery channels, and has features such as alert self-healing and event management.
- **Cloud-Native**
- Quickly builds an enterprise-level cloud-native monitoring system through a turnkey approach, supports multiple collectors such as [Categraf](https://github.com/flashcatcloud/categraf), Telegraf, and Grafana-agent, supports multiple data sources such as Prometheus, VictoriaMetrics, M3DB, ElasticSearch, and Jaeger, and is compatible with importing Grafana dashboards. **It seamlessly integrates with the cloud-native ecosystem**.
- **High Performance and High Availability**
- Due to the multi-data-source management engine of Nightingale and its excellent architecture design, and utilizing a high-performance time-series database, it can handle data collection, storage, and alert analysis scenarios with billions of time-series data, saving a lot of costs.
- Nightingale components can be horizontally scaled with no single point of failure. It has been deployed in thousands of enterprises and tested in harsh production practices. Many leading Internet companies have used Nightingale for cluster machines with hundreds of nodes, processing billions of time-series data.
- **Flexible Extension and Centralized Management**
- Nightingale can be deployed on a 1-core 1G cloud host, deployed in a cluster of hundreds of machines, or run in Kubernetes. Time-series databases, alert engines, and other components can also be decentralized to various data centers and regions, balancing edge deployment with centralized management. **It solves the problem of data fragmentation and lack of unified views**.
#### If you are using Prometheus and have one or more of the following requirement scenarios, it is recommended that you upgrade to Nightingale:
- Multiple systems such as Prometheus, Alertmanager, Grafana, etc. are fragmented and lack a unified view and cannot be used out of the box;
- The way to manage Prometheus and Alertmanager by modifying configuration files has a big learning curve and is difficult to collaborate;
- Too much data to scale-up your Prometheus cluster;
- Multiple Prometheus clusters running in production environments, which faced high management and usage costs;
#### If you are using Zabbix and have the following scenarios, it is recommended that you upgrade to Nightingale:
- Monitoring too much data and wanting a better scalable solution;
- A high learning curve and a desire for better efficiency of collaborative use in a multi-person, multi-team model;
- Microservice and cloud-native architectures with variable monitoring data lifecycles and high monitoring data dimension bases, which are not easily adaptable to the Zabbix data model;
#### If you are using [open-falcon](https://github.com/open-falcon/falcon-plus), we recommend you to upgrade to Nightingale
- For more information about open-falcon and Nightingale, please refer to read [Ten features and trends of cloud-native monitoring](https://mp.weixin.qq.com/s?__biz=MzkzNjI5OTM5Nw==&mid=2247483738&idx=1&sn=e8bdbb974a2cd003c1abcc2b5405dd18&chksm=c2a19fb0f5d616a63185cd79277a79a6b80118ef2185890d0683d2bb20451bd9303c78d083c5#rd)。
## Getting Started
[English Doc](https://n9e.github.io/) | [中文文档](http://n9e.flashcat.cloud/)
## Screenshots
https://user-images.githubusercontent.com/792850/216888712-2565fcea-9df5-47bd-a49e-d60af9bd76e8.mp4
## Architecture
<img src="doc/img/arch-product.png" width="600">
Nightingale monitoring can receive monitoring data reported by various collectors (such as [Categraf](https://github.com/flashcatcloud/categraf) , telegraf, grafana-agent, Prometheus, etc.) and write them to various popular time-series databases (such as Prometheus, M3DB, VictoriaMetrics, Thanos, TDEngine, etc.). It provides configuration capabilities for alert rules, silence rules, and subscription rules, as well as the ability to view monitoring data. It also provides automatic alarm self-healing mechanisms (such as automatically calling back to a webhook address or executing a script after an alarm is triggered), and the ability to store and manage historical alarm events and view them in groups.
If the performance of a standalone time-series database (such as Prometheus) has bottlenecks or poor disaster recovery, we recommend using [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics). The VictoriaMetrics architecture is relatively simple, has excellent performance, and is easy to deploy and maintain. The architecture diagram is as shown above. For more detailed documentation on VictoriaMetrics, please refer to its [official website](https://victoriametrics.com/).
**We welcome you to participate in the Nightingale open-source project and community in various ways, including but not limited to**
- Adding and improving documentation => [n9e.github.io](https://n9e.github.io/)
- Sharing your best practices and experience in using Nightingale monitoring => [Article sharing]((https://n9e.github.io/docs/prologue/share/))
- Submitting product suggestions => [github issue](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Ffeature&template=enhancement.md)
- Submitting code to make Nightingale monitoring faster, more stable, and easier to use => [github pull request](https://github.com/didi/nightingale/pulls)
**Respecting, recognizing, and recording the work of every contributor** is the first guiding principle of the Nightingale open-source community. We advocate effective questioning, which not only respects the developer's time but also contributes to the accumulation of knowledge in the entire community
- Before asking a question, please first refer to the [FAQ](https://www.gitlink.org.cn/ccfos/nightingale/wiki/faq)
- We use [GitHub Discussions](https://github.com/ccfos/nightingale/discussions) as the communication forum. You can search and ask questions here.
- We also recommend that you join ours [Slack channel](https://n9e-talk.slack.com/) to exchange experiences with other Nightingale users.
## Who is using Nightingale
You can register your usage and share your experience by posting on **[Who is Using Nightingale](https://github.com/ccfos/nightingale/issues/897)**.
## Stargazers over time
[![Stargazers over time](https://starchart.cc/ccfos/nightingale.svg)](https://starchart.cc/ccfos/nightingale)
## Contributors
<a href="https://github.com/ccfos/nightingale/graphs/contributors">
<img src="https://contrib.rocks/image?repo=ccfos/nightingale" />
</a>
## License
[Apache License V2.0](https://github.com/didi/nightingale/blob/main/LICENSE)

View File

@@ -7,6 +7,7 @@ import (
)
type Alert struct {
Disable bool
EngineDelay int64
Heartbeat HeartbeatConfig
Alerting Alerting
@@ -23,10 +24,10 @@ type SMTPConfig struct {
}
type HeartbeatConfig struct {
IP string
Interval int64
Endpoint string
ClusterName string
IP string
Interval int64
Endpoint string
EngineName string
}
type Alerting struct {
@@ -66,4 +67,12 @@ func (a *Alert) PreCheck() {
if a.Heartbeat.Interval == 0 {
a.Heartbeat.Interval = 1000
}
if a.Heartbeat.EngineName == "" {
a.Heartbeat.EngineName = "default"
}
if a.EngineDelay == 0 {
a.EngineDelay = 30
}
}

View File

@@ -15,6 +15,7 @@ import (
"github.com/ccfos/nightingale/v6/alert/router"
"github.com/ccfos/nightingale/v6/alert/sender"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
@@ -23,7 +24,6 @@ import (
"github.com/ccfos/nightingale/v6/prom"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/ccfos/nightingale/v6/pushgw/writer"
"github.com/ccfos/nightingale/v6/storage"
)
func Initialize(configDir string, cryptoKey string) (func(), error) {
@@ -37,36 +37,30 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
return nil, err
}
db, err := storage.New(config.DB)
if err != nil {
return nil, err
}
ctx := ctx.NewContext(context.Background(), db)
redis, err := storage.NewRedis(config.Redis)
if err != nil {
return nil, err
}
ctx := ctx.NewContext(context.Background(), nil, false, config.CenterApi)
syncStats := memsto.NewSyncStats()
alertStats := astats.NewSyncStats()
targetCache := memsto.NewTargetCache(ctx, syncStats, redis)
targetCache := memsto.NewTargetCache(ctx, syncStats, nil)
busiGroupCache := memsto.NewBusiGroupCache(ctx, syncStats)
alertMuteCache := memsto.NewAlertMuteCache(ctx, syncStats)
alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
notifyConfigCache := memsto.NewNotifyConfigCache(ctx)
dsCache := memsto.NewDatasourceCache(ctx, syncStats)
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
promClients := prom.NewPromClient(ctx, config.Alert.Heartbeat)
externalProcessors := process.NewExternalProcessors()
Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients, false)
Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients, userCache, userGroupCache)
r := httpx.GinEngine(config.Global.RunMode, config.HTTP)
rt := router.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors)
rt.Config(r)
dumper.ConfigRouter(r)
httpClean := httpx.Init(config.HTTP, r)
@@ -77,20 +71,18 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
}
func Start(alertc aconf.Alert, pushgwc pconf.Pushgw, syncStats *memsto.Stats, alertStats *astats.Stats, externalProcessors *process.ExternalProcessorsType, targetCache *memsto.TargetCacheType, busiGroupCache *memsto.BusiGroupCacheType,
alertMuteCache *memsto.AlertMuteCacheType, alertRuleCache *memsto.AlertRuleCacheType, notifyConfigCache *memsto.NotifyConfigCacheType, datasourceCache *memsto.DatasourceCacheType, ctx *ctx.Context, promClients *prom.PromClientMap, isCenter bool) {
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
alertMuteCache *memsto.AlertMuteCacheType, alertRuleCache *memsto.AlertRuleCacheType, notifyConfigCache *memsto.NotifyConfigCacheType, datasourceCache *memsto.DatasourceCacheType, ctx *ctx.Context, promClients *prom.PromClientMap, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType) {
alertSubscribeCache := memsto.NewAlertSubscribeCache(ctx, syncStats)
recordingRuleCache := memsto.NewRecordingRuleCache(ctx, syncStats)
go models.InitNotifyConfig(ctx, alertc.Alerting.TemplatesDir)
naming := naming.NewNaming(ctx, alertc.Heartbeat, isCenter)
naming := naming.NewNaming(ctx, alertc.Heartbeat)
writers := writer.NewWriters(pushgwc)
record.NewScheduler(alertc, recordingRuleCache, promClients, writers, alertStats)
eval.NewScheduler(isCenter, alertc, externalProcessors, alertRuleCache, targetCache, busiGroupCache, alertMuteCache, datasourceCache, promClients, naming, ctx, alertStats)
eval.NewScheduler(alertc, externalProcessors, alertRuleCache, targetCache, busiGroupCache, alertMuteCache, datasourceCache, promClients, naming, ctx, alertStats)
dp := dispatch.NewDispatch(alertRuleCache, userCache, userGroupCache, alertSubscribeCache, targetCache, notifyConfigCache, alertc.Alerting, ctx)
consumer := dispatch.NewConsumer(alertc.Alerting, ctx, dp)

View File

@@ -10,23 +10,11 @@ const (
)
type Stats struct {
CounterSampleTotal *prometheus.CounterVec
CounterAlertsTotal *prometheus.CounterVec
GaugeAlertQueueSize prometheus.Gauge
GaugeSampleQueueSize *prometheus.GaugeVec
RequestDuration *prometheus.HistogramVec
ForwardDuration *prometheus.HistogramVec
CounterAlertsTotal *prometheus.CounterVec
GaugeAlertQueueSize prometheus.Gauge
}
func NewSyncStats() *Stats {
// 从各个接收接口接收到的监控数据总量
CounterSampleTotal := prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "samples_received_total",
Help: "Total number samples received.",
}, []string{"cluster", "channel"})
// 产生的告警总量
CounterAlertsTotal := prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
@@ -43,51 +31,13 @@ func NewSyncStats() *Stats {
Help: "The size of alert queue.",
})
// 数据转发队列,各个队列的长度
GaugeSampleQueueSize := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "sample_queue_size",
Help: "The size of sample queue.",
}, []string{"cluster", "channel_number"})
// 一些重要的请求,比如接收数据的请求,应该统计一下延迟情况
RequestDuration := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Buckets: []float64{.01, .1, 1},
Name: "http_request_duration_seconds",
Help: "HTTP request latencies in seconds.",
}, []string{"code", "path", "method"},
)
// 发往后端TSDB延迟如何
ForwardDuration := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Buckets: []float64{.1, 1, 10},
Name: "forward_duration_seconds",
Help: "Forward samples to TSDB. latencies in seconds.",
}, []string{"cluster", "channel_number"},
)
prometheus.MustRegister(
CounterSampleTotal,
CounterAlertsTotal,
GaugeAlertQueueSize,
GaugeSampleQueueSize,
RequestDuration,
ForwardDuration,
)
return &Stats{
CounterSampleTotal: CounterSampleTotal,
CounterAlertsTotal: CounterAlertsTotal,
GaugeAlertQueueSize: GaugeAlertQueueSize,
GaugeSampleQueueSize: GaugeSampleQueueSize,
RequestDuration: RequestDuration,
ForwardDuration: ForwardDuration,
CounterAlertsTotal: CounterAlertsTotal,
GaugeAlertQueueSize: GaugeAlertQueueSize,
}
}

View File

@@ -15,6 +15,7 @@ type AnomalyPoint struct {
Value float64 `json:"value"`
Severity int `json:"severity"`
Triggered bool `json:"triggered"`
Query string `json:"query"`
}
func NewAnomalyPoint(key string, labels map[string]string, ts int64, value float64, severity int) AnomalyPoint {

View File

@@ -8,6 +8,7 @@ import (
"github.com/ccfos/nightingale/v6/alert/queue"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/toolkits/pkg/concurrent/semaphore"
"github.com/toolkits/pkg/logger"
@@ -82,78 +83,21 @@ func (e *Consumer) consumeOne(event *models.AlertCurEvent) {
}
func (e *Consumer) persist(event *models.AlertCurEvent) {
has, err := models.AlertCurEventExists(e.ctx, "hash=?", event.Hash)
if err != nil {
logger.Errorf("event_persist_check_exists_fail: %v rule_id=%d hash=%s", err, event.RuleId, event.Hash)
if event.Status != 0 {
return
}
his := event.ToHis(e.ctx)
// 不管是告警还是恢复,全量告警里都要记录
if err := his.Add(e.ctx); err != nil {
logger.Errorf(
"event_persist_his_fail: %v rule_id=%d cluster:%s hash=%s tags=%v timestamp=%d value=%s",
err,
event.RuleId,
event.Cluster,
event.Hash,
event.TagsJSON,
event.TriggerTime,
event.TriggerValue,
)
}
if has {
// 活跃告警表中有记录,删之
err = models.AlertCurEventDelByHash(e.ctx, event.Hash)
if !e.ctx.IsCenter {
event.DB2FE()
err := poster.PostByUrls(e.ctx, "/v1/n9e/event-persist", event)
if err != nil {
logger.Errorf("event_del_cur_fail: %v hash=%s", err, event.Hash)
return
logger.Errorf("event%+v persist err:%v", event, err)
}
if !event.IsRecovered {
// 恢复事件从活跃告警列表彻底删掉告警事件要重新加进来新的event
// use his id as cur id
event.Id = his.Id
if event.Id > 0 {
if err := event.Add(e.ctx); err != nil {
logger.Errorf(
"event_persist_cur_fail: %v rule_id=%d cluster:%s hash=%s tags=%v timestamp=%d value=%s",
err,
event.RuleId,
event.Cluster,
event.Hash,
event.TagsJSON,
event.TriggerTime,
event.TriggerValue,
)
}
}
}
return
}
if event.IsRecovered {
// alert_cur_event表里没有数据表示之前没告警结果现在报了恢复神奇....理论上不应该出现的
return
}
// use his id as cur id
event.Id = his.Id
if event.Id > 0 {
if err := event.Add(e.ctx); err != nil {
logger.Errorf(
"event_persist_cur_fail: %v rule_id=%d cluster:%s hash=%s tags=%v timestamp=%d value=%s",
err,
event.RuleId,
event.Cluster,
event.Hash,
event.TagsJSON,
event.TriggerTime,
event.TriggerValue,
)
}
err := models.EventPersist(e.ctx, event)
if err != nil {
logger.Errorf("event%+v persist err:%v", event, err)
}
}

View File

@@ -28,8 +28,10 @@ type Dispatch struct {
alerting aconf.Alerting
senders map[string]sender.Sender
tpls map[string]*template.Template
Senders map[string]sender.Sender
tpls map[string]*template.Template
ExtraSenders map[string]sender.Sender
BeforeSenderHook func(*models.AlertCurEvent) bool
ctx *ctx.Context
@@ -50,8 +52,10 @@ func NewDispatch(alertRuleCache *memsto.AlertRuleCacheType, userCache *memsto.Us
alerting: alerting,
senders: make(map[string]sender.Sender),
tpls: make(map[string]*template.Template),
Senders: make(map[string]sender.Sender),
tpls: make(map[string]*template.Template),
ExtraSenders: make(map[string]sender.Sender),
BeforeSenderHook: func(*models.AlertCurEvent) bool { return true },
ctx: ctx,
}
@@ -61,7 +65,7 @@ func NewDispatch(alertRuleCache *memsto.AlertRuleCacheType, userCache *memsto.Us
func (e *Dispatch) ReloadTpls() error {
err := e.relaodTpls()
if err != nil {
logger.Error("failed to reload tpls: %v", err)
logger.Errorf("failed to reload tpls: %v", err)
}
duration := time.Duration(9000) * time.Millisecond
@@ -81,17 +85,24 @@ func (e *Dispatch) relaodTpls() error {
smtp := e.notifyConfigCache.GetSMTP()
senders := map[string]sender.Sender{
models.Email: sender.NewSender(models.Email, tmpTpls, smtp),
models.Dingtalk: sender.NewSender(models.Dingtalk, tmpTpls, smtp),
models.Wecom: sender.NewSender(models.Wecom, tmpTpls, smtp),
models.Feishu: sender.NewSender(models.Feishu, tmpTpls, smtp),
models.Mm: sender.NewSender(models.Mm, tmpTpls, smtp),
models.Telegram: sender.NewSender(models.Telegram, tmpTpls, smtp),
models.Email: sender.NewSender(models.Email, tmpTpls, smtp),
models.Dingtalk: sender.NewSender(models.Dingtalk, tmpTpls, smtp),
models.Wecom: sender.NewSender(models.Wecom, tmpTpls, smtp),
models.Feishu: sender.NewSender(models.Feishu, tmpTpls, smtp),
models.Mm: sender.NewSender(models.Mm, tmpTpls, smtp),
models.Telegram: sender.NewSender(models.Telegram, tmpTpls, smtp),
models.FeishuCard: sender.NewSender(models.FeishuCard, tmpTpls, smtp),
}
e.RwLock.RLock()
for channel, sender := range e.ExtraSenders {
senders[channel] = sender
}
e.RwLock.RUnlock()
e.RwLock.Lock()
e.tpls = tmpTpls
e.senders = senders
e.Senders = senders
e.RwLock.Unlock()
return nil
}
@@ -132,7 +143,7 @@ func (e *Dispatch) HandleEventNotify(event *models.AlertCurEvent, isSubscribe bo
}
// 处理事件发送,这里用一个goroutine处理一个event的所有发送事件
go e.Send(rule, event, notifyTarget, isSubscribe)
go e.Send(rule, event, notifyTarget)
// 如果是不是订阅规则出现的event, 则需要处理订阅规则的event
if !isSubscribe {
@@ -168,30 +179,45 @@ func (e *Dispatch) handleSub(sub *models.AlertSubscribe, event models.AlertCurEv
if sub.ForDuration > (event.TriggerTime - event.FirstTriggerTime) {
return
}
if len(sub.SeveritiesJson) != 0 {
match := false
for _, s := range sub.SeveritiesJson {
if s == event.Severity || s == 0 {
match = true
break
}
}
if !match {
return
}
}
sub.ModifyEvent(&event)
LogEvent(&event, "subscribe")
event.SubRuleId = sub.Id
e.HandleEventNotify(&event, true)
}
func (e *Dispatch) Send(rule *models.AlertRule, event *models.AlertCurEvent, notifyTarget *NotifyTarget, isSubscribe bool) {
for channel, uids := range notifyTarget.ToChannelUserMap() {
ctx := sender.BuildMessageContext(rule, event, uids, e.userCache)
e.RwLock.RLock()
s := e.senders[channel]
e.RwLock.RUnlock()
if s == nil {
logger.Warningf("no sender for channel: %s", channel)
continue
func (e *Dispatch) Send(rule *models.AlertRule, event *models.AlertCurEvent, notifyTarget *NotifyTarget) {
needSend := e.BeforeSenderHook(event)
if needSend {
for channel, uids := range notifyTarget.ToChannelUserMap() {
ctx := sender.BuildMessageContext(rule, []*models.AlertCurEvent{event}, uids, e.userCache)
e.RwLock.RLock()
s := e.Senders[channel]
e.RwLock.RUnlock()
if s == nil {
logger.Debugf("no sender for channel: %s", channel)
continue
}
s.Send(ctx)
}
logger.Debugf("send event: %s, channel: %s", event.Hash, channel)
for i := 0; i < len(ctx.Users); i++ {
logger.Debug("send event to user: ", ctx.Users[i])
}
s.Send(ctx)
}
// handle event callbacks
sender.SendCallbacks(e.ctx, notifyTarget.ToCallbackList(), event, e.targetCache, e.notifyConfigCache.GetIbex())
sender.SendCallbacks(e.ctx, notifyTarget.ToCallbackList(), event, e.targetCache, e.userCache, e.notifyConfigCache.GetIbex())
// handle global webhooks
sender.SendWebhooks(notifyTarget.ToWebhookList(), event)

View File

@@ -16,7 +16,6 @@ import (
)
type Scheduler struct {
isCenter bool
// key: hash
alertRules map[string]*AlertRuleWorker
@@ -38,11 +37,10 @@ type Scheduler struct {
stats *astats.Stats
}
func NewScheduler(isCenter bool, aconf aconf.Alert, externalProcessors *process.ExternalProcessorsType, arc *memsto.AlertRuleCacheType, targetCache *memsto.TargetCacheType,
func NewScheduler(aconf aconf.Alert, externalProcessors *process.ExternalProcessorsType, arc *memsto.AlertRuleCacheType, targetCache *memsto.TargetCacheType,
busiGroupCache *memsto.BusiGroupCacheType, alertMuteCache *memsto.AlertMuteCacheType, datasourceCache *memsto.DatasourceCacheType, promClients *prom.PromClientMap, naming *naming.Naming,
ctx *ctx.Context, stats *astats.Stats) *Scheduler {
scheduler := &Scheduler{
isCenter: isCenter,
aconf: aconf,
alertRules: make(map[string]*AlertRuleWorker),
@@ -108,7 +106,7 @@ func (s *Scheduler) syncAlertRules() {
alertRule := NewAlertRuleWorker(rule, dsId, processor, s.promClients, s.ctx)
alertRuleWorkers[alertRule.Hash()] = alertRule
}
} else if rule.IsHostRule() && s.isCenter {
} else if rule.IsHostRule() && s.ctx.IsCenter {
// all host rule will be processed by center instance
if !naming.DatasourceHashRing.IsHit(naming.HostDatasource, fmt.Sprintf("%d", rule.Id), s.aconf.Heartbeat.Endpoint) {
continue

View File

@@ -69,14 +69,16 @@ func (arw *AlertRuleWorker) Start() {
if interval <= 0 {
interval = 10
}
ticker := time.NewTicker(time.Duration(interval) * time.Second)
go func() {
defer ticker.Stop()
for {
select {
case <-arw.quit:
return
default:
case <-ticker.C:
arw.Eval()
time.Sleep(time.Duration(interval) * time.Second)
}
}
}()
@@ -85,7 +87,7 @@ func (arw *AlertRuleWorker) Start() {
func (arw *AlertRuleWorker) Eval() {
cachedRule := arw.rule
if cachedRule == nil {
logger.Errorf("rule_eval:%s rule not found", arw.Key())
//logger.Errorf("rule_eval:%s rule not found", arw.Key())
return
}
@@ -109,7 +111,7 @@ func (arw *AlertRuleWorker) Eval() {
}
func (arw *AlertRuleWorker) Stop() {
logger.Infof("%s stopped", arw.Key())
logger.Infof("rule_eval %s stopped", arw.Key())
close(arw.quit)
}
@@ -163,6 +165,7 @@ func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) []common.Anom
points := common.ConvertAnomalyPoints(value)
for i := 0; i < len(points); i++ {
points[i].Severity = query.Severity
points[i].Query = promql
}
lst = append(lst, points...)
}
@@ -202,7 +205,17 @@ func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) []common.Anom
}
for _, target := range targets {
m := make(map[string]string)
target.FillTagsMap()
for k, v := range target.TagsMap {
m[k] = v
}
m["ident"] = target.Ident
bg := arw.processor.BusiGroupCache.GetByBusiGroupId(target.GroupId)
if bg != nil && bg.LabelEnable == 1 {
m["busigroup"] = bg.LabelValue
}
lst = append(lst, common.NewAnomalyPoint(trigger.Type, m, now, float64(now-target.UpdateAt), trigger.Severity))
}
case "offset":
@@ -211,10 +224,28 @@ func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) []common.Anom
logger.Errorf("rule_eval:%s query:%v, error:%v", arw.Key(), query, err)
continue
}
var targetMap = make(map[string]*models.Target)
for _, target := range targets {
targetMap[target.Ident] = target
}
hostOffsetMap := arw.processor.TargetCache.GetOffsetHost(targets, now, int64(trigger.Duration))
for host, offset := range hostOffsetMap {
m := make(map[string]string)
target, exists := targetMap[host]
if exists {
target.FillTagsMap()
for k, v := range target.TagsMap {
m[k] = v
}
}
m["ident"] = host
bg := arw.processor.BusiGroupCache.GetByBusiGroupId(target.GroupId)
if bg != nil && bg.LabelEnable == 1 {
m["busigroup"] = bg.LabelValue
}
lst = append(lst, common.NewAnomalyPoint(trigger.Type, m, now, float64(offset), trigger.Severity))
}
case "pct_target_miss":

View File

@@ -13,7 +13,11 @@ import (
)
func IsMuted(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType, alertMuteCache *memsto.AlertMuteCacheType) bool {
if TimeNonEffectiveMuteStrategy(rule, event) {
if rule.Disabled == 1 {
return true
}
if TimeSpanMuteStrategy(rule, event) {
return true
}
@@ -32,12 +36,9 @@ func IsMuted(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *m
return false
}
// TimeNonEffectiveMuteStrategy 根据规则配置的告警时间过滤,如果产生的告警不在规则配置的告警时间内,则不告警
func TimeNonEffectiveMuteStrategy(rule *models.AlertRule, event *models.AlertCurEvent) bool {
if rule.Disabled == 1 {
return true
}
// TimeSpanMuteStrategy 根据规则配置的告警生效时间过滤,如果产生的告警不在规则配置的告警生效时间内,则不告警,即被mute
// 时间范围左闭右开默认范围00:00-24:00
func TimeSpanMuteStrategy(rule *models.AlertRule, event *models.AlertCurEvent) bool {
tm := time.Unix(event.TriggerTime, 0)
triggerTime := tm.Format("15:04")
triggerWeek := strconv.Itoa(int(tm.Weekday()))
@@ -52,18 +53,33 @@ func TimeNonEffectiveMuteStrategy(rule *models.AlertRule, event *models.AlertCur
if !strings.Contains(enableDaysOfWeek[i], triggerWeek) {
continue
}
if enableStime[i] <= enableEtime[i] {
if triggerTime < enableStime[i] || triggerTime > enableEtime[i] {
continue
if enableStime[i] < enableEtime[i] {
if enableEtime[i] == "23:59" {
// 02:00-23:59这种情况做个特殊处理相当于左闭右闭区间了
if triggerTime < enableStime[i] {
// mute, 即没生效
continue
}
} else {
// 02:00-04:00 或者 02:00-24:00
if triggerTime < enableStime[i] || triggerTime >= enableEtime[i] {
// mute, 即没生效
continue
}
}
} else {
if triggerTime < enableStime[i] && triggerTime > enableEtime[i] {
} else if enableStime[i] > enableEtime[i] {
// 21:00-09:00
if triggerTime < enableStime[i] && triggerTime >= enableEtime[i] {
// mute, 即没生效
continue
}
}
// 到这里说明当前时刻在告警规则的某组生效时间范围内,直接返回 false
// 到这里说明当前时刻在告警规则的某组生效时间范围内,即没有 mute直接返回 false
return false
}
return true
}
@@ -156,13 +172,16 @@ func matchMute(event *models.AlertCurEvent, mute *models.AlertMute, clock ...int
for i := 0; i < len(mute.PeriodicMutesJson); i++ {
if strings.Contains(mute.PeriodicMutesJson[i].EnableDaysOfWeek, triggerWeek) {
if mute.PeriodicMutesJson[i].EnableStime <= mute.PeriodicMutesJson[i].EnableEtime {
if mute.PeriodicMutesJson[i].EnableStime == mute.PeriodicMutesJson[i].EnableEtime {
matchTime = true
break
} else if mute.PeriodicMutesJson[i].EnableStime < mute.PeriodicMutesJson[i].EnableEtime {
if triggerTime >= mute.PeriodicMutesJson[i].EnableStime && triggerTime < mute.PeriodicMutesJson[i].EnableEtime {
matchTime = true
break
}
} else {
if triggerTime < mute.PeriodicMutesJson[i].EnableStime || triggerTime >= mute.PeriodicMutesJson[i].EnableEtime {
if triggerTime >= mute.PeriodicMutesJson[i].EnableStime || triggerTime < mute.PeriodicMutesJson[i].EnableEtime {
matchTime = true
break
}
@@ -174,5 +193,21 @@ func matchMute(event *models.AlertCurEvent, mute *models.AlertMute, clock ...int
return false
}
var matchSeverity bool
if len(mute.SeveritiesJson) > 0 {
for _, s := range mute.SeveritiesJson {
if event.Severity == s || s == 0 {
matchSeverity = true
break
}
}
} else {
matchSeverity = true
}
if !matchSeverity {
return false
}
return common.MatchTags(event.TagsMap, mute.ITags)
}

View File

@@ -1,6 +1,7 @@
package naming
import (
"errors"
"sync"
"github.com/toolkits/pkg/consistent"
@@ -15,7 +16,7 @@ type DatasourceHashRingType struct {
}
// for alert_rule sharding
var HostDatasource int64 = 100000
var HostDatasource int64 = 99999999
var DatasourceHashRing = DatasourceHashRingType{Rings: make(map[int64]*consistent.Consistent)}
func NewConsistentHashRing(replicas int32, nodes []string) *consistent.Consistent {
@@ -39,8 +40,8 @@ func RebuildConsistentHashRing(datasourceId int64, nodes []string) {
}
func (chr *DatasourceHashRingType) GetNode(datasourceId int64, pk string) (string, error) {
chr.RLock()
defer chr.RUnlock()
chr.Lock()
defer chr.Unlock()
_, exists := chr.Rings[datasourceId]
if !exists {
chr.Rings[datasourceId] = NewConsistentHashRing(int32(NodeReplicas), []string{})
@@ -52,14 +53,27 @@ func (chr *DatasourceHashRingType) GetNode(datasourceId int64, pk string) (strin
func (chr *DatasourceHashRingType) IsHit(datasourceId int64, pk string, currentNode string) bool {
node, err := chr.GetNode(datasourceId, pk)
if err != nil {
logger.Debugf("datasource id:%d pk:%s failed to get node from hashring:%v", datasourceId, pk, err)
if !errors.Is(err, consistent.ErrEmptyCircle) {
logger.Debugf("rule id:%s is not work, datasource id:%d failed to get node from hashring:%v", pk, datasourceId, err)
}
return false
}
return node == currentNode
}
func (chr *DatasourceHashRingType) Set(datasourceId int64, r *consistent.Consistent) {
chr.RLock()
defer chr.RUnlock()
chr.Lock()
defer chr.Unlock()
chr.Rings[datasourceId] = r
}
func (chr *DatasourceHashRingType) Clear() {
chr.Lock()
defer chr.Unlock()
for id := range chr.Rings {
if id == HostDatasource {
continue
}
delete(chr.Rings, id)
}
}

View File

@@ -9,6 +9,7 @@ import (
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/toolkits/pkg/logger"
)
@@ -16,14 +17,12 @@ import (
type Naming struct {
ctx *ctx.Context
heartbeatConfig aconf.HeartbeatConfig
isCenter bool
}
func NewNaming(ctx *ctx.Context, heartbeat aconf.HeartbeatConfig, isCenter bool) *Naming {
func NewNaming(ctx *ctx.Context, heartbeat aconf.HeartbeatConfig) *Naming {
naming := &Naming{
ctx: ctx,
heartbeatConfig: heartbeat,
isCenter: isCenter,
}
naming.Heartbeats()
return naming
@@ -45,6 +44,10 @@ func (n *Naming) Heartbeats() error {
}
func (n *Naming) loopDeleteInactiveInstances() {
if !n.ctx.IsCenter {
return
}
interval := time.Duration(10) * time.Minute
for {
time.Sleep(interval)
@@ -74,25 +77,35 @@ func (n *Naming) heartbeat() error {
var err error
// 在页面上维护实例和集群的对应关系
datasourceIds, err = models.GetDatasourceIdsByClusterName(n.ctx, n.heartbeatConfig.ClusterName)
datasourceIds, err = models.GetDatasourceIdsByEngineName(n.ctx, n.heartbeatConfig.EngineName)
if err != nil {
return err
}
if len(datasourceIds) == 0 {
err := models.AlertingEngineHeartbeatWithCluster(n.ctx, n.heartbeatConfig.Endpoint, n.heartbeatConfig.ClusterName, 0)
err := models.AlertingEngineHeartbeatWithCluster(n.ctx, n.heartbeatConfig.Endpoint, n.heartbeatConfig.EngineName, 0)
if err != nil {
logger.Warningf("heartbeat with cluster %s err:%v", "", err)
}
} else {
for i := 0; i < len(datasourceIds); i++ {
err := models.AlertingEngineHeartbeatWithCluster(n.ctx, n.heartbeatConfig.Endpoint, n.heartbeatConfig.ClusterName, datasourceIds[i])
err := models.AlertingEngineHeartbeatWithCluster(n.ctx, n.heartbeatConfig.Endpoint, n.heartbeatConfig.EngineName, datasourceIds[i])
if err != nil {
logger.Warningf("heartbeat with cluster %d err:%v", datasourceIds[i], err)
}
}
}
if len(datasourceIds) == 0 {
DatasourceHashRing.Clear()
for dsId := range localss {
if dsId == HostDatasource {
continue
}
delete(localss, dsId)
}
}
for i := 0; i < len(datasourceIds); i++ {
servers, err := n.ActiveServers(datasourceIds[i])
if err != nil {
@@ -112,10 +125,10 @@ func (n *Naming) heartbeat() error {
localss[datasourceIds[i]] = newss
}
if n.isCenter {
if n.ctx.IsCenter {
// 如果是中心节点,还需要处理 host 类型的告警规则host 类型告警规则,和数据源无关,想复用下数据源的 hash ring想用一个虚假的数据源 id 来处理
// if is center node, we need to handle host type alerting rules, host type alerting rules are not related to datasource, we want to reuse the hash ring of datasource, we want to use a fake datasource id to handle it
err := models.AlertingEngineHeartbeatWithCluster(n.ctx, n.heartbeatConfig.Endpoint, n.heartbeatConfig.ClusterName, HostDatasource)
err := models.AlertingEngineHeartbeatWithCluster(n.ctx, n.heartbeatConfig.Endpoint, n.heartbeatConfig.EngineName, HostDatasource)
if err != nil {
logger.Warningf("heartbeat with cluster %s err:%v", "", err)
}
@@ -146,6 +159,21 @@ func (n *Naming) ActiveServers(datasourceId int64) ([]string, error) {
return nil, fmt.Errorf("cluster is empty")
}
if !n.ctx.IsCenter {
lst, err := poster.GetByUrls[[]string](n.ctx, "/v1/n9e/servers-active?dsid="+fmt.Sprintf("%d", datasourceId))
return lst, err
}
// 30秒内有心跳就认为是活的
return models.AlertingEngineGetsInstances(n.ctx, "datasource_id = ? and clock > ?", datasourceId, time.Now().Unix()-30)
}
func (n *Naming) ActiveServersByEngineName() ([]string, error) {
if !n.ctx.IsCenter {
lst, err := poster.GetByUrls[[]string](n.ctx, "/v1/n9e/servers-active?engine_name="+n.heartbeatConfig.EngineName)
return lst, err
}
// 30秒内有心跳就认为是活的
return models.AlertingEngineGetsInstances(n.ctx, "engine_cluster = ? and clock > ?", n.heartbeatConfig.EngineName, time.Now().Unix()-30)
}

View File

@@ -23,6 +23,8 @@ import (
"github.com/toolkits/pkg/str"
)
type EventMuteHookFunc func(event *models.AlertCurEvent) bool
type ExternalProcessorsType struct {
ExternalLock sync.RWMutex
Processors map[string]*Processor
@@ -43,6 +45,8 @@ func (e *ExternalProcessorsType) GetExternalAlertRule(datasourceId, id int64) (*
return processor, has
}
type HandleEventFunc func(event *models.AlertCurEvent)
type Processor struct {
datasourceId int64
@@ -59,13 +63,17 @@ type Processor struct {
atertRuleCache *memsto.AlertRuleCacheType
TargetCache *memsto.TargetCacheType
busiGroupCache *memsto.BusiGroupCacheType
BusiGroupCache *memsto.BusiGroupCacheType
alertMuteCache *memsto.AlertMuteCacheType
datasourceCache *memsto.DatasourceCacheType
promClients *prom.PromClientMap
ctx *ctx.Context
stats *astats.Stats
HandleFireEventHook HandleEventFunc
HandleRecoverEventHook HandleEventFunc
EventMuteHook EventMuteHookFunc
}
func (p *Processor) Key() string {
@@ -94,7 +102,7 @@ func NewProcessor(rule *models.AlertRule, datasourceId int64, atertRuleCache *me
rule: rule,
TargetCache: targetCache,
busiGroupCache: busiGroupCache,
BusiGroupCache: busiGroupCache,
alertMuteCache: alertMuteCache,
atertRuleCache: atertRuleCache,
datasourceCache: datasourceCache,
@@ -102,6 +110,10 @@ func NewProcessor(rule *models.AlertRule, datasourceId int64, atertRuleCache *me
promClients: promClients,
ctx: ctx,
stats: stats,
HandleFireEventHook: func(event *models.AlertCurEvent) {},
HandleRecoverEventHook: func(event *models.AlertCurEvent) {},
EventMuteHook: func(event *models.AlertCurEvent) bool { return false },
}
p.mayHandleGroup()
@@ -113,13 +125,12 @@ func (p *Processor) Handle(anomalyPoints []common.AnomalyPoint, from string, inh
// 这些信息的修改是不会引起worker restart的但是确实会影响告警处理逻辑
// 所以这里直接从memsto.AlertRuleCache中获取并覆盖
p.inhibit = inhibit
p.rule = p.atertRuleCache.Get(p.rule.Id)
cachedRule := p.rule
cachedRule := p.atertRuleCache.Get(p.rule.Id)
if cachedRule == nil {
logger.Errorf("rule not found %+v", anomalyPoints)
return
}
p.rule = cachedRule
now := time.Now().Unix()
alertingKeys := map[string]struct{}{}
@@ -134,6 +145,11 @@ func (p *Processor) Handle(anomalyPoints []common.AnomalyPoint, from string, inh
logger.Debugf("rule_eval:%s event:%v is muted", p.Key(), event)
continue
}
if p.EventMuteHook(event) {
continue
}
tagHash := TagHash(anomalyPoint)
eventsMap[tagHash] = append(eventsMap[tagHash], event)
}
@@ -171,10 +187,12 @@ func (p *Processor) BuildEvent(anomalyPoint common.AnomalyPoint, from string, no
event.Callbacks = p.rule.Callbacks
event.CallbacksJSON = p.rule.CallbacksJSON
event.Annotations = p.rule.Annotations
event.AnnotationsJSON = p.rule.AnnotationsJSON
event.AnnotationsJSON = make(map[string]string)
event.RuleConfig = p.rule.RuleConfig
event.RuleConfigJson = p.rule.RuleConfigJson
event.Severity = anomalyPoint.Severity
event.ExtraConfig = p.rule.ExtraConfigJSON
event.PromQl = anomalyPoint.Query
if from == "inner" {
event.LastEvalTime = now
@@ -228,6 +246,8 @@ func (p *Processor) RecoverSingle(hash string, now int64, value *string) {
cachedRule.UpdateEvent(event)
event.IsRecovered = true
event.LastEvalTime = now
p.HandleRecoverEventHook(event)
p.pushEventToQueue(event)
}
@@ -285,9 +305,12 @@ func (p *Processor) fireEvent(event *models.AlertCurEvent) {
if cachedRule == nil {
return
}
logger.Debugf("rule_eval:%s event:%+v fire", p.Key(), event)
if fired, has := p.fires.Get(event.Hash); has {
p.fires.UpdateLastEvalTime(event.Hash, event.LastEvalTime)
event.FirstTriggerTime = fired.FirstTriggerTime
p.HandleFireEventHook(event)
if cachedRule.NotifyRepeatStep == 0 {
logger.Debugf("rule_eval:%s event:%+v repeat is zero nothing to do", p.Key(), event)
@@ -297,11 +320,10 @@ func (p *Processor) fireEvent(event *models.AlertCurEvent) {
}
// 之前发送过告警了,这次是否要继续发送,要看是否过了通道静默时间
if event.LastEvalTime > fired.LastSentTime+int64(cachedRule.NotifyRepeatStep)*60 {
if event.LastEvalTime >= fired.LastSentTime+int64(cachedRule.NotifyRepeatStep)*60 {
if cachedRule.NotifyMaxNumber == 0 {
// 最大可以发送次数如果是0表示不想限制最大发送次数一直发即可
event.NotifyCurNumber = fired.NotifyCurNumber + 1
event.FirstTriggerTime = fired.FirstTriggerTime
p.pushEventToQueue(event)
} else {
// 有最大发送次数的限制,就要看已经发了几次了,是否达到了最大发送次数
@@ -310,7 +332,6 @@ func (p *Processor) fireEvent(event *models.AlertCurEvent) {
return
} else {
event.NotifyCurNumber = fired.NotifyCurNumber + 1
event.FirstTriggerTime = fired.FirstTriggerTime
p.pushEventToQueue(event)
}
}
@@ -318,6 +339,7 @@ func (p *Processor) fireEvent(event *models.AlertCurEvent) {
} else {
event.NotifyCurNumber = 1
event.FirstTriggerTime = event.TriggerTime
p.HandleFireEventHook(event)
p.pushEventToQueue(event)
}
}
@@ -338,7 +360,7 @@ func (p *Processor) pushEventToQueue(e *models.AlertCurEvent) {
func (p *Processor) RecoverAlertCurEventFromDb() {
p.pendings = NewAlertCurEventMap(nil)
curEvents, err := models.AlertCurEventGetByRuleIdAndCluster(p.ctx, p.rule.Id, p.datasourceId)
curEvents, err := models.AlertCurEventGetByRuleIdAndDsId(p.ctx, p.rule.Id, p.datasourceId)
if err != nil {
logger.Errorf("recover event from db for rule:%s failed, err:%s", p.Key(), err)
p.fires = NewAlertCurEventMap(nil)
@@ -378,18 +400,19 @@ func (p *Processor) fillTags(anomalyPoint common.AnomalyPoint) {
t, err := template.New(fmt.Sprint(p.rule.Id)).Funcs(template.FuncMap(tplx.TemplateFuncMap)).Parse(text)
if err != nil {
tagValue = fmt.Sprintf("parse tag value failed, err:%s", err)
tagsMap[arr[0]] = tagValue
continue
}
var body bytes.Buffer
err = t.Execute(&body, e)
if err != nil {
tagValue = fmt.Sprintf("parse tag value failed, err:%s", err)
tagsMap[arr[0]] = tagValue
continue
}
if err == nil {
tagValue = body.String()
}
tagsMap[arr[0]] = tagValue
tagsMap[arr[0]] = body.String()
}
tagsMap["rulename"] = p.rule.Name
@@ -411,7 +434,7 @@ func (p *Processor) mayHandleIdent() {
func (p *Processor) mayHandleGroup() {
// handle bg
bg := p.busiGroupCache.GetByBusiGroupId(p.rule.GroupId)
bg := p.BusiGroupCache.GetByBusiGroupId(p.rule.GroupId)
if bg != nil {
p.groupName = bg.Name
}
@@ -432,7 +455,7 @@ func labelMapToArr(m map[string]string) []string {
}
func Hash(ruleId, datasourceId int64, vector common.AnomalyPoint) string {
return str.MD5(fmt.Sprintf("%d_%s_%d_%d", ruleId, vector.Labels.String(), datasourceId, vector.Severity))
return str.MD5(fmt.Sprintf("%d_%s_%d_%d_%s", ruleId, vector.Labels.String(), datasourceId, vector.Severity, vector.Query))
}
func TagHash(vector common.AnomalyPoint) string {

View File

@@ -54,14 +54,16 @@ func (rrc *RecordRuleContext) Start() {
if interval <= 0 {
interval = 10
}
ticker := time.NewTicker(time.Duration(interval) * time.Second)
go func() {
defer ticker.Stop()
for {
select {
case <-rrc.quit:
return
default:
case <-ticker.C:
rrc.Eval()
time.Sleep(time.Duration(interval) * time.Second)
}
}
}()

View File

@@ -29,25 +29,26 @@ func New(httpConfig httpx.Config, alert aconf.Alert, amc *memsto.AlertMuteCacheT
return &Router{
HTTP: httpConfig,
Alert: alert,
AlertStats: astats,
AlertMuteCache: amc,
TargetCache: tc,
BusiGroupCache: bgc,
AlertStats: astats,
Ctx: ctx,
ExternalProcessors: externalProcessors,
}
}
func (rt *Router) Config(r *gin.Engine) {
if !rt.HTTP.Alert.Enable {
if !rt.HTTP.APIForService.Enable {
return
}
service := r.Group("/v1/n9e")
if len(rt.HTTP.Alert.BasicAuth) > 0 {
service.Use(gin.BasicAuth(rt.HTTP.Alert.BasicAuth))
if len(rt.HTTP.APIForService.BasicAuth) > 0 {
service.Use(gin.BasicAuth(rt.HTTP.APIForService.BasicAuth))
}
service.POST("/event", rt.pushEventToQueue)
service.POST("/event-persist", rt.eventPersist)
service.POST("/make-event", rt.makeEvent)
}

View File

@@ -83,6 +83,13 @@ func (rt *Router) pushEventToQueue(c *gin.Context) {
ginx.NewRender(c).Message(nil)
}
func (rt *Router) eventPersist(c *gin.Context) {
var event *models.AlertCurEvent
ginx.BindJSON(c, &event)
event.FE2DB()
ginx.NewRender(c).Message(models.EventPersist(rt.Ctx, event))
}
type eventForm struct {
Alert bool `json:"alert"`
AnomalyPoints []common.AnomalyPoint `json:"vectors"`

View File

@@ -1,6 +1,7 @@
package sender
import (
"encoding/json"
"strconv"
"strings"
"time"
@@ -15,7 +16,7 @@ import (
"github.com/toolkits/pkg/logger"
)
func SendCallbacks(ctx *ctx.Context, urls []string, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType, ibexConf aconf.Ibex) {
func SendCallbacks(ctx *ctx.Context, urls []string, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType, userCache *memsto.UserCacheType, ibexConf aconf.Ibex) {
for _, url := range urls {
if url == "" {
continue
@@ -23,7 +24,7 @@ func SendCallbacks(ctx *ctx.Context, urls []string, event *models.AlertCurEvent,
if strings.HasPrefix(url, "${ibex}") {
if !event.IsRecovered {
handleIbex(ctx, url, event, targetCache, ibexConf)
handleIbex(ctx, url, event, targetCache, userCache, ibexConf)
}
continue
}
@@ -34,9 +35,9 @@ func SendCallbacks(ctx *ctx.Context, urls []string, event *models.AlertCurEvent,
resp, code, err := poster.PostJSON(url, 5*time.Second, event, 3)
if err != nil {
logger.Errorf("event_callback(rule_id=%d url=%s) fail, resp: %s, err: %v, code: %d", event.RuleId, url, string(resp), err, code)
logger.Errorf("event_callback_fail(rule_id=%d url=%s), resp: %s, err: %v, code: %d", event.RuleId, url, string(resp), err, code)
} else {
logger.Infof("event_callback(rule_id=%d url=%s) succ, resp: %s, code: %d", event.RuleId, url, string(resp), code)
logger.Infof("event_callback_succ(rule_id=%d url=%s), resp: %s, code: %d", event.RuleId, url, string(resp), code)
}
}
}
@@ -50,6 +51,7 @@ type TaskForm struct {
Pause string `json:"pause"`
Script string `json:"script"`
Args string `json:"args"`
Stdin string `json:"stdin"`
Action string `json:"action"`
Creator string `json:"creator"`
Hosts []string `json:"hosts"`
@@ -60,7 +62,7 @@ type TaskCreateReply struct {
Dat int64 `json:"dat"` // task.id
}
func handleIbex(ctx *ctx.Context, url string, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType, ibexConf aconf.Ibex) {
func handleIbex(ctx *ctx.Context, url string, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType, userCache *memsto.UserCacheType, ibexConf aconf.Ibex) {
arr := strings.Split(url, "/")
var idstr string
@@ -103,7 +105,7 @@ func handleIbex(ctx *ctx.Context, url string, event *models.AlertCurEvent, targe
// check perm
// tpl.GroupId - host - account 三元组校验权限
can, err := canDoIbex(ctx, tpl.UpdateBy, tpl, host, targetCache)
can, err := canDoIbex(ctx, tpl.UpdateBy, tpl, host, targetCache, userCache)
if err != nil {
logger.Errorf("event_callback_ibex: check perm fail: %v", err)
return
@@ -114,6 +116,30 @@ func handleIbex(ctx *ctx.Context, url string, event *models.AlertCurEvent, targe
return
}
tagsMap := make(map[string]string)
for i := 0; i < len(event.TagsJSON); i++ {
pair := strings.TrimSpace(event.TagsJSON[i])
if pair == "" {
continue
}
arr := strings.Split(pair, "=")
if len(arr) != 2 {
continue
}
tagsMap[arr[0]] = arr[1]
}
// 附加告警级别 告警触发值标签
tagsMap["alert_severity"] = strconv.Itoa(event.Severity)
tagsMap["alert_trigger_value"] = event.TriggerValue
tags, err := json.Marshal(tagsMap)
if err != nil {
logger.Errorf("event_callback_ibex: failed to marshal tags to json: %v", tagsMap)
return
}
// call ibex
in := TaskForm{
Title: tpl.Title + " FH: " + host,
@@ -124,6 +150,7 @@ func handleIbex(ctx *ctx.Context, url string, event *models.AlertCurEvent, targe
Pause: tpl.Pause,
Script: tpl.Script,
Args: tpl.Args,
Stdin: string(tags),
Action: "start",
Creator: tpl.UpdateBy,
Hosts: []string{host},
@@ -154,6 +181,7 @@ func handleIbex(ctx *ctx.Context, url string, event *models.AlertCurEvent, targe
// write db
record := models.TaskRecord{
Id: res.Dat,
EventId: event.Id,
GroupId: tpl.GroupId,
IbexAddress: ibexConf.Address,
IbexAuthUser: ibexConf.BasicAuthUser,
@@ -175,12 +203,8 @@ func handleIbex(ctx *ctx.Context, url string, event *models.AlertCurEvent, targe
}
}
func canDoIbex(ctx *ctx.Context, username string, tpl *models.TaskTpl, host string, targetCache *memsto.TargetCacheType) (bool, error) {
user, err := models.UserGetByUsername(ctx, username)
if err != nil {
return false, err
}
func canDoIbex(ctx *ctx.Context, username string, tpl *models.TaskTpl, host string, targetCache *memsto.TargetCacheType, userCache *memsto.UserCacheType) (bool, error) {
user := userCache.GetByUsername(username)
if user != nil && user.IsAdmin() {
return true, nil
}

View File

@@ -32,7 +32,7 @@ type DingtalkSender struct {
}
func (ds *DingtalkSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
@@ -40,7 +40,7 @@ func (ds *DingtalkSender) Send(ctx MessageContext) {
if len(urls) == 0 {
return
}
message := BuildTplMessage(ds.tpl, ctx.Event)
message := BuildTplMessage(ds.tpl, ctx.Events)
for _, url := range urls {
var body dingtalk
@@ -49,7 +49,7 @@ func (ds *DingtalkSender) Send(ctx MessageContext) {
body = dingtalk{
Msgtype: "markdown",
Markdown: dingtalkMarkdown{
Title: ctx.Rule.Name,
Title: ctx.Events[0].RuleName,
Text: message,
},
}
@@ -57,8 +57,8 @@ func (ds *DingtalkSender) Send(ctx MessageContext) {
body = dingtalk{
Msgtype: "markdown",
Markdown: dingtalkMarkdown{
Title: ctx.Rule.Name,
Text: message + " " + strings.Join(ats, " "),
Title: ctx.Events[0].RuleName,
Text: message + "\n" + strings.Join(ats, " "),
},
At: dingtalkAt{
AtMobiles: ats,

View File

@@ -22,18 +22,18 @@ type EmailSender struct {
}
func (es *EmailSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
tos := extract(ctx.Users)
var subject string
if es.subjectTpl != nil {
subject = BuildTplMessage(es.subjectTpl, ctx.Event)
subject = BuildTplMessage(es.subjectTpl, []*models.AlertCurEvent{ctx.Events[0]})
} else {
subject = ctx.Rule.Name
subject = ctx.Events[0].RuleName
}
content := BuildTplMessage(es.contentTpl, ctx.Event)
content := BuildTplMessage(es.contentTpl, ctx.Events)
es.WriteEmail(subject, content, tos)
}

View File

@@ -31,11 +31,11 @@ type FeishuSender struct {
}
func (fs *FeishuSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
urls, ats := fs.extract(ctx.Users)
message := BuildTplMessage(fs.tpl, ctx.Event)
message := BuildTplMessage(fs.tpl, ctx.Events)
for _, url := range urls {
body := feishu{
Msgtype: "text",

144
alert/sender/feishucard.go Normal file
View File

@@ -0,0 +1,144 @@
package sender
import (
"fmt"
"html/template"
"strings"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/toolkits/pkg/logger"
)
type Conf struct {
WideScreenMode bool `json:"wide_screen_mode"`
EnableForward bool `json:"enable_forward"`
}
type Te struct {
Content string `json:"content"`
Tag string `json:"tag"`
}
type Element struct {
Tag string `json:"tag"`
Text Te `json:"text"`
Content string `json:"content"`
Elements []Element `json:"elements"`
}
type Titles struct {
Content string `json:"content"`
Tag string `json:"tag"`
}
type Headers struct {
Title Titles `json:"title"`
Template string `json:"template"`
}
type Cards struct {
Config Conf `json:"config"`
Elements []Element `json:"elements"`
Header Headers `json:"header"`
}
type feishuCard struct {
feishu
Card Cards `json:"card"`
}
type FeishuCardSender struct {
tpl *template.Template
}
const (
Recovered = "recovered"
Triggered = "triggered"
)
var (
body = feishuCard{
feishu: feishu{Msgtype: "interactive"},
Card: Cards{
Config: Conf{
WideScreenMode: true,
EnableForward: true,
},
Header: Headers{
Title: Titles{
Tag: "plain_text",
},
},
Elements: []Element{
{
Tag: "div",
Text: Te{
Tag: "lark_md",
},
},
{
Tag: "hr",
},
{
Tag: "note",
Elements: []Element{
{
Tag: "lark_md",
},
},
},
},
},
}
)
func (fs *FeishuCardSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
urls, _ := fs.extract(ctx.Users)
message := BuildTplMessage(fs.tpl, ctx.Events)
color := "red"
lowerUnicode := strings.ToLower(message)
if strings.Count(lowerUnicode, Recovered) > 0 && strings.Count(lowerUnicode, Triggered) > 0 {
color = "orange"
} else if strings.Count(lowerUnicode, Recovered) > 0 {
color = "green"
}
SendTitle := fmt.Sprintf("🔔 %s", ctx.Events[0].RuleName)
body.Card.Header.Title.Content = SendTitle
body.Card.Header.Template = color
body.Card.Elements[0].Text.Content = message
body.Card.Elements[2].Elements[0].Content = SendTitle
for _, url := range urls {
fs.doSend(url, body)
}
}
func (fs *FeishuCardSender) extract(users []*models.User) ([]string, []string) {
urls := make([]string, 0, len(users))
ats := make([]string, 0)
for i := range users {
if token, has := users[i].ExtractToken(models.FeishuCard); has {
url := token
if !strings.HasPrefix(token, "https://") {
url = "https://open.feishu.cn/open-apis/bot/v2/hook/" + strings.TrimSpace(token)
}
urls = append(urls, url)
}
}
return urls, ats
}
func (fs *FeishuCardSender) doSend(url string, body feishuCard) {
res, code, err := poster.PostJSON(url, time.Second*5, body, 3)
if err != nil {
logger.Errorf("feishucard_sender: result=fail url=%s code=%d error=%v response=%s", url, code, err, string(res))
} else {
logger.Debugf("feishucard_sender: result=succ url=%s code=%d response=%s", url, code, string(res))
}
}

View File

@@ -28,7 +28,7 @@ type MmSender struct {
}
func (ms *MmSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
@@ -36,7 +36,7 @@ func (ms *MmSender) Send(ctx MessageContext) {
if len(urls) == 0 {
return
}
message := BuildTplMessage(ms.tpl, ctx.Event)
message := BuildTplMessage(ms.tpl, ctx.Events)
SendMM(MatterMostMessage{
Text: message,
@@ -59,6 +59,7 @@ func SendMM(message MatterMostMessage) {
u, err := url.Parse(message.Tokens[i])
if err != nil {
logger.Errorf("mm_sender: failed to parse error=%v", err)
continue
}
v, err := url.ParseQuery(u.RawQuery)

View File

@@ -35,7 +35,7 @@ func alertingCallScript(stdinBytes []byte, notifyScript models.NotifyScript) {
if file.IsExist(fpath) {
oldContent, err := file.ToString(fpath)
if err != nil {
logger.Errorf("event_notify: read script file err: %v", err)
logger.Errorf("event_script_notify_fail: read script file err: %v", err)
return
}
@@ -47,13 +47,13 @@ func alertingCallScript(stdinBytes []byte, notifyScript models.NotifyScript) {
if rewrite {
_, err := file.WriteString(fpath, config.Content)
if err != nil {
logger.Errorf("event_notify: write script file err: %v", err)
logger.Errorf("event_script_notify_fail: write script file err: %v", err)
return
}
err = os.Chmod(fpath, 0777)
if err != nil {
logger.Errorf("event_notify: chmod script file err: %v", err)
logger.Errorf("event_script_notify_fail: chmod script file err: %v", err)
return
}
}
@@ -70,7 +70,7 @@ func alertingCallScript(stdinBytes []byte, notifyScript models.NotifyScript) {
err := startCmd(cmd)
if err != nil {
logger.Errorf("event_notify: run cmd err: %v", err)
logger.Errorf("event_script_notify_fail: run cmd err: %v", err)
return
}
@@ -78,20 +78,20 @@ func alertingCallScript(stdinBytes []byte, notifyScript models.NotifyScript) {
if isTimeout {
if err == nil {
logger.Errorf("event_notify: timeout and killed process %s", fpath)
logger.Errorf("event_script_notify_fail: timeout and killed process %s", fpath)
}
if err != nil {
logger.Errorf("event_notify: kill process %s occur error %v", fpath, err)
logger.Errorf("event_script_notify_fail: kill process %s occur error %v", fpath, err)
}
return
}
if err != nil {
logger.Errorf("event_notify: exec script %s occur error: %v, output: %s", fpath, err, buf.String())
logger.Errorf("event_script_notify_fail: exec script %s occur error: %v, output: %s", fpath, err, buf.String())
return
}
logger.Infof("event_notify: exec %s output: %s", fpath, buf.String())
logger.Infof("event_script_notify_ok: exec %s output: %s", fpath, buf.String())
}

View File

@@ -17,9 +17,9 @@ type (
// MessageContext 一个event所生成的告警通知的上下文
MessageContext struct {
Users []*models.User
Rule *models.AlertRule
Event *models.AlertCurEvent
Users []*models.User
Rule *models.AlertRule
Events []*models.AlertCurEvent
}
)
@@ -31,6 +31,8 @@ func NewSender(key string, tpls map[string]*template.Template, smtp aconf.SMTPCo
return &WecomSender{tpl: tpls[models.Wecom]}
case models.Feishu:
return &FeishuSender{tpl: tpls[models.Feishu]}
case models.FeishuCard:
return &FeishuCardSender{tpl: tpls[models.FeishuCard]}
case models.Email:
return &EmailSender{subjectTpl: tpls["mailsubject"], contentTpl: tpls[models.Email], smtp: smtp}
case models.Mm:
@@ -41,22 +43,32 @@ func NewSender(key string, tpls map[string]*template.Template, smtp aconf.SMTPCo
return nil
}
func BuildMessageContext(rule *models.AlertRule, event *models.AlertCurEvent, uids []int64, userCache *memsto.UserCacheType) MessageContext {
func BuildMessageContext(rule *models.AlertRule, events []*models.AlertCurEvent, uids []int64, userCache *memsto.UserCacheType) MessageContext {
users := userCache.GetByUserIds(uids)
return MessageContext{
Rule: rule,
Event: event,
Users: users,
Rule: rule,
Events: events,
Users: users,
}
}
func BuildTplMessage(tpl *template.Template, event *models.AlertCurEvent) string {
type BuildTplMessageFunc func(tpl *template.Template, events []*models.AlertCurEvent) string
var BuildTplMessage BuildTplMessageFunc = buildTplMessage
func buildTplMessage(tpl *template.Template, events []*models.AlertCurEvent) string {
if tpl == nil {
return "tpl for current sender not found, please check configuration"
}
var body bytes.Buffer
if err := tpl.Execute(&body, event); err != nil {
return err.Error()
var content string
for _, event := range events {
var body bytes.Buffer
if err := tpl.Execute(&body, event); err != nil {
return err.Error()
}
content += body.String() + "\n\n"
}
return body.String()
return content
}

View File

@@ -26,11 +26,11 @@ type TelegramSender struct {
}
func (ts *TelegramSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
tokens := ts.extract(ctx.Users)
message := BuildTplMessage(ts.tpl, ctx.Event)
message := BuildTplMessage(ts.tpl, ctx.Events)
SendTelegram(TelegramMessage{
Text: message,

View File

@@ -53,7 +53,7 @@ func SendWebhooks(webhooks []*models.Webhook, event *models.AlertCurEvent) {
var resp *http.Response
resp, err = client.Do(req)
if err != nil {
logger.Warningf("WebhookCallError, ruleId: [%d], eventId: [%d], url: [%s], error: [%s]", event.RuleId, event.Id, conf.Url, err)
logger.Errorf("event_webhook_fail, ruleId: [%d], eventId: [%d], url: [%s], error: [%s]", event.RuleId, event.Id, conf.Url, err)
continue
}
@@ -63,6 +63,6 @@ func SendWebhooks(webhooks []*models.Webhook, event *models.AlertCurEvent) {
body, _ = ioutil.ReadAll(resp.Body)
}
logger.Debugf("alertingWebhook done, url: %s, response code: %d, body: %s", conf.Url, resp.StatusCode, string(body))
logger.Debugf("event_webhook_succ, url: %s, response code: %d, body: %s", conf.Url, resp.StatusCode, string(body))
}
}

View File

@@ -25,11 +25,11 @@ type WecomSender struct {
}
func (ws *WecomSender) Send(ctx MessageContext) {
if len(ctx.Users) == 0 || ctx.Rule == nil || ctx.Event == nil {
if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
return
}
urls := ws.extract(ctx.Users)
message := BuildTplMessage(ws.tpl, ctx.Event)
message := BuildTplMessage(ws.tpl, ctx.Events)
for _, url := range urls {
body := wecom{
Msgtype: "markdown",

View File

@@ -1,19 +1,14 @@
package cconf
import (
"github.com/gin-gonic/gin"
)
type Center struct {
Plugins []Plugin
BasicAuth gin.Accounts
MetricsYamlFile string
OpsYamlFile string
BuiltinIntegrationsDir string
I18NHeaderKey string
MetricDesc MetricDescType
TargetMetrics map[string]string
AnonymousAccess AnonymousAccess
UseFileAssets bool
}
type Plugin struct {

View File

@@ -4,7 +4,6 @@ import (
"path"
"github.com/toolkits/pkg/file"
"github.com/toolkits/pkg/runner"
)
// metricDesc , As load map happens before read map, there is no necessary to use concurrent map for metric desc store
@@ -33,10 +32,10 @@ func GetMetricDesc(lang, metric string) string {
return MetricDesc.CommonDesc[metric]
}
func LoadMetricsYaml(metricsYamlFile string) error {
func LoadMetricsYaml(configDir, metricsYamlFile string) error {
fp := metricsYamlFile
if fp == "" {
fp = path.Join(runner.Cwd, "etc", "metrics.yaml")
fp = path.Join(configDir, "metrics.yaml")
}
if !file.IsExist(fp) {
return nil

View File

@@ -4,7 +4,6 @@ import (
"path"
"github.com/toolkits/pkg/file"
"github.com/toolkits/pkg/runner"
)
var Operations = Operation{}
@@ -19,10 +18,10 @@ type Ops struct {
Ops []string `yaml:"ops" json:"ops"`
}
func LoadOpsYaml(opsYamlFile string) error {
func LoadOpsYaml(configDir string, opsYamlFile string) error {
fp := opsYamlFile
if fp == "" {
fp = path.Join(runner.Cwd, "etc", "ops.yaml")
fp = path.Join(configDir, "ops.yaml")
}
if !file.IsExist(fp) {
return nil

View File

@@ -11,8 +11,10 @@ import (
"github.com/ccfos/nightingale/v6/center/metas"
"github.com/ccfos/nightingale/v6/center/sso"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/models/migrate"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/httpx"
"github.com/ccfos/nightingale/v6/pkg/i18nx"
@@ -33,8 +35,8 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
return nil, fmt.Errorf("failed to init config: %v", err)
}
cconf.LoadMetricsYaml(config.Center.MetricsYamlFile)
cconf.LoadOpsYaml(config.Center.OpsYamlFile)
cconf.LoadMetricsYaml(configDir, config.Center.MetricsYamlFile)
cconf.LoadOpsYaml(configDir, config.Center.OpsYamlFile)
logxClean, err := logx.Init(config.Log)
if err != nil {
@@ -47,8 +49,9 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
if err != nil {
return nil, err
}
ctx := ctx.NewContext(context.Background(), db)
ctx := ctx.NewContext(context.Background(), db, true)
models.InitRoot(ctx)
migrate.Migrate(db)
redis, err := storage.NewRedis(config.Redis)
if err != nil {
@@ -56,7 +59,7 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
}
metas := metas.New(redis)
idents := idents.New(db)
idents := idents.New(ctx)
syncStats := memsto.NewSyncStats()
alertStats := astats.NewSyncStats()
@@ -69,16 +72,20 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
alertMuteCache := memsto.NewAlertMuteCache(ctx, syncStats)
alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
notifyConfigCache := memsto.NewNotifyConfigCache(ctx)
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
promClients := prom.NewPromClient(ctx, config.Alert.Heartbeat)
externalProcessors := process.NewExternalProcessors()
alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients, true)
alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients, userCache, userGroupCache)
writers := writer.NewWriters(config.Pushgw)
httpx.InitRSAConfig(&config.HTTP.RSA)
alertrtRouter := alertrt.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors)
centerRouter := centerrt.New(config.HTTP, config.Center, cconf.Operations, dsCache, notifyConfigCache, promClients, redis, sso, ctx, metas)
centerRouter := centerrt.New(config.HTTP, config.Center, cconf.Operations, dsCache, notifyConfigCache, promClients, redis, sso, ctx, metas, idents, targetCache, userCache, userGroupCache)
pushgwRouter := pushgwrt.New(config.HTTP, config.Pushgw, targetCache, busiGroupCache, idents, writers, ctx)
r := httpx.GinEngine(config.Global.RunMode, config.HTTP)
@@ -86,6 +93,7 @@ func Initialize(configDir string, cryptoKey string) (func(), error) {
centerRouter.Config(r)
alertrtRouter.Config(r)
pushgwRouter.Config(r)
dumper.ConfigRouter(r)
httpClean := httpx.Init(config.HTTP, r)

View File

@@ -95,11 +95,10 @@ func (s *Set) updateTargets(m map[string]models.HostMeta) error {
return nil
}
var values []interface{}
newMap := make(map[string]interface{}, count)
for ident, meta := range m {
values = append(values, models.WrapIdent(ident))
values = append(values, meta)
newMap[models.WrapIdent(ident)] = meta
}
err := s.redis.MSet(context.Background(), values...).Err()
err := storage.MSet(context.Background(), s.redis, newMap)
return err
}

View File

@@ -4,6 +4,7 @@ import (
"fmt"
"net/http"
"path"
"runtime"
"strings"
"time"
@@ -11,15 +12,19 @@ import (
"github.com/ccfos/nightingale/v6/center/cstats"
"github.com/ccfos/nightingale/v6/center/metas"
"github.com/ccfos/nightingale/v6/center/sso"
_ "github.com/ccfos/nightingale/v6/front/statik"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/pkg/aop"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/httpx"
"github.com/ccfos/nightingale/v6/prom"
"github.com/ccfos/nightingale/v6/pushgw/idents"
"github.com/ccfos/nightingale/v6/storage"
"github.com/toolkits/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/rakyll/statik/fs"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/runner"
)
type Router struct {
@@ -31,12 +36,17 @@ type Router struct {
PromClients *prom.PromClientMap
Redis storage.Redis
MetaSet *metas.Set
IdentSet *idents.Set
TargetCache *memsto.TargetCacheType
Sso *sso.SsoClient
UserCache *memsto.UserCacheType
UserGroupCache *memsto.UserGroupCacheType
Ctx *ctx.Context
}
func New(httpConfig httpx.Config, center cconf.Center, operations cconf.Operation, ds *memsto.DatasourceCacheType, ncc *memsto.NotifyConfigCacheType,
pc *prom.PromClientMap, redis storage.Redis, sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set) *Router {
pc *prom.PromClientMap, redis storage.Redis, sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set, idents *idents.Set, tc *memsto.TargetCacheType,
uc *memsto.UserCacheType, ugc *memsto.UserGroupCacheType) *Router {
return &Router{
HTTP: httpConfig,
Center: center,
@@ -45,9 +55,13 @@ func New(httpConfig httpx.Config, center cconf.Center, operations cconf.Operatio
NotifyConfigCache: ncc,
PromClients: pc,
Redis: redis,
Sso: sso,
Ctx: ctx,
MetaSet: metaSet,
IdentSet: idents,
TargetCache: tc,
Sso: sso,
UserCache: uc,
UserGroupCache: ugc,
Ctx: ctx,
}
}
@@ -86,24 +100,57 @@ func languageDetector(i18NHeaderKey string) gin.HandlerFunc {
}
}
func (rt *Router) configNoRoute(r *gin.Engine) {
func (rt *Router) configNoRoute(r *gin.Engine, fs *http.FileSystem) {
r.NoRoute(func(c *gin.Context) {
arr := strings.Split(c.Request.URL.Path, ".")
suffix := arr[len(arr)-1]
switch suffix {
case "png", "jpeg", "jpg", "svg", "ico", "gif", "css", "js", "html", "htm", "gz", "zip", "map":
c.File(path.Join(strings.Split("pub/"+c.Request.URL.Path, "/")...))
if !rt.Center.UseFileAssets {
c.FileFromFS(c.Request.URL.Path, *fs)
} else {
cwdarr := []string{"/"}
if runtime.GOOS == "windows" {
cwdarr[0] = ""
}
cwdarr = append(cwdarr, strings.Split(runner.Cwd, "/")...)
cwdarr = append(cwdarr, "pub")
cwdarr = append(cwdarr, strings.Split(c.Request.URL.Path, "/")...)
c.File(path.Join(cwdarr...))
}
default:
c.File(path.Join("pub", "index.html"))
if !rt.Center.UseFileAssets {
c.FileFromFS("/", *fs)
} else {
cwdarr := []string{"/"}
if runtime.GOOS == "windows" {
cwdarr[0] = ""
}
cwdarr = append(cwdarr, strings.Split(runner.Cwd, "/")...)
cwdarr = append(cwdarr, "pub")
cwdarr = append(cwdarr, "index.html")
c.File(path.Join(cwdarr...))
}
}
})
}
func (rt *Router) Config(r *gin.Engine) {
r.Use(stat())
r.Use(languageDetector(rt.Center.I18NHeaderKey))
r.Use(aop.Recovery())
statikFS, err := fs.New()
if err != nil {
logger.Errorf("cannot create statik fs: %v", err)
}
if !rt.Center.UseFileAssets {
r.StaticFS("/pub", statikFS)
}
pagesPrefix := "/api/n9e"
pages := r.Group(pagesPrefix)
{
@@ -112,23 +159,30 @@ func (rt *Router) Config(r *gin.Engine) {
pages.Any("/proxy/:id/*url", rt.dsProxy)
pages.POST("/query-range-batch", rt.promBatchQueryRange)
pages.POST("/query-instant-batch", rt.promBatchQueryInstant)
pages.GET("/datasource/brief", rt.datasourceBriefs)
} else {
pages.Any("/proxy/:id/*url", rt.auth(), rt.dsProxy)
pages.POST("/query-range-batch", rt.auth(), rt.promBatchQueryRange)
pages.POST("/query-instant-batch", rt.auth(), rt.promBatchQueryInstant)
pages.GET("/datasource/brief", rt.auth(), rt.datasourceBriefs)
}
pages.POST("/auth/login", rt.jwtMock(), rt.loginPost)
pages.POST("/auth/logout", rt.jwtMock(), rt.logoutPost)
pages.POST("/auth/logout", rt.jwtMock(), rt.auth(), rt.logoutPost)
pages.POST("/auth/refresh", rt.jwtMock(), rt.refreshPost)
pages.POST("/auth/captcha", rt.jwtMock(), rt.generateCaptcha)
pages.POST("/auth/captcha-verify", rt.jwtMock(), rt.captchaVerify)
pages.GET("/auth/ifshowcaptcha", rt.ifShowCaptcha)
pages.GET("/auth/sso-config", rt.ssoConfigNameGet)
pages.GET("/auth/rsa-config", rt.rsaConfigGet)
pages.GET("/auth/redirect", rt.loginRedirect)
pages.GET("/auth/redirect/cas", rt.loginRedirectCas)
pages.GET("/auth/redirect/oauth", rt.loginRedirectOAuth)
pages.GET("/auth/callback", rt.loginCallback)
pages.GET("/auth/callback/cas", rt.loginCallbackCas)
pages.GET("/auth/callback/oauth", rt.loginCallbackOAuth)
pages.GET("/auth/perms", rt.allPerms)
pages.GET("/metrics/desc", rt.metricsDescGetFile)
pages.POST("/metrics/desc", rt.metricsDescGetMap)
@@ -188,11 +242,7 @@ func (rt *Router) Config(r *gin.Engine) {
pages.GET("/dashboards/builtin/list", rt.builtinBoardGets)
pages.GET("/builtin-boards-cates", rt.auth(), rt.user(), rt.builtinBoardCateGets)
pages.POST("/builtin-boards-detail", rt.auth(), rt.user(), rt.builtinBoardDetailGets)
pages.GET("/integrations/icon/:cate/:name", func(c *gin.Context) {
cate := ginx.UrlParamStr(c, "cate")
fp := "integrations/" + cate + "/icon/" + ginx.UrlParamStr(c, "name")
c.File(path.Join(fp))
})
pages.GET("/integrations/icon/:cate/:name", rt.builtinIcon)
pages.GET("/busi-group/:id/boards", rt.auth(), rt.user(), rt.perm("/dashboards"), rt.bgro(), rt.boardGets)
pages.POST("/busi-group/:id/boards", rt.auth(), rt.user(), rt.perm("/dashboards/add"), rt.bgrw(), rt.boardAdd)
@@ -218,6 +268,7 @@ func (rt *Router) Config(r *gin.Engine) {
pages.PUT("/busi-group/:id/alert-rules/fields", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.bgrw(), rt.alertRulePutFields)
pages.PUT("/busi-group/:id/alert-rule/:arid", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.alertRulePutByFE)
pages.GET("/alert-rule/:arid", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRuleGet)
pages.PUT("/busi-group/:id/alert-rule/:arid/validate", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.alertRuleValidation)
pages.GET("/busi-group/:id/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules"), rt.recordingRuleGets)
pages.POST("/busi-group/:id/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules/add"), rt.bgrw(), rt.recordingRuleAddByFE)
@@ -288,11 +339,13 @@ func (rt *Router) Config(r *gin.Engine) {
pages.GET("/role/:id/ops", rt.auth(), rt.admin(), rt.operationOfRole)
pages.PUT("/role/:id/ops", rt.auth(), rt.admin(), rt.roleBindOperation)
pages.GET("operation", rt.operations)
pages.GET("/operation", rt.operations)
pages.GET("/notify-tpls", rt.auth(), rt.admin(), rt.notifyTplGets)
pages.PUT("/notify-tpl/content", rt.auth(), rt.admin(), rt.notifyTplUpdateContent)
pages.PUT("/notify-tpl", rt.auth(), rt.admin(), rt.notifyTplUpdate)
pages.POST("/notify-tpl", rt.auth(), rt.admin(), rt.notifyTplAdd)
pages.DELETE("/notify-tpl/:id", rt.auth(), rt.admin(), rt.notifyTplDel)
pages.POST("/notify-tpl/preview", rt.auth(), rt.admin(), rt.notifyTplPreview)
pages.GET("/sso-configs", rt.auth(), rt.admin(), rt.ssoConfigGets)
@@ -312,19 +365,28 @@ func (rt *Router) Config(r *gin.Engine) {
pages.GET("/notify-config", rt.auth(), rt.admin(), rt.notifyConfigGet)
pages.PUT("/notify-config", rt.auth(), rt.admin(), rt.notifyConfigPut)
pages.GET("/es-index-pattern", rt.auth(), rt.esIndexPatternGet)
pages.GET("/es-index-pattern-list", rt.auth(), rt.esIndexPatternGetAll)
pages.POST("/es-index-pattern", rt.auth(), rt.admin(), rt.esIndexPatternAdd)
pages.PUT("/es-index-pattern", rt.auth(), rt.admin(), rt.esIndexPatternPut)
pages.DELETE("/es-index-pattern", rt.auth(), rt.admin(), rt.esIndexPatternDel)
}
if rt.HTTP.Service.Enable {
if rt.HTTP.APIForService.Enable {
service := r.Group("/v1/n9e")
if len(rt.HTTP.Service.BasicAuth) > 0 {
service.Use(gin.BasicAuth(rt.HTTP.Service.BasicAuth))
if len(rt.HTTP.APIForService.BasicAuth) > 0 {
service.Use(gin.BasicAuth(rt.HTTP.APIForService.BasicAuth))
}
{
service.Any("/prometheus/*url", rt.dsProxy)
service.POST("/users", rt.userAddPost)
service.GET("/users", rt.userFindAll)
service.GET("/targets", rt.targetGets)
service.GET("/user-groups", rt.userGroupGetsByService)
service.GET("/user-group-members", rt.userGroupMemberGetsByService)
service.GET("/targets", rt.targetGetsByService)
service.GET("/targets/tags", rt.targetGetTags)
service.POST("/targets/tags", rt.targetBindTagsByService)
service.DELETE("/targets/tags", rt.targetUnbindTagsByService)
@@ -336,36 +398,56 @@ func (rt *Router) Config(r *gin.Engine) {
service.GET("/alert-rule/:arid", rt.alertRuleGet)
service.GET("/alert-rules", rt.alertRulesGetByService)
service.GET("/alert-subscribes", rt.alertSubscribeGetsByService)
service.GET("/busi-groups", rt.busiGroupGetsByService)
service.GET("/datasources", rt.datasourceGetsByService)
service.GET("/datasource-ids", rt.getDatasourceIds)
service.POST("/server-heartbeat", rt.serverHeartbeat)
service.GET("/servers-active", rt.serversActive)
service.GET("/recording-rules", rt.recordingRuleGetsByService)
service.GET("/alert-mutes", rt.alertMuteGets)
service.POST("/alert-mutes", rt.alertMuteAddByService)
service.DELETE("/alert-mutes", rt.alertMuteDel)
service.GET("/alert-cur-events", rt.alertCurEventsList)
service.GET("/alert-cur-events-get-by-rid", rt.alertCurEventsGetByRid)
service.GET("/alert-his-events", rt.alertHisEventsList)
service.GET("/alert-his-event/:eid", rt.alertHisEventGet)
service.GET("/config/:id", rt.configGet)
service.GET("/configs", rt.configsGet)
service.GET("/config", rt.configGetByKey)
service.PUT("/configs", rt.configsPut)
service.POST("/configs", rt.configsPost)
service.DELETE("/configs", rt.configsDel)
service.POST("/conf-prop/encrypt", rt.confPropEncrypt)
service.POST("/conf-prop/decrypt", rt.confPropDecrypt)
service.GET("/statistic", rt.statistic)
service.GET("/notify-tpls", rt.notifyTplGets)
service.POST("/task-record-add", rt.taskRecordAdd)
}
}
if rt.HTTP.Heartbeat.Enable {
if rt.HTTP.APIForAgent.Enable {
heartbeat := r.Group("/v1/n9e")
{
if len(rt.HTTP.Heartbeat.BasicAuth) > 0 {
heartbeat.Use(gin.BasicAuth(rt.HTTP.Heartbeat.BasicAuth))
if len(rt.HTTP.APIForAgent.BasicAuth) > 0 {
heartbeat.Use(gin.BasicAuth(rt.HTTP.APIForAgent.BasicAuth))
}
heartbeat.POST("/heartbeat", rt.heartbeat)
}
}
rt.configNoRoute(r)
rt.configNoRoute(r, &statikFS)
}
func Render(c *gin.Context, data, msg interface{}) {
@@ -387,9 +469,9 @@ func Dangerous(c *gin.Context, v interface{}, code ...int) {
switch t := v.(type) {
case string:
if t != "" {
c.JSON(http.StatusOK, gin.H{"error": gin.H{"message": v}})
c.JSON(http.StatusOK, gin.H{"error": v})
}
case error:
c.JSON(http.StatusOK, gin.H{"error": gin.H{"message": t.Error()}})
c.JSON(http.StatusOK, gin.H{"error": t.Error()})
}
}

View File

@@ -69,6 +69,11 @@ func (rt *Router) alertAggrViewPut(c *gin.Context) {
return
}
}
ginx.NewRender(c).Message(view.Update(rt.Ctx, f.Name, f.Rule, f.Cate, me.Id))
view.Name = f.Name
view.Rule = f.Rule
view.Cate = f.Cate
if view.CreateBy == 0 {
view.CreateBy = me.Id
}
ginx.NewRender(c).Message(view.Update(rt.Ctx))
}

View File

@@ -128,6 +128,13 @@ func (rt *Router) alertCurEventsCardDetails(c *gin.Context) {
ginx.NewRender(c).Data(list, err)
}
// alertCurEventsGetByRid
func (rt *Router) alertCurEventsGetByRid(c *gin.Context) {
rid := ginx.QueryInt64(c, "rid")
dsId := ginx.QueryInt64(c, "dsid")
ginx.NewRender(c).Data(models.AlertCurEventGetByRuleIdAndDsId(rt.Ctx, rid, dsId))
}
// 列表方式,拉取活跃告警
func (rt *Router) alertCurEventsList(c *gin.Context) {
stime, etime := getTimeRange(c)

View File

@@ -2,6 +2,7 @@ package router
import (
"net/http"
"strconv"
"strings"
"time"
@@ -27,7 +28,12 @@ func (rt *Router) alertRuleGets(c *gin.Context) {
}
func (rt *Router) alertRulesGetByService(c *gin.Context) {
prods := strings.Split(ginx.QueryStr(c, "prods", ""), ",")
prods := []string{}
prodStr := ginx.QueryStr(c, "prods", "")
if prodStr != "" {
prods = strings.Split(ginx.QueryStr(c, "prods", ""), ",")
}
query := ginx.QueryStr(c, "query", "")
algorithm := ginx.QueryStr(c, "algorithm", "")
cluster := ginx.QueryStr(c, "cluster", "")
@@ -266,3 +272,56 @@ func (rt *Router) alertRuleGet(c *gin.Context) {
ginx.NewRender(c).Data(ar, err)
}
//pre validation before save rule
func (rt *Router) alertRuleValidation(c *gin.Context) {
var f models.AlertRule //new
ginx.BindJSON(c, &f)
arid := ginx.UrlParamInt64(c, "arid")
ar, err := models.AlertRuleGetById(rt.Ctx, arid)
ginx.Dangerous(err)
if ar == nil {
ginx.NewRender(c, http.StatusNotFound).Message("No such AlertRule")
return
}
rt.bgrwCheck(c, ar.GroupId)
if len(f.NotifyChannelsJSON) > 0 && len(f.NotifyGroupsJSON) > 0 { //Validation NotifyChannels
ngids := make([]int64, 0, len(f.NotifyChannelsJSON))
for i := range f.NotifyGroupsJSON {
id, _ := strconv.ParseInt(f.NotifyGroupsJSON[i], 10, 64)
ngids = append(ngids, id)
}
userGroups := rt.UserGroupCache.GetByUserGroupIds(ngids)
uids := make([]int64, 0)
for i := range userGroups {
uids = append(uids, userGroups[i].UserIds...)
}
users := rt.UserCache.GetByUserIds(uids)
//If any users have a certain notify channel's token, it will be okay. Otherwise, this notify channel is absent of tokens.
ancs := make([]string, 0, len(f.NotifyChannelsJSON)) //absent Notify Channels
for i := range f.NotifyChannelsJSON {
flag := true
for ui := range users {
if _, b := users[ui].ExtractToken(f.NotifyChannelsJSON[i]); b {
flag = false
break
}
}
if flag {
ancs = append(ancs, f.NotifyChannelsJSON[i])
}
}
if len(ancs) > 0 {
ginx.NewRender(c).Message(i18n.Sprintf(c.GetHeader("X-Language"), "All users are missing notify channel configurations. Please check for missing tokens (each channel should be configured with at least one user). %s", ancs))
return
}
}
ginx.NewRender(c).Message("")
}

View File

@@ -83,6 +83,9 @@ func (rt *Router) alertSubscribePut(c *gin.Context) {
rt.Ctx,
"name",
"disabled",
"prod",
"cate",
"datasource_ids",
"cluster",
"rule_id",
"tags",
@@ -96,7 +99,8 @@ func (rt *Router) alertSubscribePut(c *gin.Context) {
"webhooks",
"for_duration",
"redefine_webhooks",
"datasource_ids",
"severities",
"extra_config",
))
}
@@ -110,3 +114,8 @@ func (rt *Router) alertSubscribeDel(c *gin.Context) {
ginx.NewRender(c).Message(models.AlertSubscribeDel(rt.Ctx, f.Ids))
}
func (rt *Router) alertSubscribeGetsByService(c *gin.Context) {
lst, err := models.AlertSubscribeGetsByService(rt.Ctx)
ginx.NewRender(c).Data(lst, err)
}

View File

@@ -91,6 +91,9 @@ func (rt *Router) builtinBoardCateGets(c *gin.Context) {
boardCate.Name = dir
files, err := file.FilesUnder(fp + "/" + dir + "/dashboards")
ginx.Dangerous(err)
if len(files) == 0 {
continue
}
var boards []Payload
for _, f := range files {
@@ -243,6 +246,9 @@ func (rt *Router) builtinAlertRules(c *gin.Context) {
alertCate.Name = dir
files, err := file.FilesUnder(fp + "/" + dir + "/alerts")
ginx.Dangerous(err)
if len(files) == 0 {
continue
}
alertRules := make(map[string][]models.AlertRule)
for _, f := range files {
@@ -298,3 +304,14 @@ func (rt *Router) builtinBoardGet(c *gin.Context) {
ginx.Bomb(http.StatusBadRequest, "%s not found", name)
}
func (rt *Router) builtinIcon(c *gin.Context) {
fp := rt.Center.BuiltinIntegrationsDir
if fp == "" {
fp = path.Join(runner.Cwd, "integrations")
}
cate := ginx.UrlParamStr(c, "cate")
iconPath := fp + "/" + cate + "/icon/" + ginx.UrlParamStr(c, "name")
c.File(path.Join(iconPath))
}

View File

@@ -123,6 +123,11 @@ func (rt *Router) busiGroupGets(c *gin.Context) {
ginx.NewRender(c).Data(lst, err)
}
func (rt *Router) busiGroupGetsByService(c *gin.Context) {
lst, err := models.BusiGroupGetAll(rt.Ctx)
ginx.NewRender(c).Data(lst, err)
}
// 这个接口只有在活跃告警页面才调用获取各个BG的活跃告警数量
func (rt *Router) busiGroupAlertingsGets(c *gin.Context) {
ids := ginx.QueryStr(c, "ids", "")

View File

@@ -0,0 +1,114 @@
package router
import (
"context"
"time"
"github.com/ccfos/nightingale/v6/storage"
"github.com/gin-gonic/gin"
captcha "github.com/mojocn/base64Captcha"
"github.com/toolkits/pkg/ginx"
"github.com/toolkits/pkg/logger"
)
type CaptchaRedisStore struct {
redis storage.Redis
}
func (s *CaptchaRedisStore) Set(id string, value string) error {
ctx := context.Background()
err := s.redis.Set(ctx, id, value, time.Duration(300*time.Second)).Err()
if err != nil {
logger.Errorf("captcha id set to redis error : %s", err.Error())
return err
}
return nil
}
func (s *CaptchaRedisStore) Get(id string, clear bool) string {
ctx := context.Background()
val, err := s.redis.Get(ctx, id).Result()
if err != nil {
logger.Errorf("captcha id get from redis error : %s", err.Error())
return ""
}
if clear {
s.redis.Del(ctx, id)
}
return val
}
func (s *CaptchaRedisStore) Verify(id, answer string, clear bool) bool {
old := s.Get(id, clear)
return old == answer
}
func (rt *Router) newCaptchaRedisStore() *CaptchaRedisStore {
if captchaStore == nil {
captchaStore = &CaptchaRedisStore{redis: rt.Redis}
}
return captchaStore
}
var captchaStore *CaptchaRedisStore
type CaptchaReqBody struct {
Id string
VerifyValue string
}
// 生成图形验证码
func (rt *Router) generateCaptcha(c *gin.Context) {
var driver = captcha.NewDriverMath(60, 200, 0, captcha.OptionShowHollowLine, nil, nil, []string{"wqy-microhei.ttc"})
cc := captcha.NewCaptcha(driver, rt.newCaptchaRedisStore())
//data:image/png;base64
id, b64s, err := cc.Generate()
if err != nil {
ginx.NewRender(c).Message(err)
return
}
ginx.NewRender(c).Data(gin.H{
"imgdata": b64s,
"captchaid": id,
}, nil)
}
// 验证
func (rt *Router) captchaVerify(c *gin.Context) {
var param CaptchaReqBody
ginx.BindJSON(c, &param)
//verify the captcha
if captchaStore.Verify(param.Id, param.VerifyValue, true) {
ginx.NewRender(c).Message("")
return
}
ginx.NewRender(c).Message("incorrect verification code")
}
// 验证码开关
func (rt *Router) ifShowCaptcha(c *gin.Context) {
if rt.HTTP.ShowCaptcha.Enable {
ginx.NewRender(c).Data(gin.H{
"show": true,
}, nil)
return
}
ginx.NewRender(c).Data(gin.H{
"show": false,
}, nil)
}
// 验证
func CaptchaVerify(id string, value string) bool {
//verify the captcha
return captchaStore.Verify(id, value, true)
}

View File

@@ -20,6 +20,11 @@ func (rt *Router) configGet(c *gin.Context) {
ginx.NewRender(c).Data(configs, err)
}
func (rt *Router) configGetByKey(c *gin.Context) {
config, err := models.ConfigsGet(rt.Ctx, ginx.QueryStr(c, "key"))
ginx.NewRender(c).Data(config, err)
}
func (rt *Router) configsDel(c *gin.Context) {
var f idsForm
ginx.BindJSON(c, &f)

View File

@@ -1,10 +1,17 @@
package router
import (
"crypto/tls"
"fmt"
"net/http"
"net/url"
"strings"
"github.com/ccfos/nightingale/v6/models"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/ginx"
"github.com/toolkits/pkg/logger"
)
func (rt *Router) pluginList(c *gin.Context) {
@@ -29,6 +36,34 @@ func (rt *Router) datasourceList(c *gin.Context) {
Render(c, list, err)
}
func (rt *Router) datasourceGetsByService(c *gin.Context) {
typ := ginx.QueryStr(c, "typ", "")
lst, err := models.GetDatasourcesGetsBy(rt.Ctx, typ, "", "", "")
ginx.NewRender(c).Data(lst, err)
}
type datasourceBrief struct {
Id int64 `json:"id"`
Name string `json:"name"`
PluginType string `json:"plugin_type"`
}
func (rt *Router) datasourceBriefs(c *gin.Context) {
var dss []datasourceBrief
list, err := models.GetDatasourcesGetsBy(rt.Ctx, "", "", "", "")
ginx.Dangerous(err)
for i := range list {
dss = append(dss, datasourceBrief{
Id: list[i].Id,
Name: list[i].Name,
PluginType: list[i].PluginType,
})
}
ginx.NewRender(c).Data(dss, err)
}
func (rt *Router) datasourceUpsert(c *gin.Context) {
var req models.Datasource
ginx.BindJSON(c, &req)
@@ -37,6 +72,13 @@ func (rt *Router) datasourceUpsert(c *gin.Context) {
var err error
var count int64
err = DatasourceCheck(req)
if err != nil {
Dangerous(c, err)
return
}
if req.Id == 0 {
req.CreatedBy = username
req.Status = "enabled"
@@ -52,12 +94,76 @@ func (rt *Router) datasourceUpsert(c *gin.Context) {
}
err = req.Add(rt.Ctx)
} else {
err = req.Update(rt.Ctx, "name", "description", "cluster_name", "settings", "http", "auth", "status", "updated_by", "updated_at")
err = req.Update(rt.Ctx, "name", "description", "cluster_name", "settings", "http", "auth", "updated_by", "updated_at")
}
Render(c, nil, err)
}
func DatasourceCheck(ds models.Datasource) error {
if ds.HTTPJson.Url == "" {
return fmt.Errorf("url is empty")
}
if !strings.HasPrefix(ds.HTTPJson.Url, "http") {
return fmt.Errorf("url must start with http or https")
}
client := &http.Client{
Transport: &http.Transport{
TLSClientConfig: &tls.Config{
InsecureSkipVerify: ds.HTTPJson.TLS.SkipTlsVerify,
},
},
}
fullURL := ds.HTTPJson.Url
req, err := http.NewRequest("GET", fullURL, nil)
if err != nil {
logger.Errorf("Error creating request: %v", err)
return fmt.Errorf("request url:%s failed", fullURL)
}
if ds.PluginType == models.PROMETHEUS {
subPath := "/api/v1/query"
query := url.Values{}
if strings.Contains(fullURL, "loki") {
subPath = "/api/v1/labels"
} else {
query.Add("query", "1+1")
}
fullURL = fmt.Sprintf("%s%s?%s", ds.HTTPJson.Url, subPath, query.Encode())
req, err = http.NewRequest("POST", fullURL, nil)
if err != nil {
logger.Errorf("Error creating request: %v", err)
return fmt.Errorf("request url:%s failed", fullURL)
}
}
if ds.AuthJson.BasicAuthUser != "" {
req.SetBasicAuth(ds.AuthJson.BasicAuthUser, ds.AuthJson.BasicAuthPassword)
}
for k, v := range ds.HTTPJson.Headers {
req.Header.Set(k, v)
}
resp, err := client.Do(req)
if err != nil {
logger.Errorf("Error making request: %v\n", err)
return fmt.Errorf("request url:%s failed", fullURL)
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
logger.Errorf("Error making request: %v\n", resp.StatusCode)
return fmt.Errorf("request url:%s failed code:%d", fullURL, resp.StatusCode)
}
return nil
}
func (rt *Router) datasourceGet(c *gin.Context) {
var req models.Datasource
ginx.BindJSON(c, &req)
@@ -81,6 +187,13 @@ func (rt *Router) datasourceDel(c *gin.Context) {
Render(c, nil, err)
}
func (rt *Router) getDatasourceIds(c *gin.Context) {
name := ginx.QueryStr(c, "name")
datasourceIds, err := models.GetDatasourceIdsByEngineName(rt.Ctx, name)
ginx.NewRender(c).Data(datasourceIds, err)
}
func Username(c *gin.Context) string {
return c.MustGet("username").(string)

View File

@@ -0,0 +1,71 @@
package router
import (
"net/http"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/ginx"
)
// 创建 ES Index Pattern
func (rt *Router) esIndexPatternAdd(c *gin.Context) {
var f models.EsIndexPattern
ginx.BindJSON(c, &f)
username := c.MustGet("username").(string)
now := time.Now().Unix()
f.CreateAt = now
f.CreateBy = username
f.UpdateAt = now
f.UpdateBy = username
err := f.Add(rt.Ctx)
ginx.NewRender(c).Message(err)
}
// 更新 ES Index Pattern
func (rt *Router) esIndexPatternPut(c *gin.Context) {
var f models.EsIndexPattern
ginx.BindJSON(c, &f)
id := ginx.QueryInt64(c, "id")
esIndexPattern, err := models.EsIndexPatternGetById(rt.Ctx, id)
ginx.Dangerous(err)
if esIndexPattern == nil {
ginx.NewRender(c, http.StatusNotFound).Message("No such EsIndexPattern")
return
}
f.UpdateBy = c.MustGet("username").(string)
ginx.NewRender(c).Message(esIndexPattern.Update(rt.Ctx, f))
}
// 删除 ES Index Pattern
func (rt *Router) esIndexPatternDel(c *gin.Context) {
var f idsForm
ginx.BindJSON(c, &f)
if len(f.Ids) == 0 {
ginx.Bomb(http.StatusBadRequest, "ids empty")
}
ginx.NewRender(c).Message(models.EsIndexPatternDel(rt.Ctx, f.Ids))
}
// ES Index Pattern列表
func (rt *Router) esIndexPatternGetAll(c *gin.Context) {
lst, err := models.EsIndexPatternGets(rt.Ctx, "")
ginx.NewRender(c).Data(lst, err)
}
// ES Index Pattern 单个数据
func (rt *Router) esIndexPatternGet(c *gin.Context) {
id := ginx.QueryInt64(c, "id")
item, err := models.EsIndexPatternGet(rt.Ctx, "id=?", id)
ginx.NewRender(c).Data(item, err)
}

View File

@@ -17,6 +17,41 @@ import (
const defaultLimit = 300
func (rt *Router) statistic(c *gin.Context) {
name := ginx.QueryStr(c, "name")
var model interface{}
var err error
var statistics *models.Statistics
switch name {
case "alert_mute":
model = models.AlertMute{}
case "alert_rule":
model = models.AlertRule{}
case "alert_subscribe":
model = models.AlertSubscribe{}
case "busi_group":
model = models.BusiGroup{}
case "recording_rule":
model = models.RecordingRule{}
case "target":
model = models.Target{}
case "user":
model = models.User{}
case "user_group":
model = models.UserGroup{}
case "datasource":
// datasource update_at is different from others
statistics, err = models.DatasourceStatistics(rt.Ctx)
ginx.NewRender(c).Data(statistics, err)
return
default:
ginx.Bomb(http.StatusBadRequest, "invalid name")
}
statistics, err = models.StatisticsGet(rt.Ctx, model)
ginx.NewRender(c).Data(statistics, err)
}
func queryDatasourceIds(c *gin.Context) []int64 {
datasourceIds := ginx.QueryStr(c, "datasource_ids", "")
datasourceIds = strings.ReplaceAll(datasourceIds, ",", " ")

View File

@@ -35,7 +35,28 @@ func (rt *Router) heartbeat(c *gin.Context) {
err = json.Unmarshal(bs, &req)
ginx.Dangerous(err)
req.Offset = (time.Now().UnixMilli() - req.UnixTime)
// maybe from pushgw
if req.Offset == 0 {
req.Offset = (time.Now().UnixMilli() - req.UnixTime)
}
if req.RemoteAddr == "" {
req.RemoteAddr = c.ClientIP()
}
rt.MetaSet.Set(req.Hostname, req)
ginx.NewRender(c).Message(nil)
var items = make(map[string]struct{})
items[req.Hostname] = struct{}{}
rt.IdentSet.MSet(items)
gid := ginx.QueryInt64(c, "gid", 0)
if gid != 0 {
target, has := rt.TargetCache.Get(req.Hostname)
if has && target.GroupId != gid {
err = models.TargetUpdateBgid(rt.Ctx, []string{req.Hostname}, gid, false)
}
}
ginx.NewRender(c).Message(err)
}

View File

@@ -1,6 +1,7 @@
package router
import (
"encoding/base64"
"fmt"
"net/http"
"strconv"
@@ -12,6 +13,7 @@ import (
"github.com/ccfos/nightingale/v6/pkg/ldapx"
"github.com/ccfos/nightingale/v6/pkg/oauth2x"
"github.com/ccfos/nightingale/v6/pkg/oidcx"
"github.com/ccfos/nightingale/v6/pkg/secu"
"github.com/pelletier/go-toml/v2"
"github.com/dgrijalva/jwt-go"
@@ -21,26 +23,46 @@ import (
)
type loginForm struct {
Username string `json:"username" binding:"required"`
Password string `json:"password" binding:"required"`
Username string `json:"username" binding:"required"`
Password string `json:"password" binding:"required"`
Captchaid string `json:"captchaid"`
Verifyvalue string `json:"verifyvalue"`
}
func (rt *Router) loginPost(c *gin.Context) {
var f loginForm
ginx.BindJSON(c, &f)
logger.Infof("username:%s login from:%s", f.Username, c.ClientIP())
user, err := models.PassLogin(rt.Ctx, f.Username, f.Password)
if rt.HTTP.ShowCaptcha.Enable {
if !CaptchaVerify(f.Captchaid, f.Verifyvalue) {
ginx.NewRender(c).Message("incorrect verification code")
return
}
}
authPassWord := f.Password
// need decode
if rt.HTTP.RSA.OpenRSA {
decPassWord, err := secu.Decrypt(f.Password, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord)
if err != nil {
logger.Errorf("RSA Decrypt failed: %v username: %s", err, f.Username)
ginx.NewRender(c).Message(err)
return
}
authPassWord = decPassWord
}
user, err := models.PassLogin(rt.Ctx, f.Username, authPassWord)
if err != nil {
// pass validate fail, try ldap
if rt.Sso.LDAP.Enable {
roles := strings.Join(rt.Sso.LDAP.DefaultRoles, " ")
user, err = models.LdapLogin(rt.Ctx, f.Username, f.Password, roles, rt.Sso.LDAP)
user, err = models.LdapLogin(rt.Ctx, f.Username, authPassWord, roles, rt.Sso.LDAP)
if err != nil {
logger.Debugf("ldap login failed: %v username: %s", err, f.Username)
ginx.NewRender(c).Message(err)
return
}
user.RolesLst = rt.Sso.LDAP.DefaultRoles
user.RolesLst = strings.Fields(user.Roles)
} else {
ginx.NewRender(c).Message(err)
return
@@ -67,6 +89,7 @@ func (rt *Router) loginPost(c *gin.Context) {
}
func (rt *Router) logoutPost(c *gin.Context) {
logger.Infof("username:%s login from:%s", c.GetString("username"), c.ClientIP())
metadata, err := rt.extractTokenMetadata(c.Request)
if err != nil {
ginx.NewRender(c, http.StatusBadRequest).Message("failed to parse jwt token")
@@ -537,3 +560,19 @@ func (rt *Router) ssoConfigUpdate(c *gin.Context) {
ginx.NewRender(c).Message(nil)
}
type RSAConfigOutput struct {
OpenRSA bool
RSAPublicKey string
}
func (rt *Router) rsaConfigGet(c *gin.Context) {
publicKey := ""
if rt.HTTP.RSA.OpenRSA {
publicKey = base64.StdEncoding.EncodeToString(rt.HTTP.RSA.RSAPublicKey)
}
ginx.NewRender(c).Data(RSAConfigOutput{
OpenRSA: rt.HTTP.RSA.OpenRSA,
RSAPublicKey: publicKey,
}, nil)
}

View File

@@ -21,7 +21,7 @@ func (rt *Router) alertMuteGetsByBG(c *gin.Context) {
func (rt *Router) alertMuteGets(c *gin.Context) {
prods := strings.Fields(ginx.QueryStr(c, "prods", ""))
bgid := ginx.QueryInt64(c, "bgid", 0)
bgid := ginx.QueryInt64(c, "bgid", -1)
query := ginx.QueryStr(c, "query", "")
lst, err := models.AlertMuteGets(rt.Ctx, prods, bgid, query)

View File

@@ -5,6 +5,7 @@ import (
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/alert/sender"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/pelletier/go-toml/v2"
@@ -29,9 +30,12 @@ func (rt *Router) webhookPuts(c *gin.Context) {
var webhooks []models.Webhook
ginx.BindJSON(c, &webhooks)
for i := 0; i < len(webhooks); i++ {
for k, v := range webhooks[i].HeaderMap {
webhooks[i].Headers = append(webhooks[i].Headers, k)
webhooks[i].Headers = append(webhooks[i].Headers, v)
webhooks[i].Headers = []string{}
if len(webhooks[i].HeaderMap) > 0 {
for k, v := range webhooks[i].HeaderMap {
webhooks[i].Headers = append(webhooks[i].Headers, k)
webhooks[i].Headers = append(webhooks[i].Headers, v)
}
}
}
@@ -137,32 +141,15 @@ func (rt *Router) notifyContactPuts(c *gin.Context) {
ginx.NewRender(c).Message(models.ConfigsSet(rt.Ctx, models.NOTIFYCONTACT, string(data)))
}
const DefaultSMTP = `
Host = ""
Port = 994
User = "username"
Pass = "password"
From = "username@163.com"
InsecureSkipVerify = true
Batch = 5
`
const DefaultIbex = `
Address = "http://127.0.0.1:10090"
BasicAuthUser = "ibex"
BasicAuthPass = "ibex"
Timeout = 3000
`
func (rt *Router) notifyConfigGet(c *gin.Context) {
key := ginx.QueryStr(c, "ckey")
cval, err := models.ConfigsGet(rt.Ctx, key)
if cval == "" {
switch key {
case models.IBEX:
cval = DefaultIbex
cval = memsto.DefaultIbex
case models.SMTP:
cval = DefaultSMTP
cval = memsto.DefaultSMTP
}
}
ginx.NewRender(c).Data(cval, err)

View File

@@ -3,17 +3,32 @@ package router
import (
"bytes"
"encoding/json"
"fmt"
"html/template"
"strings"
"github.com/ccfos/nightingale/v6/center/cconf"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/ginx"
"github.com/toolkits/pkg/str"
)
func (rt *Router) notifyTplGets(c *gin.Context) {
m := make(map[string]struct{})
for _, channel := range models.DefaultChannels {
m[channel] = struct{}{}
}
m["mailsubject"] = struct{}{}
lst, err := models.NotifyTplGets(rt.Ctx)
for i := 0; i < len(lst); i++ {
if _, exists := m[lst[i].Channel]; exists {
lst[i].BuiltIn = true
}
}
ginx.NewRender(c).Data(lst, err)
}
@@ -21,6 +36,7 @@ func (rt *Router) notifyTplGets(c *gin.Context) {
func (rt *Router) notifyTplUpdateContent(c *gin.Context) {
var f models.NotifyTpl
ginx.BindJSON(c, &f)
ginx.Dangerous(templateValidate(f))
ginx.NewRender(c).Message(f.UpdateContent(rt.Ctx))
}
@@ -28,24 +44,76 @@ func (rt *Router) notifyTplUpdateContent(c *gin.Context) {
func (rt *Router) notifyTplUpdate(c *gin.Context) {
var f models.NotifyTpl
ginx.BindJSON(c, &f)
ginx.Dangerous(templateValidate(f))
ginx.NewRender(c).Message(f.Update(rt.Ctx))
}
func templateValidate(f models.NotifyTpl) error {
if len(f.Channel) > 32 {
return fmt.Errorf("channel length should not exceed 32")
}
if str.Dangerous(f.Channel) {
return fmt.Errorf("channel should not contain dangerous characters")
}
if len(f.Name) > 255 {
return fmt.Errorf("name length should not exceed 255")
}
if str.Dangerous(f.Name) {
return fmt.Errorf("name should not contain dangerous characters")
}
if f.Content == "" {
return nil
}
var defs = []string{
"{{$labels := .TagsMap}}",
"{{$value := .TriggerValue}}",
}
text := strings.Join(append(defs, f.Content), "")
if _, err := template.New(f.Channel).Funcs(tplx.TemplateFuncMap).Parse(text); err != nil {
return fmt.Errorf("notify template verify illegal:%s", err.Error())
}
return nil
}
func (rt *Router) notifyTplPreview(c *gin.Context) {
var event models.AlertCurEvent
err := json.Unmarshal([]byte(cconf.EVENT_EXAMPLE), &event)
if err != nil {
ginx.NewRender(c).Message(err.Error())
return
}
ginx.Dangerous(err)
var f models.NotifyTpl
ginx.BindJSON(c, &f)
tpl, err := template.New(f.Channel).Funcs(tplx.TemplateFuncMap).Parse(f.Content)
var defs = []string{
"{{$labels := .TagsMap}}",
"{{$value := .TriggerValue}}",
}
text := strings.Join(append(defs, f.Content), "")
tpl, err := template.New(f.Channel).Funcs(tplx.TemplateFuncMap).Parse(text)
ginx.Dangerous(err)
event.TagsMap = make(map[string]string)
for i := 0; i < len(event.TagsJSON); i++ {
pair := strings.TrimSpace(event.TagsJSON[i])
if pair == "" {
continue
}
arr := strings.Split(pair, "=")
if len(arr) != 2 {
continue
}
event.TagsMap[arr[0]] = arr[1]
}
var body bytes.Buffer
var ret string
if err := tpl.Execute(&body, event); err != nil {
@@ -56,3 +124,25 @@ func (rt *Router) notifyTplPreview(c *gin.Context) {
ginx.NewRender(c).Data(ret, nil)
}
// add new notify template
func (rt *Router) notifyTplAdd(c *gin.Context) {
var f models.NotifyTpl
ginx.BindJSON(c, &f)
f.Channel = strings.TrimSpace(f.Channel)
ginx.Dangerous(templateValidate(f))
count, err := models.NotifyTplCountByChannel(rt.Ctx, f.Channel)
ginx.Dangerous(err)
if count != 0 {
ginx.Bomb(200, "Refuse to create duplicate channel(unique)")
}
ginx.NewRender(c).Message(f.Create(rt.Ctx))
}
// delete notify template, not allowed to delete the system defaults(models.DefaultChannels)
func (rt *Router) notifyTplDel(c *gin.Context) {
f := new(models.NotifyTpl)
id := ginx.UrlParamInt64(c, "id")
ginx.NewRender(c).Message(f.NotifyTplDelete(rt.Ctx, id))
}

View File

@@ -2,11 +2,13 @@ package router
import (
"context"
"crypto/tls"
"net"
"net/http"
"net/http/httputil"
"net/url"
"strings"
"sync"
"time"
pkgprom "github.com/ccfos/nightingale/v6/pkg/prom"
@@ -138,13 +140,18 @@ func (rt *Router) dsProxy(c *gin.Context) {
http.Error(w, err.Error(), http.StatusBadGateway)
}
transport := &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
Timeout: time.Duration(ds.HTTPJson.DialTimeout) * time.Millisecond,
}).DialContext,
ResponseHeaderTimeout: time.Duration(ds.HTTPJson.Timeout) * time.Millisecond,
MaxIdleConnsPerHost: ds.HTTPJson.MaxIdleConnsPerHost,
transport, has := transportGet(dsId, ds.UpdatedAt)
if !has {
transport = &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: ds.HTTPJson.TLS.SkipTlsVerify},
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
Timeout: time.Duration(ds.HTTPJson.DialTimeout) * time.Millisecond,
}).DialContext,
ResponseHeaderTimeout: time.Duration(ds.HTTPJson.Timeout) * time.Millisecond,
MaxIdleConnsPerHost: ds.HTTPJson.MaxIdleConnsPerHost,
}
transportPut(dsId, ds.UpdatedAt, transport)
}
proxy := &httputil.ReverseProxy{
@@ -155,3 +162,44 @@ func (rt *Router) dsProxy(c *gin.Context) {
proxy.ServeHTTP(c.Writer, c.Request)
}
var (
transports = map[int64]http.RoundTripper{}
updatedAts = map[int64]int64{}
transportsLock = &sync.Mutex{}
)
func transportGet(dsid, newUpdatedAt int64) (http.RoundTripper, bool) {
transportsLock.Lock()
defer transportsLock.Unlock()
tran, has := transports[dsid]
if !has {
return nil, false
}
oldUpdateAt, has := updatedAts[dsid]
if !has {
oldtran := tran.(*http.Transport)
oldtran.CloseIdleConnections()
delete(transports, dsid)
return nil, false
}
if oldUpdateAt != newUpdatedAt {
oldtran := tran.(*http.Transport)
oldtran.CloseIdleConnections()
delete(transports, dsid)
delete(updatedAts, dsid)
return nil, false
}
return tran, has
}
func transportPut(dsid, updatedat int64, tran http.RoundTripper) {
transportsLock.Lock()
transports[dsid] = tran
updatedAts[dsid] = updatedat
transportsLock.Unlock()
}

View File

@@ -1,7 +1,10 @@
package router
import (
"encoding/json"
"net/http"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/models"
@@ -16,6 +19,11 @@ func (rt *Router) recordingRuleGets(c *gin.Context) {
ginx.NewRender(c).Data(ars, err)
}
func (rt *Router) recordingRuleGetsByService(c *gin.Context) {
ars, err := models.RecordingRuleEnabledGets(rt.Ctx)
ginx.NewRender(c).Data(ars, err)
}
func (rt *Router) recordingRuleGet(c *gin.Context) {
rrid := ginx.UrlParamInt64(c, "rrid")
@@ -104,6 +112,25 @@ func (rt *Router) recordingRulePutFields(c *gin.Context) {
f.Fields["update_by"] = c.MustGet("username").(string)
f.Fields["update_at"] = time.Now().Unix()
if _, ok := f.Fields["datasource_ids"]; ok {
// datasource_ids = "1 2 3"
idsStr := strings.Fields(f.Fields["datasource_ids"].(string))
ids := make([]int64, 0)
for _, idStr := range idsStr {
id, err := strconv.ParseInt(idStr, 10, 64)
if err != nil {
ginx.Bomb(http.StatusBadRequest, "datasource_ids error")
}
ids = append(ids, id)
}
bs, err := json.Marshal(ids)
if err != nil {
ginx.Bomb(http.StatusBadRequest, "datasource_ids error")
}
f.Fields["datasource_ids"] = string(bs)
}
for i := 0; i < len(f.Ids); i++ {
ar, err := models.RecordingRuleGetById(rt.Ctx, f.Ids[i])
ginx.Dangerous(err)

View File

@@ -83,3 +83,18 @@ func (rt *Router) roleGets(c *gin.Context) {
lst, err := models.RoleGetsAll(rt.Ctx)
ginx.NewRender(c).Data(lst, err)
}
func (rt *Router) allPerms(c *gin.Context) {
roles, err := models.RoleGetsAll(rt.Ctx)
ginx.Dangerous(err)
m := make(map[string][]string)
for _, r := range roles {
lst, err := models.OperationsOfRole(rt.Ctx, strings.Fields(r.Name))
if err != nil {
continue
}
m[r.Name] = lst
}
ginx.NewRender(c).Data(m, err)
}

View File

@@ -1,6 +1,8 @@
package router
import (
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/gin-gonic/gin"
@@ -16,3 +18,23 @@ func (rt *Router) serverClustersGet(c *gin.Context) {
list, err := models.AlertingEngineGetsClusters(rt.Ctx, "")
ginx.NewRender(c).Data(list, err)
}
func (rt *Router) serverHeartbeat(c *gin.Context) {
var req models.HeartbeatInfo
ginx.BindJSON(c, &req)
err := models.AlertingEngineHeartbeatWithCluster(rt.Ctx, req.Instance, req.EngineCluster, req.DatasourceId)
ginx.NewRender(c).Message(err)
}
func (rt *Router) serversActive(c *gin.Context) {
datasourceId := ginx.QueryInt64(c, "dsid")
engineName := ginx.QueryStr(c, "engine_name", "")
if engineName != "" {
servers, err := models.AlertingEngineGetsInstances(rt.Ctx, "engine_cluster = ? and clock > ?", engineName, time.Now().Unix()-30)
ginx.NewRender(c).Data(servers, err)
return
}
servers, err := models.AlertingEngineGetsInstances(rt.Ctx, "datasource_id = ? and clock > ?", datasourceId, time.Now().Unix()-30)
ginx.NewRender(c).Data(servers, err)
}

View File

@@ -9,6 +9,7 @@ import (
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/storage"
"github.com/gin-gonic/gin"
"github.com/prometheus/common/model"
@@ -46,10 +47,27 @@ func (rt *Router) targetGets(c *gin.Context) {
limit := ginx.QueryInt(c, "limit", 30)
dsIds := queryDatasourceIds(c)
total, err := models.TargetTotal(rt.Ctx, bgid, dsIds, query)
var bgids []int64
var err error
if bgid == -1 {
// 全部对象的情况,找到用户有权限的业务组
user := c.MustGet("user").(*models.User)
userGroupIds, err := models.MyGroupIds(rt.Ctx, user.Id)
ginx.Dangerous(err)
bgids, err = models.BusiGroupIds(rt.Ctx, userGroupIds)
ginx.Dangerous(err)
// 将未分配业务组的对象也加入到列表中
bgids = append(bgids, 0)
} else {
bgids = append(bgids, bgid)
}
total, err := models.TargetTotal(rt.Ctx, bgids, dsIds, query)
ginx.Dangerous(err)
list, err := models.TargetGets(rt.Ctx, bgid, dsIds, query, limit, ginx.Offset(c, limit))
list, err := models.TargetGets(rt.Ctx, bgids, dsIds, query, limit, ginx.Offset(c, limit))
ginx.Dangerous(err)
if err == nil {
@@ -64,13 +82,13 @@ func (rt *Router) targetGets(c *gin.Context) {
if len(keys) > 0 {
metaMap := make(map[string]*models.HostMeta)
vals := rt.Redis.MGet(context.Background(), keys...).Val()
vals := storage.MGet(context.Background(), rt.Redis, keys)
for _, value := range vals {
var meta models.HostMeta
if value == nil {
continue
}
err := json.Unmarshal([]byte(value.(string)), &meta)
err := json.Unmarshal(value, &meta)
if err != nil {
logger.Warningf("unmarshal %v host meta failed: %v", value, err)
continue
@@ -81,13 +99,16 @@ func (rt *Router) targetGets(c *gin.Context) {
for i := 0; i < len(list); i++ {
if meta, ok := metaMap[list[i].Ident]; ok {
list[i].FillMeta(meta)
if now.Unix()-list[i].UpdateAt < 120 {
list[i].TargetUp = 1
}
} else {
// 未上报过元数据的主机cpuNum默认为-1, 用于前端展示 unknown
list[i].CpuNum = -1
}
if now.Unix()-list[i].UnixTime/1000 < 60 {
list[i].TargetUp = 2
} else if now.Unix()-list[i].UnixTime/1000 < 180 {
list[i].TargetUp = 1
}
}
}
@@ -99,6 +120,11 @@ func (rt *Router) targetGets(c *gin.Context) {
}, nil)
}
func (rt *Router) targetGetsByService(c *gin.Context) {
lst, err := models.TargetGetsAll(rt.Ctx)
ginx.NewRender(c).Data(lst, err)
}
func (rt *Router) targetGetTags(c *gin.Context) {
idents := ginx.QueryStr(c, "idents", "")
idents = strings.ReplaceAll(idents, ",", " ")

View File

@@ -120,6 +120,12 @@ func (f *taskForm) HandleFH(fh string) {
f.Title = f.Title + " FH: " + fh
}
func (rt *Router) taskRecordAdd(c *gin.Context) {
var f *models.TaskRecord
ginx.BindJSON(c, &f)
ginx.NewRender(c).Message(f.Add(rt.Ctx))
}
func (rt *Router) taskAdd(c *gin.Context) {
var f taskForm
ginx.BindJSON(c, &f)

View File

@@ -12,19 +12,8 @@ import (
)
func (rt *Router) userFindAll(c *gin.Context) {
limit := ginx.QueryInt(c, "limit", 20)
query := ginx.QueryStr(c, "query", "")
total, err := models.UserTotal(rt.Ctx, query)
ginx.Dangerous(err)
list, err := models.UserGets(rt.Ctx, query, limit, ginx.Offset(c, limit))
ginx.Dangerous(err)
ginx.NewRender(c).Data(gin.H{
"list": list,
"total": total,
}, nil)
list, err := models.UserGetAll(rt.Ctx)
ginx.NewRender(c).Data(list, err)
}
func (rt *Router) userGets(c *gin.Context) {

View File

@@ -29,6 +29,17 @@ func (rt *Router) userGroupGets(c *gin.Context) {
ginx.NewRender(c).Data(lst, err)
}
func (rt *Router) userGroupGetsByService(c *gin.Context) {
lst, err := models.UserGroupGetAll(rt.Ctx)
ginx.NewRender(c).Data(lst, err)
}
// user group member get by service
func (rt *Router) userGroupMemberGetsByService(c *gin.Context) {
members, err := models.UserGroupMemberGetAll(rt.Ctx)
ginx.NewRender(c).Data(members, err)
}
type userGroupForm struct {
Name string `json:"name" binding:"required"`
Note string `json:"note"`

View File

@@ -1,7 +1,9 @@
# v5 升级 v6 手册
0. 操作之前,记得备注下数据库!
1. 解压 n9e 安装包
2. 导入 upgrade.sql 到 n9e_v5 数据库
1. 需要先将你正在使用的夜莺数据源表结构更新到和 v5.15.0 一致,[release](https://github.com/ccfos/nightingale/releases) 页面有每个版本表结构的更新说明,可以根据你正在使用的版本,按照说明,逐个执行的更新表结构的语句
2. 解压 n9e 安装包,导入 upgrade.sql 到 n9e_v5 数据库
```
mysql -h 127.0.0.1 -u root -p1234 < cli/upgrade/upgrade.sql
```
@@ -16,4 +18,4 @@ mysql -h 127.0.0.1 -u root -p1234 < cli/upgrade/upgrade.sql
nohup ./n9e &> n9e.log &
```
5. n9e 监听的端口为 17000如果想使用之前的端口,可以在配置文件中将端口改为 18000
5. n9e 监听的端口为 17000需要将之前的 web 端口和数据上报的端口,都调整为 17000

View File

@@ -18,9 +18,9 @@ func Upgrade(configFile string) error {
return err
}
ctx := ctx.NewContext(context.Background(), db)
ctx := ctx.NewContext(context.Background(), db, true)
for _, cluster := range config.Clusters {
count, err := models.GetDatasourcesCountBy(ctx, "", "", cluster.Name)
count, err := models.GetDatasourcesCountByName(ctx, cluster.Name)
if err != nil {
logger.Errorf("get datasource %s count error: %v", cluster.Name, err)
continue

View File

@@ -8,6 +8,13 @@ insert into `role_operation`(role_name, operation) values('Standard', '/trace/ex
insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules-built-in');
insert into `role_operation`(role_name, operation) values('Standard', '/dashboards-built-in');
insert into `role_operation`(role_name, operation) values('Standard', '/trace/dependencies');
insert into `role_operation`(role_name, operation) values('Standard', '/help/servers');
insert into `role_operation`(role_name, operation) values('Standard', '/help/migrate');
insert into `role_operation`(role_name, operation) values('Admin', '/help/source');
insert into `role_operation`(role_name, operation) values('Admin', '/help/sso');
insert into `role_operation`(role_name, operation) values('Admin', '/help/notification-tpls');
insert into `role_operation`(role_name, operation) values('Admin', '/help/notification-settings');
alter table `board` add built_in tinyint(1) not null default 0 comment '0:false 1:true';
alter table `board` add hide tinyint(1) not null default 0 comment '0:false 1:true';
@@ -19,7 +26,7 @@ alter table `alert_rule` add rule_config text not null comment 'rule_config';
alter table `alert_rule` add annotations text not null comment 'annotations';
alter table `alert_mute` add datasource_ids varchar(255) not null default '';
alter table `alert_mute` add periodic_mutes varchar(4096) not null default '';
alter table `alert_mute` add periodic_mutes varchar(4096) not null default '[]';
alter table `alert_mute` add mute_time_type tinyint(1) not null default 0;
alter table `alert_subscribe` add datasource_ids varchar(255) not null default '';
@@ -41,6 +48,9 @@ alter table `alert_his_event` add annotations text not null comment 'annotations
alter table `alert_his_event` add rule_config text not null comment 'rule_config';
alter table `alerting_engines` add datasource_id bigint unsigned not null default 0;
alter table `alerting_engines` change cluster engine_cluster varchar(128) not null default '' comment 'n9e engine cluster';
alter table `task_record` add event_id bigint not null comment 'event id' default 0;
CREATE TABLE `datasource`
(
@@ -81,7 +91,7 @@ CREATE TABLE `notify_tpl` (
CREATE TABLE `sso_config` (
`id` bigint unsigned not null auto_increment,
`name` varchar(255) not null,
`name` varchar(191) not null,
`content` text not null,
PRIMARY KEY (`id`),
UNIQUE KEY (`name`)

View File

@@ -17,7 +17,7 @@ import (
var (
showVersion = flag.Bool("version", false, "Show version.")
configDir = flag.String("configs", osx.GetEnv("N9E_CONFIGS", "etc"), "Specify configuration directory.(env:N9E_CONFIGS)")
configDir = flag.String("configs", osx.GetEnv("N9E_ALERT_CONFIGS", "etc"), "Specify configuration directory.(env:N9E_ALERT_CONFIGS)")
cryptoKey = flag.String("crypto-key", "", "Specify the secret key for configuration file field encryption.")
)

73
cmd/edge/edge.go Normal file
View File

@@ -0,0 +1,73 @@
package main
import (
"context"
"fmt"
"github.com/ccfos/nightingale/v6/alert"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/alert/process"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/httpx"
"github.com/ccfos/nightingale/v6/pkg/logx"
"github.com/ccfos/nightingale/v6/prom"
"github.com/ccfos/nightingale/v6/pushgw/idents"
"github.com/ccfos/nightingale/v6/pushgw/writer"
alertrt "github.com/ccfos/nightingale/v6/alert/router"
pushgwrt "github.com/ccfos/nightingale/v6/pushgw/router"
)
func Initialize(configDir string, cryptoKey string) (func(), error) {
config, err := conf.InitConfig(configDir, cryptoKey)
if err != nil {
return nil, fmt.Errorf("failed to init config: %v", err)
}
logxClean, err := logx.Init(config.Log)
if err != nil {
return nil, err
}
ctx := ctx.NewContext(context.Background(), nil, false, config.CenterApi)
syncStats := memsto.NewSyncStats()
targetCache := memsto.NewTargetCache(ctx, syncStats, nil)
busiGroupCache := memsto.NewBusiGroupCache(ctx, syncStats)
idents := idents.New(ctx)
writers := writer.NewWriters(config.Pushgw)
pushgwRouter := pushgwrt.New(config.HTTP, config.Pushgw, targetCache, busiGroupCache, idents, writers, ctx)
r := httpx.GinEngine(config.Global.RunMode, config.HTTP)
pushgwRouter.Config(r)
if !config.Alert.Disable {
alertStats := astats.NewSyncStats()
dsCache := memsto.NewDatasourceCache(ctx, syncStats)
alertMuteCache := memsto.NewAlertMuteCache(ctx, syncStats)
alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
notifyConfigCache := memsto.NewNotifyConfigCache(ctx)
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
promClients := prom.NewPromClient(ctx, config.Alert.Heartbeat)
externalProcessors := process.NewExternalProcessors()
alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, dsCache, ctx, promClients, userCache, userGroupCache)
alertrtRouter := alertrt.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors)
alertrtRouter.Config(r)
}
dumper.ConfigRouter(r)
httpClean := httpx.Init(config.HTTP, r)
return func() {
logxClean()
httpClean()
}, nil
}

68
cmd/edge/main.go Normal file
View File

@@ -0,0 +1,68 @@
package main
import (
"flag"
"fmt"
"log"
"os"
"os/signal"
"syscall"
"github.com/ccfos/nightingale/v6/pkg/osx"
"github.com/ccfos/nightingale/v6/pkg/version"
"github.com/toolkits/pkg/runner"
)
var (
showVersion = flag.Bool("version", false, "Show version.")
configDir = flag.String("configs", osx.GetEnv("N9E_EDGE_CONFIGS", "etc"), "Specify configuration directory.(env:N9E_EDGE_CONFIGS)")
cryptoKey = flag.String("crypto-key", "", "Specify the secret key for configuration file field encryption.")
)
func main() {
flag.Parse()
if *showVersion {
fmt.Println(version.Version)
os.Exit(0)
}
printEnv()
cleanFunc, err := Initialize(*configDir, *cryptoKey)
if err != nil {
log.Fatalln("failed to initialize:", err)
}
code := 1
sc := make(chan os.Signal, 1)
signal.Notify(sc, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
EXIT:
for {
sig := <-sc
fmt.Println("received signal:", sig.String())
switch sig {
case syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT:
code = 0
break EXIT
case syscall.SIGHUP:
// reload configuration?
default:
break EXIT
}
}
cleanFunc()
fmt.Println("process exited")
os.Exit(code)
}
func printEnv() {
runner.Init()
fmt.Println("runner.cwd:", runner.Cwd)
fmt.Println("runner.hostname:", runner.Hostname)
fmt.Println("runner.fd_limits:", runner.FdLimits())
fmt.Println("runner.vm_limits:", runner.VMLimits())
}

View File

@@ -17,7 +17,7 @@ import (
var (
showVersion = flag.Bool("version", false, "Show version.")
configDir = flag.String("configs", osx.GetEnv("N9E_CONFIGS", "etc"), "Specify configuration directory.(env:N9E_CONFIGS)")
configDir = flag.String("configs", osx.GetEnv("N9E_PUSHGW_CONFIGS", "etc"), "Specify configuration directory.(env:N9E_PUSHGW_CONFIGS)")
cryptoKey = flag.String("crypto-key", "", "Specify the secret key for configuration file field encryption.")
)

View File

@@ -2,6 +2,7 @@ package conf
import (
"fmt"
"net"
"os"
"strings"
@@ -16,17 +17,25 @@ import (
)
type ConfigType struct {
Global GlobalConfig
Log logx.Config
HTTP httpx.Config
DB ormx.DBConfig
Redis storage.RedisConfig
Global GlobalConfig
Log logx.Config
HTTP httpx.Config
DB ormx.DBConfig
Redis storage.RedisConfig
CenterApi CenterApi
Pushgw pconf.Pushgw
Alert aconf.Alert
Center cconf.Center
}
type CenterApi struct {
Addrs []string
BasicAuthUser string
BasicAuthPass string
Timeout int64
}
type GlobalConfig struct {
RunMode string
}
@@ -49,28 +58,36 @@ func InitConfig(configDir, cryptoKey string) (*ConfigType, error) {
if config.Alert.Heartbeat.IP == "" {
// auto detect
// config.Alert.Heartbeat.IP = fmt.Sprint(GetOutboundIP())
// 自动获取IP在有些环境下容易出错这里用hostname+pid来作唯一标识
config.Alert.Heartbeat.IP = fmt.Sprint(GetOutboundIP())
if config.Alert.Heartbeat.IP == "" {
hostname, err := os.Hostname()
if err != nil {
fmt.Println("failed to get hostname:", err)
os.Exit(1)
}
hostname, err := os.Hostname()
if err != nil {
fmt.Println("failed to get hostname:", err)
os.Exit(1)
if strings.Contains(hostname, "localhost") {
fmt.Println("Warning! hostname contains substring localhost, setting a more unique hostname is recommended")
}
config.Alert.Heartbeat.IP = hostname
}
if strings.Contains(hostname, "localhost") {
fmt.Println("Warning! hostname contains substring localhost, setting a more unique hostname is recommended")
}
config.Alert.Heartbeat.IP = hostname
// if config.Alert.Heartbeat.IP == "" {
// fmt.Println("heartbeat ip auto got is blank")
// os.Exit(1)
// }
}
config.Alert.Heartbeat.Endpoint = fmt.Sprintf("%s:%d", config.Alert.Heartbeat.IP, config.HTTP.Port)
return config, nil
}
func GetOutboundIP() net.IP {
conn, err := net.Dial("udp", "223.5.5.5:80")
if err != nil {
fmt.Println("auto get outbound ip fail:", err)
return []byte{}
}
defer conn.Close()
localAddr := conn.LocalAddr().(*net.UDPAddr)
return localAddr.IP
}

View File

@@ -14,39 +14,22 @@ func decryptConfig(config *ConfigType, cryptoKey string) error {
config.DB.DSN = decryptDsn
for k := range config.HTTP.Alert.BasicAuth {
decryptPwd, err := secu.DealWithDecrypt(config.HTTP.Alert.BasicAuth[k], cryptoKey)
for k := range config.HTTP.APIForService.BasicAuth {
decryptPwd, err := secu.DealWithDecrypt(config.HTTP.APIForService.BasicAuth[k], cryptoKey)
if err != nil {
return fmt.Errorf("failed to decrypt http basic auth password: %s", err)
}
config.HTTP.Alert.BasicAuth[k] = decryptPwd
config.HTTP.APIForService.BasicAuth[k] = decryptPwd
}
for k := range config.HTTP.Pushgw.BasicAuth {
decryptPwd, err := secu.DealWithDecrypt(config.HTTP.Pushgw.BasicAuth[k], cryptoKey)
for k := range config.HTTP.APIForAgent.BasicAuth {
decryptPwd, err := secu.DealWithDecrypt(config.HTTP.APIForAgent.BasicAuth[k], cryptoKey)
if err != nil {
return fmt.Errorf("failed to decrypt http basic auth password: %s", err)
}
config.HTTP.Pushgw.BasicAuth[k] = decryptPwd
}
for k := range config.HTTP.Heartbeat.BasicAuth {
decryptPwd, err := secu.DealWithDecrypt(config.HTTP.Heartbeat.BasicAuth[k], cryptoKey)
if err != nil {
return fmt.Errorf("failed to decrypt http basic auth password: %s", err)
}
config.HTTP.Heartbeat.BasicAuth[k] = decryptPwd
}
for k := range config.HTTP.Service.BasicAuth {
decryptPwd, err := secu.DealWithDecrypt(config.HTTP.Service.BasicAuth[k], cryptoKey)
if err != nil {
return fmt.Errorf("failed to decrypt http basic auth password: %s", err)
}
config.HTTP.Service.BasicAuth[k] = decryptPwd
config.HTTP.APIForAgent.BasicAuth[k] = decryptPwd
}
for i, v := range config.Pushgw.Writers {

View File

@@ -15,24 +15,27 @@
<img alt="GitHub forks" src="https://img.shields.io/github/forks/ccfos/nightingale">
<a href="https://github.com/ccfos/nightingale/graphs/contributors">
<img alt="GitHub contributors" src="https://img.shields.io/github/contributors-anon/ccfos/nightingale"/></a>
<a href="https://n9e-talk.slack.com/">
<img alt="GitHub contributors" src="https://img.shields.io/badge/join%20slack-%23n9e-brightgreen.svg"/></a>
<img alt="License" src="https://img.shields.io/badge/license-Apache--2.0-blue"/>
</p>
<p align="center">
<b>All-in-one</b> 的开源云原生监控系统 <br/>
<b>All-in-one</b> 的开源观测平台 <br/>
<b>开箱即用</b>,集数据采集、可视化、监控告警于一体 <br/>
推荐升级您的 <b>Prometheus + AlertManager + Grafana</b> 组合方案到夜莺!
推荐升级您的 <b>Prometheus + AlertManager + Grafana + ELK + Jaeger</b> 组合方案到夜莺!
</p>
[English](./README.md) | [中文](./README_ZH.md)
[English](./README_en.md) | [中文](./README.md)
## Highlighted Features
## 功能和特点
- **开箱即用**
- 支持 Docker、Helm Chart、云服务等多种部署方式集数据采集、监控告警、可视化为一体内置多种监控仪表盘、快捷视图、告警规则模板导入即可快速使用**大幅降低云原生监控系统的建设成本、学习成本、使用成本**
- **专业告警**
- 可视化的告警配置和管理,支持丰富的告警规则,提供屏蔽规则、订阅规则的配置能力,支持告警多种送达渠道,支持告警自愈、告警事件管理等;
- **推荐您使用夜莺的同时,无缝搭配[FlashDuty](https://flashcat.cloud/product/flashcat-duty/),实现告警聚合收敛、认领、升级、排班、协同,让告警的触达既高效,又确保告警处理不遗漏、做到件件有回响**
- **云原生**
- 以交钥匙的方式快速构建企业级的云原生监控体系,支持 [Categraf](https://github.com/flashcatcloud/categraf)、Telegraf、Grafana-agent 等多种采集器,支持 Prometheus、VictoriaMetrics、M3DB、ElasticSearch、Jaeger 等多种数据源,兼容支持导入 Grafana 仪表盘,**与云原生生态无缝集成**
- **高性能 高可用**
@@ -43,68 +46,85 @@
- **开放社区**
- 托管于[中国计算机学会开源发展委员会](https://www.ccf.org.cn/kyfzwyh/),有[快猫星云](https://flashcat.cloud)和众多公司的持续投入,和数千名社区用户的积极参与,以及夜莺监控项目清晰明确的定位,都保证了夜莺开源社区健康、长久的发展。活跃、专业的社区用户也在持续迭代和沉淀更多的最佳实践于产品中;
**如果您在使用 Prometheus 过程中,有以下的一个或者多个需求场景,推荐您无缝升级到夜莺**
## 使用场景
1. **如果您希望在一个平台中,统一管理和查看 Metrics、Logging、Tracing 数据,推荐你使用夜莺**
- 请参考阅读:[不止于监控,夜莺 V6 全新升级为开源观测平台](http://flashcat.cloud/blog/nightingale-v6-release/)
2. **如果您在使用 Prometheus 过程中,有以下的一个或者多个需求场景,推荐您无缝升级到夜莺**
- Prometheus、Alertmanager、Grafana 等多个系统较为割裂,缺乏统一视图,无法开箱即用;
- 通过修改配置文件来管理 Prometheus、Alertmanager 的方式,学习曲线大,协同有难度;
- 数据量过大而无法扩展您的 Prometheus 集群;
- 生产环境运行多套 Prometheus 集群,面临管理和使用成本高的问题;
3. **如果您在使用 Zabbix有以下的场景推荐您升级到夜莺**
- 监控的数据量太大,希望有更好的扩展解决方案;
- 学习曲线高,多人多团队模式下,希望有更好的协同使用效率;
- 微服务和云原生架构下监控数据的生命周期多变、监控数据维度基数高Zabbix 数据模型不易适配;
- 了解更多Zabbix和夜莺监控的对比推荐您进一步阅读[Zabbix 和夜莺监控选型对比](https://flashcat.cloud/blog/zabbx-vs-nightingale/)
4. **如果您在使用 [Open-Falcon](https://github.com/open-falcon/falcon-plus),我们推荐您升级到夜莺:**
- 关于 Open-Falcon 和夜莺的详细介绍,请参考阅读:[云原生监控的十个特点和趋势](http://flashcat.cloud/blog/10-trends-of-cloudnative-monitoring/)
- 监控系统和可观测平台的区别,请参考阅读:[从监控系统到可观测平台Gap有多大
](https://flashcat.cloud/blog/gap-of-monitoring-to-o11y/)
5. **我们推荐您使用 [Categraf](https://github.com/flashcatcloud/categraf) 作为首选的监控数据采集器**
- [Categraf](https://github.com/flashcatcloud/categraf) 是夜莺监控的默认采集器,采用开放插件机制和 All-in-one 的设计理念,同时支持 metric、log、trace、event 的采集。Categraf 不仅可以采集 CPU、内存、网络等系统层面的指标也集成了众多开源组件的采集能力支持K8s生态。Categraf 内置了对应的仪表盘和告警规则,开箱即用。
- Prometheus、Alertmanager、Grafana 等多个系统较为割裂,缺乏统一视图,无法开箱即用;
- 通过修改配置文件来管理 Prometheus、Alertmanager 的方式,学习曲线大,协同有难度;
- 数据量过大而无法扩展您的 Prometheus 集群;
- 生产环境运行多套 Prometheus 集群,面临管理和使用成本高的问题;
## 文档
**如果您在使用 Zabbix有以下的场景推荐您升级到夜莺**
[English Doc](https://n9e.github.io/) | [中文文档](https://flashcat.cloud/docs/)
- 监控的数据量太大,希望有更好的扩展解决方案;
- 学习曲线高,多人多团队模式下,希望有更好的协同使用效率;
- 微服务和云原生架构下监控数据的生命周期多变、监控数据维度基数高Zabbix 数据模型不易适配;
> 了解更多Zabbix和夜莺监控的对比推荐您进一步阅读[《Zabbix 和夜莺监控选型对比》](https://flashcat.cloud/blog/zabbx-vs-nightingale/)
**如果您在使用 [Open-Falcon](https://github.com/open-falcon/falcon-plus),我们推荐您升级到夜莺:**
- 关于 Open-Falcon 和夜莺的详细介绍,请参考阅读:[《云原生监控的十个特点和趋势》](http://flashcat.cloud/blog/10-trends-of-cloudnative-monitoring/)
**我们推荐您使用 [Categraf](https://github.com/flashcatcloud/categraf) 作为首选的监控数据采集器**
- [Categraf](https://github.com/flashcatcloud/categraf) 是夜莺监控的默认采集器,采用开放插件机制和 All-in-one 的设计理念,同时支持 metric、log、trace、event 的采集。Categraf 不仅可以采集 CPU、内存、网络等系统层面的指标也集成了众多开源组件的采集能力支持K8s生态。Categraf 内置了对应的仪表盘和告警规则,开箱即用。
## Getting Started
[English Doc](https://n9e.github.io/) | [中文文档](http://n9e.flashcat.cloud/)
## Screenshots
## 产品示意图
https://user-images.githubusercontent.com/792850/216888712-2565fcea-9df5-47bd-a49e-d60af9bd76e8.mp4
## Architecture
<img src="doc/img/arch-product.png" width="600">
## 夜莺架构
夜莺监控可以接收各种采集器上报的监控数据(比如 [Categraf](https://github.com/flashcatcloud/categraf)、telegraf、grafana-agent、Prometheus并写入多种流行的时序数据库中可以支持Prometheus、M3DB、VictoriaMetrics、Thanos、TDEngine等提供告警规则、屏蔽规则、订阅规则的配置能力提供监控数据的查看能力提供告警自愈机制告警触发之后自动回调某个webhook地址或者执行某个脚本提供历史告警事件的存储管理、分组查看的能力。
<img src="doc/img/arch-system.png" width="600">
### 中心汇聚式部署方案
夜莺 v5 版本的设计非常简单,核心是 server 和 webapi 两个模块webapi 无状态放到中心端承接前端请求将用户配置写入数据库server 是告警引擎和数据转发模块,一般随着时序库走,一个时序库就对应一套 server每套 server 可以只用一个实例也可以多个实例组成集群server 可以接收 Categraf、Telegraf、Grafana-Agent、Datadog-Agent、Falcon-Plugins 上报的数据,写入后端时序库,周期性从数据库同步告警规则,然后查询时序库做告警判断。每套 server 依赖一个 redis。
![中心汇聚式部署方案](https://download.flashcat.cloud/ulric/20230327133406.png)
夜莺只有一个模块,就是 n9e可以部署多个 n9e 实例组成集群n9e 依赖 2 个存储数据库、Redis数据库可以使用 MySQL 或 Postgres自己按需选用。
n9e 提供的是 HTTP 接口,前面负载均衡可以是 4 层的,也可以是 7 层的。一般就选用 Nginx 就可以了。
n9e 这个模块接收到数据之后,需要转发给后端的时序库,相关配置是:
```toml
[Pushgw]
LabelRewrite = true
[[Pushgw.Writers]]
Url = "http://127.0.0.1:9090/api/v1/write"
```
> 注意:虽然数据源可以在页面配置了,但是上报转发链路,还是需要在配置文件指定。
所有机房的 agent 比如 Categraf、Telegraf、 Grafana-agent、Datadog-agent ),都直接推数据给 n9e这个架构最为简单维护成本最低。当然前提是要求机房之间网络链路比较好一般有专线。如果网络链路不好则要使用下面的部署方式了。
### 边缘下沉式混杂部署方案
![边缘下沉式混杂部署方案](https://download.flashcat.cloud/ulric/20230327135615.png)
这个图尝试解释 3 种不同的情形,比如 A 机房和中心网络链路很好Categraf 可以直接汇报数据给中心 n9e 模块,另一个机房网络链路不好,就需要把时序库下沉部署,时序库下沉了,对应的告警引擎和转发网关也都要跟随下沉,这样数据不会跨机房传输,比较稳定。但是心跳还是需要往中心心跳,要不然在对象列表里看不到机器的 CPU、内存使用率。还有的时候可能是接入的一个已有的 Prometheus数据采集没有走 Categraf那此时只需要把 Prometheus 作为数据源接入夜莺即可,可以在夜莺里看图、配告警规则,但是就是在对象列表里看不到,也不能使用告警自愈的功能,问题也不大,核心功能都不受影响。
边缘机房下沉部署时序库、告警引擎、转发网关的时候要注意告警引擎需要依赖数据库因为要同步告警规则转发网关也要依赖数据库因为要注册对象到数据库里去需要打通相关网络告警引擎和转发网关都不用Redis所以无需为 Redis 打通网络。
### VictoriaMetrics 集群架构
<img src="doc/img/install-vm.png" width="600">
如果单机版本的时序数据库(比如 Prometheus 性能有瓶颈或容灾较差,我们推荐使用 [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)VictoriaMetrics 架构较为简单性能优异易于部署和运维架构图如上。VictoriaMetrics 更详尽的文档,还请参考其[官网](https://victoriametrics.com/)。
## Community
## 夜莺社区
开源项目要更有生命力,离不开开放的治理架构和源源不断的开发者和用户共同参与,我们致力于建立开放、中立的开源治理架构,吸纳更多来自企业、高校等各方面对云原生监控感兴趣、有热情的开发者,一起打造有活力的夜莺开源社区。关于《夜莺开源项目和社区治理架构(草案)》,请查阅 [COMMUNITY GOVERNANCE](./doc/community-governance.md).
**我们欢迎您以各种方式参与到夜莺开源项目和开源社区中来,工作包括不限于**
- 补充和完善文档 => [n9e.github.io](https://n9e.github.io/)
- 分享您在使用夜莺监控过程中的最佳实践和经验心得 => [文章分享](https://n9e.github.io/docs/prologue/share/)
- 分享您在使用夜莺监控过程中的最佳实践和经验心得 => [文章分享](https://flashcat.cloud/docs/content/flashcat-monitor/nightingale/share/)
- 提交产品建议 =》 [github issue](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Ffeature&template=enhancement.md)
- 提交代码,让夜莺监控更快、更稳、更好用 => [github pull request](https://github.com/didi/nightingale/pulls)
**尊重、认可和记录每一位贡献者的工作**是夜莺开源社区的第一指导原则,我们提倡**高效的提问**,这既是对开发者时间的尊重,也是对整个社区知识沉淀的贡献:
- 提问之前请先查阅 [FAQ](https://www.gitlink.org.cn/ccfos/nightingale/wiki/faq)
- 我们使用[GitHub Discussions](https://github.com/ccfos/nightingale/discussions)作为交流论坛,有问题可以到这里搜索、提问
- 我们也推荐你加入微信群,和其他夜莺用户交流经验 (请先加好友:[picobyte](https://www.gitlink.org.cn/UlricQin/gist/tree/master/self.jpeg) 备注:夜莺加群+姓名+公司)
- 我们使用[论坛](https://answer.flashcat.cloud/)进行交流,有问题可以到这里搜索、提问
## Who is using Nightingale
@@ -124,4 +144,4 @@ https://user-images.githubusercontent.com/792850/216888712-2565fcea-9df5-47bd-a4
## 加入交流群
<img src="doc/img/wecom.png" width="120">
<img src="doc/img/wecom.png" width="120">

BIN
doc/img/n9e-arch-latest.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 215 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 877 KiB

View File

@@ -4,8 +4,7 @@ FROM python:3-slim
WORKDIR /app
ADD n9e /app
ADD http://download.flashcat.cloud/wait /wait
RUN mkdir -p /app/pub && chmod +x /wait
ADD pub /app/pub/
RUN chmod +x /wait
RUN chmod +x n9e
EXPOSE 17000

View File

@@ -1,14 +1,12 @@
FROM --platform=$BUILDPLATFORM python:3-slim
FROM --platform=$TARGETPLATFORM python:3-slim
WORKDIR /app
ADD n9e /app
ADD etc /app
ADD inegrations /app
ADD http://download.flashcat.cloud/wait /wait
RUN mkdir -p /app/pub && chmod +x /wait
ADD pub /app/pub/
RUN chmod +x n9e
ADD n9e /app/
ADD etc /app/
ADD integrations /app/integrations/
ADD --chmod=755 https://github.com/ufoscout/docker-compose-wait/releases/download/2.11.0/wait_x86_64 /wait
RUN chmod +x /wait
EXPOSE 17000

View File

@@ -0,0 +1,13 @@
FROM flashcatcloud/toolbox:v0.0.1 as toolbox
FROM --platform=$TARGETPLATFORM python:3-slim
WORKDIR /app
ADD n9e /app/
ADD etc /app/
ADD integrations /app/integrations/
COPY --chmod=755 --from=toolbox /toolbox/wait_aarch64 /wait
EXPOSE 17000
CMD ["/app/n9e", "-h"]

View File

@@ -10,7 +10,6 @@ echo "tag: ${tag}"
rm -rf n9e pub
cp ../n9e .
cp -r ../pub .
docker build -t nightingale:${tag} .

View File

@@ -31,7 +31,7 @@ batch = 2000
chan_size = 10000
[[writers]]
url = "http://n9e:17000/prometheus/v1/write"
url = "http://127.0.0.1:17000/prometheus/v1/write"
# Basic auth username
basic_auth_user = ""
@@ -54,7 +54,7 @@ run_mode = "release"
enable = true
# report os version cpu.util mem.util metadata
url = "http://n9e:17000/v1/n9e/heartbeat"
url = "http://127.0.0.1:17000/v1/n9e/heartbeat"
# interval, unit: s
interval = 10
@@ -78,6 +78,6 @@ enable = true
## ibex flush interval
interval = "1000ms"
## n9e ibex server rpc address
servers = ["ibex:20090"]
servers = ["127.0.0.1:20090"]
## temp script dir
meta_dir = "./meta"

View File

@@ -1,9 +1,5 @@
version: "3.7"
networks:
nightingale:
driver: bridge
services:
mysql:
# platform: linux/x86_64
@@ -11,8 +7,6 @@ services:
container_name: mysql
hostname: mysql
restart: always
ports:
- "3406:3306"
environment:
TZ: Asia/Shanghai
MYSQL_ROOT_PASSWORD: 1234
@@ -20,20 +14,16 @@ services:
- ./mysqldata:/var/lib/mysql/
- ./initsql:/docker-entrypoint-initdb.d/
- ./mysqletc/my.cnf:/etc/my.cnf
networks:
- nightingale
network_mode: host
redis:
image: "redis:6.2"
container_name: redis
hostname: redis
restart: always
ports:
- "6379:6379"
environment:
TZ: Asia/Shanghai
networks:
- nightingale
network_mode: host
prometheus:
image: prom/prometheus
@@ -44,10 +34,7 @@ services:
TZ: Asia/Shanghai
volumes:
- ./prometc:/etc/prometheus
ports:
- "9090:9090"
networks:
- nightingale
network_mode: host
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
@@ -64,18 +51,12 @@ services:
environment:
GIN_MODE: release
TZ: Asia/Shanghai
WAIT_HOSTS: mysql:3306
ports:
- "10090:10090"
- "20090:20090"
WAIT_HOSTS: 127.0.0.1:3306
volumes:
- ./ibexetc:/app/etc
networks:
- nightingale
network_mode: host
depends_on:
- mysql
links:
- mysql:mysql
command: >
sh -c "/wait && /app/ibex server"
@@ -87,24 +68,15 @@ services:
environment:
GIN_MODE: release
TZ: Asia/Shanghai
WAIT_HOSTS: mysql:3306, redis:6379
WAIT_HOSTS: 127.0.0.1:3306, 127.0.0.1:6379
volumes:
- ./n9eetc:/app/etc
- ./integrations:/app/integrations
ports:
- "17000:17000"
networks:
- nightingale
- ../etc:/app/etc
network_mode: host
depends_on:
- mysql
- redis
- prometheus
- ibex
links:
- mysql:mysql
- redis:redis
- prometheus:prometheus
- ibex:ibex
command: >
sh -c "/wait && /app/n9e"
@@ -122,13 +94,7 @@ services:
- ./categraf/conf:/etc/categraf/conf
- /:/hostfs
- /var/run/docker.sock:/var/run/docker.sock
# ports:
# - "9100:9100/tcp"
networks:
- nightingale
network_mode: host
depends_on:
- n9e
- ibex
links:
- n9e:n9e
- ibex:ibex
- ibex

View File

@@ -0,0 +1,83 @@
[global]
# whether print configs
print_configs = false
# add label(agent_hostname) to series
# "" -> auto detect hostname
# "xx" -> use specified string xx
# "$hostname" -> auto detect hostname
# "$ip" -> auto detect ip
# "$hostname-$ip" -> auto detect hostname and ip to replace the vars
hostname = "$HOSTNAME"
# will not add label(agent_hostname) if true
omit_hostname = false
# s | ms
precision = "ms"
# global collect interval
interval = 15
[global.labels]
source="categraf"
# region = "shanghai"
# env = "localhost"
[writer_opt]
# default: 2000
batch = 2000
# channel(as queue) size
chan_size = 10000
[[writers]]
url = "http://n9e:17000/prometheus/v1/write"
# Basic auth username
basic_auth_user = ""
# Basic auth password
basic_auth_pass = ""
# timeout settings, unit: ms
timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100
[http]
enable = false
address = ":9100"
print_access = false
run_mode = "release"
[heartbeat]
enable = true
# report os version cpu.util mem.util metadata
url = "http://n9e:17000/v1/n9e/heartbeat"
# interval, unit: s
interval = 10
# Basic auth username
basic_auth_user = ""
# Basic auth password
basic_auth_pass = ""
## Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]
# timeout settings, unit: ms
timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100
[ibex]
enable = true
## ibex flush interval
interval = "1000ms"
## n9e ibex server rpc address
servers = ["ibex:20090"]
## temp script dir
meta_dir = "./meta"

View File

@@ -0,0 +1,5 @@
# # collect interval
# interval = 15
# # whether collect per cpu
# collect_per_cpu = false

View File

@@ -0,0 +1,11 @@
# # collect interval
# interval = 15
# # By default stats will be gathered for all mount points.
# # Set mount_points will restrict the stats to only the specified mount points.
# mount_points = ["/"]
# Ignore mount points by filesystem type.
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
ignore_mount_points = ["/boot"]

View File

@@ -0,0 +1,6 @@
# # collect interval
# interval = 15
# # By default, categraf will gather stats for all devices including disk partitions.
# # Setting devices will restrict the stats to the specified devices.
# devices = ["sda", "sdb", "vd*"]

View File

@@ -0,0 +1,63 @@
# # collect interval
# interval = 15
[[instances]]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## Docker Endpoint
## To use TCP, set endpoint = "tcp://[ip]:[port]"
## To use environment variables (ie, docker-machine), set endpoint = "ENV"
endpoint = "unix:///var/run/docker.sock"
## Set to true to collect Swarm metrics(desired_replicas, running_replicas)
gather_services = false
gather_extend_memstats = false
container_id_label_enable = true
container_id_label_short_style = true
## Containers to include and exclude. Globs accepted.
## Note that an empty array for both will include all containers
container_name_include = []
container_name_exclude = []
## Container states to include and exclude. Globs accepted.
## When empty only containers in the "running" state will be captured.
## example: container_state_include = ["created", "restarting", "running", "removing", "paused", "exited", "dead"]
## example: container_state_exclude = ["created", "restarting", "running", "removing", "paused", "exited", "dead"]
# container_state_include = []
# container_state_exclude = []
## Timeout for docker list, info, and stats commands
timeout = "5s"
## Specifies for which classes a per-device metric should be issued
## Possible values are 'cpu' (cpu0, cpu1, ...), 'blkio' (8:0, 8:1, ...) and 'network' (eth0, eth1, ...)
## Please note that this setting has no effect if 'perdevice' is set to 'true'
perdevice_include = []
## Specifies for which classes a total metric should be issued. Total is an aggregated of the 'perdevice' values.
## Possible values are 'cpu', 'blkio' and 'network'
## Total 'cpu' is reported directly by Docker daemon, and 'network' and 'blkio' totals are aggregated by this plugin.
## Please note that this setting has no effect if 'total' is set to 'false'
total_include = ["cpu", "blkio", "network"]
## Which environment variables should we use as a tag
##tag_env = ["JAVA_HOME", "HEAP_SIZE"]
## docker labels to include and exclude as tags. Globs accepted.
## Note that an empty array for both will include all labels as tags
docker_label_include = []
docker_label_exclude = ["annotation*", "io.kubernetes*", "*description*", "*maintainer*", "*hash", "*author*"]
## Optional TLS Config
# use_tls = false
# tls_ca = "/etc/telegraf/ca.pem"
# tls_cert = "/etc/telegraf/cert.pem"
# tls_key = "/etc/telegraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false

View File

@@ -0,0 +1,2 @@
# # collect interval
# interval = 15

View File

@@ -0,0 +1,124 @@
# # collect interval
# interval = 15
# file: /proc/vmstat
[white_list]
oom_kill = 1
nr_free_pages = 0
nr_alloc_batch = 0
nr_inactive_anon = 0
nr_active_anon = 0
nr_inactive_file = 0
nr_active_file = 0
nr_unevictable = 0
nr_mlock = 0
nr_anon_pages = 0
nr_mapped = 0
nr_file_pages = 0
nr_dirty = 0
nr_writeback = 0
nr_slab_reclaimable = 0
nr_slab_unreclaimable = 0
nr_page_table_pages = 0
nr_kernel_stack = 0
nr_unstable = 0
nr_bounce = 0
nr_vmscan_write = 0
nr_vmscan_immediate_reclaim = 0
nr_writeback_temp = 0
nr_isolated_anon = 0
nr_isolated_file = 0
nr_shmem = 0
nr_dirtied = 0
nr_written = 0
numa_hit = 0
numa_miss = 0
numa_foreign = 0
numa_interleave = 0
numa_local = 0
numa_other = 0
workingset_refault = 0
workingset_activate = 0
workingset_nodereclaim = 0
nr_anon_transparent_hugepages = 0
nr_free_cma = 0
nr_dirty_threshold = 0
nr_dirty_background_threshold = 0
pgpgin = 0
pgpgout = 0
pswpin = 0
pswpout = 0
pgalloc_dma = 0
pgalloc_dma32 = 0
pgalloc_normal = 0
pgalloc_movable = 0
pgfree = 0
pgactivate = 0
pgdeactivate = 0
pgfault = 0
pgmajfault = 0
pglazyfreed = 0
pgrefill_dma = 0
pgrefill_dma32 = 0
pgrefill_normal = 0
pgrefill_movable = 0
pgsteal_kswapd_dma = 0
pgsteal_kswapd_dma32 = 0
pgsteal_kswapd_normal = 0
pgsteal_kswapd_movable = 0
pgsteal_direct_dma = 0
pgsteal_direct_dma32 = 0
pgsteal_direct_normal = 0
pgsteal_direct_movable = 0
pgscan_kswapd_dma = 0
pgscan_kswapd_dma32 = 0
pgscan_kswapd_normal = 0
pgscan_kswapd_movable = 0
pgscan_direct_dma = 0
pgscan_direct_dma32 = 0
pgscan_direct_normal = 0
pgscan_direct_movable = 0
pgscan_direct_throttle = 0
zone_reclaim_failed = 0
pginodesteal = 0
slabs_scanned = 0
kswapd_inodesteal = 0
kswapd_low_wmark_hit_quickly = 0
kswapd_high_wmark_hit_quickly = 0
pageoutrun = 0
allocstall = 0
pgrotated = 0
drop_pagecache = 0
drop_slab = 0
numa_pte_updates = 0
numa_huge_pte_updates = 0
numa_hint_faults = 0
numa_hint_faults_local = 0
numa_pages_migrated = 0
pgmigrate_success = 0
pgmigrate_fail = 0
compact_migrate_scanned = 0
compact_free_scanned = 0
compact_isolated = 0
compact_stall = 0
compact_fail = 0
compact_success = 0
htlb_buddy_alloc_success = 0
htlb_buddy_alloc_fail = 0
unevictable_pgs_culled = 0
unevictable_pgs_scanned = 0
unevictable_pgs_rescued = 0
unevictable_pgs_mlocked = 0
unevictable_pgs_munlocked = 0
unevictable_pgs_cleared = 0
unevictable_pgs_stranded = 0
thp_fault_alloc = 0
thp_fault_fallback = 0
thp_collapse_alloc = 0
thp_collapse_alloc_failed = 0
thp_split = 0
thp_zero_page_alloc = 0
thp_zero_page_alloc_failed = 0
balloon_inflate = 0
balloon_deflate = 0
balloon_migrate = 0

View File

@@ -0,0 +1,2 @@
# # collect interval
# interval = 15

View File

@@ -0,0 +1,5 @@
# # collect interval
# interval = 15
# # whether collect platform specified metrics
collect_platform_fields = true

View File

@@ -0,0 +1,8 @@
# # collect interval
# interval = 15
# # whether collect protocol stats on Linux
# collect_protocol_stats = false
# # setting interfaces will tell categraf to gather these explicit interfaces
# interfaces = ["eth0"]

View File

@@ -0,0 +1,2 @@
# # collect interval
# interval = 15

View File

@@ -0,0 +1,8 @@
# # collect interval
# interval = 15
# # force use ps command to gather
# force_ps = false
# # force use /proc to gather
# force_proc = false

View File

@@ -0,0 +1,5 @@
# # collect interval
# interval = 15
# # whether collect metric: system_n_users
# collect_user_number = false

View File

@@ -0,0 +1,10 @@
[prometheus]
enable=true
scrape_config_file="/etc/prometheus/prometheus.yml"
## log level, debug warn info error
log_level="info"
## wal file storage path ,default ./data-agent
# wal_storage_path="/path/to/storage"
## wal reserve time duration, default value is 2 hour
# wal_min_duration=2

View File

@@ -0,0 +1,129 @@
version: "3.7"
networks:
nightingale:
driver: bridge
services:
postgres:
# platform: linux/x86_64
image: "postgres:12-alpine"
container_name: postgres
hostname: postgres
restart: always
ports:
- "5432:5432"
environment:
TZ: Asia/Shanghai
POSTGRES_USER: root
POSTGRES_PASSWORD: 1234
POSTGRES_DB: n9e_v6
PGDATA: /var/lib/postgresql/data/pgdata
volumes:
- ./pgdata:/var/lib/postgresql/data
- ./initsql_for_postgres:/docker-entrypoint-initdb.d/
networks:
- nightingale
redis:
image: "redis:7.0-alpine"
container_name: redis
hostname: redis
restart: always
ports:
- "6379:6379"
environment:
TZ: Asia/Shanghai
networks:
- nightingale
victoriametrics:
image: victoriametrics/victoria-metrics:v1.79.12
container_name: victoriametrics
hostname: victoriametrics
restart: always
environment:
TZ: Asia/Shanghai
ports:
- "8428:8428"
networks:
- nightingale
command:
- "--loggerTimezone=Asia/Shanghai"
ibex:
image: ulric2019/ibex:0.3
container_name: ibex
hostname: ibex
restart: always
environment:
GIN_MODE: release
TZ: Asia/Shanghai
WAIT_HOSTS: postgres:5432
ports:
- "10090:10090"
- "20090:20090"
volumes:
- ./ibexetc_pg:/app/etc
networks:
- nightingale
depends_on:
- postgres
links:
- postgres:postgres
command: >
sh -c "/wait && /app/ibex server"
n9e:
image: flashcatcloud/nightingale:latest
container_name: n9e
hostname: n9e
restart: always
environment:
GIN_MODE: release
TZ: Asia/Shanghai
WAIT_HOSTS: postgres:5432, redis:6379
volumes:
- ./n9eetc_pg:/app/etc
ports:
- "17000:17000"
networks:
- nightingale
depends_on:
- postgres
- redis
- victoriametrics
- ibex
links:
- postgres:postgres
- redis:redis
- victoriametrics:victoriametrics
- ibex:ibex
command: >
sh -c "/wait && /app/n9e"
categraf:
image: "flashcatcloud/categraf:latest"
container_name: "categraf"
hostname: "categraf01"
restart: always
environment:
TZ: Asia/Shanghai
HOST_PROC: /hostfs/proc
HOST_SYS: /hostfs/sys
HOST_MOUNT_PREFIX: /hostfs
volumes:
- ./categraf/conf:/etc/categraf/conf
- /:/hostfs
- /var/run/docker.sock:/var/run/docker.sock
- ./prometc_vm:/etc/prometheus
# ports:
# - "9100:9100/tcp"
networks:
- nightingale
depends_on:
- n9e
- ibex
links:
- n9e:n9e
- ibex:ibex

View File

@@ -0,0 +1,97 @@
# debug, release
RunMode = "release"
[Log]
# log write dir
Dir = "logs-server"
# log level: DEBUG INFO WARNING ERROR
Level = "DEBUG"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours: 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256
[HTTP]
Enable = true
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 10090
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = true
# whether enable pprof
PProf = false
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120
[BasicAuth]
# using when call apis
ibex = "ibex"
[RPC]
Listen = "0.0.0.0:20090"
[Heartbeat]
# auto detect if blank
IP = ""
# unit: ms
Interval = 1000
[Output]
# database | remote
ComeFrom = "database"
AgtdPort = 2090
[Gorm]
# enable debug mode or not
Debug = false
# mysql postgres
DBType = "postgres"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
# table prefix
TablePrefix = ""
[MySQL]
# mysql address host:port
Address = "mysql:3306"
# mysql username
User = "root"
# mysql password
Password = "1234"
# database name
DBName = "ibex"
# connection params
Parameters = "charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
[Postgres]
# pg address host:port
Address = "postgres:5432"
# pg user
User = "root"
# pg password
Password = "1234"
# database name
DBName = "n9e_v6"
# ssl mode
SSLMode = "disable"

View File

@@ -0,0 +1,735 @@
CREATE TABLE users (
id bigserial,
username varchar(64) not null ,
nickname varchar(64) not null ,
password varchar(128) not null default '',
phone varchar(16) not null default '',
email varchar(64) not null default '',
portrait varchar(255) not null default '' ,
roles varchar(255) not null ,
contacts varchar(1024) ,
maintainer smallint not null default 0,
create_at bigint not null default 0,
create_by varchar(64) not null default '',
update_at bigint not null default 0,
update_by varchar(64) not null default '',
PRIMARY KEY (id),
UNIQUE (username)
) ;
COMMENT ON COLUMN users.username IS 'login name, cannot rename';
COMMENT ON COLUMN users.nickname IS 'display name, chinese name';
COMMENT ON COLUMN users.portrait IS 'portrait image url';
COMMENT ON COLUMN users.roles IS 'Admin | Standard | Guest, split by space';
COMMENT ON COLUMN users.contacts IS 'json e.g. {wecom:xx, dingtalk_robot_token:yy}';
insert into users(id, username, nickname, password, roles, create_at, create_by, update_at, update_by) values(1, 'root', '超管', 'root.2020', 'Admin', date_part('epoch',current_timestamp)::int, 'system', date_part('epoch',current_timestamp)::int, 'system');
CREATE TABLE user_group (
id bigserial,
name varchar(128) not null default '',
note varchar(255) not null default '',
create_at bigint not null default 0,
create_by varchar(64) not null default '',
update_at bigint not null default 0,
update_by varchar(64) not null default '',
PRIMARY KEY (id)
) ;
CREATE INDEX user_group_create_by_idx ON user_group (create_by);
CREATE INDEX user_group_update_at_idx ON user_group (update_at);
insert into user_group(id, name, create_at, create_by, update_at, update_by) values(1, 'demo-root-group', date_part('epoch',current_timestamp)::int, 'root', date_part('epoch',current_timestamp)::int, 'root');
CREATE TABLE user_group_member (
id bigserial,
group_id bigint not null,
user_id bigint not null,
PRIMARY KEY(id)
) ;
CREATE INDEX user_group_member_group_id_idx ON user_group_member (group_id);
CREATE INDEX user_group_member_user_id_idx ON user_group_member (user_id);
insert into user_group_member(group_id, user_id) values(1, 1);
CREATE TABLE configs (
id bigserial,
ckey varchar(191) not null,
cval varchar(4096) not null default '',
PRIMARY KEY (id),
UNIQUE (ckey)
) ;
CREATE TABLE role (
id bigserial,
name varchar(191) not null default '',
note varchar(255) not null default '',
PRIMARY KEY (id),
UNIQUE (name)
) ;
insert into role(name, note) values('Admin', 'Administrator role');
insert into role(name, note) values('Standard', 'Ordinary user role');
insert into role(name, note) values('Guest', 'Readonly user role');
CREATE TABLE role_operation(
id bigserial,
role_name varchar(128) not null,
operation varchar(191) not null,
PRIMARY KEY(id)
) ;
CREATE INDEX role_operation_role_name_idx ON role_operation (role_name);
CREATE INDEX role_operation_operation_idx ON role_operation (operation);
-- Admin is special, who has no concrete operation but can do anything.
insert into role_operation(role_name, operation) values('Guest', '/metric/explorer');
insert into role_operation(role_name, operation) values('Guest', '/object/explorer');
insert into role_operation(role_name, operation) values('Guest', '/log/explorer');
insert into role_operation(role_name, operation) values('Guest', '/trace/explorer');
insert into role_operation(role_name, operation) values('Guest', '/help/version');
insert into role_operation(role_name, operation) values('Guest', '/help/contact');
insert into role_operation(role_name, operation) values('Standard', '/metric/explorer');
insert into role_operation(role_name, operation) values('Standard', '/object/explorer');
insert into role_operation(role_name, operation) values('Standard', '/log/explorer');
insert into role_operation(role_name, operation) values('Standard', '/trace/explorer');
insert into role_operation(role_name, operation) values('Standard', '/help/version');
insert into role_operation(role_name, operation) values('Standard', '/help/contact');
insert into role_operation(role_name, operation) values('Standard', '/alert-rules-built-in');
insert into role_operation(role_name, operation) values('Standard', '/dashboards-built-in');
insert into role_operation(role_name, operation) values('Standard', '/trace/dependencies');
insert into role_operation(role_name, operation) values('Standard', '/users');
insert into role_operation(role_name, operation) values('Standard', '/user-groups');
insert into role_operation(role_name, operation) values('Standard', '/user-groups/add');
insert into role_operation(role_name, operation) values('Standard', '/user-groups/put');
insert into role_operation(role_name, operation) values('Standard', '/user-groups/del');
insert into role_operation(role_name, operation) values('Standard', '/busi-groups');
insert into role_operation(role_name, operation) values('Standard', '/busi-groups/add');
insert into role_operation(role_name, operation) values('Standard', '/busi-groups/put');
insert into role_operation(role_name, operation) values('Standard', '/busi-groups/del');
insert into role_operation(role_name, operation) values('Standard', '/targets');
insert into role_operation(role_name, operation) values('Standard', '/targets/add');
insert into role_operation(role_name, operation) values('Standard', '/targets/put');
insert into role_operation(role_name, operation) values('Standard', '/targets/del');
insert into role_operation(role_name, operation) values('Standard', '/dashboards');
insert into role_operation(role_name, operation) values('Standard', '/dashboards/add');
insert into role_operation(role_name, operation) values('Standard', '/dashboards/put');
insert into role_operation(role_name, operation) values('Standard', '/dashboards/del');
insert into role_operation(role_name, operation) values('Standard', '/alert-rules');
insert into role_operation(role_name, operation) values('Standard', '/alert-rules/add');
insert into role_operation(role_name, operation) values('Standard', '/alert-rules/put');
insert into role_operation(role_name, operation) values('Standard', '/alert-rules/del');
insert into role_operation(role_name, operation) values('Standard', '/alert-mutes');
insert into role_operation(role_name, operation) values('Standard', '/alert-mutes/add');
insert into role_operation(role_name, operation) values('Standard', '/alert-mutes/del');
insert into role_operation(role_name, operation) values('Standard', '/alert-subscribes');
insert into role_operation(role_name, operation) values('Standard', '/alert-subscribes/add');
insert into role_operation(role_name, operation) values('Standard', '/alert-subscribes/put');
insert into role_operation(role_name, operation) values('Standard', '/alert-subscribes/del');
insert into role_operation(role_name, operation) values('Standard', '/alert-cur-events');
insert into role_operation(role_name, operation) values('Standard', '/alert-cur-events/del');
insert into role_operation(role_name, operation) values('Standard', '/alert-his-events');
insert into role_operation(role_name, operation) values('Standard', '/job-tpls');
insert into role_operation(role_name, operation) values('Standard', '/job-tpls/add');
insert into role_operation(role_name, operation) values('Standard', '/job-tpls/put');
insert into role_operation(role_name, operation) values('Standard', '/job-tpls/del');
insert into role_operation(role_name, operation) values('Standard', '/job-tasks');
insert into role_operation(role_name, operation) values('Standard', '/job-tasks/add');
insert into role_operation(role_name, operation) values('Standard', '/job-tasks/put');
insert into role_operation(role_name, operation) values('Standard', '/recording-rules');
insert into role_operation(role_name, operation) values('Standard', '/recording-rules/add');
insert into role_operation(role_name, operation) values('Standard', '/recording-rules/put');
insert into role_operation(role_name, operation) values('Standard', '/recording-rules/del');
-- for alert_rule | collect_rule | mute | dashboard grouping
CREATE TABLE busi_group (
id bigserial,
name varchar(191) not null,
label_enable smallint not null default 0,
label_value varchar(191) not null default '' ,
create_at bigint not null default 0,
create_by varchar(64) not null default '',
update_at bigint not null default 0,
update_by varchar(64) not null default '',
PRIMARY KEY (id),
UNIQUE (name)
) ;
COMMENT ON COLUMN busi_group.label_value IS 'if label_enable: label_value can not be blank';
insert into busi_group(id, name, create_at, create_by, update_at, update_by) values(1, 'Default Busi Group', date_part('epoch',current_timestamp)::int, 'root', date_part('epoch',current_timestamp)::int, 'root');
CREATE TABLE busi_group_member (
id bigserial,
busi_group_id bigint not null ,
user_group_id bigint not null ,
perm_flag char(2) not null ,
PRIMARY KEY (id)
) ;
CREATE INDEX busi_group_member_busi_group_id_idx ON busi_group_member (busi_group_id);
CREATE INDEX busi_group_member_user_group_id_idx ON busi_group_member (user_group_id);
COMMENT ON COLUMN busi_group_member.busi_group_id IS 'busi group id';
COMMENT ON COLUMN busi_group_member.user_group_id IS 'user group id';
COMMENT ON COLUMN busi_group_member.perm_flag IS 'ro | rw';
insert into busi_group_member(busi_group_id, user_group_id, perm_flag) values(1, 1, 'rw');
-- for dashboard new version
CREATE TABLE board (
id bigserial,
group_id bigint not null default 0 ,
name varchar(191) not null,
ident varchar(200) not null default '',
tags varchar(255) not null ,
public smallint not null default 0 ,
built_in smallint not null default 0 ,
hide smallint not null default 0 ,
create_at bigint not null default 0,
create_by varchar(64) not null default '',
update_at bigint not null default 0,
update_by varchar(64) not null default '',
PRIMARY KEY (id),
UNIQUE (group_id, name)
) ;
CREATE INDEX board_ident_idx ON board (ident);
COMMENT ON COLUMN board.group_id IS 'busi group id';
COMMENT ON COLUMN board.tags IS 'split by space';
COMMENT ON COLUMN board.public IS '0:false 1:true';
COMMENT ON COLUMN board.built_in IS '0:false 1:true';
COMMENT ON COLUMN board.hide IS '0:false 1:true';
-- for dashboard new version
CREATE TABLE board_payload (
id bigint not null ,
payload text not null,
UNIQUE (id)
) ;
COMMENT ON COLUMN board_payload.id IS 'dashboard id';
-- deprecated
CREATE TABLE dashboard (
id bigserial,
group_id bigint not null default 0 ,
name varchar(191) not null,
tags varchar(255) not null ,
configs varchar(8192) ,
create_at bigint not null default 0,
create_by varchar(64) not null default '',
update_at bigint not null default 0,
update_by varchar(64) not null default '',
PRIMARY KEY (id),
UNIQUE (group_id, name)
) ;
COMMENT ON COLUMN dashboard.group_id IS 'busi group id';
COMMENT ON COLUMN dashboard.tags IS 'split by space';
COMMENT ON COLUMN dashboard.configs IS 'dashboard variables';
-- deprecated
-- auto create the first subclass 'Default chart group' of dashboard
CREATE TABLE chart_group (
id bigserial,
dashboard_id bigint not null,
name varchar(255) not null,
weight int not null default 0,
PRIMARY KEY (id)
) ;
CREATE INDEX chart_group_dashboard_id_idx ON chart_group (dashboard_id);
-- deprecated
CREATE TABLE chart (
id bigserial,
group_id bigint not null ,
configs text,
weight int not null default 0,
PRIMARY KEY (id)
) ;
CREATE INDEX chart_group_id_idx ON chart (group_id);
COMMENT ON COLUMN chart.group_id IS 'chart group id';
CREATE TABLE chart_share (
id bigserial,
cluster varchar(128) not null,
datasource_id bigint not null default 0,
configs text,
create_at bigint not null default 0,
create_by varchar(64) not null default '',
primary key (id)
) ;
CREATE INDEX chart_share_create_at_idx ON chart_share (create_at);
CREATE TABLE alert_rule (
id bigserial,
group_id bigint not null default 0 ,
cate varchar(128) not null,
datasource_ids varchar(255) not null default '' ,
cluster varchar(128) not null,
name varchar(255) not null,
note varchar(1024) not null default '',
prod varchar(255) not null default '',
algorithm varchar(255) not null default '',
algo_params varchar(255),
delay int not null default 0,
severity smallint not null ,
disabled smallint not null ,
prom_for_duration int not null ,
rule_config text not null ,
prom_ql text not null ,
prom_eval_interval int not null ,
enable_stime varchar(255) not null default '00:00',
enable_etime varchar(255) not null default '23:59',
enable_days_of_week varchar(255) not null default '' ,
enable_in_bg smallint not null default 0 ,
notify_recovered smallint not null ,
notify_channels varchar(255) not null default '' ,
notify_groups varchar(255) not null default '' ,
notify_repeat_step int not null default 0 ,
notify_max_number int not null default 0 ,
recover_duration int not null default 0 ,
callbacks varchar(255) not null default '' ,
runbook_url varchar(255),
append_tags varchar(255) not null default '' ,
annotations text not null ,
create_at bigint not null default 0,
create_by varchar(64) not null default '',
update_at bigint not null default 0,
update_by varchar(64) not null default '',
PRIMARY KEY (id)
) ;
CREATE INDEX alert_rule_group_id_idx ON alert_rule (group_id);
CREATE INDEX alert_rule_update_at_idx ON alert_rule (update_at);
COMMENT ON COLUMN alert_rule.group_id IS 'busi group id';
COMMENT ON COLUMN alert_rule.datasource_ids IS 'datasource ids';
COMMENT ON COLUMN alert_rule.severity IS '1:Emergency 2:Warning 3:Notice';
COMMENT ON COLUMN alert_rule.disabled IS '0:enabled 1:disabled';
COMMENT ON COLUMN alert_rule.prom_for_duration IS 'prometheus for, unit:s';
COMMENT ON COLUMN alert_rule.rule_config IS 'rule_config';
COMMENT ON COLUMN alert_rule.prom_ql IS 'promql';
COMMENT ON COLUMN alert_rule.prom_eval_interval IS 'evaluate interval';
COMMENT ON COLUMN alert_rule.enable_stime IS '00:00';
COMMENT ON COLUMN alert_rule.enable_etime IS '23:59';
COMMENT ON COLUMN alert_rule.enable_days_of_week IS 'split by space: 0 1 2 3 4 5 6';
COMMENT ON COLUMN alert_rule.enable_in_bg IS '1: only this bg 0: global';
COMMENT ON COLUMN alert_rule.notify_recovered IS 'whether notify when recovery';
COMMENT ON COLUMN alert_rule.notify_channels IS 'split by space: sms voice email dingtalk wecom';
COMMENT ON COLUMN alert_rule.notify_groups IS 'split by space: 233 43';
COMMENT ON COLUMN alert_rule.notify_repeat_step IS 'unit: min';
COMMENT ON COLUMN alert_rule.recover_duration IS 'unit: s';
COMMENT ON COLUMN alert_rule.callbacks IS 'split by space: http://a.com/api/x http://a.com/api/y';
COMMENT ON COLUMN alert_rule.append_tags IS 'split by space: service=n9e mod=api';
COMMENT ON COLUMN alert_rule.annotations IS 'annotations';
CREATE TABLE alert_mute (
id bigserial,
group_id bigint not null default 0 ,
prod varchar(255) not null default '',
note varchar(1024) not null default '',
cate varchar(128) not null,
cluster varchar(128) not null,
datasource_ids varchar(255) not null default '' ,
tags jsonb NOT NULL ,
cause varchar(255) not null default '',
btime bigint not null default 0 ,
etime bigint not null default 0 ,
disabled smallint not null default 0 ,
mute_time_type smallint not null default 0,
periodic_mutes varchar(4096) not null default '',
create_at bigint not null default 0,
create_by varchar(64) not null default '',
update_at bigint not null default 0,
update_by varchar(64) not null default '',
PRIMARY KEY (id)
) ;
CREATE INDEX alert_mute_group_id_idx ON alert_mute (group_id);
CREATE INDEX alert_mute_update_at_idx ON alert_mute (update_at);
COMMENT ON COLUMN alert_mute.group_id IS 'busi group id';
COMMENT ON COLUMN alert_mute.datasource_ids IS 'datasource ids';
COMMENT ON COLUMN alert_mute.tags IS 'json,map,tagkey->regexp|value';
COMMENT ON COLUMN alert_mute.btime IS 'begin time';
COMMENT ON COLUMN alert_mute.etime IS 'end time';
COMMENT ON COLUMN alert_mute.disabled IS '0:enabled 1:disabled';
CREATE TABLE alert_subscribe (
id bigserial,
name varchar(255) not null default '',
disabled smallint not null default 0 ,
group_id bigint not null default 0 ,
prod varchar(255) not null default '',
cate varchar(128) not null,
datasource_ids varchar(255) not null default '' ,
cluster varchar(128) not null,
rule_id bigint not null default 0,
tags varchar(4096) not null default '' ,
redefine_severity smallint default 0 ,
new_severity smallint not null ,
redefine_channels smallint default 0 ,
new_channels varchar(255) not null default '' ,
user_group_ids varchar(250) not null ,
webhooks text not null,
redefine_webhooks smallint default 0,
for_duration bigint not null default 0,
create_at bigint not null default 0,
create_by varchar(64) not null default '',
update_at bigint not null default 0,
update_by varchar(64) not null default '',
PRIMARY KEY (id)
) ;
CREATE INDEX alert_subscribe_group_id_idx ON alert_subscribe (group_id);
CREATE INDEX alert_subscribe_update_at_idx ON alert_subscribe (update_at);
COMMENT ON COLUMN alert_subscribe.disabled IS '0:enabled 1:disabled';
COMMENT ON COLUMN alert_subscribe.group_id IS 'busi group id';
COMMENT ON COLUMN alert_subscribe.datasource_ids IS 'datasource ids';
COMMENT ON COLUMN alert_subscribe.tags IS 'json,map,tagkey->regexp|value';
COMMENT ON COLUMN alert_subscribe.redefine_severity IS 'is redefine severity?';
COMMENT ON COLUMN alert_subscribe.new_severity IS '0:Emergency 1:Warning 2:Notice';
COMMENT ON COLUMN alert_subscribe.redefine_channels IS 'is redefine channels?';
COMMENT ON COLUMN alert_subscribe.new_channels IS 'split by space: sms voice email dingtalk wecom';
COMMENT ON COLUMN alert_subscribe.user_group_ids IS 'split by space 1 34 5, notify cc to user_group_ids';
CREATE TABLE target (
id bigserial,
group_id bigint not null default 0 ,
ident varchar(191) not null ,
note varchar(255) not null default '' ,
tags varchar(512) not null default '' ,
update_at bigint not null default 0,
PRIMARY KEY (id),
UNIQUE (ident)
) ;
CREATE INDEX target_group_id_idx ON target (group_id);
COMMENT ON COLUMN target.group_id IS 'busi group id';
COMMENT ON COLUMN target.ident IS 'target id';
COMMENT ON COLUMN target.note IS 'append to alert event as field';
COMMENT ON COLUMN target.tags IS 'append to series data as tags, split by space, append external space at suffix';
-- case1: target_idents; case2: target_tags
-- CREATE TABLE collect_rule (
-- id bigserial,
-- group_id bigint not null default 0 comment 'busi group id',
-- cluster varchar(128) not null,
-- target_idents varchar(512) not null default '' comment 'ident list, split by space',
-- target_tags varchar(512) not null default '' comment 'filter targets by tags, split by space',
-- name varchar(191) not null default '',
-- note varchar(255) not null default '',
-- step int not null,
-- type varchar(64) not null comment 'e.g. port proc log plugin',
-- data text not null,
-- append_tags varchar(255) not null default '' comment 'split by space: e.g. mod=n9e dept=cloud',
-- create_at bigint not null default 0,
-- create_by varchar(64) not null default '',
-- update_at bigint not null default 0,
-- update_by varchar(64) not null default '',
-- PRIMARY KEY (id),
-- KEY (group_id, type, name)
-- ) ;
CREATE TABLE metric_view (
id bigserial,
name varchar(191) not null default '',
cate smallint not null ,
configs varchar(8192) not null default '',
create_at bigint not null default 0,
create_by bigint not null default 0,
update_at bigint not null default 0,
PRIMARY KEY (id)
) ;
CREATE INDEX metric_view_create_by_idx ON metric_view (create_by);
COMMENT ON COLUMN metric_view.cate IS '0: preset 1: custom';
COMMENT ON COLUMN metric_view.create_by IS 'user id';
insert into metric_view(name, cate, configs) values('Host View', 0, '{"filters":[{"oper":"=","label":"__name__","value":"cpu_usage_idle"}],"dynamicLabels":[],"dimensionLabels":[{"label":"ident","value":""}]}');
CREATE TABLE recording_rule (
id bigserial,
group_id bigint not null default '0',
datasource_ids varchar(255) not null default '',
cluster varchar(128) not null,
name varchar(255) not null ,
note varchar(255) not null ,
disabled smallint not null default 0 ,
prom_ql varchar(8192) not null ,
prom_eval_interval int not null ,
append_tags varchar(255) default '' ,
create_at bigint default '0',
create_by varchar(64) default '',
update_at bigint default '0',
update_by varchar(64) default '',
PRIMARY KEY (id)
) ;
CREATE INDEX recording_rule_group_id_idx ON recording_rule (group_id);
CREATE INDEX recording_rule_update_at_idx ON recording_rule (update_at);
COMMENT ON COLUMN recording_rule.group_id IS 'group_id';
COMMENT ON COLUMN recording_rule.datasource_ids IS 'datasource ids';
COMMENT ON COLUMN recording_rule.name IS 'new metric name';
COMMENT ON COLUMN recording_rule.note IS 'rule note';
COMMENT ON COLUMN recording_rule.disabled IS '0:enabled 1:disabled';
COMMENT ON COLUMN recording_rule.prom_ql IS 'promql';
COMMENT ON COLUMN recording_rule.prom_eval_interval IS 'evaluate interval';
COMMENT ON COLUMN recording_rule.append_tags IS 'split by space: service=n9e mod=api';
CREATE TABLE alert_aggr_view (
id bigserial,
name varchar(191) not null default '',
rule varchar(2048) not null default '',
cate smallint not null ,
create_at bigint not null default 0,
create_by bigint not null default 0,
update_at bigint not null default 0,
PRIMARY KEY (id)
) ;
CREATE INDEX alert_aggr_view_create_by_idx ON alert_aggr_view (create_by);
COMMENT ON COLUMN alert_aggr_view.cate IS '0: preset 1: custom';
COMMENT ON COLUMN alert_aggr_view.create_by IS 'user id';
insert into alert_aggr_view(name, rule, cate) values('By BusiGroup, Severity', 'field:group_name::field:severity', 0);
insert into alert_aggr_view(name, rule, cate) values('By RuleName', 'field:rule_name', 0);
CREATE TABLE alert_cur_event (
id bigint not null ,
cate varchar(128) not null,
datasource_id bigint not null default 0 ,
cluster varchar(128) not null,
group_id bigint not null ,
group_name varchar(255) not null default '' ,
hash varchar(64) not null ,
rule_id bigint not null,
rule_name varchar(255) not null,
rule_note varchar(2048) not null ,
rule_prod varchar(255) not null default '',
rule_algo varchar(255) not null default '',
severity smallint not null ,
prom_for_duration int not null ,
prom_ql varchar(8192) not null ,
prom_eval_interval int not null ,
callbacks varchar(255) not null default '' ,
runbook_url varchar(255),
notify_recovered smallint not null ,
notify_channels varchar(255) not null default '' ,
notify_groups varchar(255) not null default '' ,
notify_repeat_next bigint not null default 0 ,
notify_cur_number int not null default 0 ,
target_ident varchar(191) not null default '' ,
target_note varchar(191) not null default '' ,
first_trigger_time bigint,
trigger_time bigint not null,
trigger_value varchar(255) not null,
annotations text not null ,
rule_config text not null ,
tags varchar(1024) not null default '' ,
PRIMARY KEY (id)
) ;
CREATE INDEX alert_cur_event_hash_idx ON alert_cur_event (hash);
CREATE INDEX alert_cur_event_rule_id_idx ON alert_cur_event (rule_id);
CREATE INDEX alert_cur_event_tg_idx ON alert_cur_event (trigger_time, group_id);
CREATE INDEX alert_cur_event_nrn_idx ON alert_cur_event (notify_repeat_next);
COMMENT ON COLUMN alert_cur_event.id IS 'use alert_his_event.id';
COMMENT ON COLUMN alert_cur_event.datasource_id IS 'datasource id';
COMMENT ON COLUMN alert_cur_event.group_id IS 'busi group id of rule';
COMMENT ON COLUMN alert_cur_event.group_name IS 'busi group name';
COMMENT ON COLUMN alert_cur_event.hash IS 'rule_id + vector_pk';
COMMENT ON COLUMN alert_cur_event.rule_note IS 'alert rule note';
COMMENT ON COLUMN alert_cur_event.severity IS '1:Emergency 2:Warning 3:Notice';
COMMENT ON COLUMN alert_cur_event.prom_for_duration IS 'prometheus for, unit:s';
COMMENT ON COLUMN alert_cur_event.prom_ql IS 'promql';
COMMENT ON COLUMN alert_cur_event.prom_eval_interval IS 'evaluate interval';
COMMENT ON COLUMN alert_cur_event.callbacks IS 'split by space: http://a.com/api/x http://a.com/api/y';
COMMENT ON COLUMN alert_cur_event.notify_recovered IS 'whether notify when recovery';
COMMENT ON COLUMN alert_cur_event.notify_channels IS 'split by space: sms voice email dingtalk wecom';
COMMENT ON COLUMN alert_cur_event.notify_groups IS 'split by space: 233 43';
COMMENT ON COLUMN alert_cur_event.notify_repeat_next IS 'next timestamp to notify, get repeat settings from rule';
COMMENT ON COLUMN alert_cur_event.target_ident IS 'target ident, also in tags';
COMMENT ON COLUMN alert_cur_event.target_note IS 'target note';
COMMENT ON COLUMN alert_cur_event.annotations IS 'annotations';
COMMENT ON COLUMN alert_cur_event.rule_config IS 'rule_config';
COMMENT ON COLUMN alert_cur_event.tags IS 'merge data_tags rule_tags, split by ,,';
CREATE TABLE alert_his_event (
id bigserial,
is_recovered smallint not null,
cate varchar(128) not null,
datasource_id bigint not null default 0 ,
cluster varchar(128) not null,
group_id bigint not null ,
group_name varchar(255) not null default '' ,
hash varchar(64) not null ,
rule_id bigint not null,
rule_name varchar(255) not null,
rule_note varchar(2048) not null default 'alert rule note',
rule_prod varchar(255) not null default '',
rule_algo varchar(255) not null default '',
severity smallint not null ,
prom_for_duration int not null ,
prom_ql varchar(8192) not null ,
prom_eval_interval int not null ,
callbacks varchar(255) not null default '' ,
runbook_url varchar(255),
notify_recovered smallint not null ,
notify_channels varchar(255) not null default '' ,
notify_groups varchar(255) not null default '' ,
notify_cur_number int not null default 0 ,
target_ident varchar(191) not null default '' ,
target_note varchar(191) not null default '' ,
first_trigger_time bigint,
trigger_time bigint not null,
trigger_value varchar(255) not null,
recover_time bigint not null default 0,
last_eval_time bigint not null default 0 ,
tags varchar(1024) not null default '' ,
annotations text not null ,
rule_config text not null ,
PRIMARY KEY (id)
) ;
CREATE INDEX alert_his_event_hash_idx ON alert_his_event (hash);
CREATE INDEX alert_his_event_rule_id_idx ON alert_his_event (rule_id);
CREATE INDEX alert_his_event_tg_idx ON alert_his_event (trigger_time, group_id);
COMMENT ON COLUMN alert_his_event.group_id IS 'busi group id of rule';
COMMENT ON COLUMN alert_his_event.datasource_id IS 'datasource id';
COMMENT ON COLUMN alert_his_event.group_name IS 'busi group name';
COMMENT ON COLUMN alert_his_event.hash IS 'rule_id + vector_pk';
COMMENT ON COLUMN alert_his_event.rule_note IS 'alert rule note';
COMMENT ON COLUMN alert_his_event.severity IS '0:Emergency 1:Warning 2:Notice';
COMMENT ON COLUMN alert_his_event.prom_for_duration IS 'prometheus for, unit:s';
COMMENT ON COLUMN alert_his_event.prom_ql IS 'promql';
COMMENT ON COLUMN alert_his_event.prom_eval_interval IS 'evaluate interval';
COMMENT ON COLUMN alert_his_event.callbacks IS 'split by space: http://a.com/api/x http://a.com/api/y';
COMMENT ON COLUMN alert_his_event.notify_recovered IS 'whether notify when recovery';
COMMENT ON COLUMN alert_his_event.notify_channels IS 'split by space: sms voice email dingtalk wecom';
COMMENT ON COLUMN alert_his_event.notify_groups IS 'split by space: 233 43';
COMMENT ON COLUMN alert_his_event.target_ident IS 'target ident, also in tags';
COMMENT ON COLUMN alert_his_event.target_note IS 'target note';
COMMENT ON COLUMN alert_his_event.last_eval_time IS 'for time filter';
COMMENT ON COLUMN alert_his_event.tags IS 'merge data_tags rule_tags, split by ,,';
COMMENT ON COLUMN alert_his_event.annotations IS 'annotations';
COMMENT ON COLUMN alert_his_event.rule_config IS 'rule_config';
CREATE TABLE task_tpl
(
id serial,
group_id int not null ,
title varchar(255) not null default '',
account varchar(64) not null,
batch int not null default 0,
tolerance int not null default 0,
timeout int not null default 0,
pause varchar(255) not null default '',
script text not null,
args varchar(512) not null default '',
tags varchar(255) not null default '' ,
create_at bigint not null default 0,
create_by varchar(64) not null default '',
update_at bigint not null default 0,
update_by varchar(64) not null default '',
PRIMARY KEY (id)
) ;
CREATE INDEX task_tpl_group_id_idx ON task_tpl (group_id);
COMMENT ON COLUMN task_tpl.group_id IS 'busi group id';
COMMENT ON COLUMN task_tpl.tags IS 'split by space';
CREATE TABLE task_tpl_host
(
ii serial,
id int not null ,
host varchar(128) not null ,
PRIMARY KEY (ii)
) ;
CREATE INDEX task_tpl_host_id_host_idx ON task_tpl_host (id, host);
COMMENT ON COLUMN task_tpl_host.id IS 'task tpl id';
COMMENT ON COLUMN task_tpl_host.host IS 'ip or hostname';
CREATE TABLE task_record
(
id bigint not null ,
event_id bigint not null default 0,
group_id bigint not null ,
ibex_address varchar(128) not null,
ibex_auth_user varchar(128) not null default '',
ibex_auth_pass varchar(128) not null default '',
title varchar(255) not null default '',
account varchar(64) not null,
batch int not null default 0,
tolerance int not null default 0,
timeout int not null default 0,
pause varchar(255) not null default '',
script text not null,
args varchar(512) not null default '',
create_at bigint not null default 0,
create_by varchar(64) not null default '',
PRIMARY KEY (id)
) ;
CREATE INDEX task_record_cg_idx ON task_record (create_at, group_id);
CREATE INDEX task_record_create_by_idx ON task_record (create_by);
CREATE INDEX task_record_event_id_idx ON task_record (event_id);
COMMENT ON COLUMN task_record.id IS 'ibex task id';
COMMENT ON COLUMN task_record.group_id IS 'busi group id';
COMMENT ON COLUMN task_record.event_id IS 'event id';
CREATE TABLE alerting_engines
(
id serial,
instance varchar(128) not null default '' ,
datasource_id bigint not null default 0 ,
engine_cluster varchar(128) not null default '' ,
clock bigint not null,
PRIMARY KEY (id)
) ;
COMMENT ON COLUMN alerting_engines.instance IS 'instance identification, e.g. 10.9.0.9:9090';
COMMENT ON COLUMN alerting_engines.datasource_id IS 'datasource id';
COMMENT ON COLUMN alerting_engines.engine_cluster IS 'target reader cluster';
CREATE TABLE datasource
(
id serial,
name varchar(191) not null default '',
description varchar(255) not null default '',
category varchar(255) not null default '',
plugin_id int not null default 0,
plugin_type varchar(255) not null default '',
plugin_type_name varchar(255) not null default '',
cluster_name varchar(255) not null default '',
settings text not null,
status varchar(255) not null default '',
http varchar(4096) not null default '',
auth varchar(8192) not null default '',
created_at bigint not null default 0,
created_by varchar(64) not null default '',
updated_at bigint not null default 0,
updated_by varchar(64) not null default '',
UNIQUE (name),
PRIMARY KEY (id)
) ;
CREATE TABLE builtin_cate (
id bigserial,
name varchar(191) not null,
user_id bigint not null default 0,
PRIMARY KEY (id)
) ;
CREATE TABLE notify_tpl (
id bigserial,
channel varchar(32) not null,
name varchar(255) not null,
content text not null,
PRIMARY KEY (id),
UNIQUE (channel)
) ;
CREATE TABLE sso_config (
id bigserial,
name varchar(191) not null,
content text not null,
PRIMARY KEY (id),
UNIQUE (name)
) ;

File diff suppressed because it is too large Load Diff

View File

@@ -78,12 +78,11 @@ HeaderUserNameKey = "X-User-Name"
DefaultRoles = ["Standard"]
[DB]
# postgres: host=%s port=%s user=%s dbname=%s password=%s sslmode=%s
DSN="root:1234@tcp(mysql:3306)/n9e_v6?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
DSN="host=postgres port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable"
# enable debug mode or not
Debug = false
# mysql postgres
DBType = "mysql"
DBType = "postgres"
# unit: s
MaxLifetime = 7200
# max open connections
@@ -141,4 +140,4 @@ Timeout = 3000
LabelRewrite = true
[[Pushgw.Writers]]
Url = "http://prometheus:9090/api/v1/write"
Url = "http://victoriametrics:8428/api/v1/write"

Some files were not shown because too many files have changed in this diff Show More