Compare commits

...

44 Commits

Author SHA1 Message Date
Ulric Qin
c353a62656 event.Cluster use target.Cluster instead of rule.Cluster 2022-08-12 13:12:30 +08:00
xiaoziv
48d24c79d6 use slim base image (#1105)
Co-authored-by: ziv <xiaozheng@tuya.com>
2022-08-11 19:35:33 +08:00
xiaoziv
c6a1761a7b support tpls reload (#1104)
Co-authored-by: ziv <xiaozheng@tuya.com>
2022-08-11 17:05:41 +08:00
Ulric Qin
23d7e5a7de add disk_util for target table 2022-08-10 17:05:29 +08:00
xiaoziv
b1b2c7d6b0 feat: support ident disk usage metric (#1100)
* feat: support ident disk usage metric

* code refactor

Co-authored-by: ziv <xiaozheng@tuya.com>
2022-08-10 17:00:49 +08:00
Ulric Qin
f34c3c6a2c comment default WriteRelabels 2022-08-10 16:53:41 +08:00
Ulric Qin
454dc7f983 go mod tidy 2022-08-10 16:52:51 +08:00
Resurgence
c1e92b56b9 feat: add write_relabel action before n9e remote writing to multi tsdb (#1098)
* add write relabel config

* change parse relabel Regex field time when config loaded
2022-08-10 16:50:52 +08:00
xiaoziv
fd93fd7182 feat: support i18n metric desc (#1097)
* support i18n metric desc

* code refactor

* code refactor

Co-authored-by: ziv <xiaozheng@tuya.com>
2022-08-10 13:21:11 +08:00
Ulric Qin
1a446f0749 fix configurations: TargetMetrics 2022-08-10 10:36:02 +08:00
Ulric Qin
f18ed76593 escape TargetMetrics 2022-08-09 20:07:17 +08:00
Ulric Qin
9b3a9f29d9 extract promql to webapi.conf 2022-08-09 20:01:54 +08:00
Ulric Qin
49965fd5d5 fix target mem util 2022-08-09 17:19:27 +08:00
Ulric Qin
a248e054fa add some host metrics for targets get api 2022-08-09 17:11:24 +08:00
ning
bbb35d36be fix: categraf panic when use docker compose 2022-08-09 10:44:18 +08:00
xiaoziv
fd3e51cbb1 fix i18n header bug (#1095)
Co-authored-by: ziv <xiaozheng@tuya.com>
2022-08-08 20:28:36 +08:00
xiaoziv
bd0480216c feat: support i18n request headerkey (#1094)
Co-authored-by: ziv <xiaozheng@tuya.com>
2022-08-08 19:02:14 +08:00
Ulric Qin
2c963258cf code refactor 2022-08-08 15:26:11 +08:00
Yening Qin
b4f267fb01 feat: prom support tls (#1091) 2022-08-08 12:17:52 +08:00
xiaoziv
ea46401db2 remove record rule check (#1090)
Co-authored-by: ziv <xiaozheng@tuya.com>
2022-08-06 18:15:04 +08:00
xiaoziv
58e777eb00 support graph url (#1088)
Co-authored-by: ziv <xiaozheng@tuya.com>
2022-08-06 18:12:33 +08:00
xiaoziv
04a9161f75 feat: support rule convert from prometheus/vmalert (#1087)
* feat: support rule convert from prometheus/vmalert

* Update rule_converter.py

* Update rule_converter.py

Co-authored-by: ulricqin <ulricqin@qq.com>
2022-08-04 20:06:54 +08:00
xtan
1ed8f38833 feat: add first trigger time (#1086)
Co-authored-by: tanxiao <tanxiao@asiainfo.com>
2022-08-04 19:29:44 +08:00
Ulric Qin
bb17751a81 fix typo 2022-08-02 12:21:01 +08:00
ulricqin
a8dcb1fe83 add retry controller for poster (#1082) 2022-08-02 12:20:02 +08:00
Ulric Qin
1ea30e03a4 check user exists when refresh token 2022-08-01 14:44:22 +08:00
kongfei605
ba0eafa065 docker compose use latest version of n9e and categraf (#1079) 2022-07-29 17:38:28 +08:00
xtan
c78c8d07f2 refactor: error info return (#1077)
Co-authored-by: tanxiao <tanxiao@asiainfo.com>
2022-07-29 17:38:03 +08:00
Ulric Qin
8fe9e57c03 Merge branch 'main' of github.com:ccfos/nightingale 2022-07-29 17:35:49 +08:00
Ulric Qin
64646d2ace refactor linux dashboard 2022-07-29 17:35:23 +08:00
ning
e747e73145 add debug log for ldap login 2022-07-29 15:38:45 +08:00
xtan
896f85efdf refactor: add error log (#1076)
* refactor: add error log

* refactor: update error log

* refactor: fix error log

Co-authored-by: tanxiao <tanxiao@asiainfo.com>
2022-07-29 11:41:39 +08:00
Ulric Qin
77e4499a32 refactor linux dashboard 2022-07-27 19:05:00 +08:00
ulricqin
7c351e09e5 add api: /board/:bid/pure (#1073) 2022-07-27 14:30:35 +08:00
xiaoziv
14ad3b1b0a fix proxy auth username error (#1072) 2022-07-27 14:13:48 +08:00
Ulric Qin
184867d07c feature: query busigroup by ident 2022-07-27 13:13:17 +08:00
Ulric Qin
3476b95b35 fix: query busigroup by ident 2022-07-26 18:23:14 +08:00
Ulric Qin
76e105c93a query busigroup by ident 2022-07-26 17:59:57 +08:00
Ulric Qin
39705787c9 Merge branch 'main' of github.com:ccfos/nightingale 2022-07-26 15:54:42 +08:00
Ulric Qin
293680a9cd use english comma 2022-07-26 15:54:25 +08:00
Yening Qin
05005357fb feat: push event api add mute (#1070) 2022-07-25 16:05:35 +08:00
ulricqin
ba7ff133e6 modify prometheus query batch response format (#1068) 2022-07-23 17:50:16 +08:00
ulricqin
0bd7ba9549 code refactor notify (#1066) 2022-07-22 18:12:42 +08:00
ulricqin
17c7361620 code refactor notify plugin (#1065) 2022-07-22 17:56:52 +08:00
52 changed files with 2809 additions and 1855 deletions

View File

@@ -1,4 +1,5 @@
FROM python:2
FROM python:2.7.8-slim
#FROM python:2
#FROM ubuntu:21.04
WORKDIR /app

View File

@@ -1,4 +1,4 @@
FROM --platform=$BUILDPLATFORM python:2
FROM --platform=$BUILDPLATFORM python:2.7.8-slim
WORKDIR /app

View File

@@ -43,3 +43,9 @@ basic_auth_pass = ""
timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100
[http]
enable = false
address = ":9100"
print_access = false
run_mode = "release"

View File

@@ -80,7 +80,7 @@ services:
sh -c "/wait && /app/ibex server"
nwebapi:
image: ulric2019/nightingale:5.9.4
image: flashcatcloud/nightingale:latest
container_name: nwebapi
hostname: nwebapi
restart: always
@@ -108,7 +108,7 @@ services:
sh -c "/wait && /app/n9e webapi"
nserver:
image: ulric2019/nightingale:5.9.4
image: flashcatcloud/nightingale:latest
container_name: nserver
hostname: nserver
restart: always
@@ -136,7 +136,7 @@ services:
sh -c "/wait && /app/n9e server"
categraf:
image: "flashcatcloud/categraf:v0.1.9"
image: "flashcatcloud/categraf:latest"
container_name: "categraf"
hostname: "categraf01"
restart: always

View File

@@ -402,6 +402,7 @@ CREATE TABLE `alert_cur_event` (
`notify_cur_number` int not null default 0 comment '',
`target_ident` varchar(191) not null default '' comment 'target ident, also in tags',
`target_note` varchar(191) not null default '' comment 'target note',
`first_trigger_time` bigint,
`trigger_time` bigint not null,
`trigger_value` varchar(255) not null,
`tags` varchar(1024) not null default '' comment 'merge data_tags rule_tags, split by ,,',
@@ -436,6 +437,7 @@ CREATE TABLE `alert_his_event` (
`notify_cur_number` int not null default 0 comment '',
`target_ident` varchar(191) not null default '' comment 'target ident, also in tags',
`target_note` varchar(191) not null default '' comment 'target note',
`first_trigger_time` bigint,
`trigger_time` bigint not null,
`trigger_value` varchar(255) not null,
`recover_time` bigint not null default 0,

View File

@@ -436,6 +436,7 @@ CREATE TABLE alert_cur_event (
notify_cur_number int4 not null default 0,
target_ident varchar(191) NOT NULL DEFAULT ''::character varying,
target_note varchar(191) NOT NULL DEFAULT ''::character varying,
first_trigger_time int8,
trigger_time int8 NOT NULL,
trigger_value varchar(255) NOT NULL,
tags varchar(1024) NOT NULL DEFAULT ''::character varying,
@@ -487,6 +488,7 @@ CREATE TABLE alert_his_event (
notify_cur_number int4 not null default 0,
target_ident varchar(191) NOT NULL DEFAULT ''::character varying,
target_note varchar(191) NOT NULL DEFAULT ''::character varying,
first_trigger_time int8,
trigger_time int8 NOT NULL,
trigger_value varchar(255) NOT NULL,
recover_time int8 NOT NULL DEFAULT 0,

View File

@@ -174,4 +174,10 @@ Address = "http://ibex:10090"
BasicAuthUser = "ibex"
BasicAuthPass = "ibex"
# unit: ms
Timeout = 3000
Timeout = 3000
[TargetMetrics]
TargetUp = '''max(max_over_time(target_up{ident=~"(%s)"}[%dm])) by (ident)'''
LoadPerCore = '''max(max_over_time(system_load_norm_1{ident=~"(%s)"}[%dm])) by (ident)'''
MemUtil = '''100-max(max_over_time(mem_available_percent{ident=~"(%s)"}[%dm])) by (ident)'''
DiskUtil = '''max(max_over_time(disk_used_percent{ident=~"(%s)", path="/"}[%dm])) by (ident)'''

File diff suppressed because it is too large Load Diff

View File

@@ -1,131 +1,262 @@
cpu_usage_idle: CPU空闲率单位%
cpu_usage_active: CPU使用率(单位:%
cpu_usage_system: CPU内核态时间占比(单位:%
cpu_usage_user: CPU用户态时间占比(单位:%
cpu_usage_nice: 低优先级用户态CPU时间占比也就是进程nice值被调整为1-19之间的CPU时间。这里注意nice可取值范围是-20到19数值越大优先级反而越低(单位:%
cpu_usage_iowait: CPU等待I/O的时间占比(单位:%
cpu_usage_irq: CPU处理硬中断的时间占比(单位:%
cpu_usage_softirq: CPU处理中断的时间占比(单位:%
cpu_usage_steal: 在虚拟机环境下有该指标表示CPU被其他虚拟机争用的时间占比超过20就表示争抢严重(单位:%
cpu_usage_guest: 通过虚拟化运行其他操作系统的时间也就是运行虚拟机的CPU时间占比(单位:%
cpu_usage_guest_nice: 以低优先级运行虚拟机的时间占比(单位:%
zh:
cpu_usage_idle: CPU空闲率(单位:%
cpu_usage_active: CPU使用率(单位:%
cpu_usage_system: CPU内核态时间占比(单位:%
cpu_usage_user: CPU用户态时间占比(单位:%
cpu_usage_nice: 低优先级用户态CPU时间占比也就是进程nice值被调整为1-19之间的CPU时间。这里注意nice可取值范围是-20到19数值越大优先级反而越低(单位:%
cpu_usage_iowait: CPU等待I/O的时间占比(单位:%
cpu_usage_irq: CPU处理中断的时间占比(单位:%
cpu_usage_softirq: CPU处理软中断的时间占比(单位:%
cpu_usage_steal: 在虚拟机环境下有该指标表示CPU被其他虚拟机争用的时间占比超过20就表示争抢严重(单位:%
cpu_usage_guest: 通过虚拟化运行其他操作系统的时间,也就是运行虚拟机的CPU时间占比(单位:%
cpu_usage_guest_nice: 以低优先级运行虚拟机的时间占比(单位:%
disk_free: 硬盘分区剩余量单位byte
disk_used: 硬盘分区使用量单位byte
disk_used_percent: 硬盘分区使用率(单位:%
disk_total: 硬盘分区总量单位byte
disk_inodes_free: 硬盘分区inode剩余量
disk_inodes_used: 硬盘分区inode使用量
disk_inodes_total: 硬盘分区inode总量
disk_free: 硬盘分区剩余量单位byte
disk_used: 硬盘分区使用量单位byte
disk_used_percent: 硬盘分区使用率(单位:%
disk_total: 硬盘分区总量单位byte
disk_inodes_free: 硬盘分区inode剩余量
disk_inodes_used: 硬盘分区inode使用量
disk_inodes_total: 硬盘分区inode总量
diskio_io_time: 从设备视角来看I/O请求总时间队列中有I/O请求就计数单位毫秒counter类型需要用函数求rate才有使用价值
diskio_iops_in_progress: 已经分配给设备驱动且尚未完成的IO请求不包含在队列中但尚未分配给设备驱动的IO请求gauge类型
diskio_merged_reads: 相邻读请求merge读的次数counter类型
diskio_merged_writes: 相邻写请求merge写的次数counter类型
diskio_read_bytes: 读取的byte数量counter类型需要用函数求rate才有使用价值
diskio_read_time: 读请求总时间单位毫秒counter类型需要用函数求rate才有使用价值
diskio_reads: 读请求次数counter类型需要用函数求rate才有使用价值
diskio_weighted_io_time: 从I/O请求视角来看I/O等待总时间如果同时有多个I/O请求时间会叠加单位毫秒
diskio_write_bytes: 写入的byte数量counter类型需要用函数求rate才有使用价值
diskio_write_time: 写请求总时间单位毫秒counter类型需要用函数求rate才有使用价值
diskio_writes: 写请求次数counter类型需要用函数求rate才有使用价值
diskio_io_time: 从设备视角来看I/O请求总时间队列中有I/O请求就计数单位毫秒counter类型需要用函数求rate才有使用价值
diskio_iops_in_progress: 已经分配给设备驱动且尚未完成的IO请求不包含在队列中但尚未分配给设备驱动的IO请求gauge类型
diskio_merged_reads: 相邻读请求merge读的次数counter类型
diskio_merged_writes: 相邻写请求merge写的次数counter类型
diskio_read_bytes: 读取的byte数量counter类型需要用函数求rate才有使用价值
diskio_read_time: 读请求总时间单位毫秒counter类型需要用函数求rate才有使用价值
diskio_reads: 读请求次数counter类型需要用函数求rate才有使用价值
diskio_weighted_io_time: 从I/O请求视角来看I/O等待总时间如果同时有多个I/O请求时间会叠加单位毫秒
diskio_write_bytes: 写入的byte数量counter类型需要用函数求rate才有使用价值
diskio_write_time: 写请求总时间单位毫秒counter类型需要用函数求rate才有使用价值
diskio_writes: 写请求次数counter类型需要用函数求rate才有使用价值
kernel_boot_time: 内核启动时间
kernel_context_switches: 内核上下文切换次数
kernel_entropy_avail: linux系统内部的熵池
kernel_interrupts: 内核中断次数
kernel_processes_forked: fork的进程数
kernel_boot_time: 内核启动时间
kernel_context_switches: 内核上下文切换次数
kernel_entropy_avail: linux系统内部的熵池
kernel_interrupts: 内核中断次数
kernel_processes_forked: fork的进程数
mem_active: 活跃使用的内存总数(包括cache和buffer内存)
mem_available: 应用程序可用内存数
mem_available_percent: 内存剩余百分比(0~100)
mem_buffered: 用来给文件做缓冲大小
mem_cached: 被高速缓冲存储器cache memory用的内存的大小等于 diskcache minus SwapCache
mem_commit_limit: 根据超额分配比率('vm.overcommit_ratio'这是当前在系统上分配可用的内存总量这个限制只是在模式2('vm.overcommit_memory')时启用
mem_committed_as: 目前在系统上分配的内存量。是所有进程申请的内存的总和
mem_dirty: 等待被写回到磁盘的内存大小
mem_free: 空闲内存数
mem_high_free: 未被使用的高位内存大小
mem_high_total: 高位内存总大小Highmem是指所有内存高于860MB的物理内存,Highmem区域供用户程序使用或用于页面缓存。该区域不是直接映射到内核空间。内核必须使用不同的手法使用该段内存
mem_huge_page_size: 每个大页的大小
mem_huge_pages_free: 池中尚未分配的 HugePages 数量
mem_huge_pages_total: 预留HugePages的总个数
mem_inactive: 空闲的内存数(包括free和avalible的内存)
mem_low_free: 未被使用的低位大小
mem_low_total: 低位内存总大小,低位可以达到高位内存一样的作用,而且它还能够被内核用来记录一些自己的数据结构
mem_mapped: 设备和文件等映射的大小
mem_page_tables: 管理内存分页页面的索引表的大小
mem_shared: 多个进程共享的内存总额
mem_slab: 内核数据结构缓存的大小,可以减少申请和释放内存带来的消耗
mem_sreclaimable: 可收回Slab的大小
mem_sunreclaim: 不可收回Slab的大小SUnreclaim+SReclaimableSlab
mem_swap_cached: 被高速缓冲存储器cache memory用的交换空间的大小已经被交换出来的内存但仍然被存放在swapfile中。用来在需要的时候很快的被替换而不需要再次打开I/O端口
mem_swap_free: 未被使用交换空间的大小
mem_swap_total: 交换空间的总大小
mem_total: 内存总数
mem_used: 已用内存数
mem_used_percent: 已用内存数百分比(0~100)
mem_vmalloc_chunk: 最大的连续未被使用的vmalloc区域
mem_vmalloc_totalL: 可以vmalloc虚拟内存大小
mem_vmalloc_used: vmalloc已使用的虚拟内存大小
mem_write_back: 正在被写回到磁盘的内存大小
mem_write_back_tmp: FUSE用于临时写回缓冲区的内存
mem_active: 活跃使用的内存总数(包括cache和buffer内存)
mem_available: 应用程序可用内存数
mem_available_percent: 内存剩余百分比(0~100)
mem_buffered: 用来给文件做缓冲大小
mem_cached: 被高速缓冲存储器cache memory用的内存的大小等于 diskcache minus SwapCache
mem_commit_limit: 根据超额分配比率('vm.overcommit_ratio'这是当前在系统上分配可用的内存总量这个限制只是在模式2('vm.overcommit_memory')时启用
mem_committed_as: 目前在系统上分配的内存量。是所有进程申请的内存的总和
mem_dirty: 等待被写回到磁盘的内存大小
mem_free: 空闲内存数
mem_high_free: 未被使用的高位内存大小
mem_high_total: 高位内存总大小Highmem是指所有内存高于860MB的物理内存,Highmem区域供用户程序使用或用于页面缓存。该区域不是直接映射到内核空间。内核必须使用不同的手法使用该段内存
mem_huge_page_size: 每个大页的大小
mem_huge_pages_free: 池中尚未分配的 HugePages 数量
mem_huge_pages_total: 预留HugePages的总个数
mem_inactive: 空闲的内存数(包括free和avalible的内存)
mem_low_free: 未被使用的低位大小
mem_low_total: 低位内存总大小,低位可以达到高位内存一样的作用,而且它还能够被内核用来记录一些自己的数据结构
mem_mapped: 设备和文件等映射的大小
mem_page_tables: 管理内存分页页面的索引表的大小
mem_shared: 多个进程共享的内存总额
mem_slab: 内核数据结构缓存的大小,可以减少申请和释放内存带来的消耗
mem_sreclaimable: 可收回Slab的大小
mem_sunreclaim: 不可收回Slab的大小SUnreclaim+SReclaimableSlab
mem_swap_cached: 被高速缓冲存储器cache memory用的交换空间的大小已经被交换出来的内存但仍然被存放在swapfile中。用来在需要的时候很快的被替换而不需要再次打开I/O端口
mem_swap_free: 未被使用交换空间的大小
mem_swap_total: 交换空间的总大小
mem_total: 内存总数
mem_used: 已用内存数
mem_used_percent: 已用内存数百分比(0~100)
mem_vmalloc_chunk: 最大的连续未被使用的vmalloc区域
mem_vmalloc_totalL: 可以vmalloc虚拟内存大小
mem_vmalloc_used: vmalloc已使用的虚拟内存大小
mem_write_back: 正在被写回到磁盘的内存大小
mem_write_back_tmp: FUSE用于临时写回缓冲区的内存
net_bytes_recv: 网卡收包总数(bytes)
net_bytes_sent: 网卡发包总数(bytes)
net_drop_in: 网卡收丢包数量
net_drop_out: 网卡发丢包数量
net_err_in: 网卡收包错误数量
net_err_out: 网卡发包错误数量
net_packets_recv: 网卡收包数量
net_packets_sent: 网卡发包数量
net_bytes_recv: 网卡收包总数(bytes)
net_bytes_sent: 网卡发包总数(bytes)
net_drop_in: 网卡收丢包数量
net_drop_out: 网卡发丢包数量
net_err_in: 网卡收包错误数量
net_err_out: 网卡发包错误数量
net_packets_recv: 网卡收包数量
net_packets_sent: 网卡发包数量
netstat_tcp_established: ESTABLISHED状态的网络链接数
netstat_tcp_fin_wait1: FIN_WAIT1状态的网络链接数
netstat_tcp_fin_wait2: FIN_WAIT2状态的网络链接数
netstat_tcp_last_ack: LAST_ACK状态的网络链接数
netstat_tcp_listen: LISTEN状态的网络链接数
netstat_tcp_syn_recv: SYN_RECV状态的网络链接数
netstat_tcp_syn_sent: SYN_SENT状态的网络链接数
netstat_tcp_time_wait: TIME_WAIT状态的网络链接数
netstat_udp_socket: UDP状态的网络链接数
netstat_tcp_established: ESTABLISHED状态的网络链接数
netstat_tcp_fin_wait1: FIN_WAIT1状态的网络链接数
netstat_tcp_fin_wait2: FIN_WAIT2状态的网络链接数
netstat_tcp_last_ack: LAST_ACK状态的网络链接数
netstat_tcp_listen: LISTEN状态的网络链接数
netstat_tcp_syn_recv: SYN_RECV状态的网络链接数
netstat_tcp_syn_sent: SYN_SENT状态的网络链接数
netstat_tcp_time_wait: TIME_WAIT状态的网络链接数
netstat_udp_socket: UDP状态的网络链接数
processes_blocked: 不可中断的睡眠状态下的进程数('U','D','L')
processes_dead: 回收中的进程数('X')
processes_idle: 挂起的空闲进程数('I')
processes_paging: 分页进程数('P')
processes_running: 运行中的进程数('R')
processes_sleeping: 可中断进程数('S')
processes_stopped: 暂停状态进程数('T')
processes_total: 总进程数
processes_total_threads: 总线程数
processes_unknown: 未知状态进程数
processes_zombies: 僵尸态进程数('Z')
processes_blocked: 不可中断的睡眠状态下的进程数('U','D','L')
processes_dead: 回收中的进程数('X')
processes_idle: 挂起的空闲进程数('I')
processes_paging: 分页进程数('P')
processes_running: 运行中的进程数('R')
processes_sleeping: 可中断进程数('S')
processes_stopped: 暂停状态进程数('T')
processes_total: 总进程数
processes_total_threads: 总线程数
processes_unknown: 未知状态进程数
processes_zombies: 僵尸态进程数('Z')
swap_used_percent: Swap空间换出数据量
swap_used_percent: Swap空间换出数据量
system_load1: 1分钟平均load值
system_load5: 5分钟平均load值
system_load15: 15分钟平均load值
system_n_users: 用户数
system_n_cpus: CPU核数
system_uptime: 系统启动时间
system_load1: 1分钟平均load值
system_load5: 5分钟平均load值
system_load15: 15分钟平均load值
system_n_users: 用户数
system_n_cpus: CPU核数
system_uptime: 系统启动时间
nginx_accepts: 自nginx启动起,与客户端建立过得连接总数
nginx_active: 当前nginx正在处理的活动连接数,等于Reading/Writing/Waiting总和
nginx_handled: 自nginx启动起,处理过的客户端连接总数
nginx_reading: 正在读取HTTP请求头部的连接总数
nginx_requests: 自nginx启动起,处理过的客户端请求总数,由于存在HTTP Krrp-Alive请求,该值会大于handled值
nginx_upstream_check_fall: upstream_check模块检测到后端失败的次数
nginx_upstream_check_rise: upstream_check模块对后端的检测次数
nginx_upstream_check_status_code: 后端upstream的状态,up为1,down为0
nginx_waiting: 开启 keep-alive 的情况下,这个值等于 active (reading+writing), 意思就是 Nginx 已经处理完正在等候下一次请求指令的驻留连接
nginx_writing: 正在向客户端发送响应的连接总数
nginx_accepts: 自nginx启动起,与客户端建立过得连接总数
nginx_active: 当前nginx正在处理的活动连接数,等于Reading/Writing/Waiting总和
nginx_handled: 自nginx启动起,处理过的客户端连接总数
nginx_reading: 正在读取HTTP请求头部的连接总数
nginx_requests: 自nginx启动起,处理过的客户端请求总数,由于存在HTTP Keep-Alive请求,该值会大于handled值
nginx_upstream_check_fall: upstream_check模块检测到后端失败的次数
nginx_upstream_check_rise: upstream_check模块对后端的检测次数
nginx_upstream_check_status_code: 后端upstream的状态,up为1,down为0
nginx_waiting: 开启 keep-alive 的情况下,这个值等于 active (reading+writing), 意思就是 Nginx 已经处理完正在等候下一次请求指令的驻留连接
nginx_writing: 正在向客户端发送响应的连接总数
http_response_content_length: HTTP消息实体的传输长度
http_response_http_response_code: http响应状态码
http_response_response_time: http响应用时
http_response_result_code: url探测结果0为正常否则url无法访问
http_response_content_length: HTTP消息实体的传输长度
http_response_http_response_code: http响应状态码
http_response_response_time: http响应用时
http_response_result_code: url探测结果0为正常否则url无法访问
en:
cpu_usage_idle: "CPU idle rate(unit%)"
cpu_usage_active: "CPU usage rate(unit%)"
cpu_usage_system: "CPU kernel state time proportion(unit%)"
cpu_usage_user: "Proportion of CPU time spent in user mode (unit: %)"
cpu_usage_nice: "The proportion of low priority CPU time, that is, the process NICE value is adjusted to the CPU time between 1-19. Note here that the value range of NICE is -20 to 19, the larger the value, the lower the priority, the lower the priority(unit%)"
cpu_usage_iowait: "CPU waiting for I/O time proportion(unit%)"
cpu_usage_irq: "CPU processing hard interrupt time proportion(unit%)"
cpu_usage_softirq: "CPU processing soft interrupt time proportion(unit%)"
cpu_usage_steal: "In the virtual machine environment, there is this indicator, which means that the CPU is used by other virtual machines for the proportion of time.(unit%)"
cpu_usage_guest: "The time to run other operating systems by virtualization, that is, the proportion of CPU time running the virtual machine(unit%)"
cpu_usage_guest_nice: "The proportion of time to run the virtual machine at low priority(unit%)"
disk_free: "The remaining amount of the hard disk partition (unit: byte)"
disk_used: "Hard disk partitional use (unit: byte)"
disk_used_percent: "Hard disk partitional use rate (unit:%)"
disk_total: "Total amount of hard disk partition (unit: byte)"
disk_inodes_free: "Hard disk partition INODE remaining amount"
disk_inodes_used: "Hard disk partition INODE usage amount"
disk_inodes_total: "The total amount of hard disk partition INODE"
diskio_io_time: "From the perspective of the device perspective, the total time of I/O request, the I/O request in the queue is count (unit: millisecond), the counter type, you need to use the function to find the value"
diskio_iops_in_progress: "IO requests that have been assigned to device -driven and have not yet been completed, not included in the queue but not yet assigned to the device -driven IO request, Gauge type"
diskio_merged_reads: "The number of times of adjacent reading request Merge, the counter type"
diskio_merged_writes: "The number of times the request Merge writes, the counter type"
diskio_read_bytes: "The number of byte reads, the counter type, you need to use the function to find the Rate to use the value"
diskio_read_time: "The total time of reading request (unit: millisecond), the counter type, you need to use the function to find the Rate to have the value of use"
diskio_reads: "Read the number of requests, the counter type, you need to use the function to find the Rate to use the value"
diskio_weighted_io_time: "From the perspective of the I/O request perspective, I/O wait for the total time. If there are multiple I/O requests at the same time, the time will be superimposed (unit: millisecond)"
diskio_write_bytes: "The number of bytes written, the counter type, you need to use the function to find the Rate to use the value"
diskio_write_time: "The total time of the request (unit: millisecond), the counter type, you need to use the function to find the rate to have the value of use"
diskio_writes: "Write the number of requests, the counter type, you need to use the function to find the rate to use value"
kernel_boot_time: "Kernel startup time"
kernel_context_switches: "Number of kernel context switching times"
kernel_entropy_avail: "Entropy pool inside the Linux system"
kernel_interrupts: "Number of kernel interruption"
kernel_processes_forked: "ForK's process number"
mem_active: "The total number of memory (including Cache and BUFFER memory)"
mem_available: "Application can use memory numbers"
mem_available_percent: "Memory remaining percentage (0 ~ 100)"
mem_buffered: "Used to make buffer size for the file"
mem_cached: "The size of the memory used by the cache memory (equal to diskcache minus Swap Cache )"
mem_commit_limit: "According to the over allocation ratio ('vm.overCommit _ Ratio'), this is the current total memory that can be allocated on the system."
mem_committed_as: "Currently allocated on the system. It is the sum of the memory of all process applications"
mem_dirty: "Waiting to be written back to the memory size of the disk"
mem_free: "Free memory"
mem_high_free: "Unused high memory size"
mem_high_total: "The total memory size of the high memory (Highmem refers to all the physical memory that is higher than 860 MB of memory, the HighMem area is used for user programs, or for page cache. This area is not directly mapped to the kernel space. The kernels must use different methods to use this section of memory. )"
mem_huge_page_size: "The size of each big page"
mem_huge_pages_free: "The number of Huge Pages in the pool that have not been allocated"
mem_huge_pages_total: "Reserve the total number of Huge Pages"
mem_inactive: "Free memory (including the memory of free and avalible)"
mem_low_free: "Unused low size"
mem_low_total: "The total size of the low memory memory can achieve the same role of high memory, and it can be used by the kernel to record some of its own data structure"
mem_mapped: "The size of the mapping of equipment and files"
mem_page_tables: "The size of the index table of the management of the memory paging page"
mem_shared: "The total memory shared by multiple processes"
mem_slab: "The size of the kernel data structure cache can reduce the consumption of application and release memory"
mem_sreclaimable: "The size of the SLAB can be recovered"
mem_sunreclaim: "The size of the SLAB cannot be recovered(SUnreclaim+SReclaimableSlab)"
mem_swap_cached: "The size of the swap space used by the cache memory (cache memory), the memory that has been swapped out, but is still stored in the swapfile. Used to be quickly replaced when needed without opening the I/O port again"
mem_swap_free: "The size of the switching space is not used"
mem_swap_total: "The total size of the exchange space"
mem_total: "Total memory"
mem_used: "Used memory"
mem_used_percent: "Percentage of memory used (0 ~ 100)"
mem_vmalloc_chunk: "The largest continuous unused vmalloc area"
mem_vmalloc_totalL: "You can vmalloc virtual memory size"
mem_vmalloc_used: "Vmalloc's virtual memory size"
mem_write_back: "The memory size of the disk is being written back to the disk"
mem_write_back_tmp: "Fuse is used to temporarily write back the memory of the buffer area"
net_bytes_recv: "Total bytes received by the network interface"
net_bytes_sent: "Total bytes sent by the network interface"
net_drop_in: "Number of incoming packets dropped by the network interface"
net_drop_out: "Number of outgoing packets dropped by the network interface"
net_err_in: "Number of receive errors on the network interface"
net_err_out: "Number of transmit errors on the network interface"
net_packets_recv: "Number of packets received by the network interface"
net_packets_sent: "Number of packets sent by the network interface"
netstat_tcp_established: "ESTABLISHED status network link number"
netstat_tcp_fin_wait1: "FIN _ WAIT1 status network link number"
netstat_tcp_fin_wait2: "FIN _ WAIT2 status number of network links"
netstat_tcp_last_ack: "LAST_ ACK status number of network links"
netstat_tcp_listen: "Number of network links in Listen status"
netstat_tcp_syn_recv: "SYN _ RECV status number of network links"
netstat_tcp_syn_sent: "SYN _ SENT status number of network links"
netstat_tcp_time_wait: "Time _ WAIT status network link number"
netstat_udp_socket: "Number of network links in UDP status"
processes_blocked: "Number of processes in uninterruptible sleep('U','D','L')"
processes_dead: "Number of processes in recycling('X')"
processes_idle: "Number of idle processes hanging('I')"
processes_paging: "Number of paging processes('P')"
processes_running: "Number of processes during operation('R')"
processes_sleeping: "Can interrupt the number of processes('S')"
processes_stopped: "Pushing status process number('T')"
processes_total: "Total process number"
processes_total_threads: "Number of threads"
processes_unknown: "Unknown status process number"
processes_zombies: "Number of zombies('Z')"
swap_used_percent: "SWAP space replace the data volume"
system_load1: "1 minute average load value"
system_load5: "5 minutes average load value"
system_load15: "15 minutes average load value"
system_n_users: "User number"
system_n_cpus: "Number of CPU cores"
system_uptime: "System startup time"
nginx_accepts: "Since Nginx started, the total number of connections has been established with the client"
nginx_active: "The current number of activity connections that Nginx is being processed is equal to Reading/Writing/Waiting"
nginx_handled: "Starting from Nginx, the total number of client connections that have been processed"
nginx_reading: "Reading the total number of connections on the http request header"
nginx_requests: "Total number of client requests processed since Nginx started; because of HTTP Keep-Alive requests, this value can be greater than the handled value"
nginx_upstream_check_fall: "UPStream_CHECK module detects the number of back -end failures"
nginx_upstream_check_rise: "UPSTREAM _ Check module to detect the number of back -end"
nginx_upstream_check_status_code: "Status of the backend upstream: 1 for up, 0 for down"
nginx_waiting: "When keep-alive is enabled, this value is equal to active (reading+writing), which means that Nginx has processed the resident connection that is waiting for the next request command"
nginx_writing: "The total number of connections to send a response to the client"
http_response_content_length: "HTTP message entity transmission length"
http_response_http_response_code: "http response status code"
http_response_response_time: "HTTP response time"
http_response_result_code: "URL detection result 0 is normal, otherwise the URL cannot be accessed"
# [mysqld_exporter]
mysql_global_status_uptime: The number of seconds that the server has been up.(Gauge)
@@ -370,8 +501,6 @@ node_load15: cpu load 15m
# MEM
# 内核态
# 用户追踪已从交换区获取但尚未修改的页面的内存
node_memory_SwapCached_bytes: Memory that keeps track of pages that have been fetched from swap but not yet been modified
# 内核用于缓存数据结构供自己使用的内存
node_memory_Slab_bytes: Memory used by the kernel to cache data structures for its own use
# slab中可回收的部分
@@ -433,7 +562,7 @@ node_memory_SwapTotal_bytes: Memory information field SwapTotal_bytes
node_memory_SwapFree_bytes: Memory information field SwapFree_bytes
# DISK
node_filesystem_files_free: Filesystem space available to non-root users in byte
node_filesystem_avail_bytes: Filesystem space available to non-root users in byte
node_filesystem_free_bytes: Filesystem free space in bytes
node_filesystem_size_bytes: Filesystem size in bytes
node_filesystem_files_free: Filesystem total free file nodes

View File

@@ -7,13 +7,6 @@ import (
"github.com/tidwall/gjson"
)
// inter is the method set a notifier implements so that the caller can use it
// for alert notification.
type inter interface {
// Descript returns a string; presumably a human-readable description of
// the notifier — TODO confirm against the caller.
Descript() string
// Notify receives a raw byte payload; its schema is not visible here.
Notify([]byte)
// NotifyMaintainer receives a raw byte payload; presumably the variant
// addressed to maintainers — TODO confirm against the caller.
NotifyMaintainer([]byte)
}
// N9E complete
type N9EPlugin struct {
Name string
@@ -41,13 +34,13 @@ func (n *N9EPlugin) Notify(bs []byte) {
// NotifyMaintainer writes the payload bs to standard output as text, framed by
// a "begin" and an "end" marker line.
func (n *N9EPlugin) NotifyMaintainer(bs []byte) {
	fmt.Println("do something... begin")
	// fmt.Println does not interpret format verbs, so the former
	// fmt.Println("%T", result) printed a literal "%T" followed by a duplicate
	// of the payload (and is reported by go vet). Print the payload once.
	result := string(bs)
	fmt.Println(result)
	fmt.Println("do something... end")
}
// N9eCaller is the plugin value that is loaded for alertingCall. Its first
// letter must be upper-case so that the symbol is exported.
//
// Each field is set exactly once: a struct literal that names the same field
// twice does not compile.
var N9eCaller = N9EPlugin{
	Name:        "N9EPlugin",
	Description: "Notify by lib",
	// BuildAt is the local time at which this package-level value is initialized.
	BuildAt: time.Now().Local().Format("2006/01/02 15:04:05"),
}

View File

@@ -0,0 +1,193 @@
import json
import yaml
'''
将prometheus/vmalert的rule转换为n9e中的rule
支持k8s的rule configmap
'''
rule_file = 'rules.yaml'
def convert_interval(interval):
    """Convert a Prometheus-style duration into whole seconds.

    Accepted inputs:
      * a number (``30``) or numeric string (``'30'``), taken as seconds;
      * one or more ``<digits><unit>`` groups, e.g. ``'15s'``, ``'5m'``,
        ``'1h30m'``. Units are ms, s, m, h, d, w (7 days) and y (365 days),
        matched case-insensitively.

    Returns:
        int: the duration in seconds; a sub-second remainder is truncated.

    Raises:
        ValueError: if the text is neither a number nor a valid duration.
    """
    import re  # local import so the block does not depend on file-level imports

    if isinstance(interval, (int, float)):
        # YAML parses an unquoted ``interval: 30`` as a number, not a string.
        return int(interval)

    text = str(interval).strip().lower()
    if not re.match(r'^(?:\d+(?:ms|[smhdwy]))+$', text):
        # Not a unit-suffixed duration: treat as plain seconds.
        # int() raises ValueError for anything that is not a number.
        return int(text)

    unit_ms = {
        'ms': 1,
        's': 1000,
        'm': 60 * 1000,
        'h': 60 * 60 * 1000,
        'd': 24 * 60 * 60 * 1000,
        'w': 7 * 24 * 60 * 60 * 1000,
        'y': 365 * 24 * 60 * 60 * 1000,
    }
    # Sum in milliseconds so that 'ms' groups combine exactly with larger units.
    total_ms = 0
    for amount, unit in re.findall(r'(\d+)(ms|[smhdwy])', text):
        total_ms += int(amount) * unit_ms[unit]
    return total_ms // 1000
def convert_alert(rule, interval):
    """Build an n9e alert-rule dict from one Prometheus/vmalert alerting rule.

    Args:
        rule: mapping for a single rule. ``alert`` and ``expr`` are required;
            ``for``, ``annotations`` and ``labels`` are optional.
        interval: evaluation interval of the enclosing group, handed to
            ``convert_interval``.

    Returns:
        dict: the n9e alert rule, with fixed defaults for the fields that have
        no counterpart in the source rule.
    """
    alert_name = rule['alert']
    expression = rule['expr']
    for_seconds = convert_interval(rule['for']) if 'for' in rule else 0
    eval_seconds = convert_interval(interval)

    # Only the first annotation value is kept as the note; the rest are dropped.
    note_text = ''
    if 'annotations' in rule:
        note_text = next(iter(rule['annotations'].values()), '')

    tags = []
    level = 2  # used when there is no 'critical' or 'info' severity label
    if 'labels' in rule:
        for label_key, label_value in rule['labels'].items():
            if label_key == 'severity':
                # The severity label sets the level and is not copied to the tags.
                if label_value == 'critical':
                    level = 1
                elif label_value == 'info':
                    level = 3
            else:
                tags.append('{}={}'.format(label_key, label_value))

    return {
        "name": alert_name,
        "note": note_text,
        "severity": level,
        "disabled": 0,
        "prom_for_duration": for_seconds,
        "prom_ql": expression,
        "prom_eval_interval": eval_seconds,
        "enable_stime": "00:00",
        "enable_etime": "23:59",
        "enable_days_of_week": ["1", "2", "3", "4", "5", "6", "0"],
        "enable_in_bg": 0,
        "notify_recovered": 1,
        "notify_channels": [],
        "notify_repeat_step": 60,
        "recover_duration": 0,
        "callbacks": [],
        "runbook_url": "",
        "append_tags": tags
    }
def convert_record(rule, interval):
    """Build an n9e recording-rule dict from one Prometheus/vmalert rule.

    Args:
        rule: mapping for a single rule. ``record`` and ``expr`` are required;
            ``labels`` is optional.
        interval: evaluation interval of the enclosing group, handed to
            ``convert_interval``.

    Returns:
        dict: the n9e recording rule; every label becomes a ``key=value`` tag
        and the note is always empty.
    """
    record_name = rule['record']
    expression = rule['expr']
    eval_seconds = convert_interval(interval)

    tags = []
    if 'labels' in rule:
        tags = ['{}={}'.format(key, value) for key, value in rule['labels'].items()]

    return {
        "name": record_name,
        "note": '',
        "disabled": 0,
        "prom_ql": expression,
        "prom_eval_interval": eval_seconds,
        "append_tags": tags
    }
'''
example of rule group file
---
groups:
- name: example
rules:
- alert: HighRequestLatency
expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5
for: 10m
labels:
severity: page
annotations:
summary: High request latency
'''
def deal_group(group):
    """
    Convert every rule of a parsed prometheus/vmalert rule file.

    ``group`` is the parsed document: a mapping whose ``groups`` entry lists
    rule groups, each with a ``rules`` list and an optional ``interval``
    (``'15s'`` is used when it is absent).

    Returns a pair ``(alert_rules, record_rules)`` of lists of n9e rule dicts.
    """
    alerts = []
    records = []
    for segment in group['groups']:
        segment_interval = segment.get('interval', '15s')
        for entry in segment['rules']:
            # A rule without an 'alert' key is treated as a recording rule.
            if 'alert' not in entry:
                records.append(convert_record(entry, segment_interval))
            else:
                alerts.append(convert_alert(entry, segment_interval))
    return alerts, records
'''
example of k8s rule configmap
---
apiVersion: v1
kind: ConfigMap
metadata:
name: rulefiles-0
data:
etcdrules.yaml: |
groups:
- name: etcd
rules:
- alert: etcdInsufficientMembers
annotations:
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value}}).'
expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"})
by (job) + 1) / 2)
for: 3m
labels:
severity: critical
'''
def deal_configmap(rule_configmap):
    """
    Convert the rules held in a parsed k8s rule ConfigMap.

    Each value under ``data`` is a YAML document in rule-file format; it is
    parsed and passed to ``deal_group``, and the results are concatenated.

    Returns a pair ``(alert_rules, record_rules)`` of lists of n9e rule dicts.
    """
    collected_alerts = []
    collected_records = []
    for embedded_yaml in rule_configmap['data'].values():
        # NOTE(review): FullLoader is kept as in the original; consider
        # yaml.safe_load if the ConfigMap can come from an untrusted source.
        parsed_group = yaml.load(embedded_yaml, Loader=yaml.FullLoader)
        group_alerts, group_records = deal_group(parsed_group)
        collected_alerts += group_alerts
        collected_records += group_records
    return collected_alerts, collected_records
def main():
    """
    Read the rule file named by the module-level rule_file, convert it and
    write the results to alert-rules.json and record-rules.json in the
    current directory.
    """
    with open(rule_file, 'r') as f:
        rule_config = yaml.load(f, Loader=yaml.FullLoader)

    # If the file is a Kubernetes ConfigMap, use this call instead:
    # alert_rules, record_rules = deal_configmap(rule_config)
    alert_rules, record_rules = deal_group(rule_config)

    outputs = (
        ("alert-rules.json", alert_rules),
        ("record-rules.json", record_rules),
    )
    for out_path, rules in outputs:
        with open(out_path, 'w') as fw:
            json.dump(rules, fw, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    main()

View File

@@ -190,6 +190,12 @@ KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
# [[Writers.WriteRelabels]]
# Action = "replace"
# SourceLabels = ["__address__"]
# Regex = "([^:]+)(?::\\d+)?"
# Replacement = "$1:80"
# TargetLabel = "__address__"
# [[Writers]]
# Url = "http://127.0.0.1:7201/api/v1/prom/remote/write"

26
etc/template/README.md Normal file
View File

@@ -0,0 +1,26 @@
# 告警消息模版文件
模版中可以使用的变量参考`AlertCurEvent`对象
模版语法如何使用可以参考[html/template](https://pkg.go.dev/html/template)
## 如何在告警模版中添加监控详情url
假设web的地址是http://127.0.0.1:18000/, 实际使用时用web地址替换该地址
在告警模版中添加以下行:
* dingtalk / wecom / feishu
```markdown
[监控详情](http://127.0.0.1:18000/metric/explorer?promql={{ .PromQl | escape }})
```
* mailbody
```html
<tr>
<th>监控详情:</th>
<td>
<a href="http://127.0.0.1:18000/metric/explorer?promql={{ .PromQl | escape }}" target="_blank">点击查看</a>
</td>
</tr>
```

View File

@@ -4,6 +4,9 @@ RunMode = "release"
# # custom i18n dict config
# I18N = "./etc/i18n.json"
# # custom i18n request header key
# I18NHeaderKey = "X-Language"
# metrics descriptions
MetricsYamlFile = "./etc/metrics.yaml"
@@ -198,4 +201,10 @@ Address = "http://127.0.0.1:10090"
BasicAuthUser = "ibex"
BasicAuthPass = "ibex"
# unit: ms
Timeout = 3000
Timeout = 3000
[TargetMetrics]
TargetUp = '''max(max_over_time(target_up{ident=~"(%s)"}[%dm])) by (ident)'''
LoadPerCore = '''max(max_over_time(system_load_norm_1{ident=~"(%s)"}[%dm])) by (ident)'''
MemUtil = '''100-max(max_over_time(mem_available_percent{ident=~"(%s)"}[%dm])) by (ident)'''
DiskUtil = '''max(max_over_time(disk_used_percent{ident=~"(%s)", path="/"}[%dm])) by (ident)'''

View File

@@ -46,6 +46,7 @@ type AlertCurEvent struct {
LastEvalTime int64 `json:"last_eval_time" gorm:"-"` // for notify.py 上次计算的时间
LastSentTime int64 `json:"last_sent_time" gorm:"-"` // 上次发送时间
NotifyCurNumber int `json:"notify_cur_number"` // notify: current number
FirstTriggerTime int64 `json:"first_trigger_time"` // 连续告警的首次告警时间
}
func (e *AlertCurEvent) TableName() string {
@@ -180,6 +181,7 @@ func (e *AlertCurEvent) ToHis() *AlertHisEvent {
RecoverTime: recoverTime,
LastEvalTime: e.LastEvalTime,
NotifyCurNumber: e.NotifyCurNumber,
FirstTriggerTime: e.FirstTriggerTime,
}
}

View File

@@ -38,7 +38,8 @@ type AlertHisEvent struct {
LastEvalTime int64 `json:"last_eval_time"`
Tags string `json:"-"`
TagsJSON []string `json:"tags" gorm:"-"`
NotifyCurNumber int `json:"notify_cur_number"` // notify: current number
NotifyCurNumber int `json:"notify_cur_number"` // notify: current number
FirstTriggerTime int64 `json:"first_trigger_time"` // 连续告警的首次告警时间
}
func (e *AlertHisEvent) TableName() string {

View File

@@ -13,10 +13,10 @@ import (
type TagFilter struct {
Key string `json:"key"` // tag key
Func string `json:"func"` // == | =~ | in
Func string `json:"func"` // `==` | `=~` | `in` | `!=` | `!~` | `not in`
Value string `json:"value"` // tag value
Regexp *regexp.Regexp // parse value to regexp if func = '=~'
Vset map[string]struct{} // parse value to regexp if func = 'in'
Regexp *regexp.Regexp // parse value to regexp if func = '=~' or '!~'
Vset map[string]struct{} // parse value to regexp if func = 'in' or 'not in'
}
type AlertMute struct {

View File

@@ -88,12 +88,12 @@ func (s *AlertSubscribe) Parse() error {
}
for i := 0; i < len(s.ITags); i++ {
if s.ITags[i].Func == "=~" {
if s.ITags[i].Func == "=~" || s.ITags[i].Func == "!~" {
s.ITags[i].Regexp, err = regexp.Compile(s.ITags[i].Value)
if err != nil {
return err
}
} else if s.ITags[i].Func == "in" {
} else if s.ITags[i].Func == "in" || s.ITags[i].Func == "not in" {
arr := strings.Fields(s.ITags[i].Value)
s.ITags[i].Vset = make(map[string]struct{})
for j := 0; j < len(arr); j++ {

View File

@@ -71,6 +71,20 @@ func (b *Board) Del() error {
})
}
// BoardGetByID looks a board up by its primary key.
// It returns (nil, nil) when no row has that id, and (nil, err) when the
// query itself fails.
func BoardGetByID(id int64) (*Board, error) {
	var boards []*Board
	if err := DB().Where("id = ?", id).Find(&boards).Error; err != nil {
		return nil, err
	}

	if len(boards) > 0 {
		return boards[0], nil
	}
	return nil, nil
}
// BoardGet for detail page
func BoardGet(where string, args ...interface{}) (*Board, error) {
var lst []*Board

View File

@@ -83,14 +83,15 @@ func (re *RecordingRule) Add() error {
return err
}
exists, err := RecordingRuleExists(0, re.GroupId, re.Cluster, re.Name)
if err != nil {
return err
}
if exists {
return errors.New("RecordingRule already exists")
}
// 由于实际场景中会出现name重复的recording rule所以不需要检查重复
//exists, err := RecordingRuleExists(0, re.GroupId, re.Cluster, re.Name)
//if err != nil {
// return err
//}
//
//if exists {
// return errors.New("RecordingRule already exists")
//}
now := time.Now().Unix()
re.CreateAt = now
@@ -100,15 +101,16 @@ func (re *RecordingRule) Add() error {
}
func (re *RecordingRule) Update(ref RecordingRule) error {
if re.Name != ref.Name {
exists, err := RecordingRuleExists(re.Id, re.GroupId, re.Cluster, ref.Name)
if err != nil {
return err
}
if exists {
return errors.New("RecordingRule already exists")
}
}
// 由于实际场景中会出现name重复的recording rule所以不需要检查重复
//if re.Name != ref.Name {
// exists, err := RecordingRuleExists(re.Id, re.GroupId, re.Cluster, ref.Name)
// if err != nil {
// return err
// }
// if exists {
// return errors.New("RecordingRule already exists")
// }
//}
ref.FE2DB()
ref.Id = re.Id

198
src/models/relabel.go Normal file
View File

@@ -0,0 +1,198 @@
package models
import (
"crypto/md5"
"fmt"
"regexp"
"sort"
"strings"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/prompb"
)
// Relabel actions handled by relabel. "Joined source value" below means the
// values of RelabelConfig.SourceLabels joined with RelabelConfig.Separator.
const (
	// Replace sets the target label to the expanded Replacement when Regex
	// matches the joined source value.
	Replace Action = "replace"
	// Keep drops the series unless Regex matches the joined source value.
	Keep Action = "keep"
	// Drop drops the series when Regex matches the joined source value.
	Drop Action = "drop"
	// HashMod sets TargetLabel to a hash of the joined source value modulo
	// Modulus.
	HashMod Action = "hashmod"
	// LabelMap copies the value of every label whose name matches Regex to a
	// label named by applying Replacement to that name.
	LabelMap Action = "labelmap"
	// LabelDrop removes every label whose name matches Regex.
	LabelDrop Action = "labeldrop"
	// LabelKeep removes every label whose name does not match Regex.
	LabelKeep Action = "labelkeep"
	// Lowercase sets TargetLabel to the lower-cased joined source value.
	Lowercase Action = "lowercase"
	// Uppercase sets TargetLabel to the upper-cased joined source value.
	Uppercase Action = "uppercase"
)

// Action names the operation a RelabelConfig performs.
type Action string
// Regexp wraps a compiled *regexp.Regexp so its methods can be called
// directly on the wrapper. Values are normally created with NewRegexp or
// MustNewRegexp.
type Regexp struct {
	*regexp.Regexp
}
// RelabelConfig describes one relabel rule applied to a label set.
type RelabelConfig struct {
	// SourceLabels are the labels whose values are joined and matched.
	SourceLabels model.LabelNames
	// Separator is placed between the source label values when joining.
	Separator string
	// Regex is declared as interface{}, but relabel type-asserts it to
	// Regexp, so it must hold a Regexp value by the time the rule is applied.
	Regex interface{}
	// Modulus is the divisor used by the HashMod action.
	Modulus uint64
	// TargetLabel is the label written by Replace, HashMod, Lowercase and
	// Uppercase.
	TargetLabel string
	// Replacement is the expansion template used by Replace and LabelMap.
	Replacement string
	// Action selects which operation the rule performs.
	Action Action
}
// Process runs the relabel rules over labels one after another, feeding the
// output of each rule into the next. It returns nil as soon as a rule drops
// the series; otherwise it returns the final label set.
func Process(labels []*prompb.Label, cfgs ...*RelabelConfig) []*prompb.Label {
	for i := 0; i < len(cfgs); i++ {
		if labels = relabel(labels, cfgs[i]); labels == nil {
			return nil
		}
	}
	return labels
}
// getValue returns the value of the first label in ls called name, or the
// empty string when there is no such label.
func getValue(ls []*prompb.Label, name model.LabelName) string {
	for i := range ls {
		if ls[i].Name == string(name) {
			return ls[i].Value
		}
	}
	return ""
}
// LabelBuilder is a mutable label set used while applying a relabel rule.
type LabelBuilder struct {
	// LabelSet maps label name to label value.
	LabelSet map[string]string
}
// newBuilder creates a LabelBuilder seeded with ls. When a name occurs more
// than once in ls, the last occurrence wins.
func newBuilder(ls []*prompb.Label) *LabelBuilder {
	b := &LabelBuilder{LabelSet: make(map[string]string, len(ls))}
	for i := range ls {
		b.LabelSet[ls[i].Name] = ls[i].Value
	}
	return b
}
// set stores v under label k and returns the builder for chaining. An empty
// value removes the label instead of storing it.
func (l *LabelBuilder) set(k, v string) *LabelBuilder {
	if v != "" {
		l.LabelSet[k] = v
		return l
	}
	return l.del(k)
}
// del removes the named labels from the builder and returns the builder for
// chaining. Names that are not present are ignored.
func (l *LabelBuilder) del(ns ...string) *LabelBuilder {
	for i := 0; i < len(ns); i++ {
		delete(l.LabelSet, ns[i])
	}
	return l
}
// labels returns the builder's content as a newly allocated slice sorted by
// label name in ascending order, the order the Prometheus remote-write
// protocol requires. The result is non-nil even when the builder is empty.
func (l *LabelBuilder) labels() []*prompb.Label {
	ls := make([]*prompb.Label, 0, len(l.LabelSet))
	for k, v := range l.LabelSet {
		ls = append(ls, &prompb.Label{
			Name:  k,
			Value: v,
		})
	}

	// Map iteration order is random, so sort to get a deterministic,
	// lexicographically ordered label set.
	sort.Slice(ls, func(i, j int) bool {
		return ls[i].Name < ls[j].Name
	})
	return ls
}
// relabel applies a single relabel rule to lset and returns the resulting
// label set, or nil when the rule drops the series (Drop on a match, Keep on
// a mismatch).
//
// The values of cfg.SourceLabels are looked up in lset (a missing label
// contributes ""), joined with cfg.Separator and matched against cfg.Regex.
// Whenever the series is not dropped, the result is rebuilt from a
// LabelBuilder, so a new slice is returned even if the rule changed nothing.
//
// NOTE(review): cfg.Regex must already hold a Regexp value; the unchecked
// type assertion below panics for any other dynamic type, including nil.
// NOTE(review): a HashMod rule with Modulus == 0 panics with an integer
// divide by zero; nothing in this function validates Modulus.
// An unknown cfg.Action also panics.
func relabel(lset []*prompb.Label, cfg *RelabelConfig) []*prompb.Label {
	// Collect the source label values in the order they are configured.
	values := make([]string, 0, len(cfg.SourceLabels))
	for _, ln := range cfg.SourceLabels {
		values = append(values, getValue(lset, ln))
	}

	regx := cfg.Regex.(Regexp)

	val := strings.Join(values, cfg.Separator)
	lb := newBuilder(lset)

	switch cfg.Action {
	case Drop:
		if regx.MatchString(val) {
			return nil
		}
	case Keep:
		if !regx.MatchString(val) {
			return nil
		}
	case Replace:
		indexes := regx.FindStringSubmatchIndex(val)
		// No match: leave the label set untouched.
		if indexes == nil {
			break
		}
		// The target label name may itself reference capture groups.
		target := model.LabelName(regx.ExpandString([]byte{}, cfg.TargetLabel, val, indexes))
		if !target.IsValid() {
			lb.del(cfg.TargetLabel)
			break
		}
		res := regx.ExpandString([]byte{}, cfg.Replacement, val, indexes)
		// An empty replacement removes the target label.
		if len(res) == 0 {
			lb.del(cfg.TargetLabel)
			break
		}
		lb.set(string(target), string(res))
	case Lowercase:
		lb.set(cfg.TargetLabel, strings.ToLower(val))
	case Uppercase:
		lb.set(cfg.TargetLabel, strings.ToUpper(val))
	case HashMod:
		// Hash the joined value with MD5 and reduce it modulo cfg.Modulus.
		mod := sum64(md5.Sum([]byte(val))) % cfg.Modulus
		lb.set(cfg.TargetLabel, fmt.Sprintf("%d", mod))
	case LabelMap:
		// Iterate over the input labels while writing to the builder, so
		// labels added here are not themselves re-mapped.
		for _, l := range lset {
			if regx.MatchString(l.Name) {
				res := regx.ReplaceAllString(l.Name, cfg.Replacement)
				lb.set(res, l.Value)
			}
		}
	case LabelDrop:
		for _, l := range lset {
			if regx.MatchString(l.Name) {
				lb.del(l.Name)
			}
		}
	case LabelKeep:
		for _, l := range lset {
			if !regx.MatchString(l.Name) {
				lb.del(l.Name)
			}
		}
	default:
		panic(fmt.Errorf("relabel: unknown relabel action type %q", cfg.Action))
	}

	return lb.labels()
}
// sum64 folds an MD5 digest into a uint64 by reading its last eight bytes as
// a big-endian integer. The first eight bytes of the digest do not contribute
// to the result.
func sum64(hash [md5.Size]byte) uint64 {
	var folded uint64
	for _, b := range hash[md5.Size-8:] {
		folded = folded<<8 | uint64(b)
	}
	return folded
}
// NewRegexp compiles s as a fully anchored expression by wrapping it in
// "^(?:" and ")$", so it only matches when it covers the whole input.
// On a compile error the returned Regexp holds a nil *regexp.Regexp.
func NewRegexp(s string) (Regexp, error) {
	compiled, err := regexp.Compile("^(?:" + s + ")$")
	if err != nil {
		return Regexp{}, err
	}
	return Regexp{Regexp: compiled}, nil
}
// MustNewRegexp is like NewRegexp but panics when s does not compile.
func MustNewRegexp(s string) Regexp {
	compiled, err := NewRegexp(s)
	if err == nil {
		return compiled
	}
	panic(err)
}

View File

@@ -20,6 +20,11 @@ type Target struct {
TagsJSON []string `json:"tags" gorm:"-"`
TagsMap map[string]string `json:"-" gorm:"-"` // internal use, append tags to series
UpdateAt int64 `json:"update_at"`
TargetUp float64 `json:"target_up" gorm:"-"`
LoadPerCore float64 `json:"load_per_core" gorm:"-"`
MemUtil float64 `json:"mem_util" gorm:"-"`
DiskUtil float64 `json:"disk_util" gorm:"-"`
}
func (t *Target) TableName() string {

View File

@@ -450,6 +450,21 @@ func (u *User) BusiGroups(limit int, query string, all ...bool) ([]BusiGroup, er
var lst []BusiGroup
if u.IsAdmin() || (len(all) > 0 && all[0]) {
err := session.Where("name like ?", "%"+query+"%").Find(&lst).Error
if err != nil {
return lst, err
}
if len(lst) == 0 && len(query) > 0 {
// 隐藏功能一般人不告诉哈哈。query可能是给的ident所以上面的sql没有查到当做ident来查一下试试
var t *Target
t, err = TargetGet("ident=?", query)
if err != nil {
return lst, err
}
err = DB().Order("name").Limit(limit).Where("id=?", t.GroupId).Find(&lst).Error
}
return lst, err
}
@@ -468,6 +483,22 @@ func (u *User) BusiGroups(limit int, query string, all ...bool) ([]BusiGroup, er
}
err = session.Where("id in ?", busiGroupIds).Where("name like ?", "%"+query+"%").Find(&lst).Error
if err != nil {
return nil, err
}
if len(lst) == 0 && len(query) > 0 {
var t *Target
t, err = TargetGet("ident=?", query)
if err != nil {
return lst, err
}
if slice.ContainsInt64(busiGroupIds, t.GroupId) {
err = DB().Order("name").Limit(limit).Where("id=?", t.GroupId).Find(&lst).Error
}
}
return lst, err
}

9
src/notifier/notifier.go Normal file
View File

@@ -0,0 +1,9 @@
package notifier
// Notifier is the interface a notify plugin symbol has to implement.
type Notifier interface {
	// Descript presumably returns a human-readable description of the
	// plugin — confirm with plugin authors.
	Descript() string
	// Notify hands the raw notification payload to the plugin.
	Notify([]byte)
	// NotifyMaintainer hands the plugin a raw payload addressed to the
	// maintainers.
	NotifyMaintainer([]byte)
}

// Instance is the process-wide Notifier. Its zero value is nil; it has to be
// assigned before any of its methods are called.
var Instance Notifier

View File

@@ -6,9 +6,11 @@ import (
"io/ioutil"
"net/http"
"time"
"github.com/toolkits/pkg/logger"
)
func PostJSON(url string, timeout time.Duration, v interface{}) (response []byte, code int, err error) {
func PostJSON(url string, timeout time.Duration, v interface{}, retries ...int) (response []byte, code int, err error) {
var bs []byte
bs, err = json.Marshal(v)
@@ -26,7 +28,29 @@ func PostJSON(url string, timeout time.Duration, v interface{}) (response []byte
req.Header.Set("Content-Type", "application/json")
var resp *http.Response
resp, err = client.Do(req)
if len(retries) > 0 {
for i := 0; i < retries[0]; i++ {
resp, err = client.Do(req)
if err == nil {
break
}
tryagain := ""
if i+1 < retries[0] {
tryagain = " try again"
}
logger.Warningf("failed to curl %s error: %s"+tryagain, url, err)
if i+1 < retries[0] {
time.Sleep(time.Millisecond * 200)
}
}
} else {
resp, err = client.Do(req)
}
if err != nil {
return
}

View File

@@ -12,25 +12,26 @@ import (
// ClientConfig represents the standard client TLS config.
type ClientConfig struct {
TLSCA string
TLSCert string
TLSKey string
TLSKeyPwd string
InsecureSkipVerify bool
ServerName string
TLSMinVersion string
TLSCA string `toml:"tls_ca"`
TLSCert string `toml:"tls_cert"`
TLSKey string `toml:"tls_key"`
TLSKeyPwd string `toml:"tls_key_pwd"`
InsecureSkipVerify bool `toml:"insecure_skip_verify"`
ServerName string `toml:"tls_server_name"`
TLSMinVersion string `toml:"tls_min_version"`
TLSMaxVersion string `toml:"tls_max_version"`
}
// ServerConfig represents the standard server TLS config.
type ServerConfig struct {
TLSCert string
TLSKey string
TLSKeyPwd string
TLSAllowedCACerts []string
TLSCipherSuites []string
TLSMinVersion string
TLSMaxVersion string
TLSAllowedDNSNames []string
TLSCert string `toml:"tls_cert"`
TLSKey string `toml:"tls_key"`
TLSKeyPwd string `toml:"tls_key_pwd"`
TLSAllowedCACerts []string `toml:"tls_allowed_cacerts"`
TLSCipherSuites []string `toml:"tls_cipher_suites"`
TLSMinVersion string `toml:"tls_min_version"`
TLSMaxVersion string `toml:"tls_max_version"`
TLSAllowedDNSNames []string `toml:"tls_allowed_dns_names"`
}
// TLSConfig returns a tls.Config, may be nil without error if TLS is not
@@ -70,6 +71,16 @@ func (c *ClientConfig) TLSConfig() (*tls.Config, error) {
tlsConfig.MinVersion = tls.VersionTLS13
}
if c.TLSMaxVersion == "1.0" {
tlsConfig.MaxVersion = tls.VersionTLS10
} else if c.TLSMaxVersion == "1.1" {
tlsConfig.MaxVersion = tls.VersionTLS11
} else if c.TLSMaxVersion == "1.2" {
tlsConfig.MaxVersion = tls.VersionTLS12
} else if c.TLSMaxVersion == "1.3" {
tlsConfig.MaxVersion = tls.VersionTLS13
}
return tlsConfig, nil
}

View File

@@ -2,11 +2,13 @@ package tplx
import (
"html/template"
"net/url"
"regexp"
"strings"
)
var TemplateFuncMap = template.FuncMap{
"escape": url.PathEscape,
"unescaped": Unescaped,
"urlconvert": Urlconvert,
"timeformat": Timeformat,

View File

@@ -66,7 +66,7 @@ func SendDingtalk(message DingtalkMessage) {
}
}
res, code, err := poster.PostJSON(ur, time.Second*5, body)
res, code, err := poster.PostJSON(ur, time.Second*5, body, 3)
if err != nil {
logger.Errorf("dingtalk_sender: result=fail url=%s code=%d error=%v response=%s", ur, code, err, string(res))
} else {

View File

@@ -42,7 +42,7 @@ func SendFeishu(message FeishuMessage) {
},
}
res, code, err := poster.PostJSON(url, time.Second*5, body)
res, code, err := poster.PostJSON(url, time.Second*5, body, 3)
if err != nil {
logger.Errorf("feishu_sender: result=fail url=%s code=%d error=%v response=%s", url, code, err, string(res))
} else {

View File

@@ -31,7 +31,7 @@ func SendWecom(message WecomMessage) {
},
}
res, code, err := poster.PostJSON(url, time.Second*5, body)
res, code, err := poster.PostJSON(url, time.Second*5, body, 3)
if err != nil {
logger.Errorf("wecom_sender: result=fail url=%s code=%d error=%v response=%s", url, code, err, string(res))
} else {

View File

@@ -2,8 +2,11 @@ package config
import (
"fmt"
"log"
"net"
"os"
"plugin"
"runtime"
"strings"
"sync"
"time"
@@ -11,6 +14,8 @@ import (
"github.com/gin-gonic/gin"
"github.com/koding/multiconfig"
"github.com/didi/nightingale/v5/src/models"
"github.com/didi/nightingale/v5/src/notifier"
"github.com/didi/nightingale/v5/src/pkg/httpx"
"github.com/didi/nightingale/v5/src/pkg/logx"
"github.com/didi/nightingale/v5/src/pkg/ormx"
@@ -100,6 +105,33 @@ func MustLoad(fpaths ...string) {
}
}
if C.Alerting.CallPlugin.Enable {
if runtime.GOOS == "windows" {
fmt.Println("notify plugin on unsupported os:", runtime.GOOS)
os.Exit(1)
}
p, err := plugin.Open(C.Alerting.CallPlugin.PluginPath)
if err != nil {
fmt.Println("failed to load plugin:", err)
os.Exit(1)
}
caller, err := p.Lookup(C.Alerting.CallPlugin.Caller)
if err != nil {
fmt.Println("failed to lookup plugin Caller:", err)
os.Exit(1)
}
ins, ok := caller.(notifier.Notifier)
if !ok {
log.Println("notifier interface not implemented")
os.Exit(1)
}
notifier.Instance = ins
}
if C.WriterOpt.QueueMaxSize <= 0 {
C.WriterOpt.QueueMaxSize = 100000
}
@@ -112,6 +144,33 @@ func MustLoad(fpaths ...string) {
C.WriterOpt.QueueCount = 100
}
for _, write := range C.Writers {
for _, relabel := range write.WriteRelabels {
regex, ok := relabel.Regex.(string)
if !ok {
log.Println("Regex field must be a string")
os.Exit(1)
}
if regex == "" {
regex = "(.*)"
}
relabel.Regex = models.MustNewRegexp(regex)
if relabel.Separator == "" {
relabel.Separator = ";"
}
if relabel.Action == "" {
relabel.Action = "replace"
}
if relabel.Replacement == "" {
relabel.Replacement = "$1"
}
}
}
fmt.Println("heartbeat.ip:", C.Heartbeat.IP)
fmt.Printf("heartbeat.interval: %dms\n", C.Heartbeat.Interval)
})
@@ -175,6 +234,8 @@ type WriterOptions struct {
MaxIdleConnsPerHost int
Headers []string
WriteRelabels []*models.RelabelConfig
}
type WriterGlobalOpt struct {

View File

@@ -32,7 +32,7 @@ func callback(event *models.AlertCurEvent) {
url = "http://" + url
}
resp, code, err := poster.PostJSON(url, 5*time.Second, event)
resp, code, err := poster.PostJSON(url, 5*time.Second, event, 3)
if err != nil {
logger.Errorf("event_callback(rule_id=%d url=%s) fail, resp: %s, err: %v, code: %d", event.RuleId, url, string(resp), err, code)
} else {

View File

@@ -4,13 +4,15 @@ import (
"context"
"time"
"github.com/toolkits/pkg/logger"
"github.com/didi/nightingale/v5/src/server/common/sender"
"github.com/didi/nightingale/v5/src/server/config"
promstat "github.com/didi/nightingale/v5/src/server/stat"
)
func Start(ctx context.Context) error {
err := initTpls()
err := reloadTpls()
if err != nil {
return err
}
@@ -28,6 +30,13 @@ func Start(ctx context.Context) error {
return nil
}
// Reload re-reads the notification templates. A failure is logged and is not
// reported to the caller.
func Reload() {
	if err := reloadTpls(); err != nil {
		logger.Error("engine reload err:", err)
	}
}
func reportQueueSize() {
for {
time.Sleep(time.Second)

View File

@@ -6,7 +6,7 @@ import (
)
// 如果传入了clock这个可选参数就表示使用这个clock表示的时间否则就从event的字段中取TriggerTime
func isMuted(event *models.AlertCurEvent, clock ...int64) bool {
func IsMuted(event *models.AlertCurEvent, clock ...int64) bool {
mutes, has := memsto.AlertMuteCache.Gets(event.GroupId)
if !has || len(mutes) == 0 {
return false

View File

@@ -9,9 +9,8 @@ import (
"net/http"
"os/exec"
"path"
"plugin"
"runtime"
"strings"
"sync"
"time"
"github.com/pkg/errors"
@@ -22,6 +21,7 @@ import (
"github.com/toolkits/pkg/slice"
"github.com/didi/nightingale/v5/src/models"
"github.com/didi/nightingale/v5/src/notifier"
"github.com/didi/nightingale/v5/src/pkg/sys"
"github.com/didi/nightingale/v5/src/pkg/tplx"
"github.com/didi/nightingale/v5/src/server/common/sender"
@@ -30,9 +30,12 @@ import (
"github.com/didi/nightingale/v5/src/storage"
)
var tpls = make(map[string]*template.Template)
var (
tpls map[string]*template.Template
rwLock sync.RWMutex
)
func initTpls() error {
func reloadTpls() error {
if config.C.Alerting.TemplatesDir == "" {
config.C.Alerting.TemplatesDir = path.Join(runner.Cwd, "etc", "template")
}
@@ -57,6 +60,7 @@ func initTpls() error {
return errors.New("no tpl files under " + config.C.Alerting.TemplatesDir)
}
tmpTpls := make(map[string]*template.Template)
for i := 0; i < len(tplFiles); i++ {
tplpath := path.Join(config.C.Alerting.TemplatesDir, tplFiles[i])
@@ -65,9 +69,12 @@ func initTpls() error {
return errors.WithMessage(err, "failed to parse tpl: "+tplpath)
}
tpls[tplFiles[i]] = tpl
tmpTpls[tplFiles[i]] = tpl
}
rwLock.Lock()
tpls = tmpTpls
rwLock.Unlock()
return nil
}
@@ -79,6 +86,9 @@ type Notice struct {
func genNotice(event *models.AlertCurEvent) Notice {
// build notice body with templates
ntpls := make(map[string]string)
rwLock.RLock()
defer rwLock.RUnlock()
for filename, tpl := range tpls {
var body bytes.Buffer
if err := tpl.Execute(&body, event); err != nil {
@@ -103,7 +113,6 @@ func alertingRedisPub(bs []byte) {
func handleNotice(notice Notice, bs []byte) {
alertingCallScript(bs)
alertingCallPlugin(bs)
if len(config.C.Alerting.NotifyBuiltinChannels) == 0 {
@@ -398,12 +407,6 @@ func alertingCallScript(stdinBytes []byte) {
logger.Infof("event_notify: exec %s output: %s", fpath, buf.String())
}
type Notifier interface {
Descript() string
Notify([]byte)
NotifyMaintainer([]byte)
}
// call notify.so via golang plugin build
// e.g. etc/script/notify/notify.so
func alertingCallPlugin(stdinBytes []byte) {
@@ -411,26 +414,8 @@ func alertingCallPlugin(stdinBytes []byte) {
return
}
if runtime.GOOS == "windows" {
logger.Errorf("call notify plugin on unsupported os: %s", runtime.GOOS)
return
}
p, err := plugin.Open(config.C.Alerting.CallPlugin.PluginPath)
if err != nil {
logger.Errorf("failed to open notify plugin: %v", err)
return
}
caller, err := p.Lookup(config.C.Alerting.CallPlugin.Caller)
if err != nil {
logger.Errorf("failed to load caller: %v", err)
return
}
notifier, ok := caller.(Notifier)
if !ok {
logger.Errorf("notifier interface not implemented): %v", err)
return
}
notifier.Notify(stdinBytes)
logger.Debugf("alertingCallPlugin done. %s", notifier.Descript())
logger.Debugf("alertingCallPlugin begin")
logger.Debugf("payload:", string(stdinBytes))
notifier.Instance.Notify(stdinBytes)
logger.Debugf("alertingCallPlugin done")
}

View File

@@ -2,11 +2,10 @@ package engine
import (
"encoding/json"
"plugin"
"runtime"
"time"
"github.com/didi/nightingale/v5/src/models"
"github.com/didi/nightingale/v5/src/notifier"
"github.com/didi/nightingale/v5/src/server/common/sender"
"github.com/didi/nightingale/v5/src/server/config"
"github.com/didi/nightingale/v5/src/server/memsto"
@@ -14,72 +13,59 @@ import (
"github.com/toolkits/pkg/logger"
)
type NoticeMaintainer struct {
NotifyUsersObj []*models.User `json:"notify_user_obj" gorm:"-"`
Title string `json:"title"`
Content string `json:"content"`
type MaintainMessage struct {
Tos []*models.User `json:"tos"`
Title string `json:"title"`
Content string `json:"content"`
}
func noticeCallPlugin(stdinBytes []byte) {
func notifyMaintainerWithPlugin(e error, title, triggerTime string, users []*models.User) {
if !config.C.Alerting.CallPlugin.Enable {
return
}
if runtime.GOOS == "windows" {
logger.Errorf("call notify plugin on unsupported os: %s", runtime.GOOS)
stdinBytes, err := json.Marshal(MaintainMessage{
Tos: users,
Title: title,
Content: "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime,
})
if err != nil {
logger.Error("failed to marshal MaintainMessage:", err)
return
}
p, err := plugin.Open(config.C.Alerting.CallPlugin.PluginPath)
if err != nil {
logger.Errorf("failed to open notify plugin: %v", err)
return
}
caller, err := p.Lookup(config.C.Alerting.CallPlugin.Caller)
if err != nil {
logger.Errorf("failed to load caller: %v", err)
return
}
notifier, ok := caller.(Notifier)
if !ok {
logger.Errorf("notifier interface not implemented): %v", err)
return
}
notifier.NotifyMaintainer(stdinBytes)
logger.Debugf("noticeCallPlugin done. %s", notifier.Descript())
notifier.Instance.NotifyMaintainer(stdinBytes)
logger.Debugf("notify maintainer with plugin done")
}
// notify to maintainer to handle the error
func notifyToMaintainer(e error, title string) {
logger.Errorf("notifyToMaintainer, title:%s, error:%v", title, e)
logger.Errorf("notifyToMaintainertitle:%s, error:%v", title, e)
var noticeMaintainer NoticeMaintainer
maintainerUsers := memsto.UserCache.GetMaintainerUsers()
if len(maintainerUsers) == 0 {
users := memsto.UserCache.GetMaintainerUsers()
if len(users) == 0 {
return
}
triggerTime := time.Now().Format("2006/01/02 - 15:04:05")
noticeMaintainer.NotifyUsersObj = maintainerUsers
noticeMaintainer.Content = "【内部处理错误】当前标题: " + title + "\n【内部处理错误】当前异常: " + e.Error() + "\n【内部处理错误】发送时间: " + triggerTime
noticeMaintainer.Title = title
stdinBytes, err := json.Marshal(noticeMaintainer)
if err != nil {
logger.Errorf("notifyToMaintainer: failed to marshal noticeMaintainer: %v", err)
} else {
noticeCallPlugin(stdinBytes)
}
triggerTime := time.Now().Format("2006/01/02 - 15:04:05")
notifyMaintainerWithPlugin(e, title, triggerTime, users)
notifyMaintainerWithBuiltin(e, title, triggerTime, users)
}
func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*models.User) {
if len(config.C.Alerting.NotifyBuiltinChannels) == 0 {
return
}
emailset := make(map[string]struct{})
phoneset := make(map[string]struct{})
wecomset := make(map[string]struct{})
dingtalkset := make(map[string]struct{})
feishuset := make(map[string]struct{})
for _, user := range maintainerUsers {
for _, user := range users {
if user.Email != "" {
emailset[user.Email] = struct{}{}
}
@@ -118,13 +104,13 @@ func notifyToMaintainer(e error, title string) {
if len(emailset) == 0 {
continue
}
content := "【内部处理错误】当前标题: " + title + "\n【内部处理错误】当前异常: " + e.Error() + "\n【内部处理错误】发送时间: " + triggerTime
content := "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime
sender.WriteEmail(title, content, StringSetKeys(emailset))
case "dingtalk":
if len(dingtalkset) == 0 {
continue
}
content := "**【内部处理错误】当前标题: **" + title + "\n**【内部处理错误】当前异常: **" + e.Error() + "\n**【内部处理错误】发送时间: **" + triggerTime
content := "**Title: **" + title + "\n**Content: **" + e.Error() + "\n**Time: **" + triggerTime
sender.SendDingtalk(sender.DingtalkMessage{
Title: title,
Text: content,
@@ -135,7 +121,7 @@ func notifyToMaintainer(e error, title string) {
if len(wecomset) == 0 {
continue
}
content := "**【内部处理错误】当前标题: **" + title + "\n**【内部处理错误】当前异常: **" + e.Error() + "\n**【内部处理错误】发送时间: **" + triggerTime
content := "**Title: **" + title + "\n**Content: **" + e.Error() + "\n**Time: **" + triggerTime
sender.SendWecom(sender.WecomMessage{
Text: content,
Tokens: StringSetKeys(wecomset),
@@ -145,7 +131,7 @@ func notifyToMaintainer(e error, title string) {
continue
}
content := "【内部处理错误】当前标题: " + title + "\n【内部处理错误】当前异常: " + e.Error() + "\n【内部处理错误】发送时间: " + triggerTime
content := "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime
sender.SendFeishu(sender.FeishuMessage{
Text: content,
AtMobiles: phones,

View File

@@ -87,7 +87,7 @@ func (r RuleEval) Start() {
return
default:
r.Work()
logger.Debugf("rule executedrule_id=%d", r.RuleID())
logger.Debugf("rule executed, rule_id=%d", r.RuleID())
interval := r.rule.PromEvalInterval
if interval <= 0 {
interval = 10
@@ -116,8 +116,7 @@ func (r RuleEval) Work() {
value, warnings, err = reader.Client.Query(context.Background(), promql, time.Now())
if err != nil {
logger.Errorf("rule_eval:%d promql:%s, error:%v", r.RuleID(), promql, err)
// 告警查询prometheus逻辑出错发告警信息给管理员
notifyToMaintainer(err, "查询prometheus出错")
notifyToMaintainer(err, "failed to query prometheus")
return
}
@@ -190,7 +189,6 @@ func (ws *WorkersType) Build(rids []int64) {
elst, err := models.AlertCurEventGetByRule(rules[hash].Id)
if err != nil {
logger.Errorf("worker_build: AlertCurEventGetByRule failed: %v", err)
notifyToMaintainer(err, "AlertCurEventGetByRule ErrorruleID="+fmt.Sprint(rules[hash].Id))
continue
}
@@ -300,10 +298,12 @@ func (r RuleEval) judge(vectors []conv.Vector) {
// handle target note
targetIdent, has := vectors[i].Labels["ident"]
targetNote := ""
targetCluster := ""
if has {
target, exists := memsto.TargetCache.Get(string(targetIdent))
if exists {
targetNote = target.Note
targetCluster = target.Cluster
// 对于包含ident的告警事件check一下ident所属bg和rule所属bg是否相同
// 如果告警规则选择了只在本BG生效那其他BG的机器就不能因此规则产生告警
@@ -326,7 +326,7 @@ func (r RuleEval) judge(vectors []conv.Vector) {
}
// isMuted only need TriggerTime RuleName and TagsMap
if isMuted(event) {
if IsMuted(event) {
logger.Infof("event_muted: rule_id=%d %s", r.rule.Id, vectors[i].Key)
continue
}
@@ -334,7 +334,7 @@ func (r RuleEval) judge(vectors []conv.Vector) {
tagsArr := labelMapToArr(tagsMap)
sort.Strings(tagsArr)
event.Cluster = r.rule.Cluster
event.Cluster = targetCluster
event.Hash = hash
event.RuleId = r.rule.Id
event.RuleName = r.rule.Name
@@ -420,6 +420,7 @@ func (r RuleEval) fireEvent(event *models.AlertCurEvent) {
if r.rule.NotifyMaxNumber == 0 {
// 最大可以发送次数如果是0表示不想限制最大发送次数一直发即可
event.NotifyCurNumber = fired.NotifyCurNumber + 1
event.FirstTriggerTime = fired.FirstTriggerTime
r.pushEventToQueue(event)
} else {
// 有最大发送次数的限制,就要看已经发了几次了,是否达到了最大发送次数
@@ -427,6 +428,7 @@ func (r RuleEval) fireEvent(event *models.AlertCurEvent) {
return
} else {
event.NotifyCurNumber = fired.NotifyCurNumber + 1
event.FirstTriggerTime = fired.FirstTriggerTime
r.pushEventToQueue(event)
}
}
@@ -434,6 +436,7 @@ func (r RuleEval) fireEvent(event *models.AlertCurEvent) {
}
} else {
event.NotifyCurNumber = 1
event.FirstTriggerTime = event.TriggerTime
r.pushEventToQueue(event)
}
}

View File

@@ -36,6 +36,13 @@ func pushEventToQueue(c *gin.Context) {
event.TagsMap[arr[0]] = arr[1]
}
// isMuted only need TriggerTime RuleName and TagsMap
if engine.IsMuted(event) {
logger.Infof("event_muted: rule_id=%d %s", event.RuleId, event.Hash)
ginx.NewRender(c).Message(nil)
return
}
if err := event.ParseRuleNote(); err != nil {
event.RuleNote = fmt.Sprintf("failed to parse rule note: %v", err)
}

View File

@@ -12,6 +12,7 @@ import (
"github.com/gin-gonic/gin"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/prompb"
"github.com/toolkits/pkg/logger"
"github.com/didi/nightingale/v5/src/server/common"
"github.com/didi/nightingale/v5/src/server/config"
@@ -156,6 +157,7 @@ func handleOpenTSDB(c *gin.Context) {
}
if err != nil {
logger.Debugf("opentsdb msg format error: %s", err.Error())
c.String(400, err.Error())
return
}
@@ -170,12 +172,20 @@ func handleOpenTSDB(c *gin.Context) {
for i := 0; i < len(arr); i++ {
if err := arr[i].Clean(ts); err != nil {
logger.Debugf("opentsdb msg clean error: %s", err.Error())
if fail == 0 {
msg = fmt.Sprintf("%s , Error clean: %s", msg, err.Error())
}
fail++
continue
}
pt, err := arr[i].ToProm()
if err != nil {
logger.Debugf("opentsdb msg to tsdb error: %s", err.Error())
if fail == 0 {
msg = fmt.Sprintf("%s , Error toprom: %s", msg, err.Error())
}
fail++
continue
}
@@ -202,6 +212,10 @@ func handleOpenTSDB(c *gin.Context) {
idents.Idents.MSet(ids)
}
if fail > 0 {
logger.Debugf("opentsdb msg process error , msg is : %s", string(bs))
}
c.JSON(200, gin.H{
"succ": succ,
"fail": fail,

View File

@@ -9,6 +9,7 @@ import (
"syscall"
"github.com/toolkits/pkg/i18n"
"github.com/toolkits/pkg/logger"
"github.com/didi/nightingale/v5/src/pkg/httpx"
"github.com/didi/nightingale/v5/src/pkg/logx"
@@ -75,6 +76,9 @@ EXIT:
break EXIT
case syscall.SIGHUP:
// reload configuration?
logger.Info("start reload configs")
engine.Reload()
logger.Info("reload configs finished")
default:
break EXIT
}

View File

@@ -9,6 +9,7 @@ import (
"net/http"
"time"
"github.com/didi/nightingale/v5/src/models"
"github.com/didi/nightingale/v5/src/server/config"
"github.com/golang/protobuf/proto"
"github.com/golang/snappy"
@@ -24,11 +25,28 @@ type WriterType struct {
Client api.Client
}
// writeRelabel applies this writer's WriteRelabels rules to every series and
// returns the series that survive, carrying their relabelled label sets.
//
// A series is dropped when relabelling leaves it without labels, which
// includes series removed by a keep/drop rule. Without this the rules would
// only filter: the rewritten labels were computed and then thrown away, so
// replace/labelmap/labeldrop/... never changed what was written.
//
// Relabelled series are shallow copies. The input series are left untouched
// because the same pointers may be handed to other writers that have
// different rules (NOTE(review): confirm against the caller of Write).
func (w WriterType) writeRelabel(items []*prompb.TimeSeries) []*prompb.TimeSeries {
	relabels := w.Opts.WriteRelabels
	ritems := make([]*prompb.TimeSeries, 0, len(items))
	for _, item := range items {
		lbls := models.Process(item.Labels, relabels...)
		if len(lbls) == 0 {
			continue
		}

		if len(relabels) == 0 {
			// No rules configured: the labels are unchanged, skip the copy.
			ritems = append(ritems, item)
			continue
		}

		relabeled := *item
		relabeled.Labels = lbls
		ritems = append(ritems, &relabeled)
	}
	return ritems
}
func (w WriterType) Write(index int, items []*prompb.TimeSeries, headers ...map[string]string) {
if len(items) == 0 {
return
}
items = w.writeRelabel(items)
if len(items) == 0 {
return
}
start := time.Now()
defer func() {
promstat.ForwardDuration.WithLabelValues(config.C.ClusterName, fmt.Sprint(index)).Observe(time.Since(start).Seconds())

View File

@@ -14,6 +14,7 @@ import (
"github.com/didi/nightingale/v5/src/pkg/logx"
"github.com/didi/nightingale/v5/src/pkg/oidcc"
"github.com/didi/nightingale/v5/src/pkg/ormx"
"github.com/didi/nightingale/v5/src/pkg/tls"
"github.com/didi/nightingale/v5/src/storage"
)
@@ -77,6 +78,7 @@ func MustLoad(fpaths ...string) {
type Config struct {
RunMode string
I18N string
I18NHeaderKey string
AdminRole string
MetricsYamlFile string
BuiltinAlertsDir string
@@ -97,6 +99,7 @@ type Config struct {
Clusters []ClusterOptions
Ibex Ibex
OIDC oidcc.Config
TargetMetrics map[string]string
}
type ClusterOptions struct {
@@ -112,6 +115,9 @@ type ClusterOptions struct {
DialTimeout int64
KeepAlive int64
UseTLS bool
tls.ClientConfig
MaxIdleConnsPerHost int
}

View File

@@ -3,28 +3,43 @@ package config
import (
"path"
cmap "github.com/orcaman/concurrent-map"
"github.com/toolkits/pkg/file"
"github.com/toolkits/pkg/runner"
)
var Metrics = cmap.New()
// metricDesc holds the metric descriptions decoded from the metrics YAML file.
// Per the original author's note, loading the maps happens before they are read,
// so a concurrent map is not needed for this store.
type metricDesc struct {
// CommonDesc collects the top-level YAML entries (it is inlined) and is
// exposed as "common" in JSON. GetMetricDesc falls back to it when the
// language-specific map has no entry for a metric.
CommonDesc map[string]string `yaml:",inline" json:"common"`
// Zh holds the descriptions found under the "zh" key.
Zh map[string]string `yaml:"zh" json:"zh"`
// En holds the descriptions found under the "en" key.
En map[string]string `yaml:"en" json:"en"`
}
// MetricDesc is the package-level description store that GetMetricDesc reads.
var MetricDesc metricDesc
// GetMetricDesc returns the description of metric for the given language.
//
// lang "zh" selects the Chinese descriptions; any other value selects the
// English ones. When the selected map has no entry for metric, the common
// description is used instead. An unregistered metric yields "".
func GetMetricDesc(lang, metric string) string {
	localized := MetricDesc.En
	if lang == "zh" {
		localized = MetricDesc.Zh
	}

	// Indexing a nil map is safe and simply reports the key as absent.
	if text, ok := localized[metric]; ok {
		return text
	}
	return MetricDesc.CommonDesc[metric]
}
// loadMetricsYaml loads the metric description YAML file, if one exists.
//
// The path is taken from C.MetricsYamlFile and defaults to etc/metrics.yaml
// under runner.Cwd when that setting is empty. A missing file is not an
// error. Any error from reading the YAML is returned.
//
// NOTE(review): this span appears to show both sides of a diff hunk with the
// +/- markers stripped — the unconditional path.Join assignment and the
// nmap/Metrics.Set section look like the removed version, and the
// C.MetricsYamlFile lookup plus the final ReadYaml into MetricDesc look like
// the added version. Confirm against the actual file before relying on it.
func loadMetricsYaml() error {
fp := path.Join(runner.Cwd, "etc", "metrics.yaml")
// Prefer the configured path; fall back to the default location.
fp := C.MetricsYamlFile
if fp == "" {
fp = path.Join(runner.Cwd, "etc", "metrics.yaml")
}
// Nothing to load when the file is absent.
if !file.IsExist(fp) {
return nil
}
nmap := make(map[string]string)
err := file.ReadYaml(fp, &nmap)
if err != nil {
return err
}
for key, val := range nmap {
Metrics.Set(key, val)
}
return nil
// Decode the file straight into the package-level MetricDesc store.
return file.ReadYaml(fp, &MetricDesc)
}

View File

@@ -65,6 +65,9 @@ func initClustersFromConfig() error {
for i := 0; i < len(opts); i++ {
cluster := newClusterByOption(opts[i])
if cluster == nil {
continue
}
Clusters.Put(opts[i].Name, cluster)
}
@@ -165,7 +168,17 @@ func loadClustersFromAPI() {
MaxIdleConnsPerHost: 32,
}
Clusters.Put(item.Name, newClusterByOption(opt))
if strings.HasPrefix(opt.Prom, "https") {
opt.UseTLS = true
opt.InsecureSkipVerify = true
}
cluster := newClusterByOption(opt)
if cluster == nil {
continue
}
Clusters.Put(item.Name, cluster)
continue
}
}
@@ -173,7 +186,6 @@ func loadClustersFromAPI() {
func newClusterByOption(opt config.ClusterOptions) *ClusterType {
transport := &http.Transport{
// TLSClientConfig: tlsConfig,
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
Timeout: time.Duration(opt.DialTimeout) * time.Millisecond,
@@ -182,6 +194,15 @@ func newClusterByOption(opt config.ClusterOptions) *ClusterType {
MaxIdleConnsPerHost: opt.MaxIdleConnsPerHost,
}
if opt.UseTLS {
tlsConfig, err := opt.TLSConfig()
if err != nil {
logger.Errorf("new cluster %s fail: %v", opt.Name, err)
return nil
}
transport.TLSClientConfig = tlsConfig
}
cli, err := api.NewClient(api.Config{
Address: opt.Prom,
RoundTripper: transport,
@@ -189,6 +210,7 @@ func newClusterByOption(opt config.ClusterOptions) *ClusterType {
if err != nil {
logger.Errorf("new client fail: %v", err)
return nil
}
cluster := &ClusterType{

View File

@@ -31,6 +31,25 @@ func stat() gin.HandlerFunc {
}
}
// languageDetector returns a middleware that copies the caller's language
// preference into the X-Language request header.
//
// The preference is read from the request header named by
// config.C.I18NHeaderKey, which is captured once when the middleware is
// built. Values starting with "*" or "zh" are normalized to "zh", values
// starting with "en" to "en", and any other non-empty value is passed
// through unchanged. When no header key is configured or the header is
// empty, X-Language is left untouched.
func languageDetector() gin.HandlerFunc {
	key := config.C.I18NHeaderKey

	return func(c *gin.Context) {
		var requested string
		if key != "" {
			requested = c.GetHeader(key)
		}

		switch {
		case requested == "":
			// Nothing to detect; keep whatever X-Language the request had.
		case strings.HasPrefix(requested, "*"), strings.HasPrefix(requested, "zh"):
			c.Request.Header.Set("X-Language", "zh")
		case strings.HasPrefix(requested, "en"):
			c.Request.Header.Set("X-Language", "en")
		default:
			c.Request.Header.Set("X-Language", requested)
		}

		c.Next()
	}
}
func New(version string) *gin.Engine {
gin.SetMode(config.C.RunMode)
@@ -41,6 +60,7 @@ func New(version string) *gin.Engine {
r := gin.New()
r.Use(stat())
r.Use(languageDetector())
r.Use(aop.Recovery())
// whether print access log
@@ -98,14 +118,11 @@ func configRoute(r *gin.Engine, version string) {
pages := r.Group(pagesPrefix)
{
if config.C.AnonymousAccess.PromQuerier {
pages.Any("/prometheus/*url", prometheusProxy)
pages.POST("/query-range-batch", promBatchQueryRange)
} else {
pages.Any("/prometheus/*url", auth(), prometheusProxy)
pages.POST("/query-range-batch", auth(), promBatchQueryRange)
}
@@ -179,6 +196,7 @@ func configRoute(r *gin.Engine, version string) {
pages.POST("/busi-group/:id/board/:bid/clone", auth(), user(), perm("/dashboards/add"), bgrw(), boardClone)
pages.GET("/board/:bid", auth(), user(), boardGet)
pages.GET("/board/:bid/pure", boardPureGet)
pages.PUT("/board/:bid", auth(), user(), perm("/dashboards/put"), boardPut)
pages.PUT("/board/:bid/configs", auth(), user(), perm("/dashboards/put"), boardPutConfigs)
pages.DELETE("/boards", auth(), user(), perm("/dashboards/del"), boardDel)

View File

@@ -51,6 +51,17 @@ func boardGet(c *gin.Context) {
ginx.NewRender(c).Data(board, nil)
}
// boardPureGet renders the dashboard identified by the "bid" URL parameter.
//
// A lookup error is passed to ginx.Dangerous, and a missing dashboard is
// reported through ginx.Bomb with HTTP 404.
func boardPureGet(c *gin.Context) {
	bid := ginx.UrlParamInt64(c, "bid")

	found, err := models.BoardGetByID(bid)
	ginx.Dangerous(err)

	if found == nil {
		ginx.Bomb(http.StatusNotFound, "No such dashboard")
	}

	render := ginx.NewRender(c)
	render.Data(found, nil)
}
// bgrwCheck
func boardDel(c *gin.Context) {
var f idsForm

View File

@@ -69,6 +69,12 @@ func busiGroupMemberAdd(c *gin.Context) {
username := c.MustGet("username").(string)
targetbg := c.MustGet("busi_group").(*models.BusiGroup)
for i := 0; i < len(members); i++ {
if members[i].BusiGroupId != targetbg.Id {
ginx.Bomb(http.StatusBadRequest, "business group id invalid")
}
}
ginx.NewRender(c).Message(targetbg.AddMembers(members, username))
}
@@ -79,6 +85,12 @@ func busiGroupMemberDel(c *gin.Context) {
username := c.MustGet("username").(string)
targetbg := c.MustGet("busi_group").(*models.BusiGroup)
for i := 0; i < len(members); i++ {
if members[i].BusiGroupId != targetbg.Id {
ginx.Bomb(http.StatusBadRequest, "business group id invalid")
}
}
ginx.NewRender(c).Message(targetbg.DelMembers(members, username))
}

View File

@@ -3,6 +3,7 @@ package router
import (
"fmt"
"net/http"
"strconv"
"strings"
"time"
@@ -31,6 +32,7 @@ func loginPost(c *gin.Context) {
if config.C.LDAP.Enable {
user, err = models.LdapLogin(f.Username, f.Password)
if err != nil {
logger.Debugf("ldap login failed: %v username: %s", err, f.Username)
ginx.NewRender(c).Message(err)
return
}
@@ -115,6 +117,24 @@ func refreshPost(c *gin.Context) {
return
}
userid, err := strconv.ParseInt(strings.Split(userIdentity, "-")[0], 10, 64)
if err != nil {
ginx.NewRender(c, http.StatusUnauthorized).Message("failed to parse user_identity from jwt")
return
}
u, err := models.UserGetById(userid)
if err != nil {
ginx.NewRender(c, http.StatusInternalServerError).Message("failed to query user by id")
return
}
if u == nil {
// user already deleted
ginx.NewRender(c, http.StatusUnauthorized).Message("user already deleted")
return
}
// Delete the previous Refresh Token
err = deleteAuth(c.Request.Context(), refreshUuid)
if err != nil {

View File

@@ -1,35 +1,14 @@
package router
import (
"path"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/file"
"github.com/toolkits/pkg/ginx"
"github.com/toolkits/pkg/runner"
"github.com/didi/nightingale/v5/src/webapi/config"
)
// metricsDescGetFile responds with the metric descriptions as JSON.
//
// NOTE(review): this span appears to show both sides of a diff hunk with the
// +/- markers stripped — the section that resolves the YAML path, reads the
// file and answers with c.JSON(200, ret) looks like the removed version, and
// the final c.JSON(200, config.MetricDesc) looks like the added version.
// Confirm against the actual file before relying on it.
func metricsDescGetFile(c *gin.Context) {
// Resolve the YAML path: configured value first, default location otherwise.
fp := config.C.MetricsYamlFile
if fp == "" {
fp = path.Join(runner.Cwd, "etc", "metrics.yaml")
}
// A missing file is answered with a 404 that names the path.
if !file.IsExist(fp) {
c.String(404, "%s not found", fp)
return
}
ret := make(map[string]string)
err := file.ReadYaml(fp, &ret)
// A YAML read failure is answered with a 500 carrying the error text.
if err != nil {
c.String(500, err.Error())
return
}
c.JSON(200, ret)
// Serve the descriptions already held in config.MetricDesc.
c.JSON(200, config.MetricDesc)
}
// The frontend sends an array of metric names; the backend looks up the description of each one and returns them as a map.
@@ -38,13 +17,8 @@ func metricsDescGetMap(c *gin.Context) {
ginx.BindJSON(c, &arr)
ret := make(map[string]string)
for i := 0; i < len(arr); i++ {
desc, has := config.Metrics.Get(arr[i])
if !has {
ret[arr[i]] = ""
} else {
ret[arr[i]] = desc.(string)
}
for _, key := range arr {
ret[key] = config.GetMetricDesc(c.GetHeader("X-Language"), key)
}
ginx.NewRender(c).Data(ret, nil)

View File

@@ -59,7 +59,7 @@ func proxyAuth() gin.HandlerFunc {
return func(c *gin.Context) {
user := handleProxyUser(c)
c.Set("userid", user.Id)
c.Set("username", user)
c.Set("username", user.Username)
c.Next()
}
}
@@ -119,7 +119,6 @@ func jwtMock() gin.HandlerFunc {
"refresh_token": "",
}, nil)
c.Abort()
return
}
}

View File

@@ -32,21 +32,15 @@ type batchQueryForm struct {
func promBatchQueryRange(c *gin.Context) {
xcluster := c.GetHeader("X-Cluster")
if xcluster == "" {
c.String(500, "X-Cluster is blank")
return
ginx.Bomb(http.StatusBadRequest, "header(X-Cluster) is blank")
}
var f batchQueryForm
err := c.BindJSON(&f)
if err != nil {
c.String(500, err.Error())
return
}
ginx.Dangerous(c.BindJSON(&f))
cluster, exist := prom.Clusters.Get(xcluster)
if !exist {
c.String(http.StatusBadRequest, "cluster(%s) not found", xcluster)
return
ginx.Bomb(http.StatusBadRequest, "cluster(%s) not found", xcluster)
}
var lst []model.Value
@@ -59,15 +53,12 @@ func promBatchQueryRange(c *gin.Context) {
}
resp, _, err := cluster.PromClient.QueryRange(context.Background(), item.Query, r)
if err != nil {
c.String(500, err.Error())
return
}
ginx.Dangerous(err)
lst = append(lst, resp)
}
c.JSON(200, lst)
ginx.NewRender(c).Data(lst, nil)
}
func prometheusProxy(c *gin.Context) {

View File

@@ -1,21 +1,27 @@
package router
import (
"context"
"fmt"
"net/http"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/prometheus/common/model"
"github.com/toolkits/pkg/ginx"
"github.com/didi/nightingale/v5/src/models"
"github.com/didi/nightingale/v5/src/server/common/conv"
"github.com/didi/nightingale/v5/src/webapi/config"
"github.com/didi/nightingale/v5/src/webapi/prom"
)
func targetGets(c *gin.Context) {
bgid := ginx.QueryInt64(c, "bgid", -1)
query := ginx.QueryStr(c, "query", "")
limit := ginx.QueryInt(c, "limit", 30)
mins := ginx.QueryInt(c, "mins", 2)
clusters := queryClusters(c)
total, err := models.TargetTotal(bgid, clusters, query)
@@ -26,8 +32,60 @@ func targetGets(c *gin.Context) {
if err == nil {
cache := make(map[int64]*models.BusiGroup)
targetsMap := make(map[string]*models.Target)
for i := 0; i < len(list); i++ {
ginx.Dangerous(list[i].FillGroup(cache))
targetsMap[list[i].Cluster+list[i].Ident] = list[i]
}
now := time.Now()
// query LoadPerCore / MemUtil / TargetUp / DiskUsedPercent from prometheus
// map key: cluster, map value: ident list
targets := make(map[string][]string)
for i := 0; i < len(list); i++ {
targets[list[i].Cluster] = append(targets[list[i].Cluster], list[i].Ident)
}
for cluster := range targets {
cc, has := prom.Clusters.Get(cluster)
if !has {
continue
}
targetArr := targets[cluster]
if len(targetArr) == 0 {
continue
}
targetRe := strings.Join(targetArr, "|")
valuesMap := make(map[string]map[string]float64)
for metric, ql := range config.C.TargetMetrics {
promql := fmt.Sprintf(ql, targetRe, mins)
values, err := instantQuery(context.Background(), cc, promql, now)
ginx.Dangerous(err)
valuesMap[metric] = values
}
// handle values
for metric, values := range valuesMap {
for ident := range values {
mapkey := cluster + ident
if t, has := targetsMap[mapkey]; has {
switch metric {
case "LoadPerCore":
t.LoadPerCore = values[ident]
case "MemUtil":
t.MemUtil = values[ident]
case "TargetUp":
t.TargetUp = values[ident]
case "DiskUtil":
t.DiskUtil = values[ident]
}
}
}
}
}
}
@@ -37,6 +95,29 @@ func targetGets(c *gin.Context) {
}, nil)
}
// instantQuery evaluates promql as an instant query at ts through the
// cluster's Prometheus client and returns the sample values keyed by their
// "ident" label.
//
// Samples without an "ident" label are skipped; when several samples share
// an ident, the last one wins. A query error is returned as is, and any
// warning reported by the query is turned into an error. In both failure
// cases the returned map is empty but non-nil.
func instantQuery(ctx context.Context, c *prom.ClusterType, promql string, ts time.Time) (map[string]float64, error) {
	byIdent := make(map[string]float64)

	result, warns, err := c.PromClient.Query(ctx, promql, ts)
	switch {
	case err != nil:
		return byIdent, err
	case len(warns) > 0:
		return byIdent, fmt.Errorf("instant query occur warnings, promql: %s, warnings: %v", promql, warns)
	}

	for _, sample := range conv.ConvertVectors(result) {
		if ident, ok := sample.Labels["ident"]; ok {
			byIdent[string(ident)] = sample.Value
		}
	}
	return byIdent, nil
}
func targetGetTags(c *gin.Context) {
idents := ginx.QueryStr(c, "idents")
idents = strings.ReplaceAll(idents, ",", " ")