From 24c393ae86a367e12d8beaae369125e1ad0759d0 Mon Sep 17 00:00:00 2001
From: Dmitry Dunaev
Date: Thu, 14 Jul 2022 12:22:07 +0300
Subject: [PATCH] [WIFI-9824] Add: elasticsearch exporter + example alert

Signed-off-by: Dmitry Dunaev
---
Review notes (this area is ignored by `git am`; not part of the commit message):

* What the patch does: adds a ServiceMonitor for the prometheus-es-exporter
  service, registers the `braedon` chart repository, pins the
  prometheus-es-exporter 0.2.0 chart in helmfile.lock, adds an
  `elasticsearch-exporter` release with one Elasticsearch query, and adds one
  Prometheus alert on that query's hit count.
* Line breaks and YAML indentation were reconstructed from the hunk headers,
  the diffstat and the usual helmfile/Kubernetes layout. NOTE(review): confirm
  the indentation of the context lines against helmfile.yaml before applying.
* The helmfile.yaml hunk at -448 replaces the `hosts` entry with a line whose
  text is identical; the difference is whitespace-only and could not be
  recovered, so both lines are shown with the same text.
* The alert expression uses delta() instead of rate(). The hit count exported
  for the query is presumably a gauge (TODO confirm against the exporter);
  rate() would treat a drop in hits (for example after index deletion) as a
  counter reset and fire spuriously. The `index` blob id recorded for
  helmfile.yaml predates this one-line change.

 .../servicemonitor--es-exporter.yaml          | 17 +++++
 helmfile/cloud-sdk/helmfile.lock              |  7 +-
 helmfile/cloud-sdk/helmfile.yaml              | 73 ++++++++++++++++++-
 3 files changed, 94 insertions(+), 3 deletions(-)
 create mode 100644 helmfile/cloud-sdk/charts/standalone-monitoring/templates/servicemonitor--es-exporter.yaml

diff --git a/helmfile/cloud-sdk/charts/standalone-monitoring/templates/servicemonitor--es-exporter.yaml b/helmfile/cloud-sdk/charts/standalone-monitoring/templates/servicemonitor--es-exporter.yaml
new file mode 100644
index 0000000..0a2b45f
--- /dev/null
+++ b/helmfile/cloud-sdk/charts/standalone-monitoring/templates/servicemonitor--es-exporter.yaml
@@ -0,0 +1,17 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  annotations:
+    meta.helm.sh/release-name: prometheus-operator
+  labels:
+    app: prometheus-es-exporter
+    app.kubernetes.io/managed-by: Helm
+    release: prometheus-operator
+  name: elasticsearch-exporter-prometheus-es-exporter
+spec:
+  endpoints:
+  - path: /
+    port: prometheus
+  selector:
+    matchLabels:
+      app: prometheus-es-exporter
diff --git a/helmfile/cloud-sdk/helmfile.lock b/helmfile/cloud-sdk/helmfile.lock
index 674d6cb..750d512 100644
--- a/helmfile/cloud-sdk/helmfile.lock
+++ b/helmfile/cloud-sdk/helmfile.lock
@@ -45,8 +45,11 @@ dependencies:
 - name: metrics-server
   repository: https://charts.bitnami.com/bitnami
   version: 5.10.13
+- name: prometheus-es-exporter
+  repository: https://braedon.github.io/helm
+  version: 0.2.0
 - name: tigera-operator
   repository: https://projectcalico.docs.tigera.io/charts
   version: v3.22.2
-digest: sha256:289fecc40f3cfb7c7ef4d458d313295cce9d950559391d51b973775333f4f07f
-generated: "2022-06-23T14:05:20.599854754+03:00"
+digest: sha256:d5e93771fd4dba01fcb0d0d88d32ab7bc06aa2807bdc326a4eaaddb0d5a27c62
+generated: "2022-07-14T11:08:05.177261511+03:00"
diff --git a/helmfile/cloud-sdk/helmfile.yaml b/helmfile/cloud-sdk/helmfile.yaml
index a97a5f7..07dfd20 100644
--- a/helmfile/cloud-sdk/helmfile.yaml
+++ b/helmfile/cloud-sdk/helmfile.yaml
@@ -25,6 +25,8 @@ repositories:
   url: https://prometheus-community.github.io/helm-charts
 - name: projectcalico
   url: https://projectcalico.docs.tigera.io/charts
+- name: braedon
+  url: https://braedon.github.io/helm
 
 environments:
   azure:
@@ -289,6 +291,16 @@ releases:
             annotations:
               title: PVC *{{`{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}`}}* has less than 20% free storage
               description: "The PVC *{{`{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}`}}* only has {{`{{ $value }}`}}% capacity left. Please increase its size or clean it up."
+          - alert: ElasticSearch new assert_violation errors found
+            expr: delta(assert_violation_errors_hits[1m]) > 0
+            labels:
+              severity: warning
+              area: testbed
+              service: elasticsearch
+              namespace: "{{`{{ $labels.namespace }}`}}"
+            annotations:
+              title: New ElasticSearch logs found with Assertion violation (WIFI-9824)
+              description: New ElasticSearch logs found with Assertion violation (WIFI-9824)
 
   - grafana:
       grafana.ini:
@@ -448,7 +460,7 @@ releases:
       serviceAccount:
         create: true
       hosts:
-      - elasticsearch-client.{{ .Environment.Values.monitoring.namespace }}.svc.cluster.local
+      - elasticsearch-client.{{ .Environment.Values.monitoring.namespace }}.svc.cluster.local
   - resources:
       requests:
         cpu: 100m
@@ -992,3 +1004,62 @@ releases:
     app: github-actions-network-policies
     sub1: calico
     sub2: actions-runner-controller-addon
+
+- name: elasticsearch-exporter
+  condition: elastic.enabled
+  namespace: {{ .Environment.Values.monitoring.namespace }}
+  chart: braedon/prometheus-es-exporter
+  version: 0.2.0
+  labels:
+    group: monitoring
+    app: elasticsearch-exporter
+  values:
+  - image:
+      tag: 0.14.1
+  - elasticsearch:
+      cluster: elasticsearch-client.{{ .Environment.Values.monitoring.namespace }}.svc.cluster.local:9200
+      queries: |-
+        [DEFAULT]
+        QueryIntervalSecs = 15
+        QueryTimeoutSecs = 10
+        QueryIndices = _all
+        QueryOnError = drop
+        QueryOnMissing = drop
+        [query_assert_violation_errors]
+        QueryOnError = preserve
+        QueryOnMissing = zero
+        QueryJson = {
+            "query": {
+              "bool": {
+                "filter": {
+                  "bool": {
+                    "must": [
+                      {
+                        "match": {
+                          "kubernetes.namespace_name.keyword": "openwifi-qa01"
+                        }
+                      },
+                      {
+                        "match": {
+                          "kubernetes.container_name.keyword": "owgw"
+                        }
+                      }
+                    ]
+                  }
+                },
+                "must": {
+                  "match": {
+                    "message": "Assertion violation: !_pStream"
+                  }
+                }
+              }
+            },
+            "aggs": {
+              "agg_terms_kubernetes.container_name.keyword": {
+                "terms": {
+                  "field": "kubernetes.container_name.keyword"
+                }
+              }
+            }
+          }
+