Merge pull request #240 from Telecominfraproject/WIFI-13304-Bring-back-prometheus-and-grafana-to-TIP-WLAN-CI-CD-cluster

Enable and upgrade prometheus and grafana
This commit is contained in:
Carsten Schafer
2024-01-18 17:04:47 -05:00
committed by GitHub

View File

@@ -40,9 +40,7 @@ environments:
#enabled: true #enabled: true
enabled: false enabled: false
- prometheus: - prometheus:
#TODO: enabled: true
#enabled: true
enabled: false
- k8s-dashboard: - k8s-dashboard:
enabled: true enabled: true
- portainer: - portainer:
@@ -151,7 +149,6 @@ releases:
condition: ingress.enabled condition: ingress.enabled
<<: *default <<: *default
chart: nginx/ingress-nginx chart: nginx/ingress-nginx
#version: 4.2.0
version: 4.8.2 version: 4.8.2
labels: labels:
role: setup role: setup
@@ -177,24 +174,29 @@ releases:
publishService: publishService:
enabled: true enabled: true
metrics: metrics:
#TODO: enabled: true
#enabled: true
enabled: false
serviceMonitor: serviceMonitor:
#TODO: enabled: true
#enabled: true
enabled: false
additionalLabels: additionalLabels:
release: prometheus-operator release: prometheus-operator
- defaultBackend: - defaultBackend:
enabled: true enabled: true
# monitoring # monitoring
- name: prometheus-operator-crds
chart: prometheus-community/prometheus-operator-crds
condition: prometheus.enabled
version: 8.0.1
labels:
role: setup
group: monitoring
app: prometheus-operator
- name: prometheus-operator - name: prometheus-operator
condition: prometheus.enabled condition: prometheus.enabled
namespace: {{ .Environment.Values.monitoring.namespace }} namespace: {{ .Environment.Values.monitoring.namespace }}
chart: prometheus-community/kube-prometheus-stack chart: prometheus-community/kube-prometheus-stack
version: 41.5.1 version: 55.8.1
labels: labels:
role: setup role: setup
group: monitoring group: monitoring
@@ -220,10 +222,14 @@ releases:
storage: 50Gi storage: 50Gi
ingress: ingress:
enabled: true enabled: true
ingressClassName: nginx-sso
annotations: annotations:
nginx.ingress.kubernetes.io/auth-url: "https://$host/oauth2/auth" nginx.ingress.kubernetes.io/auth-type: basic
nginx.ingress.kubernetes.io/auth-signin: "https://$host/oauth2/start?rd=$escaped_request_uri" nginx.ingress.kubernetes.io/auth-secret: k8s-dashboard-basic-auth
kubernetes.io/ingress.class: nginx-sso nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
#nginx.ingress.kubernetes.io/auth-url: "https://$host/oauth2/auth"
#nginx.ingress.kubernetes.io/auth-signin: "https://$host/oauth2/start?rd=$escaped_request_uri"
#kubernetes.io/ingress.class: nginx-sso
hosts: hosts:
- prometheus.{{ .Environment.Values.domain }} - prometheus.{{ .Environment.Values.domain }}
- additionalPrometheusRulesMap: - additionalPrometheusRulesMap:
@@ -316,21 +322,22 @@ releases:
annotations: annotations:
title: PVC *{{`{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}`}}* has less than 20% free storage title: PVC *{{`{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}`}}* has less than 20% free storage
description: "The PVC *{{`{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}`}}* only has {{`{{ $value }}`}}% capacity left. Please increase its size or clean it up." description: "The PVC *{{`{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}`}}* only has {{`{{ $value }}`}}% capacity left. Please increase its size or clean it up."
- alert: ElasticSearch new assert_violation errors found # Not doing ES (currently)
expr: rate(assert_violation_errors_hits[1m]) > 0 # - alert: ElasticSearch new assert_violation errors found
labels: # expr: rate(assert_violation_errors_hits[1m]) > 0
severity: warning # labels:
area: testbed # severity: warning
service: elasticsearch # area: testbed
namespace: "{{`{{ $labels.namespace }}`}}" # service: elasticsearch
annotations: # namespace: "{{`{{ $labels.namespace }}`}}"
title: New ElasticSearch logs found with Assertion violation (WIFI-9824) # annotations:
description: New ElasticSearch logs found with Assertion violation (WIFI-9824) # title: New ElasticSearch logs found with Assertion violation (WIFI-9824)
# description: New ElasticSearch logs found with Assertion violation (WIFI-9824)
- grafana: - grafana:
image: #image:
repository: grafana/grafana #repository: grafana/grafana
tag: 8.5.13 #tag: 8.5.13
grafana.ini: grafana.ini:
users: users:
viewers_can_edit: true viewers_can_edit: true
@@ -344,10 +351,14 @@ releases:
enabled: false enabled: false
ingress: ingress:
enabled: true enabled: true
ingressClassName: nginx-sso
annotations: annotations:
nginx.ingress.kubernetes.io/auth-url: "https://$host/oauth2/auth" nginx.ingress.kubernetes.io/auth-type: basic
nginx.ingress.kubernetes.io/auth-signin: "https://$host/oauth2/start?rd=$escaped_request_uri" nginx.ingress.kubernetes.io/auth-secret: k8s-dashboard-basic-auth
kubernetes.io/ingress.class: nginx-sso nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
#nginx.ingress.kubernetes.io/auth-url: "https://$host/oauth2/auth"
#nginx.ingress.kubernetes.io/auth-signin: "https://$host/oauth2/start?rd=$escaped_request_uri"
#kubernetes.io/ingress.class: nginx-sso
hosts: hosts:
- grafana.{{ .Environment.Values.domain }} - grafana.{{ .Environment.Values.domain }}
dashboardProviders: dashboardProviders:
@@ -363,12 +374,12 @@ releases:
options: options:
path: /var/lib/grafana/dashboards/default path: /var/lib/grafana/dashboards/default
dashboards: #dashboards:
default: # default:
performance: # performance:
url: https://raw.githubusercontent.com/Telecominfraproject/wlan-testing/master/grafana/performance.json # url: https://raw.githubusercontent.com/Telecominfraproject/wlan-testing/master/grafana/performance.json
qaDebugging: # qaDebugging:
url: https://raw.githubusercontent.com/Telecominfraproject/wlan-cloud-ucentral-analytics/main/grafana/qa-debugging.json # url: https://raw.githubusercontent.com/Telecominfraproject/wlan-cloud-ucentral-analytics/main/grafana/qa-debugging.json
datasources: datasources:
datasources.yaml: datasources.yaml:
@@ -379,36 +390,36 @@ releases:
url: http://prometheus-operated:9090 url: http://prometheus-operated:9090
access: proxy access: proxy
isDefault: false isDefault: false
- name: InfluxDB # Not doing influx and ES anymore (for now)
type: influxdb # - name: InfluxDB
access: proxy # type: influxdb
url: https://influx.cicd.{{ .Environment.Values.domain }} # access: proxy
user: tip # url: https://influx.cicd.{{ .Environment.Values.domain }}
secureJsonData: # user: tip
token: {{ .Environment.Values.influxdb.adminUser.token }} # secureJsonData:
password: {{ .Environment.Values.influxdb.adminUser.password }} # token: {{ .Environment.Values.influxdb.adminUser.token }}
jsonData: # password: {{ .Environment.Values.influxdb.adminUser.password }}
version: Flux # jsonData:
organization: tip # version: Flux
defaultBucket: tip-cicd # organization: tip
- name: ES # defaultBucket: tip-cicd
type: elasticsearch # - name: ES
access: proxy # type: elasticsearch
url: http://elasticsearch-client.monitoring.svc:9200 # access: proxy
database: logstash-* # url: http://elasticsearch-client.monitoring.svc:9200
isDefault: false # database: logstash-*
jsonData: # isDefault: false
esVersion: 6 # jsonData:
timeField: '@timestamp' # esVersion: 6
logMessageField: message # timeField: '@timestamp'
logLevelField: fields.level # logMessageField: message
# logLevelField: fields.level
- alertmanager: - alertmanager:
config: config:
global: global:
resolve_timeout: 1m resolve_timeout: 1m
slack_api_url: {{ .Environment.Values.alertmanager.slack_api_url }} slack_api_url: {{ .Environment.Values.alertmanager.slack_api_url }}
route: route:
receiver: "null" receiver: "null"
routes: routes:
@@ -429,10 +440,10 @@ releases:
{{- readFile "alertmanager-templates/text.tpl" | nindent 14 }} {{- readFile "alertmanager-templates/text.tpl" | nindent 14 }}
title_link: "" title_link: ""
- kube-state-metrics: # - kube-state-metrics:
image: # image:
repository: tip-tip-wlan-cloud-ucentral.jfrog.io/kube-state-metrics # repository: tip-tip-wlan-cloud-ucentral.jfrog.io/kube-state-metrics
tag: v2.6.0-tip20221103 # tag: v2.6.0-tip20221103
- name: prometheus-operator-helper - name: prometheus-operator-helper
condition: prometheus.enabled condition: prometheus.enabled
@@ -450,24 +461,24 @@ releases:
- proxy: - proxy:
namespace: kube-system namespace: kube-system
- name: prometheus-operator-ingress-auth #- name: prometheus-operator-ingress-auth
condition: prometheus.enabled # condition: prometheus.enabled
namespace: kube-system # namespace: kube-system
chart: charts/sso # chart: charts/sso
labels: # labels:
role: setup # role: setup
group: monitoring # group: monitoring
app: prometheus-operator # app: prometheus-operator
sub: oAuth # sub: oAuth
values: # values:
- monitoring: # - monitoring:
domain: example.com # domain: example.com
namespace: {{ .Environment.Values.monitoring.namespace }} # namespace: {{ .Environment.Values.monitoring.namespace }}
- oidc: # - oidc:
issuerUrl: {{ .Environment.Values.sso.oidc.issuerUrl }} # issuerUrl: {{ .Environment.Values.sso.oidc.issuerUrl }}
clientId: {{ .Environment.Values.sso.oidc.clientId }} # clientId: {{ .Environment.Values.sso.oidc.clientId }}
clientSecret: {{ .Environment.Values.sso.oidc.clientSecret }} # clientSecret: {{ .Environment.Values.sso.oidc.clientSecret }}
cookieSecret: {{ .Environment.Values.sso.oidc.cookieSecret }} # cookieSecret: {{ .Environment.Values.sso.oidc.cookieSecret }}
- name: fluentd - name: fluentd
condition: elastic.enabled condition: elastic.enabled