diff --git a/CHANGES.md b/CHANGES.md
index 79136b37..7df885e6 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -4,6 +4,8 @@ Notable changes between versions.
## Latest
+## v1.8.3
+
* Kubernetes v1.8.3
* Run etcd on-host, across controllers
* Promote AWS platform to beta
@@ -29,6 +31,11 @@ Notable changes between versions.
* Fix firewall rules for multi-controller kubelet scraping and node-exporter
* Remove support for self-hosted etcd
+#### Addons
+
+* Add Prometheus 2.0 addon with alerting rules
+* Add Grafana dashboard for observing metrics
+
## v1.8.2
* Kubernetes v1.8.2
diff --git a/addons/prometheus/discovery/kube-controller-manager.yaml b/addons/prometheus/discovery/kube-controller-manager.yaml
new file mode 100644
index 00000000..19faa6cf
--- /dev/null
+++ b/addons/prometheus/discovery/kube-controller-manager.yaml
@@ -0,0 +1,19 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: kube-controller-manager
+ namespace: kube-system
+ annotations:
+ prometheus.io/scrape: 'true'
+spec:
+ type: ClusterIP
+  # headless service (clusterIP: None) exposes endpoints for Prometheus to discover and scrape
+ clusterIP: None
+ selector:
+ k8s-app: kube-controller-manager
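+  # 10252 is the kube-controller-manager's default insecure metrics port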
+ ports:
+ - name: metrics
+ protocol: TCP
+ port: 10252
+ targetPort: 10252
diff --git a/addons/prometheus/discovery/kube-scheduler.yaml b/addons/prometheus/discovery/kube-scheduler.yaml
new file mode 100644
index 00000000..80fd1913
--- /dev/null
+++ b/addons/prometheus/discovery/kube-scheduler.yaml
@@ -0,0 +1,19 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: kube-scheduler
+ namespace: kube-system
+ annotations:
+ prometheus.io/scrape: 'true'
+spec:
+ type: ClusterIP
+  # headless service (clusterIP: None) exposes endpoints for Prometheus to discover and scrape
+ clusterIP: None
+ selector:
+ k8s-app: kube-scheduler
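+  # 10251 is the kube-scheduler's default insecure metrics port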
+ ports:
+ - name: metrics
+ protocol: TCP
+ port: 10251
+ targetPort: 10251
diff --git a/addons/prometheus/rules.yaml b/addons/prometheus/rules.yaml
index c3f9d784..5282ace9 100644
--- a/addons/prometheus/rules.yaml
+++ b/addons/prometheus/rules.yaml
@@ -4,5 +4,471 @@ metadata:
name: prometheus-rules
namespace: monitoring
data:
- example.rule: |
- job_service:rpc_durations_seconds_count:avg_rate5m = avg(rate(rpc_durations_seconds_count[5m])) by (job, service)
+ # Rules adapted from those provided by coreos/prometheus-operator and SoundCloud
+ alertmanager.rules.yaml: |+
+ groups:
+ - name: ./alertmanager.rules
+ rules:
+ - alert: AlertmanagerConfigInconsistent
+ expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
+ GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
+ "alertmanager-$1", "alertmanager", "(.*)") != 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+          description: The configurations of the instances of the Alertmanager cluster
+ `{{$labels.service}}` are out of sync.
+ summary: Alertmanager configurations are inconsistent
+ - alert: AlertmanagerDownOrMissing
+ expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
+ "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ description: An unexpected number of Alertmanagers are scraped or Alertmanagers
+ disappeared from discovery.
+ summary: Alertmanager down or not discovered
+ - alert: FailedReload
+ expr: alertmanager_config_last_reload_successful == 0
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
+ }}/{{ $labels.pod}}.
+ summary: Alertmanager configuration reload has failed
+ etcd3.rules.yaml: |+
+ groups:
+ - name: ./etcd3.rules
+ rules:
+ - alert: InsufficientMembers
+ expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
+ for: 3m
+ labels:
+ severity: critical
+ annotations:
+          description: If one more etcd member goes down, the cluster will be unavailable
+ summary: etcd cluster insufficient members
+ - alert: NoLeader
+ expr: etcd_server_has_leader{job="etcd"} == 0
+ for: 1m
+ labels:
+ severity: critical
+ annotations:
+ description: etcd member {{ $labels.instance }} has no leader
+ summary: etcd member has no leader
+ - alert: HighNumberOfLeaderChanges
+ expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
+ labels:
+ severity: warning
+ annotations:
+ description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
+ changes within the last hour
+ summary: a high number of leader changes within the etcd cluster are happening
+ - alert: HighNumberOfFailedGRPCRequests
+ expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
+ / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
+ on etcd instance {{ $labels.instance }}'
+ summary: a high number of gRPC requests are failing
+ - alert: HighNumberOfFailedGRPCRequests
+ expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
+ / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
+ on etcd instance {{ $labels.instance }}'
+ summary: a high number of gRPC requests are failing
+ - alert: GRPCRequestsSlow
+ expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m]))
+ > 0.15
+ for: 10m
+ labels:
+ severity: critical
+ annotations:
+ description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
+ }} are slow
+ summary: slow gRPC requests
+ - alert: HighNumberOfFailedHTTPRequests
+ expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
+ BY (method) > 0.01
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
+ instance {{ $labels.instance }}'
+ summary: a high number of HTTP requests are failing
+ - alert: HighNumberOfFailedHTTPRequests
+ expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
+ BY (method) > 0.05
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
+ instance {{ $labels.instance }}'
+ summary: a high number of HTTP requests are failing
+ - alert: HTTPRequestsSlow
+ expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
+ > 0.15
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
+ }} are slow
+ summary: slow HTTP requests
+ - alert: EtcdMemberCommunicationSlow
+ expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
+ > 0.15
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: etcd instance {{ $labels.instance }} member communication with
+ {{ $labels.To }} is slow
+ summary: etcd member communication is slow
+ - alert: HighNumberOfFailedProposals
+ expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
+ labels:
+ severity: warning
+ annotations:
+ description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
+ failures within the last hour
+ summary: a high number of proposals within the etcd cluster are failing
+ - alert: HighFsyncDurations
+ expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
+ > 0.5
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+          description: etcd instance {{ $labels.instance }} fsync durations are high
+ summary: high fsync durations
+ - alert: HighCommitDurations
+ expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
+ > 0.25
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: etcd instance {{ $labels.instance }} commit durations are high
+ summary: high commit durations
+ general.rules.yaml: |+
+ groups:
+ - name: ./general.rules
+ rules:
+ - alert: TargetDown
+ expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: '{{ $value }}% or more of {{ $labels.job }} targets are down.'
+ summary: Targets are down
+ - alert: TooManyOpenFileDescriptors
+ expr: 100 * (process_open_fds / process_max_fds) > 95
+ for: 10m
+ labels:
+ severity: critical
+ annotations:
+ description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
+ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.'
+ summary: too many open file descriptors
+ - record: instance:fd_utilization
+ expr: process_open_fds / process_max_fds
+ - alert: FdExhaustionClose
+ expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
+            $labels.instance }}) will exhaust its available file/socket descriptors soon'
+ summary: file descriptors soon exhausted
+ - alert: FdExhaustionClose
+ expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
+ for: 10m
+ labels:
+ severity: critical
+ annotations:
+ description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
+            $labels.instance }}) will exhaust its available file/socket descriptors soon'
+ summary: file descriptors soon exhausted
+ kube-apiserver.rules.yaml: |+
+ groups:
+ - name: ./kube-apiserver.rules
+ rules:
+ - alert: K8SApiserverDown
+ expr: absent(up{job="kubernetes-apiservers"} == 1)
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ description: Prometheus failed to scrape API server(s), or all API servers have
+ disappeared from service discovery.
+ summary: API server unreachable
+ - alert: K8SApiServerLatency
+ expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"})
+ WITHOUT (instance, resource)) / 1e+06 > 1
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+          description: 99th percentile latency for {{ $labels.verb }} requests to the
+ kube-apiserver is higher than 1s.
+ summary: Kubernetes apiserver latency is high
+ kube-controller-manager.rules.yaml: |+
+ groups:
+ - name: ./kube-controller-manager.rules
+ rules:
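+      # the kubernetes_name label is assumed to be set by endpoint relabeling from the kube-controller-manager discovery Service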
+ - alert: K8SControllerManagerDown
+ expr: absent(up{kubernetes_name="kube-controller-manager"} == 1)
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ description: There is no running K8S controller manager. Deployments and replication
+ controllers are not making progress.
+ summary: Controller manager is down
+ kube-scheduler.rules.yaml: |+
+ groups:
+ - name: ./kube-scheduler.rules
+ rules:
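+      # the kubernetes_name label is assumed to be set by endpoint relabeling from the kube-scheduler discovery Service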
+ - alert: K8SSchedulerDown
+ expr: absent(up{kubernetes_name="kube-scheduler"} == 1)
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ description: There is no running K8S scheduler. New pods are not being assigned
+ to nodes.
+ summary: Scheduler is down
+ kubelet.rules.yaml: |+
+ groups:
+ - name: ./kubelet.rules
+ rules:
+ - alert: K8SNodeNotReady
+ expr: kube_node_status_condition{condition="Ready",status="true"} == 0
+ for: 1h
+ labels:
+ severity: warning
+ annotations:
+ description: The Kubelet on {{ $labels.node }} has not checked in with the API,
+ or has set itself to NotReady, for more than an hour
+ summary: Node status is NotReady
+ - alert: K8SManyNodesNotReady
+ expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
+ > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
+ 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
+ for: 1m
+ labels:
+ severity: critical
+ annotations:
+          description: '{{ $value }} Kubernetes nodes (more than 20% of the cluster)
+            are in the NotReady state.'
+ summary: Many Kubernetes nodes are Not Ready
+ - alert: K8SKubeletDown
+ expr: count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"}) > 0.03
+ for: 1h
+ labels:
+ severity: warning
+ annotations:
+ description: Prometheus failed to scrape {{ $value }}% of kubelets.
+ summary: Many Kubelets cannot be scraped
+ - alert: K8SKubeletDown
+ expr: absent(up{job="kubernetes-nodes"} == 1) or count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"})
+ > 0.1
+ for: 1h
+ labels:
+ severity: critical
+ annotations:
+ description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
+ have disappeared from service discovery.
+ summary: Many Kubelets cannot be scraped
+ - alert: K8SKubeletTooManyPods
+ expr: kubelet_running_pod_count > 100
+ labels:
+ severity: warning
+ annotations:
+ description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
+ to the limit of 110
+ summary: Kubelet is close to pod limit
+ kubernetes.rules.yaml: |+
+ groups:
+ - name: ./kubernetes.rules
+ rules:
+ - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
+ expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
+ "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
+ controller, pod_name, container_name)
+ - record: cluster_namespace_controller_pod_container:spec_cpu_shares
+ expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller",
+ "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
+ container_name)
+ - record: cluster_namespace_controller_pod_container:cpu_usage:rate
+ expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
+ "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
+ controller, pod_name, container_name)
+ - record: cluster_namespace_controller_pod_container:memory_usage:bytes
+ expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
+ "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
+ container_name)
+ - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
+ expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""},
+ "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
+ controller, pod_name, container_name)
+ - record: cluster_namespace_controller_pod_container:memory_rss:bytes
+ expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller",
+ "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
+ container_name)
+ - record: cluster_namespace_controller_pod_container:memory_cache:bytes
+ expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller",
+ "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
+ container_name)
+ - record: cluster_namespace_controller_pod_container:disk_usage:bytes
+ expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller",
+ "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
+ container_name)
+ - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
+ expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]),
+ "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
+ controller, pod_name, container_name, scope, type)
+ - record: cluster_namespace_controller_pod_container:memory_oom:rate
+ expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]),
+ "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
+ controller, pod_name, container_name, scope, type)
+ - record: cluster:memory_allocation:percent
+ expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster)
+ / sum(machine_memory_bytes) BY (cluster)
+ - record: cluster:memory_used:percent
+ expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
+ BY (cluster)
+ - record: cluster:cpu_allocation:percent
+ expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"}
+ * ON(cluster, instance) machine_cpu_cores) BY (cluster)
+ - record: cluster:node_cpu_use:percent
+ expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores)
+ BY (cluster)
+ - record: cluster_resource_verb:apiserver_latency:quantile_seconds
+ expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le,
+ cluster, job, resource, verb)) / 1e+06
+ labels:
+ quantile: "0.99"
+ - record: cluster_resource_verb:apiserver_latency:quantile_seconds
+ expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le,
+ cluster, job, resource, verb)) / 1e+06
+ labels:
+ quantile: "0.9"
+ - record: cluster_resource_verb:apiserver_latency:quantile_seconds
+ expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le,
+ cluster, job, resource, verb)) / 1e+06
+ labels:
+ quantile: "0.5"
+ - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
+ expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+ BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.99"
+ - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
+ expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+ BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.9"
+ - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
+ expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+ BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.5"
+ - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
+ expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+ BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.99"
+ - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
+ expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+ BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.9"
+ - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
+ expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+ BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.5"
+ - record: cluster:scheduler_binding_latency:quantile_seconds
+ expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
+ BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.99"
+ - record: cluster:scheduler_binding_latency:quantile_seconds
+ expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
+ BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.9"
+ - record: cluster:scheduler_binding_latency:quantile_seconds
+ expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
+ BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.5"
+ node.rules.yaml: |+
+ groups:
+ - name: ./node.rules
+ rules:
+ - alert: NodeExporterDown
+ expr: absent(up{kubernetes_name="node-exporter"} == 1)
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: Prometheus could not scrape a node-exporter for more than 10m,
+ or node-exporters have disappeared from discovery.
+ summary: node-exporter cannot be scraped
+ - alert: K8SNodeOutOfDisk
+ expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
+ labels:
+ service: k8s
+ severity: critical
+ annotations:
+ description: '{{ $labels.node }} has run out of disk space.'
+ summary: Node ran out of disk space.
+ - alert: K8SNodeMemoryPressure
+ expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
+ 1
+ labels:
+ service: k8s
+ severity: warning
+ annotations:
+ description: '{{ $labels.node }} is under memory pressure.'
+ summary: Node is under memory pressure.
+ - alert: K8SNodeDiskPressure
+ expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+ labels:
+ service: k8s
+ severity: warning
+ annotations:
+ description: '{{ $labels.node }} is under disk pressure.'
+ summary: Node is under disk pressure.
+ prometheus.rules.yaml: |+
+ groups:
+ - name: ./prometheus.rules
+ rules:
+ - alert: FailedReload
+ expr: prometheus_config_last_reload_successful == 0
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: Reloading Prometheus' configuration has failed for {{ $labels.namespace
+ }}/{{ $labels.pod}}.
+ summary: Prometheus configuration reload has failed
diff --git a/docs/addons/prometheus.md b/docs/addons/prometheus.md
index b21c4895..fc302fa6 100644
--- a/docs/addons/prometheus.md
+++ b/docs/addons/prometheus.md
@@ -30,9 +30,9 @@ Exporters expose metrics for 3rd-party systems that don't natively expose Promet
* [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) - Deployment that exposes Kubernetes object metrics
* [blackbox_exporter](https://github.com/prometheus/blackbox_exporter) - Scrapes HTTP, HTTPS, DNS, TCP, or ICMP endpoints and exposes availability as metrics
-### Queries and Graphs
+### Queries and Alerts
-Prometheus provides a simplistic UI for querying and graphing metrics. Use `kubectl` to authenticate to the apiserver and create a local port-forward to the Prometheus pod.
+Prometheus provides a simple UI for querying metrics and viewing alerts. Use `kubectl` to authenticate to the apiserver and create a local port-forward to the Prometheus pod.
```
kubectl get pods -n monitoring
@@ -44,6 +44,8 @@ Visit [127.0.0.1:9090](http://127.0.0.1:9090) to query [expressions](http://127.


+![Prometheus Alerts](../img/prometheus-alerts.png)
+
## Grafana
diff --git a/docs/img/prometheus-alerts.png b/docs/img/prometheus-alerts.png
new file mode 100644
index 00000000..9b70e25e
Binary files /dev/null and b/docs/img/prometheus-alerts.png differ