mirror of
https://github.com/outbackdingo/talos-cloud-controller-manager.git
synced 2026-01-27 10:20:27 +00:00
feat: expose metrics
Expose API call metrics and node CSR approve statistics. Signed-off-by: Serge Logvinov <serge.logvinov@sinextra.dev>
This commit is contained in:
2
Makefile
2
Makefile
@@ -73,7 +73,7 @@ build: ## Build
|
||||
.PHONY: run
|
||||
run: build
|
||||
./talos-cloud-controller-manager-$(ARCH) --v=5 --kubeconfig=kubeconfig --cloud-config=hack/ccm-config.yaml --controllers=cloud-node \
|
||||
--use-service-account-credentials --leader-elect=false --bind-address=127.0.0.1
|
||||
--use-service-account-credentials --leader-elect=false --bind-address=127.0.0.1 --authorization-always-allow-paths=/healthz,/livez,/readyz,/metrics
|
||||
|
||||
.PHONY: lint
|
||||
lint: ## Lint Code
|
||||
|
||||
@@ -11,5 +11,5 @@ keywords:
|
||||
maintainers:
|
||||
- name: sergelogvinov
|
||||
url: https://github.com/sergelogvinov
|
||||
version: 0.3.0
|
||||
version: 0.3.1
|
||||
appVersion: "v1.6.0"
|
||||
|
||||
@@ -56,6 +56,7 @@ spec:
|
||||
- --leader-elect-resource-name=cloud-controller-manager-talos
|
||||
- --use-service-account-credentials
|
||||
- --secure-port={{ .Values.service.containerPort }}
|
||||
- --authorization-always-allow-paths=/healthz,/livez,/readyz,/metrics
|
||||
{{- with .Values.extraArgs }}
|
||||
{{- toYaml . | nindent 12 }}
|
||||
{{- end }}
|
||||
|
||||
@@ -105,6 +105,9 @@ service:
|
||||
containerPort: 50258
|
||||
# -- Additional custom annotations for Service.
|
||||
annotations: {}
|
||||
# prometheus.io/scrape: "true"
|
||||
# prometheus.io/scheme: "https"
|
||||
# prometheus.io/port: "50258"
|
||||
|
||||
# -- Resource requests and limits.
|
||||
# ref: http://kubernetes.io/docs/user-guide/compute-resources/
|
||||
|
||||
76
docs/metrics.md
Normal file
76
docs/metrics.md
Normal file
@@ -0,0 +1,76 @@
|
||||
# Metrics documentation
|
||||
|
||||
This document describes the metrics currently exposed by the Talos CCM.
|
||||
|
||||
## Gather metrics from talos-cloud-controller-manager
|
||||
|
||||
By default, the Talos CCM exposes metrics on the `https://localhost:50258/metrics` endpoint.
|
||||
|
||||
To enable the metrics, set the `--secure-port` flag and add `/metrics` to the `--authorization-always-allow-paths` flag so that the `/metrics` endpoint can be accessed.
|
||||
|
||||
```shell
|
||||
talos-cloud-controller-manager
|
||||
--authorization-always-allow-paths="/metrics"
|
||||
--secure-port=50258
|
||||
```
|
||||
|
||||
### Helm chart values
|
||||
|
||||
The following values can be set in the Helm chart to expose the metrics of the Talos CCM.
|
||||
|
||||
```yaml
|
||||
service:
|
||||
containerPort: 50258
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/scheme: "https"
|
||||
prometheus.io/port: "50258"
|
||||
```
|
||||
|
||||
## Metrics exposed by the CCM
|
||||
|
||||
### Talos API calls
|
||||
|
||||
|Metric name|Metric type|Labels/tags|
|
||||
|-----------|-----------|-----------|
|
||||
|talosccm_api_request_duration_seconds|Histogram|`request`=<api_request>|
|
||||
|talosccm_api_request_errors_total|Counter|`request`=<api_request>|
|
||||
|
||||
Example output:
|
||||
|
||||
```txt
|
||||
talosccm_api_request_duration_seconds_bucket{request="addresses",le="0.1"} 10
|
||||
talosccm_api_request_duration_seconds_bucket{request="addresses",le="0.25"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="addresses",le="0.5"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="addresses",le="1"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="addresses",le="2.5"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="addresses",le="5"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="addresses",le="10"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="addresses",le="30"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="addresses",le="+Inf"} 16
|
||||
talosccm_api_request_duration_seconds_sum{request="addresses"} 1.369387789
|
||||
talosccm_api_request_duration_seconds_count{request="addresses"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="0.1"} 14
|
||||
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="0.25"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="0.5"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="1"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="2.5"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="5"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="10"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="30"} 16
|
||||
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="+Inf"} 16
|
||||
talosccm_api_request_duration_seconds_sum{request="platformmetadata"} 1.2046141220000002
|
||||
talosccm_api_request_duration_seconds_count{request="platformmetadata"} 16
|
||||
```
|
||||
|
||||
### Certificate signing requests (CSR) approval calls
|
||||
|
||||
|Metric name|Metric type|Labels/tags|
|
||||
|-----------|-----------|-----------|
|
||||
|talosccm_csr_approval_count|Counter|`status`=<approve\|deny>|
|
||||
|
||||
Example output:
|
||||
|
||||
```txt
|
||||
talosccm_csr_approval_count{status="approve"} 2
|
||||
```
|
||||
20
pkg/metrics/metrics.go
Normal file
20
pkg/metrics/metrics.go
Normal file
@@ -0,0 +1,20 @@
|
||||
// Package metrics collects metrics.
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// MetricContext carries the state needed to report one Talos API call:
// the time the call started and the label values it is reported under.
type MetricContext struct {
	// start is the reference point from which the request latency is measured.
	start time.Time
	// attributes are the label values handed to the metric vectors.
	attributes []string
}
|
||||
|
||||
// NewMetricContext creates a new MetricContext.
|
||||
func NewMetricContext(resource string) *MetricContext {
|
||||
return &MetricContext{
|
||||
start: time.Now(),
|
||||
attributes: []string{resource},
|
||||
}
|
||||
}
|
||||
51
pkg/metrics/metrics_api.go
Normal file
51
pkg/metrics/metrics_api.go
Normal file
@@ -0,0 +1,51 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"k8s.io/component-base/metrics"
|
||||
"k8s.io/component-base/metrics/legacyregistry"
|
||||
)
|
||||
|
||||
// TalosMetrics contains the metrics for Talos API calls.
|
||||
type TalosMetrics struct {
|
||||
Duration *metrics.HistogramVec
|
||||
Errors *metrics.CounterVec
|
||||
}
|
||||
|
||||
// apiMetrics holds the Talos API metric vectors. It is initialised at package
// load time by registerAPIMetrics, which also registers the vectors.
var apiMetrics = registerAPIMetrics()
|
||||
|
||||
// ObserveRequest records the request latency and counts the errors.
|
||||
func (mc *MetricContext) ObserveRequest(err error) error {
|
||||
apiMetrics.Duration.WithLabelValues(mc.attributes...).Observe(
|
||||
time.Since(mc.start).Seconds())
|
||||
|
||||
if err != nil {
|
||||
apiMetrics.Errors.WithLabelValues(mc.attributes...).Inc()
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func registerAPIMetrics() *TalosMetrics {
|
||||
metrics := &TalosMetrics{
|
||||
Duration: metrics.NewHistogramVec(
|
||||
&metrics.HistogramOpts{
|
||||
Name: "talosccm_api_request_duration_seconds",
|
||||
Help: "Latency of an Talos API call",
|
||||
Buckets: []float64{.1, .25, .5, 1, 2.5, 5, 10, 30},
|
||||
}, []string{"request"}),
|
||||
Errors: metrics.NewCounterVec(
|
||||
&metrics.CounterOpts{
|
||||
Name: "talosccm_api_request_errors_total",
|
||||
Help: "Total number of errors for an Talos API call",
|
||||
}, []string{"request"}),
|
||||
}
|
||||
|
||||
legacyregistry.MustRegister(
|
||||
metrics.Duration,
|
||||
metrics.Errors,
|
||||
)
|
||||
|
||||
return metrics
|
||||
}
|
||||
44
pkg/metrics/metrics_csr.go
Normal file
44
pkg/metrics/metrics_csr.go
Normal file
@@ -0,0 +1,44 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"k8s.io/component-base/metrics"
|
||||
"k8s.io/component-base/metrics/legacyregistry"
|
||||
)
|
||||
|
||||
// CSRMetrics contains the metrics for certificate signing requests.
|
||||
type CSRMetrics struct {
|
||||
approvalCount *metrics.CounterVec
|
||||
}
|
||||
|
||||
// CSRApprovalStatus names the outcome recorded for a certificate signing
// request.
type CSRApprovalStatus string

// Outcomes that can be recorded for a CSR.
const (
	// ApprovalStatusApprove marks a CSR that was approved.
	ApprovalStatusApprove CSRApprovalStatus = "approve"
	// ApprovalStatusDeny marks a CSR that was denied.
	ApprovalStatusDeny CSRApprovalStatus = "deny"
)
|
||||
|
||||
// csrMetrics holds the CSR metric vectors. It is initialised at package load
// time by registerCSRMetrics, which also registers the vectors.
var csrMetrics = registerCSRMetrics()
|
||||
|
||||
// CSRApprovedCount counts the number of approved, denied and ignored CSRs.
|
||||
func CSRApprovedCount(status CSRApprovalStatus) {
|
||||
csrMetrics.approvalCount.WithLabelValues(string(status)).Inc()
|
||||
}
|
||||
|
||||
func registerCSRMetrics() *CSRMetrics {
|
||||
metrics := &CSRMetrics{
|
||||
approvalCount: metrics.NewCounterVec(
|
||||
&metrics.CounterOpts{
|
||||
Name: "talosccm_csr_approval_count",
|
||||
Help: "Count of approved, denied and ignored node CSRs",
|
||||
}, []string{"status"}),
|
||||
}
|
||||
|
||||
legacyregistry.MustRegister(
|
||||
metrics.approvalCount,
|
||||
)
|
||||
|
||||
return metrics
|
||||
}
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/siderolabs/talos-cloud-controller-manager/pkg/metrics"
|
||||
"github.com/siderolabs/talos-cloud-controller-manager/pkg/transformer"
|
||||
utilsnet "github.com/siderolabs/talos-cloud-controller-manager/pkg/utils/net"
|
||||
"github.com/siderolabs/talos/pkg/machinery/resources/network"
|
||||
@@ -193,11 +194,15 @@ func csrNodeChecks(ctx context.Context, kclient clientkubernetes.Interface, x509
|
||||
|
||||
for _, ip := range x509cr.IPAddresses {
|
||||
if !slices.Contains(nodeAddrs, ip.String()) {
|
||||
metrics.CSRApprovedCount(metrics.ApprovalStatusDeny)
|
||||
|
||||
return false, fmt.Errorf("csrNodeChecks: CSR %s Node IP addresses don't match corresponding "+
|
||||
"Node IP addresses %q, got %q", x509cr.DNSNames[0], nodeAddrs, ip)
|
||||
}
|
||||
}
|
||||
|
||||
metrics.CSRApprovedCount(metrics.ApprovalStatusApprove)
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"maps"
|
||||
"strings"
|
||||
|
||||
"github.com/siderolabs/talos-cloud-controller-manager/pkg/metrics"
|
||||
"github.com/siderolabs/talos-cloud-controller-manager/pkg/transformer"
|
||||
"github.com/siderolabs/talos-cloud-controller-manager/pkg/utils/net"
|
||||
"github.com/siderolabs/talos-cloud-controller-manager/pkg/utils/platform"
|
||||
@@ -64,9 +65,11 @@ func (i *instances) InstanceMetadata(ctx context.Context, node *v1.Node) (*cloud
|
||||
return nil, fmt.Errorf("error refreshing client connection: %w", err)
|
||||
}
|
||||
|
||||
mc := metrics.NewMetricContext(runtime.PlatformMetadataID)
|
||||
|
||||
for _, ip := range nodeIPs {
|
||||
meta, err = i.c.getNodeMetadata(ctx, ip)
|
||||
if err == nil {
|
||||
if mc.ObserveRequest(err) == nil {
|
||||
nodeIP = ip
|
||||
|
||||
break
|
||||
@@ -99,8 +102,10 @@ func (i *instances) InstanceMetadata(ctx context.Context, node *v1.Node) (*cloud
|
||||
return nil, fmt.Errorf("error transforming node: %w", err)
|
||||
}
|
||||
|
||||
mc = metrics.NewMetricContext("addresses")
|
||||
|
||||
ifaces, err := i.c.getNodeIfaces(ctx, nodeIP)
|
||||
if err != nil {
|
||||
if mc.ObserveRequest(err) != nil {
|
||||
return nil, fmt.Errorf("error getting interfaces list from the node %s: %w", node.Name, err)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user