feat: expose metrics

Expose API call metrics and node CSR approve statistics.

Signed-off-by: Serge Logvinov <serge.logvinov@sinextra.dev>
This commit is contained in:
Serge Logvinov
2024-05-05 19:44:32 +03:00
parent 0faf0ae2b4
commit 0e8728c11d
10 changed files with 209 additions and 4 deletions

View File

@@ -73,7 +73,7 @@ build: ## Build
.PHONY: run
run: build
./talos-cloud-controller-manager-$(ARCH) --v=5 --kubeconfig=kubeconfig --cloud-config=hack/ccm-config.yaml --controllers=cloud-node \
--use-service-account-credentials --leader-elect=false --bind-address=127.0.0.1
--use-service-account-credentials --leader-elect=false --bind-address=127.0.0.1 --authorization-always-allow-paths=/healthz,/livez,/readyz,/metrics
.PHONY: lint
lint: ## Lint Code

View File

@@ -11,5 +11,5 @@ keywords:
maintainers:
- name: sergelogvinov
url: https://github.com/sergelogvinov
version: 0.3.0
version: 0.3.1
appVersion: "v1.6.0"

View File

@@ -56,6 +56,7 @@ spec:
- --leader-elect-resource-name=cloud-controller-manager-talos
- --use-service-account-credentials
- --secure-port={{ .Values.service.containerPort }}
- --authorization-always-allow-paths=/healthz,/livez,/readyz,/metrics
{{- with .Values.extraArgs }}
{{- toYaml . | nindent 12 }}
{{- end }}

View File

@@ -105,6 +105,9 @@ service:
containerPort: 50258
# -- Additional custom annotations for Service.
annotations: {}
# prometheus.io/scrape: "true"
# prometheus.io/scheme: "https"
# prometheus.io/port: "50258"
# -- Resource requests and limits.
# ref: http://kubernetes.io/docs/user-guide/compute-resources/

76
docs/metrics.md Normal file
View File

@@ -0,0 +1,76 @@
# Metrics documentation
This document describes the metrics currently exposed by the Talos CCM.
## Gather metrics from talos-cloud-controller-manager
By default, the Talos CCM exposes metrics on the `https://localhost:50258/metrics` endpoint.
To enable the metrics, set the `--secure-port` flag and add `/metrics` to the `--authorization-always-allow-paths` flag so that the endpoint can be accessed without authorization.
```shell
talos-cloud-controller-manager \
  --authorization-always-allow-paths="/metrics" \
  --secure-port=50258
```
### Helm chart values
The following values can be set in the Helm chart to expose the metrics of the Talos CCM.
```yaml
service:
containerPort: 50258
annotations:
prometheus.io/scrape: "true"
prometheus.io/scheme: "https"
prometheus.io/port: "50258"
```
## Metrics exposed by the CCM
### Talos API calls
|Metric name|Metric type|Labels/tags|
|-----------|-----------|-----------|
|talosccm_api_request_duration_seconds|Histogram|`request`=<api_request>|
|talosccm_api_request_errors_total|Counter|`request`=<api_request>|
Example output:
```txt
talosccm_api_request_duration_seconds_bucket{request="addresses",le="0.1"} 10
talosccm_api_request_duration_seconds_bucket{request="addresses",le="0.25"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="0.5"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="1"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="2.5"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="5"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="10"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="30"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="+Inf"} 16
talosccm_api_request_duration_seconds_sum{request="addresses"} 1.369387789
talosccm_api_request_duration_seconds_count{request="addresses"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="0.1"} 14
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="0.25"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="0.5"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="1"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="2.5"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="5"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="10"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="30"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="+Inf"} 16
talosccm_api_request_duration_seconds_sum{request="platformmetadata"} 1.2046141220000002
talosccm_api_request_duration_seconds_count{request="platformmetadata"} 16
```
### Certificate signing requests (CSR) approval calls
|Metric name|Metric type|Labels/tags|
|-----------|-----------|-----------|
|talosccm_csr_approval_count|Counter|`status`=<approve\|deny>|
Example output:
```txt
talosccm_csr_approval_count{status="approve"} 2
```

20
pkg/metrics/metrics.go Normal file
View File

@@ -0,0 +1,20 @@
// Package metrics defines and registers the metrics exposed by the Talos CCM.
package metrics
import (
"time"
)
// MetricContext carries the state needed to record one measured Talos API
// call: when the measurement started and which label values to report.
type MetricContext struct {
// start is the moment the context was created; elapsed time is measured from it.
start time.Time
// attributes are the label values passed to the metric vectors.
attributes []string
}
// NewMetricContext returns a MetricContext whose start time is the current
// time and whose only label value is resource.
func NewMetricContext(resource string) *MetricContext {
	mc := new(MetricContext)
	mc.start = time.Now()
	mc.attributes = []string{resource}

	return mc
}

View File

@@ -0,0 +1,51 @@
package metrics
import (
"time"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)
// TalosMetrics groups the collectors that describe Talos API calls.
type TalosMetrics struct {
// Duration is the histogram that ObserveRequest feeds with the elapsed seconds of a call.
Duration *metrics.HistogramVec
// Errors is the counter that ObserveRequest increments when a call returned a non-nil error.
Errors *metrics.CounterVec
}
// apiMetrics holds the Talos API collectors. Its initializer runs
// registerAPIMetrics, which registers them with the legacy registry.
var apiMetrics = registerAPIMetrics()
// ObserveRequest records the time elapsed since the context was created in
// the duration histogram and, when err is non-nil, increments the error
// counter for the same label values. It returns err unchanged so the call can
// be used inline in an error check.
func (mc *MetricContext) ObserveRequest(err error) error {
	observer := apiMetrics.Duration.WithLabelValues(mc.attributes...)
	observer.Observe(time.Since(mc.start).Seconds())

	if err == nil {
		return nil
	}

	apiMetrics.Errors.WithLabelValues(mc.attributes...).Inc()

	return err
}
// registerAPIMetrics builds the Talos API latency histogram and error counter,
// both labelled by "request", registers them with the legacy registry and
// returns them wrapped in a TalosMetrics.
func registerAPIMetrics() *TalosMetrics {
	// Local names deliberately differ from the imported metrics package so it
	// is not shadowed inside this function.
	latency := metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Name:    "talosccm_api_request_duration_seconds",
			Help:    "Latency of an Talos API call",
			Buckets: []float64{.1, .25, .5, 1, 2.5, 5, 10, 30},
		},
		[]string{"request"},
	)

	failures := metrics.NewCounterVec(
		&metrics.CounterOpts{
			Name: "talosccm_api_request_errors_total",
			Help: "Total number of errors for an Talos API call",
		},
		[]string{"request"},
	)

	legacyregistry.MustRegister(latency, failures)

	return &TalosMetrics{
		Duration: latency,
		Errors:   failures,
	}
}

View File

@@ -0,0 +1,44 @@
package metrics
import (
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)
// CSRMetrics groups the collectors for certificate signing request decisions.
type CSRMetrics struct {
// approvalCount is the counter incremented by CSRApprovedCount, labelled by decision status.
approvalCount *metrics.CounterVec
}
// CSRApprovalStatus is the outcome of a CSR decision; its string value is
// used as the "status" label of the CSR approval counter.
type CSRApprovalStatus string
const (
// ApprovalStatusDeny is used when a CSR is denied.
ApprovalStatusDeny CSRApprovalStatus = "deny"
// ApprovalStatusApprove is used when a CSR is approved.
ApprovalStatusApprove CSRApprovalStatus = "approve"
)
// csrMetrics holds the CSR collectors. Its initializer runs
// registerCSRMetrics, which registers them with the legacy registry.
var csrMetrics = registerCSRMetrics()
// CSRApprovedCount increments the CSR approval counter for the given decision
// status (see ApprovalStatusApprove and ApprovalStatusDeny).
func CSRApprovedCount(status CSRApprovalStatus) {
	label := string(status)
	csrMetrics.approvalCount.WithLabelValues(label).Inc()
}
// registerCSRMetrics builds the CSR approval counter, labelled by "status",
// registers it with the legacy registry and returns it wrapped in a
// CSRMetrics.
func registerCSRMetrics() *CSRMetrics {
	// The local name deliberately differs from the imported metrics package
	// so it is not shadowed inside this function.
	approvals := metrics.NewCounterVec(
		&metrics.CounterOpts{
			Name: "talosccm_csr_approval_count",
			Help: "Count of approved, denied and ignored node CSRs",
		},
		[]string{"status"},
	)

	legacyregistry.MustRegister(approvals)

	return &CSRMetrics{approvalCount: approvals}
}

View File

@@ -7,6 +7,7 @@ import (
"fmt"
"strings"
"github.com/siderolabs/talos-cloud-controller-manager/pkg/metrics"
"github.com/siderolabs/talos-cloud-controller-manager/pkg/transformer"
utilsnet "github.com/siderolabs/talos-cloud-controller-manager/pkg/utils/net"
"github.com/siderolabs/talos/pkg/machinery/resources/network"
@@ -193,11 +194,15 @@ func csrNodeChecks(ctx context.Context, kclient clientkubernetes.Interface, x509
for _, ip := range x509cr.IPAddresses {
if !slices.Contains(nodeAddrs, ip.String()) {
metrics.CSRApprovedCount(metrics.ApprovalStatusDeny)
return false, fmt.Errorf("csrNodeChecks: CSR %s Node IP addresses don't match corresponding "+
"Node IP addresses %q, got %q", x509cr.DNSNames[0], nodeAddrs, ip)
}
}
metrics.CSRApprovedCount(metrics.ApprovalStatusApprove)
return true, nil
}

View File

@@ -6,6 +6,7 @@ import (
"maps"
"strings"
"github.com/siderolabs/talos-cloud-controller-manager/pkg/metrics"
"github.com/siderolabs/talos-cloud-controller-manager/pkg/transformer"
"github.com/siderolabs/talos-cloud-controller-manager/pkg/utils/net"
"github.com/siderolabs/talos-cloud-controller-manager/pkg/utils/platform"
@@ -64,9 +65,11 @@ func (i *instances) InstanceMetadata(ctx context.Context, node *v1.Node) (*cloud
return nil, fmt.Errorf("error refreshing client connection: %w", err)
}
mc := metrics.NewMetricContext(runtime.PlatformMetadataID)
for _, ip := range nodeIPs {
meta, err = i.c.getNodeMetadata(ctx, ip)
if err == nil {
if mc.ObserveRequest(err) == nil {
nodeIP = ip
break
@@ -99,8 +102,10 @@ func (i *instances) InstanceMetadata(ctx context.Context, node *v1.Node) (*cloud
return nil, fmt.Errorf("error transforming node: %w", err)
}
mc = metrics.NewMetricContext("addresses")
ifaces, err := i.c.getNodeIfaces(ctx, nodeIP)
if err != nil {
if mc.ObserveRequest(err) != nil {
return nil, fmt.Errorf("error getting interfaces list from the node %s: %w", node.Name, err)
}