Mirror of https://github.com/optim-enterprises-bv/kubernetes.git (synced 2025-10-31 02:08:13 +00:00)

Merge pull request #129950 from ffromani/alignment-error-detail-metrics
node: metrics for alignment failures
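
This change pairs the existing kubelet_container_aligned_compute_resources_count metric with a failure counter, kubelet_container_aligned_compute_resources_failure_count, using the same scope (container, pod) and boundary (physical_cpu, numa_node) labels. The CPU manager static policy increments the new counter when it fails to allocate exclusive CPUs with full-physical-CPU alignment, the topology manager scopes increment it when admission fails under a policy that guarantees alignment, and all series are primed to zero at startup so they are observable before the first failure.
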
@@ -325,13 +325,15 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
 	defer func() {
 		if rerr != nil {
 			metrics.CPUManagerPinningErrorsTotal.Inc()
+			if p.options.FullPhysicalCPUsOnly {
+				metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
+			}
 			return
 		}
-		if !p.options.FullPhysicalCPUsOnly {
+		if p.options.FullPhysicalCPUsOnly {
 			// increment only if we know we allocate aligned resources
-			return
-		}
-		metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
+			metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
+		}
 	}()

 	if p.options.FullPhysicalCPUsOnly {
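
The accounting above leans on a Go named return value: the deferred closure observes the final value of rerr, so one block records the outcome no matter which return path Allocate takes. A minimal standalone sketch of the pattern, with plain integers standing in for the kubelet counters:

	package main

	import "fmt"

	// hypothetical stand-ins for the kubelet counters
	var successes, failures int

	// allocate demonstrates the named-return + defer pattern used above:
	// the deferred closure sees the final value of rerr and updates
	// exactly one counter, regardless of which return path was taken.
	func allocate(fail bool) (rerr error) {
		defer func() {
			if rerr != nil {
				failures++ // mirrors ContainerAlignedComputeResourcesFailure.Inc()
				return
			}
			successes++ // mirrors ContainerAlignedComputeResources.Inc()
		}()

		if fail {
			return fmt.Errorf("not enough free physical CPUs")
		}
		return nil
	}

	func main() {
		_ = allocate(false)
		_ = allocate(true)
		fmt.Println(successes, failures) // prints: 1 1
	}
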
@@ -752,6 +754,7 @@ func (p *staticPolicy) getAlignedCPUs(numaAffinity bitmask.BitMask, allocatableC
 func (p *staticPolicy) initializeMetrics(s state.State) {
 	metrics.CPUManagerSharedPoolSizeMilliCores.Set(float64(p.GetAvailableCPUs(s).Size() * 1000))
 	metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(countExclusiveCPUs(s)))
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists
 }

 func (p *staticPolicy) updateMetricsOnAllocate(cset cpuset.CPUSet) {

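The Add(0) is not a no-op: a Prometheus counter vector exports nothing for a label combination until that child is first touched, so priming the (container, physical_cpu) series at kubelet startup makes "zero failures so far" visible to scrapers and lets the e2e tests below assert on the value right after a restart.
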
@@ -50,6 +50,9 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 		klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)

 		if !admit {
+			if IsAlignmentGuaranteed(s.policy) {
+				metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
+			}
 			metrics.TopologyManagerAdmissionErrorsTotal.Inc()
 			return admission.GetPodAdmitResult(&TopologyAffinityError{})
 		}

@@ -48,6 +48,10 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 	bestHint, admit := s.calculateAffinity(pod)
 	klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod))
 	if !admit {
+		if IsAlignmentGuaranteed(s.policy) {
+			// increment only if we know we allocate aligned resources.
+			metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
+		}
 		metrics.TopologyManagerAdmissionErrorsTotal.Inc()
 		return admission.GetPodAdmitResult(&TopologyAffinityError{})
 	}

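Both scopes gate the failure counter on IsAlignmentGuaranteed(s.policy), which in the current topology manager code holds only for the single-numa-node policy; under the best-effort or restricted policies an admission rejection implies nothing about alignment, so only TopologyManagerAdmissionErrorsTotal is incremented there.
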
@@ -188,9 +188,19 @@ func NewManager(topology []cadvisorapi.Node, topologyPolicyName string, topology
 		scope: scope,
 	}

+	manager.initializeMetrics()
+
 	return manager, nil
 }

+func (m *manager) initializeMetrics() {
+	// ensure the values exist
+	metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
+}
+
 func (m *manager) GetAffinity(podUID string, containerName string) TopologyHint {
 	return m.scope.GetAffinity(podUID, containerName)
 }

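The same priming idea, reduced to a runnable sketch against the raw prometheus/client_golang API (the kubelet actually goes through the k8s.io/component-base/metrics wrappers; the counter here is shaped like the new metric but registered standalone for illustration):

	package main

	import (
		"net/http"

		"github.com/prometheus/client_golang/prometheus"
		"github.com/prometheus/client_golang/prometheus/promhttp"
	)

	// alignedFailures is an illustrative counter vec shaped like the new
	// kubelet metric: two labels, "scope" and "boundary".
	var alignedFailures = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: "kubelet",
			Name:      "container_aligned_compute_resources_failure_count",
			Help:      "Cumulative number of failures to allocate aligned compute resources.",
		},
		[]string{"scope", "boundary"},
	)

	func main() {
		prometheus.MustRegister(alignedFailures)

		// A CounterVec exports nothing until a child is created; Add(0)
		// materializes each (scope, boundary) series at value 0 so scrapers
		// see it immediately instead of only after the first failure.
		alignedFailures.WithLabelValues("container", "numa_node").Add(0)
		alignedFailures.WithLabelValues("pod", "numa_node").Add(0)

		http.Handle("/metrics", promhttp.Handler())
		_ = http.ListenAndServe(":9090", nil)
	}
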
@@ -132,6 +132,7 @@ const (

 	// Metric for tracking alignment of compute resources
 	ContainerAlignedComputeResourcesNameKey          = "container_aligned_compute_resources_count"
+	ContainerAlignedComputeResourcesFailureNameKey   = "container_aligned_compute_resources_failure_count"
 	ContainerAlignedComputeResourcesScopeLabelKey    = "scope"
 	ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"

@@ -818,7 +819,18 @@ var (
 		},
 		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
 	)
 	// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
+
+	// ContainerAlignedComputeResourcesFailure reports the count of resources allocation attempts which failed to align resources, per alignment boundary
+	ContainerAlignedComputeResourcesFailure = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           ContainerAlignedComputeResourcesFailureNameKey,
+			Help:           "Cumulative number of failures to allocate aligned compute resources to containers by alignment type.",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
+	)

 	MemoryManagerPinningRequestTotal = metrics.NewCounter(
 		&metrics.CounterOpts{
 			Subsystem:      KubeletSubsystem,
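
With the counter registered, a kubelet scrape would expose series along these lines (values illustrative; component-base prefixes the stability level to the help text):

	# HELP kubelet_container_aligned_compute_resources_failure_count [ALPHA] Cumulative number of failures to allocate aligned compute resources to containers by alignment type.
	# TYPE kubelet_container_aligned_compute_resources_failure_count counter
	kubelet_container_aligned_compute_resources_failure_count{boundary="physical_cpu",scope="container"} 0
	kubelet_container_aligned_compute_resources_failure_count{boundary="numa_node",scope="pod"} 1
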
@@ -1079,6 +1091,7 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(CPUManagerSharedPoolSizeMilliCores)
 		legacyregistry.MustRegister(CPUManagerExclusiveCPUsAllocationCount)
 		legacyregistry.MustRegister(ContainerAlignedComputeResources)
+		legacyregistry.MustRegister(ContainerAlignedComputeResourcesFailure)
 		legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
 		legacyregistry.MustRegister(MemoryManagerPinningErrorsTotal)
 		legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)

@@ -104,6 +104,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with no pods running")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
@@ -111,6 +112,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 				"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(0),
+				}),
 			})

 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
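
The element keys join the two label values as scope::boundary, so "container::physical_cpu" selects the sample with scope="container" and boundary="physical_cpu". makeCustomPairID itself is not part of this diff; a plausible reconstruction of such a helper over the Prometheus sample model:

	// Hypothetical reconstruction; the real helper lives in the e2e_node
	// test package. It builds a gstruct.Identifier that keys each metric
	// sample by two of its label values, joined with "::".
	func makeCustomPairID(pri, sec string) gstruct.Identifier {
		return func(element interface{}) string {
			el := element.(*model.Sample) // github.com/prometheus/common/model
			return fmt.Sprintf("%s::%s",
				el.Metric[model.LabelName(pri)],
				el.Metric[model.LabelName(sec)])
		}
	}
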
@@ -127,6 +131,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod failed to admit")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
@@ -134,6 +139,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 				"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(1),
+				}),
 			})

 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -150,6 +158,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
@@ -157,6 +166,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 				"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(0),
+				}),
 			})

 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -178,6 +190,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 				"kubelet_container_aligned_compute_resources_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
 					"container::physical_cpu": timelessSample(1),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(0),
+				}),
 			})

 			ginkgo.By("Giving the Kubelet time to update the alignment metrics")

@@ -28,7 +28,6 @@ import (
 	v1 "k8s.io/api/core/v1"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
-	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -84,6 +83,7 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with no pods running")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
@@ -91,6 +91,10 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(0),
+				}),
 				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 					"": timelessSample(0),
 				}),
@@ -110,6 +114,7 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod failed to admit")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
@@ -117,6 +122,10 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(1),
+				}),
 				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 					"": checkMetricValueGreaterThan(0),
 				}),
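
Note where the failure lands in the rejected-pod case: pod::numa_node goes to 1 while container::numa_node stays 0, matching the podScope.Admit change above — this test evidently runs the topology manager in pod scope, so a rejection is counted once per pod rather than once per container.
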
@@ -136,6 +145,7 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod should be admitted")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
@@ -143,6 +153,10 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(0),
+				}),
 				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 					"": checkMetricValueGreaterThan(0),
 				}),
@@ -162,9 +176,15 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchAlignmentMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
-				"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(nodeID, gstruct.Elements{
-					metrics.AlignedNUMANode: timelessSample(1),
+				"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(idFn, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(1),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(0),
+				}),
 			})