From a027b439e58cc0ccae23991c8f25674138e97bc8 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Fri, 7 Mar 2025 15:21:52 +0100 Subject: [PATCH] DRA: add device taint eviction controller The controller is derived from the node taint eviction controller. In contrast to that controller it tracks the UID of pods to prevent deleting the wrong pod when it got replaced. --- .../app/controllermanager.go | 1 + .../app/controllermanager_test.go | 1 + cmd/kube-controller-manager/app/core.go | 27 + .../names/controller_names.go | 1 + hack/verify-prometheus-imports.sh | 1 + .../device_taint_eviction.go | 916 +++++++++ .../device_taint_eviction_test.go | 1754 +++++++++++++++++ pkg/controller/devicetainteviction/doc.go | 8 +- .../devicetainteviction/metrics/metrics.go | 78 +- .../devicetainteviction/taint_eviction.go | 614 ------ .../taint_eviction_test.go | 941 --------- .../tainteviction/namespacedobject.go | 50 + .../tainteviction/taint_eviction.go | 8 +- pkg/controller/tainteviction/timed_workers.go | 47 +- .../dynamicresources/dynamicresources_test.go | 2 +- pkg/scheduler/testing/wrappers.go | 20 +- .../rbac/bootstrappolicy/controller_policy.go | 18 + 17 files changed, 2888 insertions(+), 1599 deletions(-) create mode 100644 pkg/controller/devicetainteviction/device_taint_eviction.go create mode 100644 pkg/controller/devicetainteviction/device_taint_eviction_test.go delete mode 100644 pkg/controller/devicetainteviction/taint_eviction.go delete mode 100644 pkg/controller/devicetainteviction/taint_eviction_test.go create mode 100644 pkg/controller/tainteviction/namespacedobject.go diff --git a/cmd/kube-controller-manager/app/controllermanager.go b/cmd/kube-controller-manager/app/controllermanager.go index dc847d66e5d..822f334c94c 100644 --- a/cmd/kube-controller-manager/app/controllermanager.go +++ b/cmd/kube-controller-manager/app/controllermanager.go @@ -588,6 +588,7 @@ func NewControllerDescriptors() map[string]*ControllerDescriptor { // feature gated register(newStorageVersionGarbageCollectorControllerDescriptor()) register(newResourceClaimControllerDescriptor()) + register(newDeviceTaintEvictionControllerDescriptor()) register(newLegacyServiceAccountTokenCleanerControllerDescriptor()) register(newValidatingAdmissionPolicyStatusControllerDescriptor()) register(newTaintEvictionControllerDescriptor()) diff --git a/cmd/kube-controller-manager/app/controllermanager_test.go b/cmd/kube-controller-manager/app/controllermanager_test.go index 88af44ed6d6..dd99d4c8269 100644 --- a/cmd/kube-controller-manager/app/controllermanager_test.go +++ b/cmd/kube-controller-manager/app/controllermanager_test.go @@ -93,6 +93,7 @@ func TestControllerNamesDeclaration(t *testing.T) { names.EphemeralVolumeController, names.StorageVersionGarbageCollectorController, names.ResourceClaimController, + names.DeviceTaintEvictionController, names.LegacyServiceAccountTokenCleanerController, names.ValidatingAdmissionPolicyStatusController, names.ServiceCIDRController, diff --git a/cmd/kube-controller-manager/app/core.go b/cmd/kube-controller-manager/app/core.go index a6b9f5bb6e3..500903de4d0 100644 --- a/cmd/kube-controller-manager/app/core.go +++ b/cmd/kube-controller-manager/app/core.go @@ -41,6 +41,7 @@ import ( "k8s.io/klog/v2" "k8s.io/kubernetes/cmd/kube-controller-manager/names" pkgcontroller "k8s.io/kubernetes/pkg/controller" + "k8s.io/kubernetes/pkg/controller/devicetainteviction" endpointcontroller "k8s.io/kubernetes/pkg/controller/endpoint" "k8s.io/kubernetes/pkg/controller/garbagecollector" namespacecontroller "k8s.io/kubernetes/pkg/controller/namespace" @@ -231,6 +232,32 @@ func startTaintEvictionController(ctx context.Context, controllerContext Control return nil, true, nil } +func newDeviceTaintEvictionControllerDescriptor() *ControllerDescriptor { + return &ControllerDescriptor{ + name: names.DeviceTaintEvictionController, + initFunc: startDeviceTaintEvictionController, + requiredFeatureGates: []featuregate.Feature{ + // TODO update app.TestFeatureGatedControllersShouldNotDefineAliases when removing these feature gates. + features.DynamicResourceAllocation, + features.DRADeviceTaints, + }, + } +} + +func startDeviceTaintEvictionController(ctx context.Context, controllerContext ControllerContext, controllerName string) (controller.Interface, bool, error) { + deviceTaintEvictionController := devicetainteviction.New( + controllerContext.ClientBuilder.ClientOrDie(names.DeviceTaintEvictionController), + controllerContext.InformerFactory.Core().V1().Pods(), + controllerContext.InformerFactory.Resource().V1beta1().ResourceClaims(), + controllerContext.InformerFactory.Resource().V1beta1().ResourceSlices(), + controllerContext.InformerFactory.Resource().V1alpha3().DeviceTaintRules(), + controllerContext.InformerFactory.Resource().V1beta1().DeviceClasses(), + controllerName, + ) + go deviceTaintEvictionController.Run(ctx) + return nil, true, nil +} + func newCloudNodeLifecycleControllerDescriptor() *ControllerDescriptor { return &ControllerDescriptor{ name: cpnames.CloudNodeLifecycleController, diff --git a/cmd/kube-controller-manager/names/controller_names.go b/cmd/kube-controller-manager/names/controller_names.go index db88935c4c7..7aa1b6998c0 100644 --- a/cmd/kube-controller-manager/names/controller_names.go +++ b/cmd/kube-controller-manager/names/controller_names.go @@ -69,6 +69,7 @@ const ( NodeIpamController = "node-ipam-controller" NodeLifecycleController = "node-lifecycle-controller" TaintEvictionController = "taint-eviction-controller" + DeviceTaintEvictionController = "device-taint-eviction-controller" PersistentVolumeBinderController = "persistentvolume-binder-controller" PersistentVolumeAttachDetachController = "persistentvolume-attach-detach-controller" PersistentVolumeExpanderController = "persistentvolume-expander-controller" diff --git a/hack/verify-prometheus-imports.sh b/hack/verify-prometheus-imports.sh index ba1321e1e41..18a67e77922 100755 --- a/hack/verify-prometheus-imports.sh +++ b/hack/verify-prometheus-imports.sh @@ -37,6 +37,7 @@ source "${KUBE_ROOT}/hack/lib/util.sh" # See: https://github.com/kubernetes/kubernetes/issues/89267 allowed_prometheus_importers=( ./cluster/images/etcd-version-monitor/etcd-version-monitor.go + ./pkg/controller/devicetainteviction/device_taint_eviction_test.go ./staging/src/k8s.io/component-base/metrics/prometheusextension/timing_histogram.go ./staging/src/k8s.io/component-base/metrics/prometheusextension/timing_histogram_test.go ./staging/src/k8s.io/component-base/metrics/prometheusextension/timing_histogram_vec.go diff --git a/pkg/controller/devicetainteviction/device_taint_eviction.go b/pkg/controller/devicetainteviction/device_taint_eviction.go new file mode 100644 index 00000000000..a2cc9354a58 --- /dev/null +++ b/pkg/controller/devicetainteviction/device_taint_eviction.go @@ -0,0 +1,916 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package devicetainteviction + +import ( + "context" + "fmt" + "math" + "slices" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/google/go-cmp/cmp" //nolint:depguard // Discouraged for production use (https://github.com/kubernetes/kubernetes/issues/104821) but has no good alternative for logging. + + v1 "k8s.io/api/core/v1" + resourceapi "k8s.io/api/resource/v1beta1" + apiequality "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/sets" + coreinformers "k8s.io/client-go/informers/core/v1" + resourcealphainformers "k8s.io/client-go/informers/resource/v1alpha3" + resourceinformers "k8s.io/client-go/informers/resource/v1beta1" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + v1core "k8s.io/client-go/kubernetes/typed/core/v1" + corelisters "k8s.io/client-go/listers/core/v1" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/record" + "k8s.io/dynamic-resource-allocation/resourceclaim" + resourceslicetracker "k8s.io/dynamic-resource-allocation/resourceslice/tracker" + "k8s.io/klog/v2" + apipod "k8s.io/kubernetes/pkg/api/v1/pod" + "k8s.io/kubernetes/pkg/controller/devicetainteviction/metrics" + "k8s.io/kubernetes/pkg/controller/tainteviction" + utilpod "k8s.io/kubernetes/pkg/util/pod" +) + +const ( + // retries is the number of times that the controller tries to delete a pod + // that needs to be evicted. + retries = 5 +) + +// Controller listens to Taint changes of DRA devices and Toleration changes of ResourceClaims, +// then deletes Pods which use ResourceClaims that don't tolerate a NoExecute taint. +// Pods which have already reached a final state (aka terminated) don't need to be deleted. +// +// All of the logic which identifies pods which need to be evicted runs in the +// handle* event handlers. They don't call any blocking method. All the blocking +// calls happen in a [tainteviction.TimedWorkerQueue], using the context passed to Run. +// +// The [resourceslicetracker] takes care of applying taints defined in DeviceTaintRules +// to ResourceSlices. This controller here receives modified ResourceSlices with all +// applicable taints from that tracker and doesn't need to care about where a +// taint came from, the DRA driver or a DeviceTaintRule. +type Controller struct { + name string + + // logger is the general-purpose logger to be used for background activities. + logger klog.Logger + + // handlerLogger is specifically for logging during event handling. It may be nil + // if no such logging is desired. + eventLogger *klog.Logger + + client clientset.Interface + recorder record.EventRecorder + podInformer coreinformers.PodInformer + podLister corelisters.PodLister + claimInformer resourceinformers.ResourceClaimInformer + sliceInformer resourceinformers.ResourceSliceInformer + taintInformer resourcealphainformers.DeviceTaintRuleInformer + classInformer resourceinformers.DeviceClassInformer + haveSynced []cache.InformerSynced + metrics metrics.Metrics + + // evictPod ensures that the pod gets evicted at the specified time. + // It doesn't block. + evictPod func(pod tainteviction.NamespacedObject, fireAt time.Time) + + // cancelEvict cancels eviction set up with evictPod earlier. + // Idempotent, returns false if there was nothing to cancel. + cancelEvict func(pod tainteviction.NamespacedObject) bool + + // allocatedClaims holds all currently known allocated claims. + allocatedClaims map[types.NamespacedName]allocatedClaim // A value is slightly more efficient in BenchmarkTaintUntaint (less allocations!). + + // pools indexes all slices by driver and pool name. + pools map[poolID]pool + + hasSynced atomic.Int32 +} + +type poolID struct { + driverName, poolName string +} + +type pool struct { + slices sets.Set[*resourceapi.ResourceSlice] + maxGeneration int64 +} + +// addSlice adds one slice to the pool. +func (p *pool) addSlice(slice *resourceapi.ResourceSlice) { + if slice == nil { + return + } + if p.slices == nil { + p.slices = sets.New[*resourceapi.ResourceSlice]() + p.maxGeneration = math.MinInt64 + } + p.slices.Insert(slice) + + // Adding a slice can only increase the generation. + if slice.Spec.Pool.Generation > p.maxGeneration { + p.maxGeneration = slice.Spec.Pool.Generation + } +} + +// removeSlice removes a slice. It must have been added before. +func (p *pool) removeSlice(slice *resourceapi.ResourceSlice) { + if slice == nil { + return + } + p.slices.Delete(slice) + + // Removing a slice might have decreased the generation to + // that of some other slice. + if slice.Spec.Pool.Generation == p.maxGeneration { + maxGeneration := int64(math.MinInt64) + for slice := range p.slices { + if slice.Spec.Pool.Generation > maxGeneration { + maxGeneration = slice.Spec.Pool.Generation + } + } + p.maxGeneration = maxGeneration + } +} + +// getTaintedDevices appends all device taints with NoExecute effect. +// The result is sorted by device name. +func (p pool) getTaintedDevices() []taintedDevice { + var buffer []taintedDevice + for slice := range p.slices { + if slice.Spec.Pool.Generation != p.maxGeneration { + continue + } + for _, device := range slice.Spec.Devices { + if device.Basic == nil { + // Unknown device type, not supported. + continue + } + for _, taint := range device.Basic.Taints { + if taint.Effect != resourceapi.DeviceTaintEffectNoExecute { + continue + } + buffer = append(buffer, taintedDevice{deviceName: device.Name, taint: taint}) + } + } + } + + // slices.SortFunc is more efficient than sort.Slice here. + slices.SortFunc(buffer, func(a, b taintedDevice) int { + return strings.Compare(a.deviceName, b.deviceName) + }) + return buffer +} + +// getDevice looks up one device by name. Out-dated slices are ignored. +func (p pool) getDevice(deviceName string) *resourceapi.BasicDevice { + for slice := range p.slices { + if slice.Spec.Pool.Generation != p.maxGeneration { + continue + } + for _, device := range slice.Spec.Devices { + if device.Basic == nil { + // Unknown device type, not supported. + continue + } + if device.Name == deviceName { + return device.Basic + } + } + } + + return nil +} + +type taintedDevice struct { + deviceName string + taint resourceapi.DeviceTaint +} + +// allocatedClaim is a ResourceClaim which has an allocation result. It +// may or may not be tainted such that pods need to be evicted. +type allocatedClaim struct { + *resourceapi.ResourceClaim + + // evictionTime, if non-nil, is the time at which pods using this claim need to be evicted. + // This is the smallest value of all such per-device values. + // For each device, the value is calculated as `