diff --git a/cmd/kube-controller-manager/app/controllermanager.go b/cmd/kube-controller-manager/app/controllermanager.go index dc847d66e5d..822f334c94c 100644 --- a/cmd/kube-controller-manager/app/controllermanager.go +++ b/cmd/kube-controller-manager/app/controllermanager.go @@ -588,6 +588,7 @@ func NewControllerDescriptors() map[string]*ControllerDescriptor { // feature gated register(newStorageVersionGarbageCollectorControllerDescriptor()) register(newResourceClaimControllerDescriptor()) + register(newDeviceTaintEvictionControllerDescriptor()) register(newLegacyServiceAccountTokenCleanerControllerDescriptor()) register(newValidatingAdmissionPolicyStatusControllerDescriptor()) register(newTaintEvictionControllerDescriptor()) diff --git a/cmd/kube-controller-manager/app/controllermanager_test.go b/cmd/kube-controller-manager/app/controllermanager_test.go index 88af44ed6d6..dd99d4c8269 100644 --- a/cmd/kube-controller-manager/app/controllermanager_test.go +++ b/cmd/kube-controller-manager/app/controllermanager_test.go @@ -93,6 +93,7 @@ func TestControllerNamesDeclaration(t *testing.T) { names.EphemeralVolumeController, names.StorageVersionGarbageCollectorController, names.ResourceClaimController, + names.DeviceTaintEvictionController, names.LegacyServiceAccountTokenCleanerController, names.ValidatingAdmissionPolicyStatusController, names.ServiceCIDRController, diff --git a/cmd/kube-controller-manager/app/core.go b/cmd/kube-controller-manager/app/core.go index a6b9f5bb6e3..500903de4d0 100644 --- a/cmd/kube-controller-manager/app/core.go +++ b/cmd/kube-controller-manager/app/core.go @@ -41,6 +41,7 @@ import ( "k8s.io/klog/v2" "k8s.io/kubernetes/cmd/kube-controller-manager/names" pkgcontroller "k8s.io/kubernetes/pkg/controller" + "k8s.io/kubernetes/pkg/controller/devicetainteviction" endpointcontroller "k8s.io/kubernetes/pkg/controller/endpoint" "k8s.io/kubernetes/pkg/controller/garbagecollector" namespacecontroller "k8s.io/kubernetes/pkg/controller/namespace" @@ -231,6 +232,32 @@ func startTaintEvictionController(ctx context.Context, controllerContext Control return nil, true, nil } +func newDeviceTaintEvictionControllerDescriptor() *ControllerDescriptor { + return &ControllerDescriptor{ + name: names.DeviceTaintEvictionController, + initFunc: startDeviceTaintEvictionController, + requiredFeatureGates: []featuregate.Feature{ + // TODO update app.TestFeatureGatedControllersShouldNotDefineAliases when removing these feature gates. + features.DynamicResourceAllocation, + features.DRADeviceTaints, + }, + } +} + +func startDeviceTaintEvictionController(ctx context.Context, controllerContext ControllerContext, controllerName string) (controller.Interface, bool, error) { + deviceTaintEvictionController := devicetainteviction.New( + controllerContext.ClientBuilder.ClientOrDie(names.DeviceTaintEvictionController), + controllerContext.InformerFactory.Core().V1().Pods(), + controllerContext.InformerFactory.Resource().V1beta1().ResourceClaims(), + controllerContext.InformerFactory.Resource().V1beta1().ResourceSlices(), + controllerContext.InformerFactory.Resource().V1alpha3().DeviceTaintRules(), + controllerContext.InformerFactory.Resource().V1beta1().DeviceClasses(), + controllerName, + ) + go deviceTaintEvictionController.Run(ctx) + return nil, true, nil +} + func newCloudNodeLifecycleControllerDescriptor() *ControllerDescriptor { return &ControllerDescriptor{ name: cpnames.CloudNodeLifecycleController, diff --git a/cmd/kube-controller-manager/names/controller_names.go b/cmd/kube-controller-manager/names/controller_names.go index db88935c4c7..7aa1b6998c0 100644 --- a/cmd/kube-controller-manager/names/controller_names.go +++ b/cmd/kube-controller-manager/names/controller_names.go @@ -69,6 +69,7 @@ const ( NodeIpamController = "node-ipam-controller" NodeLifecycleController = "node-lifecycle-controller" TaintEvictionController = "taint-eviction-controller" + DeviceTaintEvictionController = "device-taint-eviction-controller" PersistentVolumeBinderController = "persistentvolume-binder-controller" PersistentVolumeAttachDetachController = "persistentvolume-attach-detach-controller" PersistentVolumeExpanderController = "persistentvolume-expander-controller" diff --git a/hack/verify-prometheus-imports.sh b/hack/verify-prometheus-imports.sh index ba1321e1e41..18a67e77922 100755 --- a/hack/verify-prometheus-imports.sh +++ b/hack/verify-prometheus-imports.sh @@ -37,6 +37,7 @@ source "${KUBE_ROOT}/hack/lib/util.sh" # See: https://github.com/kubernetes/kubernetes/issues/89267 allowed_prometheus_importers=( ./cluster/images/etcd-version-monitor/etcd-version-monitor.go + ./pkg/controller/devicetainteviction/device_taint_eviction_test.go ./staging/src/k8s.io/component-base/metrics/prometheusextension/timing_histogram.go ./staging/src/k8s.io/component-base/metrics/prometheusextension/timing_histogram_test.go ./staging/src/k8s.io/component-base/metrics/prometheusextension/timing_histogram_vec.go diff --git a/pkg/controller/devicetainteviction/device_taint_eviction.go b/pkg/controller/devicetainteviction/device_taint_eviction.go new file mode 100644 index 00000000000..a2cc9354a58 --- /dev/null +++ b/pkg/controller/devicetainteviction/device_taint_eviction.go @@ -0,0 +1,916 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package devicetainteviction + +import ( + "context" + "fmt" + "math" + "slices" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/google/go-cmp/cmp" //nolint:depguard // Discouraged for production use (https://github.com/kubernetes/kubernetes/issues/104821) but has no good alternative for logging. + + v1 "k8s.io/api/core/v1" + resourceapi "k8s.io/api/resource/v1beta1" + apiequality "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/sets" + coreinformers "k8s.io/client-go/informers/core/v1" + resourcealphainformers "k8s.io/client-go/informers/resource/v1alpha3" + resourceinformers "k8s.io/client-go/informers/resource/v1beta1" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + v1core "k8s.io/client-go/kubernetes/typed/core/v1" + corelisters "k8s.io/client-go/listers/core/v1" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/record" + "k8s.io/dynamic-resource-allocation/resourceclaim" + resourceslicetracker "k8s.io/dynamic-resource-allocation/resourceslice/tracker" + "k8s.io/klog/v2" + apipod "k8s.io/kubernetes/pkg/api/v1/pod" + "k8s.io/kubernetes/pkg/controller/devicetainteviction/metrics" + "k8s.io/kubernetes/pkg/controller/tainteviction" + utilpod "k8s.io/kubernetes/pkg/util/pod" +) + +const ( + // retries is the number of times that the controller tries to delete a pod + // that needs to be evicted. + retries = 5 +) + +// Controller listens to Taint changes of DRA devices and Toleration changes of ResourceClaims, +// then deletes Pods which use ResourceClaims that don't tolerate a NoExecute taint. +// Pods which have already reached a final state (aka terminated) don't need to be deleted. +// +// All of the logic which identifies pods which need to be evicted runs in the +// handle* event handlers. They don't call any blocking method. All the blocking +// calls happen in a [tainteviction.TimedWorkerQueue], using the context passed to Run. +// +// The [resourceslicetracker] takes care of applying taints defined in DeviceTaintRules +// to ResourceSlices. This controller here receives modified ResourceSlices with all +// applicable taints from that tracker and doesn't need to care about where a +// taint came from, the DRA driver or a DeviceTaintRule. +type Controller struct { + name string + + // logger is the general-purpose logger to be used for background activities. + logger klog.Logger + + // handlerLogger is specifically for logging during event handling. It may be nil + // if no such logging is desired. + eventLogger *klog.Logger + + client clientset.Interface + recorder record.EventRecorder + podInformer coreinformers.PodInformer + podLister corelisters.PodLister + claimInformer resourceinformers.ResourceClaimInformer + sliceInformer resourceinformers.ResourceSliceInformer + taintInformer resourcealphainformers.DeviceTaintRuleInformer + classInformer resourceinformers.DeviceClassInformer + haveSynced []cache.InformerSynced + metrics metrics.Metrics + + // evictPod ensures that the pod gets evicted at the specified time. + // It doesn't block. + evictPod func(pod tainteviction.NamespacedObject, fireAt time.Time) + + // cancelEvict cancels eviction set up with evictPod earlier. + // Idempotent, returns false if there was nothing to cancel. + cancelEvict func(pod tainteviction.NamespacedObject) bool + + // allocatedClaims holds all currently known allocated claims. + allocatedClaims map[types.NamespacedName]allocatedClaim // A value is slightly more efficient in BenchmarkTaintUntaint (less allocations!). + + // pools indexes all slices by driver and pool name. + pools map[poolID]pool + + hasSynced atomic.Int32 +} + +type poolID struct { + driverName, poolName string +} + +type pool struct { + slices sets.Set[*resourceapi.ResourceSlice] + maxGeneration int64 +} + +// addSlice adds one slice to the pool. +func (p *pool) addSlice(slice *resourceapi.ResourceSlice) { + if slice == nil { + return + } + if p.slices == nil { + p.slices = sets.New[*resourceapi.ResourceSlice]() + p.maxGeneration = math.MinInt64 + } + p.slices.Insert(slice) + + // Adding a slice can only increase the generation. + if slice.Spec.Pool.Generation > p.maxGeneration { + p.maxGeneration = slice.Spec.Pool.Generation + } +} + +// removeSlice removes a slice. It must have been added before. +func (p *pool) removeSlice(slice *resourceapi.ResourceSlice) { + if slice == nil { + return + } + p.slices.Delete(slice) + + // Removing a slice might have decreased the generation to + // that of some other slice. + if slice.Spec.Pool.Generation == p.maxGeneration { + maxGeneration := int64(math.MinInt64) + for slice := range p.slices { + if slice.Spec.Pool.Generation > maxGeneration { + maxGeneration = slice.Spec.Pool.Generation + } + } + p.maxGeneration = maxGeneration + } +} + +// getTaintedDevices appends all device taints with NoExecute effect. +// The result is sorted by device name. +func (p pool) getTaintedDevices() []taintedDevice { + var buffer []taintedDevice + for slice := range p.slices { + if slice.Spec.Pool.Generation != p.maxGeneration { + continue + } + for _, device := range slice.Spec.Devices { + if device.Basic == nil { + // Unknown device type, not supported. + continue + } + for _, taint := range device.Basic.Taints { + if taint.Effect != resourceapi.DeviceTaintEffectNoExecute { + continue + } + buffer = append(buffer, taintedDevice{deviceName: device.Name, taint: taint}) + } + } + } + + // slices.SortFunc is more efficient than sort.Slice here. + slices.SortFunc(buffer, func(a, b taintedDevice) int { + return strings.Compare(a.deviceName, b.deviceName) + }) + return buffer +} + +// getDevice looks up one device by name. Out-dated slices are ignored. +func (p pool) getDevice(deviceName string) *resourceapi.BasicDevice { + for slice := range p.slices { + if slice.Spec.Pool.Generation != p.maxGeneration { + continue + } + for _, device := range slice.Spec.Devices { + if device.Basic == nil { + // Unknown device type, not supported. + continue + } + if device.Name == deviceName { + return device.Basic + } + } + } + + return nil +} + +type taintedDevice struct { + deviceName string + taint resourceapi.DeviceTaint +} + +// allocatedClaim is a ResourceClaim which has an allocation result. It +// may or may not be tainted such that pods need to be evicted. +type allocatedClaim struct { + *resourceapi.ResourceClaim + + // evictionTime, if non-nil, is the time at which pods using this claim need to be evicted. + // This is the smallest value of all such per-device values. + // For each device, the value is calculated as `