From 7fb028a433f0f94e677a5bbe04dfe543de2f2b9a Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 6 Feb 2025 18:47:18 +0100 Subject: [PATCH 01/11] DRA: add DRADeviceTaints feature --- pkg/features/kube_features.go | 21 ++++++++++++------- pkg/features/versioned_kube_features.go | 12 +++++++---- .../reference/versioned_feature_list.yaml | 6 ++++++ 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index 49e6a490e08..3611c3125a1 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -220,6 +220,13 @@ const ( // is to move it into a separate KEP. DRAAdminAccess featuregate.Feature = "DRAAdminAccess" + // owner: @pohly + // kep: http://kep.k8s.io/5055 + // + // Marking devices as tainted can prevent using them for new pods and/or + // cause pods using them to stop. Users can decide to tolerate taints. + DRADeviceTaints featuregate.Feature = "DRADeviceTaints" + // owner: @mortent // kep: http://kep.k8s.io/4816 // @@ -228,6 +235,13 @@ const ( // be selected. DRAPrioritizedList featuregate.Feature = "DRAPrioritizedList" + // owner: @LionelJouin + // kep: http://kep.k8s.io/4817 + // + // Enables support the ResourceClaim.status.devices field and for setting this + // status from DRA drivers. + DRAResourceClaimDeviceStatus featuregate.Feature = "DRAResourceClaimDeviceStatus" + // owner: @pohly // kep: http://kep.k8s.io/4381 // @@ -236,13 +250,6 @@ const ( // based on "structured parameters". DynamicResourceAllocation featuregate.Feature = "DynamicResourceAllocation" - // owner: @LionelJouin - // kep: http://kep.k8s.io/4817 - // - // Enables support the ResourceClaim.status.devices field and for setting this - // status from DRA drivers. 
- DRAResourceClaimDeviceStatus featuregate.Feature = "DRAResourceClaimDeviceStatus" - // owner: @lauralorenz // kep: https://kep.k8s.io/4603 // diff --git a/pkg/features/versioned_kube_features.go b/pkg/features/versioned_kube_features.go index a92e791317c..24ce82778b3 100644 --- a/pkg/features/versioned_kube_features.go +++ b/pkg/features/versioned_kube_features.go @@ -178,13 +178,12 @@ var defaultVersionedKubernetesFeatureGates = map[featuregate.Feature]featuregate {Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Alpha}, }, - DRAPrioritizedList: { + DRADeviceTaints: { {Version: version.MustParse("1.33"), Default: false, PreRelease: featuregate.Alpha}, }, - DynamicResourceAllocation: { - {Version: version.MustParse("1.26"), Default: false, PreRelease: featuregate.Alpha}, - {Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Beta}, + DRAPrioritizedList: { + {Version: version.MustParse("1.33"), Default: false, PreRelease: featuregate.Alpha}, }, DRAResourceClaimDeviceStatus: { @@ -192,6 +191,11 @@ var defaultVersionedKubernetesFeatureGates = map[featuregate.Feature]featuregate {Version: version.MustParse("1.33"), Default: true, PreRelease: featuregate.Beta}, }, + DynamicResourceAllocation: { + {Version: version.MustParse("1.26"), Default: false, PreRelease: featuregate.Alpha}, + {Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Beta}, + }, + KubeletCrashLoopBackOffMax: { {Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Alpha}, }, diff --git a/test/compatibility_lifecycle/reference/versioned_feature_list.yaml b/test/compatibility_lifecycle/reference/versioned_feature_list.yaml index 81de567551a..e54a7b287dd 100644 --- a/test/compatibility_lifecycle/reference/versioned_feature_list.yaml +++ b/test/compatibility_lifecycle/reference/versioned_feature_list.yaml @@ -425,6 +425,12 @@ lockToDefault: false preRelease: Alpha version: "1.32" +- name: DRADeviceTaints + 
versionedSpecs: + - default: false + lockToDefault: false + preRelease: Alpha + version: "1.33" - name: DRAPrioritizedList versionedSpecs: - default: false From 797475e1137914faab71f4c3b18d9041bf73cfc8 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Fri, 7 Mar 2025 14:46:52 +0100 Subject: [PATCH 02/11] DRA: add device taints API This adds the "DeviceTaint" top-level type to v1alpha3 and related fields to ResourceSlice and ResourceClaim. It's complete enough to bring up an API server and generate files. --- pkg/api/testing/defaulting_test.go | 6 + pkg/apis/resource/fuzzer/fuzzer.go | 21 ++ pkg/apis/resource/register.go | 2 + pkg/apis/resource/types.go | 286 ++++++++++++++ pkg/apis/resource/v1alpha3/defaults.go | 9 + pkg/apis/resource/v1alpha3/defaults_test.go | 26 ++ pkg/apis/resource/v1beta1/defaults.go | 9 + pkg/apis/resource/v1beta1/defaults_test.go | 26 ++ pkg/apis/resource/validation/validation.go | 160 +++++++- .../validation_devicetaintrule_test.go | 351 ++++++++++++++++++ .../validation_resourceclaim_test.go | 69 +++- .../validation_resourceslice_test.go | 41 ++ .../default_storage_factory_builder.go | 2 + pkg/printers/internalversion/printers.go | 42 +++ .../devicetaintrule/storage/storage.go | 56 +++ .../devicetaintrule/storage/storage_test.go | 144 +++++++ .../resource/devicetaintrule/strategy.go | 84 +++++ .../resource/devicetaintrule/strategy_test.go | 86 +++++ .../resource/resourceclaim/strategy.go | 2 + .../resource/resourceslice/strategy.go | 36 ++ .../resource/resourceslice/strategy_test.go | 220 +++++++++-- .../resource/rest/storage_resource.go | 9 + .../k8s.io/api/resource/v1alpha3/register.go | 2 + .../src/k8s.io/api/resource/v1alpha3/types.go | 290 +++++++++++++++ .../api/resource/v1beta1/devicetaint.go | 35 ++ .../src/k8s.io/api/resource/v1beta1/types.go | 191 +++++++++- .../dynamic-resource-allocation/api/types.go | 17 + .../resourceclaim/devicetoleration.go | 53 +++ .../resourceclaim/devicetoleration_test.go | 127 +++++++ 
.../deploy/example/devicetaintpolicy.yaml | 13 + .../apiserver/apply/reset_fields_test.go | 1 + test/integration/etcd/data.go | 6 + 32 files changed, 2379 insertions(+), 43 deletions(-) create mode 100644 pkg/apis/resource/validation/validation_devicetaintrule_test.go create mode 100644 pkg/registry/resource/devicetaintrule/storage/storage.go create mode 100644 pkg/registry/resource/devicetaintrule/storage/storage_test.go create mode 100644 pkg/registry/resource/devicetaintrule/strategy.go create mode 100644 pkg/registry/resource/devicetaintrule/strategy_test.go create mode 100644 staging/src/k8s.io/api/resource/v1beta1/devicetaint.go create mode 100644 staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/devicetoleration.go create mode 100644 staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/devicetoleration_test.go create mode 100644 test/e2e/dra/test-driver/deploy/example/devicetaintpolicy.yaml diff --git a/pkg/api/testing/defaulting_test.go b/pkg/api/testing/defaulting_test.go index 3dab45f7119..3ab7111d015 100644 --- a/pkg/api/testing/defaulting_test.go +++ b/pkg/api/testing/defaulting_test.go @@ -135,14 +135,20 @@ func TestDefaulting(t *testing.T) { {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "ClusterRoleBindingList"}: {}, {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "RoleBinding"}: {}, {Group: "rbac.authorization.k8s.io", Version: "v1", Kind: "RoleBindingList"}: {}, + {Group: "resource.k8s.io", Version: "v1alpha3", Kind: "DeviceTaintRule"}: {}, + {Group: "resource.k8s.io", Version: "v1alpha3", Kind: "DeviceTaintRuleList"}: {}, {Group: "resource.k8s.io", Version: "v1alpha3", Kind: "ResourceClaim"}: {}, {Group: "resource.k8s.io", Version: "v1alpha3", Kind: "ResourceClaimList"}: {}, {Group: "resource.k8s.io", Version: "v1alpha3", Kind: "ResourceClaimTemplate"}: {}, {Group: "resource.k8s.io", Version: "v1alpha3", Kind: "ResourceClaimTemplateList"}: {}, + {Group: "resource.k8s.io", Version: "v1alpha3", Kind: 
"ResourceSlice"}: {}, + {Group: "resource.k8s.io", Version: "v1alpha3", Kind: "ResourceSliceList"}: {}, {Group: "resource.k8s.io", Version: "v1beta1", Kind: "ResourceClaim"}: {}, {Group: "resource.k8s.io", Version: "v1beta1", Kind: "ResourceClaimList"}: {}, {Group: "resource.k8s.io", Version: "v1beta1", Kind: "ResourceClaimTemplate"}: {}, {Group: "resource.k8s.io", Version: "v1beta1", Kind: "ResourceClaimTemplateList"}: {}, + {Group: "resource.k8s.io", Version: "v1beta1", Kind: "ResourceSlice"}: {}, + {Group: "resource.k8s.io", Version: "v1beta1", Kind: "ResourceSliceList"}: {}, {Group: "admissionregistration.k8s.io", Version: "v1alpha1", Kind: "ValidatingAdmissionPolicy"}: {}, {Group: "admissionregistration.k8s.io", Version: "v1alpha1", Kind: "ValidatingAdmissionPolicyList"}: {}, {Group: "admissionregistration.k8s.io", Version: "v1alpha1", Kind: "ValidatingAdmissionPolicyBinding"}: {}, diff --git a/pkg/apis/resource/fuzzer/fuzzer.go b/pkg/apis/resource/fuzzer/fuzzer.go index 33bfb5bbb95..34f12e9847d 100644 --- a/pkg/apis/resource/fuzzer/fuzzer.go +++ b/pkg/apis/resource/fuzzer/fuzzer.go @@ -17,6 +17,9 @@ limitations under the License. package fuzzer import ( + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" runtimeserializer "k8s.io/apimachinery/pkg/runtime/serializer" "k8s.io/kubernetes/pkg/apis/resource" @@ -57,6 +60,24 @@ var Funcs = func(codecs runtimeserializer.CodecFactory) []interface{} { }[c.Int31n(2)] } }, + func(r *resource.DeviceToleration, c randfill.Continue) { + c.FillNoCustom(r) + if r.Operator == "" { + r.Operator = []resource.DeviceTolerationOperator{ + resource.DeviceTolerationOpEqual, + resource.DeviceTolerationOpExists, + }[c.Int31n(2)] + } + }, + func(r *resource.DeviceTaint, c randfill.Continue) { + c.FillNoCustom(r) + if r.TimeAdded == nil { + // Current time is more or less random. + // Truncate to seconds because sub-second resolution + // does not survive round-tripping. 
+ r.TimeAdded = &metav1.Time{Time: time.Now().Truncate(time.Second)} + } + }, func(r *resource.OpaqueDeviceConfiguration, c randfill.Continue) { c.FillNoCustom(r) // Match the fuzzer default content for runtime.Object. diff --git a/pkg/apis/resource/register.go b/pkg/apis/resource/register.go index 986dc2168be..3bf3a33bf5a 100644 --- a/pkg/apis/resource/register.go +++ b/pkg/apis/resource/register.go @@ -54,6 +54,8 @@ func addKnownTypes(scheme *runtime.Scheme) error { scheme.AddKnownTypes(SchemeGroupVersion, &DeviceClass{}, &DeviceClassList{}, + &DeviceTaintRule{}, + &DeviceTaintRuleList{}, &ResourceClaim{}, &ResourceClaimList{}, &ResourceClaimTemplate{}, diff --git a/pkg/apis/resource/types.go b/pkg/apis/resource/types.go index c2cb135a36d..c4cc591b34c 100644 --- a/pkg/apis/resource/types.go +++ b/pkg/apis/resource/types.go @@ -219,6 +219,18 @@ type BasicDevice struct { // // +optional Capacity map[QualifiedName]DeviceCapacity + + // If specified, these are the driver-defined taints. + // + // The maximum number of taints is 8. + // + // This is an alpha field and requires enabling the DRADeviceTaints + // feature gate. + // + // +optional + // +listType=atomic + // +featureGate=DRADeviceTaints + Taints []DeviceTaint } // DeviceCapacity describes a quantity associated with a device. @@ -297,6 +309,64 @@ type DeviceAttribute struct { // DeviceAttributeMaxValueLength is the maximum length of a string or version attribute value. const DeviceAttributeMaxValueLength = 64 +// DeviceTaintsMaxLength is the maximum number of taints per device. +const DeviceTaintsMaxLength = 8 + +// The device this taint is attached to has the "effect" on +// any claim which does not tolerate the taint and, through the claim, +// to pods using the claim. +type DeviceTaint struct { + // The taint key to be applied to a device. + // Must be a label name. + // + // +required + Key string + + // The taint value corresponding to the taint key. + // Must be a label value. 
+ // + // +optional + Value string + + // The effect of the taint on claims that do not tolerate the taint + // and through such claims on the pods using them. + // Valid effects are NoSchedule and NoExecute. PreferNoSchedule as used for + // nodes is not valid here. + // + // +required + Effect DeviceTaintEffect + + // ^^^^ + // + // Implementing PreferNoSchedule would depend on a scoring solution for DRA. + // It might get added as part of that. + + // TimeAdded represents the time at which the taint was added. + // Added automatically during create or update if not set. + // + // +optional + TimeAdded *metav1.Time + + // ^^^ + // + // This field was defined as "It is only written for NoExecute taints." for node taints. + // But in practice, Kubernetes never did anything with it (no validation, no defaulting, + // ignored during pod eviction in pkg/controller/tainteviction). +} + +// +enum +type DeviceTaintEffect string + +const ( + // Do not allow new pods to schedule which use a tainted device unless they tolerate the taint, + // but allow all pods submitted to Kubelet without going through the scheduler + // to start, and allow all already-running pods to continue running. + DeviceTaintEffectNoSchedule DeviceTaintEffect = "NoSchedule" + + // Evict any already-running pods that do not tolerate the device taint. + DeviceTaintEffectNoExecute DeviceTaintEffect = "NoExecute" +) + // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // ResourceSliceList is a collection of ResourceSlices. @@ -502,6 +572,32 @@ type DeviceRequest struct { // +listType=atomic // +featureGate=DRAPrioritizedList FirstAvailable []DeviceSubRequest + + // If specified, the request's tolerations. + // + // Tolerations for NoSchedule are required to allocate a + // device which has a taint with that effect. The same applies + // to NoExecute. 
+ // + // In addition, should any of the allocated devices get tainted + // with NoExecute after allocation and that effect is not tolerated, + // then all pods consuming the ResourceClaim get deleted to evict + // them. The scheduler will not let new pods reserve the claim while + // it has these tainted devices. Once all pods are evicted, the + // claim will get deallocated. + // + // The maximum number of tolerations is 16. + // + // This field can only be set when deviceClassName is set and no subrequests + // are specified in the firstAvailable list. + // + // This is an alpha field and requires enabling the DRADeviceTaints + // feature gate. + // + // +optional + // +listType=atomic + // +featureGate=DRADeviceTaints + Tolerations []DeviceToleration } // DeviceSubRequest describes a request for device provided in the @@ -574,11 +670,35 @@ type DeviceSubRequest struct { // +optional // +oneOf=AllocationMode Count int64 + + // If specified, the request's tolerations. + // + // Tolerations for NoSchedule are required to allocate a + // device which has a taint with that effect. The same applies + // to NoExecute. + // + // In addition, should any of the allocated devices get tainted + // with NoExecute after allocation and that effect is not tolerated, + // then all pods consuming the ResourceClaim get deleted to evict + // them. The scheduler will not let new pods reserve the claim while + // it has these tainted devices. Once all pods are evicted, the + // claim will get deallocated. + // + // The maximum number of tolerations is 16. + // + // This is an alpha field and requires enabling the DRADeviceTaints + // feature gate. 
+ // + // +optional + // +listType=atomic + // +featureGate=DRADeviceTaints + Tolerations []DeviceToleration } const ( DeviceSelectorsMaxSize = 32 FirstAvailableDeviceRequestMaxSize = 8 + DeviceTolerationsMaxLength = 16 ) type DeviceAllocationMode string @@ -784,6 +904,59 @@ type OpaqueDeviceConfiguration struct { // [OpaqueDeviceConfiguration.Parameters] field. const OpaqueParametersMaxLength = 10 * 1024 +// The ResourceClaim this DeviceToleration is attached to tolerates any taint that matches +// the triple <key,value,effect> using the matching operator <operator>. +type DeviceToleration struct { + // Key is the taint key that the toleration applies to. Empty means match all taint keys. + // If the key is empty, operator must be Exists; this combination means to match all values and all keys. + // Must be a label name. + // + // +optional + Key string + + // Operator represents a key's relationship to the value. + // Valid operators are Exists and Equal. Defaults to Equal. + // Exists is equivalent to wildcard for value, so that a ResourceClaim can + // tolerate all taints of a particular category. + // + // +optional + // +default="Equal" + Operator DeviceTolerationOperator + + // Value is the taint value the toleration matches to. + // If the operator is Exists, the value must be empty, otherwise just a regular string. + // Must be a label value. + // + // +optional + Value string + + // Effect indicates the taint effect to match. Empty means match all taint effects. + // When specified, allowed values are NoSchedule and NoExecute. + // + // +optional + Effect DeviceTaintEffect + + // TolerationSeconds represents the period of time the toleration (which must be + // of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + // it is not set, which means tolerate the taint forever (do not evict). Zero and + // negative values will be treated as 0 (evict immediately) by the system. 
+ // If larger than zero, the time when the pod needs to be evicted is calculated as