DRA scheduler: add special ActionType for ResourceClaim changes

Having a dedicated ActionType which only gets used when the scheduler itself
already detects some change in the list of generated ResourceClaims of a pod
avoids calling the DRA plugin for unrelated Pod changes.
This commit is contained in:
Patrick Ohly
2024-09-20 16:54:53 +02:00
parent d425353c13
commit aee77bfc84
9 changed files with 366 additions and 99 deletions

View File

@@ -95,6 +95,11 @@ var (
PodResourceClaims(v1.PodResourceClaim{Name: resourceName, ResourceClaimName: &claimName}).
PodResourceClaims(v1.PodResourceClaim{Name: resourceName2, ResourceClaimName: &claimName2}).
Obj()
podWithTwoClaimTemplates = st.MakePod().Name(podName).Namespace(namespace).
UID(podUID).
PodResourceClaims(v1.PodResourceClaim{Name: resourceName, ResourceClaimTemplateName: &claimName}).
PodResourceClaims(v1.PodResourceClaim{Name: resourceName2, ResourceClaimTemplateName: &claimName}).
Obj()
// Node with "instance-1" device and no device attributes.
workerNode = &st.MakeNode().Name(nodeName).Label("kubernetes.io/hostname", nodeName).Node
@@ -1356,19 +1361,19 @@ func Test_isSchedulableAfterClaimChange(t *testing.T) {
pod *v1.Pod
claims []*resourceapi.ResourceClaim
oldObj, newObj interface{}
expectedHint framework.QueueingHint
expectedErr bool
wantHint framework.QueueingHint
wantErr bool
}{
"skip-deletes": {
pod: podWithClaimTemplate,
oldObj: allocatedClaim,
newObj: nil,
expectedHint: framework.QueueSkip,
pod: podWithClaimTemplate,
oldObj: allocatedClaim,
newObj: nil,
wantHint: framework.QueueSkip,
},
"backoff-wrong-new-object": {
pod: podWithClaimTemplate,
newObj: "not-a-claim",
expectedErr: true,
pod: podWithClaimTemplate,
newObj: "not-a-claim",
wantErr: true,
},
"skip-wrong-claim": {
pod: podWithClaimTemplate,
@@ -1377,7 +1382,7 @@ func Test_isSchedulableAfterClaimChange(t *testing.T) {
claim.OwnerReferences[0].UID += "123"
return claim
}(),
expectedHint: framework.QueueSkip,
wantHint: framework.QueueSkip,
},
"skip-unrelated-claim": {
pod: podWithClaimTemplate,
@@ -1388,19 +1393,19 @@ func Test_isSchedulableAfterClaimChange(t *testing.T) {
claim.UID += "123"
return claim
}(),
expectedHint: framework.QueueSkip,
wantHint: framework.QueueSkip,
},
"queue-on-add": {
pod: podWithClaimName,
newObj: pendingClaim,
expectedHint: framework.Queue,
pod: podWithClaimName,
newObj: pendingClaim,
wantHint: framework.Queue,
},
"backoff-wrong-old-object": {
pod: podWithClaimName,
claims: []*resourceapi.ResourceClaim{pendingClaim},
oldObj: "not-a-claim",
newObj: pendingClaim,
expectedErr: true,
pod: podWithClaimName,
claims: []*resourceapi.ResourceClaim{pendingClaim},
oldObj: "not-a-claim",
newObj: pendingClaim,
wantErr: true,
},
"skip-adding-finalizer": {
pod: podWithClaimName,
@@ -1411,7 +1416,7 @@ func Test_isSchedulableAfterClaimChange(t *testing.T) {
claim.Finalizers = append(claim.Finalizers, "foo")
return claim
}(),
expectedHint: framework.QueueSkip,
wantHint: framework.QueueSkip,
},
"queue-on-status-change": {
pod: podWithClaimName,
@@ -1422,7 +1427,7 @@ func Test_isSchedulableAfterClaimChange(t *testing.T) {
claim.Status.Allocation = &resourceapi.AllocationResult{}
return claim
}(),
expectedHint: framework.Queue,
wantHint: framework.Queue,
},
"structured-claim-deallocate": {
pod: podWithClaimName,
@@ -1435,7 +1440,7 @@ func Test_isSchedulableAfterClaimChange(t *testing.T) {
}(),
// TODO (https://github.com/kubernetes/kubernetes/issues/123697): don't wake up
// claims not using structured parameters.
expectedHint: framework.Queue,
wantHint: framework.Queue,
},
}
@@ -1456,18 +1461,24 @@ func Test_isSchedulableAfterClaimChange(t *testing.T) {
createClaim := claim.DeepCopy()
createClaim.UID = ""
storedClaim, err := testCtx.client.ResourceV1alpha3().ResourceClaims(createClaim.Namespace).Create(tCtx, createClaim, metav1.CreateOptions{})
require.NoError(t, err, "create claim")
if err != nil {
t.Fatalf("create claim: expected no error, got: %v", err)
}
claim = storedClaim
} else {
cachedClaim, err := testCtx.claimAssumeCache.Get(claimKey)
require.NoError(t, err, "retrieve old claim")
if err != nil {
t.Fatalf("retrieve old claim: expected no error, got: %v", err)
}
updateClaim := claim.DeepCopy()
// The test claim doesn't have those (generated dynamically), so copy them.
updateClaim.UID = cachedClaim.(*resourceapi.ResourceClaim).UID
updateClaim.ResourceVersion = cachedClaim.(*resourceapi.ResourceClaim).ResourceVersion
storedClaim, err := testCtx.client.ResourceV1alpha3().ResourceClaims(updateClaim.Namespace).Update(tCtx, updateClaim, metav1.UpdateOptions{})
require.NoError(t, err, "update claim")
if err != nil {
t.Fatalf("update claim: expected no error, got: %v", err)
}
claim = storedClaim
}
@@ -1485,14 +1496,98 @@ func Test_isSchedulableAfterClaimChange(t *testing.T) {
// isSchedulableAfterClaimChange.
newObj = claim
}
actualHint, err := testCtx.p.isSchedulableAfterClaimChange(logger, tc.pod, oldObj, newObj)
if tc.expectedErr {
require.Error(t, err)
gotHint, err := testCtx.p.isSchedulableAfterClaimChange(logger, tc.pod, oldObj, newObj)
if tc.wantErr {
if err == nil {
t.Fatal("expected an error, got none")
}
return
}
require.NoError(t, err)
require.Equal(t, tc.expectedHint, actualHint)
if err != nil {
t.Fatalf("expected no error, got: %v", err)
}
if tc.wantHint != gotHint {
t.Fatalf("expected hint %#v, got %#v", tc.wantHint, gotHint)
}
})
}
}
func Test_isSchedulableAfterPodChange(t *testing.T) {
testcases := map[string]struct {
objs []apiruntime.Object
pod *v1.Pod
claims []*resourceapi.ResourceClaim
obj interface{}
wantHint framework.QueueingHint
wantErr bool
}{
"backoff-wrong-new-object": {
pod: podWithClaimTemplate,
obj: "not-a-claim",
wantErr: true,
},
"complete": {
objs: []apiruntime.Object{pendingClaim},
pod: podWithClaimTemplate,
obj: podWithClaimTemplateInStatus,
wantHint: framework.Queue,
},
"wrong-pod": {
objs: []apiruntime.Object{pendingClaim},
pod: func() *v1.Pod {
pod := podWithClaimTemplate.DeepCopy()
pod.Name += "2"
pod.UID += "2" // This is the relevant difference.
return pod
}(),
obj: podWithClaimTemplateInStatus,
wantHint: framework.QueueSkip,
},
"missing-claim": {
objs: nil,
pod: podWithClaimTemplate,
obj: podWithClaimTemplateInStatus,
wantHint: framework.QueueSkip,
},
"incomplete": {
objs: []apiruntime.Object{pendingClaim},
pod: podWithTwoClaimTemplates,
obj: func() *v1.Pod {
pod := podWithTwoClaimTemplates.DeepCopy()
// Only one of two claims created.
pod.Status.ResourceClaimStatuses = []v1.PodResourceClaimStatus{{
Name: pod.Spec.ResourceClaims[0].Name,
ResourceClaimName: &claimName,
}}
return pod
}(),
wantHint: framework.QueueSkip,
},
}
for name, tc := range testcases {
t.Run(name, func(t *testing.T) {
logger, _ := ktesting.NewTestContext(t)
features := feature.Features{
EnableDynamicResourceAllocation: true,
}
testCtx := setup(t, nil, tc.claims, nil, nil, tc.objs, features)
gotHint, err := testCtx.p.isSchedulableAfterPodChange(logger, tc.pod, nil, tc.obj)
if tc.wantErr {
if err == nil {
t.Fatal("expected an error, got none")
}
return
}
if err != nil {
t.Fatalf("expected no error, got: %v", err)
}
if tc.wantHint != gotHint {
t.Fatalf("expected hint %#v, got %#v", tc.wantHint, gotHint)
}
})
}
}
@@ -1503,34 +1598,34 @@ func Test_isSchedulableAfterPodSchedulingContextChange(t *testing.T) {
schedulings []*resourceapi.PodSchedulingContext
claims []*resourceapi.ResourceClaim
oldObj, newObj interface{}
expectedHint framework.QueueingHint
expectedErr bool
wantHint framework.QueueingHint
wantErr bool
}{
"skip-deleted": {
pod: podWithClaimTemplate,
oldObj: scheduling,
expectedHint: framework.QueueSkip,
pod: podWithClaimTemplate,
oldObj: scheduling,
wantHint: framework.QueueSkip,
},
"skip-missed-deleted": {
pod: podWithClaimTemplate,
oldObj: cache.DeletedFinalStateUnknown{
Obj: scheduling,
},
expectedHint: framework.QueueSkip,
wantHint: framework.QueueSkip,
},
"backoff-wrong-old-object": {
pod: podWithClaimTemplate,
oldObj: "not-a-scheduling-context",
newObj: scheduling,
expectedErr: true,
pod: podWithClaimTemplate,
oldObj: "not-a-scheduling-context",
newObj: scheduling,
wantErr: true,
},
"backoff-missed-wrong-old-object": {
pod: podWithClaimTemplate,
oldObj: cache.DeletedFinalStateUnknown{
Obj: "not-a-scheduling-context",
},
newObj: scheduling,
expectedErr: true,
newObj: scheduling,
wantErr: true,
},
"skip-unrelated-object": {
pod: podWithClaimTemplate,
@@ -1540,33 +1635,33 @@ func Test_isSchedulableAfterPodSchedulingContextChange(t *testing.T) {
scheduling.Name += "-foo"
return scheduling
}(),
expectedHint: framework.QueueSkip,
wantHint: framework.QueueSkip,
},
"backoff-wrong-new-object": {
pod: podWithClaimTemplate,
oldObj: scheduling,
newObj: "not-a-scheduling-context",
expectedErr: true,
pod: podWithClaimTemplate,
oldObj: scheduling,
newObj: "not-a-scheduling-context",
wantErr: true,
},
"skip-missing-claim": {
pod: podWithClaimTemplate,
oldObj: scheduling,
newObj: schedulingInfo,
expectedHint: framework.QueueSkip,
pod: podWithClaimTemplate,
oldObj: scheduling,
newObj: schedulingInfo,
wantHint: framework.QueueSkip,
},
"skip-missing-infos": {
pod: podWithClaimTemplateInStatus,
claims: []*resourceapi.ResourceClaim{pendingClaim},
oldObj: scheduling,
newObj: scheduling,
expectedHint: framework.QueueSkip,
pod: podWithClaimTemplateInStatus,
claims: []*resourceapi.ResourceClaim{pendingClaim},
oldObj: scheduling,
newObj: scheduling,
wantHint: framework.QueueSkip,
},
"queue-new-infos": {
pod: podWithClaimTemplateInStatus,
claims: []*resourceapi.ResourceClaim{pendingClaim},
oldObj: scheduling,
newObj: schedulingInfo,
expectedHint: framework.Queue,
pod: podWithClaimTemplateInStatus,
claims: []*resourceapi.ResourceClaim{pendingClaim},
oldObj: scheduling,
newObj: schedulingInfo,
wantHint: framework.Queue,
},
"queue-bad-selected-node": {
pod: podWithClaimTemplateInStatus,
@@ -1582,7 +1677,7 @@ func Test_isSchedulableAfterPodSchedulingContextChange(t *testing.T) {
scheduling.Status.ResourceClaims[0].UnsuitableNodes = append(scheduling.Status.ResourceClaims[0].UnsuitableNodes, scheduling.Spec.SelectedNode)
return scheduling
}(),
expectedHint: framework.Queue,
wantHint: framework.Queue,
},
"skip-spec-changes": {
pod: podWithClaimTemplateInStatus,
@@ -1593,7 +1688,7 @@ func Test_isSchedulableAfterPodSchedulingContextChange(t *testing.T) {
scheduling.Spec.SelectedNode = workerNode.Name
return scheduling
}(),
expectedHint: framework.QueueSkip,
wantHint: framework.QueueSkip,
},
"backoff-other-changes": {
pod: podWithClaimTemplateInStatus,
@@ -1604,7 +1699,7 @@ func Test_isSchedulableAfterPodSchedulingContextChange(t *testing.T) {
scheduling.Finalizers = append(scheduling.Finalizers, "foo")
return scheduling
}(),
expectedHint: framework.Queue,
wantHint: framework.Queue,
},
}
@@ -1618,14 +1713,20 @@ func Test_isSchedulableAfterPodSchedulingContextChange(t *testing.T) {
EnableDRAControlPlaneController: true,
}
testCtx := setup(t, nil, tc.claims, nil, tc.schedulings, nil, features)
actualHint, err := testCtx.p.isSchedulableAfterPodSchedulingContextChange(logger, tc.pod, tc.oldObj, tc.newObj)
if tc.expectedErr {
require.Error(t, err)
gotHint, err := testCtx.p.isSchedulableAfterPodSchedulingContextChange(logger, tc.pod, tc.oldObj, tc.newObj)
if tc.wantErr {
if err == nil {
t.Fatal("expected an error, got none")
}
return
}
require.NoError(t, err)
require.Equal(t, tc.expectedHint, actualHint)
if err != nil {
t.Fatalf("expected no error, got: %v", err)
}
if tc.wantHint != gotHint {
t.Fatalf("expected hint %#v, got %#v", tc.wantHint, gotHint)
}
})
}
}