	Merge pull request #120871 from pohly/dra-unsuitable-nodes-selected-node
k8s.io/dynamic-resource-allocation: fix potential scheduling deadlock
@@ -122,6 +122,12 @@ type Driver interface {
 	// can be allocated for it (for example, two GPUs requested but
 	// the node only has one).
 	//
+	// The potentialNodes slice contains all potential nodes selected
+	// by the scheduler plus the selected node. The response must
+	// not contain any other nodes. Implementations do not have to
+	// care about size limits in the PodSchedulingContext status, the
+	// caller will handle that.
+	//
 	// The result of the check is in ClaimAllocation.UnsuitableNodes.
 	// An error indicates that the entire check must be repeated.
 	UnsuitableNodes(ctx context.Context, pod *v1.Pod, claims []*ClaimAllocation, potentialNodes []string) error
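For a driver author, the contract above means: report unsuitability only for nodes that appear in potentialNodes, and leave the PodSchedulingContext status size limit to the controller. Below is a minimal, hypothetical sketch of the driver side, assuming the controller package at k8s.io/dynamic-resource-allocation/controller; only this one method of the Driver interface is shown, and the exampleDriver type, its gpusPerNode map, and the two-GPU requirement are invented for illustration.

package example

import (
	"context"

	v1 "k8s.io/api/core/v1"
	"k8s.io/dynamic-resource-allocation/controller"
)

// exampleDriver is a hypothetical driver that knows how many GPUs each node has.
type exampleDriver struct {
	gpusPerNode map[string]int
}

// UnsuitableNodes marks, for every pending claim, which of the given potential
// nodes cannot provide the (hypothetical) two GPUs the claim asks for. It only
// inspects nodes from potentialNodes and does not enforce the
// PodSchedulingContext status size limit; per the contract above, the caller
// handles truncation.
func (d *exampleDriver) UnsuitableNodes(ctx context.Context, pod *v1.Pod, claims []*controller.ClaimAllocation, potentialNodes []string) error {
	for _, claim := range claims {
		claim.UnsuitableNodes = nil
		for _, node := range potentialNodes {
			if d.gpusPerNode[node] < 2 {
				claim.UnsuitableNodes = append(claim.UnsuitableNodes, node)
			}
		}
	}
	return nil
}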
@@ -757,12 +763,20 @@ func (ctrl *controller) syncPodSchedulingContexts(ctx context.Context, schedulin
 	// and shouldn't, because those allocations might have to be undone to
 	// pick a better node. If we don't need to allocate now, then we'll
 	// simply report back the gather information.
+	//
+	// We shouldn't assume that the scheduler has included the selected node
+	// in the list of potential nodes. Usually it does, but let's make sure
+	// that we check it.
+	selectedNode := schedulingCtx.Spec.SelectedNode
+	potentialNodes := schedulingCtx.Spec.PotentialNodes
+	if selectedNode != "" && !hasString(potentialNodes, selectedNode) {
+		potentialNodes = append(potentialNodes, selectedNode)
+	}
 	if len(schedulingCtx.Spec.PotentialNodes) > 0 {
-		if err := ctrl.driver.UnsuitableNodes(ctx, pod, claims, schedulingCtx.Spec.PotentialNodes); err != nil {
+		if err := ctrl.driver.UnsuitableNodes(ctx, pod, claims, potentialNodes); err != nil {
 			return fmt.Errorf("checking potential nodes: %v", err)
 		}
 	}
-	selectedNode := schedulingCtx.Spec.SelectedNode
 	logger.V(5).Info("pending pod claims", "claims", claims, "selectedNode", selectedNode)
 	if selectedNode != "" {
 		unsuitable := false
@@ -816,12 +830,12 @@ func (ctrl *controller) syncPodSchedulingContexts(ctx context.Context, schedulin
 			schedulingCtx.Status.ResourceClaims = append(schedulingCtx.Status.ResourceClaims,
 				resourcev1alpha2.ResourceClaimSchedulingStatus{
 					Name:            delayed.PodClaimName,
-					UnsuitableNodes: delayed.UnsuitableNodes,
+					UnsuitableNodes: truncateNodes(delayed.UnsuitableNodes, selectedNode),
 				})
 			modified = true
 		} else if stringsDiffer(schedulingCtx.Status.ResourceClaims[i].UnsuitableNodes, delayed.UnsuitableNodes) {
 			// Update existing entry.
-			schedulingCtx.Status.ResourceClaims[i].UnsuitableNodes = delayed.UnsuitableNodes
+			schedulingCtx.Status.ResourceClaims[i].UnsuitableNodes = truncateNodes(delayed.UnsuitableNodes, selectedNode)
 			modified = true
 		}
 	}
@@ -837,6 +851,23 @@ func (ctrl *controller) syncPodSchedulingContexts(ctx context.Context, schedulin
 	return errPeriodic
 }
 
+func truncateNodes(nodes []string, selectedNode string) []string {
+	// We might have checked "potential nodes + selected node" above, so
+	// this list might be too long by one element. When truncating it, make
+	// sure that the selected node is listed.
+	lenUnsuitable := len(nodes)
+	if lenUnsuitable > resourcev1alpha2.PodSchedulingNodeListMaxSize {
+		if nodes[0] == selectedNode {
+			// Truncate at the end and keep selected node in the first element.
+			nodes = nodes[0 : lenUnsuitable-1]
+		} else {
+			// Truncate at the front, it's not the selected node.
+			nodes = nodes[1:lenUnsuitable]
+		}
+	}
+	return nodes
+}
+
 type claimAllocations []*ClaimAllocation
 
 // MarshalLog replaces the pointers with the actual structs because
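To see what the new truncateNodes helper does with an over-long list, here is a small self-contained sketch that restates it with the maximum size passed in as a parameter, so it can run outside the controller package and without the real resourcev1alpha2.PodSchedulingNodeListMaxSize constant; the node names and the two-element limit are invented for illustration.

package example

import "fmt"

// truncate restates the truncateNodes helper above, with the maximum list
// size passed in explicitly instead of reading it from the API package.
func truncate(nodes []string, selectedNode string, maxSize int) []string {
	if len(nodes) > maxSize {
		if nodes[0] == selectedNode {
			// Selected node is first: drop the last entry so it stays listed.
			return nodes[:len(nodes)-1]
		}
		// First entry is not the selected node: drop it instead.
		return nodes[1:]
	}
	return nodes
}

func demo() {
	// Three unsuitable nodes, but pretend the status may only hold two.
	fmt.Println(truncate([]string{"worker-1", "node-0", "node-1"}, "worker-1", 2)) // [worker-1 node-0]
	fmt.Println(truncate([]string{"node-0", "node-1", "worker-1"}, "worker-1", 2)) // [node-1 worker-1]
}

The remaining hunks extend the controller's unit tests: one case where the selected node already appears among the potential nodes, and two cases with a full-length node list that exercise truncation at the front and at the end.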
@@ -19,6 +19,7 @@ package controller
 import (
 	"context"
 	"errors"
+	"fmt"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
@@ -64,6 +65,10 @@ func TestController(t *testing.T) {
 	otherNodeName := "worker-2"
 	unsuitableNodes := []string{otherNodeName}
 	potentialNodes := []string{nodeName, otherNodeName}
+	maxNodes := make([]string, resourcev1alpha2.PodSchedulingNodeListMaxSize)
+	for i := range maxNodes {
+		maxNodes[i] = fmt.Sprintf("node-%d", i)
+	}
 	withDeletionTimestamp := func(claim *resourcev1alpha2.ResourceClaim) *resourcev1alpha2.ResourceClaim {
 		var deleted metav1.Time
 		claim = claim.DeepCopy()
@@ -101,18 +106,24 @@ func TestController(t *testing.T) {
 		podSchedulingCtx.Spec.SelectedNode = nodeName
 		return podSchedulingCtx
 	}
-	withUnsuitableNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
+	withSpecificUnsuitableNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext, unsuitableNodes []string) *resourcev1alpha2.PodSchedulingContext {
 		podSchedulingCtx = podSchedulingCtx.DeepCopy()
 		podSchedulingCtx.Status.ResourceClaims = append(podSchedulingCtx.Status.ResourceClaims,
 			resourcev1alpha2.ResourceClaimSchedulingStatus{Name: podClaimName, UnsuitableNodes: unsuitableNodes},
 		)
 		return podSchedulingCtx
 	}
-	withPotentialNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
+	withUnsuitableNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
+		return withSpecificUnsuitableNodes(podSchedulingCtx, unsuitableNodes)
+	}
+	withSpecificPotentialNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext, potentialNodes []string) *resourcev1alpha2.PodSchedulingContext {
 		podSchedulingCtx = podSchedulingCtx.DeepCopy()
 		podSchedulingCtx.Spec.PotentialNodes = potentialNodes
 		return podSchedulingCtx
 	}
+	withPotentialNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
+		return withSpecificPotentialNodes(podSchedulingCtx, potentialNodes)
+	}
+
 	var m mockDriver
 
@@ -376,6 +387,48 @@ func TestController(t *testing.T) {
 			expectedSchedulingCtx: withUnsuitableNodes(withSelectedNode(withPotentialNodes(podSchedulingCtx))),
 			expectedError:         errPeriodic.Error(),
 		},
+		// pod with delayed allocation, potential nodes, selected node, all unsuitable -> update unsuitable nodes
+		"pod-selected-is-potential-node": {
+			key:           podKey,
+			classes:       classes,
+			claim:         delayedClaim,
+			expectedClaim: delayedClaim,
+			pod:           podWithClaim,
+			schedulingCtx: withPotentialNodes(withSelectedNode(withPotentialNodes(podSchedulingCtx))),
+			driver: m.expectClassParameters(map[string]interface{}{className: 1}).
+				expectClaimParameters(map[string]interface{}{claimName: 2}).
+				expectUnsuitableNodes(map[string][]string{podClaimName: potentialNodes}, nil),
+			expectedSchedulingCtx: withSpecificUnsuitableNodes(withSelectedNode(withPotentialNodes(podSchedulingCtx)), potentialNodes),
+			expectedError:         errPeriodic.Error(),
+		},
+		// pod with delayed allocation, max potential nodes, other selected node, all unsuitable -> update unsuitable nodes with truncation at start
+		"pod-selected-is-potential-node-truncate-first": {
+			key:           podKey,
+			classes:       classes,
+			claim:         delayedClaim,
+			expectedClaim: delayedClaim,
+			pod:           podWithClaim,
+			schedulingCtx: withSpecificPotentialNodes(withSelectedNode(withSpecificPotentialNodes(podSchedulingCtx, maxNodes)), maxNodes),
+			driver: m.expectClassParameters(map[string]interface{}{className: 1}).
+				expectClaimParameters(map[string]interface{}{claimName: 2}).
+				expectUnsuitableNodes(map[string][]string{podClaimName: append(maxNodes, nodeName)}, nil),
+			expectedSchedulingCtx: withSpecificUnsuitableNodes(withSelectedNode(withSpecificPotentialNodes(podSchedulingCtx, maxNodes)), append(maxNodes[1:], nodeName)),
+			expectedError:         errPeriodic.Error(),
+		},
+		// pod with delayed allocation, max potential nodes, other selected node, all unsuitable (but in reverse order) -> update unsuitable nodes with truncation at end
+		"pod-selected-is-potential-node-truncate-last": {
+			key:           podKey,
+			classes:       classes,
+			claim:         delayedClaim,
+			expectedClaim: delayedClaim,
+			pod:           podWithClaim,
+			schedulingCtx: withSpecificPotentialNodes(withSelectedNode(withSpecificPotentialNodes(podSchedulingCtx, maxNodes)), maxNodes),
+			driver: m.expectClassParameters(map[string]interface{}{className: 1}).
+				expectClaimParameters(map[string]interface{}{claimName: 2}).
+				expectUnsuitableNodes(map[string][]string{podClaimName: append([]string{nodeName}, maxNodes...)}, nil),
+			expectedSchedulingCtx: withSpecificUnsuitableNodes(withSelectedNode(withSpecificPotentialNodes(podSchedulingCtx, maxNodes)), append([]string{nodeName}, maxNodes[:len(maxNodes)-1]...)),
+			expectedError:         errPeriodic.Error(),
+		},
 	} {
 		t.Run(name, func(t *testing.T) {
 			_, ctx := ktesting.NewTestContext(t)