diff --git a/pkg/scheduler/backend/api_cache/api_cache.go b/pkg/scheduler/backend/api_cache/api_cache.go new file mode 100644 index 00000000000..7a021042da0 --- /dev/null +++ b/pkg/scheduler/backend/api_cache/api_cache.go @@ -0,0 +1,72 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package apicache + +import ( + "context" + + v1 "k8s.io/api/core/v1" + fwk "k8s.io/kube-scheduler/framework" + internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache" + internalqueue "k8s.io/kubernetes/pkg/scheduler/backend/queue" + "k8s.io/kubernetes/pkg/scheduler/framework" +) + +// APICache is responsible for sending API calls' requests through scheduling queue or cache. +type APICache struct { + schedulingQueue internalqueue.SchedulingQueue + cache internalcache.Cache +} + +func New(schedulingQueue internalqueue.SchedulingQueue, cache internalcache.Cache) *APICache { + return &APICache{ + schedulingQueue: schedulingQueue, + cache: cache, + } +} + +// PatchPodStatus sends a patch request for a Pod's status through a scheduling queue. +// The patch could be first applied to the cached Pod object and then the API call is executed asynchronously. +// It returns a channel that can be used to wait for the call's completion. +func (c *APICache) PatchPodStatus(pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) (<-chan error, error) { + return c.schedulingQueue.PatchPodStatus(pod, condition, nominatingInfo) +} + +// BindPod sends a binding request through a cache. The binding could be first applied to the cached Pod object +// and then the API call is executed asynchronously. +// It returns a channel that can be used to wait for the call's completion. +func (c *APICache) BindPod(binding *v1.Binding) (<-chan error, error) { + return c.cache.BindPod(binding) +} + +// WaitOnFinish blocks until the result of an API call is sent to the given onFinish channel +// (returned by methods BindPod or PreemptPod). +// +// It returns the error received from the channel. +// It also returns nil if the call was skipped or overwritten, +// as these are considered successful lifecycle outcomes. +func (c *APICache) WaitOnFinish(ctx context.Context, onFinish <-chan error) error { + select { + case err := <-onFinish: + if fwk.IsUnexpectedError(err) { + return err + } + case <-ctx.Done(): + return ctx.Err() + } + return nil +} diff --git a/pkg/scheduler/backend/api_dispatcher/api_dispatcher.go b/pkg/scheduler/backend/api_dispatcher/api_dispatcher.go index 0db8b2110f0..8c3d6c6e545 100644 --- a/pkg/scheduler/backend/api_dispatcher/api_dispatcher.go +++ b/pkg/scheduler/backend/api_dispatcher/api_dispatcher.go @@ -65,11 +65,11 @@ func (ad *APIDispatcher) SyncObject(obj metav1.Object) (metav1.Object, error) { // Run starts the main processing loop of the APIDispatcher, which pops calls // from the queue and dispatches them to worker goroutines for execution. 
func (ad *APIDispatcher) Run(logger klog.Logger) { - go func() { - // Create a new context to allow to cancel the APICalls' execution when the APIDispatcher is closed. - ctx, cancel := context.WithCancel(context.Background()) - ad.cancel = cancel + // Create a new context to allow to cancel the APICalls' execution when the APIDispatcher is closed. + ctx, cancel := context.WithCancel(context.Background()) + ad.cancel = cancel + go func() { for { select { case <-ctx.Done(): diff --git a/pkg/scheduler/backend/cache/cache.go b/pkg/scheduler/backend/cache/cache.go index 26646200113..4908be60df7 100644 --- a/pkg/scheduler/backend/cache/cache.go +++ b/pkg/scheduler/backend/cache/cache.go @@ -30,6 +30,7 @@ import ( "k8s.io/klog/v2" fwk "k8s.io/kube-scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/framework/api_calls" "k8s.io/kubernetes/pkg/scheduler/metrics" ) @@ -41,9 +42,9 @@ var ( // It automatically starts a go routine that manages expiration of assumed pods. // "ttl" is how long the assumed pod will get expired. // "ctx" is the context that would close the background goroutine. -func New(ctx context.Context, ttl time.Duration) Cache { +func New(ctx context.Context, ttl time.Duration, apiDispatcher fwk.APIDispatcher) Cache { logger := klog.FromContext(ctx) - cache := newCache(ctx, ttl, cleanAssumedPeriod) + cache := newCache(ctx, ttl, cleanAssumedPeriod, apiDispatcher) cache.run(logger) return cache } @@ -76,6 +77,10 @@ type cacheImpl struct { nodeTree *nodeTree // A map from image name to its ImageStateSummary. imageStates map[string]*fwk.ImageStateSummary + + // apiDispatcher is used for the methods that are expected to send API calls. + // It's non-nil only if the SchedulerAsyncAPICalls feature gate is enabled. + apiDispatcher fwk.APIDispatcher } type podState struct { @@ -87,18 +92,19 @@ type podState struct { bindingFinished bool } -func newCache(ctx context.Context, ttl, period time.Duration) *cacheImpl { +func newCache(ctx context.Context, ttl, period time.Duration, apiDispatcher fwk.APIDispatcher) *cacheImpl { logger := klog.FromContext(ctx) return &cacheImpl{ ttl: ttl, period: period, stop: ctx.Done(), - nodes: make(map[string]*nodeInfoListItem), - nodeTree: newNodeTree(logger, nil), - assumedPods: sets.New[string](), - podStates: make(map[string]*podState), - imageStates: make(map[string]*fwk.ImageStateSummary), + nodes: make(map[string]*nodeInfoListItem), + nodeTree: newNodeTree(logger, nil), + assumedPods: sets.New[string](), + podStates: make(map[string]*podState), + imageStates: make(map[string]*fwk.ImageStateSummary), + apiDispatcher: apiDispatcher, } } @@ -759,3 +765,17 @@ func (cache *cacheImpl) updateMetrics() { metrics.CacheSize.WithLabelValues("pods").Set(float64(len(cache.podStates))) metrics.CacheSize.WithLabelValues("nodes").Set(float64(len(cache.nodes))) } + +// BindPod handles the pod binding by adding a bind API call to the dispatcher. +// This method should be used only if the SchedulerAsyncAPICalls feature gate is enabled. +func (cache *cacheImpl) BindPod(binding *v1.Binding) (<-chan error, error) { + // Don't store anything in the cache, as the pod is already assumed, and in case of a binding failure, it will be forgotten. 
+ onFinish := make(chan error, 1) + err := cache.apiDispatcher.Add(apicalls.Implementations.PodBinding(binding), fwk.APICallOptions{ + OnFinish: onFinish, + }) + if fwk.IsUnexpectedError(err) { + return onFinish, err + } + return onFinish, nil +} diff --git a/pkg/scheduler/backend/cache/cache_test.go b/pkg/scheduler/backend/cache/cache_test.go index 921fc1b2c90..c86df82864c 100644 --- a/pkg/scheduler/backend/cache/cache_test.go +++ b/pkg/scheduler/backend/cache/cache_test.go @@ -235,7 +235,7 @@ func TestAssumePodScheduled(t *testing.T) { logger, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() - cache := newCache(ctx, time.Second, time.Second) + cache := newCache(ctx, time.Second, time.Second, nil) for _, pod := range tc.pods { if err := cache.AssumePod(logger, pod); err != nil { t.Fatalf("AssumePod failed: %v", err) @@ -354,7 +354,7 @@ func TestExpirePod(t *testing.T) { logger, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() - cache := newCache(ctx, tc.ttl, time.Second) + cache := newCache(ctx, tc.ttl, time.Second, nil) for _, pod := range tc.pods { if err := cache.AssumePod(logger, pod.pod); err != nil { @@ -415,7 +415,7 @@ func TestAddPodWillConfirm(t *testing.T) { logger, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() - cache := newCache(ctx, ttl, time.Second) + cache := newCache(ctx, ttl, time.Second, nil) for _, podToAssume := range test.podsToAssume { if err := assumeAndFinishBinding(logger, cache, podToAssume, now); err != nil { t.Fatalf("assumePod failed: %v", err) @@ -458,7 +458,7 @@ func TestDump(t *testing.T) { logger, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() - cache := newCache(ctx, ttl, time.Second) + cache := newCache(ctx, ttl, time.Second, nil) for _, podToAssume := range test.podsToAssume { if err := assumeAndFinishBinding(logger, cache, podToAssume, now); err != nil { t.Errorf("assumePod failed: %v", err) @@ -526,7 +526,7 @@ func TestAddPodAlwaysUpdatesPodInfoInNodeInfo(t *testing.T) { }, } - cache := newCache(ctx, ttl, time.Second) + cache := newCache(ctx, ttl, time.Second, nil) for _, podToAssume := range test.podsToAssume { if err := assumeAndFinishBinding(logger, cache, podToAssume, now); err != nil { t.Fatalf("assumePod failed: %v", err) @@ -585,7 +585,7 @@ func TestAddPodWillReplaceAssumed(t *testing.T) { logger, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() - cache := newCache(ctx, ttl, time.Second) + cache := newCache(ctx, ttl, time.Second, nil) for _, podToAssume := range test.podsToAssume { if err := assumeAndFinishBinding(logger, cache, podToAssume, now); err != nil { t.Fatalf("assumePod failed: %v", err) @@ -638,7 +638,7 @@ func TestAddPodAfterExpiration(t *testing.T) { ctx, cancel := context.WithCancel(ctx) defer cancel() now := time.Now() - cache := newCache(ctx, ttl, time.Second) + cache := newCache(ctx, ttl, time.Second, nil) if err := assumeAndFinishBinding(logger, cache, test.pod, now); err != nil { t.Fatalf("assumePod failed: %v", err) } @@ -703,7 +703,7 @@ func TestUpdatePod(t *testing.T) { logger, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() - cache := newCache(ctx, ttl, time.Second) + cache := newCache(ctx, ttl, time.Second, nil) for _, podToAdd := range test.podsToAdd { if err := cache.AddPod(logger, podToAdd); err != nil { t.Fatalf("AddPod failed: %v", err) @@ -765,7 +765,7 @@ func 
TestUpdatePodAndGet(t *testing.T) { logger, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() - cache := newCache(ctx, ttl, time.Second) + cache := newCache(ctx, ttl, time.Second, nil) // trying to get an unknown pod should return an error // podToUpdate has not been added yet if _, err := cache.GetPod(tc.podToUpdate); err == nil { @@ -848,7 +848,7 @@ func TestExpireAddUpdatePod(t *testing.T) { ctx, cancel := context.WithCancel(ctx) defer cancel() now := time.Now() - cache := newCache(ctx, ttl, time.Second) + cache := newCache(ctx, ttl, time.Second, nil) for _, podToAssume := range test.podsToAssume { if err := assumeAndFinishBinding(logger, cache, podToAssume, now); err != nil { t.Fatalf("assumePod failed: %v", err) @@ -909,7 +909,7 @@ func TestEphemeralStorageResource(t *testing.T) { logger, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() - cache := newCache(ctx, time.Second, time.Second) + cache := newCache(ctx, time.Second, time.Second, nil) if err := cache.AddPod(logger, test.pod); err != nil { t.Fatalf("AddPod failed: %v", err) } @@ -963,7 +963,7 @@ func TestRemovePod(t *testing.T) { ctx, cancel := context.WithCancel(ctx) defer cancel() nodeName := pod.Spec.NodeName - cache := newCache(ctx, time.Second, time.Second) + cache := newCache(ctx, time.Second, time.Second, nil) // Add/Assume pod succeeds even before adding the nodes. if tt.assume { if err := cache.AddPod(logger, pod); err != nil { @@ -1013,7 +1013,7 @@ func TestForgetPod(t *testing.T) { ctx, cancel := context.WithCancel(ctx) defer cancel() - cache := newCache(ctx, ttl, time.Second) + cache := newCache(ctx, ttl, time.Second, nil) for _, pod := range pods { if err := assumeAndFinishBinding(logger, cache, pod, now); err != nil { t.Fatalf("assumePod failed: %v", err) @@ -1226,7 +1226,7 @@ func TestNodeOperators(t *testing.T) { imageStates := buildImageStates(tc.nodes) expected := buildNodeInfo(node, tc.pods, imageStates) - cache := newCache(ctx, time.Second, time.Second) + cache := newCache(ctx, time.Second, time.Second, nil) for _, nodeItem := range tc.nodes { cache.AddNode(logger, nodeItem) } @@ -1720,7 +1720,7 @@ func TestSchedulerCache_UpdateSnapshot(t *testing.T) { _, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() - cache = newCache(ctx, time.Second, time.Second) + cache = newCache(ctx, time.Second, time.Second, nil) snapshot = NewEmptySnapshot() for _, op := range test.operations { @@ -1954,7 +1954,7 @@ func TestSchedulerCache_updateNodeInfoSnapshotList(t *testing.T) { _, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() - cache = newCache(ctx, time.Second, time.Second) + cache = newCache(ctx, time.Second, time.Second, nil) snapshot = NewEmptySnapshot() test.operations(t) @@ -2060,7 +2060,7 @@ func setupCacheOf1kNodes30kPods(b *testing.B) Cache { logger, ctx := ktesting.NewTestContext(b) ctx, cancel := context.WithCancel(ctx) defer cancel() - cache := newCache(ctx, time.Second, time.Second) + cache := newCache(ctx, time.Second, time.Second, nil) for i := 0; i < 1000; i++ { nodeName := fmt.Sprintf("node-%d", i) cache.AddNode(logger, st.MakeNode().Name(nodeName).Obj()) @@ -2081,7 +2081,7 @@ func setupCacheWithAssumedPods(b *testing.B, podNum int, assumedTime time.Time) ctx, cancel := context.WithCancel(ctx) addedNodes := make(map[string]struct{}) defer cancel() - cache := newCache(ctx, time.Second, time.Second) + cache := newCache(ctx, time.Second, time.Second, nil) for 
i := 0; i < podNum; i++ { nodeName := fmt.Sprintf("node-%d", i/10) if _, ok := addedNodes[nodeName]; !ok { diff --git a/pkg/scheduler/backend/cache/fake/fake_cache.go b/pkg/scheduler/backend/cache/fake/fake_cache.go index 50b44b8da10..b6507df4870 100644 --- a/pkg/scheduler/backend/cache/fake/fake_cache.go +++ b/pkg/scheduler/backend/cache/fake/fake_cache.go @@ -20,74 +20,47 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/klog/v2" internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache" - "k8s.io/kubernetes/pkg/scheduler/framework" ) // Cache is used for testing type Cache struct { + internalcache.Cache AssumeFunc func(*v1.Pod) ForgetFunc func(*v1.Pod) IsAssumedPodFunc func(*v1.Pod) bool GetPodFunc func(*v1.Pod) *v1.Pod } -// AssumePod is a fake method for testing. +// AssumePod allows to mock this method for testing. func (c *Cache) AssumePod(logger klog.Logger, pod *v1.Pod) error { - c.AssumeFunc(pod) - return nil + if c.AssumeFunc != nil { + c.AssumeFunc(pod) + return nil + } + return c.Cache.AssumePod(logger, pod) } -// FinishBinding is a fake method for testing. -func (c *Cache) FinishBinding(logger klog.Logger, pod *v1.Pod) error { return nil } - -// ForgetPod is a fake method for testing. +// ForgetPod allows to mock this method for testing. func (c *Cache) ForgetPod(logger klog.Logger, pod *v1.Pod) error { - c.ForgetFunc(pod) - return nil + if c.ForgetFunc != nil { + c.ForgetFunc(pod) + return nil + } + return c.Cache.ForgetPod(logger, pod) } -// AddPod is a fake method for testing. -func (c *Cache) AddPod(logger klog.Logger, pod *v1.Pod) error { return nil } - -// UpdatePod is a fake method for testing. -func (c *Cache) UpdatePod(logger klog.Logger, oldPod, newPod *v1.Pod) error { return nil } - -// RemovePod is a fake method for testing. -func (c *Cache) RemovePod(logger klog.Logger, pod *v1.Pod) error { return nil } - -// IsAssumedPod is a fake method for testing. +// IsAssumedPod allows to mock this method for testing. func (c *Cache) IsAssumedPod(pod *v1.Pod) (bool, error) { - return c.IsAssumedPodFunc(pod), nil + if c.IsAssumedPodFunc != nil { + return c.IsAssumedPodFunc(pod), nil + } + return c.Cache.IsAssumedPod(pod) } -// GetPod is a fake method for testing. +// GetPod allows to mock this method for testing. func (c *Cache) GetPod(pod *v1.Pod) (*v1.Pod, error) { - return c.GetPodFunc(pod), nil -} - -// AddNode is a fake method for testing. -func (c *Cache) AddNode(logger klog.Logger, node *v1.Node) *framework.NodeInfo { return nil } - -// UpdateNode is a fake method for testing. -func (c *Cache) UpdateNode(logger klog.Logger, oldNode, newNode *v1.Node) *framework.NodeInfo { - return nil -} - -// RemoveNode is a fake method for testing. -func (c *Cache) RemoveNode(logger klog.Logger, node *v1.Node) error { return nil } - -// UpdateSnapshot is a fake method for testing. -func (c *Cache) UpdateSnapshot(logger klog.Logger, snapshot *internalcache.Snapshot) error { - return nil -} - -// NodeCount is a fake method for testing. -func (c *Cache) NodeCount() int { return 0 } - -// PodCount is a fake method for testing. -func (c *Cache) PodCount() (int, error) { return 0, nil } - -// Dump is a fake method for testing. 
-func (c *Cache) Dump() *internalcache.Dump { - return &internalcache.Dump{} + if c.GetPodFunc != nil { + return c.GetPodFunc(pod), nil + } + return c.Cache.GetPod(pod) } diff --git a/pkg/scheduler/backend/cache/interface.go b/pkg/scheduler/backend/cache/interface.go index 24b7fd49066..74d0bd22797 100644 --- a/pkg/scheduler/backend/cache/interface.go +++ b/pkg/scheduler/backend/cache/interface.go @@ -114,6 +114,10 @@ type Cache interface { // Dump produces a dump of the current cache. Dump() *Dump + + // BindPod handles the pod binding by adding a bind API call to the dispatcher. + // This method should be used only if the SchedulerAsyncAPICalls feature gate is enabled. + BindPod(binding *v1.Binding) (<-chan error, error) } // Dump is a dump of the cache state. diff --git a/pkg/scheduler/backend/queue/scheduling_queue.go b/pkg/scheduler/backend/queue/scheduling_queue.go index 3f1bc5c4aef..1388aa44022 100644 --- a/pkg/scheduler/backend/queue/scheduling_queue.go +++ b/pkg/scheduler/backend/queue/scheduling_queue.go @@ -49,6 +49,7 @@ import ( "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/scheduler/backend/heap" "k8s.io/kubernetes/pkg/scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/framework/api_calls" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread" "k8s.io/kubernetes/pkg/scheduler/metrics" @@ -128,6 +129,10 @@ type SchedulingQueue interface { // Run starts the goroutines managing the queue. Run(logger klog.Logger) + // PatchPodStatus handles the pod status update by sending an update API call through API dispatcher. + // This method should be used only if the SchedulerAsyncAPICalls feature gate is enabled. + PatchPodStatus(pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) (<-chan error, error) + // The following functions are supposed to be used only for testing or debugging. GetPod(name, namespace string) (*framework.QueuedPodInfo, bool) PendingPods() ([]*v1.Pod, string) @@ -193,6 +198,10 @@ type PriorityQueue struct { // pluginMetricsSamplePercent is the percentage of plugin metrics to be sampled. pluginMetricsSamplePercent int + // apiDispatcher is used for the methods that are expected to send API calls. + // It's non-nil only if the SchedulerAsyncAPICalls feature gate is enabled. + apiDispatcher fwk.APIDispatcher + // isSchedulingQueueHintEnabled indicates whether the feature gate for the scheduling queue is enabled. isSchedulingQueueHintEnabled bool // isPopFromBackoffQEnabled indicates whether the feature gate SchedulerPopFromBackoffQ is enabled. @@ -224,6 +233,7 @@ type priorityQueueOptions struct { pluginMetricsSamplePercent int preEnqueuePluginMap map[string]map[string]framework.PreEnqueuePlugin queueingHintMap QueueingHintMapPerProfile + apiDispatcher fwk.APIDispatcher } // Option configures a PriorityQueue @@ -298,6 +308,13 @@ func WithPluginMetricsSamplePercent(percent int) Option { } } +// WithAPIDispatcher sets the API dispatcher. 
+func WithAPIDispatcher(apiDispatcher fwk.APIDispatcher) Option { + return func(o *priorityQueueOptions) { + o.apiDispatcher = apiDispatcher + } +} + var defaultPriorityQueueOptions = priorityQueueOptions{ clock: clock.RealClock{}, podInitialBackoffDuration: DefaultPodInitialBackoffDuration, @@ -349,6 +366,7 @@ func NewPriorityQueue( metricsRecorder: options.metricsRecorder, pluginMetricsSamplePercent: options.pluginMetricsSamplePercent, moveRequestCycle: -1, + apiDispatcher: options.apiDispatcher, isSchedulingQueueHintEnabled: isSchedulingQueueHintEnabled, isPopFromBackoffQEnabled: isPopFromBackoffQEnabled, } @@ -1307,6 +1325,20 @@ func (p *PriorityQueue) PendingPods() ([]*v1.Pod, string) { return result, fmt.Sprintf(pendingPodsSummary, activeQLen, backoffQLen, len(p.unschedulablePods.podInfoMap)) } +// PatchPodStatus handles the pod status update by sending an update API call through API dispatcher. +// This method should be used only if the SchedulerAsyncAPICalls feature gate is enabled. +func (p *PriorityQueue) PatchPodStatus(pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) (<-chan error, error) { + // Don't store anything in the cache. This might be extended in the next releases. + onFinish := make(chan error, 1) + err := p.apiDispatcher.Add(apicalls.Implementations.PodStatusPatch(pod, condition, nominatingInfo), fwk.APICallOptions{ + OnFinish: onFinish, + }) + if fwk.IsUnexpectedError(err) { + return onFinish, err + } + return onFinish, nil +} + // Note: this function assumes the caller locks both p.lock.RLock and p.activeQ.getLock().RLock. func (p *PriorityQueue) nominatedPodToInfo(np podRef, unlockedActiveQ unlockedActiveQueueReader) *framework.PodInfo { pod := np.toPod() diff --git a/pkg/scheduler/eventhandlers.go b/pkg/scheduler/eventhandlers.go index 4b14c240e1c..2fe51b553ef 100644 --- a/pkg/scheduler/eventhandlers.go +++ b/pkg/scheduler/eventhandlers.go @@ -136,6 +136,20 @@ func (sched *Scheduler) addPodToSchedulingQueue(obj interface{}) { sched.SchedulingQueue.Add(logger, pod) } +func (sched *Scheduler) syncPodWithDispatcher(pod *v1.Pod) *v1.Pod { + enrichedObj, err := sched.APIDispatcher.SyncObject(pod) + if err != nil { + utilruntime.HandleError(fmt.Errorf("failed to sync pod %s/%s with API dispatcher: %w", pod.Namespace, pod.Name, err)) + return pod + } + enrichedPod, ok := enrichedObj.(*v1.Pod) + if !ok { + utilruntime.HandleError(fmt.Errorf("cannot convert enrichedObj of type %T to *v1.Pod", enrichedObj)) + return pod + } + return enrichedPod +} + func (sched *Scheduler) updatePodInSchedulingQueue(oldObj, newObj interface{}) { start := time.Now() logger := sched.logger @@ -153,6 +167,12 @@ func (sched *Scheduler) updatePodInSchedulingQueue(oldObj, newObj interface{}) { } } + if sched.APIDispatcher != nil { + // If the API dispatcher is available, sync the new pod with the details. + // However, at the moment the updated newPod is discarded and this logic will be handled in the future releases. + _ = sched.syncPodWithDispatcher(newPod) + } + isAssumed, err := sched.Cache.IsAssumedPod(newPod) if err != nil { utilruntime.HandleErrorWithLogger(logger, err, "Failed to check whether pod is assumed", "pod", klog.KObj(newPod)) @@ -274,6 +294,12 @@ func (sched *Scheduler) updatePodInCache(oldObj, newObj interface{}) { return } + if sched.APIDispatcher != nil { + // If the API dispatcher is available, sync the new pod with the details. + // However, at the moment the updated newPod is discarded and this logic will be handled in the future releases. 
+ _ = sched.syncPodWithDispatcher(newPod) + } + logger.V(4).Info("Update event for scheduled pod", "pod", klog.KObj(oldPod)) if err := sched.Cache.UpdatePod(logger, oldPod, newPod); err != nil { utilruntime.HandleErrorWithLogger(logger, err, "Scheduler cache UpdatePod failed", "pod", klog.KObj(oldPod)) diff --git a/pkg/scheduler/eventhandlers_test.go b/pkg/scheduler/eventhandlers_test.go index dadf9bc91f9..d8f6a6707a8 100644 --- a/pkg/scheduler/eventhandlers_test.go +++ b/pkg/scheduler/eventhandlers_test.go @@ -32,26 +32,26 @@ import ( resourcealphaapi "k8s.io/api/resource/v1alpha3" storagev1 "k8s.io/api/storage/v1" "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/version" utilfeature "k8s.io/apiserver/pkg/util/feature" - featuregatetesting "k8s.io/component-base/featuregate/testing" - resourceslicetracker "k8s.io/dynamic-resource-allocation/resourceslice/tracker" - "k8s.io/klog/v2" - "k8s.io/klog/v2/ktesting" - - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/client-go/dynamic/dynamicinformer" dyfake "k8s.io/client-go/dynamic/fake" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes/fake" - + featuregatetesting "k8s.io/component-base/featuregate/testing" + resourceslicetracker "k8s.io/dynamic-resource-allocation/resourceslice/tracker" + "k8s.io/klog/v2" + "k8s.io/klog/v2/ktesting" fwk "k8s.io/kube-scheduler/framework" "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/pkg/scheduler/backend/api_dispatcher" internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache" internalqueue "k8s.io/kubernetes/pkg/scheduler/backend/queue" "k8s.io/kubernetes/pkg/scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/framework/api_calls" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodename" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports" @@ -162,16 +162,22 @@ func TestEventHandlers_MoveToActiveOnNominatedNodeUpdate(t *testing.T) { client := fake.NewClientset(objs...) informerFactory := informers.NewSharedInformerFactory(client, 0) + // apiDispatcher is unused in the test, but initializing it anyway.
+ apiDispatcher := apidispatcher.New(client, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() + recorder := metrics.NewMetricsAsyncRecorder(3, 20*time.Microsecond, ctx.Done()) queue := internalqueue.NewPriorityQueue( newDefaultQueueSort(), informerFactory, internalqueue.WithMetricsRecorder(*recorder), internalqueue.WithQueueingHintMapPerProfile(queueingHintMap), + internalqueue.WithAPIDispatcher(apiDispatcher), // disable backoff queue internalqueue.WithPodInitialBackoffDuration(0), internalqueue.WithPodMaxBackoffDuration(0)) - schedulerCache := internalcache.New(ctx, 30*time.Second) + schedulerCache := internalcache.New(ctx, 30*time.Second, nil) // Put test pods into unschedulable queue for _, pod := range unschedulablePods { @@ -186,7 +192,7 @@ func TestEventHandlers_MoveToActiveOnNominatedNodeUpdate(t *testing.T) { } } - s, _, err := initScheduler(ctx, schedulerCache, queue, client, informerFactory) + s, _, err := initScheduler(ctx, schedulerCache, queue, apiDispatcher, client, informerFactory) if err != nil { t.Fatalf("Failed to initialize test scheduler: %v", err) } @@ -242,7 +248,7 @@ func TestUpdatePodInCache(t *testing.T) { ctx, cancel := context.WithCancel(ctx) defer cancel() sched := &Scheduler{ - Cache: internalcache.New(ctx, ttl), + Cache: internalcache.New(ctx, ttl, nil), SchedulingQueue: internalqueue.NewTestQueue(ctx, nil), logger: logger, } diff --git a/pkg/scheduler/extender_test.go b/pkg/scheduler/extender_test.go index 8cd83ebba3b..78412ed8eff 100644 --- a/pkg/scheduler/extender_test.go +++ b/pkg/scheduler/extender_test.go @@ -333,7 +333,7 @@ func TestSchedulerWithExtenders(t *testing.T) { ctx, cancel := context.WithCancel(ctx) defer cancel() - cache := internalcache.New(ctx, time.Duration(0)) + cache := internalcache.New(ctx, time.Duration(0), nil) for _, name := range test.nodes { cache.AddNode(logger, createNode(name)) } diff --git a/pkg/scheduler/framework/api_calls/api_calls.go b/pkg/scheduler/framework/api_calls/api_calls.go new file mode 100644 index 00000000000..6a37e4ac73b --- /dev/null +++ b/pkg/scheduler/framework/api_calls/api_calls.go @@ -0,0 +1,45 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package apicalls + +import ( + fwk "k8s.io/kube-scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/framework" +) + +const ( + PodStatusPatch fwk.APICallType = "pod_status_patch" + PodBinding fwk.APICallType = "pod_binding" +) + +// Relevances is a built-in mapping from types to relevances. +// Types of the same relevance should only be defined for different object types. +// Misconfiguration of this map can lead to unexpected system behavior, +// so any change has to be well tested and done with care. +// This mapping can be replaced by the out-of-tree plugin in its init() function, if needed. +var Relevances = fwk.APICallRelevances{ + PodStatusPatch: 1, + PodBinding: 2, +} + +// Implementations is a built-in mapping from types to calls' constructors.
+// It's used to construct calls' objects in the scheduler framework and for easier replacement of those. +// This mapping can be replaced by the out-of-tree plugin in its init() function, if needed. +var Implementations = framework.APICallImplementations[*PodStatusPatchCall, *PodBindingCall]{ + PodStatusPatch: NewPodStatusPatchCall, + PodBinding: NewPodBindingCall, +} diff --git a/pkg/scheduler/framework/api_calls/pod_binding.go b/pkg/scheduler/framework/api_calls/pod_binding.go new file mode 100644 index 00000000000..6ca93e64efb --- /dev/null +++ b/pkg/scheduler/framework/api_calls/pod_binding.go @@ -0,0 +1,68 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package apicalls + +import ( + "context" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + fwk "k8s.io/kube-scheduler/framework" +) + +// PodBindingCall is used to bind the pod using the binding details. +type PodBindingCall struct { + binding *v1.Binding +} + +func NewPodBindingCall(binding *v1.Binding) *PodBindingCall { + return &PodBindingCall{ + binding: binding, + } +} + +func (pbc *PodBindingCall) CallType() fwk.APICallType { + return PodBinding +} + +func (pbc *PodBindingCall) UID() types.UID { + return pbc.binding.UID +} + +func (pbc *PodBindingCall) Execute(ctx context.Context, client clientset.Interface) error { + logger := klog.FromContext(ctx) + logger.V(3).Info("Attempting to bind pod to node", "pod", klog.KObj(&pbc.binding.ObjectMeta), "node", pbc.binding.Target.Name) + + return client.CoreV1().Pods(pbc.binding.Namespace).Bind(ctx, pbc.binding, metav1.CreateOptions{}) +} + +func (pbc *PodBindingCall) Sync(obj metav1.Object) (metav1.Object, error) { + // Don't need to store or update an object. + return obj, nil +} + +func (pbc *PodBindingCall) Merge(oldCall fwk.APICall) error { + // Merge should just overwrite the previous call. + return nil +} + +func (pbc *PodBindingCall) IsNoOp() bool { + return false +} diff --git a/pkg/scheduler/framework/api_calls/pod_binding_test.go b/pkg/scheduler/framework/api_calls/pod_binding_test.go new file mode 100644 index 00000000000..a12c96b205e --- /dev/null +++ b/pkg/scheduler/framework/api_calls/pod_binding_test.go @@ -0,0 +1,66 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package apicalls + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + clienttesting "k8s.io/client-go/testing" + "k8s.io/klog/v2/ktesting" +) + +func TestPodBindingCall_Execute(t *testing.T) { + _, ctx := ktesting.NewTestContext(t) + binding := &v1.Binding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod", + Namespace: "ns", + }, + Target: v1.ObjectReference{ + Name: "node", + }, + } + + client := fake.NewClientset() + bound := false + client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + createAction := action.(clienttesting.CreateActionImpl) + if createAction.Subresource != "binding" { + return false, nil, nil + } + bound = true + + gotBinding := createAction.GetObject().(*v1.Binding) + if diff := cmp.Diff(binding, gotBinding); diff != "" { + t.Errorf("Execute() sent incorrect binding object (-want,+got):\n%s", diff) + } + return true, nil, nil + }) + + call := NewPodBindingCall(binding) + if err := call.Execute(ctx, client); err != nil { + t.Fatalf("Execute() returned unexpected error: %v", err) + } + if !bound { + t.Error("Expected binding API to be called") + } +} diff --git a/pkg/scheduler/framework/api_calls/pod_status_patch.go b/pkg/scheduler/framework/api_calls/pod_status_patch.go new file mode 100644 index 00000000000..e8f7c4d0711 --- /dev/null +++ b/pkg/scheduler/framework/api_calls/pod_status_patch.go @@ -0,0 +1,183 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package apicalls + +import ( + "context" + "fmt" + "sync" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + fwk "k8s.io/kube-scheduler/framework" + podutil "k8s.io/kubernetes/pkg/api/v1/pod" + "k8s.io/kubernetes/pkg/scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/util" +) + +// PodStatusPatchCall is used to patch the pod status. +type PodStatusPatchCall struct { + lock sync.Mutex + // executed is set at the beginning of the call's Execute + // and is used by Sync to know if the podStatus should be updated. + executed bool + + // podUID is an UID of the pod. + podUID types.UID + // podRef is a reference to the pod. + podRef klog.ObjectRef + // podStatus contains the actual status of the pod. + podStatus *v1.PodStatus + // newCondition is a condition to update. + newCondition *v1.PodCondition + // nominatingInfo is a nominating info to update. 
+ nominatingInfo *framework.NominatingInfo +} + +func NewPodStatusPatchCall(pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) *PodStatusPatchCall { + return &PodStatusPatchCall{ + podUID: pod.UID, + podRef: klog.KObj(pod), + podStatus: pod.Status.DeepCopy(), + newCondition: condition, + nominatingInfo: nominatingInfo, + } +} + +func (psuc *PodStatusPatchCall) CallType() fwk.APICallType { + return PodStatusPatch +} + +func (psuc *PodStatusPatchCall) UID() types.UID { + return psuc.podUID +} + +// syncStatus syncs the given status with condition and nominatingInfo. It returns true if anything was actually updated. +func syncStatus(status *v1.PodStatus, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) bool { + nnnNeedsUpdate := nominatingInfo.Mode() == framework.ModeOverride && status.NominatedNodeName != nominatingInfo.NominatedNodeName + if condition != nil { + if !podutil.UpdatePodCondition(status, condition) && !nnnNeedsUpdate { + return false + } + } else if !nnnNeedsUpdate { + return false + } + if nnnNeedsUpdate { + status.NominatedNodeName = nominatingInfo.NominatedNodeName + } + return true +} + +func (psuc *PodStatusPatchCall) Execute(ctx context.Context, client clientset.Interface) error { + psuc.lock.Lock() + // The executed flag is set so that the podStatus write in Sync afterwards doesn't race with this execution. + psuc.executed = true + condition := psuc.newCondition.DeepCopy() + podStatusCopy := psuc.podStatus.DeepCopy() + psuc.lock.Unlock() + + logger := klog.FromContext(ctx) + if condition != nil { + logger.V(3).Info("Updating pod condition", "pod", psuc.podRef, "conditionType", condition.Type, "conditionStatus", condition.Status, "conditionReason", condition.Reason) + } + + // Sync the status to have the condition and nominatingInfo applied to podStatusCopy. + synced := syncStatus(podStatusCopy, condition, psuc.nominatingInfo) + if !synced { + logger.V(5).Info("Pod status patch call does not need to be executed because it has no effect", "pod", psuc.podRef) + return nil + } + + // It's safe to run PatchPodStatus even on an outdated pod object. + err := util.PatchPodStatus(ctx, client, psuc.podRef.Name, psuc.podRef.Namespace, psuc.podStatus, podStatusCopy) + if err != nil { + logger.Error(err, "Failed to patch pod status", "pod", psuc.podRef) + return err + } + + return nil +} + +func (psuc *PodStatusPatchCall) Sync(obj metav1.Object) (metav1.Object, error) { + pod, ok := obj.(*v1.Pod) + if !ok { + return obj, fmt.Errorf("unexpected error: object of type %T is not of type *v1.Pod", obj) + } + + psuc.lock.Lock() + if !psuc.executed { + // Set podStatus only if the call execution hasn't started yet, + // because otherwise it's irrelevant and might race. + psuc.podStatus = pod.Status.DeepCopy() + } + psuc.lock.Unlock() + + podCopy := pod.DeepCopy() + // Sync passed pod's status with the call's condition and nominatingInfo. + synced := syncStatus(&podCopy.Status, psuc.newCondition, psuc.nominatingInfo) + if !synced { + return pod, nil + } + return podCopy, nil +} + +func (psuc *PodStatusPatchCall) Merge(oldCall fwk.APICall) error { + oldPsuc, ok := oldCall.(*PodStatusPatchCall) + if !ok { + return fmt.Errorf("unexpected error: call of type %T is not of type *PodStatusPatchCall", oldCall) + } + if psuc.nominatingInfo.Mode() == framework.ModeNoop && oldPsuc.nominatingInfo.Mode() == framework.ModeOverride { + // Set a nominatingInfo from an old call if the new one is no-op.
+ psuc.nominatingInfo = oldPsuc.nominatingInfo + } + if psuc.newCondition == nil && oldPsuc.newCondition != nil { + // Set a condition from an old call if the new one is nil. + psuc.newCondition = oldPsuc.newCondition + } + return nil +} + +// conditionNeedsUpdate checks if the pod condition needs update. +func conditionNeedsUpdate(status *v1.PodStatus, condition *v1.PodCondition) bool { + // Try to find this pod condition. + _, oldCondition := podutil.GetPodCondition(status, condition.Type) + if oldCondition == nil { + return true + } + + isEqual := condition.Status == oldCondition.Status && + condition.Reason == oldCondition.Reason && + condition.Message == oldCondition.Message && + condition.LastProbeTime.Equal(&oldCondition.LastProbeTime) + + // Return true if one of the fields have changed. + return !isEqual +} + +func (psuc *PodStatusPatchCall) IsNoOp() bool { + nnnNeedsUpdate := psuc.nominatingInfo.Mode() == framework.ModeOverride && psuc.podStatus.NominatedNodeName != psuc.nominatingInfo.NominatedNodeName + if nnnNeedsUpdate { + return false + } + if psuc.newCondition == nil { + return true + } + return !conditionNeedsUpdate(psuc.podStatus, psuc.newCondition) +} diff --git a/pkg/scheduler/framework/api_calls/pod_status_patch_test.go b/pkg/scheduler/framework/api_calls/pod_status_patch_test.go new file mode 100644 index 00000000000..4b38c334bcf --- /dev/null +++ b/pkg/scheduler/framework/api_calls/pod_status_patch_test.go @@ -0,0 +1,250 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package apicalls + +import ( + "testing" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + clienttesting "k8s.io/client-go/testing" + "k8s.io/klog/v2/ktesting" + "k8s.io/kubernetes/pkg/scheduler/framework" +) + +func TestPodStatusPatchCall_IsNoOp(t *testing.T) { + podWithNode := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "uid", + }, + Status: v1.PodStatus{ + NominatedNodeName: "node-a", + Conditions: []v1.PodCondition{ + { + Type: v1.PodScheduled, + Status: v1.ConditionFalse, + }, + }, + }, + } + + tests := []struct { + name string + pod *v1.Pod + condition *v1.PodCondition + nominatingInfo *framework.NominatingInfo + want bool + }{ + { + name: "No-op when condition and node name match", + pod: podWithNode, + condition: &v1.PodCondition{Type: v1.PodScheduled, Status: v1.ConditionFalse}, + nominatingInfo: &framework.NominatingInfo{NominatedNodeName: "node-a", NominatingMode: framework.ModeOverride}, + want: true, + }, + { + name: "Not no-op when condition is different", + pod: podWithNode, + condition: &v1.PodCondition{Type: v1.PodScheduled, Status: v1.ConditionTrue}, + nominatingInfo: &framework.NominatingInfo{NominatedNodeName: "node-a", NominatingMode: framework.ModeOverride}, + want: false, + }, + { + name: "Not no-op when nominated node name is different", + pod: podWithNode, + condition: &v1.PodCondition{Type: v1.PodScheduled, Status: v1.ConditionFalse}, + nominatingInfo: &framework.NominatingInfo{NominatedNodeName: "node-b", NominatingMode: framework.ModeOverride}, + want: false, + }, + { + name: "No-op when condition is nil and node name matches", + pod: podWithNode, + condition: nil, + nominatingInfo: &framework.NominatingInfo{NominatedNodeName: "node-a", NominatingMode: framework.ModeOverride}, + want: true, + }, + { + name: "Not no-op when condition is nil but node name differs", + pod: podWithNode, + condition: nil, + nominatingInfo: &framework.NominatingInfo{NominatedNodeName: "node-b", NominatingMode: framework.ModeOverride}, + want: false, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + call := NewPodStatusPatchCall(test.pod, test.condition, test.nominatingInfo) + if got := call.IsNoOp(); got != test.want { + t.Errorf("Expected IsNoOp() to return %v, but got %v", test.want, got) + } + }) + } +} + +func TestPodStatusPatchCall_Merge(t *testing.T) { + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "uid", + }, + } + + t.Run("Merges nominating info and condition from the old call", func(t *testing.T) { + oldCall := NewPodStatusPatchCall(pod, &v1.PodCondition{Type: v1.PodScheduled, Status: v1.ConditionFalse}, + &framework.NominatingInfo{NominatedNodeName: "node-a", NominatingMode: framework.ModeOverride}, + ) + newCall := NewPodStatusPatchCall(pod, nil, &framework.NominatingInfo{NominatingMode: framework.ModeNoop}) + + if err := newCall.Merge(oldCall); err != nil { + t.Fatalf("Unexpected error returned by Merge(): %v", err) + } + if newCall.nominatingInfo.NominatedNodeName != "node-a" { + t.Errorf("Expected NominatedNodeName to be node-a, but got: %v", newCall.nominatingInfo.NominatedNodeName) + } + if newCall.newCondition == nil || newCall.newCondition.Type != v1.PodScheduled { + t.Errorf("Expected PodScheduled condition, but got: %v", newCall.newCondition) + } + }) + + t.Run("Doesn't overwrite nominating info and condition of a new call", func(t *testing.T) { + oldCall := NewPodStatusPatchCall(pod, nil, 
&framework.NominatingInfo{NominatingMode: framework.ModeNoop}) + newCall := NewPodStatusPatchCall(pod, &v1.PodCondition{Type: v1.PodScheduled, Status: v1.ConditionFalse}, + &framework.NominatingInfo{NominatedNodeName: "node-b", NominatingMode: framework.ModeOverride}) + + if err := newCall.Merge(oldCall); err != nil { + t.Fatalf("Unexpected error returned by Merge(): %v", err) + } + if newCall.nominatingInfo.NominatedNodeName != "node-b" { + t.Errorf("Expected NominatedNodeName to be node-b, but got: %v", newCall.nominatingInfo.NominatedNodeName) + } + if newCall.newCondition == nil || newCall.newCondition.Type != v1.PodScheduled { + t.Errorf("Expected PodScheduled condition, but got: %v", newCall.newCondition) + } + }) +} + +func TestPodStatusPatchCall_Sync(t *testing.T) { + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "uid", + }, + Status: v1.PodStatus{ + NominatedNodeName: "node-a", + Conditions: []v1.PodCondition{ + { + Type: v1.PodScheduled, + Status: v1.ConditionFalse, + }, + }, + }, + } + + t.Run("Syncs the status before execution and updates the pod", func(t *testing.T) { + call := NewPodStatusPatchCall(pod, nil, + &framework.NominatingInfo{NominatedNodeName: "node-c", NominatingMode: framework.ModeOverride}) + + updatedPod := pod.DeepCopy() + updatedPod.Status.NominatedNodeName = "node-b" + + syncedObj, err := call.Sync(updatedPod) + if err != nil { + t.Fatalf("Unexpected error returned by Sync(): %v", err) + } + if call.podStatus.NominatedNodeName != "node-b" { + t.Errorf("Expected podStatus NominatedNodeName to be node-b, but got: %v", call.podStatus.NominatedNodeName) + } + syncedPod := syncedObj.(*v1.Pod) + if syncedPod.Status.NominatedNodeName != "node-c" { + t.Errorf("Expected synced pod's NominatedNodeName to be node-c, but got: %v", syncedPod.Status.NominatedNodeName) + } + }) + + t.Run("Doesn't sync internal status during or after execution, but updates the pod", func(t *testing.T) { + call := NewPodStatusPatchCall(pod, nil, + &framework.NominatingInfo{NominatedNodeName: "node-c", NominatingMode: framework.ModeOverride}) + call.executed = true + + updatedPod := pod.DeepCopy() + updatedPod.Status.NominatedNodeName = "node-b" + + syncedObj, err := call.Sync(updatedPod) + if err != nil { + t.Fatalf("Unexpected error returned by Sync(): %v", err) + } + if call.podStatus.NominatedNodeName != "node-a" { + t.Errorf("Expected podStatus NominatedNodeName to be node-a, but got: %v", call.podStatus.NominatedNodeName) + } + syncedPod := syncedObj.(*v1.Pod) + if syncedPod.Status.NominatedNodeName != "node-c" { + t.Errorf("Expected synced pod's NominatedNodeName to be node-c, but got: %v", syncedPod.Status.NominatedNodeName) + } + }) +} + +func TestPodStatusPatchCall_Execute(t *testing.T) { + _, ctx := ktesting.NewTestContext(t) + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "uid", + Name: "pod", + Namespace: "ns", + }, + Status: v1.PodStatus{ + NominatedNodeName: "node-a", + }, + } + + t.Run("Successful patch", func(t *testing.T) { + client := fake.NewClientset() + patched := false + client.PrependReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + patched = true + return true, nil, nil + }) + + call := NewPodStatusPatchCall(pod, &v1.PodCondition{Type: v1.PodScheduled, Status: v1.ConditionFalse}, + &framework.NominatingInfo{NominatingMode: framework.ModeNoop}) + if err := call.Execute(ctx, client); err != nil { + t.Fatalf("Unexpected error returned by Execute(): %v", err) + } + if !patched { + t.Error("Expected patch API 
to be called") + } + if !call.executed { + t.Error("Expected 'executed' flag to be set during execution") + } + }) + + t.Run("Skip API call if patch is not needed", func(t *testing.T) { + client := fake.NewClientset() + patched := false + client.PrependReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + patched = true + return true, nil, nil + }) + + noOpCall := NewPodStatusPatchCall(pod, nil, + &framework.NominatingInfo{NominatedNodeName: "node-a", NominatingMode: framework.ModeOverride}) + if err := noOpCall.Execute(ctx, client); err != nil { + t.Fatalf("Unexpected error returned by Execute(): %v", err) + } + if patched { + t.Error("Expected patch API not to be called if the call is no-op") + } + }) +} diff --git a/pkg/scheduler/framework/interface.go b/pkg/scheduler/framework/interface.go index 6e520307018..6a8694eb59c 100644 --- a/pkg/scheduler/framework/interface.go +++ b/pkg/scheduler/framework/interface.go @@ -579,6 +579,8 @@ type Framework interface { SetPodNominator(nominator PodNominator) // SetPodActivator sets the PodActivator SetPodActivator(activator PodActivator) + // SetAPICacher sets the APICacher + SetAPICacher(apiCacher APICacher) // Close calls Close method of each plugin. Close() error @@ -642,6 +644,49 @@ type Handle interface { // Parallelizer returns a parallelizer holding parallelism for scheduler. Parallelizer() parallelize.Parallelizer + + // APIDispatcher returns a fwk.APIDispatcher that can be used to dispatch API calls directly. + // This is non-nil if the SchedulerAsyncAPICalls feature gate is enabled. + APIDispatcher() fwk.APIDispatcher + + // APICacher returns an APICacher that coordinates API calls with the scheduler's internal cache. + // Use this to ensure the scheduler's view of the cluster remains consistent. + // This is non-nil if the SchedulerAsyncAPICalls feature gate is enabled. + APICacher() APICacher +} + +// APICacher defines methods that send API calls through the scheduler's cache +// before they are executed asynchronously by the APIDispatcher. +// This ensures the scheduler's internal state is updated optimistically, +// reflecting the intended outcome of the call. +// This methods should be used only if the SchedulerAsyncAPICalls feature gate is enabled. +type APICacher interface { + // PatchPodStatus sends a patch request for a Pod's status. + // The patch could be first applied to the cached Pod object and then the API call is executed asynchronously. + // It returns a channel that can be used to wait for the call's completion. + PatchPodStatus(pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *NominatingInfo) (<-chan error, error) + + // BindPod sends a binding request. The binding could be first applied to the cached Pod object + // and then the API call is executed asynchronously. + // It returns a channel that can be used to wait for the call's completion. + BindPod(binding *v1.Binding) (<-chan error, error) + + // WaitOnFinish blocks until the result of an API call is sent to the given onFinish channel + // (returned by methods BindPod or PreemptPod). + // + // It returns the error received from the channel. + // It also returns nil if the call was skipped or overwritten, + // as these are considered successful lifecycle outcomes. + // Direct onFinish channel read can be used to access these results. + WaitOnFinish(ctx context.Context, onFinish <-chan error) error +} + +// APICallImplementations define constructors for each fwk.APICall that is used by the scheduler internally. 
+type APICallImplementations[T, K fwk.APICall] struct { + // PodStatusPatch is a constructor used to create fwk.APICall object for pod status patch. + PodStatusPatch func(pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *NominatingInfo) T + // PodBinding is a constructor used to create fwk.APICall object for pod binding. + PodBinding func(binding *v1.Binding) K } // PreFilterResult wraps needed info for scheduler framework to act upon PreFilter phase. diff --git a/pkg/scheduler/framework/plugins/defaultbinder/default_binder.go b/pkg/scheduler/framework/plugins/defaultbinder/default_binder.go index 24ab2a8faa5..eb9207fa4b3 100644 --- a/pkg/scheduler/framework/plugins/defaultbinder/default_binder.go +++ b/pkg/scheduler/framework/plugins/defaultbinder/default_binder.go @@ -51,11 +51,23 @@ func (b DefaultBinder) Name() string { // Bind binds pods to nodes using the k8s client. func (b DefaultBinder) Bind(ctx context.Context, state fwk.CycleState, p *v1.Pod, nodeName string) *fwk.Status { logger := klog.FromContext(ctx) - logger.V(3).Info("Attempting to bind pod to node", "pod", klog.KObj(p), "node", klog.KRef("", nodeName)) binding := &v1.Binding{ ObjectMeta: metav1.ObjectMeta{Namespace: p.Namespace, Name: p.Name, UID: p.UID}, Target: v1.ObjectReference{Kind: "Node", Name: nodeName}, } + if b.handle.APICacher() != nil { + // When API cacher is available, use it to bind the pod. + onFinish, err := b.handle.APICacher().BindPod(binding) + if err != nil { + return fwk.AsStatus(err) + } + err = b.handle.APICacher().WaitOnFinish(ctx, onFinish) + if err != nil { + return fwk.AsStatus(err) + } + return nil + } + logger.V(3).Info("Attempting to bind pod to node", "pod", klog.KObj(p), "node", klog.KRef("", nodeName)) err := b.handle.ClientSet().CoreV1().Pods(binding.Namespace).Bind(ctx, binding, metav1.CreateOptions{}) if err != nil { return fwk.AsStatus(err) diff --git a/pkg/scheduler/framework/plugins/defaultbinder/default_binder_test.go b/pkg/scheduler/framework/plugins/defaultbinder/default_binder_test.go index 72a48022987..71b72df7c39 100644 --- a/pkg/scheduler/framework/plugins/defaultbinder/default_binder_test.go +++ b/pkg/scheduler/framework/plugins/defaultbinder/default_binder_test.go @@ -19,7 +19,9 @@ package defaultbinder import ( "context" "errors" + "fmt" "testing" + "time" "github.com/google/go-cmp/cmp" @@ -29,10 +31,19 @@ import ( "k8s.io/client-go/kubernetes/fake" clienttesting "k8s.io/client-go/testing" "k8s.io/klog/v2/ktesting" + "k8s.io/kubernetes/pkg/scheduler/backend/api_cache" + "k8s.io/kubernetes/pkg/scheduler/backend/api_dispatcher" + internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache" + "k8s.io/kubernetes/pkg/scheduler/framework/api_calls" frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime" + "k8s.io/kubernetes/pkg/scheduler/metrics" st "k8s.io/kubernetes/pkg/scheduler/testing" ) +func init() { + metrics.Register() +} + func TestDefaultBinder(t *testing.T) { testPod := st.MakePod().Name("foo").Namespace("ns").Obj() testNode := "foohost.kubernetes.mydomain.com" @@ -52,37 +63,51 @@ func TestDefaultBinder(t *testing.T) { injectErr: errors.New("binding error"), }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - _, ctx := ktesting.NewTestContext(t) - ctx, cancel := context.WithCancel(ctx) - defer cancel() + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, tt := range tests { + t.Run(fmt.Sprintf("%s (Async API calls enabled: %v)", tt.name, asyncAPICallsEnabled), func(t *testing.T) { + logger, ctx := 
ktesting.NewTestContext(t) + ctx, cancel := context.WithCancel(ctx) + defer cancel() - var gotBinding *v1.Binding - client := fake.NewClientset(testPod) - client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - if action.GetSubresource() != "binding" { - return false, nil, nil + var gotBinding *v1.Binding + client := fake.NewClientset(testPod) + client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + if action.GetSubresource() != "binding" { + return false, nil, nil + } + if tt.injectErr != nil { + return true, nil, tt.injectErr + } + gotBinding = action.(clienttesting.CreateAction).GetObject().(*v1.Binding) + return true, gotBinding, nil + }) + + var apiDispatcher *apidispatcher.APIDispatcher + if asyncAPICallsEnabled { + apiDispatcher = apidispatcher.New(client, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() } - if tt.injectErr != nil { - return true, nil, tt.injectErr + + fh, err := frameworkruntime.NewFramework(ctx, nil, nil, frameworkruntime.WithClientSet(client), frameworkruntime.WithAPIDispatcher(apiDispatcher)) + if err != nil { + t.Fatal(err) + } + if asyncAPICallsEnabled { + cache := internalcache.New(ctx, time.Duration(0), apiDispatcher) + fh.SetAPICacher(apicache.New(nil, cache)) + } + + binder := &DefaultBinder{handle: fh} + status := binder.Bind(ctx, nil, testPod, testNode) + if got := status.AsError(); (tt.injectErr != nil) != (got != nil) { + t.Errorf("got error %q, want %q", got, tt.injectErr) + } + if diff := cmp.Diff(tt.wantBinding, gotBinding); diff != "" { + t.Errorf("got different binding (-want, +got): %s", diff) } - gotBinding = action.(clienttesting.CreateAction).GetObject().(*v1.Binding) - return true, gotBinding, nil }) - - fh, err := frameworkruntime.NewFramework(ctx, nil, nil, frameworkruntime.WithClientSet(client)) - if err != nil { - t.Fatal(err) - } - binder := &DefaultBinder{handle: fh} - status := binder.Bind(ctx, nil, testPod, testNode) - if got := status.AsError(); (tt.injectErr != nil) != (got != nil) { - t.Errorf("got error %q, want %q", got, tt.injectErr) - } - if diff := cmp.Diff(tt.wantBinding, gotBinding); diff != "" { - t.Errorf("got different binding (-want, +got): %s", diff) - } - }) + } } } diff --git a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go index f0abccfb457..ae7cdc40c33 100644 --- a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go +++ b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go @@ -52,9 +52,12 @@ import ( apipod "k8s.io/kubernetes/pkg/api/v1/pod" "k8s.io/kubernetes/pkg/scheduler/apis/config" configv1 "k8s.io/kubernetes/pkg/scheduler/apis/config/v1" + "k8s.io/kubernetes/pkg/scheduler/backend/api_cache" + "k8s.io/kubernetes/pkg/scheduler/backend/api_dispatcher" internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache" internalqueue "k8s.io/kubernetes/pkg/scheduler/backend/queue" "k8s.io/kubernetes/pkg/scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/framework/api_calls" "k8s.io/kubernetes/pkg/scheduler/framework/parallelize" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature" @@ -390,80 +393,99 @@ func TestPostFilter(t *testing.T) { }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // index the potential victim pods in the 
fake client so that the victims deletion logic does not fail - podItems := []v1.Pod{} - for _, pod := range tt.pods { - podItems = append(podItems, *pod) - } - cs := clientsetfake.NewClientset(&v1.PodList{Items: podItems}) - informerFactory := informers.NewSharedInformerFactory(cs, 0) - podInformer := informerFactory.Core().V1().Pods().Informer() - podInformer.GetStore().Add(tt.pod) - for i := range tt.pods { - podInformer.GetStore().Add(tt.pods[i]) - } - pdbInformer := informerFactory.Policy().V1().PodDisruptionBudgets().Informer() - for i := range tt.pdbs { - if err := pdbInformer.GetStore().Add(tt.pdbs[i]); err != nil { + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, tt := range tests { + t.Run(fmt.Sprintf("%s (Async API calls enabled: %v)", tt.name, asyncAPICallsEnabled), func(t *testing.T) { + // index the potential victim pods in the fake client so that the victims deletion logic does not fail + podItems := []v1.Pod{} + for _, pod := range tt.pods { + podItems = append(podItems, *pod) + } + cs := clientsetfake.NewClientset(&v1.PodList{Items: podItems}) + informerFactory := informers.NewSharedInformerFactory(cs, 0) + podInformer := informerFactory.Core().V1().Pods().Informer() + if err := podInformer.GetStore().Add(tt.pod); err != nil { t.Fatal(err) } - } - - // Register NodeResourceFit as the Filter & PreFilter plugin. - registeredPlugins := []tf.RegisterPluginFunc{ - tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - tf.RegisterPluginAsExtensions(noderesources.Name, nodeResourcesFitFunc, "Filter", "PreFilter"), - tf.RegisterPluginAsExtensions("test-plugin", newTestPlugin, "PreFilter"), - tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - } - var extenders []framework.Extender - if tt.extender != nil { - extenders = append(extenders, tt.extender) - } - logger, ctx := ktesting.NewTestContext(t) - ctx, cancel := context.WithCancel(ctx) - defer cancel() - f, err := tf.NewFramework(ctx, registeredPlugins, "", - frameworkruntime.WithClientSet(cs), - frameworkruntime.WithEventRecorder(&events.FakeRecorder{}), - frameworkruntime.WithInformerFactory(informerFactory), - frameworkruntime.WithPodNominator(internalqueue.NewSchedulingQueue(nil, informerFactory)), - frameworkruntime.WithExtenders(extenders), - frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(tt.pods, tt.nodes)), - frameworkruntime.WithLogger(logger), - frameworkruntime.WithWaitingPods(frameworkruntime.NewWaitingPodsMap()), - ) - if err != nil { - t.Fatal(err) - } - p, err := New(ctx, getDefaultDefaultPreemptionArgs(), f, feature.Features{}) - if err != nil { - t.Fatal(err) - } - - state := framework.NewCycleState() - // Ensure is populated. - if _, status, _ := f.RunPreFilterPlugins(ctx, state, tt.pod); !status.IsSuccess() { - t.Errorf("Unexpected PreFilter Status: %v", status) - } - - gotResult, gotStatus := p.PostFilter(ctx, state, tt.pod, tt.filteredNodesStatuses) - // As we cannot compare two errors directly due to miss the equal method for how to compare two errors, so just need to compare the reasons. 
- if gotStatus.Code() == fwk.Error { - if diff := cmp.Diff(tt.wantStatus.Reasons(), gotStatus.Reasons()); diff != "" { - t.Errorf("Unexpected status (-want, +got):\n%s", diff) + for i := range tt.pods { + if err := podInformer.GetStore().Add(tt.pods[i]); err != nil { + t.Fatal(err) + } } - } else { - if diff := cmp.Diff(tt.wantStatus, gotStatus); diff != "" { - t.Errorf("Unexpected status (-want, +got):\n%s", diff) + pdbInformer := informerFactory.Policy().V1().PodDisruptionBudgets().Informer() + for i := range tt.pdbs { + if err := pdbInformer.GetStore().Add(tt.pdbs[i]); err != nil { + t.Fatal(err) + } } - } - if diff := cmp.Diff(tt.wantResult, gotResult); diff != "" { - t.Errorf("Unexpected postFilterResult (-want, +got):\n%s", diff) - } - }) + + // Register NodeResourceFit as the Filter & PreFilter plugin. + registeredPlugins := []tf.RegisterPluginFunc{ + tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + tf.RegisterPluginAsExtensions(noderesources.Name, nodeResourcesFitFunc, "Filter", "PreFilter"), + tf.RegisterPluginAsExtensions("test-plugin", newTestPlugin, "PreFilter"), + tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + } + var extenders []framework.Extender + if tt.extender != nil { + extenders = append(extenders, tt.extender) + } + logger, ctx := ktesting.NewTestContext(t) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + var apiDispatcher *apidispatcher.APIDispatcher + if asyncAPICallsEnabled { + apiDispatcher = apidispatcher.New(cs, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() + } + + f, err := tf.NewFramework(ctx, registeredPlugins, "", + frameworkruntime.WithClientSet(cs), + frameworkruntime.WithAPIDispatcher(apiDispatcher), + frameworkruntime.WithEventRecorder(&events.FakeRecorder{}), + frameworkruntime.WithInformerFactory(informerFactory), + frameworkruntime.WithPodNominator(internalqueue.NewSchedulingQueue(nil, informerFactory)), + frameworkruntime.WithExtenders(extenders), + frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(tt.pods, tt.nodes)), + frameworkruntime.WithLogger(logger), + frameworkruntime.WithWaitingPods(frameworkruntime.NewWaitingPodsMap()), + ) + if err != nil { + t.Fatal(err) + } + if asyncAPICallsEnabled { + cache := internalcache.New(ctx, 100*time.Millisecond, apiDispatcher) + f.SetAPICacher(apicache.New(nil, cache)) + } + + p, err := New(ctx, getDefaultDefaultPreemptionArgs(), f, feature.Features{}) + if err != nil { + t.Fatal(err) + } + + state := framework.NewCycleState() + // Ensure the cycle state is populated. + if _, status, _ := f.RunPreFilterPlugins(ctx, state, tt.pod); !status.IsSuccess() { + t.Errorf("Unexpected PreFilter Status: %v", status) + } + + gotResult, gotStatus := p.PostFilter(ctx, state, tt.pod, tt.filteredNodesStatuses) + // Errors cannot be compared directly (there is no Equal method on them), so compare only the status reasons.
+ if gotStatus.Code() == fwk.Error { + if diff := cmp.Diff(tt.wantStatus.Reasons(), gotStatus.Reasons()); diff != "" { + t.Errorf("Unexpected status (-want, +got):\n%s", diff) + } + } else { + if diff := cmp.Diff(tt.wantStatus, gotStatus); diff != "" { + t.Errorf("Unexpected status (-want, +got):\n%s", diff) + } + } + if diff := cmp.Diff(tt.wantResult, gotResult); diff != "" { + t.Errorf("Unexpected postFilterResult (-want, +got):\n%s", diff) + } + }) + } } } @@ -2075,228 +2097,241 @@ func TestPreempt(t *testing.T) { labelKeys := []string{"hostname", "zone", "region"} for _, asyncPreemptionEnabled := range []bool{true, false} { - for _, test := range tests { - t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) { - client := clientsetfake.NewClientset() - informerFactory := informers.NewSharedInformerFactory(client, 0) - podInformer := informerFactory.Core().V1().Pods().Informer() - testPod := test.pod.DeepCopy() - testPods := make([]*v1.Pod, len(test.pods)) - for i := range test.pods { - testPods[i] = test.pods[i].DeepCopy() - } - - if err := podInformer.GetStore().Add(testPod); err != nil { - t.Fatalf("Failed to add test pod %s: %v", testPod.Name, err) - } - for i := range testPods { - if err := podInformer.GetStore().Add(testPods[i]); err != nil { - t.Fatalf("Failed to add test pod %s: %v", testPods[i], err) - } - } - - // Need to protect deletedPodNames and patchedPodNames to prevent DATA RACE panic. - var mu sync.RWMutex - deletedPodNames := sets.New[string]() - patchedPodNames := sets.New[string]() - patchedPods := []*v1.Pod{} - client.PrependReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - patchAction := action.(clienttesting.PatchAction) - podName := patchAction.GetName() - namespace := patchAction.GetNamespace() - patch := patchAction.GetPatch() - pod, err := informerFactory.Core().V1().Pods().Lister().Pods(namespace).Get(podName) - if err != nil { - t.Fatalf("Failed to get the original pod %s/%s before patching: %v\n", namespace, podName, err) - } - marshalledPod, err := json.Marshal(pod) - if err != nil { - t.Fatalf("Failed to marshal the original pod %s/%s: %v", namespace, podName, err) - } - updated, err := strategicpatch.StrategicMergePatch(marshalledPod, patch, v1.Pod{}) - if err != nil { - t.Fatalf("Failed to apply strategic merge patch %q on pod %#v: %v", patch, marshalledPod, err) - } - updatedPod := &v1.Pod{} - if err := json.Unmarshal(updated, updatedPod); err != nil { - t.Fatalf("Failed to unmarshal updated pod %q: %v", updated, err) - } - patchedPods = append(patchedPods, updatedPod) - mu.Lock() - defer mu.Unlock() - patchedPodNames.Insert(podName) - return true, nil, nil - }) - client.PrependReactor("delete", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - mu.Lock() - defer mu.Unlock() - deletedPodNames.Insert(action.(clienttesting.DeleteAction).GetName()) - return true, nil, nil - }) - - logger, ctx := ktesting.NewTestContext(t) - ctx, cancel := context.WithCancel(ctx) - defer cancel() - - waitingPods := frameworkruntime.NewWaitingPodsMap() - - cache := internalcache.New(ctx, time.Duration(0)) - for _, pod := range testPods { - if err := cache.AddPod(logger, pod.DeepCopy()); err != nil { - t.Fatalf("Failed to add pod %s: %v", pod.Name, err) - } - } - cachedNodeInfoMap := map[string]*framework.NodeInfo{} - nodes := make([]*v1.Node, len(test.nodeNames)) - for i, name := range test.nodeNames { - node := 
st.MakeNode().Name(name).Capacity(veryLargeRes).Obj() - // Split node name by '/' to form labels in a format of - // {"hostname": node.Name[0], "zone": node.Name[1], "region": node.Name[2]} - node.ObjectMeta.Labels = make(map[string]string) - for i, label := range strings.Split(node.Name, "/") { - node.ObjectMeta.Labels[labelKeys[i]] = label - } - node.Name = node.ObjectMeta.Labels["hostname"] - t.Logf("node is added: %v. labels: %#v", node.Name, node.ObjectMeta.Labels) - cache.AddNode(logger, node) - nodes[i] = node - - // Set nodeInfo to extenders to mock extenders' cache for preemption. - cachedNodeInfo := framework.NewNodeInfo() - cachedNodeInfo.SetNode(node) - cachedNodeInfoMap[node.Name] = cachedNodeInfo - } - var extenders []framework.Extender - for _, extender := range test.extenders { - // Set nodeInfoMap as extenders cached node information. - extender.CachedNodeNameToInfo = cachedNodeInfoMap - extenders = append(extenders, extender) - } - schedFramework, err := tf.NewFramework( - ctx, - []tf.RegisterPluginFunc{ - test.registerPlugin, - tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, - "", - frameworkruntime.WithClientSet(client), - frameworkruntime.WithEventRecorder(&events.FakeRecorder{}), - frameworkruntime.WithExtenders(extenders), - frameworkruntime.WithPodNominator(internalqueue.NewSchedulingQueue(nil, informerFactory)), - frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(testPods, nodes)), - frameworkruntime.WithInformerFactory(informerFactory), - frameworkruntime.WithWaitingPods(waitingPods), - frameworkruntime.WithLogger(logger), - frameworkruntime.WithPodActivator(&fakePodActivator{}), - ) - if err != nil { - t.Fatal(err) - } - - state := framework.NewCycleState() - // Some tests rely on PreFilter plugin to compute its CycleState. - if _, s, _ := schedFramework.RunPreFilterPlugins(ctx, state, testPod); !s.IsSuccess() { - t.Errorf("Unexpected preFilterStatus: %v", s) - } - // Call preempt and check the expected results. - features := feature.Features{ - EnableAsyncPreemption: asyncPreemptionEnabled, - } - pl, err := New(ctx, getDefaultDefaultPreemptionArgs(), schedFramework, features) - if err != nil { - t.Fatal(err) - } - - // so that these nodes are eligible for preemption, we set their status - // to Unschedulable. - - nodeToStatusMap := framework.NewDefaultNodeToStatus() - for _, n := range nodes { - nodeToStatusMap.Set(n.Name, fwk.NewStatus(fwk.Unschedulable)) - } - - res, status := pl.Evaluator.Preempt(ctx, state, testPod, nodeToStatusMap) - if !status.IsSuccess() && !status.IsRejected() { - t.Errorf("unexpected error in preemption: %v", status.AsError()) - } - if diff := cmp.Diff(test.want, res); diff != "" { - t.Errorf("Unexpected status (-want, +got):\n%s", diff) - } - - if asyncPreemptionEnabled { - // Wait for the pod to be deleted. - if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { - mu.RLock() - defer mu.RUnlock() - return len(deletedPodNames) == len(test.expectedPods), nil - }); err != nil { - t.Errorf("expected %v pods to be deleted, got %v.", len(test.expectedPods), len(deletedPodNames)) - } - } else { - mu.RLock() - // If async preemption is disabled, the pod should be deleted immediately. 
- if len(deletedPodNames) != len(test.expectedPods) { - t.Errorf("expected %v pods to be deleted, got %v.", len(test.expectedPods), len(deletedPodNames)) - } - mu.RUnlock() - } - - mu.RLock() - if diff := cmp.Diff(sets.List(patchedPodNames), sets.List(deletedPodNames)); diff != "" { - t.Errorf("unexpected difference in the set of patched and deleted pods: %s", diff) - } - - // Make sure that the DisruptionTarget condition has been added to the pod status - for _, patchedPod := range patchedPods { - expectedPodCondition := &v1.PodCondition{ - Type: v1.DisruptionTarget, - Status: v1.ConditionTrue, - Reason: v1.PodReasonPreemptionByScheduler, - Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", patchedPod.Spec.SchedulerName), + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, test := range tests { + t.Run(fmt.Sprintf("%s (Async preemption enabled: %v, Async API calls enabled: %v)", test.name, asyncPreemptionEnabled, asyncAPICallsEnabled), func(t *testing.T) { + client := clientsetfake.NewClientset() + informerFactory := informers.NewSharedInformerFactory(client, 0) + podInformer := informerFactory.Core().V1().Pods().Informer() + testPod := test.pod.DeepCopy() + testPods := make([]*v1.Pod, len(test.pods)) + for i := range test.pods { + testPods[i] = test.pods[i].DeepCopy() } - _, condition := apipod.GetPodCondition(&patchedPod.Status, v1.DisruptionTarget) - if diff := cmp.Diff(condition, expectedPodCondition, cmpopts.IgnoreFields(v1.PodCondition{}, "LastTransitionTime")); diff != "" { - t.Fatalf("unexpected difference in the pod %q DisruptionTarget condition: %s", patchedPod.Name, diff) + if err := podInformer.GetStore().Add(testPod); err != nil { + t.Fatalf("Failed to add test pod %s: %v", testPod.Name, err) } - } - - for victimName := range deletedPodNames { - found := false - for _, expPod := range test.expectedPods { - if expPod == victimName { - found = true - break + for i := range testPods { + if err := podInformer.GetStore().Add(testPods[i]); err != nil { + t.Fatalf("Failed to add test pod %s: %v", testPods[i], err) } } - if !found { - t.Errorf("pod %v is not expected to be a victim.", victimName) - } - } - if res != nil && res.NominatingInfo != nil { - testPod.Status.NominatedNodeName = res.NominatedNodeName - } - // Manually set the deleted Pods' deletionTimestamp to non-nil. - for _, pod := range testPods { - if deletedPodNames.Has(pod.Name) { - now := metav1.Now() - pod.DeletionTimestamp = &now - deletedPodNames.Delete(pod.Name) - } - } - mu.RUnlock() + // Need to protect deletedPodNames and patchedPodNames to prevent DATA RACE panic. 
+ var mu sync.RWMutex + deletedPodNames := sets.New[string]() + patchedPodNames := sets.New[string]() + patchedPods := []*v1.Pod{} + client.PrependReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + patchAction := action.(clienttesting.PatchAction) + podName := patchAction.GetName() + namespace := patchAction.GetNamespace() + patch := patchAction.GetPatch() + pod, err := informerFactory.Core().V1().Pods().Lister().Pods(namespace).Get(podName) + if err != nil { + t.Fatalf("Failed to get the original pod %s/%s before patching: %v\n", namespace, podName, err) + } + marshalledPod, err := json.Marshal(pod) + if err != nil { + t.Fatalf("Failed to marshal the original pod %s/%s: %v", namespace, podName, err) + } + updated, err := strategicpatch.StrategicMergePatch(marshalledPod, patch, v1.Pod{}) + if err != nil { + t.Fatalf("Failed to apply strategic merge patch %q on pod %#v: %v", patch, marshalledPod, err) + } + updatedPod := &v1.Pod{} + if err := json.Unmarshal(updated, updatedPod); err != nil { + t.Fatalf("Failed to unmarshal updated pod %q: %v", updated, err) + } + patchedPods = append(patchedPods, updatedPod) + mu.Lock() + defer mu.Unlock() + patchedPodNames.Insert(podName) + return true, nil, nil + }) + client.PrependReactor("delete", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + mu.Lock() + defer mu.Unlock() + deletedPodNames.Insert(action.(clienttesting.DeleteAction).GetName()) + return true, nil, nil + }) - // Call preempt again and make sure it doesn't preempt any more pods. - res, status = pl.Evaluator.Preempt(ctx, state, testPod, framework.NewDefaultNodeToStatus()) - if !status.IsSuccess() && !status.IsRejected() { - t.Errorf("unexpected error in preemption: %v", status.AsError()) - } - if res != nil && res.NominatingInfo != nil && len(deletedPodNames) > 0 { - t.Errorf("didn't expect any more preemption. Node %v is selected for preemption.", res.NominatedNodeName) - } - }) + logger, ctx := ktesting.NewTestContext(t) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + waitingPods := frameworkruntime.NewWaitingPodsMap() + + var apiDispatcher *apidispatcher.APIDispatcher + if asyncAPICallsEnabled { + apiDispatcher = apidispatcher.New(client, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() + } + + cache := internalcache.New(ctx, time.Duration(0), apiDispatcher) + for _, pod := range testPods { + if err := cache.AddPod(logger, pod.DeepCopy()); err != nil { + t.Fatalf("Failed to add pod %s: %v", pod.Name, err) + } + } + cachedNodeInfoMap := map[string]*framework.NodeInfo{} + nodes := make([]*v1.Node, len(test.nodeNames)) + for i, name := range test.nodeNames { + node := st.MakeNode().Name(name).Capacity(veryLargeRes).Obj() + // Split node name by '/' to form labels in a format of + // {"hostname": node.Name[0], "zone": node.Name[1], "region": node.Name[2]} + node.Labels = make(map[string]string) + for i, label := range strings.Split(node.Name, "/") { + node.Labels[labelKeys[i]] = label + } + node.Name = node.Labels["hostname"] + t.Logf("node is added: %v. labels: %#v", node.Name, node.Labels) + cache.AddNode(logger, node) + nodes[i] = node + + // Set nodeInfo to extenders to mock extenders' cache for preemption. + cachedNodeInfo := framework.NewNodeInfo() + cachedNodeInfo.SetNode(node) + cachedNodeInfoMap[node.Name] = cachedNodeInfo + } + var extenders []framework.Extender + for _, extender := range test.extenders { + // Set nodeInfoMap as extenders cached node information. 
+ extender.CachedNodeNameToInfo = cachedNodeInfoMap + extenders = append(extenders, extender) + } + schedFramework, err := tf.NewFramework( + ctx, + []tf.RegisterPluginFunc{ + test.registerPlugin, + tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, + "", + frameworkruntime.WithClientSet(client), + frameworkruntime.WithAPIDispatcher(apiDispatcher), + frameworkruntime.WithEventRecorder(&events.FakeRecorder{}), + frameworkruntime.WithExtenders(extenders), + frameworkruntime.WithPodNominator(internalqueue.NewSchedulingQueue(nil, informerFactory)), + frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(testPods, nodes)), + frameworkruntime.WithInformerFactory(informerFactory), + frameworkruntime.WithWaitingPods(waitingPods), + frameworkruntime.WithLogger(logger), + frameworkruntime.WithPodActivator(&fakePodActivator{}), + ) + if err != nil { + t.Fatal(err) + } + if asyncAPICallsEnabled { + schedFramework.SetAPICacher(apicache.New(nil, cache)) + } + + state := framework.NewCycleState() + // Some tests rely on PreFilter plugin to compute its CycleState. + if _, s, _ := schedFramework.RunPreFilterPlugins(ctx, state, testPod); !s.IsSuccess() { + t.Errorf("Unexpected preFilterStatus: %v", s) + } + // Call preempt and check the expected results. + features := feature.Features{ + EnableAsyncPreemption: asyncPreemptionEnabled, + } + pl, err := New(ctx, getDefaultDefaultPreemptionArgs(), schedFramework, features) + if err != nil { + t.Fatal(err) + } + + // so that these nodes are eligible for preemption, we set their status + // to Unschedulable. + + nodeToStatusMap := framework.NewDefaultNodeToStatus() + for _, n := range nodes { + nodeToStatusMap.Set(n.Name, fwk.NewStatus(fwk.Unschedulable)) + } + + res, status := pl.Evaluator.Preempt(ctx, state, testPod, nodeToStatusMap) + if !status.IsSuccess() && !status.IsRejected() { + t.Errorf("unexpected error in preemption: %v", status.AsError()) + } + if diff := cmp.Diff(test.want, res); diff != "" { + t.Errorf("Unexpected status (-want, +got):\n%s", diff) + } + + if asyncPreemptionEnabled { + // Wait for the pod to be deleted. + if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { + mu.RLock() + defer mu.RUnlock() + return len(deletedPodNames) == len(test.expectedPods), nil + }); err != nil { + t.Errorf("expected %v pods to be deleted, got %v.", len(test.expectedPods), len(deletedPodNames)) + } + } else { + mu.RLock() + // If async preemption is disabled, the pod should be deleted immediately. 
+ if len(deletedPodNames) != len(test.expectedPods) { + t.Errorf("expected %v pods to be deleted, got %v.", len(test.expectedPods), len(deletedPodNames)) + } + mu.RUnlock() + } + + mu.RLock() + if diff := cmp.Diff(sets.List(patchedPodNames), sets.List(deletedPodNames)); diff != "" { + t.Errorf("unexpected difference in the set of patched and deleted pods: %s", diff) + } + + // Make sure that the DisruptionTarget condition has been added to the pod status + for _, patchedPod := range patchedPods { + expectedPodCondition := &v1.PodCondition{ + Type: v1.DisruptionTarget, + Status: v1.ConditionTrue, + Reason: v1.PodReasonPreemptionByScheduler, + Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", patchedPod.Spec.SchedulerName), + } + + _, condition := apipod.GetPodCondition(&patchedPod.Status, v1.DisruptionTarget) + if diff := cmp.Diff(condition, expectedPodCondition, cmpopts.IgnoreFields(v1.PodCondition{}, "LastTransitionTime")); diff != "" { + t.Fatalf("unexpected difference in the pod %q DisruptionTarget condition: %s", patchedPod.Name, diff) + } + } + + for victimName := range deletedPodNames { + found := false + for _, expPod := range test.expectedPods { + if expPod == victimName { + found = true + break + } + } + if !found { + t.Errorf("pod %v is not expected to be a victim.", victimName) + } + } + if res != nil && res.NominatingInfo != nil { + testPod.Status.NominatedNodeName = res.NominatedNodeName + } + + // Manually set the deleted Pods' deletionTimestamp to non-nil. + for _, pod := range testPods { + if deletedPodNames.Has(pod.Name) { + now := metav1.Now() + pod.DeletionTimestamp = &now + deletedPodNames.Delete(pod.Name) + } + } + mu.RUnlock() + + // Call preempt again and make sure it doesn't preempt any more pods. + res, status = pl.Evaluator.Preempt(ctx, state, testPod, framework.NewDefaultNodeToStatus()) + if !status.IsSuccess() && !status.IsRejected() { + t.Errorf("unexpected error in preemption: %v", status.AsError()) + } + if res != nil && res.NominatingInfo != nil && len(deletedPodNames) > 0 { + t.Errorf("didn't expect any more preemption. Node %v is selected for preemption.", res.NominatedNodeName) + } + }) + } } } } diff --git a/pkg/scheduler/framework/preemption/preemption.go b/pkg/scheduler/framework/preemption/preemption.go index 2476c83e333..ee98910ef99 100644 --- a/pkg/scheduler/framework/preemption/preemption.go +++ b/pkg/scheduler/framework/preemption/preemption.go @@ -33,6 +33,7 @@ import ( utilerrors "k8s.io/apimachinery/pkg/util/errors" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/sets" + clientset "k8s.io/client-go/kubernetes" corelisters "k8s.io/client-go/listers/core/v1" policylisters "k8s.io/client-go/listers/policy/v1" corev1helpers "k8s.io/component-helpers/scheduling/corev1" @@ -182,7 +183,7 @@ func NewEvaluator(pluginName string, fh framework.Handle, i Interface, enableAsy newStatus := victim.Status.DeepCopy() updated := apipod.UpdatePodCondition(newStatus, condition) if updated { - if err := util.PatchPodStatus(ctx, ev.Handler.ClientSet(), victim, newStatus); err != nil { + if err := util.PatchPodStatus(ctx, ev.Handler.ClientSet(), victim.Name, victim.Namespace, &victim.Status, newStatus); err != nil { logger.Error(err, "Could not add DisruptionTarget condition due to preemption", "pod", klog.KObj(victim), "preemptor", klog.KObj(preemptor)) return err } @@ -450,7 +451,7 @@ func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1. 
// nomination updates these pods and moves them to the active queue. It // lets scheduler find another place for them. nominatedPods := getLowerPriorityNominatedPods(logger, fh, pod, c.Name()) - if err := util.ClearNominatedNodeName(ctx, cs, nominatedPods...); err != nil { + if err := clearNominatedNodeName(ctx, cs, ev.Handler.APICacher(), nominatedPods...); err != nil { utilruntime.HandleErrorWithContext(ctx, err, "Cannot clear 'NominatedNodeName' field") // We do not return as this error is not critical. } @@ -458,6 +459,31 @@ func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1. return nil } +// clearNominatedNodeName internally submits a patch request to the API server +// to set each pods[*].Status.NominatedNodeName to "". +func clearNominatedNodeName(ctx context.Context, cs clientset.Interface, apiCacher framework.APICacher, pods ...*v1.Pod) utilerrors.Aggregate { + var errs []error + for _, p := range pods { + if apiCacher != nil { + // When API cacher is available, use it to clear the NominatedNodeName. + _, err := apiCacher.PatchPodStatus(p, nil, &framework.NominatingInfo{NominatedNodeName: "", NominatingMode: framework.ModeOverride}) + if err != nil { + errs = append(errs, err) + } + } else { + if len(p.Status.NominatedNodeName) == 0 { + continue + } + podStatusCopy := p.Status.DeepCopy() + podStatusCopy.NominatedNodeName = "" + if err := util.PatchPodStatus(ctx, cs, p.Name, p.Namespace, &p.Status, podStatusCopy); err != nil { + errs = append(errs, err) + } + } + } + return utilerrors.NewAggregate(errs) +} + // prepareCandidateAsync triggers a goroutine for some preparation work: // - Evict the victim pods // - Reject the victim pods if they are in waitingPod map @@ -504,7 +530,7 @@ func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName // nomination updates these pods and moves them to the active queue. It // lets scheduler find another place for them. nominatedPods := getLowerPriorityNominatedPods(logger, ev.Handler, pod, c.Name()) - if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), nominatedPods...); err != nil { + if err := clearNominatedNodeName(ctx, ev.Handler.ClientSet(), ev.Handler.APICacher(), nominatedPods...); err != nil { utilruntime.HandleErrorWithContext(ctx, err, "Cannot clear 'NominatedNodeName' field from lower priority pods on the same target node", "node", c.Name()) result = metrics.GoroutineResultError // We do not return as this error is not critical.
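The test changes below repeat one wiring pattern whenever asyncAPICallsEnabled is true: build an APIDispatcher around the fake clientset, pass that dispatcher to both the framework and the internal cache, and install an APICacher on the framework. The following condensed sketch of that pattern is illustrative only; the helper name, the package clause, and the zero cache TTL are assumptions, while every constructor and option it calls appears in the test hunks of this patch.

package sketch

import (
	"context"
	"testing"

	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	apicache "k8s.io/kubernetes/pkg/scheduler/backend/api_cache"
	apidispatcher "k8s.io/kubernetes/pkg/scheduler/backend/api_dispatcher"
	internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	apicalls "k8s.io/kubernetes/pkg/scheduler/framework/api_calls"
	frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime"
)

// newFrameworkWithAsyncAPICalls is a hypothetical helper that condenses the
// async-API-calls wiring used by the tests in this patch.
func newFrameworkWithAsyncAPICalls(ctx context.Context, t *testing.T, logger klog.Logger, client clientset.Interface) framework.Framework {
	// Dispatcher that executes the queued API calls against the (fake) client;
	// 16 matches the value the tests pass.
	apiDispatcher := apidispatcher.New(client, 16, apicalls.Relevances)
	apiDispatcher.Run(logger)
	t.Cleanup(func() { apiDispatcher.Close() })

	// The framework is given the dispatcher so plugins can reach it through the handle.
	fh, err := frameworkruntime.NewFramework(ctx, nil, nil,
		frameworkruntime.WithClientSet(client),
		frameworkruntime.WithAPIDispatcher(apiDispatcher))
	if err != nil {
		t.Fatal(err)
	}

	// Hand the same dispatcher to the internal cache so bind calls can be routed
	// through it, then expose the cache-backed call path via an APICacher.
	cache := internalcache.New(ctx, 0, apiDispatcher)
	fh.SetAPICacher(apicache.New(nil, cache))
	return fh
}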
diff --git a/pkg/scheduler/framework/preemption/preemption_test.go b/pkg/scheduler/framework/preemption/preemption_test.go index 3c6bfc980a1..e8b0a80ecad 100644 --- a/pkg/scheduler/framework/preemption/preemption_test.go +++ b/pkg/scheduler/framework/preemption/preemption_test.go @@ -30,6 +30,7 @@ import ( v1 "k8s.io/api/core/v1" policy "k8s.io/api/policy/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" @@ -43,9 +44,12 @@ import ( "k8s.io/klog/v2/ktesting" extenderv1 "k8s.io/kube-scheduler/extender/v1" fwk "k8s.io/kube-scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/backend/api_cache" + "k8s.io/kubernetes/pkg/scheduler/backend/api_dispatcher" internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache" internalqueue "k8s.io/kubernetes/pkg/scheduler/backend/queue" "k8s.io/kubernetes/pkg/scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/framework/api_calls" "k8s.io/kubernetes/pkg/scheduler/framework/parallelize" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort" @@ -442,6 +446,9 @@ func TestPrepareCandidate(t *testing.T) { SchedulerName(defaultSchedulerName).Priority(highPriority). Containers([]v1.Container{st.MakeContainer().Name("container1").Obj()}). Obj() + + errDeletePodFailed = errors.New("delete pod failed") + errPatchStatusFailed = errors.New("patch pod status failed") ) tests := []struct { @@ -549,7 +556,7 @@ func TestPrepareCandidate(t *testing.T) { testPods: []*v1.Pod{}, expectedDeletionError: true, nodeNames: []string{node1Name}, - expectedStatus: fwk.AsStatus(errors.New("delete pod failed")), + expectedStatus: fwk.AsStatus(errDeletePodFailed), expectedPreemptingMap: sets.New(types.UID("preemptor")), expectedActivatedPods: map[string]*v1.Pod{preemptor.Name: preemptor}, }, @@ -586,7 +593,7 @@ func TestPrepareCandidate(t *testing.T) { testPods: []*v1.Pod{}, expectedPatchError: true, nodeNames: []string{node1Name}, - expectedStatus: fwk.AsStatus(errors.New("patch pod status failed")), + expectedStatus: fwk.AsStatus(errPatchStatusFailed), expectedPreemptingMap: sets.New(types.UID("preemptor")), expectedActivatedPods: map[string]*v1.Pod{preemptor.Name: preemptor}, }, @@ -614,192 +621,203 @@ func TestPrepareCandidate(t *testing.T) { // which results in the second victim not being deleted. 
"", }, - expectedStatus: fwk.AsStatus(errors.New("patch pod status failed")), + expectedStatus: fwk.AsStatus(errPatchStatusFailed), expectedPreemptingMap: sets.New(types.UID("preemptor")), expectedActivatedPods: map[string]*v1.Pod{preemptor.Name: preemptor}, }, } for _, asyncPreemptionEnabled := range []bool{true, false} { - for _, tt := range tests { - t.Run(fmt.Sprintf("%v (Async preemption enabled: %v)", tt.name, asyncPreemptionEnabled), func(t *testing.T) { - metrics.Register() - logger, ctx := ktesting.NewTestContext(t) - ctx, cancel := context.WithCancel(ctx) - defer cancel() + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, tt := range tests { + t.Run(fmt.Sprintf("%v (Async preemption enabled: %v, Async API calls enabled: %v)", tt.name, asyncPreemptionEnabled, asyncAPICallsEnabled), func(t *testing.T) { + metrics.Register() + logger, ctx := ktesting.NewTestContext(t) + ctx, cancel := context.WithCancel(ctx) + defer cancel() - nodes := make([]*v1.Node, len(tt.nodeNames)) - for i, nodeName := range tt.nodeNames { - nodes[i] = st.MakeNode().Name(nodeName).Capacity(veryLargeRes).Obj() - } - registeredPlugins := append([]tf.RegisterPluginFunc{ - tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New)}, - tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - ) - var objs []runtime.Object - for _, pod := range tt.testPods { - objs = append(objs, pod) - } - - mu := &sync.RWMutex{} - deletedPods := sets.New[string]() - deletionFailure := false // whether any request to delete pod failed - patchFailure := false // whether any request to patch pod status failed - - cs := clientsetfake.NewClientset(objs...) - cs.PrependReactor("delete", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - mu.Lock() - defer mu.Unlock() - name := action.(clienttesting.DeleteAction).GetName() - if name == "fail-victim" { - deletionFailure = true - return true, nil, fmt.Errorf("delete pod failed") + nodes := make([]*v1.Node, len(tt.nodeNames)) + for i, nodeName := range tt.nodeNames { + nodes[i] = st.MakeNode().Name(nodeName).Capacity(veryLargeRes).Obj() + } + registeredPlugins := append([]tf.RegisterPluginFunc{ + tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New)}, + tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + ) + var objs []runtime.Object + for _, pod := range tt.testPods { + objs = append(objs, pod) } - deletedPods.Insert(name) - return true, nil, nil - }) + mu := &sync.RWMutex{} + deletedPods := sets.New[string]() + deletionFailure := false // whether any request to delete pod failed + patchFailure := false // whether any request to patch pod status failed - cs.PrependReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - mu.Lock() - defer mu.Unlock() - if action.(clienttesting.PatchAction).GetName() == "fail-victim" { - patchFailure = true - return true, nil, fmt.Errorf("patch pod status failed") - } - return true, nil, nil - }) - - informerFactory := informers.NewSharedInformerFactory(cs, 0) - eventBroadcaster := events.NewBroadcaster(&events.EventSinkImpl{Interface: cs.EventsV1()}) - fakeActivator := &fakePodActivator{activatedPods: make(map[string]*v1.Pod), mu: mu} - - // Note: NominatedPodsForNode is called at the beginning of the goroutine in any case. - // fakePodNominator can delay the response of NominatedPodsForNode until the channel is closed, - // which allows us to test the preempting map before the goroutine does nothing yet. 
- requestStopper := make(chan struct{}) - nominator := &fakePodNominator{ - SchedulingQueue: internalqueue.NewSchedulingQueue(nil, informerFactory), - requestStopper: requestStopper, - } - fwk, err := tf.NewFramework( - ctx, - registeredPlugins, "", - frameworkruntime.WithClientSet(cs), - frameworkruntime.WithLogger(logger), - frameworkruntime.WithInformerFactory(informerFactory), - frameworkruntime.WithWaitingPods(frameworkruntime.NewWaitingPodsMap()), - frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(tt.testPods, nodes)), - frameworkruntime.WithPodNominator(nominator), - frameworkruntime.WithEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, "test-scheduler")), - frameworkruntime.WithPodActivator(fakeActivator), - ) - if err != nil { - t.Fatal(err) - } - informerFactory.Start(ctx.Done()) - informerFactory.WaitForCacheSync(ctx.Done()) - fakePreemptionScorePostFilterPlugin := &FakePreemptionScorePostFilterPlugin{} - pe := NewEvaluator("FakePreemptionScorePostFilter", fwk, fakePreemptionScorePostFilterPlugin, asyncPreemptionEnabled) - - if asyncPreemptionEnabled { - pe.prepareCandidateAsync(tt.candidate, tt.preemptor, "test-plugin") - pe.mu.Lock() - // The preempting map should be registered synchronously - // so we don't need wait.Poll. - if !tt.expectedPreemptingMap.Equal(pe.preempting) { - t.Errorf("expected preempting map %v, got %v", tt.expectedPreemptingMap, pe.preempting) - close(requestStopper) - pe.mu.Unlock() - return - } - pe.mu.Unlock() - // make the requests complete - close(requestStopper) - } else { - close(requestStopper) // no need to stop requests - status := pe.prepareCandidate(ctx, tt.candidate, tt.preemptor, "test-plugin") - if tt.expectedStatus == nil { - if status != nil { - t.Errorf("expect nil status, but got %v", status) + cs := clientsetfake.NewClientset(objs...) + cs.PrependReactor("delete", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + mu.Lock() + defer mu.Unlock() + name := action.(clienttesting.DeleteAction).GetName() + if name == "fail-victim" { + deletionFailure = true + return true, nil, errDeletePodFailed } - } else { - if status == nil { - t.Errorf("expect status %v, but got nil", tt.expectedStatus) - } else if status.Code() != tt.expectedStatus.Code() { - t.Errorf("expect status code %v, but got %v", tt.expectedStatus.Code(), status.Code()) - } else if status.Message() != tt.expectedStatus.Message() { - t.Errorf("expect status message %v, but got %v", tt.expectedStatus.Message(), status.Message()) + + deletedPods.Insert(name) + return true, nil, nil + }) + + cs.PrependReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + mu.Lock() + defer mu.Unlock() + if action.(clienttesting.PatchAction).GetName() == "fail-victim" { + patchFailure = true + return true, nil, errPatchStatusFailed } + return true, nil, nil + }) + + informerFactory := informers.NewSharedInformerFactory(cs, 0) + eventBroadcaster := events.NewBroadcaster(&events.EventSinkImpl{Interface: cs.EventsV1()}) + fakeActivator := &fakePodActivator{activatedPods: make(map[string]*v1.Pod), mu: mu} + + // Note: NominatedPodsForNode is called at the beginning of the goroutine in any case. + // fakePodNominator can delay the response of NominatedPodsForNode until the channel is closed, + // which allows us to test the preempting map before the goroutine does nothing yet. 
+ requestStopper := make(chan struct{}) + nominator := &fakePodNominator{ + SchedulingQueue: internalqueue.NewSchedulingQueue(nil, informerFactory), + requestStopper: requestStopper, } - } - - var lastErrMsg string - if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { - mu.RLock() - defer mu.RUnlock() - - pe.mu.Lock() - defer pe.mu.Unlock() - if len(pe.preempting) != 0 { - // The preempting map should be empty after the goroutine in all test cases. - lastErrMsg = fmt.Sprintf("expected no preempting pods, got %v", pe.preempting) - return false, nil + var apiDispatcher *apidispatcher.APIDispatcher + if asyncAPICallsEnabled { + apiDispatcher = apidispatcher.New(cs, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() } - if tt.expectedDeletionError != deletionFailure { - lastErrMsg = fmt.Sprintf("expected deletion error %v, got %v", tt.expectedDeletionError, deletionFailure) - return false, nil + fwk, err := tf.NewFramework( + ctx, + registeredPlugins, "", + frameworkruntime.WithClientSet(cs), + frameworkruntime.WithAPIDispatcher(apiDispatcher), + frameworkruntime.WithLogger(logger), + frameworkruntime.WithInformerFactory(informerFactory), + frameworkruntime.WithWaitingPods(frameworkruntime.NewWaitingPodsMap()), + frameworkruntime.WithSnapshotSharedLister(internalcache.NewSnapshot(tt.testPods, nodes)), + frameworkruntime.WithPodNominator(nominator), + frameworkruntime.WithEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, "test-scheduler")), + frameworkruntime.WithPodActivator(fakeActivator), + ) + if err != nil { + t.Fatal(err) } - if tt.expectedPatchError != patchFailure { - lastErrMsg = fmt.Sprintf("expected patch error %v, got %v", tt.expectedPatchError, patchFailure) - return false, nil + informerFactory.Start(ctx.Done()) + informerFactory.WaitForCacheSync(ctx.Done()) + fakePreemptionScorePostFilterPlugin := &FakePreemptionScorePostFilterPlugin{} + if asyncAPICallsEnabled { + cache := internalcache.New(ctx, 100*time.Millisecond, apiDispatcher) + fwk.SetAPICacher(apicache.New(nil, cache)) } + pe := NewEvaluator("FakePreemptionScorePostFilter", fwk, fakePreemptionScorePostFilterPlugin, asyncPreemptionEnabled) + if asyncPreemptionEnabled { - if diff := cmp.Diff(tt.expectedActivatedPods, fakeActivator.activatedPods); tt.expectedActivatedPods != nil && diff != "" { - lastErrMsg = fmt.Sprintf("Unexpected activated pods (-want,+got):\n%s", diff) - return false, nil + pe.prepareCandidateAsync(tt.candidate, tt.preemptor, "test-plugin") + pe.mu.Lock() + // The preempting map should be registered synchronously + // so we don't need wait.Poll. 
+ if !tt.expectedPreemptingMap.Equal(pe.preempting) { + t.Errorf("expected preempting map %v, got %v", tt.expectedPreemptingMap, pe.preempting) + close(requestStopper) + pe.mu.Unlock() + return } - if tt.expectedActivatedPods == nil && len(fakeActivator.activatedPods) != 0 { - lastErrMsg = fmt.Sprintf("expected no activated pods, got %v", fakeActivator.activatedPods) - return false, nil + pe.mu.Unlock() + // make the requests complete + close(requestStopper) + } else { + close(requestStopper) // no need to stop requests + status := pe.prepareCandidate(ctx, tt.candidate, tt.preemptor, "test-plugin") + if tt.expectedStatus == nil { + if status != nil { + t.Errorf("expect nil status, but got %v", status) + } + } else { + if !cmp.Equal(status, tt.expectedStatus) { + t.Errorf("expect status %v, but got %v", tt.expectedStatus, status) + } } } - if deletedPods.Len() > 1 { - // For now, we only expect at most one pod to be deleted in all test cases. - // If we need to test multiple pods deletion, we need to update the test table definition. - return false, fmt.Errorf("expected at most one pod to be deleted, got %v", deletedPods.UnsortedList()) - } + var lastErrMsg string + if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { + mu.RLock() + defer mu.RUnlock() - if len(tt.expectedDeletedPod) == 0 { - if deletedPods.Len() != 0 { - // When tt.expectedDeletedPod is empty, we expect no pod to be deleted. - return false, fmt.Errorf("expected no pod to be deleted, got %v", deletedPods.UnsortedList()) + pe.mu.Lock() + defer pe.mu.Unlock() + if len(pe.preempting) != 0 { + // The preempting map should be empty after the goroutine in all test cases. + lastErrMsg = fmt.Sprintf("expected no preempting pods, got %v", pe.preempting) + return false, nil } - // nothing further to check. + + if tt.expectedDeletionError != deletionFailure { + lastErrMsg = fmt.Sprintf("expected deletion error %v, got %v", tt.expectedDeletionError, deletionFailure) + return false, nil + } + if tt.expectedPatchError != patchFailure { + lastErrMsg = fmt.Sprintf("expected patch error %v, got %v", tt.expectedPatchError, patchFailure) + return false, nil + } + + if asyncPreemptionEnabled { + if diff := cmp.Diff(tt.expectedActivatedPods, fakeActivator.activatedPods); tt.expectedActivatedPods != nil && diff != "" { + lastErrMsg = fmt.Sprintf("Unexpected activated pods (-want,+got):\n%s", diff) + return false, nil + } + if tt.expectedActivatedPods == nil && len(fakeActivator.activatedPods) != 0 { + lastErrMsg = fmt.Sprintf("expected no activated pods, got %v", fakeActivator.activatedPods) + return false, nil + } + } + + if deletedPods.Len() > 1 { + // For now, we only expect at most one pod to be deleted in all test cases. + // If we need to test multiple pods deletion, we need to update the test table definition. + return false, fmt.Errorf("expected at most one pod to be deleted, got %v", deletedPods.UnsortedList()) + } + + if len(tt.expectedDeletedPod) == 0 { + if deletedPods.Len() != 0 { + // When tt.expectedDeletedPod is empty, we expect no pod to be deleted. + return false, fmt.Errorf("expected no pod to be deleted, got %v", deletedPods.UnsortedList()) + } + // nothing further to check. + return true, nil + } + + found := false + for _, podName := range tt.expectedDeletedPod { + if deletedPods.Has(podName) || + // If podName is empty, we expect no pod to be deleted. 
+ (deletedPods.Len() == 0 && podName == "") { + found = true + } + } + if !found { + lastErrMsg = fmt.Sprintf("expected pod %v to be deleted, but %v is deleted", strings.Join(tt.expectedDeletedPod, " or "), deletedPods.UnsortedList()) + return false, nil + } + return true, nil + }); err != nil { + t.Fatal(lastErrMsg) } - - found := false - for _, podName := range tt.expectedDeletedPod { - if deletedPods.Has(podName) || - // If podName is empty, we expect no pod to be deleted. - (deletedPods.Len() == 0 && podName == "") { - found = true - } - } - if !found { - lastErrMsg = fmt.Sprintf("expected pod %v to be deleted, but %v is deleted", strings.Join(tt.expectedDeletedPod, " or "), deletedPods.UnsortedList()) - return false, nil - } - - return true, nil - }); err != nil { - t.Fatal(lastErrMsg) - } - }) + }) + } } } } @@ -1032,10 +1050,15 @@ func TestCallExtenders(t *testing.T) { objs = append(objs, preemptor) cs := clientsetfake.NewClientset(objs...) informerFactory := informers.NewSharedInformerFactory(cs, 0) + apiDispatcher := apidispatcher.New(cs, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() + fwk, err := tf.NewFramework( ctx, registeredPlugins, "", frameworkruntime.WithClientSet(cs), + frameworkruntime.WithAPIDispatcher(apiDispatcher), frameworkruntime.WithLogger(logger), frameworkruntime.WithExtenders(tt.extenders), frameworkruntime.WithInformerFactory(informerFactory), @@ -1047,6 +1070,8 @@ func TestCallExtenders(t *testing.T) { } informerFactory.Start(ctx.Done()) informerFactory.WaitForCacheSync(ctx.Done()) + cache := internalcache.New(ctx, 100*time.Millisecond, apiDispatcher) + fwk.SetAPICacher(apicache.New(nil, cache)) fakePreemptionScorePostFilterPlugin := &FakePreemptionScorePostFilterPlugin{} pe := Evaluator{ @@ -1075,3 +1100,85 @@ func TestCallExtenders(t *testing.T) { }) } } + +func TestRemoveNominatedNodeName(t *testing.T) { + tests := []struct { + name string + currentNominatedNodeName string + newNominatedNodeName string + expectPatchRequest bool + expectedPatchData string + }{ + { + name: "Should make patch request to clear node name", + currentNominatedNodeName: "node1", + expectPatchRequest: true, + expectedPatchData: `{"status":{"nominatedNodeName":null}}`, + }, + { + name: "Should not make patch request if nominated node is already cleared", + currentNominatedNodeName: "", + expectPatchRequest: false, + }, + } + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + logger, ctx := ktesting.NewTestContext(t) + actualPatchRequests := 0 + var actualPatchData string + cs := &clientsetfake.Clientset{} + patchCalled := make(chan struct{}, 1) + cs.AddReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + actualPatchRequests++ + patch := action.(clienttesting.PatchAction) + actualPatchData = string(patch.GetPatch()) + patchCalled <- struct{}{} + // For this test, we don't care about the result of the patched pod, just that we got the expected + // patch request, so just returning &v1.Pod{} here is OK because scheduler doesn't use the response. 
+ return true, &v1.Pod{}, nil + }) + + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "foo"}, + Status: v1.PodStatus{NominatedNodeName: test.currentNominatedNodeName}, + } + + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + var apiCacher framework.APICacher + if asyncAPICallsEnabled { + apiDispatcher := apidispatcher.New(cs, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() + + informerFactory := informers.NewSharedInformerFactory(cs, 0) + queue := internalqueue.NewSchedulingQueue(nil, informerFactory, internalqueue.WithAPIDispatcher(apiDispatcher)) + apiCacher = apicache.New(queue, nil) + } + + if err := clearNominatedNodeName(ctx, cs, apiCacher, pod); err != nil { + t.Fatalf("Error calling removeNominatedNodeName: %v", err) + } + + if test.expectPatchRequest { + select { + case <-patchCalled: + case <-time.After(time.Second): + t.Fatalf("Timed out while waiting for patch to be called") + } + if actualPatchData != test.expectedPatchData { + t.Fatalf("Patch data mismatch: Actual was %v, but expected %v", actualPatchData, test.expectedPatchData) + } + } else { + select { + case <-patchCalled: + t.Fatalf("Expected patch not to be called, actual patch data: %v", actualPatchData) + case <-time.After(time.Second): + } + } + }) + } + } +} diff --git a/pkg/scheduler/framework/runtime/framework.go b/pkg/scheduler/framework/runtime/framework.go index 493868d8c79..56a6b8303eb 100644 --- a/pkg/scheduler/framework/runtime/framework.go +++ b/pkg/scheduler/framework/runtime/framework.go @@ -37,6 +37,7 @@ import ( "k8s.io/klog/v2" fwk "k8s.io/kube-scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/apis/config" + "k8s.io/kubernetes/pkg/scheduler/backend/api_dispatcher" "k8s.io/kubernetes/pkg/scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework/parallelize" "k8s.io/kubernetes/pkg/scheduler/metrics" @@ -86,6 +87,8 @@ type frameworkImpl struct { extenders []framework.Extender framework.PodNominator framework.PodActivator + apiDispatcher *apidispatcher.APIDispatcher + apiCacher framework.APICacher parallelizer parallelize.Parallelizer } @@ -138,6 +141,7 @@ type frameworkOptions struct { captureProfile CaptureProfile parallelizer parallelize.Parallelizer waitingPods *waitingPodsMap + apiDispatcher *apidispatcher.APIDispatcher logger *klog.Logger } @@ -223,6 +227,13 @@ func WithParallelism(parallelism int) Option { } } +// WithAPIDispatcher sets API dispatcher for the scheduling frameworkImpl. +func WithAPIDispatcher(apiDispatcher *apidispatcher.APIDispatcher) Option { + return func(o *frameworkOptions) { + o.apiDispatcher = apiDispatcher + } +} + // CaptureProfile is a callback to capture a finalized profile. type CaptureProfile func(config.KubeSchedulerProfile) @@ -289,6 +300,7 @@ func NewFramework(ctx context.Context, r Registry, profile *config.KubeScheduler extenders: options.extenders, PodNominator: options.podNominator, PodActivator: options.podActivator, + apiDispatcher: options.apiDispatcher, parallelizer: options.parallelizer, logger: logger, } @@ -441,6 +453,10 @@ func (f *frameworkImpl) SetPodActivator(a framework.PodActivator) { f.PodActivator = a } +func (f *frameworkImpl) SetAPICacher(c framework.APICacher) { + f.apiCacher = c +} + // Close closes each plugin, when they implement io.Closer interface. 
func (f *frameworkImpl) Close() error { var errs []error @@ -1679,3 +1695,22 @@ func (f *frameworkImpl) PercentageOfNodesToScore() *int32 { func (f *frameworkImpl) Parallelizer() parallelize.Parallelizer { return f.parallelizer } + +// APIDispatcher returns an apiDispatcher that can be used to dispatch API calls. +// This requires the SchedulerAsyncAPICalls feature gate to be enabled. +func (f *frameworkImpl) APIDispatcher() fwk.APIDispatcher { + if f.apiDispatcher == nil { + return nil + } + return f.apiDispatcher +} + +// APICacher returns an apiCacher that can be used to dispatch API calls through the scheduler's cache +// instead of directly using APIDispatcher(). +// This requires the SchedulerAsyncAPICalls feature gate to be enabled. +func (f *frameworkImpl) APICacher() framework.APICacher { + if f.apiCacher == nil { + return nil + } + return f.apiCacher +} diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go index 0689fcbb0ab..b094ba5f5f6 100644 --- a/pkg/scheduler/schedule_one.go +++ b/pkg/scheduler/schedule_one.go @@ -1090,7 +1090,7 @@ func (sched *Scheduler) handleSchedulingFailure(ctx context.Context, fwk framewo msg := truncateMessage(errMsg) fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", msg) - if err := updatePod(ctx, sched.client, pod, &v1.PodCondition{ + if err := updatePod(ctx, sched.client, fwk.APICacher(), pod, &v1.PodCondition{ Type: v1.PodScheduled, ObservedGeneration: podutil.CalculatePodConditionObservedGeneration(&pod.Status, pod.Generation, v1.PodScheduled), Status: v1.ConditionFalse, @@ -1111,7 +1111,12 @@ func truncateMessage(message string) string { return message[:max-len(suffix)] + suffix } -func updatePod(ctx context.Context, client clientset.Interface, pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) error { +func updatePod(ctx context.Context, client clientset.Interface, apiCacher framework.APICacher, pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) error { + if apiCacher != nil { + // When API cacher is available, use it to patch the status.
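+ // The first return value (the call's completion channel) is intentionally discarded here; the caller does not wait for the asynchronous patch to finish.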
+ _, err := apiCacher.PatchPodStatus(pod, condition, nominatingInfo) + return err + } logger := klog.FromContext(ctx) logger.V(3).Info("Updating pod condition", "pod", klog.KObj(pod), "conditionType", condition.Type, "conditionStatus", condition.Status, "conditionReason", condition.Reason) podStatusCopy := pod.Status.DeepCopy() @@ -1124,5 +1129,5 @@ func updatePod(ctx context.Context, client clientset.Interface, pod *v1.Pod, con if nnnNeedsUpdate { podStatusCopy.NominatedNodeName = nominatingInfo.NominatedNodeName } - return util.PatchPodStatus(ctx, client, pod, podStatusCopy) + return util.PatchPodStatus(ctx, client, pod.Name, pod.Namespace, &pod.Status, podStatusCopy) } diff --git a/pkg/scheduler/schedule_one_test.go b/pkg/scheduler/schedule_one_test.go index 2c94b0947ad..21a513621d5 100644 --- a/pkg/scheduler/schedule_one_test.go +++ b/pkg/scheduler/schedule_one_test.go @@ -43,6 +43,7 @@ import ( "k8s.io/apimachinery/pkg/util/wait" utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/informers" + clientset "k8s.io/client-go/kubernetes" clientsetfake "k8s.io/client-go/kubernetes/fake" "k8s.io/client-go/kubernetes/scheme" clienttesting "k8s.io/client-go/testing" @@ -56,10 +57,13 @@ import ( fwk "k8s.io/kube-scheduler/framework" "k8s.io/kubernetes/pkg/features" schedulerapi "k8s.io/kubernetes/pkg/scheduler/apis/config" + "k8s.io/kubernetes/pkg/scheduler/backend/api_cache" + "k8s.io/kubernetes/pkg/scheduler/backend/api_dispatcher" internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache" fakecache "k8s.io/kubernetes/pkg/scheduler/backend/cache/fake" internalqueue "k8s.io/kubernetes/pkg/scheduler/backend/queue" "k8s.io/kubernetes/pkg/scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/framework/api_calls" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/imagelocality" @@ -821,140 +825,159 @@ func TestSchedulerScheduleOne(t *testing.T) { } for _, qHintEnabled := range []bool{true, false} { - for _, item := range table { - t.Run(fmt.Sprintf("[QueueingHint: %v] %s", qHintEnabled, item.name), func(t *testing.T) { - if !qHintEnabled { - featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, utilfeature.DefaultFeatureGate, version.MustParse("1.33")) - featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerQueueingHints, false) - } - logger, ctx := ktesting.NewTestContext(t) - var gotError error - var gotPod *v1.Pod - var gotForgetPod *v1.Pod - var gotAssumedPod *v1.Pod - var gotBinding *v1.Binding - cache := &fakecache.Cache{ - ForgetFunc: func(pod *v1.Pod) { - gotForgetPod = pod - }, - AssumeFunc: func(pod *v1.Pod) { - gotAssumedPod = pod - }, - IsAssumedPodFunc: func(pod *v1.Pod) bool { - if pod == nil || gotAssumedPod == nil { - return false + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, item := range table { + t.Run(fmt.Sprintf("%s (Queueing hints enabled: %v, Async API calls enabled: %v)", item.name, qHintEnabled, asyncAPICallsEnabled), func(t *testing.T) { + if !qHintEnabled { + featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, utilfeature.DefaultFeatureGate, version.MustParse("1.33")) + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerQueueingHints, false) + } + logger, ctx := ktesting.NewTestContext(t) + var gotError error + var gotPod *v1.Pod + var gotForgetPod *v1.Pod + var gotAssumedPod *v1.Pod + var 
gotBinding *v1.Binding + + client := clientsetfake.NewClientset(item.sendPod) + client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + if action.GetSubresource() != "binding" { + return false, nil, nil } - return pod.UID == gotAssumedPod.UID - }, - } - client := clientsetfake.NewClientset(item.sendPod) - client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - if action.GetSubresource() != "binding" { - return false, nil, nil + gotBinding = action.(clienttesting.CreateAction).GetObject().(*v1.Binding) + return true, gotBinding, item.injectBindError + }) + + var apiDispatcher *apidispatcher.APIDispatcher + if asyncAPICallsEnabled { + apiDispatcher = apidispatcher.New(client, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() } - gotBinding = action.(clienttesting.CreateAction).GetObject().(*v1.Binding) - return true, gotBinding, item.injectBindError + + internalCache := internalcache.New(ctx, 30*time.Second, apiDispatcher) + cache := &fakecache.Cache{ + Cache: internalCache, + ForgetFunc: func(pod *v1.Pod) { + gotForgetPod = pod + }, + AssumeFunc: func(pod *v1.Pod) { + gotAssumedPod = pod + }, + IsAssumedPodFunc: func(pod *v1.Pod) bool { + if pod == nil || gotAssumedPod == nil { + return false + } + return pod.UID == gotAssumedPod.UID + }, + } + informerFactory := informers.NewSharedInformerFactory(client, 0) + + schedFramework, err := tf.NewFramework(ctx, + append(item.registerPluginFuncs, + tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + ), + testSchedulerName, + frameworkruntime.WithClientSet(client), + frameworkruntime.WithAPIDispatcher(apiDispatcher), + frameworkruntime.WithEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName)), + frameworkruntime.WithWaitingPods(frameworkruntime.NewWaitingPodsMap()), + frameworkruntime.WithInformerFactory(informerFactory), + ) + if err != nil { + t.Fatal(err) + } + + ar := metrics.NewMetricsAsyncRecorder(10, 1*time.Second, ctx.Done()) + queue := internalqueue.NewSchedulingQueue(nil, informerFactory, internalqueue.WithMetricsRecorder(*ar), internalqueue.WithAPIDispatcher(apiDispatcher)) + if asyncAPICallsEnabled { + schedFramework.SetAPICacher(apicache.New(queue, cache)) + } + + sched := &Scheduler{ + Cache: cache, + client: client, + NextPod: queue.Pop, + SchedulingQueue: queue, + Profiles: profile.Map{testSchedulerName: schedFramework}, + APIDispatcher: apiDispatcher, + } + queue.Add(logger, item.sendPod) + + sched.SchedulePod = func(ctx context.Context, fwk framework.Framework, state fwk.CycleState, pod *v1.Pod) (ScheduleResult, error) { + return item.mockScheduleResult, item.injectSchedulingError + } + sched.FailureHandler = func(ctx context.Context, fwk framework.Framework, p *framework.QueuedPodInfo, status *fwk.Status, ni *framework.NominatingInfo, start time.Time) { + gotPod = p.Pod + gotError = status.AsError() + + sched.handleSchedulingFailure(ctx, fwk, p, status, ni, start) + } + called := make(chan struct{}) + stopFunc, err := eventBroadcaster.StartEventWatcher(func(obj runtime.Object) { + e, _ := obj.(*eventsv1.Event) + if e.Reason != item.eventReason { + t.Errorf("got event %v, want %v", e.Reason, item.eventReason) + } + close(called) + }) + if err != nil { + t.Fatal(err) + } + informerFactory.Start(ctx.Done()) + informerFactory.WaitForCacheSync(ctx.Done()) + sched.ScheduleOne(ctx) + <-called + if diff := 
cmp.Diff(item.expectAssumedPod, gotAssumedPod); diff != "" { + t.Errorf("Unexpected assumed pod (-want,+got):\n%s", diff) + } + if diff := cmp.Diff(item.expectErrorPod, gotPod); diff != "" { + t.Errorf("Unexpected error pod (-want,+got):\n%s", diff) + } + if diff := cmp.Diff(item.expectForgetPod, gotForgetPod); diff != "" { + t.Errorf("Unexpected forget pod (-want,+got):\n%s", diff) + } + if item.expectError == nil || gotError == nil { + if !errors.Is(gotError, item.expectError) { + t.Errorf("Unexpected error. Wanted %v, got %v", item.expectError, gotError) + } + } else if item.expectError.Error() != gotError.Error() { + t.Errorf("Unexpected error. Wanted %v, got %v", item.expectError.Error(), gotError.Error()) + } + if diff := cmp.Diff(item.expectBind, gotBinding); diff != "" { + t.Errorf("Unexpected binding (-want,+got):\n%s", diff) + } + // We have to use wait here because the Pod goes to the binding cycle in some test cases + // and the inflight pods might not be empty immediately at this point in such case. + if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { + return len(queue.InFlightPods()) == 0, nil + }); err != nil { + t.Errorf("in-flight pods should be always empty after SchedulingOne. It has %v Pods", len(queue.InFlightPods())) + } + podsInBackoffQ := queue.PodsInBackoffQ() + if item.expectPodInBackoffQ != nil { + if !podListContainsPod(podsInBackoffQ, item.expectPodInBackoffQ) { + t.Errorf("Expected to find pod in backoffQ, but it's not there.\nWant: %v,\ngot: %v", item.expectPodInBackoffQ, podsInBackoffQ) + } + } else { + if len(podsInBackoffQ) > 0 { + t.Errorf("Expected backoffQ to be empty, but it's not.\nGot: %v", podsInBackoffQ) + } + } + unschedulablePods := queue.UnschedulablePods() + if item.expectPodInUnschedulable != nil { + if !podListContainsPod(unschedulablePods, item.expectPodInUnschedulable) { + t.Errorf("Expected to find pod in unschedulable, but it's not there.\nWant: %v,\ngot: %v", item.expectPodInUnschedulable, unschedulablePods) + } + } else { + if len(unschedulablePods) > 0 { + t.Errorf("Expected unschedulable pods to be empty, but it's not.\nGot: %v", unschedulablePods) + } + } + stopFunc() }) - informerFactory := informers.NewSharedInformerFactory(client, 0) - - schedFramework, err := tf.NewFramework(ctx, - append(item.registerPluginFuncs, - tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - ), - testSchedulerName, - frameworkruntime.WithClientSet(client), - frameworkruntime.WithEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName)), - frameworkruntime.WithWaitingPods(frameworkruntime.NewWaitingPodsMap()), - frameworkruntime.WithInformerFactory(informerFactory), - ) - if err != nil { - t.Fatal(err) - } - - ar := metrics.NewMetricsAsyncRecorder(10, 1*time.Second, ctx.Done()) - queue := internalqueue.NewSchedulingQueue(nil, informerFactory, internalqueue.WithMetricsRecorder(*ar)) - sched := &Scheduler{ - Cache: cache, - client: client, - NextPod: queue.Pop, - SchedulingQueue: queue, - Profiles: profile.Map{testSchedulerName: schedFramework}, - } - queue.Add(logger, item.sendPod) - - sched.SchedulePod = func(ctx context.Context, fwk framework.Framework, state fwk.CycleState, pod *v1.Pod) (ScheduleResult, error) { - return item.mockScheduleResult, item.injectSchedulingError - } - sched.FailureHandler = func(ctx context.Context, fwk framework.Framework, p 
*framework.QueuedPodInfo, status *fwk.Status, ni *framework.NominatingInfo, start time.Time) { - gotPod = p.Pod - gotError = status.AsError() - - sched.handleSchedulingFailure(ctx, fwk, p, status, ni, start) - } - called := make(chan struct{}) - stopFunc, err := eventBroadcaster.StartEventWatcher(func(obj runtime.Object) { - e, _ := obj.(*eventsv1.Event) - if e.Reason != item.eventReason { - t.Errorf("got event %v, want %v", e.Reason, item.eventReason) - } - close(called) - }) - if err != nil { - t.Fatal(err) - } - informerFactory.Start(ctx.Done()) - informerFactory.WaitForCacheSync(ctx.Done()) - sched.ScheduleOne(ctx) - <-called - if diff := cmp.Diff(item.expectAssumedPod, gotAssumedPod); diff != "" { - t.Errorf("Unexpected assumed pod (-want,+got):\n%s", diff) - } - if diff := cmp.Diff(item.expectErrorPod, gotPod); diff != "" { - t.Errorf("Unexpected error pod (-want,+got):\n%s", diff) - } - if diff := cmp.Diff(item.expectForgetPod, gotForgetPod); diff != "" { - t.Errorf("Unexpected forget pod (-want,+got):\n%s", diff) - } - if item.expectError == nil || gotError == nil { - if !errors.Is(gotError, item.expectError) { - t.Errorf("Unexpected error. Wanted %v, got %v", item.expectError, gotError) - } - } else if item.expectError.Error() != gotError.Error() { - t.Errorf("Unexpected error. Wanted %v, got %v", item.expectError.Error(), gotError.Error()) - } - if diff := cmp.Diff(item.expectBind, gotBinding); diff != "" { - t.Errorf("Unexpected binding (-want,+got):\n%s", diff) - } - // We have to use wait here because the Pod goes to the binding cycle in some test cases - // and the inflight pods might not be empty immediately at this point in such case. - if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { - return len(queue.InFlightPods()) == 0, nil - }); err != nil { - t.Errorf("in-flight pods should be always empty after SchedulingOne. 
It has %v Pods", len(queue.InFlightPods())) - } - podsInBackoffQ := queue.PodsInBackoffQ() - if item.expectPodInBackoffQ != nil { - if !podListContainsPod(podsInBackoffQ, item.expectPodInBackoffQ) { - t.Errorf("Expected to find pod in backoffQ, but it's not there.\nWant: %v,\ngot: %v", item.expectPodInBackoffQ, podsInBackoffQ) - } - } else { - if len(podsInBackoffQ) > 0 { - t.Errorf("Expected backoffQ to be empty, but it's not.\nGot: %v", podsInBackoffQ) - } - } - unschedulablePods := queue.UnschedulablePods() - if item.expectPodInUnschedulable != nil { - if !podListContainsPod(unschedulablePods, item.expectPodInUnschedulable) { - t.Errorf("Expected to find pod in unschedulable, but it's not there.\nWant: %v,\ngot: %v", item.expectPodInUnschedulable, unschedulablePods) - } - } else { - if len(unschedulablePods) > 0 { - t.Errorf("Expected unschedulable pods to be empty, but it's not.\nGot: %v", unschedulablePods) - } - } - stopFunc() - }) + } } } } @@ -1115,153 +1138,170 @@ func TestScheduleOneMarksPodAsProcessedBeforePreBind(t *testing.T) { } for _, qHintEnabled := range []bool{true, false} { - for _, item := range table { - t.Run(fmt.Sprintf("[QueueingHint: %v] %s", qHintEnabled, item.name), func(t *testing.T) { - if !qHintEnabled { - featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, utilfeature.DefaultFeatureGate, version.MustParse("1.33")) - featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerQueueingHints, false) - } - logger, ctx := ktesting.NewTestContext(t) - var gotError error - var gotPod *v1.Pod - var gotAssumedPod *v1.Pod - var gotBinding *v1.Binding - var gotCallsToFailureHandler int - var gotPodIsInFlightAtFailureHandler bool - var gotPodIsInFlightAtWaitOnPermit bool - var gotPodIsInFlightAtRunPreBindPlugins bool + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, item := range table { + t.Run(fmt.Sprintf("%s (Queueing hints enabled: %v, Async API calls enabled: %v)", item.name, qHintEnabled, asyncAPICallsEnabled), func(t *testing.T) { + if !qHintEnabled { + featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, utilfeature.DefaultFeatureGate, version.MustParse("1.33")) + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerQueueingHints, false) + } + logger, ctx := ktesting.NewTestContext(t) + var gotError error + var gotPod *v1.Pod + var gotAssumedPod *v1.Pod + var gotBinding *v1.Binding + var gotCallsToFailureHandler int + var gotPodIsInFlightAtFailureHandler bool + var gotPodIsInFlightAtWaitOnPermit bool + var gotPodIsInFlightAtRunPreBindPlugins bool - cache := &fakecache.Cache{ - ForgetFunc: func(pod *v1.Pod) { - }, - AssumeFunc: func(pod *v1.Pod) { - gotAssumedPod = pod - }, - IsAssumedPodFunc: func(pod *v1.Pod) bool { - if pod == nil || gotAssumedPod == nil { - return false + client := clientsetfake.NewClientset(item.sendPod) + client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + if action.GetSubresource() != "binding" { + return false, nil, nil } - return pod.UID == gotAssumedPod.UID - }, - } - client := clientsetfake.NewClientset(item.sendPod) - client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - if action.GetSubresource() != "binding" { - return false, nil, nil + gotBinding = action.(clienttesting.CreateAction).GetObject().(*v1.Binding) + return true, gotBinding, item.injectBindError + }) + + var apiDispatcher 
*apidispatcher.APIDispatcher + if asyncAPICallsEnabled { + apiDispatcher = apidispatcher.New(client, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() } - gotBinding = action.(clienttesting.CreateAction).GetObject().(*v1.Binding) - return true, gotBinding, item.injectBindError + + internalCache := internalcache.New(ctx, 30*time.Second, apiDispatcher) + cache := &fakecache.Cache{ + Cache: internalCache, + ForgetFunc: func(pod *v1.Pod) { + }, + AssumeFunc: func(pod *v1.Pod) { + gotAssumedPod = pod + }, + IsAssumedPodFunc: func(pod *v1.Pod) bool { + if pod == nil || gotAssumedPod == nil { + return false + } + return pod.UID == gotAssumedPod.UID + }, + } + + informerFactory := informers.NewSharedInformerFactory(client, 0) + ar := metrics.NewMetricsAsyncRecorder(10, 1*time.Second, ctx.Done()) + queue := internalqueue.NewSchedulingQueue(nil, informerFactory, internalqueue.WithMetricsRecorder(*ar), internalqueue.WithAPIDispatcher(apiDispatcher)) + + schedFramework, err := NewFakeFramework( + ctx, + queue, + append(item.registerPluginFuncs, + tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + ), + testSchedulerName, + frameworkruntime.WithClientSet(client), + frameworkruntime.WithAPIDispatcher(apiDispatcher), + frameworkruntime.WithEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName)), + frameworkruntime.WithWaitingPods(frameworkruntime.NewWaitingPodsMap()), + ) + if err != nil { + t.Fatal(err) + } + if asyncAPICallsEnabled { + schedFramework.SetAPICacher(apicache.New(queue, cache)) + } + + schedFramework.waitOnPermitFn = func(_ context.Context, pod *v1.Pod) *fwk.Status { + gotPodIsInFlightAtWaitOnPermit = podListContainsPod(schedFramework.queue.InFlightPods(), pod) + return item.mockWaitOnPermitResult + } + schedFramework.runPreBindPluginsFn = func(_ context.Context, _ fwk.CycleState, pod *v1.Pod, _ string) *fwk.Status { + gotPodIsInFlightAtRunPreBindPlugins = podListContainsPod(schedFramework.queue.InFlightPods(), pod) + return item.mockRunPreBindPluginsResult + } + + sched := &Scheduler{ + Cache: cache, + client: client, + NextPod: queue.Pop, + SchedulingQueue: queue, + Profiles: profile.Map{testSchedulerName: schedFramework}, + APIDispatcher: apiDispatcher, + } + queue.Add(logger, item.sendPod) + + sched.SchedulePod = func(ctx context.Context, fwk framework.Framework, state fwk.CycleState, pod *v1.Pod) (ScheduleResult, error) { + return item.mockScheduleResult, item.injectSchedulingError + } + sched.FailureHandler = func(_ context.Context, fwk framework.Framework, p *framework.QueuedPodInfo, status *fwk.Status, _ *framework.NominatingInfo, _ time.Time) { + gotCallsToFailureHandler++ + gotPodIsInFlightAtFailureHandler = podListContainsPod(queue.InFlightPods(), p.Pod) + + gotPod = p.Pod + gotError = status.AsError() + + msg := truncateMessage(gotError.Error()) + fwk.EventRecorder().Eventf(p.Pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", msg) + + queue.Done(p.Pod.UID) + } + called := make(chan struct{}) + stopFunc, err := eventBroadcaster.StartEventWatcher(func(obj runtime.Object) { + e, _ := obj.(*eventsv1.Event) + if e.Reason != item.eventReason { + t.Errorf("got event %v, want %v", e.Reason, item.eventReason) + } + close(called) + }) + if err != nil { + t.Fatal(err) + } + sched.ScheduleOne(ctx) + <-called + + if diff := cmp.Diff(item.expectAssumedPod, gotAssumedPod); diff != "" { + t.Errorf("Unexpected assumed pod (-want,+got):\n%s", diff) + } + 
if diff := cmp.Diff(item.expectErrorPod, gotPod); diff != "" { + t.Errorf("Unexpected error pod (-want,+got):\n%s", diff) + } + if item.expectError == nil || gotError == nil { + if !errors.Is(gotError, item.expectError) { + t.Errorf("Unexpected error. Wanted %v, got %v", item.expectError, gotError) + } + } else if item.expectError.Error() != gotError.Error() { + t.Errorf("Unexpected error. Wanted %v, got %v", item.expectError.Error(), gotError.Error()) + } + if diff := cmp.Diff(item.expectBind, gotBinding); diff != "" { + t.Errorf("Unexpected binding (-want,+got):\n%s", diff) + } + if item.expectError != nil && gotCallsToFailureHandler != 1 { + t.Errorf("expected 1 call to FailureHandlerFn, got %v", gotCallsToFailureHandler) + } + if item.expectError == nil && gotCallsToFailureHandler != 0 { + t.Errorf("expected 0 calls to FailureHandlerFn, got %v", gotCallsToFailureHandler) + } + if (item.expectPodIsInFlightAtFailureHandler && qHintEnabled) != gotPodIsInFlightAtFailureHandler { + t.Errorf("unexpected pod being in flight in FailureHandlerFn, expected %v but got %v.", + item.expectPodIsInFlightAtFailureHandler, gotPodIsInFlightAtFailureHandler) + } + if (item.expectPodIsInFlightAtWaitOnPermit && qHintEnabled) != gotPodIsInFlightAtWaitOnPermit { + t.Errorf("unexpected pod being in flight at start of WaitOnPermit, expected %v but got %v", + item.expectPodIsInFlightAtWaitOnPermit, gotPodIsInFlightAtWaitOnPermit) + } + if gotPodIsInFlightAtRunPreBindPlugins { + t.Errorf("unexpected pod being in flight at start of RunPreBindPlugins") + } + // We have to use wait here + // because the Pod goes to the binding cycle in some test cases and the inflight pods might not be empty immediately at this point in such case. + if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { + return len(queue.InFlightPods()) == 0, nil + }); err != nil { + t.Errorf("in-flight pods should be always empty after SchedulingOne. 
It has %v Pods", len(queue.InFlightPods())) + } + stopFunc() }) - - informerFactory := informers.NewSharedInformerFactory(client, 0) - ar := metrics.NewMetricsAsyncRecorder(10, 1*time.Second, ctx.Done()) - queue := internalqueue.NewSchedulingQueue(nil, informerFactory, internalqueue.WithMetricsRecorder(*ar)) - - schedFramework, err := NewFakeFramework( - ctx, - queue, - append(item.registerPluginFuncs, - tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - ), - testSchedulerName, - frameworkruntime.WithClientSet(client), - frameworkruntime.WithEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName)), - frameworkruntime.WithWaitingPods(frameworkruntime.NewWaitingPodsMap()), - ) - if err != nil { - t.Fatal(err) - } - - schedFramework.waitOnPermitFn = func(_ context.Context, pod *v1.Pod) *fwk.Status { - gotPodIsInFlightAtWaitOnPermit = podListContainsPod(schedFramework.queue.InFlightPods(), pod) - return item.mockWaitOnPermitResult - } - schedFramework.runPreBindPluginsFn = func(_ context.Context, _ fwk.CycleState, pod *v1.Pod, _ string) *fwk.Status { - gotPodIsInFlightAtRunPreBindPlugins = podListContainsPod(schedFramework.queue.InFlightPods(), pod) - return item.mockRunPreBindPluginsResult - } - - sched := &Scheduler{ - Cache: cache, - client: client, - NextPod: queue.Pop, - SchedulingQueue: queue, - Profiles: profile.Map{testSchedulerName: schedFramework}, - } - queue.Add(logger, item.sendPod) - - sched.SchedulePod = func(ctx context.Context, fwk framework.Framework, state fwk.CycleState, pod *v1.Pod) (ScheduleResult, error) { - return item.mockScheduleResult, item.injectSchedulingError - } - sched.FailureHandler = func(_ context.Context, fwk framework.Framework, p *framework.QueuedPodInfo, status *fwk.Status, _ *framework.NominatingInfo, _ time.Time) { - gotCallsToFailureHandler++ - gotPodIsInFlightAtFailureHandler = podListContainsPod(queue.InFlightPods(), p.Pod) - - gotPod = p.Pod - gotError = status.AsError() - - msg := truncateMessage(gotError.Error()) - fwk.EventRecorder().Eventf(p.Pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", msg) - - queue.Done(p.Pod.UID) - } - called := make(chan struct{}) - stopFunc, err := eventBroadcaster.StartEventWatcher(func(obj runtime.Object) { - e, _ := obj.(*eventsv1.Event) - if e.Reason != item.eventReason { - t.Errorf("got event %v, want %v", e.Reason, item.eventReason) - } - close(called) - }) - if err != nil { - t.Fatal(err) - } - sched.ScheduleOne(ctx) - <-called - - if diff := cmp.Diff(item.expectAssumedPod, gotAssumedPod); diff != "" { - t.Errorf("Unexpected assumed pod (-want,+got):\n%s", diff) - } - if diff := cmp.Diff(item.expectErrorPod, gotPod); diff != "" { - t.Errorf("Unexpected error pod (-want,+got):\n%s", diff) - } - if item.expectError == nil || gotError == nil { - if !errors.Is(gotError, item.expectError) { - t.Errorf("Unexpected error. Wanted %v, got %v", item.expectError, gotError) - } - } else if item.expectError.Error() != gotError.Error() { - t.Errorf("Unexpected error. 
Wanted %v, got %v", item.expectError.Error(), gotError.Error()) - } - if diff := cmp.Diff(item.expectBind, gotBinding); diff != "" { - t.Errorf("Unexpected binding (-want,+got):\n%s", diff) - } - if item.expectError != nil && gotCallsToFailureHandler != 1 { - t.Errorf("expected 1 call to FailureHandlerFn, got %v", gotCallsToFailureHandler) - } - if item.expectError == nil && gotCallsToFailureHandler != 0 { - t.Errorf("expected 0 calls to FailureHandlerFn, got %v", gotCallsToFailureHandler) - } - if (item.expectPodIsInFlightAtFailureHandler && qHintEnabled) != gotPodIsInFlightAtFailureHandler { - t.Errorf("unexpected pod being in flight in FailureHandlerFn, expected %v but got %v.", - item.expectPodIsInFlightAtFailureHandler, gotPodIsInFlightAtFailureHandler) - } - if (item.expectPodIsInFlightAtWaitOnPermit && qHintEnabled) != gotPodIsInFlightAtWaitOnPermit { - t.Errorf("unexpected pod being in flight at start of WaitOnPermit, expected %v but got %v", - item.expectPodIsInFlightAtWaitOnPermit, gotPodIsInFlightAtWaitOnPermit) - } - if gotPodIsInFlightAtRunPreBindPlugins { - t.Errorf("unexpected pod being in flight at start of RunPreBindPlugins") - } - // We have to use wait here - // because the Pod goes to the binding cycle in some test cases and the inflight pods might not be empty immediately at this point in such case. - if err := wait.PollUntilContextTimeout(ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { - return len(queue.InFlightPods()) == 0, nil - }); err != nil { - t.Errorf("in-flight pods should be always empty after SchedulingOne. It has %v Pods", len(queue.InFlightPods())) - } - stopFunc() - }) + } } } } @@ -1302,223 +1342,272 @@ func podListContainsPod(list []*v1.Pod, pod *v1.Pod) bool { } func TestSchedulerNoPhantomPodAfterExpire(t *testing.T) { - logger, ctx := ktesting.NewTestContext(t) - ctx, cancel := context.WithCancel(ctx) - defer cancel() - queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) - scache := internalcache.New(ctx, 100*time.Millisecond) - pod := podWithPort("pod.Name", "", 8080) - node := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1", UID: types.UID("node1")}} - scache.AddNode(logger, &node) + for _, asyncAPICallsEnabled := range []bool{true, false} { + t.Run(fmt.Sprintf("Async API calls enabled: %v", asyncAPICallsEnabled), func(t *testing.T) { + logger, ctx := ktesting.NewTestContext(t) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) + client := clientsetfake.NewClientset() + bindingChan := interruptOnBind(client) - fns := []tf.RegisterPluginFunc{ - tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - tf.RegisterPluginAsExtensions(nodeports.Name, frameworkruntime.FactoryAdapter(feature.Features{}, nodeports.New), "Filter", "PreFilter"), - } - scheduler, bindingChan, errChan := setupTestSchedulerWithOnePodOnNode(ctx, t, queuedPodStore, scache, pod, &node, fns...) 
+ var apiDispatcher *apidispatcher.APIDispatcher + if asyncAPICallsEnabled { + apiDispatcher = apidispatcher.New(client, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() + } - waitPodExpireChan := make(chan struct{}) - timeout := make(chan struct{}) - go func() { - for { + scache := internalcache.New(ctx, 100*time.Millisecond, apiDispatcher) + pod := podWithPort("pod.Name", "", 8080) + node := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1", UID: types.UID("node1")}} + scache.AddNode(logger, &node) + + fns := []tf.RegisterPluginFunc{ + tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + tf.RegisterPluginAsExtensions(nodeports.Name, frameworkruntime.FactoryAdapter(feature.Features{}, nodeports.New), "Filter", "PreFilter"), + } + scheduler, errChan := setupTestSchedulerWithOnePodOnNode(ctx, t, client, queuedPodStore, scache, apiDispatcher, pod, &node, bindingChan, fns...) + + waitPodExpireChan := make(chan struct{}) + timeout := make(chan struct{}) + go func() { + for { + select { + case <-timeout: + return + default: + } + pods, err := scache.PodCount() + if err != nil { + errChan <- fmt.Errorf("cache.List failed: %w", err) + return + } + if pods == 0 { + close(waitPodExpireChan) + return + } + time.Sleep(100 * time.Millisecond) + } + }() + // waiting for the assumed pod to expire select { - case <-timeout: - return - default: + case err := <-errChan: + t.Fatal(err) + case <-waitPodExpireChan: + case <-time.After(wait.ForeverTestTimeout): + close(timeout) + t.Fatalf("timeout timeout in waiting pod expire after %v", wait.ForeverTestTimeout) } - pods, err := scache.PodCount() - if err != nil { - errChan <- fmt.Errorf("cache.List failed: %v", err) - return - } - if pods == 0 { - close(waitPodExpireChan) - return - } - time.Sleep(100 * time.Millisecond) - } - }() - // waiting for the assumed pod to expire - select { - case err := <-errChan: - t.Fatal(err) - case <-waitPodExpireChan: - case <-time.After(wait.ForeverTestTimeout): - close(timeout) - t.Fatalf("timeout timeout in waiting pod expire after %v", wait.ForeverTestTimeout) - } - // We use conflicted pod ports to incur fit predicate failure if first pod not removed. - secondPod := podWithPort("bar", "", 8080) - queuedPodStore.Add(secondPod) - scheduler.ScheduleOne(ctx) - select { - case b := <-bindingChan: - expectBinding := &v1.Binding{ - ObjectMeta: metav1.ObjectMeta{Name: "bar", UID: types.UID("bar")}, - Target: v1.ObjectReference{Kind: "Node", Name: node.Name}, - } - if diff := cmp.Diff(expectBinding, b); diff != "" { - t.Errorf("Unexpected binding (-want,+got):\n%s", diff) - } - case <-time.After(wait.ForeverTestTimeout): - t.Fatalf("timeout in binding after %v", wait.ForeverTestTimeout) + // We use conflicted pod ports to incur fit predicate failure if first pod not removed. 
+ secondPod := podWithPort("bar", "", 8080) + if err := queuedPodStore.Add(secondPod); err != nil { + t.Fatal(err) + } + scheduler.ScheduleOne(ctx) + select { + case b := <-bindingChan: + expectBinding := &v1.Binding{ + ObjectMeta: metav1.ObjectMeta{Name: "bar", UID: types.UID("bar")}, + Target: v1.ObjectReference{Kind: "Node", Name: node.Name}, + } + if diff := cmp.Diff(expectBinding, b); diff != "" { + t.Errorf("Unexpected binding (-want,+got):\n%s", diff) + } + case <-time.After(wait.ForeverTestTimeout): + t.Fatalf("timeout in binding after %v", wait.ForeverTestTimeout) + } + }) } } func TestSchedulerNoPhantomPodAfterDelete(t *testing.T) { - logger, ctx := ktesting.NewTestContext(t) - ctx, cancel := context.WithCancel(ctx) - defer cancel() - queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) - scache := internalcache.New(ctx, 10*time.Minute) - firstPod := podWithPort("pod.Name", "", 8080) - node := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1", UID: types.UID("node1")}} - scache.AddNode(logger, &node) - fns := []tf.RegisterPluginFunc{ - tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - tf.RegisterPluginAsExtensions(nodeports.Name, frameworkruntime.FactoryAdapter(feature.Features{}, nodeports.New), "Filter", "PreFilter"), - } - scheduler, bindingChan, errChan := setupTestSchedulerWithOnePodOnNode(ctx, t, queuedPodStore, scache, firstPod, &node, fns...) + for _, asyncAPICallsEnabled := range []bool{true, false} { + t.Run(fmt.Sprintf("Async API calls enabled: %v", asyncAPICallsEnabled), func(t *testing.T) { + logger, ctx := ktesting.NewTestContext(t) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) + client := clientsetfake.NewClientset() + bindingChan := interruptOnBind(client) - // We use conflicted pod ports to incur fit predicate failure. 
- secondPod := podWithPort("bar", "", 8080) - queuedPodStore.Add(secondPod) - // queuedPodStore: [bar:8080] - // cache: [(assumed)foo:8080] + var apiDispatcher *apidispatcher.APIDispatcher + if asyncAPICallsEnabled { + apiDispatcher = apidispatcher.New(client, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() + } - scheduler.ScheduleOne(ctx) - select { - case err := <-errChan: - expectErr := &framework.FitError{ - Pod: secondPod, - NumAllNodes: 1, - Diagnosis: framework.Diagnosis{ - NodeToStatus: framework.NewNodeToStatus(map[string]*fwk.Status{ - node.Name: fwk.NewStatus(fwk.Unschedulable, nodeports.ErrReason).WithPlugin(nodeports.Name), - }, fwk.NewStatus(fwk.UnschedulableAndUnresolvable)), - UnschedulablePlugins: sets.New(nodeports.Name), - }, - } - if err == nil { - t.Errorf("expected error %v, got nil", expectErr) - } else if diff := cmp.Diff(expectErr, err, schedulerCmpOpts...); diff != "" { - t.Errorf("unexpected error (-want,+got):\n%s", diff) - } - case <-time.After(wait.ForeverTestTimeout): - t.Fatalf("timeout in fitting after %v", wait.ForeverTestTimeout) - } + scache := internalcache.New(ctx, 10*time.Minute, apiDispatcher) + firstPod := podWithPort("pod.Name", "", 8080) + node := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1", UID: types.UID("node1")}} + scache.AddNode(logger, &node) + fns := []tf.RegisterPluginFunc{ + tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + tf.RegisterPluginAsExtensions(nodeports.Name, frameworkruntime.FactoryAdapter(feature.Features{}, nodeports.New), "Filter", "PreFilter"), + } + scheduler, errChan := setupTestSchedulerWithOnePodOnNode(ctx, t, client, queuedPodStore, scache, apiDispatcher, firstPod, &node, bindingChan, fns...) - // We mimic the workflow of cache behavior when a pod is removed by user. - // Note: if the schedulernodeinfo timeout would be super short, the first pod would expire - // and would be removed itself (without any explicit actions on schedulernodeinfo). Even in that case, - // explicitly AddPod will as well correct the behavior. - firstPod.Spec.NodeName = node.Name - if err := scache.AddPod(logger, firstPod); err != nil { - t.Fatalf("err: %v", err) - } - if err := scache.RemovePod(logger, firstPod); err != nil { - t.Fatalf("err: %v", err) - } + // We use conflicted pod ports to incur fit predicate failure. 
+ secondPod := podWithPort("bar", "", 8080) + if err := queuedPodStore.Add(secondPod); err != nil { + t.Fatal(err) + } + // queuedPodStore: [bar:8080] + // cache: [(assumed)foo:8080] - queuedPodStore.Add(secondPod) - scheduler.ScheduleOne(ctx) - select { - case b := <-bindingChan: - expectBinding := &v1.Binding{ - ObjectMeta: metav1.ObjectMeta{Name: "bar", UID: types.UID("bar")}, - Target: v1.ObjectReference{Kind: "Node", Name: node.Name}, - } - if diff := cmp.Diff(expectBinding, b); diff != "" { - t.Errorf("unexpected binding (-want,+got):\n%s", diff) - } - case <-time.After(wait.ForeverTestTimeout): - t.Fatalf("timeout in binding after %v", wait.ForeverTestTimeout) + scheduler.ScheduleOne(ctx) + select { + case err := <-errChan: + expectErr := &framework.FitError{ + Pod: secondPod, + NumAllNodes: 1, + Diagnosis: framework.Diagnosis{ + NodeToStatus: framework.NewNodeToStatus(map[string]*fwk.Status{ + node.Name: fwk.NewStatus(fwk.Unschedulable, nodeports.ErrReason).WithPlugin(nodeports.Name), + }, fwk.NewStatus(fwk.UnschedulableAndUnresolvable)), + UnschedulablePlugins: sets.New(nodeports.Name), + }, + } + if err == nil { + t.Errorf("expected error %v, got nil", expectErr) + } else if diff := cmp.Diff(expectErr, err, schedulerCmpOpts...); diff != "" { + t.Errorf("unexpected error (-want,+got):\n%s", diff) + } + case <-time.After(wait.ForeverTestTimeout): + t.Fatalf("timeout in fitting after %v", wait.ForeverTestTimeout) + } + + // We mimic the workflow of cache behavior when a pod is removed by user. + // Note: if the schedulernodeinfo timeout would be super short, the first pod would expire + // and would be removed itself (without any explicit actions on schedulernodeinfo). Even in that case, + // explicitly AddPod will as well correct the behavior. + firstPod.Spec.NodeName = node.Name + if err := scache.AddPod(logger, firstPod); err != nil { + t.Fatalf("err: %v", err) + } + if err := scache.RemovePod(logger, firstPod); err != nil { + t.Fatalf("err: %v", err) + } + + if err := queuedPodStore.Add(secondPod); err != nil { + t.Fatal(err) + } + scheduler.ScheduleOne(ctx) + select { + case b := <-bindingChan: + expectBinding := &v1.Binding{ + ObjectMeta: metav1.ObjectMeta{Name: "bar", UID: types.UID("bar")}, + Target: v1.ObjectReference{Kind: "Node", Name: node.Name}, + } + if diff := cmp.Diff(expectBinding, b); diff != "" { + t.Errorf("unexpected binding (-want,+got):\n%s", diff) + } + case <-time.After(wait.ForeverTestTimeout): + t.Fatalf("timeout in binding after %v", wait.ForeverTestTimeout) + } + }) } } func TestSchedulerFailedSchedulingReasons(t *testing.T) { - logger, ctx := ktesting.NewTestContext(t) - ctx, cancel := context.WithCancel(ctx) - defer cancel() - queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) - scache := internalcache.New(ctx, 10*time.Minute) + for _, asyncAPICallsEnabled := range []bool{true, false} { + t.Run(fmt.Sprintf("Async API calls enabled: %v", asyncAPICallsEnabled), func(t *testing.T) { + logger, ctx := ktesting.NewTestContext(t) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) + client := clientsetfake.NewClientset() - // Design the baseline for the pods, and we will make nodes that don't fit it later. 
- var cpu = int64(4) - var mem = int64(500) - podWithTooBigResourceRequests := podWithResources("bar", "", v1.ResourceList{ - v1.ResourceCPU: *(resource.NewQuantity(cpu, resource.DecimalSI)), - v1.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)), - }, v1.ResourceList{ - v1.ResourceCPU: *(resource.NewQuantity(cpu, resource.DecimalSI)), - v1.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)), - }) + var apiDispatcher *apidispatcher.APIDispatcher + if asyncAPICallsEnabled { + apiDispatcher = apidispatcher.New(client, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() + } - // create several nodes which cannot schedule the above pod - var nodes []*v1.Node - var objects []runtime.Object - for i := 0; i < 100; i++ { - uid := fmt.Sprintf("node%v", i) - node := v1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: uid, UID: types.UID(uid)}, - Status: v1.NodeStatus{ - Capacity: v1.ResourceList{ - v1.ResourceCPU: *(resource.NewQuantity(cpu/2, resource.DecimalSI)), - v1.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)), - v1.ResourcePods: *(resource.NewQuantity(10, resource.DecimalSI)), - }, - Allocatable: v1.ResourceList{ - v1.ResourceCPU: *(resource.NewQuantity(cpu/2, resource.DecimalSI)), - v1.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)), - v1.ResourcePods: *(resource.NewQuantity(10, resource.DecimalSI)), - }}, - } - scache.AddNode(logger, &node) - nodes = append(nodes, &node) - objects = append(objects, &node) - } + scache := internalcache.New(ctx, 10*time.Minute, apiDispatcher) - // Create expected failure reasons for all the nodes. Hopefully they will get rolled up into a non-spammy summary. - failedNodeStatues := framework.NewDefaultNodeToStatus() - for _, node := range nodes { - failedNodeStatues.Set(node.Name, fwk.NewStatus( - fwk.UnschedulableAndUnresolvable, - fmt.Sprintf("Insufficient %v", v1.ResourceCPU), - fmt.Sprintf("Insufficient %v", v1.ResourceMemory), - ).WithPlugin(noderesources.Name)) - } - fns := []tf.RegisterPluginFunc{ - tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - tf.RegisterPluginAsExtensions(noderesources.Name, frameworkruntime.FactoryAdapter(feature.Features{}, noderesources.NewFit), "Filter", "PreFilter"), - } + // Design the baseline for the pods, and we will make nodes that don't fit it later. + var cpu = int64(4) + var mem = int64(500) + podWithTooBigResourceRequests := podWithResources("bar", "", v1.ResourceList{ + v1.ResourceCPU: *(resource.NewQuantity(cpu, resource.DecimalSI)), + v1.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)), + }, v1.ResourceList{ + v1.ResourceCPU: *(resource.NewQuantity(cpu, resource.DecimalSI)), + v1.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)), + }) - informerFactory := informers.NewSharedInformerFactory(clientsetfake.NewClientset(objects...), 0) - scheduler, _, errChan := setupTestScheduler(ctx, t, queuedPodStore, scache, informerFactory, nil, fns...) 
+ // create several nodes which cannot schedule the above pod + var nodes []*v1.Node + var objects []runtime.Object + for i := 0; i < 100; i++ { + uid := fmt.Sprintf("node%v", i) + node := v1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: uid, UID: types.UID(uid)}, + Status: v1.NodeStatus{ + Capacity: v1.ResourceList{ + v1.ResourceCPU: *(resource.NewQuantity(cpu/2, resource.DecimalSI)), + v1.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)), + v1.ResourcePods: *(resource.NewQuantity(10, resource.DecimalSI)), + }, + Allocatable: v1.ResourceList{ + v1.ResourceCPU: *(resource.NewQuantity(cpu/2, resource.DecimalSI)), + v1.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)), + v1.ResourcePods: *(resource.NewQuantity(10, resource.DecimalSI)), + }}, + } + scache.AddNode(logger, &node) + nodes = append(nodes, &node) + objects = append(objects, &node) + } - queuedPodStore.Add(podWithTooBigResourceRequests) - scheduler.ScheduleOne(ctx) - select { - case err := <-errChan: - expectErr := &framework.FitError{ - Pod: podWithTooBigResourceRequests, - NumAllNodes: len(nodes), - Diagnosis: framework.Diagnosis{ - NodeToStatus: failedNodeStatues, - UnschedulablePlugins: sets.New(noderesources.Name), - }, - } - if len(fmt.Sprint(expectErr)) > 150 { - t.Errorf("message is too spammy ! %v ", len(fmt.Sprint(expectErr))) - } - if diff := cmp.Diff(expectErr, err, schedulerCmpOpts...); diff != "" { - t.Errorf("Unexpected error (-want,+got):\n%s", diff) - } - case <-time.After(wait.ForeverTestTimeout): - t.Fatalf("timeout after %v", wait.ForeverTestTimeout) + // Create expected failure reasons for all the nodes. Hopefully they will get rolled up into a non-spammy summary. + failedNodeStatues := framework.NewDefaultNodeToStatus() + for _, node := range nodes { + failedNodeStatues.Set(node.Name, fwk.NewStatus( + fwk.UnschedulableAndUnresolvable, + fmt.Sprintf("Insufficient %v", v1.ResourceCPU), + fmt.Sprintf("Insufficient %v", v1.ResourceMemory), + ).WithPlugin(noderesources.Name)) + } + fns := []tf.RegisterPluginFunc{ + tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + tf.RegisterPluginAsExtensions(noderesources.Name, frameworkruntime.FactoryAdapter(feature.Features{}, noderesources.NewFit), "Filter", "PreFilter"), + } + + informerFactory := informers.NewSharedInformerFactory(clientsetfake.NewClientset(objects...), 0) + scheduler, errChan := setupTestScheduler(ctx, t, client, queuedPodStore, scache, apiDispatcher, informerFactory, nil, fns...) + + if err := queuedPodStore.Add(podWithTooBigResourceRequests); err != nil { + t.Fatal(err) + } + scheduler.ScheduleOne(ctx) + select { + case err := <-errChan: + expectErr := &framework.FitError{ + Pod: podWithTooBigResourceRequests, + NumAllNodes: len(nodes), + Diagnosis: framework.Diagnosis{ + NodeToStatus: failedNodeStatues, + UnschedulablePlugins: sets.New(noderesources.Name), + }, + } + if len(fmt.Sprint(expectErr)) > 150 { + t.Errorf("message is too spammy ! 
%v ", len(fmt.Sprint(expectErr))) + } + if diff := cmp.Diff(expectErr, err, schedulerCmpOpts...); diff != "" { + t.Errorf("Unexpected error (-want,+got):\n%s", diff) + } + case <-time.After(wait.ForeverTestTimeout): + t.Fatalf("timeout after %v", wait.ForeverTestTimeout) + } + }) } } @@ -1613,62 +1702,64 @@ func TestSchedulerWithVolumeBinding(t *testing.T) { }, } - for _, item := range table { - t.Run(item.name, func(t *testing.T) { - _, ctx := ktesting.NewTestContext(t) - ctx, cancel := context.WithCancel(ctx) - defer cancel() - fakeVolumeBinder := volumebinding.NewFakeVolumeBinder(item.volumeBinderConfig) - s, bindingChan, errChan := setupTestSchedulerWithVolumeBinding(ctx, t, fakeVolumeBinder, eventBroadcaster) - eventChan := make(chan struct{}) - stopFunc, err := eventBroadcaster.StartEventWatcher(func(obj runtime.Object) { - e, _ := obj.(*eventsv1.Event) - if e, a := item.eventReason, e.Reason; e != a { - t.Errorf("expected %v, got %v", e, a) + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, item := range table { + t.Run(fmt.Sprintf("%s (Async API calls enabled: %v)", item.name, asyncAPICallsEnabled), func(t *testing.T) { + _, ctx := ktesting.NewTestContext(t) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + fakeVolumeBinder := volumebinding.NewFakeVolumeBinder(item.volumeBinderConfig) + s, bindingChan, errChan := setupTestSchedulerWithVolumeBinding(ctx, t, fakeVolumeBinder, eventBroadcaster, asyncAPICallsEnabled) + eventChan := make(chan struct{}) + stopFunc, err := eventBroadcaster.StartEventWatcher(func(obj runtime.Object) { + e, _ := obj.(*eventsv1.Event) + if e, a := item.eventReason, e.Reason; e != a { + t.Errorf("expected %v, got %v", e, a) + } + close(eventChan) + }) + if err != nil { + t.Fatal(err) } - close(eventChan) - }) - if err != nil { - t.Fatal(err) - } - s.ScheduleOne(ctx) - // Wait for pod to succeed or fail scheduling - select { - case <-eventChan: - case <-time.After(wait.ForeverTestTimeout): - t.Fatalf("scheduling timeout after %v", wait.ForeverTestTimeout) - } - stopFunc() - // Wait for scheduling to return an error or succeed binding. - var ( - gotErr error - gotBind *v1.Binding - ) - select { - case gotErr = <-errChan: - case gotBind = <-bindingChan: - case <-time.After(chanTimeout): - t.Fatalf("did not receive pod binding or error after %v", chanTimeout) - } - if item.expectError != nil { - if gotErr == nil || item.expectError.Error() != gotErr.Error() { + s.ScheduleOne(ctx) + // Wait for pod to succeed or fail scheduling + select { + case <-eventChan: + case <-time.After(wait.ForeverTestTimeout): + t.Fatalf("scheduling timeout after %v", wait.ForeverTestTimeout) + } + stopFunc() + // Wait for scheduling to return an error or succeed binding. 
+ var ( + gotErr error + gotBind *v1.Binding + ) + select { + case gotErr = <-errChan: + case gotBind = <-bindingChan: + case <-time.After(chanTimeout): + t.Fatalf("did not receive pod binding or error after %v", chanTimeout) + } + if item.expectError != nil { + if gotErr == nil || item.expectError.Error() != gotErr.Error() { + t.Errorf("err \nWANT=%+v,\nGOT=%+v", item.expectError, gotErr) + } + } else if gotErr != nil { t.Errorf("err \nWANT=%+v,\nGOT=%+v", item.expectError, gotErr) } - } else if gotErr != nil { - t.Errorf("err \nWANT=%+v,\nGOT=%+v", item.expectError, gotErr) - } - if !cmp.Equal(item.expectPodBind, gotBind) { - t.Errorf("err \nWANT=%+v,\nGOT=%+v", item.expectPodBind, gotBind) - } + if !cmp.Equal(item.expectPodBind, gotBind) { + t.Errorf("err \nWANT=%+v,\nGOT=%+v", item.expectPodBind, gotBind) + } - if item.expectAssumeCalled != fakeVolumeBinder.AssumeCalled { - t.Errorf("expectedAssumeCall %v", item.expectAssumeCalled) - } + if item.expectAssumeCalled != fakeVolumeBinder.AssumeCalled { + t.Errorf("expectedAssumeCall %v", item.expectAssumeCalled) + } - if item.expectBindCalled != fakeVolumeBinder.BindCalled { - t.Errorf("expectedBindCall %v", item.expectBindCalled) - } - }) + if item.expectBindCalled != fakeVolumeBinder.BindCalled { + t.Errorf("expectedBindCall %v", item.expectBindCalled) + } + }) + } } } @@ -1715,54 +1806,73 @@ func TestSchedulerBinding(t *testing.T) { }, } - for _, test := range table { - t.Run(test.name, func(t *testing.T) { - pod := st.MakePod().Name(test.podName).Obj() - defaultBound := false - state := framework.NewCycleState() - client := clientsetfake.NewClientset(pod) - client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - if action.GetSubresource() == "binding" { - defaultBound = true + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, test := range table { + t.Run(fmt.Sprintf("%s (Async API calls enabled: %v)", test.name, asyncAPICallsEnabled), func(t *testing.T) { + pod := st.MakePod().Name(test.podName).Obj() + defaultBound := false + state := framework.NewCycleState() + client := clientsetfake.NewClientset(pod) + client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + if action.GetSubresource() == "binding" { + defaultBound = true + } + return false, nil, nil + }) + logger, ctx := ktesting.NewTestContext(t) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + var apiDispatcher *apidispatcher.APIDispatcher + if asyncAPICallsEnabled { + apiDispatcher = apidispatcher.New(client, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() } - return false, nil, nil + + fwk, err := tf.NewFramework(ctx, + []tf.RegisterPluginFunc{ + tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), + tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), + }, "", frameworkruntime.WithClientSet(client), frameworkruntime.WithAPIDispatcher(apiDispatcher), frameworkruntime.WithEventRecorder(&events.FakeRecorder{})) + if err != nil { + t.Fatal(err) + } + cache := internalcache.New(ctx, 100*time.Millisecond, apiDispatcher) + if asyncAPICallsEnabled { + informerFactory := informers.NewSharedInformerFactory(client, 0) + ar := metrics.NewMetricsAsyncRecorder(10, 1*time.Second, ctx.Done()) + queue := internalqueue.NewSchedulingQueue(nil, informerFactory, internalqueue.WithMetricsRecorder(*ar), internalqueue.WithAPIDispatcher(apiDispatcher)) + fwk.SetAPICacher(apicache.New(queue, cache)) + } + + sched := 
&Scheduler{ + Extenders: test.extenders, + Cache: cache, + nodeInfoSnapshot: nil, + percentageOfNodesToScore: 0, + APIDispatcher: apiDispatcher, + } + status := sched.bind(ctx, fwk, pod, "node", state) + if !status.IsSuccess() { + t.Error(status.AsError()) + } + + // Checking default binding. + if wantBound := test.wantBinderID == -1; defaultBound != wantBound { + t.Errorf("got bound with default binding: %v, want %v", defaultBound, wantBound) + } + + // Checking extenders binding. + for i, ext := range test.extenders { + wantBound := i == test.wantBinderID + if gotBound := ext.(*fakeExtender).gotBind; gotBound != wantBound { + t.Errorf("got bound with extender #%d: %v, want %v", i, gotBound, wantBound) + } + } + }) - _, ctx := ktesting.NewTestContext(t) - ctx, cancel := context.WithCancel(ctx) - defer cancel() - fwk, err := tf.NewFramework(ctx, - []tf.RegisterPluginFunc{ - tf.RegisterQueueSortPlugin(queuesort.Name, queuesort.New), - tf.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New), - }, "", frameworkruntime.WithClientSet(client), frameworkruntime.WithEventRecorder(&events.FakeRecorder{})) - if err != nil { - t.Fatal(err) - } - sched := &Scheduler{ - Extenders: test.extenders, - Cache: internalcache.New(ctx, 100*time.Millisecond), - nodeInfoSnapshot: nil, - percentageOfNodesToScore: 0, - } - status := sched.bind(ctx, fwk, pod, "node", state) - if !status.IsSuccess() { - t.Error(status.AsError()) - } - - // Checking default binding. - if wantBound := test.wantBinderID == -1; defaultBound != wantBound { - t.Errorf("got bound with default binding: %v, want %v", defaultBound, wantBound) - } - - // Checking extenders binding. - for i, ext := range test.extenders { - wantBound := i == test.wantBinderID - if gotBound := ext.(*fakeExtender).gotBind; gotBound != wantBound { - t.Errorf("got bound with extender #%d: %v, want %v", i, gotBound, wantBound) - } - } - - }) + } } } @@ -1773,7 +1883,7 @@ func TestUpdatePod(t *testing.T) { newPodCondition *v1.PodCondition currentNominatedNodeName string newNominatingInfo *framework.NominatingInfo - expectedPatchRequests int + expectPatchRequest bool expectedPatchDataPattern string }{ { @@ -1787,7 +1897,7 @@ func TestUpdatePod(t *testing.T) { Reason: "newReason", Message: "newMessage", }, - expectedPatchRequests: 1, + expectPatchRequest: true, expectedPatchDataPattern: `{"status":{"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","lastTransitionTime":".*","message":"newMessage","reason":"newReason","status":"newStatus","type":"newType"}]}}`, }, { @@ -1810,7 +1920,7 @@ func TestUpdatePod(t *testing.T) { Reason: "newReason", Message: "newMessage", }, - expectedPatchRequests: 1, + expectPatchRequest: true, expectedPatchDataPattern: `{"status":{"\$setElementOrder/conditions":\[{"type":"someOtherType"},{"type":"newType"}],"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","lastTransitionTime":".*","message":"newMessage","reason":"newReason","status":"newStatus","type":"newType"}]}}`, }, { @@ -1833,7 +1943,7 @@ func TestUpdatePod(t *testing.T) { Reason: "newReason", Message: "newMessage", }, - expectedPatchRequests: 1, + expectPatchRequest: true, expectedPatchDataPattern: `{"status":{"\$setElementOrder/conditions":\[{"type":"currentType"}],"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","lastTransitionTime":".*","message":"newMessage","reason":"newReason","status":"newStatus","type":"currentType"}]}}`, }, { @@ -1856,7 +1966,7 @@ func TestUpdatePod(t *testing.T) { Reason: "newReason", Message: "newMessage", }, - expectedPatchRequests: 1, 
+ expectPatchRequest: true, expectedPatchDataPattern: `{"status":{"\$setElementOrder/conditions":\[{"type":"currentType"}],"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","message":"newMessage","reason":"newReason","type":"currentType"}]}}`, }, { @@ -1880,7 +1990,7 @@ func TestUpdatePod(t *testing.T) { Message: "currentMessage", }, currentNominatedNodeName: "node1", - expectedPatchRequests: 0, + expectPatchRequest: false, }, { name: "Should make patch request if pod condition already exists and is identical but nominated node name is set and different", @@ -1903,46 +2013,71 @@ func TestUpdatePod(t *testing.T) { Message: "currentMessage", }, newNominatingInfo: &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: "node1"}, - expectedPatchRequests: 1, + expectPatchRequest: true, expectedPatchDataPattern: `{"status":{"nominatedNodeName":"node1"}}`, }, } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - actualPatchRequests := 0 - var actualPatchData string - cs := &clientsetfake.Clientset{} - cs.AddReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - actualPatchRequests++ - patch := action.(clienttesting.PatchAction) - actualPatchData = string(patch.GetPatch()) - // For this test, we don't care about the result of the patched pod, just that we got the expected - // patch request, so just returning &v1.Pod{} here is OK because scheduler doesn't use the response. - return true, &v1.Pod{}, nil + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, test := range tests { + t.Run(fmt.Sprintf("%s (Async API calls enabled: %v)", test.name, asyncAPICallsEnabled), func(t *testing.T) { + actualPatchRequests := 0 + var actualPatchData string + cs := &clientsetfake.Clientset{} + patchCalled := make(chan struct{}, 1) + cs.AddReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + actualPatchRequests++ + patch := action.(clienttesting.PatchAction) + actualPatchData = string(patch.GetPatch()) + patchCalled <- struct{}{} + // For this test, we don't care about the result of the patched pod, just that we got the expected + // patch request, so just returning &v1.Pod{} here is OK because scheduler doesn't use the response. 
+ return true, &v1.Pod{}, nil + }) + + pod := st.MakePod().Name("foo").NominatedNodeName(test.currentNominatedNodeName).Conditions(test.currentPodConditions).Obj() + + logger, ctx := ktesting.NewTestContext(t) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + var apiCacher framework.APICacher + if asyncAPICallsEnabled { + apiDispatcher := apidispatcher.New(cs, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() + + informerFactory := informers.NewSharedInformerFactory(cs, 0) + ar := metrics.NewMetricsAsyncRecorder(10, 1*time.Second, ctx.Done()) + queue := internalqueue.NewSchedulingQueue(nil, informerFactory, internalqueue.WithMetricsRecorder(*ar), internalqueue.WithAPIDispatcher(apiDispatcher)) + apiCacher = apicache.New(queue, nil) + } + + if err := updatePod(ctx, cs, apiCacher, pod, test.newPodCondition, test.newNominatingInfo); err != nil { + t.Fatalf("Error calling update: %v", err) + } + + if test.expectPatchRequest { + select { + case <-patchCalled: + case <-time.After(time.Second): + t.Fatalf("Timed out while waiting for patch to be called") + } + regex, err := regexp.Compile(test.expectedPatchDataPattern) + if err != nil { + t.Fatalf("Error compiling regexp for %v: %v", test.expectedPatchDataPattern, err) + } + if !regex.MatchString(actualPatchData) { + t.Fatalf("Patch data mismatch: Actual was %v, but expected to match regexp %v", actualPatchData, test.expectedPatchDataPattern) + } + } else { + select { + case <-patchCalled: + t.Fatalf("Expected patch not to be called, actual patch data: %v", actualPatchData) + case <-time.After(time.Second): + } + } }) - - pod := st.MakePod().Name("foo").NominatedNodeName(test.currentNominatedNodeName).Conditions(test.currentPodConditions).Obj() - - _, ctx := ktesting.NewTestContext(t) - ctx, cancel := context.WithCancel(ctx) - defer cancel() - if err := updatePod(ctx, cs, pod, test.newPodCondition, test.newNominatingInfo); err != nil { - t.Fatalf("Error calling update: %v", err) - } - - if actualPatchRequests != test.expectedPatchRequests { - t.Fatalf("Actual patch requests (%d) does not equal expected patch requests (%d), actual patch data: %v", actualPatchRequests, test.expectedPatchRequests, actualPatchData) - } - - regex, err := regexp.Compile(test.expectedPatchDataPattern) - if err != nil { - t.Fatalf("Error compiling regexp for %v: %v", test.expectedPatchDataPattern, err) - } - - if test.expectedPatchRequests > 0 && !regex.MatchString(actualPatchData) { - t.Fatalf("Patch data mismatch: Actual was %v, but expected to match regexp %v", actualPatchData, test.expectedPatchDataPattern) - } - }) + } } } @@ -3047,7 +3182,7 @@ func TestSchedulerSchedulePod(t *testing.T) { ctx, cancel := context.WithCancel(ctx) defer cancel() - cache := internalcache.New(ctx, time.Duration(0)) + cache := internalcache.New(ctx, time.Duration(0), nil) for _, pod := range test.pods { cache.AddPod(logger, pod) } @@ -3807,7 +3942,7 @@ func Test_prioritizeNodes(t *testing.T) { _, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() - cache := internalcache.New(ctx, time.Duration(0)) + cache := internalcache.New(ctx, time.Duration(0), nil) for _, node := range test.nodes { cache.AddNode(klog.FromContext(ctx), node) } @@ -3972,11 +4107,10 @@ func TestFairEvaluationForNodes(t *testing.T) { func TestPreferNominatedNodeFilterCallCounts(t *testing.T) { tests := []struct { - name string - pod *v1.Pod - nodeReturnCodeMap map[string]fwk.Code - expectedCount int32 - expectedPatchRequests int + name 
string + pod *v1.Pod + nodeReturnCodeMap map[string]fwk.Code + expectedCount int32 }{ { name: "pod has the nominated node set, filter is called only once", @@ -4006,7 +4140,7 @@ func TestPreferNominatedNodeFilterCallCounts(t *testing.T) { nodes := makeNodeList([]string{"node1", "node2", "node3"}) client := clientsetfake.NewClientset(test.pod) informerFactory := informers.NewSharedInformerFactory(client, 0) - cache := internalcache.New(ctx, time.Duration(0)) + cache := internalcache.New(ctx, time.Duration(0), nil) for _, n := range nodes { cache.AddNode(logger, n) } @@ -4088,7 +4222,7 @@ func makeNodeList(nodeNames []string) []*v1.Node { // makeScheduler makes a simple Scheduler for testing. func makeScheduler(ctx context.Context, nodes []*v1.Node) *Scheduler { logger := klog.FromContext(ctx) - cache := internalcache.New(ctx, time.Duration(0)) + cache := internalcache.New(ctx, time.Duration(0), nil) for _, n := range nodes { cache.AddNode(logger, n) } @@ -4123,13 +4257,28 @@ func makeNode(node string, milliCPU, memory int64, images ...v1.ContainerImage) } } +func interruptOnBind(client *clientsetfake.Clientset) chan *v1.Binding { + bindingChan := make(chan *v1.Binding, 1) + client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { + var b *v1.Binding + if action.GetSubresource() == "binding" { + b := action.(clienttesting.CreateAction).GetObject().(*v1.Binding) + bindingChan <- b + } + return true, b, nil + }) + return bindingChan +} + // queuedPodStore: pods queued before processing. // cache: scheduler cache that might contain assumed pods. -func setupTestSchedulerWithOnePodOnNode(ctx context.Context, t *testing.T, queuedPodStore *clientcache.FIFO, scache internalcache.Cache, - pod *v1.Pod, node *v1.Node, fns ...tf.RegisterPluginFunc) (*Scheduler, chan *v1.Binding, chan error) { - scheduler, bindingChan, errChan := setupTestScheduler(ctx, t, queuedPodStore, scache, nil, nil, fns...) +func setupTestSchedulerWithOnePodOnNode(ctx context.Context, t *testing.T, client clientset.Interface, queuedPodStore *clientcache.FIFO, scache internalcache.Cache, apiDispatcher *apidispatcher.APIDispatcher, + pod *v1.Pod, node *v1.Node, bindingChan chan *v1.Binding, fns ...tf.RegisterPluginFunc) (*Scheduler, chan error) { + scheduler, errChan := setupTestScheduler(ctx, t, client, queuedPodStore, scache, apiDispatcher, nil, nil, fns...) - queuedPodStore.Add(pod) + if err := queuedPodStore.Add(pod); err != nil { + t.Fatal(err) + } // queuedPodStore: [foo:8080] // cache: [] @@ -4149,22 +4298,13 @@ func setupTestSchedulerWithOnePodOnNode(ctx context.Context, t *testing.T, queue case <-time.After(wait.ForeverTestTimeout): t.Fatalf("timeout after %v", wait.ForeverTestTimeout) } - return scheduler, bindingChan, errChan + return scheduler, errChan } // queuedPodStore: pods queued before processing. // scache: scheduler cache that might contain assumed pods. 
-func setupTestScheduler(ctx context.Context, t *testing.T, queuedPodStore *clientcache.FIFO, cache internalcache.Cache, informerFactory informers.SharedInformerFactory, broadcaster events.EventBroadcaster, fns ...tf.RegisterPluginFunc) (*Scheduler, chan *v1.Binding, chan error) { - bindingChan := make(chan *v1.Binding, 1) - client := clientsetfake.NewClientset() - client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - var b *v1.Binding - if action.GetSubresource() == "binding" { - b := action.(clienttesting.CreateAction).GetObject().(*v1.Binding) - bindingChan <- b - } - return true, b, nil - }) +func setupTestScheduler(ctx context.Context, t *testing.T, client clientset.Interface, queuedPodStore *clientcache.FIFO, cache internalcache.Cache, apiDispatcher *apidispatcher.APIDispatcher, + informerFactory informers.SharedInformerFactory, broadcaster events.EventBroadcaster, fns ...tf.RegisterPluginFunc) (*Scheduler, chan error) { var recorder events.EventRecorder if broadcaster != nil { @@ -4176,7 +4316,7 @@ func setupTestScheduler(ctx context.Context, t *testing.T, queuedPodStore *clien if informerFactory == nil { informerFactory = informers.NewSharedInformerFactory(clientsetfake.NewClientset(), 0) } - schedulingQueue := internalqueue.NewTestQueueWithInformerFactory(ctx, nil, informerFactory) + schedulingQueue := internalqueue.NewTestQueueWithInformerFactory(ctx, nil, informerFactory, internalqueue.WithAPIDispatcher(apiDispatcher)) waitingPods := frameworkruntime.NewWaitingPodsMap() snapshot := internalcache.NewEmptySnapshot() @@ -4185,12 +4325,16 @@ func setupTestScheduler(ctx context.Context, t *testing.T, queuedPodStore *clien fns, testSchedulerName, frameworkruntime.WithClientSet(client), + frameworkruntime.WithAPIDispatcher(apiDispatcher), frameworkruntime.WithEventRecorder(recorder), frameworkruntime.WithInformerFactory(informerFactory), frameworkruntime.WithPodNominator(schedulingQueue), frameworkruntime.WithWaitingPods(waitingPods), frameworkruntime.WithSnapshotSharedLister(snapshot), ) + if apiDispatcher != nil { + schedFramework.SetAPICacher(apicache.New(schedulingQueue, cache)) + } errChan := make(chan error, 1) sched := &Scheduler{ @@ -4202,6 +4346,7 @@ func setupTestScheduler(ctx context.Context, t *testing.T, queuedPodStore *clien return &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(t, clientcache.Pop(queuedPodStore).(*v1.Pod))}, nil }, SchedulingQueue: schedulingQueue, + APIDispatcher: apiDispatcher, Profiles: profile.Map{testSchedulerName: schedFramework}, } @@ -4213,10 +4358,10 @@ func setupTestScheduler(ctx context.Context, t *testing.T, queuedPodStore *clien msg := truncateMessage(err.Error()) schedFramework.EventRecorder().Eventf(p.Pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", msg) } - return sched, bindingChan, errChan + return sched, errChan } -func setupTestSchedulerWithVolumeBinding(ctx context.Context, t *testing.T, volumeBinder volumebinding.SchedulerVolumeBinder, broadcaster events.EventBroadcaster) (*Scheduler, chan *v1.Binding, chan error) { +func setupTestSchedulerWithVolumeBinding(ctx context.Context, t *testing.T, volumeBinder volumebinding.SchedulerVolumeBinder, broadcaster events.EventBroadcaster, asyncAPICallsEnabled bool) (*Scheduler, chan *v1.Binding, chan error) { logger := klog.FromContext(ctx) testNode := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1", UID: types.UID("node1")}} queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc) @@ -4224,11 +4369,23 @@ 
func setupTestSchedulerWithVolumeBinding(ctx context.Context, t *testing.T, volu pod.Namespace = "foo-ns" pod.Spec.Volumes = append(pod.Spec.Volumes, v1.Volume{Name: "testVol", VolumeSource: v1.VolumeSource{PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ClaimName: "testPVC"}}}) - queuedPodStore.Add(pod) - scache := internalcache.New(ctx, 10*time.Minute) - scache.AddNode(logger, &testNode) + if err := queuedPodStore.Add(pod); err != nil { + t.Fatal(err) + } + testPVC := v1.PersistentVolumeClaim{ObjectMeta: metav1.ObjectMeta{Name: "testPVC", Namespace: pod.Namespace, UID: types.UID("testPVC")}} client := clientsetfake.NewClientset(&testNode, &testPVC) + bindingChan := interruptOnBind(client) + + var apiDispatcher *apidispatcher.APIDispatcher + if asyncAPICallsEnabled { + apiDispatcher = apidispatcher.New(client, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + t.Cleanup(apiDispatcher.Close) + } + + scache := internalcache.New(ctx, 10*time.Minute, apiDispatcher) + scache.AddNode(logger, &testNode) informerFactory := informers.NewSharedInformerFactory(client, 0) pvcInformer := informerFactory.Core().V1().PersistentVolumeClaims() pvcInformer.Informer().GetStore().Add(&testPVC) @@ -4240,7 +4397,7 @@ func setupTestSchedulerWithVolumeBinding(ctx context.Context, t *testing.T, volu return &volumebinding.VolumeBinding{Binder: volumeBinder, PVCLister: pvcInformer.Lister()}, nil }, "PreFilter", "Filter", "Reserve", "PreBind"), } - s, bindingChan, errChan := setupTestScheduler(ctx, t, queuedPodStore, scache, informerFactory, broadcaster, fns...) + s, errChan := setupTestScheduler(ctx, t, client, queuedPodStore, scache, apiDispatcher, informerFactory, broadcaster, fns...) return s, bindingChan, errChan } diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 555406abc1f..bd7f22f706c 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -40,10 +40,13 @@ import ( "k8s.io/kubernetes/pkg/features" schedulerapi "k8s.io/kubernetes/pkg/scheduler/apis/config" "k8s.io/kubernetes/pkg/scheduler/apis/config/scheme" + "k8s.io/kubernetes/pkg/scheduler/backend/api_cache" + "k8s.io/kubernetes/pkg/scheduler/backend/api_dispatcher" internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache" cachedebugger "k8s.io/kubernetes/pkg/scheduler/backend/cache/debugger" internalqueue "k8s.io/kubernetes/pkg/scheduler/backend/queue" "k8s.io/kubernetes/pkg/scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/framework/api_calls" "k8s.io/kubernetes/pkg/scheduler/framework/parallelize" frameworkplugins "k8s.io/kubernetes/pkg/scheduler/framework/plugins" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/dynamicresources" @@ -93,6 +96,12 @@ type Scheduler struct { // SchedulingQueue holds pods to be scheduled SchedulingQueue internalqueue.SchedulingQueue + // If possible, indirect operation on APIDispatcher, e.g. through SchedulingQueue, is preferred. + // Is nil iff SchedulerAsyncAPICalls feature gate is disabled. + // Adding a call to APIDispatcher should not be done directly by in-tree usages. + // framework.APICache should be used instead. + APIDispatcher *apidispatcher.APIDispatcher + // Profiles are the scheduling profiles. 
Profiles profile.Map @@ -331,6 +340,10 @@ func New(ctx context.Context, } draManager = dynamicresources.NewDRAManager(ctx, resourceClaimCache, resourceSliceTracker, informerFactory) } + var apiDispatcher *apidispatcher.APIDispatcher + if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerAsyncAPICalls) { + apiDispatcher = apidispatcher.New(client, int(options.parallelism), apicalls.Relevances) + } profiles, err := profile.NewMap(ctx, options.profiles, registry, recorderFactory, frameworkruntime.WithComponentConfigVersion(options.componentConfigVersion), @@ -344,6 +357,7 @@ func New(ctx context.Context, frameworkruntime.WithExtenders(extenders), frameworkruntime.WithMetricsRecorder(metricsRecorder), frameworkruntime.WithWaitingPods(waitingPods), + frameworkruntime.WithAPIDispatcher(apiDispatcher), ) if err != nil { return nil, fmt.Errorf("initializing profiles: %v", err) @@ -385,15 +399,22 @@ func New(ctx context.Context, internalqueue.WithQueueingHintMapPerProfile(queueingHintsPerProfile), internalqueue.WithPluginMetricsSamplePercent(pluginMetricsSamplePercent), internalqueue.WithMetricsRecorder(*metricsRecorder), + internalqueue.WithAPIDispatcher(apiDispatcher), ) + schedulerCache := internalcache.New(ctx, durationToExpireAssumedPod, apiDispatcher) + + var apiCache framework.APICacher + if apiDispatcher != nil { + apiCache = apicache.New(podQueue, schedulerCache) + } + for _, fwk := range profiles { fwk.SetPodNominator(podQueue) fwk.SetPodActivator(podQueue) + fwk.SetAPICacher(apiCache) } - schedulerCache := internalcache.New(ctx, durationToExpireAssumedPod) - // Setup cache debugger. debugger := cachedebugger.New(nodeLister, podLister, schedulerCache, podQueue) debugger.ListenForSignal(ctx) @@ -408,6 +429,7 @@ func New(ctx context.Context, SchedulingQueue: podQueue, Profiles: profiles, logger: logger, + APIDispatcher: apiDispatcher, } sched.NextPod = podQueue.Pop sched.applyDefaultHandlers() @@ -499,6 +521,10 @@ func (sched *Scheduler) Run(ctx context.Context) { logger := klog.FromContext(ctx) sched.SchedulingQueue.Run(logger) + if sched.APIDispatcher != nil { + go sched.APIDispatcher.Run(logger) + } + // We need to start scheduleOne loop in a dedicated goroutine, // because scheduleOne function hangs on getting the next item // from the SchedulingQueue. @@ -508,6 +534,9 @@ func (sched *Scheduler) Run(ctx context.Context) { go wait.UntilWithContext(ctx, sched.ScheduleOne, 0) <-ctx.Done() + if sched.APIDispatcher != nil { + sched.APIDispatcher.Close() + } sched.SchedulingQueue.Close() // If the plugins satisfy the io.Closer interface, they are closed. 
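The scheduler.go changes above reduce to one lifecycle pattern: the dispatcher is created only when the SchedulerAsyncAPICalls feature gate is enabled, it is started alongside the scheduling queue in Run, and it is closed once the scheduler's context is done, so every consumer has to tolerate a nil dispatcher. The following self-contained Go sketch illustrates that optional-component pattern under simplified assumptions; asyncDispatcher, newAsyncDispatcher, and runScheduler are illustrative names, not the scheduler's real types or functions.

package sketch

import (
	"context"
	"fmt"
)

// asyncDispatcher is an illustrative stand-in for an optional asynchronous
// API-call dispatcher; it is not the scheduler's real APIDispatcher type.
type asyncDispatcher struct {
	cancel context.CancelFunc
	calls  chan string
}

func newAsyncDispatcher() *asyncDispatcher {
	return &asyncDispatcher{calls: make(chan string, 16)}
}

// Run creates the cancellable context up front and then starts the worker
// goroutine, so Close always has a cancel function to call.
func (d *asyncDispatcher) Run() {
	ctx, cancel := context.WithCancel(context.Background())
	d.cancel = cancel
	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			case call := <-d.calls:
				fmt.Println("executing API call:", call)
			}
		}
	}()
}

func (d *asyncDispatcher) Close() { d.cancel() }

// runScheduler mirrors the feature-gated wiring: the dispatcher exists only
// when the gate is enabled, and all shutdown paths tolerate a nil dispatcher.
func runScheduler(ctx context.Context, asyncAPICallsEnabled bool) {
	var dispatcher *asyncDispatcher
	if asyncAPICallsEnabled {
		dispatcher = newAsyncDispatcher()
		dispatcher.Run()
	}

	<-ctx.Done()

	if dispatcher != nil {
		dispatcher.Close()
	}
}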
diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index dd22b9a130f..27c6524c03e 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -53,9 +53,12 @@ import ( "k8s.io/kubernetes/pkg/features" schedulerapi "k8s.io/kubernetes/pkg/scheduler/apis/config" "k8s.io/kubernetes/pkg/scheduler/apis/config/testing/defaults" + "k8s.io/kubernetes/pkg/scheduler/backend/api_cache" + "k8s.io/kubernetes/pkg/scheduler/backend/api_dispatcher" internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache" internalqueue "k8s.io/kubernetes/pkg/scheduler/backend/queue" "k8s.io/kubernetes/pkg/scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/framework/api_calls" "k8s.io/kubernetes/pkg/scheduler/framework/plugins" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort" @@ -282,97 +285,125 @@ func TestFailureHandler(t *testing.T) { }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, tt := range tests { + t.Run(fmt.Sprintf("%s (Async API calls enabled: %v)", tt.name, asyncAPICallsEnabled), func(t *testing.T) { + logger, ctx := ktesting.NewTestContext(t) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + client := fake.NewClientset(&v1.PodList{Items: []v1.Pod{*testPod}}) + informerFactory := informers.NewSharedInformerFactory(client, 0) + podInformer := informerFactory.Core().V1().Pods() + // Need to add/update/delete testPod to the store. + if err := podInformer.Informer().GetStore().Add(testPod); err != nil { + t.Fatal(err) + } + + var apiDispatcher *apidispatcher.APIDispatcher + if asyncAPICallsEnabled { + apiDispatcher = apidispatcher.New(client, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() + } + + recorder := metrics.NewMetricsAsyncRecorder(3, 20*time.Microsecond, ctx.Done()) + queue := internalqueue.NewPriorityQueue(nil, informerFactory, internalqueue.WithClock(testingclock.NewFakeClock(time.Now())), internalqueue.WithMetricsRecorder(*recorder), internalqueue.WithAPIDispatcher(apiDispatcher)) + schedulerCache := internalcache.New(ctx, 30*time.Second, apiDispatcher) + + queue.Add(logger, testPod) + + if _, err := queue.Pop(logger); err != nil { + t.Fatalf("Pop failed: %v", err) + } + + if tt.podUpdatedDuringScheduling { + if err := podInformer.Informer().GetStore().Update(testPodUpdated); err != nil { + t.Fatal(err) + } + queue.Update(logger, testPod, testPodUpdated) + } + if tt.podDeletedDuringScheduling { + if err := podInformer.Informer().GetStore().Delete(testPod); err != nil { + t.Fatal(err) + } + queue.Delete(testPod) + } + + s, schedFramework, err := initScheduler(ctx, schedulerCache, queue, apiDispatcher, client, informerFactory) + if err != nil { + t.Fatal(err) + } + + testPodInfo := &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(t, testPod)} + s.FailureHandler(ctx, schedFramework, testPodInfo, fwk.NewStatus(fwk.Unschedulable), nil, time.Now()) + + var got *v1.Pod + if tt.podUpdatedDuringScheduling { + pInfo, ok := queue.GetPod(testPod.Name, testPod.Namespace) + if !ok { + t.Fatalf("Failed to get pod %s/%s from queue", testPod.Namespace, testPod.Name) + } + got = pInfo.Pod + } else { + got = getPodFromPriorityQueue(queue, testPod) + } + + if diff := cmp.Diff(tt.expect, got); diff != "" { + t.Errorf("Unexpected pod (-want, +got): %s", diff) + } + }) + } + } +} + +func TestFailureHandler_PodAlreadyBound(t *testing.T) { + for _, 
asyncAPICallsEnabled := range []bool{true, false} { + t.Run(fmt.Sprintf("Async API calls enabled: %v", asyncAPICallsEnabled), func(t *testing.T) { logger, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() - client := fake.NewClientset(&v1.PodList{Items: []v1.Pod{*testPod}}) + nodeFoo := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "foo"}} + testPod := st.MakePod().Name("test-pod").Namespace(v1.NamespaceDefault).Node("foo").Obj() + + client := fake.NewClientset(&v1.PodList{Items: []v1.Pod{*testPod}}, &v1.NodeList{Items: []v1.Node{nodeFoo}}) informerFactory := informers.NewSharedInformerFactory(client, 0) podInformer := informerFactory.Core().V1().Pods() - // Need to add/update/delete testPod to the store. - podInformer.Informer().GetStore().Add(testPod) - - recorder := metrics.NewMetricsAsyncRecorder(3, 20*time.Microsecond, ctx.Done()) - queue := internalqueue.NewPriorityQueue(nil, informerFactory, internalqueue.WithClock(testingclock.NewFakeClock(time.Now())), internalqueue.WithMetricsRecorder(*recorder)) - schedulerCache := internalcache.New(ctx, 30*time.Second) - - queue.Add(logger, testPod) - - if _, err := queue.Pop(logger); err != nil { - t.Fatalf("Pop failed: %v", err) + // Need to add testPod to the store. + if err := podInformer.Informer().GetStore().Add(testPod); err != nil { + t.Fatal(err) } - if tt.podUpdatedDuringScheduling { - podInformer.Informer().GetStore().Update(testPodUpdated) - queue.Update(logger, testPod, testPodUpdated) - } - if tt.podDeletedDuringScheduling { - podInformer.Informer().GetStore().Delete(testPod) - queue.Delete(testPod) + var apiDispatcher *apidispatcher.APIDispatcher + if asyncAPICallsEnabled { + apiDispatcher = apidispatcher.New(client, 16, apicalls.Relevances) + apiDispatcher.Run(logger) + defer apiDispatcher.Close() } - s, schedFramework, err := initScheduler(ctx, schedulerCache, queue, client, informerFactory) + queue := internalqueue.NewPriorityQueue(nil, informerFactory, internalqueue.WithClock(testingclock.NewFakeClock(time.Now())), internalqueue.WithAPIDispatcher(apiDispatcher)) + schedulerCache := internalcache.New(ctx, 30*time.Second, apiDispatcher) + + // Add node to schedulerCache no matter it's deleted in API server or not. 
+ schedulerCache.AddNode(logger, &nodeFoo) + + s, schedFramework, err := initScheduler(ctx, schedulerCache, queue, apiDispatcher, client, informerFactory) if err != nil { t.Fatal(err) } testPodInfo := &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(t, testPod)} - s.FailureHandler(ctx, schedFramework, testPodInfo, fwk.NewStatus(fwk.Unschedulable), nil, time.Now()) + s.FailureHandler(ctx, schedFramework, testPodInfo, fwk.NewStatus(fwk.Unschedulable).WithError(fmt.Errorf("binding rejected: timeout")), nil, time.Now()) - var got *v1.Pod - if tt.podUpdatedDuringScheduling { - pInfo, ok := queue.GetPod(testPod.Name, testPod.Namespace) - if !ok { - t.Fatalf("Failed to get pod %s/%s from queue", testPod.Namespace, testPod.Name) - } - got = pInfo.Pod - } else { - got = getPodFromPriorityQueue(queue, testPod) - } - - if diff := cmp.Diff(tt.expect, got); diff != "" { - t.Errorf("Unexpected pod (-want, +got): %s", diff) + pod := getPodFromPriorityQueue(queue, testPod) + if pod != nil { + t.Fatalf("Unexpected pod: %v should not be in PriorityQueue when the NodeName of pod is not empty", pod.Name) } }) } } -func TestFailureHandler_PodAlreadyBound(t *testing.T) { - logger, ctx := ktesting.NewTestContext(t) - ctx, cancel := context.WithCancel(ctx) - defer cancel() - - nodeFoo := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "foo"}} - testPod := st.MakePod().Name("test-pod").Namespace(v1.NamespaceDefault).Node("foo").Obj() - - client := fake.NewClientset(&v1.PodList{Items: []v1.Pod{*testPod}}, &v1.NodeList{Items: []v1.Node{nodeFoo}}) - informerFactory := informers.NewSharedInformerFactory(client, 0) - podInformer := informerFactory.Core().V1().Pods() - // Need to add testPod to the store. - podInformer.Informer().GetStore().Add(testPod) - - queue := internalqueue.NewPriorityQueue(nil, informerFactory, internalqueue.WithClock(testingclock.NewFakeClock(time.Now()))) - schedulerCache := internalcache.New(ctx, 30*time.Second) - - // Add node to schedulerCache no matter it's deleted in API server or not. - schedulerCache.AddNode(logger, &nodeFoo) - - s, schedFramework, err := initScheduler(ctx, schedulerCache, queue, client, informerFactory) - if err != nil { - t.Fatal(err) - } - - testPodInfo := &framework.QueuedPodInfo{PodInfo: mustNewPodInfo(t, testPod)} - s.FailureHandler(ctx, schedFramework, testPodInfo, fwk.NewStatus(fwk.Unschedulable).WithError(fmt.Errorf("binding rejected: timeout")), nil, time.Now()) - - pod := getPodFromPriorityQueue(queue, testPod) - if pod != nil { - t.Fatalf("Unexpected pod: %v should not be in PriorityQueue when the NodeName of pod is not empty", pod.Name) - } -} - // TestWithPercentageOfNodesToScore tests scheduler's PercentageOfNodesToScore is set correctly. 
func TestWithPercentageOfNodesToScore(t *testing.T) { tests := []struct { @@ -445,7 +476,7 @@ func getPodFromPriorityQueue(queue *internalqueue.PriorityQueue, pod *v1.Pod) *v return nil } -func initScheduler(ctx context.Context, cache internalcache.Cache, queue internalqueue.SchedulingQueue, +func initScheduler(ctx context.Context, cache internalcache.Cache, queue internalqueue.SchedulingQueue, apiDispatcher *apidispatcher.APIDispatcher, client kubernetes.Interface, informerFactory informers.SharedInformerFactory) (*Scheduler, framework.Framework, error) { logger := klog.FromContext(ctx) registerPluginFuncs := []tf.RegisterPluginFunc{ @@ -458,6 +489,7 @@ func initScheduler(ctx context.Context, cache internalcache.Cache, queue interna registerPluginFuncs, testSchedulerName, frameworkruntime.WithClientSet(client), + frameworkruntime.WithAPIDispatcher(apiDispatcher), frameworkruntime.WithInformerFactory(informerFactory), frameworkruntime.WithEventRecorder(eventBroadcaster.NewRecorder(scheme.Scheme, testSchedulerName)), frameworkruntime.WithWaitingPods(waitingPods), @@ -465,12 +497,16 @@ func initScheduler(ctx context.Context, cache internalcache.Cache, queue interna if err != nil { return nil, nil, err } + if apiDispatcher != nil { + fwk.SetAPICacher(apicache.New(queue, cache)) + } s := &Scheduler{ Cache: cache, client: client, StopEverything: ctx.Done(), SchedulingQueue: queue, + APIDispatcher: apiDispatcher, Profiles: profile.Map{testSchedulerName: fwk}, logger: logger, } diff --git a/pkg/scheduler/util/utils.go b/pkg/scheduler/util/utils.go index f04d5e1dc9a..acaaaabb4d7 100644 --- a/pkg/scheduler/util/utils.go +++ b/pkg/scheduler/util/utils.go @@ -26,7 +26,6 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/net" "k8s.io/apimachinery/pkg/util/strategicpatch" "k8s.io/client-go/kubernetes" @@ -101,12 +100,12 @@ func Retriable(err error) bool { // PatchPodStatus calculates the delta bytes change from <oldStatus> to <newStatus>, // and then submit a request to API server to patch the pod changes. 
-func PatchPodStatus(ctx context.Context, cs kubernetes.Interface, old *v1.Pod, newStatus *v1.PodStatus) error { +func PatchPodStatus(ctx context.Context, cs kubernetes.Interface, name string, namespace string, oldStatus *v1.PodStatus, newStatus *v1.PodStatus) error { if newStatus == nil { return nil } - oldData, err := json.Marshal(v1.Pod{Status: old.Status}) + oldData, err := json.Marshal(v1.Pod{Status: *oldStatus}) if err != nil { return err } @@ -117,7 +116,7 @@ func PatchPodStatus(ctx context.Context, cs kubernetes.Interface, old *v1.Pod, n } patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, &v1.Pod{}) if err != nil { - return fmt.Errorf("failed to create merge patch for pod %q/%q: %v", old.Namespace, old.Name, err) + return fmt.Errorf("failed to create merge patch for pod %q/%q: %w", namespace, name, err) } if "{}" == string(patchBytes) { @@ -125,7 +124,7 @@ func PatchPodStatus(ctx context.Context, cs kubernetes.Interface, old *v1.Pod, n } patchFn := func() error { - _, err := cs.CoreV1().Pods(old.Namespace).Patch(ctx, old.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status") + _, err := cs.CoreV1().Pods(namespace).Patch(ctx, name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status") return err } @@ -137,23 +136,6 @@ func DeletePod(ctx context.Context, cs kubernetes.Interface, pod *v1.Pod) error return cs.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}) } -// ClearNominatedNodeName internally submit a patch request to API server -// to set each pods[*].Status.NominatedNodeName> to "". -func ClearNominatedNodeName(ctx context.Context, cs kubernetes.Interface, pods ...*v1.Pod) utilerrors.Aggregate { - var errs []error - for _, p := range pods { - if len(p.Status.NominatedNodeName) == 0 { - continue - } - podStatusCopy := p.Status.DeepCopy() - podStatusCopy.NominatedNodeName = "" - if err := PatchPodStatus(ctx, cs, p, podStatusCopy); err != nil { - errs = append(errs, err) - } - } - return utilerrors.NewAggregate(errs) -} - // IsScalarResourceName validates the resource for Extended, Hugepages, Native and AttachableVolume resources func IsScalarResourceName(name v1.ResourceName) bool { return v1helper.IsExtendedResourceName(name) || v1helper.IsHugePageResourceName(name) || diff --git a/pkg/scheduler/util/utils_test.go b/pkg/scheduler/util/utils_test.go index 8af69a99568..59c0f3d1a19 100644 --- a/pkg/scheduler/util/utils_test.go +++ b/pkg/scheduler/util/utils_test.go @@ -161,63 +161,6 @@ func TestMoreImportantPod(t *testing.T) { } } -func TestRemoveNominatedNodeName(t *testing.T) { - tests := []struct { - name string - currentNominatedNodeName string - newNominatedNodeName string - expectedPatchRequests int - expectedPatchData string - }{ - { - name: "Should make patch request to clear node name", - currentNominatedNodeName: "node1", - expectedPatchRequests: 1, - expectedPatchData: `{"status":{"nominatedNodeName":null}}`, - }, - { - name: "Should not make patch request if nominated node is already cleared", - currentNominatedNodeName: "", - expectedPatchRequests: 0, - }, - } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - _, ctx := ktesting.NewTestContext(t) - actualPatchRequests := 0 - var actualPatchData string - cs := &clientsetfake.Clientset{} - cs.AddReactor("patch", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) { - actualPatchRequests++ - patch := action.(clienttesting.PatchAction) - actualPatchData = string(patch.GetPatch()) - // 
For this test, we don't care about the result of the patched pod, just that we got the expected - // patch request, so just returning &v1.Pod{} here is OK because scheduler doesn't use the response. - return true, &v1.Pod{}, nil - }) - - pod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: "foo"}, - Status: v1.PodStatus{NominatedNodeName: test.currentNominatedNodeName}, - } - - ctx, cancel := context.WithCancel(ctx) - defer cancel() - if err := ClearNominatedNodeName(ctx, cs, pod); err != nil { - t.Fatalf("Error calling removeNominatedNodeName: %v", err) - } - - if actualPatchRequests != test.expectedPatchRequests { - t.Fatalf("Actual patch requests (%d) dos not equal expected patch requests (%d)", actualPatchRequests, test.expectedPatchRequests) - } - - if test.expectedPatchRequests > 0 && actualPatchData != test.expectedPatchData { - t.Fatalf("Patch data mismatch: Actual was %v, but expected %v", actualPatchData, test.expectedPatchData) - } - }) - } -} - func TestPatchPodStatus(t *testing.T) { tests := []struct { name string @@ -366,7 +309,7 @@ func TestPatchPodStatus(t *testing.T) { _, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() - err = PatchPodStatus(ctx, client, &tc.pod, &tc.statusToUpdate) + err = PatchPodStatus(ctx, client, tc.pod.Name, tc.pod.Namespace, &tc.pod.Status, &tc.statusToUpdate) if err != nil && tc.validateErr == nil { // shouldn't be error t.Fatal(err) diff --git a/test/integration/scheduler/bind/bind_test.go b/test/integration/scheduler/bind/bind_test.go index d857485775a..fa872a1ebab 100644 --- a/test/integration/scheduler/bind/bind_test.go +++ b/test/integration/scheduler/bind/bind_test.go @@ -17,54 +17,70 @@ limitations under the License. package bind import ( + "fmt" "testing" corev1 "k8s.io/api/core/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" + featuregatetesting "k8s.io/component-base/featuregate/testing" fwk "k8s.io/kube-scheduler/framework" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/scheduler/framework" st "k8s.io/kubernetes/pkg/scheduler/testing" testutil "k8s.io/kubernetes/test/integration/util" + "k8s.io/kubernetes/test/utils/ktesting" ) // TestDefaultBinder tests the binding process in the scheduler. func TestDefaultBinder(t *testing.T) { - testCtx := testutil.InitTestSchedulerWithOptions(t, testutil.InitTestAPIServer(t, "", nil), 0) - testutil.SyncSchedulerInformerFactory(testCtx) + for _, asyncAPICallsEnabled := range []bool{true, false} { + t.Run(fmt.Sprintf("Async API calls enabled: %v", asyncAPICallsEnabled), func(t *testing.T) { + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncAPICalls, asyncAPICallsEnabled) - // Add a node. 
- node, err := testutil.CreateNode(testCtx.ClientSet, st.MakeNode().Name("testnode").Obj()) - if err != nil { - t.Fatal(err) - } + testCtx := testutil.InitTestSchedulerWithOptions(t, testutil.InitTestAPIServer(t, "", nil), 0) + testutil.SyncSchedulerInformerFactory(testCtx) + if testCtx.Scheduler.APIDispatcher != nil { + logger, _ := ktesting.NewTestContext(t) + testCtx.Scheduler.APIDispatcher.Run(logger) + defer testCtx.Scheduler.APIDispatcher.Close() + } - tests := map[string]struct { - anotherUID bool - wantStatusCode fwk.Code - }{ - "same UID": { - wantStatusCode: fwk.Success, - }, - "different UID": { - anotherUID: true, - wantStatusCode: fwk.Error, - }, - } - for name, tc := range tests { - t.Run(name, func(t *testing.T) { - pod, err := testutil.CreatePausePodWithResource(testCtx.ClientSet, "fixed-name", testCtx.NS.Name, nil) + // Add a node. + node, err := testutil.CreateNode(testCtx.ClientSet, st.MakeNode().Name("testnode").Obj()) if err != nil { - t.Fatalf("Failed to create pod: %v", err) - } - defer testutil.CleanupPods(testCtx.Ctx, testCtx.ClientSet, t, []*corev1.Pod{pod}) - - podCopy := pod.DeepCopy() - if tc.anotherUID { - podCopy.UID = "another" + t.Fatal(err) } - status := testCtx.Scheduler.Profiles["default-scheduler"].RunBindPlugins(testCtx.Ctx, framework.NewCycleState(), podCopy, node.Name) - if code := status.Code(); code != tc.wantStatusCode { - t.Errorf("Bind returned code %s, want %s", code, tc.wantStatusCode) + tests := map[string]struct { + anotherUID bool + wantStatusCode fwk.Code + }{ + "same UID": { + wantStatusCode: fwk.Success, + }, + "different UID": { + anotherUID: true, + wantStatusCode: fwk.Error, + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + pod, err := testutil.CreatePausePodWithResource(testCtx.ClientSet, "fixed-name", testCtx.NS.Name, nil) + if err != nil { + t.Fatalf("Failed to create pod: %v", err) + } + defer testutil.CleanupPods(testCtx.Ctx, testCtx.ClientSet, t, []*corev1.Pod{pod}) + + podCopy := pod.DeepCopy() + if tc.anotherUID { + podCopy.UID = "another" + } + + status := testCtx.Scheduler.Profiles["default-scheduler"].RunBindPlugins(testCtx.Ctx, framework.NewCycleState(), podCopy, node.Name) + if code := status.Code(); code != tc.wantStatusCode { + t.Errorf("Bind returned code %s, want %s", code, tc.wantStatusCode) + } + }) } }) } diff --git a/test/integration/scheduler/eventhandler/eventhandler_test.go b/test/integration/scheduler/eventhandler/eventhandler_test.go index f2de5024314..5d4b0f035e0 100644 --- a/test/integration/scheduler/eventhandler/eventhandler_test.go +++ b/test/integration/scheduler/eventhandler/eventhandler_test.go @@ -212,80 +212,89 @@ func TestUpdateNominatedNodeName(t *testing.T) { for _, tt := range tests { for _, qHintEnabled := range []bool{false, true} { - t.Run(fmt.Sprintf("%s, with queuehint(%v)", tt.name, qHintEnabled), func(t *testing.T) { - if !qHintEnabled { - featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, utilfeature.DefaultFeatureGate, version.MustParse("1.33")) - featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerQueueingHints, false) + for _, asyncAPICallsEnabled := range []bool{false, true} { + if !qHintEnabled && asyncAPICallsEnabled { + // This can't happen. + continue } - // Set the SchedulerPopFromBackoffQ feature to false, because when it's enabled, we can't be sure the pod won't be popped from the backoffQ. 
- featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerPopFromBackoffQ, false) + t.Run(fmt.Sprintf("%s (Queueing hints enabled: %v, Async API calls enabled: %v)", tt.name, qHintEnabled, asyncAPICallsEnabled), func(t *testing.T) { + if !qHintEnabled { + featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, utilfeature.DefaultFeatureGate, version.MustParse("1.33")) + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerQueueingHints, false) + } else { + // Handle SchedulerAsyncAPICalls feature only in 1.34+. + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncAPICalls, asyncAPICallsEnabled) + } + // Set the SchedulerPopFromBackoffQ feature to false, because when it's enabled, we can't be sure the pod won't be popped from the backoffQ. + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerPopFromBackoffQ, false) - testCtx, teardown := schedulerutils.InitTestSchedulerForFrameworkTest(t, testContext, 0, true, - scheduler.WithClock(fakeClock), - // UpdateFunc needs to be called when the nominated pod is still in the backoff queue, thus small, but non 0 value. - scheduler.WithPodInitialBackoffSeconds(int64(testBackoff.Seconds())), - scheduler.WithPodMaxBackoffSeconds(int64(testBackoff.Seconds())), - ) - defer teardown() + testCtx, teardown := schedulerutils.InitTestSchedulerForFrameworkTest(t, testContext, 0, true, + scheduler.WithClock(fakeClock), + // UpdateFunc needs to be called when the nominated pod is still in the backoff queue, thus small, but non 0 value. + scheduler.WithPodInitialBackoffSeconds(int64(testBackoff.Seconds())), + scheduler.WithPodMaxBackoffSeconds(int64(testBackoff.Seconds())), + ) + defer teardown() - _, err := testutils.CreateNode(testCtx.ClientSet, testNode) - if err != nil { - t.Fatalf("Creating node error: %v", err) - } + _, err := testutils.CreateNode(testCtx.ClientSet, testNode) + if err != nil { + t.Fatalf("Creating node error: %v", err) + } - // Ensure node is present in scheduler cache. - if err := testutils.WaitForNodesInCache(testCtx.Ctx, testCtx.Scheduler, 1); err != nil { - t.Fatalf("Waiting for node in cache error: %v", err) - } + // Ensure node is present in scheduler cache. + if err := testutils.WaitForNodesInCache(testCtx.Ctx, testCtx.Scheduler, 1); err != nil { + t.Fatalf("Waiting for node in cache error: %v", err) + } - // Create initial low-priority pod and wait until it's scheduled. - pod, err := testutils.CreatePausePod(testCtx.ClientSet, podLow) - if err != nil { - t.Fatalf("Creating pod error: %v", err) - } - if err := testutils.WaitForPodToSchedule(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { - t.Fatalf("Pod %v was not scheduled: %v", pod.Name, err) - } + // Create initial low-priority pod and wait until it's scheduled. + pod, err := testutils.CreatePausePod(testCtx.ClientSet, podLow) + if err != nil { + t.Fatalf("Creating pod error: %v", err) + } + if err := testutils.WaitForPodToSchedule(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { + t.Fatalf("Pod %v was not scheduled: %v", pod.Name, err) + } - // Create mid-priority pod and wait until it becomes nominated (preempt low-priority pod) and remain uschedulable. 
- pod, err = testutils.CreatePausePod(testCtx.ClientSet, podMidNominated) - if err != nil { - t.Fatalf("Creating pod error: %v", err) - } - if err := testutils.WaitForNominatedNodeName(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { - t.Errorf("NominatedNodeName field was not set for pod %v: %v", pod.Name, err) - } - if err := testutils.WaitForPodUnschedulable(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { - t.Errorf("Pod %v haven't become unschedulabe: %v", pod.Name, err) - } + // Create a mid-priority pod and wait until it becomes nominated (preempting the low-priority pod) and remains unschedulable. + pod, err = testutils.CreatePausePod(testCtx.ClientSet, podMidNominated) + if err != nil { + t.Fatalf("Creating pod error: %v", err) + } + if err := testutils.WaitForNominatedNodeName(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { + t.Errorf("NominatedNodeName field was not set for pod %v: %v", pod.Name, err) + } + if err := testutils.WaitForPodUnschedulable(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { + t.Errorf("Pod %v hasn't become unschedulable: %v", pod.Name, err) + } - // Remove the initial low-priority pod, which will move the nominated unschedulable pod back to the backoff queue. - if err := testutils.DeletePod(testCtx.ClientSet, podLow.Name, podLow.Namespace); err != nil { - t.Fatalf("Deleting pod error: %v", err) - } + // Remove the initial low-priority pod, which will move the nominated unschedulable pod back to the backoff queue. + if err := testutils.DeletePod(testCtx.ClientSet, podLow.Name, podLow.Namespace); err != nil { + t.Fatalf("Deleting pod error: %v", err) + } - // Create another low-priority pods which cannot be scheduled because the mid-priority pod is nominated on the node and the node doesn't have enough resource to have two pods both. - pod, err = testutils.CreatePausePod(testCtx.ClientSet, podLow) - if err != nil { - t.Fatalf("Creating pod error: %v", err) - } - if err := testutils.WaitForPodUnschedulable(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { - t.Fatalf("Pod %v was not scheduled: %v", pod.Name, err) - } + // Create another low-priority pod which cannot be scheduled because the mid-priority pod is nominated on the node and the node doesn't have enough resources for both pods. + pod, err = testutils.CreatePausePod(testCtx.ClientSet, podLow) + if err != nil { + t.Fatalf("Creating pod error: %v", err) + } + if err := testutils.WaitForPodUnschedulable(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { + t.Fatalf("Pod %v was not scheduled: %v", pod.Name, err) + } - // Update causing the nominated pod to be removed or to get its nominated node name removed, which should trigger scheduling of the low priority pod. - // Note that the update has to happen since the nominated pod is still in the backoffQ to actually test updates of nominated, but not bound yet pods. - tt.updateFunc(testCtx) + // Update causing the nominated pod to be removed or to get its nominated node name removed, which should trigger scheduling of the low priority pod. + // Note that the update has to happen since the nominated pod is still in the backoffQ to actually test updates of nominated, but not bound yet pods. + tt.updateFunc(testCtx) - // Advance time by the 2 * maxPodBackoffSeconds to move low priority pod out of the backoff queue. - fakeClock.Step(2 * testBackoff) + // Advance time by the 2 * maxPodBackoffSeconds to move low priority pod out of the backoff queue. 
+ fakeClock.Step(2 * testBackoff) - // Expect the low-priority pod is notified about unnominated mid-pririty pod and gets scheduled, as it should fit this time. - if err := testutils.WaitForPodToSchedule(testCtx.Ctx, testCtx.ClientSet, podLow); err != nil { - t.Fatalf("Pod %v was not scheduled: %v", podLow.Name, err) - } - testutils.CleanupPods(testCtx.Ctx, testCtx.ClientSet, t, cleanupPods) - }) + // Expect the low-priority pod to be notified about the unnominated mid-priority pod and get scheduled, as it should fit this time. + if err := testutils.WaitForPodToSchedule(testCtx.Ctx, testCtx.ClientSet, podLow); err != nil { + t.Fatalf("Pod %v was not scheduled: %v", podLow.Name, err) + } + testutils.CleanupPods(testCtx.Ctx, testCtx.ClientSet, t, cleanupPods) + }) + } } } } diff --git a/test/integration/scheduler/preemption/preemption_test.go b/test/integration/scheduler/preemption/preemption_test.go index 218e04f1803..7cd6bd41220 100644 --- a/test/integration/scheduler/preemption/preemption_test.go +++ b/test/integration/scheduler/preemption/preemption_test.go @@ -171,16 +171,6 @@ func TestPreemption(t *testing.T) { }}, }) - testCtx := testutils.InitTestSchedulerWithOptions(t, - testutils.InitTestAPIServer(t, "preemption", nil), - 0, - scheduler.WithProfiles(cfg.Profiles...), - scheduler.WithFrameworkOutOfTreeRegistry(registry)) - testutils.SyncSchedulerInformerFactory(testCtx) - go testCtx.Scheduler.Run(testCtx.Ctx) - - cs := testCtx.ClientSet - defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{ v1.ResourceCPU: *resource.NewMilliQuantity(100, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)}, @@ -201,9 +191,8 @@ func TestPreemption(t *testing.T) { initTokens: maxTokens, existingPods: []*v1.Pod{ initPausePod(&testutils.PausePodConfig{ - Name: "victim-pod", - Namespace: testCtx.NS.Name, - Priority: &lowPriority, + Name: "victim-pod", + Priority: &lowPriority, Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{ v1.ResourceCPU: *resource.NewMilliQuantity(400, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)}, @@ -211,9 +200,8 @@ func TestPreemption(t *testing.T) { }), }, pod: initPausePod(&testutils.PausePodConfig{ - Name: "preemptor-pod", - Namespace: testCtx.NS.Name, - Priority: &highPriority, + Name: "preemptor-pod", + Priority: &highPriority, Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{ v1.ResourceCPU: *resource.NewMilliQuantity(300, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)}, @@ -226,9 +214,8 @@ func TestPreemption(t *testing.T) { initTokens: 1, existingPods: []*v1.Pod{ initPausePod(&testutils.PausePodConfig{ - Name: "victim-pod", - Namespace: testCtx.NS.Name, - Priority: &lowPriority, + Name: "victim-pod", + Priority: &lowPriority, Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{ v1.ResourceCPU: *resource.NewMilliQuantity(200, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)}, @@ -236,9 +223,8 @@ func TestPreemption(t *testing.T) { }), }, pod: initPausePod(&testutils.PausePodConfig{ - Name: "preemptor-pod", - Namespace: testCtx.NS.Name, - Priority: &highPriority, + Name: "preemptor-pod", + Priority: &highPriority, Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{ v1.ResourceCPU: *resource.NewMilliQuantity(200, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)}, @@ -256,9 +242,8 @@ func TestPreemption(t *testing.T) { 
enablePreFilter: true, existingPods: []*v1.Pod{ initPausePod(&testutils.PausePodConfig{ - Name: "victim-pod", - Namespace: testCtx.NS.Name, - Priority: &lowPriority, + Name: "victim-pod", + Priority: &lowPriority, Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{ v1.ResourceCPU: *resource.NewMilliQuantity(200, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)}, @@ -266,9 +251,8 @@ func TestPreemption(t *testing.T) { }), }, pod: initPausePod(&testutils.PausePodConfig{ - Name: "preemptor-pod", - Namespace: testCtx.NS.Name, - Priority: &highPriority, + Name: "preemptor-pod", + Priority: &highPriority, Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{ v1.ResourceCPU: *resource.NewMilliQuantity(200, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)}, @@ -283,9 +267,8 @@ func TestPreemption(t *testing.T) { unresolvable: true, existingPods: []*v1.Pod{ initPausePod(&testutils.PausePodConfig{ - Name: "victim-pod", - Namespace: testCtx.NS.Name, - Priority: &lowPriority, + Name: "victim-pod", + Priority: &lowPriority, Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{ v1.ResourceCPU: *resource.NewMilliQuantity(200, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)}, @@ -293,9 +276,8 @@ func TestPreemption(t *testing.T) { }), }, pod: initPausePod(&testutils.PausePodConfig{ - Name: "preemptor-pod", - Namespace: testCtx.NS.Name, - Priority: &highPriority, + Name: "preemptor-pod", + Priority: &highPriority, Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{ v1.ResourceCPU: *resource.NewMilliQuantity(200, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)}, @@ -308,13 +290,13 @@ func TestPreemption(t *testing.T) { initTokens: maxTokens, existingPods: []*v1.Pod{ initPausePod(&testutils.PausePodConfig{ - Name: "pod-0", Namespace: testCtx.NS.Name, + Name: "pod-0", Priority: &mediumPriority, Labels: map[string]string{"pod": "p0"}, Resources: defaultPodRes, }), initPausePod(&testutils.PausePodConfig{ - Name: "pod-1", Namespace: testCtx.NS.Name, + Name: "pod-1", Priority: &lowPriority, Labels: map[string]string{"pod": "p1"}, Resources: defaultPodRes, @@ -341,7 +323,6 @@ func TestPreemption(t *testing.T) { // A higher priority pod with anti-affinity. pod: initPausePod(&testutils.PausePodConfig{ Name: "preemptor-pod", - Namespace: testCtx.NS.Name, Priority: &highPriority, Labels: map[string]string{"pod": "preemptor"}, Resources: defaultPodRes, @@ -372,13 +353,13 @@ func TestPreemption(t *testing.T) { initTokens: maxTokens, existingPods: []*v1.Pod{ initPausePod(&testutils.PausePodConfig{ - Name: "pod-0", Namespace: testCtx.NS.Name, + Name: "pod-0", Priority: &mediumPriority, Labels: map[string]string{"pod": "p0"}, Resources: defaultPodRes, }), initPausePod(&testutils.PausePodConfig{ - Name: "pod-1", Namespace: testCtx.NS.Name, + Name: "pod-1", Priority: &highPriority, Labels: map[string]string{"pod": "p1"}, Resources: defaultPodRes, @@ -405,7 +386,6 @@ func TestPreemption(t *testing.T) { // A higher priority pod with anti-affinity. 
pod: initPausePod(&testutils.PausePodConfig{ Name: "preemptor-pod", - Namespace: testCtx.NS.Name, Priority: &highPriority, Labels: map[string]string{"pod": "preemptor"}, Resources: defaultPodRes, @@ -439,61 +419,77 @@ func TestPreemption(t *testing.T) { v1.ResourceMemory: "500", } nodeObject := st.MakeNode().Name("node1").Capacity(nodeRes).Label("node", "node1").Obj() - if _, err := createNode(testCtx.ClientSet, nodeObject); err != nil { - t.Fatalf("Error creating node: %v", err) - } for _, asyncPreemptionEnabled := range []bool{true, false} { - for _, test := range tests { - t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", test.name, asyncPreemptionEnabled), func(t *testing.T) { - featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, asyncPreemptionEnabled) + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, test := range tests { + t.Run(fmt.Sprintf("%s (Async preemption enabled: %v, Async API calls enabled: %v)", test.name, asyncPreemptionEnabled, asyncAPICallsEnabled), func(t *testing.T) { + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, asyncPreemptionEnabled) + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncAPICalls, asyncAPICallsEnabled) - filter.Tokens = test.initTokens - filter.EnablePreFilter = test.enablePreFilter - filter.Unresolvable = test.unresolvable - pods := make([]*v1.Pod, len(test.existingPods)) - // Create and run existingPods. - for i, p := range test.existingPods { - pods[i], err = runPausePod(cs, p) - if err != nil { - t.Fatalf("Error running pause pod: %v", err) + testCtx := testutils.InitTestSchedulerWithOptions(t, + testutils.InitTestAPIServer(t, "preemption", nil), + 0, + scheduler.WithProfiles(cfg.Profiles...), + scheduler.WithFrameworkOutOfTreeRegistry(registry)) + testutils.SyncSchedulerInformerFactory(testCtx) + go testCtx.Scheduler.Run(testCtx.Ctx) + + if _, err := createNode(testCtx.ClientSet, nodeObject); err != nil { + t.Fatalf("Error creating node: %v", err) } - } - // Create the "pod". - preemptor, err := createPausePod(cs, test.pod) - if err != nil { - t.Errorf("Error while creating high priority pod: %v", err) - } - // Wait for preemption of pods and make sure the other ones are not preempted. - for i, p := range pods { - if _, found := test.preemptedPodIndexes[i]; found { - if err = wait.PollUntilContextTimeout(testCtx.Ctx, time.Second, wait.ForeverTestTimeout, false, - podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil { - t.Errorf("Pod %v/%v is not getting evicted.", p.Namespace, p.Name) - } - pod, err := cs.CoreV1().Pods(p.Namespace).Get(testCtx.Ctx, p.Name, metav1.GetOptions{}) + + cs := testCtx.ClientSet + + filter.Tokens = test.initTokens + filter.EnablePreFilter = test.enablePreFilter + filter.Unresolvable = test.unresolvable + pods := make([]*v1.Pod, len(test.existingPods)) + // Create and run existingPods. + for i, p := range test.existingPods { + p.Namespace = testCtx.NS.Name + pods[i], err = runPausePod(cs, p) if err != nil { - t.Errorf("Error %v when getting the updated status for pod %v/%v ", err, p.Namespace, p.Name) + t.Fatalf("Error running pause pod: %v", err) } - _, cond := podutil.GetPodCondition(&pod.Status, v1.DisruptionTarget) - if cond == nil { - t.Errorf("Pod %q does not have the expected condition: %q", klog.KObj(pod), v1.DisruptionTarget) + } + // Create the "pod". 
+ test.pod.Namespace = testCtx.NS.Name + preemptor, err := createPausePod(cs, test.pod) + if err != nil { + t.Errorf("Error while creating high priority pod: %v", err) + } + // Wait for preemption of pods and make sure the other ones are not preempted. + for i, p := range pods { + if _, found := test.preemptedPodIndexes[i]; found { + if err = wait.PollUntilContextTimeout(testCtx.Ctx, time.Second, wait.ForeverTestTimeout, false, + podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil { + t.Errorf("Pod %v/%v is not getting evicted.", p.Namespace, p.Name) + } + pod, err := cs.CoreV1().Pods(p.Namespace).Get(testCtx.Ctx, p.Name, metav1.GetOptions{}) + if err != nil { + t.Errorf("Error %v when getting the updated status for pod %v/%v ", err, p.Namespace, p.Name) + } + _, cond := podutil.GetPodCondition(&pod.Status, v1.DisruptionTarget) + if cond == nil { + t.Errorf("Pod %q does not have the expected condition: %q", klog.KObj(pod), v1.DisruptionTarget) + } + } else if p.DeletionTimestamp != nil { + t.Errorf("Didn't expect pod %v to get preempted.", p.Name) } - } else if p.DeletionTimestamp != nil { - t.Errorf("Didn't expect pod %v to get preempted.", p.Name) } - } - // Also check that the preemptor pod gets the NominatedNodeName field set. - if len(test.preemptedPodIndexes) > 0 { - if err := testutils.WaitForNominatedNodeName(testCtx.Ctx, cs, preemptor); err != nil { - t.Errorf("NominatedNodeName field was not set for pod %v: %v", preemptor.Name, err) + // Also check that the preemptor pod gets the NominatedNodeName field set. + if len(test.preemptedPodIndexes) > 0 { + if err := testutils.WaitForNominatedNodeName(testCtx.Ctx, cs, preemptor); err != nil { + t.Errorf("NominatedNodeName field was not set for pod %v: %v", preemptor.Name, err) + } } - } - // Cleanup - pods = append(pods, preemptor) - testutils.CleanupPods(testCtx.Ctx, cs, t, pods) - }) + // Cleanup + pods = append(pods, preemptor) + testutils.CleanupPods(testCtx.Ctx, cs, t, pods) + }) + } } } } @@ -809,177 +805,185 @@ func TestAsyncPreemption(t *testing.T) { // All test cases have the same node. node := st.MakeNode().Name("node").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj() - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // We need to use a custom preemption plugin to test async preemption behavior - delayedPreemptionPluginName := "delay-preemption" - // keyed by the pod name - preemptionDoneChannels := make(map[string]chan struct{}) - defer func() { - for _, ch := range preemptionDoneChannels { - close(ch) - } - }() - registry := make(frameworkruntime.Registry) - var preemptionPlugin *defaultpreemption.DefaultPreemption - err := registry.Register(delayedPreemptionPluginName, func(c context.Context, r runtime.Object, fh framework.Handle) (framework.Plugin, error) { - p, err := frameworkruntime.FactoryAdapter(plfeature.Features{EnableAsyncPreemption: true}, defaultpreemption.New)(c, &config.DefaultPreemptionArgs{ - // Set default values to pass the validation at the initialization, not related to the test. 
- MinCandidateNodesPercentage: 10, - MinCandidateNodesAbsolute: 100, - }, fh) - if err != nil { - return nil, fmt.Errorf("error creating default preemption plugin: %w", err) - } + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, test := range tests { + t.Run(fmt.Sprintf("%s (Async API calls enabled: %v)", test.name, asyncAPICallsEnabled), func(t *testing.T) { + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncAPICalls, asyncAPICallsEnabled) - var ok bool - preemptionPlugin, ok = p.(*defaultpreemption.DefaultPreemption) - if !ok { - return nil, fmt.Errorf("unexpected plugin type %T", p) - } - - preemptPodFn := preemptionPlugin.Evaluator.PreemptPod - preemptionPlugin.Evaluator.PreemptPod = func(ctx context.Context, c preemption.Candidate, preemptor, victim *v1.Pod, pluginName string) error { - // block the preemption goroutine to complete until the test case allows it to proceed. - if ch, ok := preemptionDoneChannels[preemptor.Name]; ok { - <-ch + // We need to use a custom preemption plugin to test async preemption behavior + delayedPreemptionPluginName := "delay-preemption" + // keyed by the pod name + preemptionDoneChannels := make(map[string]chan struct{}) + defer func() { + for _, ch := range preemptionDoneChannels { + close(ch) + } + }() + registry := make(frameworkruntime.Registry) + var preemptionPlugin *defaultpreemption.DefaultPreemption + err := registry.Register(delayedPreemptionPluginName, func(c context.Context, r runtime.Object, fh framework.Handle) (framework.Plugin, error) { + p, err := frameworkruntime.FactoryAdapter(plfeature.Features{EnableAsyncPreemption: true}, defaultpreemption.New)(c, &config.DefaultPreemptionArgs{ + // Set default values to pass the validation at the initialization, not related to the test. + MinCandidateNodesPercentage: 10, + MinCandidateNodesAbsolute: 100, + }, fh) + if err != nil { + return nil, fmt.Errorf("error creating default preemption plugin: %w", err) } - return preemptPodFn(ctx, c, preemptor, victim, pluginName) - } - return preemptionPlugin, nil - }) - if err != nil { - t.Fatalf("Error registering a filter: %v", err) - } - cfg := configtesting.V1ToInternalWithDefaults(t, configv1.KubeSchedulerConfiguration{ - Profiles: []configv1.KubeSchedulerProfile{{ - SchedulerName: ptr.To(v1.DefaultSchedulerName), - Plugins: &configv1.Plugins{ - MultiPoint: configv1.PluginSet{ - Enabled: []configv1.Plugin{ - {Name: delayedPreemptionPluginName}, - }, - Disabled: []configv1.Plugin{ - {Name: names.DefaultPreemption}, + var ok bool + preemptionPlugin, ok = p.(*defaultpreemption.DefaultPreemption) + if !ok { + return nil, fmt.Errorf("unexpected plugin type %T", p) + } + + preemptPodFn := preemptionPlugin.Evaluator.PreemptPod + preemptionPlugin.Evaluator.PreemptPod = func(ctx context.Context, c preemption.Candidate, preemptor, victim *v1.Pod, pluginName string) error { + // block the preemption goroutine to complete until the test case allows it to proceed. 
+ if ch, ok := preemptionDoneChannels[preemptor.Name]; ok { + <-ch + } + return preemptPodFn(ctx, c, preemptor, victim, pluginName) + } + + return preemptionPlugin, nil + }) + if err != nil { + t.Fatalf("Error registering a filter: %v", err) + } + cfg := configtesting.V1ToInternalWithDefaults(t, configv1.KubeSchedulerConfiguration{ + Profiles: []configv1.KubeSchedulerProfile{{ + SchedulerName: ptr.To(v1.DefaultSchedulerName), + Plugins: &configv1.Plugins{ + MultiPoint: configv1.PluginSet{ + Enabled: []configv1.Plugin{ + {Name: delayedPreemptionPluginName}, + }, + Disabled: []configv1.Plugin{ + {Name: names.DefaultPreemption}, + }, }, }, - }, - }}, + }}, + }) + + // It initializes the scheduler, but doesn't start. + // We manually trigger the scheduling cycle. + testCtx := testutils.InitTestSchedulerWithOptions(t, + testutils.InitTestAPIServer(t, "preemption", nil), + 0, + scheduler.WithProfiles(cfg.Profiles...), + scheduler.WithFrameworkOutOfTreeRegistry(registry), + // disable backoff + scheduler.WithPodMaxBackoffSeconds(0), + scheduler.WithPodInitialBackoffSeconds(0), + ) + testutils.SyncSchedulerInformerFactory(testCtx) + cs := testCtx.ClientSet + + if preemptionPlugin == nil { + t.Fatalf("the preemption plugin should be initialized") + } + + logger, _ := ktesting.NewTestContext(t) + if testCtx.Scheduler.APIDispatcher != nil { + testCtx.Scheduler.APIDispatcher.Run(logger) + defer testCtx.Scheduler.APIDispatcher.Close() + } + testCtx.Scheduler.SchedulingQueue.Run(logger) + defer testCtx.Scheduler.SchedulingQueue.Close() + + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, true) + + createdPods := []*v1.Pod{} + defer testutils.CleanupPods(testCtx.Ctx, cs, t, createdPods) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + if _, err := cs.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}); err != nil { + t.Fatalf("Failed to create an initial Node %q: %v", node.Name, err) + } + defer func() { + if err := cs.CoreV1().Nodes().Delete(ctx, node.Name, metav1.DeleteOptions{}); err != nil { + t.Fatalf("Failed to delete the Node %q: %v", node.Name, err) + } + }() + + for _, scenario := range test.scenarios { + t.Logf("Running scenario: %s", scenario.name) + switch { + case scenario.createPod != nil: + if scenario.createPod.count == nil { + scenario.createPod.count = ptr.To(1) + } + + for i := 0; i < *scenario.createPod.count; i++ { + pod, err := cs.CoreV1().Pods(testCtx.NS.Name).Create(ctx, scenario.createPod.pod, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create a Pod %q: %v", pod.Name, err) + } + createdPods = append(createdPods, pod) + } + case scenario.schedulePod != nil: + lastFailure := "" + if err := wait.PollUntilContextTimeout(testCtx.Ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { + if len(testCtx.Scheduler.SchedulingQueue.PodsInActiveQ()) == 0 { + lastFailure = fmt.Sprintf("Expected the pod %s to be scheduled, but no pod arrives at the activeQ", scenario.schedulePod.podName) + return false, nil + } + + if testCtx.Scheduler.SchedulingQueue.PodsInActiveQ()[0].Name != scenario.schedulePod.podName { + // need to wait more because maybe the queue will get another Pod that higher priority than the current top pod. 
+ lastFailure = fmt.Sprintf("The pod %s is expected to be scheduled, but the top Pod is %s", scenario.schedulePod.podName, testCtx.Scheduler.SchedulingQueue.PodsInActiveQ()[0].Name) + return false, nil + } + + return true, nil + }); err != nil { + t.Fatal(lastFailure) + } + + preemptionDoneChannels[scenario.schedulePod.podName] = make(chan struct{}) + testCtx.Scheduler.ScheduleOne(testCtx.Ctx) + if scenario.schedulePod.expectSuccess { + if err := wait.PollUntilContextTimeout(testCtx.Ctx, 200*time.Millisecond, wait.ForeverTestTimeout, false, testutils.PodScheduled(cs, testCtx.NS.Name, scenario.schedulePod.podName)); err != nil { + t.Fatalf("Expected the pod %s to be scheduled", scenario.schedulePod.podName) + } + } else { + if !podInUnschedulablePodPool(t, testCtx.Scheduler.SchedulingQueue, scenario.schedulePod.podName) { + t.Fatalf("Expected the pod %s to be in the queue after the scheduling attempt", scenario.schedulePod.podName) + } + } + case scenario.completePreemption != "": + if _, ok := preemptionDoneChannels[scenario.completePreemption]; !ok { + t.Fatalf("The preemptor Pod %q is not running preemption", scenario.completePreemption) + } + + close(preemptionDoneChannels[scenario.completePreemption]) + delete(preemptionDoneChannels, scenario.completePreemption) + case scenario.podGatedInQueue != "": + // make sure the Pod is in the queue in the first place. + if !podInUnschedulablePodPool(t, testCtx.Scheduler.SchedulingQueue, scenario.podGatedInQueue) { + t.Fatalf("Expected the pod %s to be in the queue", scenario.podGatedInQueue) + } + + // Make sure this Pod is gated by the preemption at PreEnqueue extension point + // by activating the Pod and see if it's still in the unsched pod pool. + testCtx.Scheduler.SchedulingQueue.Activate(logger, map[string]*v1.Pod{scenario.podGatedInQueue: st.MakePod().Namespace(testCtx.NS.Name).Name(scenario.podGatedInQueue).Obj()}) + if !podInUnschedulablePodPool(t, testCtx.Scheduler.SchedulingQueue, scenario.podGatedInQueue) { + t.Fatalf("Expected the pod %s to be in the queue even after the activation", scenario.podGatedInQueue) + } + case scenario.podRunningPreemption != nil: + if err := wait.PollUntilContextTimeout(testCtx.Ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { + return preemptionPlugin.Evaluator.IsPodRunningPreemption(createdPods[*scenario.podRunningPreemption].GetUID()), nil + }); err != nil { + t.Fatalf("Expected the pod %s to be running preemption", createdPods[*scenario.podRunningPreemption].Name) + } + } + } }) - - // It initializes the scheduler, but doesn't start. - // We manually trigger the scheduling cycle. 
- testCtx := testutils.InitTestSchedulerWithOptions(t, - testutils.InitTestAPIServer(t, "preemption", nil), - 0, - scheduler.WithProfiles(cfg.Profiles...), - scheduler.WithFrameworkOutOfTreeRegistry(registry), - // disable backoff - scheduler.WithPodMaxBackoffSeconds(0), - scheduler.WithPodInitialBackoffSeconds(0), - ) - testutils.SyncSchedulerInformerFactory(testCtx) - cs := testCtx.ClientSet - - if preemptionPlugin == nil { - t.Fatalf("the preemption plugin should be initialized") - } - - logger, _ := ktesting.NewTestContext(t) - testCtx.Scheduler.SchedulingQueue.Run(logger) - defer testCtx.Scheduler.SchedulingQueue.Close() - - featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, true) - - createdPods := []*v1.Pod{} - defer testutils.CleanupPods(testCtx.Ctx, cs, t, createdPods) - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - if _, err := cs.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}); err != nil { - t.Fatalf("Failed to create an initial Node %q: %v", node.Name, err) - } - defer func() { - if err := cs.CoreV1().Nodes().Delete(ctx, node.Name, metav1.DeleteOptions{}); err != nil { - t.Fatalf("Failed to delete the Node %q: %v", node.Name, err) - } - }() - - for _, scenario := range test.scenarios { - t.Logf("Running scenario: %s", scenario.name) - switch { - case scenario.createPod != nil: - if scenario.createPod.count == nil { - scenario.createPod.count = ptr.To(1) - } - - for i := 0; i < *scenario.createPod.count; i++ { - pod, err := cs.CoreV1().Pods(testCtx.NS.Name).Create(ctx, scenario.createPod.pod, metav1.CreateOptions{}) - if err != nil { - t.Fatalf("Failed to create a Pod %q: %v", pod.Name, err) - } - createdPods = append(createdPods, pod) - } - case scenario.schedulePod != nil: - lastFailure := "" - if err := wait.PollUntilContextTimeout(testCtx.Ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { - if len(testCtx.Scheduler.SchedulingQueue.PodsInActiveQ()) == 0 { - lastFailure = fmt.Sprintf("Expected the pod %s to be scheduled, but no pod arrives at the activeQ", scenario.schedulePod.podName) - return false, nil - } - - if testCtx.Scheduler.SchedulingQueue.PodsInActiveQ()[0].Name != scenario.schedulePod.podName { - // need to wait more because maybe the queue will get another Pod that higher priority than the current top pod. 
- lastFailure = fmt.Sprintf("The pod %s is expected to be scheduled, but the top Pod is %s", scenario.schedulePod.podName, testCtx.Scheduler.SchedulingQueue.PodsInActiveQ()[0].Name) - return false, nil - } - - return true, nil - }); err != nil { - t.Fatal(lastFailure) - } - - preemptionDoneChannels[scenario.schedulePod.podName] = make(chan struct{}) - testCtx.Scheduler.ScheduleOne(testCtx.Ctx) - if scenario.schedulePod.expectSuccess { - if err := wait.PollUntilContextTimeout(testCtx.Ctx, 200*time.Millisecond, wait.ForeverTestTimeout, false, testutils.PodScheduled(cs, testCtx.NS.Name, scenario.schedulePod.podName)); err != nil { - t.Fatalf("Expected the pod %s to be scheduled", scenario.schedulePod.podName) - } - } else { - if !podInUnschedulablePodPool(t, testCtx.Scheduler.SchedulingQueue, scenario.schedulePod.podName) { - t.Fatalf("Expected the pod %s to be in the queue after the scheduling attempt", scenario.schedulePod.podName) - } - } - case scenario.completePreemption != "": - if _, ok := preemptionDoneChannels[scenario.completePreemption]; !ok { - t.Fatalf("The preemptor Pod %q is not running preemption", scenario.completePreemption) - } - - close(preemptionDoneChannels[scenario.completePreemption]) - delete(preemptionDoneChannels, scenario.completePreemption) - case scenario.podGatedInQueue != "": - // make sure the Pod is in the queue in the first place. - if !podInUnschedulablePodPool(t, testCtx.Scheduler.SchedulingQueue, scenario.podGatedInQueue) { - t.Fatalf("Expected the pod %s to be in the queue", scenario.podGatedInQueue) - } - - // Make sure this Pod is gated by the preemption at PreEnqueue extension point - // by activating the Pod and see if it's still in the unsched pod pool. - testCtx.Scheduler.SchedulingQueue.Activate(logger, map[string]*v1.Pod{scenario.podGatedInQueue: st.MakePod().Namespace(testCtx.NS.Name).Name(scenario.podGatedInQueue).Obj()}) - if !podInUnschedulablePodPool(t, testCtx.Scheduler.SchedulingQueue, scenario.podGatedInQueue) { - t.Fatalf("Expected the pod %s to be in the queue even after the activation", scenario.podGatedInQueue) - } - case scenario.podRunningPreemption != nil: - if err := wait.PollUntilContextTimeout(testCtx.Ctx, time.Millisecond*200, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { - return preemptionPlugin.Evaluator.IsPodRunningPreemption(createdPods[*scenario.podRunningPreemption].GetUID()), nil - }); err != nil { - t.Fatalf("Expected the pod %s to be running preemption", createdPods[*scenario.podRunningPreemption].Name) - } - } - } - }) + } } } @@ -1680,83 +1684,86 @@ func TestNominatedNodeCleanUp(t *testing.T) { } for _, asyncPreemptionEnabled := range []bool{true, false} { - for _, tt := range tests { - t.Run(fmt.Sprintf("%s (Async preemption enabled: %v)", tt.name, asyncPreemptionEnabled), func(t *testing.T) { - featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, asyncPreemptionEnabled) + for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, tt := range tests { + t.Run(fmt.Sprintf("%s (Async preemption enabled: %v, Async API calls enabled: %v)", tt.name, asyncPreemptionEnabled, asyncAPICallsEnabled), func(t *testing.T) { + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, asyncPreemptionEnabled) + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncAPICalls, asyncAPICallsEnabled) - cfg := 
configtesting.V1ToInternalWithDefaults(t, configv1.KubeSchedulerConfiguration{ - Profiles: []configv1.KubeSchedulerProfile{{ - SchedulerName: ptr.To(v1.DefaultSchedulerName), - Plugins: tt.customPlugins, - }}, - }) - testCtx := initTest( - t, - "preemption", - scheduler.WithProfiles(cfg.Profiles...), - scheduler.WithFrameworkOutOfTreeRegistry(tt.outOfTreeRegistry), - ) + cfg := configtesting.V1ToInternalWithDefaults(t, configv1.KubeSchedulerConfiguration{ + Profiles: []configv1.KubeSchedulerProfile{{ + SchedulerName: ptr.To(v1.DefaultSchedulerName), + Plugins: tt.customPlugins, + }}, + }) + testCtx := initTest( + t, + "preemption", + scheduler.WithProfiles(cfg.Profiles...), + scheduler.WithFrameworkOutOfTreeRegistry(tt.outOfTreeRegistry), + ) - cs, ns := testCtx.ClientSet, testCtx.NS.Name - for _, node := range tt.initNodes { - if _, err := createNode(cs, node); err != nil { - t.Fatalf("Error creating initial node %v: %v", node.Name, err) - } - } - - // Create a node with the specified capacity. - nodeName := "fake-node" - if _, err := createNode(cs, st.MakeNode().Name(nodeName).Capacity(tt.nodeCapacity).Obj()); err != nil { - t.Fatalf("Error creating node %v: %v", nodeName, err) - } - - // Create pods and run post check if necessary. - for i, pods := range tt.podsToCreate { - for _, p := range pods { - p.Namespace = ns - if _, err := createPausePod(cs, p); err != nil { - t.Fatalf("Error creating pod %v: %v", p.Name, err) + cs, ns := testCtx.ClientSet, testCtx.NS.Name + for _, node := range tt.initNodes { + if _, err := createNode(cs, node); err != nil { + t.Fatalf("Error creating initial node %v: %v", node.Name, err) } } - // If necessary, run the post check function. - if len(tt.postChecks) > i && tt.postChecks[i] != nil { + + // Create a node with the specified capacity. + nodeName := "fake-node" + if _, err := createNode(cs, st.MakeNode().Name(nodeName).Capacity(tt.nodeCapacity).Obj()); err != nil { + t.Fatalf("Error creating node %v: %v", nodeName, err) + } + + // Create pods and run post check if necessary. + for i, pods := range tt.podsToCreate { for _, p := range pods { - if err := tt.postChecks[i](testCtx.Ctx, cs, p); err != nil { - t.Fatalf("Pod %v didn't pass the postChecks[%v]: %v", p.Name, i, err) + p.Namespace = ns + if _, err := createPausePod(cs, p); err != nil { + t.Fatalf("Error creating pod %v: %v", p.Name, err) + } + } + // If necessary, run the post check function. + if len(tt.postChecks) > i && tt.postChecks[i] != nil { + for _, p := range pods { + if err := tt.postChecks[i](testCtx.Ctx, cs, p); err != nil { + t.Fatalf("Pod %v didn't pass the postChecks[%v]: %v", p.Name, i, err) + } } } } - } - // Delete the fake node if necessary. - if tt.deleteFakeNode { - if err := cs.CoreV1().Nodes().Delete(testCtx.Ctx, nodeName, *metav1.NewDeleteOptions(0)); err != nil { - t.Fatalf("Node %v cannot be deleted: %v", nodeName, err) + // Delete the fake node if necessary. + if tt.deleteFakeNode { + if err := cs.CoreV1().Nodes().Delete(testCtx.Ctx, nodeName, *metav1.NewDeleteOptions(0)); err != nil { + t.Fatalf("Node %v cannot be deleted: %v", nodeName, err) + } } - } - // Force deleting the terminating pods if necessary. - // This is required if we demand to delete terminating Pods physically. - for _, podName := range tt.podNamesToDelete { - if err := deletePod(cs, podName, ns); err != nil { - t.Fatalf("Pod %v cannot be deleted: %v", podName, err) + // Force deleting the terminating pods if necessary. + // This is required if we demand to delete terminating Pods physically. 
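TestNominatedNodeCleanUp (rewritten in this hunk, with the loop body continuing below) now runs every case under all four combinations of the SchedulerAsyncPreemption and SchedulerAsyncAPICalls feature gates. Reduced to a standalone sketch with a placeholder subtest body, the pattern is:

package example

import (
	"fmt"
	"testing"

	utilfeature "k8s.io/apiserver/pkg/util/feature"
	featuregatetesting "k8s.io/component-base/featuregate/testing"
	"k8s.io/kubernetes/pkg/features"
)

func TestFeatureGateMatrix(t *testing.T) {
	for _, asyncPreemptionEnabled := range []bool{true, false} {
		for _, asyncAPICallsEnabled := range []bool{true, false} {
			name := fmt.Sprintf("Async preemption enabled: %v, Async API calls enabled: %v", asyncPreemptionEnabled, asyncAPICallsEnabled)
			t.Run(name, func(t *testing.T) {
				// Each gate is restored to its previous value when the subtest ends.
				featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, asyncPreemptionEnabled)
				featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncAPICalls, asyncAPICallsEnabled)

				// The scenario itself (node/pod creation, assertions) runs here,
				// exactly as in the test body in the hunk above.
			})
		}
	}
}

Encoding the gates as nested loops rather than separate test functions keeps one copy of the scenario and makes the gate combination visible in each subtest name.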
+ for _, podName := range tt.podNamesToDelete { + if err := deletePod(cs, podName, ns); err != nil { + t.Fatalf("Pod %v cannot be deleted: %v", podName, err) + } } - } - // Verify if .status.nominatedNodeName is cleared. - if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { - pod, err := cs.CoreV1().Pods(ns).Get(ctx, "medium", metav1.GetOptions{}) - if err != nil { - t.Errorf("Error getting the medium pod: %v", err) + // Verify if .status.nominatedNodeName is cleared. + if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { + pod, err := cs.CoreV1().Pods(ns).Get(ctx, "medium", metav1.GetOptions{}) + if err != nil { + t.Errorf("Error getting the medium pod: %v", err) + } + if len(pod.Status.NominatedNodeName) == 0 { + return true, nil + } + return false, err + }); err != nil { + t.Errorf(".status.nominatedNodeName of the medium pod was not cleared: %v", err) } - if len(pod.Status.NominatedNodeName) == 0 { - return true, nil - } - return false, err - }); err != nil { - t.Errorf(".status.nominatedNodeName of the medium pod was not cleared: %v", err) - } - }) + }) + } } } } diff --git a/test/integration/scheduler/queueing/queue_test.go b/test/integration/scheduler/queueing/queue_test.go index 0cee65db888..4c47fd7260b 100644 --- a/test/integration/scheduler/queueing/queue_test.go +++ b/test/integration/scheduler/queueing/queue_test.go @@ -49,6 +49,7 @@ import ( testfwk "k8s.io/kubernetes/test/integration/framework" testutils "k8s.io/kubernetes/test/integration/util" imageutils "k8s.io/kubernetes/test/utils/image" + "k8s.io/kubernetes/test/utils/ktesting" "k8s.io/utils/ptr" ) @@ -112,6 +113,12 @@ func TestSchedulingGates(t *testing.T) { ) testutils.SyncSchedulerInformerFactory(testCtx) + if testCtx.Scheduler.APIDispatcher != nil { + logger, _ := ktesting.NewTestContext(t) + testCtx.Scheduler.APIDispatcher.Run(logger) + defer testCtx.Scheduler.APIDispatcher.Close() + } + cs, ns, ctx := testCtx.ClientSet, testCtx.NS.Name, testCtx.Ctx // Create node, so we can schedule pods. 
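In the TestSchedulingGates hunk above, the APIDispatcher is started only when it is present, i.e. only when SchedulerAsyncAPICalls is enabled; tests that drive scheduling manually have to run it themselves and close it on teardown. A minimal helper capturing that pattern could look like the following sketch; the helper name is illustrative and not part of this change, and the calls simply mirror the ones in the hunk:

package example

import (
	"testing"

	"k8s.io/kubernetes/pkg/scheduler"
	"k8s.io/kubernetes/test/utils/ktesting"
)

// startAPIDispatcherIfNeeded starts the scheduler's APIDispatcher when the
// SchedulerAsyncAPICalls feature gate is enabled (the dispatcher is nil otherwise)
// and registers a cleanup that closes it when the test finishes.
func startAPIDispatcherIfNeeded(t *testing.T, sched *scheduler.Scheduler) {
	if sched.APIDispatcher == nil {
		// Feature gate disabled: nothing to run.
		return
	}
	logger, _ := ktesting.NewTestContext(t)
	sched.APIDispatcher.Run(logger)
	t.Cleanup(func() { sched.APIDispatcher.Close() })
}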
diff --git a/test/integration/scheduler/rescheduling_test.go b/test/integration/scheduler/rescheduling_test.go index 765c2ddba21..10d89fdef24 100644 --- a/test/integration/scheduler/rescheduling_test.go +++ b/test/integration/scheduler/rescheduling_test.go @@ -18,13 +18,17 @@ package scheduler import ( "context" + "fmt" "testing" "time" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/wait" + utilfeature "k8s.io/apiserver/pkg/util/feature" + featuregatetesting "k8s.io/component-base/featuregate/testing" "k8s.io/klog/v2" fwk "k8s.io/kube-scheduler/framework" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/scheduler" "k8s.io/kubernetes/pkg/scheduler/framework" st "k8s.io/kubernetes/pkg/scheduler/testing" @@ -36,6 +40,11 @@ var _ framework.EnqueueExtensions = &PermitPlugin{} var _ framework.ReservePlugin = &ReservePlugin{} var _ framework.EnqueueExtensions = &ReservePlugin{} +type ResettablePlugin interface { + framework.Plugin + Reset() +} + type ReservePlugin struct { name string statusCode fwk.Code @@ -78,6 +87,11 @@ func (rp *ReservePlugin) EventsToRegister(_ context.Context) ([]fwk.ClusterEvent }, nil } +func (rp *ReservePlugin) Reset() { + rp.numReserveCalled = 0 + rp.numUnreserveCalled = 0 +} + type PermitPlugin struct { name string statusCode fwk.Code @@ -115,12 +129,16 @@ func (pp *PermitPlugin) EventsToRegister(_ context.Context) ([]fwk.ClusterEventW }, nil } +func (pp *PermitPlugin) Reset() { + pp.numPermitCalled = 0 +} + func TestReScheduling(t *testing.T) { testContext := testutils.InitTestAPIServer(t, "permit-plugin", nil) tests := []struct { - name string - plugins []framework.Plugin - action func() error + name string + plugin ResettablePlugin + action func() error // The first time for pod scheduling, we make pod scheduled error or unschedulable on purpose. // This is controlled by wantFirstSchedulingError. By default, pod is unschedulable. 
wantFirstSchedulingError bool @@ -130,10 +148,8 @@ func TestReScheduling(t *testing.T) { wantError bool }{ { - name: "Rescheduling pod rejected by Permit Plugin", - plugins: []framework.Plugin{ - &PermitPlugin{name: "permit", statusCode: fwk.Unschedulable}, - }, + name: "Rescheduling pod rejected by Permit Plugin", + plugin: &PermitPlugin{name: "permit", statusCode: fwk.Unschedulable}, action: func() error { _, err := testutils.CreateNode(testContext.ClientSet, st.MakeNode().Name("fake-node").Obj()) return err @@ -141,10 +157,8 @@ func TestReScheduling(t *testing.T) { wantScheduled: true, }, { - name: "Rescheduling pod rejected by Permit Plugin with unrelated event", - plugins: []framework.Plugin{ - &PermitPlugin{name: "permit", statusCode: fwk.Unschedulable}, - }, + name: "Rescheduling pod rejected by Permit Plugin with unrelated event", + plugin: &PermitPlugin{name: "permit", statusCode: fwk.Unschedulable}, action: func() error { _, err := testutils.CreatePausePod(testContext.ClientSet, testutils.InitPausePod(&testutils.PausePodConfig{Name: "test-pod-2", Namespace: testContext.NS.Name})) @@ -153,10 +167,8 @@ func TestReScheduling(t *testing.T) { wantScheduled: false, }, { - name: "Rescheduling pod failed by Permit Plugin", - plugins: []framework.Plugin{ - &PermitPlugin{name: "permit", statusCode: fwk.Error}, - }, + name: "Rescheduling pod failed by Permit Plugin", + plugin: &PermitPlugin{name: "permit", statusCode: fwk.Error}, action: func() error { _, err := testutils.CreateNode(testContext.ClientSet, st.MakeNode().Name("fake-node").Obj()) return err @@ -165,10 +177,8 @@ func TestReScheduling(t *testing.T) { wantError: true, }, { - name: "Rescheduling pod rejected by Reserve Plugin", - plugins: []framework.Plugin{ - &ReservePlugin{name: "reserve", statusCode: fwk.Unschedulable}, - }, + name: "Rescheduling pod rejected by Reserve Plugin", + plugin: &ReservePlugin{name: "reserve", statusCode: fwk.Unschedulable}, action: func() error { _, err := testutils.CreateNode(testContext.ClientSet, st.MakeNode().Name("fake-node").Obj()) return err @@ -176,10 +186,8 @@ func TestReScheduling(t *testing.T) { wantScheduled: true, }, { - name: "Rescheduling pod rejected by Reserve Plugin with unrelated event", - plugins: []framework.Plugin{ - &ReservePlugin{name: "reserve", statusCode: fwk.Unschedulable}, - }, + name: "Rescheduling pod rejected by Reserve Plugin with unrelated event", + plugin: &ReservePlugin{name: "reserve", statusCode: fwk.Unschedulable}, action: func() error { _, err := testutils.CreatePausePod(testContext.ClientSet, testutils.InitPausePod(&testutils.PausePodConfig{Name: "test-pod-2", Namespace: testContext.NS.Name})) @@ -188,10 +196,8 @@ func TestReScheduling(t *testing.T) { wantScheduled: false, }, { - name: "Rescheduling pod failed by Reserve Plugin", - plugins: []framework.Plugin{ - &ReservePlugin{name: "reserve", statusCode: fwk.Error}, - }, + name: "Rescheduling pod failed by Reserve Plugin", + plugin: &ReservePlugin{name: "reserve", statusCode: fwk.Error}, action: func() error { _, err := testutils.CreateNode(testContext.ClientSet, st.MakeNode().Name("fake-node").Obj()) return err @@ -201,54 +207,59 @@ func TestReScheduling(t *testing.T) { }, } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // Create a plugin registry for testing. Register only a permit plugin. - registry, prof := InitRegistryAndConfig(t, nil, test.plugins...) 
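The table entries above now carry a single ResettablePlugin instead of a slice of plugins, so the same stateful plugin instance can be reused by the feature-gate sub-runs introduced below and have its call counters cleared in between via Reset. A reduced sketch of that contract, with an illustrative CountingPlugin standing in for ReservePlugin/PermitPlugin:

package example

import (
	"testing"

	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// ResettablePlugin mirrors the interface added in rescheduling_test.go: an
// ordinary framework plugin whose recorded state can be cleared between runs.
type ResettablePlugin interface {
	framework.Plugin
	Reset()
}

// CountingPlugin is an illustrative stand-in for ReservePlugin/PermitPlugin.
type CountingPlugin struct {
	numCalled int
}

var _ ResettablePlugin = &CountingPlugin{}

func (p *CountingPlugin) Name() string { return "counting" }
func (p *CountingPlugin) Reset()       { p.numCalled = 0 }

// runWithCleanReset shows why the tests register plugin.Reset as a cleanup:
// the same plugin instance is shared by consecutive sub-runs, so its counters
// must not leak from one run into the next.
func runWithCleanReset(t *testing.T, plugin ResettablePlugin) {
	t.Cleanup(plugin.Reset)
	// ... build the registry from plugin and run one scheduling scenario ...
}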
+ for _, asyncAPICallsEnabled := range []bool{true, false} { + for _, test := range tests { + t.Run(fmt.Sprintf("%s (Async API calls enabled: %v)", test.name, asyncAPICallsEnabled), func(t *testing.T) { + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncAPICalls, asyncAPICallsEnabled) - testCtx, teardown := InitTestSchedulerForFrameworkTest(t, testContext, 2, true, - scheduler.WithProfiles(prof), - scheduler.WithFrameworkOutOfTreeRegistry(registry)) - defer teardown() + // Create a plugin registry for testing. Register only the plugin under test. + registry, prof := InitRegistryAndConfig(t, nil, test.plugin) + t.Cleanup(test.plugin.Reset) - pod, err := testutils.CreatePausePod(testCtx.ClientSet, - testutils.InitPausePod(&testutils.PausePodConfig{Name: "test-pod", Namespace: testCtx.NS.Name})) - if err != nil { - t.Errorf("Error while creating a test pod: %v", err) - } + testCtx, teardown := InitTestSchedulerForFrameworkTest(t, testContext, 2, true, + scheduler.WithProfiles(prof), + scheduler.WithFrameworkOutOfTreeRegistry(registry)) + defer teardown() - // The first time for scheduling, pod is error or unschedulable, controlled by wantFirstSchedulingError - if test.wantFirstSchedulingError { - if err = wait.PollUntilContextTimeout(testCtx.Ctx, 10*time.Millisecond, 30*time.Second, false, - testutils.PodSchedulingError(testCtx.ClientSet, pod.Namespace, pod.Name)); err != nil { - t.Errorf("Expected a scheduling error, but got: %v", err) + pod, err := testutils.CreatePausePod(testCtx.ClientSet, + testutils.InitPausePod(&testutils.PausePodConfig{Name: "test-pod", Namespace: testCtx.NS.Name})) + if err != nil { + t.Errorf("Error while creating a test pod: %v", err) } - } else { - if err = testutils.WaitForPodUnschedulable(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { - t.Errorf("Didn't expect the pod to be scheduled. error: %v", err) - } - } - if test.action() != nil { - if err = test.action(); err != nil { - t.Errorf("Perform action() error: %v", err) + // The first scheduling attempt is expected to either fail with an error or be unschedulable, controlled by wantFirstSchedulingError + if test.wantFirstSchedulingError { + if err = wait.PollUntilContextTimeout(testCtx.Ctx, 10*time.Millisecond, 30*time.Second, false, + testutils.PodSchedulingError(testCtx.ClientSet, pod.Namespace, pod.Name)); err != nil { + t.Errorf("Expected a scheduling error, but got: %v", err) + } + } else { + if err = testutils.WaitForPodUnschedulable(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { + t.Errorf("Didn't expect the pod to be scheduled. error: %v", err) + } } - } - if test.wantScheduled { - if err = testutils.WaitForPodToSchedule(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { - t.Errorf("Didn't expect the pod to be unschedulable. error: %v", err) + if test.action != nil { + if err = test.action(); err != nil { + t.Errorf("Perform action() error: %v", err) + } } - } else if test.wantError { - if err = wait.PollUntilContextTimeout(testCtx.Ctx, 10*time.Millisecond, 30*time.Second, false, - testutils.PodSchedulingError(testCtx.ClientSet, pod.Namespace, pod.Name)); err != nil { - t.Errorf("Expected a scheduling error, but got: %v", err) + + if test.wantScheduled { + if err = testutils.WaitForPodToSchedule(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { + t.Errorf("Didn't expect the pod to be unschedulable. 
error: %v", err) + } + } else if test.wantError { + if err = wait.PollUntilContextTimeout(testCtx.Ctx, 10*time.Millisecond, 30*time.Second, false, + testutils.PodSchedulingError(testCtx.ClientSet, pod.Namespace, pod.Name)); err != nil { + t.Errorf("Expected a scheduling error, but got: %v", err) + } + } else { + if err = testutils.WaitForPodUnschedulable(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { + t.Errorf("Didn't expect the pod to be scheduled. error: %v", err) + } } - } else { - if err = testutils.WaitForPodUnschedulable(testCtx.Ctx, testCtx.ClientSet, pod); err != nil { - t.Errorf("Didn't expect the pod to be scheduled. error: %v", err) - } - } - }) + }) + } } } diff --git a/test/integration/scheduler_perf/misc/performance-config.yaml b/test/integration/scheduler_perf/misc/performance-config.yaml index c988e3f235a..e1815725011 100644 --- a/test/integration/scheduler_perf/misc/performance-config.yaml +++ b/test/integration/scheduler_perf/misc/performance-config.yaml @@ -39,6 +39,16 @@ - name: 5Nodes_QueueingHintsEnabled featureGates: SchedulerQueueingHints: true + SchedulerAsyncAPICalls: false + labels: [integration-test, short] + params: + initNodes: 5 + initPods: 5 + measurePods: 10 + - name: 5Nodes_AsyncAPICallsEnabled + featureGates: + SchedulerQueueingHints: true + SchedulerAsyncAPICalls: true labels: [integration-test, short] params: initNodes: 5 @@ -77,6 +87,16 @@ - name: 5000Nodes_50000Pods_QueueingHintsEnabled featureGates: SchedulerQueueingHints: true + SchedulerAsyncAPICalls: false + labels: [performance] + params: + initNodes: 5000 + initPods: 5000 + measurePods: 50000 + - name: 5000Nodes_50000Pods_AsyncAPICallsEnabled + featureGates: + SchedulerQueueingHints: true + SchedulerAsyncAPICalls: true labels: [performance] params: initNodes: 5000 @@ -158,6 +178,16 @@ - name: 5Nodes_QueueingHintsEnabled featureGates: SchedulerQueueingHints: true + SchedulerAsyncAPICalls: false + labels: [integration-test, short] + params: + initNodes: 5 + initPods: 20 + measurePods: 5 + - name: 5Nodes_AsyncAPICallsEnabled + featureGates: + SchedulerQueueingHints: true + SchedulerAsyncAPICalls: true labels: [integration-test, short] params: initNodes: 5 @@ -174,6 +204,16 @@ - name: 1000Nodes_QueueingHintsEnabled featureGates: SchedulerQueueingHints: true + SchedulerAsyncAPICalls: false + labels: [performance, short] + params: + initNodes: 1000 + initPods: 4000 + measurePods: 1000 + - name: 1000Nodes_AsyncAPICallsEnabled + featureGates: + SchedulerQueueingHints: true + SchedulerAsyncAPICalls: true labels: [performance, short] params: initNodes: 1000 @@ -222,6 +262,16 @@ - name: 5Nodes_QueueingHintsEnabled featureGates: SchedulerQueueingHints: true + SchedulerAsyncAPICalls: false + labels: [integration-test, short] + params: + initNodes: 5 + initPods: 20 + measurePods: 5 + - name: 5Nodes_AsyncAPICallsEnabled + featureGates: + SchedulerQueueingHints: true + SchedulerAsyncAPICalls: true labels: [integration-test, short] params: initNodes: 5 @@ -248,6 +298,7 @@ labels: [performance] featureGates: SchedulerAsyncPreemption: true + SchedulerAsyncAPICalls: false params: initNodes: 5000 initPods: 20000 @@ -255,6 +306,17 @@ - name: 5000Nodes_QueueingHintsEnabled featureGates: SchedulerQueueingHints: true + SchedulerAsyncAPICalls: false + labels: [performance] + threshold: 160 + params: + initNodes: 5000 + initPods: 20000 + measurePods: 5000 + - name: 5000Nodes_AsyncAPICallsEnabled + featureGates: + SchedulerQueueingHints: true + SchedulerAsyncAPICalls: true labels: [performance] threshold: 160 params: