Merge pull request #35932 from jayunit100/sched_events_spam_reduce

Automatic merge from submit-queue. Reduce spam in Events from the scheduler by aggregating failure reasons into counts. Fixes #35842. Part of overall #35555.
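The aggregation idea can be sketched in isolation (a hypothetical, standalone example: aggregateReasons and the plain-string reason map below are made up for illustration and are not the scheduler's actual types). Failure reasons from every node are counted into a histogram and joined into one sorted summary line, instead of one line per node.

package main

import (
	"fmt"
	"sort"
	"strings"
)

// aggregateReasons collapses per-node failure reasons into a sorted
// "reason (count)" histogram, mirroring the counter-aggregation approach.
func aggregateReasons(failedPredicates map[string][]string) string {
	reasons := make(map[string]int)
	for _, nodeReasons := range failedPredicates {
		for _, reason := range nodeReasons {
			reasons[reason]++
		}
	}
	parts := make([]string, 0, len(reasons))
	for reason, count := range reasons {
		parts = append(parts, fmt.Sprintf("%v (%v)", reason, count))
	}
	sort.Strings(parts) // deterministic ordering, independent of map iteration
	return "fit failure summary on nodes : " + strings.Join(parts, ", ")
}

func main() {
	// 100 nodes that all fail for the same two (illustrative) reasons.
	failed := map[string][]string{}
	for i := 0; i < 100; i++ {
		failed[fmt.Sprintf("machine%v", i)] = []string{"Insufficient cpu", "Insufficient memory"}
	}
	// Prints one bounded summary line instead of 100 per-node lines:
	// fit failure summary on nodes : Insufficient cpu (100), Insufficient memory (100)
	fmt.Println(aggregateReasons(failed))
}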
@@ -49,14 +49,24 @@ var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods")
 func (f *FitError) Error() string {
 	var buf bytes.Buffer
 	buf.WriteString(fmt.Sprintf("pod (%s) failed to fit in any node\n", f.Pod.Name))
-	for node, predicates := range f.FailedPredicates {
-		reasons := make([]string, 0)
+	reasons := make(map[string]int)
+	for _, predicates := range f.FailedPredicates {
 		for _, pred := range predicates {
-			reasons = append(reasons, pred.GetReason())
+			reasons[pred.GetReason()] += 1
 		}
-		reasonMsg := fmt.Sprintf("fit failure on node (%s): %s\n", node, strings.Join(reasons, ", "))
-		buf.WriteString(reasonMsg)
 	}
+
+	sortReasonsHistogram := func() []string {
+		reasonStrings := []string{}
+		for k, v := range reasons {
+			reasonStrings = append(reasonStrings, fmt.Sprintf("%v (%v)", k, v))
+		}
+		sort.Strings(reasonStrings)
+		return reasonStrings
+	}
+
+	reasonMsg := fmt.Sprintf("fit failure summary on nodes : %v", strings.Join(sortReasonsHistogram(), ", "))
+	buf.WriteString(reasonMsg)
 	return buf.String()
 }
 
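The effect of the rewritten Error() is that the message no longer grows with cluster size: the old loop printed one "fit failure on node (...)" line per node, whereas the new code emits a single "fit failure summary on nodes : <reason> (<count>), ..." line whose length is bounded by the number of distinct failure reasons. The sort.Strings call keeps the summary ordering deterministic even though Go map iteration is randomized, so the string can be compared reliably in tests.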
@@ -18,6 +18,7 @@ package scheduler
 import (
 	"errors"
 	"fmt"
 	"reflect"
 	"testing"
 	"time"
@@ -331,49 +332,66 @@ func TestSchedulerFailedSchedulingReasons(t *testing.T) {
 	defer close(stop)
 	queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc)
 	scache := schedulercache.New(10*time.Minute, stop)
-	node := api.Node{
-		ObjectMeta: api.ObjectMeta{Name: "machine1"},
-		Status: api.NodeStatus{
-			Capacity: api.ResourceList{
-				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
-				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
-			},
-			Allocatable: api.ResourceList{
-				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
-				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
-			}},
+
+	// Design the baseline for the pods, and we will make nodes that dont fit it later.
+	var cpu = int64(4)
+	var mem = int64(500)
+	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	}, api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	})
+
+	// create several nodes which cannot schedule the above pod
+	nodes := []*api.Node{}
+	for i := 0; i < 100; i++ {
+		node := api.Node{
+			ObjectMeta: api.ObjectMeta{Name: fmt.Sprintf("machine%v", i)},
+			Status: api.NodeStatus{
+				Capacity: api.ResourceList{
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
+					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
+				},
+				Allocatable: api.ResourceList{
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
+					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
+				}},
+		}
+		scache.AddNode(&node)
+		nodes = append(nodes, &node)
 	}
-	scache.AddNode(&node)
-	nodeLister := algorithm.FakeNodeLister([]*api.Node{&node})
+	nodeLister := algorithm.FakeNodeLister(nodes)
 	predicateMap := map[string]algorithm.FitPredicate{
 		"PodFitsResources": predicates.PodFitsResources,
 	}
 
+	// Create expected failure reasons for all the nodes.  Hopefully they will get rolled up into a non-spammy summary.
+	failedPredicatesMap := FailedPredicateMap{}
+	for _, node := range nodes {
+		failedPredicatesMap[node.Name] = []algorithm.PredicateFailureReason{
+			predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
+			predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
+		}
+	}
 	scheduler, _, errChan := setupTestScheduler(queuedPodStore, scache, nodeLister, predicateMap)
 
-	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	}, api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	})
 	queuedPodStore.Add(podWithTooBigResourceRequests)
 	scheduler.scheduleOne()
 
 	select {
 	case err := <-errChan:
 		expectErr := &FitError{
-			Pod: podWithTooBigResourceRequests,
-			FailedPredicates: FailedPredicateMap{node.Name: []algorithm.PredicateFailureReason{
-				predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
-				predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
-			}},
+			Pod:              podWithTooBigResourceRequests,
+			FailedPredicates: failedPredicatesMap,
 		}
+		if len(fmt.Sprint(expectErr)) > 150 {
+			t.Errorf("message is too spammy ! %v ", len(fmt.Sprint(expectErr)))
+		}
 		if !reflect.DeepEqual(expectErr, err) {
-			t.Errorf("err want=%+v, get=%+v", expectErr, err)
+			t.Errorf("\n err \nWANT=%+v,\nGOT=%+v", expectErr, err)
 		}
 	case <-time.After(wait.ForeverTestTimeout):
 		t.Fatalf("timeout after %v", wait.ForeverTestTimeout)
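The rewritten test checks exactly that property: it registers 100 nodes that are too small for the pod, builds the expected FitError from a failedPredicatesMap covering all of them, and then asserts both that the scheduler's error matches via reflect.DeepEqual and that fmt.Sprint(expectErr) stays under 150 characters, a bound the old one-line-per-node message could not meet for 100 failing nodes.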