/*
Copyright 2014 Google Inc. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controller

import (
	"fmt"
	"sort"
	"sync"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api/errors"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api/validation"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/record"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
	"github.com/golang/glog"
)

// ReplicationManager is responsible for synchronizing ReplicationController objects stored
// in the system with actual running pods.
type ReplicationManager struct {
	kubeClient client.Interface
	podControl PodControlInterface
	syncTime   <-chan time.Time

	// To allow injection of syncReplicationController for testing.
	syncHandler func(controller api.ReplicationController) error
}

// PodControlInterface is an interface that knows how to add or delete pods.
// It is defined as an interface so that a fake implementation can be injected
// for testing.
type PodControlInterface interface {
	// createReplica creates new replicated pods according to the spec.
	createReplica(namespace string, controller api.ReplicationController)
	// deletePod deletes the pod identified by podID.
	deletePod(namespace string, podID string) error
}

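// A minimal fake implementation for tests might look like the following
// sketch (hypothetical; the type and field names are assumptions, not part of
// this file). The mutex matters because syncReplicationController calls these
// methods from multiple goroutines:
//
//	type FakePodControl struct {
//		lock           sync.Mutex
//		controllerSpec []api.ReplicationController
//		deletePodName  []string
//	}
//
//	func (f *FakePodControl) createReplica(namespace string, spec api.ReplicationController) {
//		f.lock.Lock()
//		defer f.lock.Unlock()
//		f.controllerSpec = append(f.controllerSpec, spec)
//	}
//
//	func (f *FakePodControl) deletePod(namespace string, podName string) error {
//		f.lock.Lock()
//		defer f.lock.Unlock()
//		f.deletePodName = append(f.deletePodName, podName)
//		return nil
//	}
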
// RealPodControl is the default implementation of PodControlInterface.
type RealPodControl struct {
	kubeClient client.Interface
	recorder   record.EventRecorder
}

// DefaultSyncPeriod is the period of the main replication controller sync loop.
const DefaultSyncPeriod = 5 * time.Second

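// createReplica clones the labels, annotations, and pod spec from the
// controller's pod template into a new pod and submits it to the apiserver.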
func (r RealPodControl) createReplica(namespace string, controller api.ReplicationController) {
	desiredLabels := make(labels.Set)
	for k, v := range controller.Spec.Template.Labels {
		desiredLabels[k] = v
	}
	desiredAnnotations := make(labels.Set)
	for k, v := range controller.Spec.Template.Annotations {
		desiredAnnotations[k] = v
	}

	// Use the dash (if the name isn't too long) to make the pod name a bit prettier.
	prefix := fmt.Sprintf("%s-", controller.Name)
	if ok, _ := validation.ValidatePodName(prefix, true); !ok {
		prefix = controller.Name
	}

	pod := &api.Pod{
		ObjectMeta: api.ObjectMeta{
			Labels:       desiredLabels,
			Annotations:  desiredAnnotations,
			GenerateName: prefix,
		},
	}
	if err := api.Scheme.Convert(&controller.Spec.Template.Spec, &pod.Spec); err != nil {
		util.HandleError(fmt.Errorf("unable to convert pod template: %v", err))
		return
	}
	if labels.Set(pod.Labels).AsSelector().Empty() {
		util.HandleError(fmt.Errorf("unable to create pod replica, no labels"))
		return
	}
	if _, err := r.kubeClient.Pods(namespace).Create(pod); err != nil {
		r.recorder.Eventf(&controller, "failedCreate", "Error creating: %v", err)
		util.HandleError(fmt.Errorf("unable to create pod replica: %v", err))
	}
}

func (r RealPodControl) deletePod(namespace, podID string) error {
	return r.kubeClient.Pods(namespace).Delete(podID)
}

// NewReplicationManager creates a new ReplicationManager.
func NewReplicationManager(kubeClient client.Interface) *ReplicationManager {
	eventBroadcaster := record.NewBroadcaster()
	eventBroadcaster.StartRecordingToSink(kubeClient.Events(""))

	rm := &ReplicationManager{
		kubeClient: kubeClient,
		podControl: RealPodControl{
			kubeClient: kubeClient,
			recorder:   eventBroadcaster.NewRecorder(api.EventSource{Component: "replication-controller"}),
		},
	}

	rm.syncHandler = rm.syncReplicationController
	return rm
}

// Run begins watching and syncing.
func (rm *ReplicationManager) Run(period time.Duration) {
	rm.syncTime = time.Tick(period)
	resourceVersion := ""
	go util.Forever(func() { rm.watchControllers(&resourceVersion) }, period)
}

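// A typical caller (a controller-manager binary, for example) would wire this
// up roughly as in the following sketch; the kubeClient construction is
// assumed and not shown here:
//
//	rm := controller.NewReplicationManager(kubeClient)
//	rm.Run(controller.DefaultSyncPeriod)
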
// watchControllers watches for changes to replication controllers.
// resourceVersion is a pointer to the resource version to use/update.
func (rm *ReplicationManager) watchControllers(resourceVersion *string) {
	watching, err := rm.kubeClient.ReplicationControllers(api.NamespaceAll).Watch(
		labels.Everything(),
		fields.Everything(),
		*resourceVersion,
	)
	if err != nil {
		util.HandleError(fmt.Errorf("unable to watch: %v", err))
		time.Sleep(5 * time.Second)
		return
	}

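	// Multiplex the periodic resync and the watch stream on this goroutine:
	// each tick of syncTime triggers a full relist via synchronize(), while
	// individual watch events are synced as they arrive.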
	for {
		select {
		case <-rm.syncTime:
			rm.synchronize()
		case event, open := <-watching.ResultChan():
			if !open {
				// The watch channel has been closed, or something else went
				// wrong with our watch call. Let the util.Forever() that
				// called us call us again.
				return
			}
			if event.Type == watch.Error {
				util.HandleError(fmt.Errorf("error from watch during sync: %v", errors.FromObject(event.Object)))
				// Clear the resource version; this may cause us to skip some
				// elements on the watch, but we'll catch them on the
				// synchronize() call, so it works out.
				*resourceVersion = ""
				continue
			}
			glog.V(4).Infof("Got watch: %#v", event)
			rc, ok := event.Object.(*api.ReplicationController)
			if !ok {
				if status, ok := event.Object.(*api.Status); ok {
					if status.Status == api.StatusFailure {
						glog.Errorf("Failed to watch: %v", status)
						// Clear the resource version, as above; this won't
						// hurt consistency, but we should consider
						// introspecting the failure more carefully here (or
						// make the apiserver smarter).
						*resourceVersion = ""
						continue
					}
				}
				util.HandleError(fmt.Errorf("unexpected object: %#v", event.Object))
				continue
			}
			// If we get disconnected, start where we left off.
			*resourceVersion = rc.ResourceVersion
			// Sync even if this is a deletion event, to ensure that we leave
			// it in the desired state.
			glog.V(4).Infof("About to sync from watch: %q", rc.Name)
			if err := rm.syncHandler(*rc); err != nil {
				util.HandleError(fmt.Errorf("unexpected sync error: %v", err))
			}
		}
	}
}

// filterActivePods returns pods that have not terminated.
func filterActivePods(pods []api.Pod) []*api.Pod {
	var result []*api.Pod
	for i := range pods {
		// Take the address of the slice element, not of a range loop
		// variable: the loop variable is reused on every iteration, so
		// taking its address would make every entry point at the last pod.
		value := &pods[i]
		if api.PodSucceeded != value.Status.Phase &&
			api.PodFailed != value.Status.Phase {
			result = append(result, value)
		}
	}
	return result
}

type activePods []*api.Pod

func (s activePods) Len() int      { return len(s) }
func (s activePods) Swap(i, j int) { s[i], s[j] = s[j], s[i] }

func (s activePods) Less(i, j int) bool {
	// Unassigned < assigned. Check both directions so that assignment is a
	// consistent primary sort key; otherwise the comparator would not be a
	// strict weak ordering (an assigned pending pod and an unassigned running
	// pod would each compare less than the other).
	if s[i].Spec.Host == "" && s[j].Spec.Host != "" {
		return true
	}
	if s[i].Spec.Host != "" && s[j].Spec.Host == "" {
		return false
	}
	// PodPending < PodUnknown < PodRunning.
	m := map[api.PodPhase]int{api.PodPending: 0, api.PodUnknown: 1, api.PodRunning: 2}
	if m[s[i].Status.Phase] != m[s[j].Status.Phase] {
		return m[s[i].Status.Phase] < m[s[j].Status.Phase]
	}
	// Not ready < ready.
	if !api.IsPodReady(s[i]) && api.IsPodReady(s[j]) {
		return true
	}
	return false
}

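// To illustrate the ordering: given an unassigned pending pod A, an assigned
// running pod B that is not yet ready, and an assigned running ready pod C,
// sorting an activePods slice yields A, B, C, so the deletion path in
// syncReplicationController removes pods in the earliest stages first.

// syncReplicationController lists the active pods matching the controller's
// selector, creates or deletes pods to converge on Spec.Replicas, and then
// records the observed count in Status.Replicas.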
func (rm *ReplicationManager) syncReplicationController(controller api.ReplicationController) error {
	s := labels.Set(controller.Spec.Selector).AsSelector()
	podList, err := rm.kubeClient.Pods(controller.Namespace).List(s, fields.Everything())
	if err != nil {
		return err
	}
	filteredList := filterActivePods(podList.Items)
	numActivePods := len(filteredList)
	diff := numActivePods - controller.Spec.Replicas
	if diff < 0 {
		diff *= -1
		wait := sync.WaitGroup{}
		wait.Add(diff)
		glog.V(2).Infof("Too few %q replicas, creating %d", controller.Name, diff)
		for i := 0; i < diff; i++ {
			go func() {
				defer wait.Done()
				rm.podControl.createReplica(controller.Namespace, controller)
			}()
		}
		wait.Wait()
	} else if diff > 0 {
		glog.V(2).Infof("Too many %q replicas, deleting %d", controller.Name, diff)
		// Sort the pods such that not-ready < ready, unscheduled < scheduled,
		// and pending < running. This ensures that we delete pods in the
		// earlier stages whenever possible.
		sort.Sort(activePods(filteredList))

		wait := sync.WaitGroup{}
		wait.Add(diff)
		for i := 0; i < diff; i++ {
			go func(ix int) {
				defer wait.Done()
				rm.podControl.deletePod(controller.Namespace, filteredList[ix].Name)
			}(i)
		}
		wait.Wait()
	}
	if controller.Status.Replicas != numActivePods {
		controller.Status.Replicas = numActivePods
		_, err = rm.kubeClient.ReplicationControllers(controller.Namespace).Update(&controller)
		if err != nil {
			return err
		}
	}
	return nil
}

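// synchronize lists every replication controller in the system and invokes
// the sync handler on each of them concurrently.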
func (rm *ReplicationManager) synchronize() {
	// TODO: remove this method completely and rely on the watch.
	// Add resource version tracking to watch to make this work.
	var controllers []api.ReplicationController
	list, err := rm.kubeClient.ReplicationControllers(api.NamespaceAll).List(labels.Everything())
	if err != nil {
		util.HandleError(fmt.Errorf("synchronization error: %v", err))
		return
	}
	controllers = list.Items
	wg := sync.WaitGroup{}
	wg.Add(len(controllers))
	for ix := range controllers {
		go func(ix int) {
			defer wg.Done()
			glog.V(4).Infof("periodic sync of %v", controllers[ix].Name)
			err := rm.syncHandler(controllers[ix])
			if err != nil {
				util.HandleError(fmt.Errorf("error synchronizing: %v", err))
			}
		}(ix)
	}
	wg.Wait()
}