Before, NodeName in the pod spec was not used. Hence, pods with a fixed, pre-set NodeName were never scheduled by the k8sm-scheduler, leading e.g. to a failing e2e intra-pod test. Fixes mesosphere/kubernetes-mesos#388
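The host-matching behavior lives in AcceptOffer below; as a pointer, the guard looks like this (copied from the code in this file):

	// if the user has specified a target host, make sure this offer is for that host
	if t.Pod.Spec.NodeName != "" && offer.GetHostname() != t.Pod.Spec.NodeName {
		return false
	}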
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package podtask

import (
	"fmt"
	"strings"
	"time"

	"code.google.com/p/go-uuid/uuid"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
	annotation "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics"
	mresource "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/resource"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/gogo/protobuf/proto"

	log "github.com/golang/glog"
	mesos "github.com/mesos/mesos-go/mesosproto"
	mutil "github.com/mesos/mesos-go/mesosutil"
)

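// StateType describes the lifecycle state of a pod task as tracked by the scheduler.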
type StateType int

const (
	StatePending StateType = iota
	StateRunning
	StateFinished
	StateUnknown
)

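// FlagType labels noteworthy events in the lifetime of a pod task (launched, bound, deleted).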
type FlagType string

const (
	Launched = FlagType("launched")
	Bound    = FlagType("bound")
	Deleted  = FlagType("deleted")
)

// A struct that describes a pod task.
type T struct {
	ID          string
	Pod         api.Pod
	Spec        Spec
	Offer       offers.Perishable // thread-safe
	State       StateType
	Flags       map[FlagType]struct{}
	CreateTime  time.Time
	UpdatedTime time.Time // time of the most recent StatusUpdate we've seen from the mesos master

	podStatus  api.PodStatus
	executor   *mesos.ExecutorInfo // readonly
	podKey     string
	launchTime time.Time
	bindTime   time.Time
	mapper     HostPortMappingType
}

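// Spec captures the slave, resource, and port assignments computed for the task from an offer.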
type Spec struct {
	SlaveID string
	CPU     mresource.CPUShares
	Memory  mresource.MegaBytes
	PortMap []HostPortMapping
	Ports   []uint64
	Data    []byte
}

// mostly-clone this pod task. the clone will actually share some fields:
//   - executor    // OK because it's read only
//   - Offer       // OK because it guarantees safe concurrent access
func (t *T) Clone() *T {
	if t == nil {
		return nil
	}

	// shallow-copy
	clone := *t

	// deep copy
	(&t.Spec).copyTo(&clone.Spec)
	clone.Flags = map[FlagType]struct{}{}
	for k := range t.Flags {
		clone.Flags[k] = struct{}{}
	}
	return &clone
}

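// copyTo deep-copies the slice fields (PortMap, Ports, Data) of old into new.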
func (old *Spec) copyTo(new *Spec) {
	if len(old.PortMap) > 0 {
		new.PortMap = append(([]HostPortMapping)(nil), old.PortMap...)
	}
	if len(old.Ports) > 0 {
		new.Ports = append(([]uint64)(nil), old.Ports...)
	}
	if len(old.Data) > 0 {
		new.Data = append(([]byte)(nil), old.Data...)
	}
}

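// HasAcceptedOffer reports whether an offer has been accepted for this task, i.e. a slave has been assigned.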
func (t *T) HasAcceptedOffer() bool {
	return t.Spec.SlaveID != ""
}

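// GetOfferId returns the ID of the offer assigned to this task, or the empty string if there is none.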
func (t *T) GetOfferId() string {
	if t.Offer == nil {
		return ""
	}
	return t.Offer.Details().Id.GetValue()
}

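// generateTaskName derives the Mesos task name from the pod name and namespace,
// falling back to the default namespace if none is set.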
func generateTaskName(pod *api.Pod) string {
	ns := pod.Namespace
	if ns == "" {
		ns = api.NamespaceDefault
	}
	return fmt.Sprintf("%s.%s.pods", pod.Name, ns)
}

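// BuildTaskInfo assembles the Mesos TaskInfo for this task from its Spec:
// executor, data, cpu and mem scalar resources, and any assigned host ports.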
func (t *T) BuildTaskInfo() *mesos.TaskInfo {
	info := &mesos.TaskInfo{
		Name:     proto.String(generateTaskName(&t.Pod)),
		TaskId:   mutil.NewTaskID(t.ID),
		SlaveId:  mutil.NewSlaveID(t.Spec.SlaveID),
		Executor: t.executor,
		Data:     t.Spec.Data,
		Resources: []*mesos.Resource{
			mutil.NewScalarResource("cpus", float64(t.Spec.CPU)),
			mutil.NewScalarResource("mem", float64(t.Spec.Memory)),
		},
	}
	if portsResource := rangeResource("ports", t.Spec.Ports); portsResource != nil {
		info.Resources = append(info.Resources, portsResource)
	}
	return info
}

// FillFromDetails fills in the Spec of the T from the given offer details;
// it should be called during k8s scheduling, before binding.
func (t *T) FillFromDetails(details *mesos.Offer) error {
	if details == nil {
		//programming error
		panic("offer details are nil")
	}

	// compute used resources
	cpu := mresource.PodCPULimit(&t.Pod)
	mem := mresource.PodMemLimit(&t.Pod)
	log.V(3).Infof("Recording offer(s) %s against pod %v/%v: cpu: %.2f, mem: %.2f MB", details.Id, t.Pod.Namespace, t.Pod.Name, cpu, mem)

	t.Spec = Spec{
		SlaveID: details.GetSlaveId().GetValue(),
		CPU:     cpu,
		Memory:  mem,
	}

	// fill in port mapping
	if mapping, err := t.mapper.Generate(t, details); err != nil {
		t.Reset()
		return err
	} else {
		ports := []uint64{}
		for _, entry := range mapping {
			ports = append(ports, entry.OfferPort)
		}
		t.Spec.PortMap = mapping
		t.Spec.Ports = ports
	}

	// the hostname of the executor needs to match that of the offer, otherwise
	// the kubelet node status checker/updater is very unhappy
	const HOSTNAME_OVERRIDE_FLAG = "--hostname-override="
	hostname := details.GetHostname() // required field, non-empty
	hostnameOverride := HOSTNAME_OVERRIDE_FLAG + hostname

	argv := t.executor.Command.Arguments
	overwrite := false
	for i, arg := range argv {
		if strings.HasPrefix(arg, HOSTNAME_OVERRIDE_FLAG) {
			overwrite = true
			argv[i] = hostnameOverride
			break
		}
	}
	if !overwrite {
		t.executor.Command.Arguments = append(argv, hostnameOverride)
	}
	return nil
}

// Clear offer-related details from the task, should be called if/when an offer
// has already been assigned to a task but for some reason is no longer valid.
func (t *T) Reset() {
	log.V(3).Infof("Clearing offer(s) from pod %v", t.Pod.Name)
	t.Offer = nil
	t.Spec = Spec{}
}

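// AcceptOffer reports whether the given offer can satisfy the pod: the offer must come
// from the pod's pre-set NodeName (if any), provide the requested host ports, and carry
// enough cpu and memory.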
func (t *T) AcceptOffer(offer *mesos.Offer) bool {
	if offer == nil {
		return false
	}

	// if the user has specified a target host, make sure this offer is for that host
	if t.Pod.Spec.NodeName != "" && offer.GetHostname() != t.Pod.Spec.NodeName {
		return false
	}

	// check ports
	if _, err := t.mapper.Generate(t, offer); err != nil {
		log.V(3).Info(err)
		return false
	}

	// find offered cpu and mem
	var (
		offeredCpus mresource.CPUShares
		offeredMem  mresource.MegaBytes
	)
	for _, resource := range offer.Resources {
		if resource.GetName() == "cpus" {
			offeredCpus = mresource.CPUShares(*resource.GetScalar().Value)
		}

		if resource.GetName() == "mem" {
			offeredMem = mresource.MegaBytes(*resource.GetScalar().Value)
		}
	}

	// calculate cpu and mem sum over all containers of the pod
	// TODO (@sttts): also support pod.spec.resources.limit.request
	// TODO (@sttts): take into account the executor resources
	cpu := mresource.PodCPULimit(&t.Pod)
	mem := mresource.PodMemLimit(&t.Pod)
	log.V(4).Infof("trying to match offer with pod %v/%v: cpus: %.2f mem: %.2f MB", t.Pod.Namespace, t.Pod.Name, cpu, mem)
	if (cpu > offeredCpus) || (mem > offeredMem) {
		log.V(3).Infof("not enough resources for pod %v/%v: cpus: %.2f mem: %.2f MB", t.Pod.Namespace, t.Pod.Name, cpu, mem)
		return false
	}

	return true
}

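// Set marks the task with the given flag; setting Launched additionally records the
// launch time and observes the queue wait time metric.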
func (t *T) Set(f FlagType) {
	t.Flags[f] = struct{}{}
	if Launched == f {
		t.launchTime = time.Now()
		queueWaitTime := t.launchTime.Sub(t.CreateTime)
		metrics.QueueWaitTime.Observe(metrics.InMicroseconds(queueWaitTime))
	}
}

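// Has reports whether the given flag has been set on the task.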
func (t *T) Has(f FlagType) (exists bool) {
	_, exists = t.Flags[f]
	return
}

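// New creates a pending pod task for the given pod and executor, generating a task ID
// if none is supplied; the executor is deep-copied so the caller may reuse it.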
func New(ctx api.Context, id string, pod api.Pod, executor *mesos.ExecutorInfo) (*T, error) {
	if executor == nil {
		return nil, fmt.Errorf("illegal argument: executor was nil")
	}
	key, err := MakePodKey(ctx, pod.Name)
	if err != nil {
		return nil, err
	}
	if id == "" {
		id = "pod." + uuid.NewUUID().String()
	}
	task := &T{
		ID:       id,
		Pod:      pod,
		State:    StatePending,
		podKey:   key,
		mapper:   MappingTypeForPod(&pod),
		Flags:    make(map[FlagType]struct{}),
		executor: proto.Clone(executor).(*mesos.ExecutorInfo),
	}
	task.CreateTime = time.Now()
	return task, nil
}

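// SaveRecoveryInfo stores the task, slave, offer, and executor IDs in the given
// annotation map so that RecoverFrom can later reconstruct the task from the pod.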
func (t *T) SaveRecoveryInfo(dict map[string]string) {
	dict[annotation.TaskIdKey] = t.ID
	dict[annotation.SlaveIdKey] = t.Spec.SlaveID
	dict[annotation.OfferIdKey] = t.Offer.Details().Id.GetValue()
	dict[annotation.ExecutorIdKey] = t.executor.ExecutorId.GetValue()
}

// reconstruct a task from metadata stashed in a pod entry. there are limited pod states that
// support reconstruction. if we expect to be able to reconstruct state but encounter errors
// in the process then those errors are returned. if the pod is in a seemingly valid state but
// otherwise does not support task reconstruction return false. if we're able to reconstruct
// state then return a reconstructed task and true.
//
// at this time task reconstruction is only supported for pods that have been annotated with
// binding metadata, which implies that they've previously been associated with a task and
// that mesos knows about it.
//
// assumes that the pod data comes from the k8s registry and reflects the desired state.
//
func RecoverFrom(pod api.Pod) (*T, bool, error) {
	// we only expect annotations if pod has been bound, which implies that it has already
	// been scheduled and launched
	if pod.Spec.NodeName == "" && len(pod.Annotations) == 0 {
		log.V(1).Infof("skipping recovery for unbound pod %v/%v", pod.Namespace, pod.Name)
		return nil, false, nil
	}

	// only process pods that are not in a terminal state
	switch pod.Status.Phase {
	case api.PodPending, api.PodRunning, api.PodUnknown: // continue
	default:
		log.V(1).Infof("skipping recovery for terminal pod %v/%v", pod.Namespace, pod.Name)
		return nil, false, nil
	}

	ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
	key, err := MakePodKey(ctx, pod.Name)
	if err != nil {
		return nil, false, err
	}

	//TODO(jdef) recover ports (and other resource requirements?) from the pod spec as well

	now := time.Now()
	t := &T{
		Pod:        pod,
		CreateTime: now,
		podKey:     key,
		State:      StatePending, // possibly running? mesos will tell us during reconciliation
		Flags:      make(map[FlagType]struct{}),
		mapper:     MappingTypeForPod(&pod),
		launchTime: now,
		bindTime:   now,
	}
	var (
		offerId  string
		hostname string
	)
	for _, k := range []string{
		annotation.BindingHostKey,
		annotation.TaskIdKey,
		annotation.SlaveIdKey,
		annotation.OfferIdKey,
		annotation.ExecutorIdKey,
	} {
		v, found := pod.Annotations[k]
		if !found {
			return nil, false, fmt.Errorf("incomplete metadata: missing value for pod annotation: %v", k)
		}
		switch k {
		case annotation.BindingHostKey:
			hostname = v
		case annotation.SlaveIdKey:
			t.Spec.SlaveID = v
		case annotation.OfferIdKey:
			offerId = v
		case annotation.TaskIdKey:
			t.ID = v
		case annotation.ExecutorIdKey:
			// this is nowhere near sufficient to re-launch a task, but we really just
			// want this for tracking
			t.executor = &mesos.ExecutorInfo{ExecutorId: mutil.NewExecutorID(v)}
		}
	}
	t.Offer = offers.Expired(offerId, hostname, 0)
	t.Flags[Launched] = struct{}{}
	t.Flags[Bound] = struct{}{}
	return t, true, nil
}