improve gpu integration

Signed-off-by: Vishnu kannan <vishnuk@google.com>
@@ -206,7 +206,7 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
	fs.BoolVar(&s.BabysitDaemons, "babysit-daemons", s.BabysitDaemons, "If true, the node has babysitter process monitoring docker and kubelet.")
	fs.MarkDeprecated("babysit-daemons", "Will be removed in a future version.")
	fs.Int32Var(&s.MaxPods, "max-pods", s.MaxPods, "Number of Pods that can run on this Kubelet.")
	fs.BoolVar(&s.EnableExperimentalNvidiaGPU, "experimental-enable-nvidia-gpu", s.EnableExperimentalNvidiaGPU, "Enable experimental Nvidia GPU support.")
	fs.BoolVar(&s.ExperimentalEnableNvidiaGPU, "experimental-enable-nvidia-gpu", s.ExperimentalEnableNvidiaGPU, "Enable experimental Nvidia GPU support.")
	// TODO(#40229): Remove the docker-exec-handler flag.
	fs.StringVar(&s.DockerExecHandlerName, "docker-exec-handler", s.DockerExecHandlerName, "Handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.")
	fs.MarkDeprecated("docker-exec-handler", "this flag will be removed and only the 'native' handler will be supported in the future.")

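Note: the renamed flag gates the whole feature; with --experimental-enable-nvidia-gpu set, containers can ask for the alpha Nvidia GPU resource through their limits. A minimal sketch (not part of this commit) of such a request in Go, assuming this tree's k8s.io/kubernetes/pkg/api/v1 package and the v1.ResourceNvidiaGPU name used elsewhere in this diff:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/kubernetes/pkg/api/v1"
)

func main() {
	// A container asking for one Nvidia GPU; the kubelet only honors this
	// when it was started with --experimental-enable-nvidia-gpu.
	c := v1.Container{
		Name:  "cuda-job",       // illustrative name
		Image: "nvidia/cuda",    // illustrative image
		Resources: v1.ResourceRequirements{
			Limits: v1.ResourceList{
				v1.ResourceNvidiaGPU: resource.MustParse("1"),
			},
		},
	}
	fmt.Println(c.Resources.Limits.NvidiaGPU().Value()) // 1
}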
@@ -363,7 +363,7 @@ type KubeletConfiguration struct {
	// maxPods is the number of pods that can run on this Kubelet.
	MaxPods int32
	// Enable experimental Nvidia GPU
	EnableExperimentalNvidiaGPU bool
	ExperimentalEnableNvidiaGPU bool
	// dockerExecHandlerName is the handler to use when executing a command
	// in a container. Valid values are 'native' and 'nsenter'. Defaults to
	// 'native'.

@@ -408,7 +408,7 @@ type KubeletConfiguration struct {
	// maxPods is the number of pods that can run on this Kubelet.
	MaxPods int32 `json:"maxPods"`
	// Enable Nvidia GPU support on this node.
	EnableExperimentalNvidiaGPU bool `json:"enableExperimentalNvidiaGPU"`
	ExperimentalEnableNvidiaGPU bool `json:"experimentalEnableNvidiaGPU"`
	// dockerExecHandlerName is the handler to use when executing a command
	// in a container. Valid values are 'native' and 'nsenter'. Defaults to
	// 'native'.

@@ -353,7 +353,7 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu
	out.HairpinMode = in.HairpinMode
	out.BabysitDaemons = in.BabysitDaemons
	out.MaxPods = in.MaxPods
	out.NvidiaGPUs = in.NvidiaGPUs
	out.ExperimentalEnableNvidiaGPU = in.ExperimentalEnableNvidiaGPU
	out.DockerExecHandlerName = in.DockerExecHandlerName
	out.PodCIDR = in.PodCIDR
	out.ResolverConfig = in.ResolverConfig
@@ -531,7 +531,7 @@ func autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu
	out.HairpinMode = in.HairpinMode
	out.BabysitDaemons = in.BabysitDaemons
	out.MaxPods = in.MaxPods
	out.NvidiaGPUs = in.NvidiaGPUs
	out.ExperimentalEnableNvidiaGPU = in.ExperimentalEnableNvidiaGPU
	out.DockerExecHandlerName = in.DockerExecHandlerName
	out.PodCIDR = in.PodCIDR
	out.ResolverConfig = in.ResolverConfig

File diff suppressed because it is too large.

pkg/kubelet/gpu/gpu_manager_stub.go (new file, 41 lines)
@@ -0,0 +1,41 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package gpu

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api/v1"
)

type gpuManagerStub struct{}

func (gms *gpuManagerStub) Start() error {
	return nil
}

func (gms *gpuManagerStub) Capacity() v1.ResourceList {
	return nil
}

func (gms *gpuManagerStub) AllocateGPU(_ *v1.Pod, _ *v1.Container) ([]string, error) {
	return nil, fmt.Errorf("GPUs are not supported")
}

func NewGPUManagerStub() GPUManager {
	return &gpuManagerStub{}
}
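Note: the stub keeps the kubelet's GPU code path uniform when the feature flag is off; callers always hold a GPUManager and never nil-check. A hypothetical test-style sketch (not part of this commit) of the contract the stub guarantees:

package gpu

import "testing"

// A hypothetical test exercising the stub's behaviour.
func TestGPUManagerStub(t *testing.T) {
	m := NewGPUManagerStub()
	if err := m.Start(); err != nil {
		t.Fatalf("stub Start should be a no-op, got %v", err)
	}
	if m.Capacity() != nil {
		t.Fatalf("stub should advertise no GPU capacity")
	}
	if _, err := m.AllocateGPU(nil, nil); err == nil {
		t.Fatalf("stub should refuse to allocate GPUs")
	}
}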
							
								
								
									
pkg/kubelet/gpu/nvidia/helpers.go (new file, 59 lines)
@@ -0,0 +1,59 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nvidia

import "k8s.io/apimachinery/pkg/util/sets"

// podGPUs represents a list of pod to GPU mappings.
type podGPUs struct {
	podGPUMapping map[string]sets.String
}

func newPodGpus() *podGPUs {
	return &podGPUs{
		podGPUMapping: map[string]sets.String{},
	}
}
func (pgpu *podGPUs) pods() sets.String {
	ret := sets.NewString()
	for k := range pgpu.podGPUMapping {
		ret.Insert(k)
	}
	return ret
}

func (pgpu *podGPUs) insert(podUID string, device string) {
	if _, exists := pgpu.podGPUMapping[podUID]; !exists {
		pgpu.podGPUMapping[podUID] = sets.NewString(device)
	} else {
		pgpu.podGPUMapping[podUID].Insert(device)
	}
}

func (pgpu *podGPUs) delete(pods []string) {
	for _, uid := range pods {
		delete(pgpu.podGPUMapping, uid)
	}
}

func (pgpu *podGPUs) devices() sets.String {
	ret := sets.NewString()
	for _, devices := range pgpu.podGPUMapping {
		ret.Union(devices)
	}
	return ret
}
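Note: podGPUs is plain bookkeeping, pod UID mapped to a set of device paths. A hypothetical test-style sketch (not in the commit) of how the helpers compose:

package nvidia

import "testing"

// A hypothetical test walking through the podGPUs bookkeeping.
func TestPodGPUsBookkeeping(t *testing.T) {
	pg := newPodGpus()
	pg.insert("pod-a", "/dev/nvidia0")
	pg.insert("pod-a", "/dev/nvidia1") // second insert reuses the existing set
	pg.insert("pod-b", "/dev/nvidia2")

	if pg.pods().Len() != 2 {
		t.Fatalf("expected 2 pods, got %d", pg.pods().Len())
	}
	// Releasing GPUs of terminated pods is just a delete of their UIDs.
	pg.delete([]string{"pod-a"})
	if pg.pods().Has("pod-a") {
		t.Fatal("pod-a should have been dropped")
	}
}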
@@ -1,5 +1,5 @@
/*
Copyright 2016 The Kubernetes Authors.
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -18,12 +18,19 @@ package nvidia

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"path"
	"regexp"
	"sync"

	"github.com/golang/glog"

	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/kubernetes/pkg/api/v1"
	"k8s.io/kubernetes/pkg/kubelet/dockertools"
	"k8s.io/kubernetes/pkg/kubelet/gpu"
)

// TODO: If use NVML in the future, the implementation could be more complex,
@@ -34,53 +41,40 @@ const (
	// If the driver installed correctly, the 2 devices must be there.
	NvidiaCtlDevice  string = "/dev/nvidiactl"
	NvidiaUVMDevice  string = "/dev/nvidia-uvm"
	devDirectory            = "/dev"
	nvidiaDeviceRE          = `^nvidia[0-9]*$`
	nvidiaFullpathRE        = `^/dev/nvidia[0-9]*$`
)

// Manage GPU devices.
type NvidiaGPUManager struct {
	gpuPaths []string
	gpuMutex sync.Mutex
type activePodsLister interface {
	// Returns a list of active pods on the node.
	GetRunningPods() ([]*v1.Pod, error)
}

// nvidiaGPUManager manages nvidia gpu devices.
type nvidiaGPUManager struct {
	sync.Mutex
	// All gpus available on the Node
	allGPUs   sets.String
	allocated *podGPUs
	// The interface which could get GPU mapping from all the containers.
	// TODO: Should make this independent of Docker in the future.
	dockerClient     dockertools.DockerInterface
	activePodsLister activePodsLister
}

// Get all the paths of NVIDIA GPU card from /dev/
// TODO: Without NVML support we only can check whether there has GPU devices, but
// could not give a health check or get more information like GPU cores, memory, or
// family name. Need to support NVML in the future. But we do not need NVML until
// we want more features, features like schedule containers according to GPU family
// name.
func (ngm *NvidiaGPUManager) discovery() (err error) {
	if ngm.gpuPaths == nil {
		err = filepath.Walk("/dev", func(path string, f os.FileInfo, err error) error {
			reg := regexp.MustCompile(`^nvidia[0-9]*$`)
			gpupath := reg.FindAllString(f.Name(), -1)
			if gpupath != nil && gpupath[0] != "" {
				ngm.gpuPaths = append(ngm.gpuPaths, "/dev/"+gpupath[0])
// NewNvidiaGPUManager returns a GPUManager that manages local Nvidia GPUs.
// TODO: Migrate to use pod level cgroups and make it generic to all runtimes.
func NewNvidiaGPUManager(activePodsLister activePodsLister, dockerClient dockertools.DockerInterface) gpu.GPUManager {
	return &nvidiaGPUManager{
		allGPUs:          sets.NewString(),
		dockerClient:     dockerClient,
		activePodsLister: activePodsLister,
	}

			return nil
		})

		if err != nil {
			return err
		}
	}

	return nil
}

func Valid(path string) bool {
	reg := regexp.MustCompile(`^/dev/nvidia[0-9]*$`)
	check := reg.FindAllString(path, -1)

	return check != nil && check[0] != ""
}

// Initialize the GPU devices, so far only needed to discover the GPU paths.
func (ngm *NvidiaGPUManager) Init(dc dockertools.DockerInterface) error {
func (ngm *nvidiaGPUManager) Start() error {
	if _, err := os.Stat(NvidiaCtlDevice); err != nil {
		return err
	}
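Note: the two regular expressions split discovery from validation. nvidiaDeviceRE matches bare device names found while listing /dev; nvidiaFullpathRE matches full device paths reported by the container runtime. A small standalone check (illustrative only, same patterns as the constants above):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	deviceRE := regexp.MustCompile(`^nvidia[0-9]*$`)        // bare names under /dev
	fullpathRE := regexp.MustCompile(`^/dev/nvidia[0-9]*$`) // absolute device paths

	fmt.Println(deviceRE.MatchString("nvidia0"))        // true
	fmt.Println(deviceRE.MatchString("nvidiactl"))      // false; the control device is handled separately
	fmt.Println(fullpathRE.MatchString("/dev/nvidia1")) // true
}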
@@ -88,43 +82,162 @@ func (ngm *NvidiaGPUManager) Init(dc dockertools.DockerInterface) error {
	if _, err := os.Stat(NvidiaUVMDevice); err != nil {
		return err
	}
	ngm.Lock()
	defer ngm.Unlock()

	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	err := ngm.discovery()

	ngm.dockerClient = dc

	if err := ngm.discoverGPUs(); err != nil {
		return err
}

func (ngm *NvidiaGPUManager) Shutdown() {
	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	ngm.gpuPaths = nil
	}
	// Its possible that the runtime isn't available now.
	allocatedGPUs, err := ngm.gpusInUse()
	if err == nil {
		ngm.allocated = allocatedGPUs
	}
	// We ignore errors with identifying allocated GPUs because it is possible that the runtime interfaces may be not be logically up.
	return nil
}

// Get how many GPU cards we have.
func (ngm *NvidiaGPUManager) Capacity() int {
	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	return len(ngm.gpuPaths)
func (ngm *nvidiaGPUManager) Capacity() v1.ResourceList {
	gpus := resource.NewQuantity(int64(len(ngm.allGPUs)), resource.DecimalSI)
	return v1.ResourceList{
		v1.ResourceNvidiaGPU: *gpus,
	}
}

// Check whether the GPU device could be assigned to a container.
func (ngm *NvidiaGPUManager) isAvailable(path string) bool {
	containers, err := dockertools.GetKubeletDockerContainers(ngm.dockerClient, false)

// AllocateGPUs returns `num` GPUs if available, error otherwise.
// Allocation is made thread safe using the following logic.
// A list of all GPUs allocated is maintained along with their respective Pod UIDs.
// It is expected that the list of active pods will not return any false positives.
// As part of initialization or allocation, the list of GPUs in use will be computed once.
// Whenever an allocation happens, the list of GPUs allocated is updated based on the list of currently active pods.
// GPUs allocated to terminated pods are freed up lazily as part of allocation.
// GPUs are allocated based on the internal list of allocatedGPUs.
// It is not safe to generate a list of GPUs in use by inspecting active containers because of the delay between GPU allocation and container creation.
// A GPU allocated to a container might be re-allocated to a subsequent container because the original container wasn't started quick enough.
// The current algorithm scans containers only once and then uses a list of active pods to track GPU usage.
// This is a sub-optimal solution and a better alternative would be that of using pod level cgroups instead.
// GPUs allocated to containers should be reflected in pod level device cgroups before completing allocations.
// The pod level cgroups will then serve as a checkpoint of GPUs in use.
func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) ([]string, error) {
	gpusNeeded := container.Resources.Limits.NvidiaGPU().Value()
	if gpusNeeded == 0 {
		return []string{}, nil
	}
	ngm.Lock()
	defer ngm.Unlock()
	if ngm.allocated == nil {
		// Initialization is not complete. Try now. Failures can no longer be tolerated.
		allocated, err := ngm.gpusInUse()
		if err != nil {
		return true
			return nil, fmt.Errorf("failed to allocate GPUs because of issues identifying GPUs in use: %v", err)
		}
		ngm.allocated = allocated
	} else {
		// update internal list of GPUs in use prior to allocating new GPUs.
		if err := ngm.updateAllocatedGPUs(); err != nil {
			return nil, fmt.Errorf("failed to allocate GPUs because of issues with updating GPUs in use: %v", err)
		}
	}
	// Get GPU devices in use.
	devicesInUse := ngm.allocated.devices()
	// Get a list of available GPUs.
	available := ngm.allGPUs.Difference(devicesInUse)
	if int64(available.Len()) < gpusNeeded {
		return nil, fmt.Errorf("requested number of GPUs unavailable. Requested: %d, Available: %d", gpusNeeded, available.Len())
	}
	var ret []string
	for _, device := range available.List() {
		if gpusNeeded > 0 {
			ret = append(ret, device)
			// Update internal allocated GPU cache.
			ngm.allocated.insert(string(pod.UID), device)
		}
		gpusNeeded--
	}

	for i := range containers {
		containerJSON, err := ngm.dockerClient.InspectContainer(containers[i].ID)
	return ret, nil
}

func (ngm *nvidiaGPUManager) updateAllocatedGPUs() error {
	activePods, err := ngm.activePodsLister.GetRunningPods()
	if err != nil {
		return fmt.Errorf("failed to list active pods: %v", err)
	}
	activePodUids := sets.NewString()
	for _, pod := range activePods {
		activePodUids.Insert(string(pod.UID))
	}
	allocatedPodUids := ngm.allocated.pods()
	podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
	ngm.allocated.delete(podsToBeRemoved.List())
	return nil
}

// discoverGPUs identifies allGPUs NVIDIA GPU devices available on the local node by walking `/dev` directory.
// TODO: Without NVML support we only can check whether there has GPU devices, but
// could not give a health check or get more information like GPU cores, memory, or
// family name. Need to support NVML in the future. But we do not need NVML until
// we want more features, features like schedule containers according to GPU family
// name.
func (ngm *nvidiaGPUManager) discoverGPUs() error {
	reg := regexp.MustCompile(nvidiaDeviceRE)
	files, err := ioutil.ReadDir(devDirectory)
	if err != nil {
		return err
	}
	for _, f := range files {
		if f.IsDir() {
			continue
		}
		if reg.MatchString(f.Name()) {
			glog.V(2).Infof("Found Nvidia GPU %q", f.Name())
			ngm.allGPUs.Insert(path.Join(devDirectory, f.Name()))
		}
	}

	return nil
}

// gpusInUse returns a list of GPUs in use along with the respective pods that are using it.
func (ngm *nvidiaGPUManager) gpusInUse() (*podGPUs, error) {
	pods, err := ngm.activePodsLister.GetRunningPods()
	if err != nil {
		return nil, err
	}
	type podContainers struct {
		uid          string
		containerIDs sets.String
	}
	// List of containers to inspect.
	podContainersToInspect := []podContainers{}
	for _, pod := range pods {
		containers := sets.NewString()
		for _, container := range pod.Spec.Containers {
			// GPUs are expected to be specified only in limits.
			if !container.Resources.Limits.NvidiaGPU().IsZero() {
				containers.Insert(container.Name)
			}
		}
		// If no GPUs were requested skip this pod.
		if containers.Len() == 0 {
			continue
		}
		containerIDs := sets.NewString()
		for _, container := range pod.Status.ContainerStatuses {
			if containers.Has(container.Name) {
				containerIDs.Insert(container.ContainerID)
			}
		}
		// add the pod and its containers that need to be inspected.
		podContainersToInspect = append(podContainersToInspect, podContainers{string(pod.UID), containerIDs})
	}
	ret := newPodGpus()
	for _, podContainer := range podContainersToInspect {
		for _, containerId := range podContainer.containerIDs.List() {
			containerJSON, err := ngm.dockerClient.InspectContainer(containerId)
			if err != nil {
				glog.V(3).Infof("failed to inspect container %q in pod %q while attempting to reconcile nvidia gpus in use", containerId, podContainer.uid)
				continue
			}

@@ -134,48 +247,16 @@ func (ngm *NvidiaGPUManager) isAvailable(path string) bool {
			}

			for _, device := range devices {
			if Valid(device.PathOnHost) && path == device.PathOnHost {
				return false
				if isValidPath(device.PathOnHost) {
					glog.V(4).Infof("Nvidia GPU %q is in use by Docker Container: %q", device.PathOnHost, containerJSON.ID)
					ret.insert(podContainer.uid, device.PathOnHost)
				}
			}
		}

	return true
	}
	return ret, nil
}

// Return the GPU paths as needed, otherwise, return error.
func (ngm *NvidiaGPUManager) AllocateGPUs(num int) (paths []string, err error) {
	if num <= 0 {
		return
	}

	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	for _, path := range ngm.gpuPaths {
		if ngm.isAvailable(path) {
			paths = append(paths, path)
			if len(paths) == num {
				return
			}
		}
	}

	err = fmt.Errorf("Not enough GPUs!")

	return
}

// Return the count of GPUs which are free.
func (ngm *NvidiaGPUManager) AvailableGPUs() (num int) {
	ngm.gpuMutex.Lock()
	defer ngm.gpuMutex.Unlock()

	for _, path := range ngm.gpuPaths {
		if ngm.isAvailable(path) {
			num++
		}
	}

	return
func isValidPath(path string) bool {
	return regexp.MustCompile(nvidiaFullpathRE).MatchString(path)
}

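Note: the AllocateGPU comment block earlier in this file describes the whole scheme: remember which pod holds which device, lazily drop entries for pods that are no longer active, and hand out devices from the difference between all discovered GPUs and those still in use. A self-contained sketch of that flow (illustrative names, not the commit's API):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

type state struct {
	allGPUs   sets.String            // every /dev/nvidiaN discovered on the node
	allocated map[string]sets.String // pod UID -> devices handed to that pod
}

func (s *state) inUse() sets.String {
	used := sets.NewString()
	for _, devs := range s.allocated {
		used = used.Union(devs)
	}
	return used
}

func (s *state) allocate(podUID string, needed int, activePods sets.String) ([]string, error) {
	// GPUs held by pods that are no longer active are freed lazily, right here.
	for uid := range s.allocated {
		if !activePods.Has(uid) {
			delete(s.allocated, uid)
		}
	}
	available := s.allGPUs.Difference(s.inUse())
	if available.Len() < needed {
		return nil, fmt.Errorf("requested %d GPUs, only %d free", needed, available.Len())
	}
	granted := available.List()[:needed]
	if s.allocated[podUID] == nil {
		s.allocated[podUID] = sets.NewString()
	}
	s.allocated[podUID].Insert(granted...)
	return granted, nil
}

func main() {
	s := &state{allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"), allocated: map[string]sets.String{}}
	fmt.Println(s.allocate("pod-a", 1, sets.NewString("pod-a"))) // [/dev/nvidia0] <nil>
}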
							
								
								
									
pkg/kubelet/gpu/types.go (new file, 32 lines)
@@ -0,0 +1,32 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package gpu

import "k8s.io/kubernetes/pkg/api/v1"

// GPUManager manages GPUs on a local node.
// Implementations are expected to be thread safe.
type GPUManager interface {
	// Start logically initializes GPUManager
	Start() error
	// Capacity returns the total number of GPUs on the node.
	Capacity() v1.ResourceList
	// AllocateGPU attempts to allocate GPUs for input container.
	// Returns paths to allocated GPUs and nil on success.
	// Returns an error on failure.
	AllocateGPU(*v1.Pod, *v1.Container) ([]string, error)
}
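Note: the interface is deliberately small; anything that can report capacity and hand out device paths can sit behind it, alongside the real Nvidia manager and the stub. A hypothetical fixed-inventory fake that could back unit tests (a sketch, not part of this commit):

package gpu

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/kubernetes/pkg/api/v1"
)

// fakeGPUManager is a hypothetical in-memory implementation for tests.
type fakeGPUManager struct {
	paths []string // e.g. []string{"/dev/nvidia0", "/dev/nvidia1"}
}

func (f *fakeGPUManager) Start() error { return nil }

func (f *fakeGPUManager) Capacity() v1.ResourceList {
	return v1.ResourceList{
		v1.ResourceNvidiaGPU: *resource.NewQuantity(int64(len(f.paths)), resource.DecimalSI),
	}
}

func (f *fakeGPUManager) AllocateGPU(_ *v1.Pod, c *v1.Container) ([]string, error) {
	n := int(c.Resources.Limits.NvidiaGPU().Value())
	if n > len(f.paths) {
		return nil, fmt.Errorf("only %d GPUs available", len(f.paths))
	}
	return f.paths[:n], nil
}

// Compile-time check that the fake satisfies the interface.
var _ GPUManager = &fakeGPUManager{}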
@@ -67,6 +67,7 @@ import (
	"k8s.io/kubernetes/pkg/kubelet/dockertools"
	"k8s.io/kubernetes/pkg/kubelet/events"
	"k8s.io/kubernetes/pkg/kubelet/eviction"
	"k8s.io/kubernetes/pkg/kubelet/gpu"
	"k8s.io/kubernetes/pkg/kubelet/gpu/nvidia"
	"k8s.io/kubernetes/pkg/kubelet/images"
	"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
@@ -450,7 +451,6 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
		writer:            kubeDeps.Writer,
		nonMasqueradeCIDR: kubeCfg.NonMasqueradeCIDR,
		maxPods:           int(kubeCfg.MaxPods),
		enableNvidiaGPU:   kubeCfg.EnableNvidiaGPU,
		podsPerCore:       int(kubeCfg.PodsPerCore),
		syncLoopMonitor:   atomic.Value{},
		resolverConfig:    kubeCfg.ResolverConfig,
@@ -787,7 +787,11 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub

	klet.appArmorValidator = apparmor.NewValidator(kubeCfg.ContainerRuntime)
	klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewAppArmorAdmitHandler(klet.appArmorValidator))

	if kubeCfg.ExperimentalEnableNvidiaGPU {
		klet.gpuManager = nvidia.NewNvidiaGPUManager(klet, klet.dockerClient)
	} else {
		klet.gpuManager = gpu.NewGPUManagerStub()
	}
	// Finally, put the most recent version of the config on the Kubelet, so
	// people can see how it was configured.
	klet.kubeletConfiguration = *kubeCfg
@@ -982,9 +986,6 @@ type Kubelet struct {
	// Maximum Number of Pods which can be run by this Kubelet
	maxPods int

	// Enable experimental Nvidia GPU
	enableExperimentalNvidiaGPU bool

	// Monitor Kubelet's sync loop
	syncLoopMonitor atomic.Value

@@ -1091,8 +1092,8 @@ type Kubelet struct {
	// experimental behavior is desired.
	experimentalHostUserNamespaceDefaulting bool

	// NVIDIA GPU Manager
	nvidiaGPUManager nvidia.NvidiaGPUManager
	// GPU Manager
	gpuManager gpu.GPUManager
}

// setupDataDirs creates:
@@ -1186,11 +1187,8 @@ func (kl *Kubelet) initializeModules() error {
		return fmt.Errorf("Failed to start OOM watcher %v", err)
	}

	// Step 7: Init Nvidia Manager. Do not need to return err until we use NVML instead.
	// Only works when user give true to EnableExperimentalNvidiaGPU
	if kl.enableExperimentalNvidiaGPU {
		kl.nvidiaGPUManager.Init(kl.dockerClient)
	}
	// Step 7: Initialize GPUs
	kl.gpuManager.Start()

	// Step 8: Start resource analyzer
	kl.resourceAnalyzer.Start()

@@ -482,9 +482,12 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
		node.Status.Capacity = v1.ResourceList{}
	}

	nvidiaGPUCapacity := 0
	if kl.enableExperimentalNvidiaGPU {
		nvidiaGPUCapacity = kl.nvidiaGPUManager.Capacity()
	// populate GPU capacity.
	gpuCapacity := kl.gpuManager.Capacity()
	if gpuCapacity != nil {
		for k, v := range gpuCapacity {
			node.Status.Capacity[k] = v
		}
	}

	// TODO: Post NotReady if we cannot get MachineInfo from cAdvisor. This needs to start
@@ -496,8 +499,6 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
		node.Status.Capacity[v1.ResourceCPU] = *resource.NewMilliQuantity(0, resource.DecimalSI)
		node.Status.Capacity[v1.ResourceMemory] = resource.MustParse("0Gi")
		node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(int64(kl.maxPods), resource.DecimalSI)
		node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(int64(nvidiaGPUCapacity), resource.DecimalSI)

		glog.Errorf("Error getting machine info: %v", err)
	} else {
		node.Status.NodeInfo.MachineID = info.MachineID
@@ -514,8 +515,6 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
			node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(
				int64(kl.maxPods), resource.DecimalSI)
		}
		node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(
			int64(nvidiaGPUCapacity), resource.DecimalSI)
		if node.Status.NodeInfo.BootID != "" &&
			node.Status.NodeInfo.BootID != info.BootID {
			// TODO: This requires a transaction, either both node status is updated

@@ -211,13 +211,11 @@ func TestUpdateNewNodeStatus(t *testing.T) {
				v1.ResourceCPU:    *resource.NewMilliQuantity(2000, resource.DecimalSI),
				v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
				v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
				v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
			},
			Allocatable: v1.ResourceList{
				v1.ResourceCPU:    *resource.NewMilliQuantity(1800, resource.DecimalSI),
				v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
				v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
				v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
			},
			Addresses: []v1.NodeAddress{
				{Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"},
@@ -485,13 +483,11 @@ func TestUpdateExistingNodeStatus(t *testing.T) {
				v1.ResourceCPU:    *resource.NewMilliQuantity(2000, resource.DecimalSI),
				v1.ResourceMemory: *resource.NewQuantity(20E9, resource.BinarySI),
				v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
				v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
			},
			Allocatable: v1.ResourceList{
				v1.ResourceCPU:    *resource.NewMilliQuantity(1800, resource.DecimalSI),
				v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
				v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
				v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
			},
			Addresses: []v1.NodeAddress{
				{Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"},
@@ -793,13 +789,11 @@ func TestUpdateNodeStatusWithRuntimeStateError(t *testing.T) {
				v1.ResourceCPU:    *resource.NewMilliQuantity(2000, resource.DecimalSI),
				v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
				v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
				v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
			},
			Allocatable: v1.ResourceList{
				v1.ResourceCPU:    *resource.NewMilliQuantity(1800, resource.DecimalSI),
				v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
				v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
				v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
			},
			Addresses: []v1.NodeAddress{
				{Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"},

@@ -87,28 +87,33 @@ func (kl *Kubelet) getActivePods() []*v1.Pod {

// makeDevices determines the devices for the given container.
// Experimental.
func (kl *Kubelet) makeDevices(container *v1.Container) []kubecontainer.DeviceInfo {
	if !kl.enableExperimentalNvidiaGPU {
		return nil
func (kl *Kubelet) makeDevices(pod *v1.Pod, container *v1.Container) ([]kubecontainer.DeviceInfo, error) {
	if container.Resources.Limits.NvidiaGPU().IsZero() {
		return nil, nil
	}

	nvidiaGPULimit := container.Resources.Limits.NvidiaGPU()

	if nvidiaGPULimit.Value() != 0 {
		if nvidiaGPUPaths, err := kl.nvidiaGPUManager.AllocateGPUs(int(nvidiaGPULimit.Value())); err == nil {
			devices := []kubecontainer.DeviceInfo{{PathOnHost: nvidia.NvidiaCtlDevice, PathInContainer: nvidia.NvidiaCtlDevice, Permissions: "mrw"},
				{PathOnHost: nvidia.NvidiaUVMDevice, PathInContainer: nvidia.NvidiaUVMDevice, Permissions: "mrw"}}
	nvidiaGPUPaths, err := kl.gpuManager.AllocateGPU(pod, container)
	if err != nil {
		return nil, err
	}
	devices := []kubecontainer.DeviceInfo{
		{
			PathOnHost:      nvidia.NvidiaCtlDevice,
			PathInContainer: nvidia.NvidiaCtlDevice,
			Permissions:     "mrw",
		},
		{
			PathOnHost:      nvidia.NvidiaUVMDevice,
			PathInContainer: nvidia.NvidiaUVMDevice,
			Permissions:     "mrw",
		},
	}

	for i, path := range nvidiaGPUPaths {
		devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: path, PathInContainer: "/dev/nvidia" + strconv.Itoa(i), Permissions: "mrw"})
	}

			return devices

		}
	}

	return nil
	return devices, nil
}

// makeMounts determines the mount points for the given container.
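Note: for a container granted GPUs, makeDevices now always injects the control and UVM devices and then maps each allocated card into the container as /dev/nvidia<i>. A small sketch of the resulting device list (illustrative; the struct only mirrors the kubecontainer.DeviceInfo fields used above):

package main

import "fmt"

// DeviceInfo mirrors the fields of kubecontainer.DeviceInfo used above.
type DeviceInfo struct {
	PathOnHost, PathInContainer, Permissions string
}

func main() {
	allocated := []string{"/dev/nvidia3", "/dev/nvidia7"} // paths returned by AllocateGPU
	devices := []DeviceInfo{
		{"/dev/nvidiactl", "/dev/nvidiactl", "mrw"},
		{"/dev/nvidia-uvm", "/dev/nvidia-uvm", "mrw"},
	}
	for i, p := range allocated {
		// Host device /dev/nvidia3 shows up inside the container as /dev/nvidia0, and so on.
		devices = append(devices, DeviceInfo{p, fmt.Sprintf("/dev/nvidia%d", i), "mrw"})
	}
	for _, d := range devices {
		fmt.Printf("%s -> %s (%s)\n", d.PathOnHost, d.PathInContainer, d.Permissions)
	}
}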
@@ -296,7 +301,10 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Contai

	opts.PortMappings = kubecontainer.MakePortMappings(container)
	// TODO(random-liu): Move following convert functions into pkg/kubelet/container
	opts.Devices = kl.makeDevices(container)
	opts.Devices, err = kl.makeDevices(pod, container)
	if err != nil {
		return nil, err
	}

	opts.Mounts, err = makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIP, volumes)
	if err != nil {

@@ -27,7 +27,6 @@ import (
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
@@ -1711,39 +1710,6 @@ func TestGetHostPortConflicts(t *testing.T) {
	assert.True(t, hasHostPortConflicts(pods), "Should have port conflicts")
}

func TestMakeDevices(t *testing.T) {
	testCases := []struct {
		container *v1.Container
		devices   []kubecontainer.DeviceInfo
		test      string
	}{
		{
			test:      "no device",
			container: &v1.Container{},
			devices:   nil,
		},
		{
			test: "gpu",
			container: &v1.Container{
				Resources: v1.ResourceRequirements{
					Limits: map[v1.ResourceName]resource.Quantity{
						v1.ResourceNvidiaGPU: resource.MustParse("1000"),
					},
				},
			},
			devices: []kubecontainer.DeviceInfo{
				{PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"},
				{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
				{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
			},
		},
	}

	for _, test := range testCases {
		assert.Equal(t, test.devices, makeDevices(test.container), "[test %q]", test.test)
	}
}

func TestHasHostMountPVC(t *testing.T) {
	tests := map[string]struct {
		pvError       error

@@ -49,6 +49,7 @@ import (
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	containertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
	"k8s.io/kubernetes/pkg/kubelet/eviction"
	"k8s.io/kubernetes/pkg/kubelet/gpu"
	"k8s.io/kubernetes/pkg/kubelet/images"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/network"
@@ -272,7 +273,7 @@ func newTestKubeletWithImageList(

	kubelet.AddPodSyncLoopHandler(activeDeadlineHandler)
	kubelet.AddPodSyncHandler(activeDeadlineHandler)

	kubelet.gpuManager = gpu.NewGPUManagerStub()
	return &TestKubelet{kubelet, fakeRuntime, mockCadvisor, fakeKubeClient, fakeMirrorClient, fakeClock, nil, plug}
}

@@ -150,7 +150,6 @@ func GetHollowKubeletConfig(
	c.MaxContainerCount = 100
	c.MaxOpenFiles = 1024
	c.MaxPerPodContainerCount = 2
	c.EnableExperimentalNvidiaGPU = false
	c.RegisterNode = true
	c.RegisterSchedulable = true
	c.RegistryBurst = 10
