mirror of
				https://github.com/optim-enterprises-bv/kubernetes.git
				synced 2025-11-04 04:08:16 +00:00 
			
		
		
		
	Install Nvidia Daemonset in test harness for GCE
Signed-off-by: Davanum Srinivas <davanum@gmail.com>
This commit is contained in:
		@@ -18,17 +18,26 @@ package node
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
import (
 | 
					import (
 | 
				
			||||||
	"context"
 | 
						"context"
 | 
				
			||||||
 | 
						"os"
 | 
				
			||||||
 | 
						"time"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						appsv1 "k8s.io/api/apps/v1"
 | 
				
			||||||
	v1 "k8s.io/api/core/v1"
 | 
						v1 "k8s.io/api/core/v1"
 | 
				
			||||||
	"k8s.io/apimachinery/pkg/api/resource"
 | 
						"k8s.io/apimachinery/pkg/api/resource"
 | 
				
			||||||
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 | 
						metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 | 
				
			||||||
	"k8s.io/apimachinery/pkg/util/uuid"
 | 
						"k8s.io/apimachinery/pkg/util/uuid"
 | 
				
			||||||
	clientset "k8s.io/client-go/kubernetes"
 | 
						clientset "k8s.io/client-go/kubernetes"
 | 
				
			||||||
 | 
						extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions"
 | 
				
			||||||
	"k8s.io/kubernetes/test/e2e/feature"
 | 
						"k8s.io/kubernetes/test/e2e/feature"
 | 
				
			||||||
	"k8s.io/kubernetes/test/e2e/framework"
 | 
						"k8s.io/kubernetes/test/e2e/framework"
 | 
				
			||||||
 | 
						e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
 | 
				
			||||||
	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
 | 
						e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
 | 
				
			||||||
 | 
						e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
 | 
				
			||||||
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 | 
						e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 | 
				
			||||||
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
 | 
						e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
 | 
				
			||||||
 | 
						e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
 | 
				
			||||||
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
 | 
						e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
 | 
				
			||||||
 | 
						e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
 | 
				
			||||||
	admissionapi "k8s.io/pod-security-admission/api"
 | 
						admissionapi "k8s.io/pod-security-admission/api"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	"github.com/onsi/ginkgo/v2"
 | 
						"github.com/onsi/ginkgo/v2"
 | 
				
			||||||
@@ -42,12 +51,12 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
 | 
				
			|||||||
	var podClient *e2epod.PodClient
 | 
						var podClient *e2epod.PodClient
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	ginkgo.BeforeEach(func() {
 | 
						ginkgo.BeforeEach(func() {
 | 
				
			||||||
		e2eskipper.SkipUnlessProviderIs("aws")
 | 
							e2eskipper.SkipUnlessProviderIs("aws", "gce")
 | 
				
			||||||
		podClient = e2epod.NewPodClient(f)
 | 
							podClient = e2epod.NewPodClient(f)
 | 
				
			||||||
	})
 | 
						})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	f.It("should run nvidia-smi cli", func(ctx context.Context) {
 | 
						f.It("should run nvidia-smi cli", func(ctx context.Context) {
 | 
				
			||||||
		checkEnvironmentAndSkipIfNeeded(ctx, f.ClientSet)
 | 
							SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
 | 
				
			||||||
		pod := testNvidiaCLIPod()
 | 
							pod := testNvidiaCLIPod()
 | 
				
			||||||
		pod.Spec.Containers[0].Command = []string{"nvidia-smi"}
 | 
							pod.Spec.Containers[0].Command = []string{"nvidia-smi"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -65,7 +74,7 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, "Sanity test for Nvidia Device", fu
 | 
				
			|||||||
	})
 | 
						})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	f.It("should run gpu based matrix multiplication", func(ctx context.Context) {
 | 
						f.It("should run gpu based matrix multiplication", func(ctx context.Context) {
 | 
				
			||||||
		checkEnvironmentAndSkipIfNeeded(ctx, f.ClientSet)
 | 
							SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
 | 
				
			||||||
		pod := testMatrixMultiplicationPod()
 | 
							pod := testMatrixMultiplicationPod()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		ginkgo.By("Creating a pod that runs matrix multiplication")
 | 
							ginkgo.By("Creating a pod that runs matrix multiplication")
 | 
				
			||||||
@@ -180,7 +189,18 @@ print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f
 | 
				
			|||||||
	return &pod
 | 
						return &pod
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func checkEnvironmentAndSkipIfNeeded(ctx context.Context, clientSet clientset.Interface) {
 | 
					func SetupEnvironmentAndSkipIfNeeded(ctx context.Context, f *framework.Framework, clientSet clientset.Interface) {
 | 
				
			||||||
 | 
						if framework.ProviderIs("gce") {
 | 
				
			||||||
 | 
							rsgather := SetupNVIDIAGPUNode(ctx, f)
 | 
				
			||||||
 | 
							defer func() {
 | 
				
			||||||
 | 
								framework.Logf("Stopping ResourceUsageGather")
 | 
				
			||||||
 | 
								constraints := make(map[string]e2edebug.ResourceConstraint)
 | 
				
			||||||
 | 
								// For now, just gets summary. Can pass valid constraints in the future.
 | 
				
			||||||
 | 
								summary, err := rsgather.StopAndSummarize([]int{50, 90, 100}, constraints)
 | 
				
			||||||
 | 
								f.TestSummaries = append(f.TestSummaries, summary)
 | 
				
			||||||
 | 
								framework.ExpectNoError(err, "getting resource usage summary")
 | 
				
			||||||
 | 
							}()
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
	nodes, err := e2enode.GetReadySchedulableNodes(ctx, clientSet)
 | 
						nodes, err := e2enode.GetReadySchedulableNodes(ctx, clientSet)
 | 
				
			||||||
	framework.ExpectNoError(err)
 | 
						framework.ExpectNoError(err)
 | 
				
			||||||
	capacity := 0
 | 
						capacity := 0
 | 
				
			||||||
@@ -204,3 +224,83 @@ func checkEnvironmentAndSkipIfNeeded(ctx context.Context, clientSet clientset.In
 | 
				
			|||||||
		e2eskipper.Skipf("%d ready nodes do not have any allocatable Nvidia GPU(s). Skipping...", len(nodes.Items))
 | 
							e2eskipper.Skipf("%d ready nodes do not have any allocatable Nvidia GPU(s). Skipping...", len(nodes.Items))
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func areGPUsAvailableOnAllSchedulableNodes(ctx context.Context, clientSet clientset.Interface) bool {
 | 
				
			||||||
 | 
						framework.Logf("Getting list of Nodes from API server")
 | 
				
			||||||
 | 
						nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
 | 
				
			||||||
 | 
						framework.ExpectNoError(err, "getting node list")
 | 
				
			||||||
 | 
						for _, node := range nodeList.Items {
 | 
				
			||||||
 | 
							if node.Spec.Unschedulable {
 | 
				
			||||||
 | 
								continue
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							framework.Logf("gpuResourceName %s", e2egpu.NVIDIAGPUResourceName)
 | 
				
			||||||
 | 
							if val, ok := node.Status.Capacity[e2egpu.NVIDIAGPUResourceName]; !ok || val.Value() == 0 {
 | 
				
			||||||
 | 
								framework.Logf("Nvidia GPUs not available on Node: %q", node.Name)
 | 
				
			||||||
 | 
								return false
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						framework.Logf("Nvidia GPUs exist on all schedulable nodes")
 | 
				
			||||||
 | 
						return true
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func logOSImages(ctx context.Context, f *framework.Framework) {
 | 
				
			||||||
 | 
						nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
 | 
				
			||||||
 | 
						framework.ExpectNoError(err, "getting node list")
 | 
				
			||||||
 | 
						for _, node := range nodeList.Items {
 | 
				
			||||||
 | 
							framework.Logf("Nodename: %v, OS Image: %v", node.Name, node.Status.NodeInfo.OSImage)
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// driverInstallTimeout is the upper bound used when waiting for Nvidia GPUs to
					// show up on the nodes. Nvidia driver installation can take upwards of
					// 5 minutes, so the limit is set to twice that.
					const driverInstallTimeout = 10 * time.Minute
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// SetupNVIDIAGPUNode install Nvidia Drivers and wait for Nvidia GPUs to be available on nodes
 | 
				
			||||||
 | 
					func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) *e2edebug.ContainerResourceGatherer {
 | 
				
			||||||
 | 
						logOSImages(ctx, f)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						var err error
 | 
				
			||||||
 | 
						var ds *appsv1.DaemonSet
 | 
				
			||||||
 | 
						dsYamlURLFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET")
 | 
				
			||||||
 | 
						if dsYamlURLFromEnv != "" {
 | 
				
			||||||
 | 
							// Using DaemonSet from remote URL
 | 
				
			||||||
 | 
							framework.Logf("Using remote nvidia-driver-installer daemonset manifest from %v", dsYamlURLFromEnv)
 | 
				
			||||||
 | 
							ds, err = e2emanifest.DaemonSetFromURL(ctx, dsYamlURLFromEnv)
 | 
				
			||||||
 | 
							framework.ExpectNoError(err, "failed get remote")
 | 
				
			||||||
 | 
						} else {
 | 
				
			||||||
 | 
							// Using default local DaemonSet
 | 
				
			||||||
 | 
							framework.Logf("Using default local nvidia-driver-installer daemonset manifest.")
 | 
				
			||||||
 | 
							data, err := e2etestfiles.Read("test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml")
 | 
				
			||||||
 | 
							framework.ExpectNoError(err, "failed to read local manifest for nvidia-driver-installer daemonset")
 | 
				
			||||||
 | 
							ds, err = e2emanifest.DaemonSetFromData(data)
 | 
				
			||||||
 | 
							framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset")
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						ds.Namespace = f.Namespace.Name
 | 
				
			||||||
 | 
						_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(ctx, ds, metav1.CreateOptions{})
 | 
				
			||||||
 | 
						framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
 | 
				
			||||||
 | 
						framework.Logf("Successfully created daemonset to install Nvidia drivers.")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						pods, err := e2eresource.WaitForControlledPods(ctx, f.ClientSet, ds.Namespace, ds.Name, extensionsinternal.Kind("DaemonSet"))
 | 
				
			||||||
 | 
						framework.ExpectNoError(err, "failed to get pods controlled by the nvidia-driver-installer daemonset")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						devicepluginPods, err := e2eresource.WaitForControlledPods(ctx, f.ClientSet, "kube-system", "nvidia-gpu-device-plugin", extensionsinternal.Kind("DaemonSet"))
 | 
				
			||||||
 | 
						if err == nil {
 | 
				
			||||||
 | 
							framework.Logf("Adding deviceplugin addon pod.")
 | 
				
			||||||
 | 
							pods.Items = append(pods.Items, devicepluginPods.Items...)
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						framework.Logf("Starting ResourceUsageGather for the created DaemonSet pods.")
 | 
				
			||||||
 | 
						rsgather, err := e2edebug.NewResourceUsageGatherer(ctx, f.ClientSet,
 | 
				
			||||||
 | 
							e2edebug.ResourceGathererOptions{InKubemark: false, Nodes: e2edebug.AllNodes, ResourceDataGatheringPeriod: 2 * time.Second, ProbeDuration: 2 * time.Second, PrintVerboseLogs: true}, pods)
 | 
				
			||||||
 | 
						framework.ExpectNoError(err, "creating ResourceUsageGather for the daemonset pods")
 | 
				
			||||||
 | 
						go rsgather.StartGatheringData(ctx)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						// Wait for Nvidia GPUs to be available on nodes
 | 
				
			||||||
 | 
						framework.Logf("Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
 | 
				
			||||||
 | 
						gomega.Eventually(ctx, func(ctx context.Context) bool {
 | 
				
			||||||
 | 
							return areGPUsAvailableOnAllSchedulableNodes(ctx, f.ClientSet)
 | 
				
			||||||
 | 
						}, driverInstallTimeout, time.Second).Should(gomega.BeTrueBecause("expected GPU resources to be available within the timout"))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return rsgather
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user