Merge pull request #119012 from pohly/dra-batch-node-prepare

kubelet: support batched prepare/unprepare in v1alpha3 DRA plugin API
This commit is contained in:
Kubernetes Prow Robot
2023-07-12 10:57:37 -07:00
committed by GitHub
11 changed files with 2604 additions and 171 deletions

View File

@@ -28,6 +28,7 @@ import (
clientset "k8s.io/client-go/kubernetes"
"k8s.io/dynamic-resource-allocation/resourceclaim"
"k8s.io/klog/v2"
drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha3"
dra "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)
@@ -62,10 +63,12 @@ func NewManagerImpl(kubeClient clientset.Interface, stateFileDirectory string) (
}
// PrepareResources attempts to prepare all of the required resource
// plugin resources for the input container, issue an NodePrepareResource rpc request
// plugin resources for the input container, issue NodePrepareResources rpc requests
// for each new resource requirement, process their responses and update the cached
// containerResources on success.
func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error {
batches := make(map[string][]*drapb.Claim)
claimInfos := make(map[types.UID]*ClaimInfo)
for i := range pod.Spec.ResourceClaims {
podClaim := &pod.Spec.ResourceClaims[i]
klog.V(3).InfoS("Processing resource", "podClaim", podClaim.Name, "pod", pod.Name)
@@ -139,7 +142,7 @@ func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error {
sets.New(string(pod.UID)),
)
// Walk through each resourceHandle
// Loop through all plugins and prepare for calling NodePrepareResources.
for _, resourceHandle := range resourceHandles {
// If no DriverName is provided in the resourceHandle, we
// use the DriverName from the status
@@ -147,48 +150,71 @@ func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error {
if pluginName == "" {
pluginName = resourceClaim.Status.DriverName
}
// Call NodePrepareResource RPC for each resourceHandle
client, err := dra.NewDRAPluginClient(pluginName)
if err != nil {
return fmt.Errorf("failed to get DRA Plugin client for plugin name %s, err=%+v", pluginName, err)
claim := &drapb.Claim{
Namespace: resourceClaim.Namespace,
Uid: string(resourceClaim.UID),
Name: resourceClaim.Name,
ResourceHandle: resourceHandle.Data,
}
response, err := client.NodePrepareResource(
context.Background(),
resourceClaim.Namespace,
resourceClaim.UID,
resourceClaim.Name,
resourceHandle.Data)
if err != nil {
return fmt.Errorf("NodePrepareResource failed, claim UID: %s, claim name: %s, resource handle: %s, err: %+v",
resourceClaim.UID, resourceClaim.Name, resourceHandle.Data, err)
}
klog.V(3).InfoS("NodePrepareResource succeeded", "pluginName", pluginName, "response", response)
batches[pluginName] = append(batches[pluginName], claim)
}
claimInfos[resourceClaim.UID] = claimInfo
}
// Add the CDI Devices returned by NodePrepareResource to
// Call NodePrepareResources for all claims in each batch.
// If there is any error, processing gets aborted.
// We could try to continue, but that would make the code more complex.
for pluginName, claims := range batches {
// Call NodePrepareResources RPC for all resource handles.
client, err := dra.NewDRAPluginClient(pluginName)
if err != nil {
return fmt.Errorf("failed to get DRA Plugin client for plugin name %s: %v", pluginName, err)
}
response, err := client.NodePrepareResources(context.Background(), &drapb.NodePrepareResourcesRequest{Claims: claims})
if err != nil {
// General error unrelated to any particular claim.
return fmt.Errorf("NodePrepareResources failed: %v", err)
}
for claimUID, result := range response.Claims {
reqClaim := lookupClaimRequest(claims, claimUID)
if reqClaim == nil {
return fmt.Errorf("NodePrepareResources returned result for unknown claim UID %s", claimUID)
}
if result.Error != "" {
return fmt.Errorf("NodePrepareResources failed for claim %s/%s: %s", reqClaim.Namespace, reqClaim.Name, result.Error)
}
claimInfo := claimInfos[types.UID(claimUID)]
// Add the CDI Devices returned by NodePrepareResources to
// the claimInfo object.
err = claimInfo.addCDIDevices(pluginName, response.CdiDevices)
err = claimInfo.addCDIDevices(pluginName, result.CDIDevices)
if err != nil {
return fmt.Errorf("failed to add CDIDevices to claimInfo %+v: %+v", claimInfo, err)
}
// TODO: We (re)add the claimInfo object to the cache and
// sync it to the checkpoint *after* the
// NodePrepareResource call has completed. This will cause
// NodePrepareResources call has completed. This will cause
// issues if the kubelet gets restarted between
// NodePrepareResource and syncToCheckpoint. It will result
// in not calling NodeUnprepareResource for this claim
// NodePrepareResources and syncToCheckpoint. It will result
// in not calling NodeUnprepareResources for this claim
// because no claimInfo will be synced back to the cache
// for it after the restart. We need to resolve this issue
// before moving to beta.
m.cache.add(claimInfo)
}
// Checkpoint to reduce redundant calls to
// NodePrepareResource() after a kubelet restart.
err = m.cache.syncToCheckpoint()
if err != nil {
return fmt.Errorf("failed to checkpoint claimInfo state, err: %+v", err)
}
// Checkpoint to reduce redundant calls to
// NodePrepareResources after a kubelet restart.
err = m.cache.syncToCheckpoint()
if err != nil {
return fmt.Errorf("failed to checkpoint claimInfo state, err: %+v", err)
}
unfinished := len(claims) - len(response.Claims)
if unfinished != 0 {
return fmt.Errorf("NodePrepareResources left out %d claims", unfinished)
}
}
// Checkpoint to capture all of the previous addPodReference() calls.
@@ -199,6 +225,15 @@ func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error {
return nil
}
// lookupClaimRequest returns the claim in claims whose Uid equals
// claimUID, or nil when no matching claim is found.
func lookupClaimRequest(claims []*drapb.Claim, claimUID string) *drapb.Claim {
	for i := range claims {
		if claims[i].Uid == claimUID {
			return claims[i]
		}
	}
	return nil
}
func claimIsUsedByPod(podClaim *v1.PodResourceClaim, pod *v1.Pod) bool {
if claimIsUsedByContainers(podClaim, pod.Spec.InitContainers) {
return true
@@ -274,7 +309,8 @@ func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*Conta
// As such, calls to the underlying NodeUnprepareResource API are skipped for claims that have
// already been successfully unprepared.
func (m *ManagerImpl) UnprepareResources(pod *v1.Pod) error {
// Call NodeUnprepareResource RPC for every resource claim referenced by the pod
batches := make(map[string][]*drapb.Claim)
claimInfos := make(map[types.UID]*ClaimInfo)
for i := range pod.Spec.ResourceClaims {
claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
if err != nil {
@@ -324,8 +360,7 @@ func (m *ManagerImpl) UnprepareResources(pod *v1.Pod) error {
resourceHandles = make([]resourcev1alpha2.ResourceHandle, 1)
}
// Loop through all plugins and call NodeUnprepareResource only for the
// last pod that references the claim
// Loop through all plugins and prepare for calling NodeUnprepareResources.
for _, resourceHandle := range resourceHandles {
// If no DriverName is provided in the resourceHandle, we
// use the DriverName from the status
@@ -334,38 +369,62 @@ func (m *ManagerImpl) UnprepareResources(pod *v1.Pod) error {
pluginName = claimInfo.DriverName
}
// Call NodeUnprepareResource RPC for each resourceHandle
client, err := dra.NewDRAPluginClient(pluginName)
if err != nil {
return fmt.Errorf("failed to get DRA Plugin client for plugin name %s, err=%+v", pluginName, err)
claim := &drapb.Claim{
Namespace: resourceClaim.Namespace,
Uid: string(resourceClaim.UID),
Name: resourceClaim.Name,
ResourceHandle: resourceHandle.Data,
}
response, err := client.NodeUnprepareResource(
context.Background(),
claimInfo.Namespace,
claimInfo.ClaimUID,
claimInfo.ClaimName,
resourceHandle.Data)
if err != nil {
return fmt.Errorf(
"NodeUnprepareResource failed, pod: %s, claim UID: %s, claim name: %s, resource handle: %s, err: %+v",
pod.Name, claimInfo.ClaimUID, claimInfo.ClaimName, resourceHandle.Data, err)
}
klog.V(3).InfoS("NodeUnprepareResource succeeded", "response", response)
batches[pluginName] = append(batches[pluginName], claim)
}
claimInfos[resourceClaim.UID] = claimInfo
}
// Call NodeUnprepareResources for all claims in each batch.
// If there is any error, processing gets aborted.
// We could try to continue, but that would make the code more complex.
for pluginName, claims := range batches {
// Call NodeUnprepareResources RPC for all resource handles.
client, err := dra.NewDRAPluginClient(pluginName)
if err != nil {
return fmt.Errorf("failed to get DRA Plugin client for plugin name %s: %v", pluginName, err)
}
response, err := client.NodeUnprepareResources(context.Background(), &drapb.NodeUnprepareResourcesRequest{Claims: claims})
if err != nil {
// General error unrelated to any particular claim.
return fmt.Errorf("NodeUnprepareResources failed: %v", err)
}
// Delete last pod UID only if all NodeUnprepareResource calls succeed.
// This ensures that the status manager doesn't enter termination status
// for the pod. This logic is implemented in
// m.PodMightNeedToUnprepareResources and claimInfo.hasPodReference.
claimInfo.deletePodReference(pod.UID)
m.cache.delete(claimInfo.ClaimName, pod.Namespace)
for claimUID, result := range response.Claims {
reqClaim := lookupClaimRequest(claims, claimUID)
if reqClaim == nil {
return fmt.Errorf("NodeUnprepareResources returned result for unknown claim UID %s", claimUID)
}
if result.Error != "" {
return fmt.Errorf("NodeUnprepareResources failed for claim %s/%s: %s", reqClaim.Namespace, reqClaim.Name, err)
}
// Checkpoint to reduce redundant calls to NodeUnPrepareResource() after a kubelet restart.
// Delete last pod UID only if unprepare succeeds.
// This ensures that the status manager doesn't enter termination status
// for the pod. This logic is implemented in
// m.PodMightNeedToUnprepareResources and claimInfo.hasPodReference.
claimInfo := claimInfos[types.UID(claimUID)]
claimInfo.deletePodReference(pod.UID)
m.cache.delete(claimInfo.ClaimName, pod.Namespace)
}
// Checkpoint to reduce redundant calls to NodeUnprepareResources after a kubelet restart.
err = m.cache.syncToCheckpoint()
if err != nil {
return fmt.Errorf("failed to checkpoint claimInfo state, err: %+v", err)
}
unfinished := len(claims) - len(response.Claims)
if unfinished != 0 {
return fmt.Errorf("NodeUnprepareResources left out %d claims", unfinished)
}
}
// Checkpoint to capture all of the previous deletePodReference() calls.
err := m.cache.syncToCheckpoint()
if err != nil {

View File

@@ -25,68 +25,53 @@ import (
"time"
"google.golang.org/grpc"
grpccodes "google.golang.org/grpc/codes"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/apimachinery/pkg/types"
grpcstatus "google.golang.org/grpc/status"
"k8s.io/klog/v2"
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha2"
drapbv1alpha2 "k8s.io/kubelet/pkg/apis/dra/v1alpha2"
drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha3"
)
const PluginClientTimeout = 10 * time.Second
type Client interface {
NodePrepareResource(
ctx context.Context,
namespace string,
claimUID types.UID,
claimName string,
resourceHandle string,
) (*drapbv1.NodePrepareResourceResponse, error)
NodeUnprepareResource(
ctx context.Context,
namespace string,
claimUID types.UID,
claimName string,
resourceHandle string,
) (*drapbv1.NodeUnprepareResourceResponse, error)
}
// Strongly typed address.
type draAddr string
// draPluginClient encapsulates all dra plugin methods.
type draPluginClient struct {
pluginName string
addr draAddr
nodeV1ClientCreator nodeV1ClientCreator
pluginName string
addr draAddr
nodeClientCreator nodeClientCreator
}
var _ Client = &draPluginClient{}
var _ drapb.NodeClient = &draPluginClient{}
type nodeV1ClientCreator func(addr draAddr) (
nodeClient drapbv1.NodeClient,
type nodeClientCreator func(addr draAddr) (
nodeClient drapb.NodeClient,
nodeClientOld drapbv1alpha2.NodeClient,
closer io.Closer,
err error,
)
// newV1NodeClient creates a new NodeClient with the internally used gRPC
// newNodeClient creates a new NodeClient with the internally used gRPC
// connection set up. It also returns a closer which must be called to close
// the gRPC connection when the NodeClient is not used anymore.
// This is the default implementation for the nodeV1ClientCreator, used in
// This is the default implementation for the nodeClientCreator, used in
// newDRAPluginClient.
func newV1NodeClient(addr draAddr) (nodeClient drapbv1.NodeClient, closer io.Closer, err error) {
func newNodeClient(addr draAddr) (nodeClient drapb.NodeClient, nodeClientOld drapbv1alpha2.NodeClient, closer io.Closer, err error) {
var conn *grpc.ClientConn
conn, err = newGrpcConn(addr)
if err != nil {
return nil, nil, err
return nil, nil, nil, err
}
return drapbv1.NewNodeClient(conn), conn, nil
return drapb.NewNodeClient(conn), drapbv1alpha2.NewNodeClient(conn), conn, nil
}
func NewDRAPluginClient(pluginName string) (Client, error) {
func NewDRAPluginClient(pluginName string) (drapb.NodeClient, error) {
if pluginName == "" {
return nil, fmt.Errorf("plugin name is empty")
}
@@ -97,84 +82,114 @@ func NewDRAPluginClient(pluginName string) (Client, error) {
}
return &draPluginClient{
pluginName: pluginName,
addr: draAddr(existingPlugin.endpoint),
nodeV1ClientCreator: newV1NodeClient,
pluginName: pluginName,
addr: draAddr(existingPlugin.endpoint),
nodeClientCreator: newNodeClient,
}, nil
}
func (r *draPluginClient) NodePrepareResource(
func (r *draPluginClient) NodePrepareResources(
ctx context.Context,
namespace string,
claimUID types.UID,
claimName string,
resourceHandle string,
) (*drapbv1.NodePrepareResourceResponse, error) {
klog.V(4).InfoS(
log("calling NodePrepareResource rpc"),
"namespace", namespace,
"claimUID", claimUID,
"claimName", claimName,
"resourceHandle", resourceHandle)
req *drapb.NodePrepareResourcesRequest,
opts ...grpc.CallOption,
) (resp *drapb.NodePrepareResourcesResponse, err error) {
logger := klog.FromContext(ctx)
logger.V(4).Info(log("calling NodePrepareResources rpc"), "request", req)
defer logger.V(4).Info(log("done calling NodePrepareResources rpc"), "response", resp, "err", err)
if r.nodeV1ClientCreator == nil {
return nil, errors.New("failed to call NodePrepareResource. nodeV1ClientCreator is nil")
if r.nodeClientCreator == nil {
return nil, errors.New("failed to call NodePrepareResources. nodeClientCreator is nil")
}
nodeClient, closer, err := r.nodeV1ClientCreator(r.addr)
nodeClient, nodeClientOld, closer, err := r.nodeClientCreator(r.addr)
if err != nil {
return nil, err
}
defer closer.Close()
req := &drapbv1.NodePrepareResourceRequest{
Namespace: namespace,
ClaimUid: string(claimUID),
ClaimName: claimName,
ResourceHandle: resourceHandle,
}
ctx, cancel := context.WithTimeout(ctx, PluginClientTimeout)
defer cancel()
return nodeClient.NodePrepareResource(ctx, req)
resp, err = nodeClient.NodePrepareResources(ctx, req)
if err != nil {
status, _ := grpcstatus.FromError(err)
if status.Code() == grpccodes.Unimplemented {
// Fall back to the older gRPC API.
resp = &drapb.NodePrepareResourcesResponse{
Claims: make(map[string]*drapb.NodePrepareResourceResponse),
}
err = nil
for _, claim := range req.Claims {
respOld, errOld := nodeClientOld.NodePrepareResource(ctx,
&drapbv1alpha2.NodePrepareResourceRequest{
Namespace: claim.Namespace,
ClaimUid: claim.Uid,
ClaimName: claim.Name,
ResourceHandle: claim.ResourceHandle,
})
result := &drapb.NodePrepareResourceResponse{}
if errOld != nil {
result.Error = errOld.Error()
} else {
result.CDIDevices = respOld.CdiDevices
}
resp.Claims[claim.Uid] = result
}
}
}
return
}
func (r *draPluginClient) NodeUnprepareResource(
func (r *draPluginClient) NodeUnprepareResources(
ctx context.Context,
namespace string,
claimUID types.UID,
claimName string,
resourceHandle string,
) (*drapbv1.NodeUnprepareResourceResponse, error) {
klog.V(4).InfoS(
log("calling NodeUnprepareResource rpc"),
"namespace", namespace,
"claimUID", claimUID,
"claimname", claimName,
"resourceHandle", resourceHandle)
req *drapb.NodeUnprepareResourcesRequest,
opts ...grpc.CallOption,
) (resp *drapb.NodeUnprepareResourcesResponse, err error) {
logger := klog.FromContext(ctx)
logger.V(4).Info(log("calling NodeUnprepareResource rpc"), "request", req)
defer logger.V(4).Info(log("done calling NodeUnprepareResources rpc"), "response", resp, "err", err)
if r.nodeV1ClientCreator == nil {
return nil, errors.New("nodeV1ClientCreate is nil")
if r.nodeClientCreator == nil {
return nil, errors.New("failed to call NodeUnprepareResources. nodeClientCreator is nil")
}
nodeClient, closer, err := r.nodeV1ClientCreator(r.addr)
nodeClient, nodeClientOld, closer, err := r.nodeClientCreator(r.addr)
if err != nil {
return nil, err
}
defer closer.Close()
req := &drapbv1.NodeUnprepareResourceRequest{
Namespace: namespace,
ClaimUid: string(claimUID),
ClaimName: claimName,
ResourceHandle: resourceHandle,
}
ctx, cancel := context.WithTimeout(ctx, PluginClientTimeout)
defer cancel()
return nodeClient.NodeUnprepareResource(ctx, req)
resp, err = nodeClient.NodeUnprepareResources(ctx, req)
if err != nil {
status, _ := grpcstatus.FromError(err)
if status.Code() == grpccodes.Unimplemented {
// Fall back to the older gRPC API.
resp = &drapb.NodeUnprepareResourcesResponse{
Claims: make(map[string]*drapb.NodeUnprepareResourceResponse),
}
err = nil
for _, claim := range req.Claims {
_, errOld := nodeClientOld.NodeUnprepareResource(ctx,
&drapbv1alpha2.NodeUnprepareResourceRequest{
Namespace: claim.Namespace,
ClaimUid: claim.Uid,
ClaimName: claim.Name,
ResourceHandle: claim.ResourceHandle,
})
result := &drapb.NodeUnprepareResourceResponse{}
if errOld != nil {
result.Error = errOld.Error()
}
resp.Claims[claim.Uid] = result
}
}
}
return
}
func newGrpcConn(addr draAddr) (*grpc.ClientConn, error) {