Merge pull request #132632 from sdowell/gc-rv-race

fix: add RV check on GC delete calls
This commit is contained in:
Kubernetes Prow Robot
2025-07-24 17:58:47 -07:00
committed by GitHub
5 changed files with 172 additions and 12 deletions

View File

@@ -620,7 +620,7 @@ func (gc *GarbageCollector) attemptToDeleteItem(ctx context.Context, item *node)
// FinalizerDeletingDependents from the item, resulting in the final
// deletion of the item.
policy := metav1.DeletePropagationForeground
return gc.deleteObject(item.identity, &policy)
return gc.deleteObject(item.identity, latest.ResourceVersion, latest.OwnerReferences, &policy)
default:
// item doesn't have any solid owner, so it needs to be garbage
// collected. Also, none of item's owners is waiting for the deletion of
@@ -641,7 +641,7 @@ func (gc *GarbageCollector) attemptToDeleteItem(ctx context.Context, item *node)
"item", item.identity,
"propagationPolicy", policy,
)
return gc.deleteObject(item.identity, &policy)
return gc.deleteObject(item.identity, latest.ResourceVersion, latest.OwnerReferences, &policy)
}
}

View File

@@ -41,6 +41,7 @@ import (
"k8s.io/utils/ptr"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/meta/testrestmapper"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -2313,6 +2314,123 @@ func TestConflictingData(t *testing.T) {
}),
},
},
{
// https://github.com/kubernetes/kubernetes/issues/114603
name: "resourceVersion conflict between Get/Delete while processing object deletion",
steps: []step{
// setup, 0,1
createObjectInClient("", "v1", "pods", "ns1", makeMetadataObj(pod1ns1)), // good parent
createObjectInClient("", "v1", "pods", "ns1", withRV(makeMetadataObj(pod2ns1, pod1ns1), "42")), // good child
// events, 2,3
processEvent(makeAddEvent(pod1ns1)),
processEvent(withRV(makeAddEvent(pod2ns1, pod1ns1), "42")),
// delete parent, 4,5,6
deleteObjectFromClient("", "v1", "pods", "ns1", pod1ns1.Name),
processEvent(makeDeleteEvent(pod1ns1)),
assertState(state{
absentOwnerCache: []objectReference{pod1ns1},
graphNodes: []*node{
makeNode(pod2ns1, withOwners(pod1ns1))},
pendingAttemptToDelete: []*node{
makeNode(pod2ns1, withOwners(pod1ns1)),
},
}),
// add reactor to enforce RV precondition, 7
prependReactor("delete", "pods", (func() func(ctx stepContext) clientgotesting.ReactionFunc {
call := 0
return func(ctx stepContext) clientgotesting.ReactionFunc {
return func(action clientgotesting.Action) (handled bool, ret runtime.Object, err error) {
call++
deleteAction, ok := action.(clientgotesting.DeleteAction)
if !ok {
ctx.t.Error("missing DeleteAction")
return false, nil, nil
}
preconditionRV := ""
if preconditions := deleteAction.GetDeleteOptions().Preconditions; preconditions != nil {
if rv := preconditions.ResourceVersion; rv != nil {
preconditionRV = *rv
}
}
objectRV := ""
var metadataObj *metav1.PartialObjectMetadata
if obj, err := ctx.metadataClient.Tracker().Get(deleteAction.GetResource(), deleteAction.GetNamespace(), deleteAction.GetName(), metav1.GetOptions{}); err == nil {
metadataObj = obj.(*metav1.PartialObjectMetadata)
objectRV = metadataObj.ResourceVersion
}
switch call {
case 1:
if preconditionRV == "42" && objectRV == "42" {
// simulate a concurrent change that modifies the owner references
ctx.t.Log("changing rv 42 --> 43 concurrently")
metadataObj.OwnerReferences = []metav1.OwnerReference{role1v1.OwnerReference}
metadataObj.ResourceVersion = "43"
if err := ctx.metadataClient.Tracker().Update(deleteAction.GetResource(), metadataObj, deleteAction.GetNamespace()); err != nil {
ctx.t.Errorf("unexpected error updating tracker: %v", err)
}
return true, nil, errors.NewConflict(deleteAction.GetResource().GroupResource(), deleteAction.GetName(), fmt.Errorf("expected 42, got 43"))
} else {
ctx.t.Errorf("expected delete with rv=42 precondition on call 1, got %q, %q", preconditionRV, objectRV)
}
case 2:
if preconditionRV == "43" && objectRV == "43" {
// simulate a concurrent change that *does not* modify the owner references
ctx.t.Log("changing rv 43 --> 44 concurrently")
metadataObj.ResourceVersion = "44"
if err := ctx.metadataClient.Tracker().Update(deleteAction.GetResource(), metadataObj, deleteAction.GetNamespace()); err != nil {
ctx.t.Errorf("unexpected error updating tracker: %v", err)
}
return true, nil, errors.NewConflict(deleteAction.GetResource().GroupResource(), deleteAction.GetName(), fmt.Errorf("expected 43, got 44"))
} else {
ctx.t.Errorf("expected delete with rv=43 precondition on call 2, got %q, %q", preconditionRV, objectRV)
}
case 3:
if preconditionRV != "" {
ctx.t.Errorf("expected delete with no rv precondition on call 3, got %q", preconditionRV)
}
default:
ctx.t.Errorf("expected delete call %d", call)
}
return false, nil, nil
}
}
})()),
// 8,9
processAttemptToDelete(1),
assertState(state{
clientActions: []string{
"get /v1, Resource=pods ns=ns1 name=podname2", // first get sees rv=42, pod1ns1 owner
"delete /v1, Resource=pods ns=ns1 name=podname2", // first delete with rv=42 precondition gets confict, triggers a live get
"get /v1, Resource=pods ns=ns1 name=podname2", // get has new ownerReferences, exits attemptToDelete
},
absentOwnerCache: []objectReference{pod1ns1},
graphNodes: []*node{
makeNode(pod2ns1, withOwners(pod1ns1))},
pendingAttemptToDelete: []*node{
makeNode(pod2ns1, withOwners(pod1ns1))},
}),
// reattempt delete, 10,11
processAttemptToDelete(1),
assertState(state{
clientActions: []string{
"get /v1, Resource=pods ns=ns1 name=podname2", // first get sees rv=43, role1v1 owner
"get rbac.authorization.k8s.io/v1, Resource=roles ns=ns1 name=role1", // verify missing owner
"delete /v1, Resource=pods ns=ns1 name=podname2", // first delete RV precondition triggers a live Get
"get /v1, Resource=pods ns=ns1 name=podname2", // the object has same ownerReferences, causing unconditional Delete
"delete /v1, Resource=pods ns=ns1 name=podname2", // unconditional Delete
},
absentOwnerCache: []objectReference{pod1ns1, role1v1},
graphNodes: []*node{
makeNode(pod2ns1, withOwners(pod1ns1))},
}),
},
},
}
alwaysStarted := make(chan struct{})
@@ -2369,6 +2487,7 @@ func TestConflictingData(t *testing.T) {
uidToNode: make(map[types.UID]*node),
},
attemptToDelete: attemptToDelete,
attemptToOrphan: attemptToOrphan,
absentOwnerCache: absentOwnerCache,
},
}
@@ -2473,6 +2592,16 @@ func makeObj(identity objectReference, owners ...objectReference) *metaonly.Meta
return obj
}
func withRV[T any](obj T, rv string) T {
switch t := any(obj).(type) {
case *metav1.PartialObjectMetadata:
t.ResourceVersion = rv
case *event:
withRV(t.obj, rv)
}
return obj
}
func makeMetadataObj(identity objectReference, owners ...objectReference) *metav1.PartialObjectMetadata {
obj := &metav1.PartialObjectMetadata{
TypeMeta: metav1.TypeMeta{APIVersion: identity.APIVersion, Kind: identity.Kind},
@@ -2523,6 +2652,18 @@ func processPendingGraphChanges(count int) step {
}
}
type reactionFuncFactory func(ctx stepContext) clientgotesting.ReactionFunc
func prependReactor(verb, resource string, reaction reactionFuncFactory) step {
return step{
name: "prependReactor",
check: func(ctx stepContext) {
ctx.t.Helper()
ctx.metadataClient.PrependReactor(verb, resource, reaction(ctx))
},
}
}
func processAttemptToDelete(count int) step {
return step{
name: "processAttemptToDelete",

View File

@@ -20,6 +20,7 @@ import (
"context"
"encoding/json"
"fmt"
"reflect"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
@@ -49,15 +50,33 @@ func (gc *GarbageCollector) apiResource(apiVersion, kind string) (schema.GroupVe
return mapping.Resource, mapping.Scope == meta.RESTScopeNamespace, nil
}
func (gc *GarbageCollector) deleteObject(item objectReference, policy *metav1.DeletionPropagation) error {
func (gc *GarbageCollector) deleteObject(item objectReference, resourceVersion string, ownersAtResourceVersion []metav1.OwnerReference, policy *metav1.DeletionPropagation) error {
resource, namespaced, err := gc.apiResource(item.APIVersion, item.Kind)
if err != nil {
return err
}
uid := item.UID
preconditions := metav1.Preconditions{UID: &uid}
if len(resourceVersion) > 0 {
preconditions.ResourceVersion = &resourceVersion
}
deleteOptions := metav1.DeleteOptions{Preconditions: &preconditions, PropagationPolicy: policy}
return gc.metadataClient.Resource(resource).Namespace(resourceDefaultNamespace(namespaced, item.Namespace)).Delete(context.TODO(), item.Name, deleteOptions)
resourceClient := gc.metadataClient.Resource(resource).Namespace(resourceDefaultNamespace(namespaced, item.Namespace))
err = resourceClient.Delete(context.TODO(), item.Name, deleteOptions)
if errors.IsConflict(err) && len(resourceVersion) > 0 {
// check if the ownerReferences changed
liveObject, liveErr := resourceClient.Get(context.TODO(), item.Name, metav1.GetOptions{})
if errors.IsNotFound(liveErr) {
// object we wanted to delete is gone, success!
return nil
}
if liveErr == nil && liveObject.UID == item.UID && liveObject.ResourceVersion != resourceVersion && reflect.DeepEqual(liveObject.OwnerReferences, ownersAtResourceVersion) {
// object changed, causing a conflict error, but ownerReferences did not change.
// retry delete without resourceVersion precondition so rapid updates unrelated to ownerReferences don't starve GC
return gc.deleteObject(item, "", nil, policy)
}
}
return err
}
func (gc *GarbageCollector) getObject(item objectReference) (*metav1.PartialObjectMetadata, error) {

View File

@@ -279,19 +279,19 @@ func (c *dynamicResourceClient) Delete(ctx context.Context, name string, opts me
switch {
case len(c.namespace) == 0 && len(subresources) == 0:
_, err = c.client.Fake.
Invokes(testing.NewRootDeleteAction(c.resource, name), &metav1.Status{Status: "dynamic delete fail"})
Invokes(testing.NewRootDeleteActionWithOptions(c.resource, name, opts), &metav1.Status{Status: "dynamic delete fail"})
case len(c.namespace) == 0 && len(subresources) > 0:
_, err = c.client.Fake.
Invokes(testing.NewRootDeleteSubresourceAction(c.resource, strings.Join(subresources, "/"), name), &metav1.Status{Status: "dynamic delete fail"})
Invokes(testing.NewRootDeleteSubresourceActionWithOptions(c.resource, strings.Join(subresources, "/"), name, opts), &metav1.Status{Status: "dynamic delete fail"})
case len(c.namespace) > 0 && len(subresources) == 0:
_, err = c.client.Fake.
Invokes(testing.NewDeleteAction(c.resource, c.namespace, name), &metav1.Status{Status: "dynamic delete fail"})
Invokes(testing.NewDeleteActionWithOptions(c.resource, c.namespace, name, opts), &metav1.Status{Status: "dynamic delete fail"})
case len(c.namespace) > 0 && len(subresources) > 0:
_, err = c.client.Fake.
Invokes(testing.NewDeleteSubresourceAction(c.resource, strings.Join(subresources, "/"), c.namespace, name), &metav1.Status{Status: "dynamic delete fail"})
Invokes(testing.NewDeleteSubresourceActionWithOptions(c.resource, strings.Join(subresources, "/"), c.namespace, name, opts), &metav1.Status{Status: "dynamic delete fail"})
}
return err

View File

@@ -235,19 +235,19 @@ func (c *metadataResourceClient) Delete(ctx context.Context, name string, opts m
switch {
case len(c.namespace) == 0 && len(subresources) == 0:
_, err = c.client.Fake.
Invokes(testing.NewRootDeleteAction(c.resource, name), &metav1.Status{Status: "metadata delete fail"})
Invokes(testing.NewRootDeleteActionWithOptions(c.resource, name, opts), &metav1.Status{Status: "metadata delete fail"})
case len(c.namespace) == 0 && len(subresources) > 0:
_, err = c.client.Fake.
Invokes(testing.NewRootDeleteSubresourceAction(c.resource, strings.Join(subresources, "/"), name), &metav1.Status{Status: "metadata delete fail"})
Invokes(testing.NewRootDeleteSubresourceActionWithOptions(c.resource, strings.Join(subresources, "/"), name, opts), &metav1.Status{Status: "metadata delete fail"})
case len(c.namespace) > 0 && len(subresources) == 0:
_, err = c.client.Fake.
Invokes(testing.NewDeleteAction(c.resource, c.namespace, name), &metav1.Status{Status: "metadata delete fail"})
Invokes(testing.NewDeleteActionWithOptions(c.resource, c.namespace, name, opts), &metav1.Status{Status: "metadata delete fail"})
case len(c.namespace) > 0 && len(subresources) > 0:
_, err = c.client.Fake.
Invokes(testing.NewDeleteSubresourceAction(c.resource, strings.Join(subresources, "/"), c.namespace, name), &metav1.Status{Status: "metadata delete fail"})
Invokes(testing.NewDeleteSubresourceActionWithOptions(c.resource, strings.Join(subresources, "/"), c.namespace, name, opts), &metav1.Status{Status: "metadata delete fail"})
}
return err