mirror of
https://github.com/optim-enterprises-bv/kubernetes.git
synced 2025-11-02 03:08:15 +00:00
svm: set UID and RV on SSA patch to cause conflict on logical create
When a resource gets deleted during migration, the SVM SSA patch calls are interpreted as a logical create request. Since the object from storage is nil, the merged result is just a type meta object, which lacks a name in the body. This fails when the API server checks that the name from the request URL and the body are the same. Note that a create request is something that SVM controller should never do. Once the UID is set on the patch, the API server will fail the request at a slightly earlier point with an "uid mismatch" conflict error, which the SVM controller can handle gracefully. Setting UID by itself is not sufficient. When a resource gets deleted and recreated, if RV is not set but UID is set, we would get an immutable field validation error for attempting to update the UID. To address this, we set the resource version on the SSA patch as well. This will cause that update request to also fail with a conflict error. Added the create verb on all resources for SVM controller RBAC as otherwise the API server will reject the request before it fails with a conflict error. The change addresses a host of other issues with the SVM controller: 1. Include failure message in SVM resource 2. Do not block forever on unsynced GC monitor 3. Do not immediately fail on GC monitor being missing, allow for a grace period since discovery may be out of sync 4. Set higher QPS and burst to handle large migrations Test changes: 1. Clean up CRD webhook convertor logs 2. Allow SVM tests to be run multiple times to make finding flakes easier 3. Create and delete CRs during CRD test to force out any flakes 4. Add a stress test with multiple parallel migrations 5. Enable RBAC on KAS 6. Run KCM directly to exercise wiring and RBAC 7. Better logs during CRD migration 8. Scan audit logs to confirm SVM controller never creates Signed-off-by: Monis Khan <mok@microsoft.com>
This commit is contained in:
@@ -199,7 +199,7 @@ func (rv *ResourceVersionController) sync(ctx context.Context, key string) error
|
||||
StorageVersionMigrations().
|
||||
UpdateStatus(
|
||||
ctx,
|
||||
setStatusConditions(toBeProcessedSVM, svmv1alpha1.MigrationFailed, migrationFailedStatusReason),
|
||||
setStatusConditions(toBeProcessedSVM, svmv1alpha1.MigrationFailed, migrationFailedStatusReason, "resource does not exist in discovery"),
|
||||
metav1.UpdateOptions{},
|
||||
)
|
||||
if err != nil {
|
||||
|
||||
@@ -204,27 +204,35 @@ func (svmc *SVMController) sync(ctx context.Context, key string) error {
|
||||
}
|
||||
gvr := getGVRFromResource(toBeProcessedSVM)
|
||||
|
||||
resourceMonitor, err := svmc.dependencyGraphBuilder.GetMonitor(ctx, gvr)
|
||||
// prevent unsynced monitor from blocking forever
|
||||
// use a short timeout so that we can fail quickly and possibly handle other migrations while this monitor gets ready.
|
||||
monCtx, monCtxCancel := context.WithTimeout(ctx, 10*time.Second)
|
||||
defer monCtxCancel()
|
||||
resourceMonitor, errMonitor := svmc.dependencyGraphBuilder.GetMonitor(monCtx, gvr)
|
||||
if resourceMonitor != nil {
|
||||
if err != nil {
|
||||
if errMonitor != nil {
|
||||
// non nil monitor indicates that error is due to resource not being synced
|
||||
return fmt.Errorf("dependency graph is not synced, requeuing to attempt again")
|
||||
return fmt.Errorf("dependency graph is not synced, requeuing to attempt again: %w", errMonitor)
|
||||
}
|
||||
} else {
|
||||
logger.V(4).Error(errMonitor, "resource does not exist in GC", "gvr", gvr.String())
|
||||
|
||||
// our GC cache could be missing a recently created custom resource, so give it some time to catch up
|
||||
// we resync discovery every 30 seconds so twice that should be sufficient
|
||||
if toBeProcessedSVM.CreationTimestamp.Add(time.Minute).After(time.Now()) {
|
||||
return fmt.Errorf("resource does not exist in GC, requeuing to attempt again: %w", errMonitor)
|
||||
}
|
||||
|
||||
// we can't migrate a resource that doesn't exist in the GC
|
||||
_, err = svmc.kubeClient.StoragemigrationV1alpha1().
|
||||
_, errStatus := svmc.kubeClient.StoragemigrationV1alpha1().
|
||||
StorageVersionMigrations().
|
||||
UpdateStatus(
|
||||
ctx,
|
||||
setStatusConditions(toBeProcessedSVM, svmv1alpha1.MigrationFailed, migrationFailedStatusReason),
|
||||
setStatusConditions(toBeProcessedSVM, svmv1alpha1.MigrationFailed, migrationFailedStatusReason, "resource not found"),
|
||||
metav1.UpdateOptions{},
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
logger.V(4).Error(fmt.Errorf("error migrating the resource"), "resource does not exist in GC", "gvr", gvr.String())
|
||||
|
||||
return nil
|
||||
return errStatus
|
||||
}
|
||||
|
||||
gcListResourceVersion, err := convertResourceVersionToInt(resourceMonitor.Controller.LastSyncResourceVersion())
|
||||
@@ -244,7 +252,7 @@ func (svmc *SVMController) sync(ctx context.Context, key string) error {
|
||||
StorageVersionMigrations().
|
||||
UpdateStatus(
|
||||
ctx,
|
||||
setStatusConditions(toBeProcessedSVM, svmv1alpha1.MigrationRunning, migrationRunningStatusReason),
|
||||
setStatusConditions(toBeProcessedSVM, svmv1alpha1.MigrationRunning, migrationRunningStatusReason, ""),
|
||||
metav1.UpdateOptions{},
|
||||
)
|
||||
if err != nil {
|
||||
@@ -255,60 +263,72 @@ func (svmc *SVMController) sync(ctx context.Context, key string) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
typeMeta := metav1.TypeMeta{}
|
||||
typeMeta.APIVersion, typeMeta.Kind = gvk.ToAPIVersionAndKind()
|
||||
data, err := json.Marshal(typeMeta)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// ToDo: implement a mechanism to resume migration from the last migrated resource in case of a failure
|
||||
// process storage migration
|
||||
for _, gvrKey := range resourceMonitor.Store.ListKeys() {
|
||||
namespace, name, err := cache.SplitMetaNamespaceKey(gvrKey)
|
||||
for _, obj := range resourceMonitor.Store.List() {
|
||||
accessor, err := meta.Accessor(obj)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = svmc.dynamicClient.Resource(gvr).
|
||||
Namespace(namespace).
|
||||
typeMeta := typeMetaUIDRV{}
|
||||
typeMeta.APIVersion, typeMeta.Kind = gvk.ToAPIVersionAndKind()
|
||||
// set UID so that when a resource gets deleted, we get an "uid mismatch"
|
||||
// conflict error instead of trying to create it.
|
||||
typeMeta.UID = accessor.GetUID()
|
||||
// set RV so that when a resources gets updated or deleted+recreated, we get an "object has been modified"
|
||||
// conflict error. we do not actually need to do anything special for the updated case because if RV
|
||||
// was not set, it would just result in no-op request. but for the deleted+recreated case, if RV is
|
||||
// not set but UID is set, we would get an immutable field validation error. hence we must set both.
|
||||
typeMeta.ResourceVersion = accessor.GetResourceVersion()
|
||||
data, err := json.Marshal(typeMeta)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, errPatch := svmc.dynamicClient.Resource(gvr).
|
||||
Namespace(accessor.GetNamespace()).
|
||||
Patch(ctx,
|
||||
name,
|
||||
accessor.GetName(),
|
||||
types.ApplyPatchType,
|
||||
data,
|
||||
metav1.PatchOptions{
|
||||
FieldManager: svmc.controllerName,
|
||||
},
|
||||
)
|
||||
if err != nil {
|
||||
// in case of NotFound or Conflict, we can stop processing migration for that resource
|
||||
if apierrors.IsNotFound(err) || apierrors.IsConflict(err) {
|
||||
continue
|
||||
}
|
||||
|
||||
_, err = svmc.kubeClient.StoragemigrationV1alpha1().
|
||||
// in case of conflict, we can stop processing migration for that resource because it has either been
|
||||
// - updated, meaning that migration has already been performed
|
||||
// - deleted, meaning that migration is not needed
|
||||
// - deleted and recreated, meaning that migration has already been performed
|
||||
if apierrors.IsConflict(errPatch) {
|
||||
logger.V(6).Info("Resource ignored due to conflict", "namespace", accessor.GetNamespace(), "name", accessor.GetName(), "gvr", gvr.String(), "err", errPatch)
|
||||
continue
|
||||
}
|
||||
|
||||
if errPatch != nil {
|
||||
logger.V(4).Error(errPatch, "Failed to migrate the resource", "namespace", accessor.GetNamespace(), "name", accessor.GetName(), "gvr", gvr.String(), "reason", apierrors.ReasonForError(errPatch))
|
||||
|
||||
_, errStatus := svmc.kubeClient.StoragemigrationV1alpha1().
|
||||
StorageVersionMigrations().
|
||||
UpdateStatus(
|
||||
ctx,
|
||||
setStatusConditions(toBeProcessedSVM, svmv1alpha1.MigrationFailed, migrationFailedStatusReason),
|
||||
setStatusConditions(toBeProcessedSVM, svmv1alpha1.MigrationFailed, migrationFailedStatusReason, "migration encountered unhandled error"),
|
||||
metav1.UpdateOptions{},
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
logger.V(4).Error(err, "Failed to migrate the resource", "name", gvrKey, "gvr", gvr.String(), "reason", apierrors.ReasonForError(err))
|
||||
|
||||
return nil
|
||||
return errStatus
|
||||
// Todo: add retry for scenarios where API server returns rate limiting error
|
||||
}
|
||||
logger.V(4).Info("Successfully migrated the resource", "name", gvrKey, "gvr", gvr.String())
|
||||
logger.V(4).Info("Successfully migrated the resource", "namespace", accessor.GetNamespace(), "name", accessor.GetName(), "gvr", gvr.String())
|
||||
}
|
||||
|
||||
_, err = svmc.kubeClient.StoragemigrationV1alpha1().
|
||||
StorageVersionMigrations().
|
||||
UpdateStatus(
|
||||
ctx,
|
||||
setStatusConditions(toBeProcessedSVM, svmv1alpha1.MigrationSucceeded, migrationSuccessStatusReason),
|
||||
setStatusConditions(toBeProcessedSVM, svmv1alpha1.MigrationSucceeded, migrationSuccessStatusReason, ""),
|
||||
metav1.UpdateOptions{},
|
||||
)
|
||||
if err != nil {
|
||||
@@ -318,3 +338,13 @@ func (svmc *SVMController) sync(ctx context.Context, key string) error {
|
||||
logger.V(4).Info("Finished syncing svm resource", "key", key, "gvr", gvr.String(), "elapsed", time.Since(startTime))
|
||||
return nil
|
||||
}
|
||||
|
||||
type typeMetaUIDRV struct {
|
||||
metav1.TypeMeta `json:",inline"`
|
||||
objectMetaUIDandRV `json:"metadata,omitempty"`
|
||||
}
|
||||
|
||||
type objectMetaUIDandRV struct {
|
||||
UID types.UID `json:"uid,omitempty"`
|
||||
ResourceVersion string `json:"resourceVersion,omitempty"`
|
||||
}
|
||||
|
||||
@@ -62,7 +62,7 @@ func indexOfCondition(svm *svmv1alpha1.StorageVersionMigration, conditionType sv
|
||||
func setStatusConditions(
|
||||
toBeUpdatedSVM *svmv1alpha1.StorageVersionMigration,
|
||||
conditionType svmv1alpha1.MigrationConditionType,
|
||||
reason string,
|
||||
reason, message string,
|
||||
) *svmv1alpha1.StorageVersionMigration {
|
||||
if !IsConditionTrue(toBeUpdatedSVM, conditionType) {
|
||||
if conditionType == svmv1alpha1.MigrationSucceeded || conditionType == svmv1alpha1.MigrationFailed {
|
||||
@@ -77,6 +77,7 @@ func setStatusConditions(
|
||||
Status: corev1.ConditionTrue,
|
||||
LastUpdateTime: metav1.Now(),
|
||||
Reason: reason,
|
||||
Message: message,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user