From 4ebe560c74f94d1e125cfbca2d36591f52c112e1 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Mon, 18 Aug 2025 13:44:26 +0200 Subject: [PATCH] DRA allocator: fix data race around `claimsToAllocate` The `claimsToAllocate` field stores the parameter of the `Allocate` call and therefore has to be in the per-Allocate `allocate` struct. Without support for extended resources, all calls get the same slice, which explains why this bug did not fail more severely and only showed up in a data race warning during integration testing. With support for extended resources, the result is potentially broken because each call gets different slices. --- .../experimental/allocator_experimental.go | 14 +++++++------- .../internal/incubating/allocator_incubating.go | 4 ++-- .../structured/internal/stable/allocator_stable.go | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/experimental/allocator_experimental.go b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/experimental/allocator_experimental.go index 268850ec85a..84218307c6b 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/experimental/allocator_experimental.go +++ b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/experimental/allocator_experimental.go @@ -86,12 +86,11 @@ var SupportedFeatures = internal.Features{ } type Allocator struct { - features Features - claimsToAllocate []*resourceapi.ResourceClaim - allocatedState AllocatedState - classLister DeviceClassLister - slices []*resourceapi.ResourceSlice - celCache *cel.Cache + features Features + allocatedState AllocatedState + classLister DeviceClassLister + slices []*resourceapi.ResourceSlice + celCache *cel.Cache // availableCounters contains the available counters for individual // ResourceSlices. It acts as a cache that is updated the first time // the available counters are needed for each ResourceSlice. The information @@ -139,6 +138,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou ctx: ctx, // all methods share the same a and thus ctx logger: klog.FromContext(ctx), node: node, + claimsToAllocate: claims, deviceMatchesRequest: make(map[matchKey]bool), constraints: make([][]constraint, len(claims)), consumedCounters: make(map[string]counterSets), @@ -146,7 +146,6 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou result: make([]internalAllocationResult, len(claims)), allocatingCapacity: NewConsumedCapacityCollection(), } - alloc.claimsToAllocate = claims alloc.logger.V(5).Info("Starting allocation", "numClaims", len(alloc.claimsToAllocate)) defer alloc.logger.V(5).Info("Done with allocation", "success", len(finalResult) == len(alloc.claimsToAllocate), "err", finalErr) @@ -577,6 +576,7 @@ type allocator struct { ctx context.Context logger klog.Logger node *v1.Node + claimsToAllocate []*resourceapi.ResourceClaim pools []*Pool deviceMatchesRequest map[matchKey]bool constraints [][]constraint // one list of constraints per claim diff --git a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/incubating/allocator_incubating.go b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/incubating/allocator_incubating.go index ad1e4d74ad1..d65235e35b0 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/incubating/allocator_incubating.go +++ b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/incubating/allocator_incubating.go @@ -58,7 +58,6 @@ var SupportedFeatures = internal.Features{ type Allocator struct { features Features - claimsToAllocate []*resourceapi.ResourceClaim allocatedDevices sets.Set[DeviceID] classLister DeviceClassLister slices []*resourceapi.ResourceSlice @@ -110,13 +109,13 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou ctx: ctx, // all methods share the same a and thus ctx logger: klog.FromContext(ctx), node: node, + claimsToAllocate: claims, deviceMatchesRequest: make(map[matchKey]bool), constraints: make([][]constraint, len(claims)), consumedCounters: make(map[string]counterSets), requestData: make(map[requestIndices]requestData), result: make([]internalAllocationResult, len(claims)), } - alloc.claimsToAllocate = claims alloc.logger.V(5).Info("Starting allocation", "numClaims", len(alloc.claimsToAllocate)) defer alloc.logger.V(5).Info("Done with allocation", "success", len(finalResult) == len(alloc.claimsToAllocate), "err", finalErr) @@ -471,6 +470,7 @@ type allocator struct { ctx context.Context logger klog.Logger node *v1.Node + claimsToAllocate []*resourceapi.ResourceClaim pools []*Pool deviceMatchesRequest map[matchKey]bool constraints [][]constraint // one list of constraints per claim diff --git a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/stable/allocator_stable.go b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/stable/allocator_stable.go index a24d4b25b8e..d5b118e931e 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/stable/allocator_stable.go +++ b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/stable/allocator_stable.go @@ -56,7 +56,6 @@ var SupportedFeatures = internal.Features{} type Allocator struct { features Features - claimsToAllocate []*resourceapi.ResourceClaim allocatedDevices sets.Set[DeviceID] classLister DeviceClassLister slices []*resourceapi.ResourceSlice @@ -101,13 +100,13 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou ctx: ctx, // all methods share the same a and thus ctx logger: klog.FromContext(ctx), node: node, + claimsToAllocate: claims, deviceMatchesRequest: make(map[matchKey]bool), constraints: make([][]constraint, len(claims)), consumedCounters: make(map[string]counterSets), requestData: make(map[requestIndices]requestData), result: make([]internalAllocationResult, len(claims)), } - alloc.claimsToAllocate = claims alloc.logger.V(5).Info("Starting allocation", "numClaims", len(alloc.claimsToAllocate)) defer alloc.logger.V(5).Info("Done with allocation", "success", len(finalResult) == len(alloc.claimsToAllocate), "err", finalErr) @@ -453,6 +452,7 @@ type allocator struct { ctx context.Context logger klog.Logger node *v1.Node + claimsToAllocate []*resourceapi.ResourceClaim pools []*Pool deviceMatchesRequest map[matchKey]bool constraints [][]constraint // one list of constraints per claim