diff --git a/pkg/cloudprovider/providers/azure/azure.go b/pkg/cloudprovider/providers/azure/azure.go index 5ba1fba24ef..eb8a92c16ef 100644 --- a/pkg/cloudprovider/providers/azure/azure.go +++ b/pkg/cloudprovider/providers/azure/azure.go @@ -33,10 +33,20 @@ import ( "github.com/Azure/go-autorest/autorest" "github.com/Azure/go-autorest/autorest/azure" "github.com/ghodss/yaml" + "github.com/golang/glog" + "k8s.io/apimachinery/pkg/util/wait" ) -// CloudProviderName is the value used for the --cloud-provider flag -const CloudProviderName = "azure" +const ( + // CloudProviderName is the value used for the --cloud-provider flag + CloudProviderName = "azure" + rateLimitQPSDefault = 1 + rateLimitBucketDefault = 5 + backoffRetriesDefault = 6 + backoffExponentDefault = 1.5 + backoffDurationDefault = 5 // in seconds + backoffJitterDefault = 1.0 +) // Config holds the configuration parsed from the --cloud-config flag // All fields are required unless otherwise specified @@ -70,6 +80,22 @@ type Config struct { AADClientID string `json:"aadClientId" yaml:"aadClientId"` // The ClientSecret for an AAD application with RBAC access to talk to Azure RM APIs AADClientSecret string `json:"aadClientSecret" yaml:"aadClientSecret"` + // Enable exponential backoff to manage resource request retries + CloudProviderBackoff bool `json:"cloudProviderBackoff" yaml:"cloudProviderBackoff"` + // Backoff retry limit + CloudProviderBackoffRetries int `json:"cloudProviderBackoffRetries" yaml:"cloudProviderBackoffRetries"` + // Backoff exponent + CloudProviderBackoffExponent float64 `json:"cloudProviderBackoffExponent" yaml:"cloudProviderBackoffExponent"` + // Backoff duration + CloudProviderBackoffDuration int `json:"cloudProviderBackoffDuration" yaml:"cloudProviderBackoffDuration"` + // Backoff jitter + CloudProviderBackoffJitter float64 `json:"cloudProviderBackoffJitter" yaml:"cloudProviderBackoffJitter"` + // Enable rate limiting + CloudProviderRateLimit bool `json:"cloudProviderRateLimit" yaml:"cloudProviderRateLimit"` + // Rate limit QPS + CloudProviderRateLimitQPS int `json:"cloudProviderRateLimitQPS" yaml:"cloudProviderRateLimitQPS"` + // Rate limit Bucket Size + CloudProviderRateLimitBucket int `json:"cloudProviderRateLimitBucket" yaml:"cloudProviderRateLimitBucket"` } // Cloud holds the config and clients @@ -86,6 +112,7 @@ type Cloud struct { VirtualMachinesClient compute.VirtualMachinesClient StorageAccountClient storage.AccountsClient operationPollRateLimiter flowcontrol.RateLimiter + resourceRequestBackoff wait.Backoff } func init() { @@ -179,8 +206,53 @@ func NewCloud(configReader io.Reader) (cloudprovider.Interface, error) { az.StorageAccountClient = storage.NewAccountsClientWithBaseURI(az.Environment.ResourceManagerEndpoint, az.SubscriptionID) az.StorageAccountClient.Authorizer = servicePrincipalToken - // 1 qps, up to 5 burst when in flowcontrol; i.e., aggressive backoff enforcement - az.operationPollRateLimiter = flowcontrol.NewTokenBucketRateLimiter(1, 5) + // Conditionally configure rate limits + if az.CloudProviderRateLimit { + // Assign rate limit defaults if no configuration was passed in + if az.CloudProviderRateLimitQPS == 0 { + az.CloudProviderRateLimitQPS = rateLimitQPSDefault + } + if az.CloudProviderRateLimitBucket == 0 { + az.CloudProviderRateLimitBucket = rateLimitBucketDefault + } + az.operationPollRateLimiter = flowcontrol.NewTokenBucketRateLimiter( + float32(az.CloudProviderRateLimitQPS), + az.CloudProviderRateLimitBucket) + glog.V(2).Infof("Azure cloudprovider using rate limits: QPS=%d, bucket=%d", + az.CloudProviderRateLimitQPS, + az.CloudProviderRateLimitBucket) + } else { + // if rate limits are configured off, az.operationPollRateLimiter.Accept() is a no-op + az.operationPollRateLimiter = flowcontrol.NewFakeAlwaysRateLimiter() + } + + // Conditionally configure resource request backoff + if az.CloudProviderBackoff { + // Assign backoff defaults if no configuration was passed in + if az.CloudProviderBackoffRetries == 0 { + az.CloudProviderBackoffRetries = backoffRetriesDefault + } + if az.CloudProviderBackoffExponent == 0 { + az.CloudProviderBackoffExponent = backoffExponentDefault + } + if az.CloudProviderBackoffDuration == 0 { + az.CloudProviderBackoffDuration = backoffDurationDefault + } + if az.CloudProviderBackoffJitter == 0 { + az.CloudProviderBackoffJitter = backoffJitterDefault + } + az.resourceRequestBackoff = wait.Backoff{ + Steps: az.CloudProviderBackoffRetries, + Factor: az.CloudProviderBackoffExponent, + Duration: time.Duration(az.CloudProviderBackoffDuration) * time.Second, + Jitter: az.CloudProviderBackoffJitter, + } + glog.V(2).Infof("Azure cloudprovider using retry backoff: retries=%d, exponent=%f, duration=%d, jitter=%f", + az.CloudProviderBackoffRetries, + az.CloudProviderBackoffExponent, + az.CloudProviderBackoffDuration, + az.CloudProviderBackoffJitter) + } return &az, nil } diff --git a/pkg/cloudprovider/providers/azure/azure_backoff.go b/pkg/cloudprovider/providers/azure/azure_backoff.go index 64f6ce9b3ec..4ad3d7fbe10 100644 --- a/pkg/cloudprovider/providers/azure/azure_backoff.go +++ b/pkg/cloudprovider/providers/azure/azure_backoff.go @@ -17,35 +17,36 @@ limitations under the License. package azure import ( - "time" - "k8s.io/apimachinery/pkg/util/wait" "github.com/Azure/azure-sdk-for-go/arm/compute" "github.com/Azure/azure-sdk-for-go/arm/network" "github.com/Azure/go-autorest/autorest" "github.com/golang/glog" + "k8s.io/apimachinery/pkg/types" ) -const ( - operationPollInterval = 3 * time.Second - operationPollTimeoutDuration = time.Hour - backoffRetries = 12 - backoffExponent = 2 - backoffDuration = 1 * time.Second - backoffJitter = 1.0 -) - -var azAPIBackoff = wait.Backoff{ - Steps: backoffRetries, - Factor: backoffExponent, - Duration: backoffDuration, - Jitter: backoffJitter, +// GetVirtualMachineWithRetry invokes az.getVirtualMachine with exponential backoff retry +func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.VirtualMachine, bool, error) { + var machine compute.VirtualMachine + var exists bool + err := wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + az.operationPollRateLimiter.Accept() + var retryErr error + machine, exists, retryErr = az.getVirtualMachine(name) + if retryErr != nil { + glog.Errorf("backoff: failure, will retry,err=%v", retryErr) + return false, nil + } + glog.V(2).Infof("backoff: success") + return true, nil + }) + return machine, exists, err } // CreateOrUpdateSGWithRetry invokes az.SecurityGroupsClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) return processRetryResponse(resp, err) @@ -54,7 +55,7 @@ func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { // CreateOrUpdateLBWithRetry invokes az.LoadBalancerClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) return processRetryResponse(resp, err) @@ -63,7 +64,7 @@ func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error { // CreateOrUpdatePIPWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) return processRetryResponse(resp, err) @@ -72,7 +73,7 @@ func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error { // CreateOrUpdateInterfaceWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil) return processRetryResponse(resp, err) @@ -81,7 +82,7 @@ func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error { // DeletePublicIPWithRetry invokes az.PublicIPAddressesClient.Delete with exponential backoff retry func (az *Cloud) DeletePublicIPWithRetry(pipName string) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) return processRetryResponse(resp, err) @@ -90,7 +91,7 @@ func (az *Cloud) DeletePublicIPWithRetry(pipName string) error { // DeleteLBWithRetry invokes az.LoadBalancerClient.Delete with exponential backoff retry func (az *Cloud) DeleteLBWithRetry(lbName string) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) return processRetryResponse(resp, err) @@ -99,7 +100,7 @@ func (az *Cloud) DeleteLBWithRetry(lbName string) error { // CreateOrUpdateRouteTableWithRetry invokes az.RouteTablesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil) return processRetryResponse(resp, err) @@ -108,7 +109,7 @@ func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable // CreateOrUpdateRouteWithRetry invokes az.RoutesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil) return processRetryResponse(resp, err) @@ -117,7 +118,7 @@ func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { // DeleteRouteWithRetry invokes az.RoutesClient.Delete with exponential backoff retry func (az *Cloud) DeleteRouteWithRetry(routeName string) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil) return processRetryResponse(resp, err) @@ -126,7 +127,7 @@ func (az *Cloud) DeleteRouteWithRetry(routeName string) error { // CreateOrUpdateVMWithRetry invokes az.VirtualMachinesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateVMWithRetry(vmName string, newVM compute.VirtualMachine) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) return processRetryResponse(resp, err) diff --git a/pkg/cloudprovider/providers/azure/azure_instances.go b/pkg/cloudprovider/providers/azure/azure_instances.go index 45b109e1478..c4c40884208 100644 --- a/pkg/cloudprovider/providers/azure/azure_instances.go +++ b/pkg/cloudprovider/providers/azure/azure_instances.go @@ -24,6 +24,7 @@ import ( "k8s.io/kubernetes/pkg/cloudprovider" "github.com/Azure/azure-sdk-for-go/arm/compute" + "github.com/golang/glog" "k8s.io/apimachinery/pkg/types" ) @@ -31,6 +32,7 @@ import ( func (az *Cloud) NodeAddresses(name types.NodeName) ([]v1.NodeAddress, error) { ip, err := az.getIPForMachine(name) if err != nil { + glog.Errorf("error: az.NodeAddresses, az.getIPForMachine(%s), err=%v", name, err) return nil, err } @@ -55,9 +57,22 @@ func (az *Cloud) ExternalID(name types.NodeName) (string, error) { // InstanceID returns the cloud provider ID of the specified instance. // Note that if the instance does not exist or is no longer running, we must return ("", cloudprovider.InstanceNotFound) func (az *Cloud) InstanceID(name types.NodeName) (string, error) { - machine, exists, err := az.getVirtualMachine(name) + var machine compute.VirtualMachine + var exists bool + var err error + az.operationPollRateLimiter.Accept() + machine, exists, err = az.getVirtualMachine(name) if err != nil { - return "", err + if az.CloudProviderBackoff { + glog.V(2).Infof("InstanceID(%s) backing off", name) + machine, exists, err = az.GetVirtualMachineWithRetry(name) + if err != nil { + glog.V(2).Infof("InstanceID(%s) abort backoff", name) + return "", err + } + } else { + return "", err + } } else if !exists { return "", cloudprovider.InstanceNotFound } @@ -78,6 +93,7 @@ func (az *Cloud) InstanceTypeByProviderID(providerID string) (string, error) { func (az *Cloud) InstanceType(name types.NodeName) (string, error) { machine, exists, err := az.getVirtualMachine(name) if err != nil { + glog.Errorf("error: az.InstanceType(%s), az.getVirtualMachine(%s) err=%v", name, name, err) return "", err } else if !exists { return "", cloudprovider.InstanceNotFound @@ -102,6 +118,7 @@ func (az *Cloud) listAllNodesInResourceGroup() ([]compute.VirtualMachine, error) result, err := az.VirtualMachinesClient.List(az.ResourceGroup) if err != nil { + glog.Errorf("error: az.listAllNodesInResourceGroup(), az.VirtualMachinesClient.List(%s), err=%v", az.ResourceGroup, err) return nil, err } @@ -112,6 +129,7 @@ func (az *Cloud) listAllNodesInResourceGroup() ([]compute.VirtualMachine, error) result, err = az.VirtualMachinesClient.ListAllNextResults(result) if err != nil { + glog.Errorf("error: az.listAllNodesInResourceGroup(), az.VirtualMachinesClient.ListAllNextResults(%s), err=%v", result, err) return nil, err } diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index 302a0d8c6cd..7e9f151e004 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -150,7 +150,7 @@ func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nod sg.SecurityGroupPropertiesFormat.NetworkInterfaces = nil sg.SecurityGroupPropertiesFormat.Subnets = nil resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("ensure(%s) backing off: sg(%s) - updating", serviceName, *sg.Name) retryErr := az.CreateOrUpdateSGWithRetry(sg) if retryErr != nil { @@ -228,7 +228,7 @@ func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nod if !existsLb || lbNeedsUpdate { glog.V(3).Infof("ensure(%s): lb(%s) - updating", serviceName, lbName) resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("ensure(%s) backing off: lb(%s) - updating", serviceName, lbName) retryErr := az.CreateOrUpdateLBWithRetry(lb) if retryErr != nil { @@ -327,7 +327,7 @@ func (az *Cloud) EnsureLoadBalancerDeleted(clusterName string, service *v1.Servi sg.SecurityGroupPropertiesFormat.NetworkInterfaces = nil sg.SecurityGroupPropertiesFormat.Subnets = nil resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *reconciledSg.Name, reconciledSg, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("delete(%s) backing off: sg(%s) - updating", serviceName, az.SecurityGroupName) retryErr := az.CreateOrUpdateSGWithRetry(reconciledSg) if retryErr != nil { @@ -364,7 +364,7 @@ func (az *Cloud) cleanupLoadBalancer(clusterName string, service *v1.Service, is if len(*lb.FrontendIPConfigurations) > 0 { glog.V(3).Infof("delete(%s): lb(%s) - updating", serviceName, lbName) resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("delete(%s) backing off: sg(%s) - updating", serviceName, az.SecurityGroupName) retryErr := az.CreateOrUpdateLBWithRetry(lb) if retryErr != nil { @@ -379,7 +379,7 @@ func (az *Cloud) cleanupLoadBalancer(clusterName string, service *v1.Service, is glog.V(3).Infof("delete(%s): lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) resp, err := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("delete(%s) backing off: lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) retryErr := az.DeleteLBWithRetry(lbName) if retryErr != nil { @@ -433,7 +433,7 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName string) (*network.Pub glog.V(3).Infof("ensure(%s): pip(%s) - creating", serviceName, *pip.Name) resp, err := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("ensure(%s) backing off: pip(%s) - creating", serviceName, *pip.Name) retryErr := az.CreateOrUpdatePIPWithRetry(pip) if retryErr != nil { @@ -457,7 +457,7 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName string) (*network.Pub func (az *Cloud) ensurePublicIPDeleted(serviceName, pipName string) error { glog.V(2).Infof("ensure(%s): pip(%s) - deleting", serviceName, pipName) resp, deleteErr := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) - if shouldRetryAPIRequest(resp, deleteErr) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, deleteErr) { glog.V(2).Infof("ensure(%s) backing off: pip(%s) - deleting", serviceName, pipName) retryErr := az.DeletePublicIPWithRetry(pipName) if retryErr != nil { @@ -849,6 +849,7 @@ func findSecurityRule(rules []network.SecurityRule, rule network.SecurityRule) b // participating in the specified LoadBalancer Backend Pool. func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, backendPoolID string) error { vmName := mapNodeNameToVMName(nodeName) + az.operationPollRateLimiter.Accept() machine, err := az.VirtualMachinesClient.Get(az.ResourceGroup, vmName, "") if err != nil { return err @@ -906,8 +907,8 @@ func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, b glog.V(3).Infof("nicupdate(%s): nic(%s) - updating", serviceName, nicName) resp, err := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil) - if shouldRetryAPIRequest(resp, err) { - glog.V(2).Infof("nicupdate(%s) backing off: nic(%s) - updating", serviceName, nicName) + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("nicupdate(%s) backing off: nic(%s) - updating, err=%v", serviceName, nicName, err) retryErr := az.CreateOrUpdateInterfaceWithRetry(nic) if retryErr != nil { err = retryErr diff --git a/pkg/cloudprovider/providers/azure/azure_routes.go b/pkg/cloudprovider/providers/azure/azure_routes.go index 5c5fa2c132e..8332c9d124f 100644 --- a/pkg/cloudprovider/providers/azure/azure_routes.go +++ b/pkg/cloudprovider/providers/azure/azure_routes.go @@ -78,7 +78,7 @@ func (az *Cloud) CreateRoute(clusterName string, nameHint string, kubeRoute *clo glog.V(3).Infof("create: creating routetable. routeTableName=%q", az.RouteTableName) resp, err := az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("create backing off: creating routetable. routeTableName=%q", az.RouteTableName) retryErr := az.CreateOrUpdateRouteTableWithRetry(routeTable) if retryErr != nil { @@ -113,7 +113,7 @@ func (az *Cloud) CreateRoute(clusterName string, nameHint string, kubeRoute *clo glog.V(3).Infof("create: creating route: instance=%q cidr=%q", kubeRoute.TargetNode, kubeRoute.DestinationCIDR) resp, err := az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("create backing off: creating route: instance=%q cidr=%q", kubeRoute.TargetNode, kubeRoute.DestinationCIDR) retryErr := az.CreateOrUpdateRouteWithRetry(route) if retryErr != nil { @@ -136,7 +136,7 @@ func (az *Cloud) DeleteRoute(clusterName string, kubeRoute *cloudprovider.Route) routeName := mapNodeNameToRouteName(kubeRoute.TargetNode) resp, err := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("delete backing off: deleting route. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR) retryErr := az.DeleteRouteWithRetry(routeName) if retryErr != nil { diff --git a/pkg/cloudprovider/providers/azure/azure_storage.go b/pkg/cloudprovider/providers/azure/azure_storage.go index b35e312b353..bb09cc1c5e5 100644 --- a/pkg/cloudprovider/providers/azure/azure_storage.go +++ b/pkg/cloudprovider/providers/azure/azure_storage.go @@ -66,7 +66,7 @@ func (az *Cloud) AttachDisk(diskName, diskURI string, nodeName types.NodeName, l vmName := mapNodeNameToVMName(nodeName) glog.V(2).Infof("create(%s): vm(%s)", az.ResourceGroup, vmName) resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("create(%s) backing off: vm(%s)", az.ResourceGroup, vmName) retryErr := az.CreateOrUpdateVMWithRetry(vmName, newVM) if retryErr != nil { @@ -146,7 +146,7 @@ func (az *Cloud) DetachDiskByName(diskName, diskURI string, nodeName types.NodeN vmName := mapNodeNameToVMName(nodeName) glog.V(2).Infof("create(%s): vm(%s)", az.ResourceGroup, vmName) resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("create(%s) backing off: vm(%s)", az.ResourceGroup, vmName) retryErr := az.CreateOrUpdateVMWithRetry(vmName, newVM) if retryErr != nil { diff --git a/pkg/cloudprovider/providers/azure/azure_test.go b/pkg/cloudprovider/providers/azure/azure_test.go index 98e9eead48c..4176230a720 100644 --- a/pkg/cloudprovider/providers/azure/azure_test.go +++ b/pkg/cloudprovider/providers/azure/azure_test.go @@ -591,7 +591,15 @@ func TestNewCloudFromJSON(t *testing.T) { "securityGroupName": "--security-group-name--", "vnetName": "--vnet-name--", "routeTableName": "--route-table-name--", - "primaryAvailabilitySetName": "--primary-availability-set-name--" + "primaryAvailabilitySetName": "--primary-availability-set-name--", + "cloudProviderBackoff": true, + "cloudProviderBackoffRetries": 6, + "cloudProviderBackoffExponent": 1.5, + "cloudProviderBackoffDuration": 5, + "cloudProviderBackoffJitter": 1.0, + "cloudProviderRatelimit": true, + "cloudProviderRateLimitQPS": 1, + "cloudProviderRateLimitBucket": 5 }` validateConfig(t, config) } @@ -610,6 +618,14 @@ securityGroupName: --security-group-name-- vnetName: --vnet-name-- routeTableName: --route-table-name-- primaryAvailabilitySetName: --primary-availability-set-name-- +cloudProviderBackoff: true +cloudProviderBackoffRetries: 6 +cloudProviderBackoffExponent: 1.5 +cloudProviderBackoffDuration: 5 +cloudProviderBackoffJitter: 1.0 +cloudProviderRatelimit: true +cloudProviderRateLimitQPS: 1 +cloudProviderRateLimitBucket: 5 ` validateConfig(t, config) } @@ -659,6 +675,30 @@ func validateConfig(t *testing.T, config string) { if azureCloud.PrimaryAvailabilitySetName != "--primary-availability-set-name--" { t.Errorf("got incorrect value for PrimaryAvailabilitySetName") } + if azureCloud.CloudProviderBackoff != true { + t.Errorf("got incorrect value for CloudProviderBackoff") + } + if azureCloud.CloudProviderBackoffRetries != 6 { + t.Errorf("got incorrect value for CloudProviderBackoffRetries") + } + if azureCloud.CloudProviderBackoffExponent != 1.5 { + t.Errorf("got incorrect value for CloudProviderBackoffExponent") + } + if azureCloud.CloudProviderBackoffDuration != 5 { + t.Errorf("got incorrect value for CloudProviderBackoffDuration") + } + if azureCloud.CloudProviderBackoffJitter != 1.0 { + t.Errorf("got incorrect value for CloudProviderBackoffJitter") + } + if azureCloud.CloudProviderRateLimit != true { + t.Errorf("got incorrect value for CloudProviderRateLimit") + } + if azureCloud.CloudProviderRateLimitQPS != 1 { + t.Errorf("got incorrect value for CloudProviderRateLimitQPS") + } + if azureCloud.CloudProviderRateLimitBucket != 5 { + t.Errorf("got incorrect value for CloudProviderRateLimitBucket") + } } func TestDecodeInstanceInfo(t *testing.T) { diff --git a/pkg/cloudprovider/providers/azure/azure_util.go b/pkg/cloudprovider/providers/azure/azure_util.go index 094f3aaf903..bebea9a64ab 100644 --- a/pkg/cloudprovider/providers/azure/azure_util.go +++ b/pkg/cloudprovider/providers/azure/azure_util.go @@ -25,6 +25,7 @@ import ( "github.com/Azure/azure-sdk-for-go/arm/compute" "github.com/Azure/azure-sdk-for-go/arm/network" + "github.com/golang/glog" "k8s.io/apimachinery/pkg/types" ) @@ -242,26 +243,31 @@ func (az *Cloud) getIPForMachine(nodeName types.NodeName) (string, error) { return "", cloudprovider.InstanceNotFound } if err != nil { + glog.Errorf("error: az.getIPForMachine(%s), az.getVirtualMachine(%s), err=%v", nodeName, nodeName, err) return "", err } nicID, err := getPrimaryInterfaceID(machine) if err != nil { + glog.Errorf("error: az.getIPForMachine(%s), getPrimaryInterfaceID(%s), err=%v", nodeName, machine, err) return "", err } nicName, err := getLastSegment(nicID) if err != nil { + glog.Errorf("error: az.getIPForMachine(%s), getLastSegment(%s), err=%v", nodeName, nicID, err) return "", err } nic, err := az.InterfacesClient.Get(az.ResourceGroup, nicName, "") if err != nil { + glog.Errorf("error: az.getIPForMachine(%s), az.InterfacesClient.Get(%s, %s, %s), err=%v", nodeName, az.ResourceGroup, nicName, "", err) return "", err } ipConfig, err := getPrimaryIPConfig(nic) if err != nil { + glog.Errorf("error: az.getIPForMachine(%s), getPrimaryIPConfig(%s), err=%v", nodeName, nic, err) return "", err } diff --git a/pkg/cloudprovider/providers/azure/azure_wrap.go b/pkg/cloudprovider/providers/azure/azure_wrap.go index 185b7347211..2eb5f6d9e3c 100644 --- a/pkg/cloudprovider/providers/azure/azure_wrap.go +++ b/pkg/cloudprovider/providers/azure/azure_wrap.go @@ -43,6 +43,7 @@ func (az *Cloud) getVirtualMachine(nodeName types.NodeName) (vm compute.VirtualM var realErr error vmName := string(nodeName) + az.operationPollRateLimiter.Accept() vm, err = az.VirtualMachinesClient.Get(az.ResourceGroup, vmName, "") exists, realErr = checkResourceExistsFromError(err)