Don't allow setting dead server last contact threshold to less than 1 minute (#22040)

* Don't allow setting dead server last contact threshold to less than 1 minute

* add changelog

* document the minimum dead server last contact threshold
This commit is contained in:
Josh Black
2023-07-25 13:46:52 -07:00
committed by GitHub
parent 64b50ad7a1
commit d407078766
4 changed files with 18 additions and 2 deletions

3
changelog/22040.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:improvement
storage/raft: Enforce a minimum dead_server_last_contact_threshold of 1m.
```

View File

@@ -194,6 +194,14 @@ func TestRaft_Autopilot_Configuration(t *testing.T) {
writeConfigFunc(writableConfig, true)
configCheckFunc(config)
// Check dead server last contact threshold minimum
writableConfig = map[string]interface{}{
"cleanup_dead_servers": true,
"dead_server_last_contact_threshold": "5s",
}
writeConfigFunc(writableConfig, true)
configCheckFunc(config)
// Ensure that the configuration stays across reboots
leaderCore := cluster.Cores[0]
testhelpers.EnsureCoreSealed(t, cluster.Cores[0])
@@ -450,7 +458,7 @@ func TestRaft_Autopilot_DeadServerCleanup(t *testing.T) {
// Ensure Autopilot has the aggressive settings
config.CleanupDeadServers = true
config.ServerStabilizationTime = 5 * time.Second
config.DeadServerLastContactThreshold = 10 * time.Second
config.DeadServerLastContactThreshold = 1 * time.Minute
config.MaxTrailingLogs = 10
config.LastContactThreshold = 10 * time.Second
config.MinQuorum = 3

View File

@@ -533,6 +533,10 @@ func (b *SystemBackend) handleStorageRaftAutopilotConfigUpdate() framework.Opera
return logical.ErrorResponse(fmt.Sprintf("min_quorum must be set when cleanup_dead_servers is set and it should at least be 3; cleanup_dead_servers: %#v, min_quorum: %#v", effectiveConf.CleanupDeadServers, effectiveConf.MinQuorum)), logical.ErrInvalidRequest
}
if effectiveConf.CleanupDeadServers && effectiveConf.DeadServerLastContactThreshold.Seconds() < 60 {
return logical.ErrorResponse(fmt.Sprintf("dead_server_last_contact_threshold should not be set to less than 1m; received: %v", deadServerLastContactThreshold)), logical.ErrInvalidRequest
}
// Persist only the user supplied fields
if persist {
entry, err := logical.StorageEntryJSON(raftAutopilotConfigurationStoragePath, config)

View File

@@ -210,7 +210,8 @@ This endpoint is used to modify the configuration of the autopilot subsystem of
- `dead_server_last_contact_threshold` `(string: "24h")` - Limit on the amount of time
a server can go without leader contact before being considered failed. This
takes effect only when `cleanup_dead_servers` is `true`.
takes effect only when `cleanup_dead_servers` is `true`. This cannot be set to a value
smaller than 1m.
- `max_trailing_logs` `(int: 1000)` - Amount of entries in the Raft Log that a server
can be behind before being considered unhealthy.