test: fix and improve reboot/reset tests

These tests rely on node uptime checks. These checks are quite flaky.

Following fixes were applied:

* code was refactored as common method shared between reset/reboot tests
(reboot all nodes does checks in a different way, so it wasn't updated)

* each request to read uptime times out in 5 seconds, so that checks
don't wait forever when node is down (or connection is aborted)

* to account for node availability vs. lower uptime in the beginning of
test, add extra elapsed time to the check condition

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
This commit is contained in:
Andrey Smirnov
2020-06-29 23:14:39 +03:00
committed by talos-bot
parent 51112a1d86
commit 6fb55229a2
3 changed files with 80 additions and 81 deletions

View File

@@ -8,14 +8,11 @@ package api
import (
"context"
"fmt"
"sort"
"testing"
"time"
"github.com/talos-systems/talos/internal/integration/base"
"github.com/talos-systems/talos/pkg/client"
"github.com/talos-systems/talos/pkg/retry"
)
type ResetSuite struct {
@@ -70,43 +67,13 @@ func (suite *ResetSuite) TestResetNodeByNode() {
suite.T().Log("Resetting node", node)
func(node string) {
// timeout for single node Reset
ctx, ctxCancel := context.WithTimeout(suite.ctx, 5*time.Minute)
defer ctxCancel()
nodeCtx := client.WithNodes(ctx, node)
// read uptime before Reset
uptimeBefore, err := suite.ReadUptime(nodeCtx)
suite.Require().NoError(err)
// uptime should go down after Reset, as it reboots the node
suite.AssertRebooted(suite.ctx, node, func(nodeCtx context.Context) error {
// force reboot after reset, as this is the only mode we can test
suite.Assert().NoError(suite.Client.Reset(nodeCtx, true, true))
var uptimeAfter float64
suite.Require().NoError(retry.Constant(10 * time.Minute).Retry(func() error {
uptimeAfter, err = suite.ReadUptime(nodeCtx)
if err != nil {
// API might be unresponsive during reboot
return retry.ExpectedError(err)
}
if uptimeAfter >= uptimeBefore {
// uptime should go down after Reset, as it reboots the node
return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %f, after %f", uptimeBefore, uptimeAfter))
}
return nil
}))
// TODO: there is no good way to assert that node was reset and disk contents were really wiped
// NB: using `ctx` here to have client talking to init node by default
suite.AssertClusterHealthy(ctx)
}(node)
return suite.Client.Reset(nodeCtx, true, true)
}, 10*time.Minute)
// TODO: there is no good way to assert that node was reset and disk contents were really wiped
}
}