	test: fix and improve reboot/reset tests
These tests rely on node uptime checks, which are quite flaky. The following fixes were applied (a standalone sketch of the resulting check follows the list):

* code was refactored into a common method shared between the reset/reboot tests (reboot-all-nodes does its checks in a different way, so it wasn't updated)
* each request to read uptime times out in 5 seconds, so that checks don't wait forever when a node is down (or the connection is aborted)
* to account for node availability vs. lower uptime at the beginning of the test, extra elapsed time is added to the check condition

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
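A minimal standalone sketch of the resulting check, not the actual test code: readUptime below is a hypothetical stand-in for the suite's ReadUptime helper, while the 5-second request timeout and 2-second offset mirror the values used in the diff.

package main

import (
	"context"
	"fmt"
	"time"
)

// readUptime is a placeholder; the real helper reads the node's uptime over the Talos API.
func readUptime(ctx context.Context) (time.Duration, error) {
	return 42 * time.Second, nil
}

// checkRebooted returns nil once the node's uptime indicates a reboot happened.
// It is meant to be called repeatedly from a retry loop.
func checkRebooted(ctx context.Context, uptimeBefore time.Duration, rebootTimestamp time.Time) error {
	// each uptime read gets its own short timeout, so a dead node or an
	// aborted connection can't stall the retry loop forever
	requestCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()

	// offset accounts for uptime measurement inaccuracy
	const offset = 2 * time.Second

	// until the node actually reboots, its uptime keeps growing linearly,
	// so the pre-reboot reading is compared against uptimeBefore + elapsed
	elapsed := time.Since(rebootTimestamp) - offset

	uptimeAfter, err := readUptime(requestCtx)
	if err != nil {
		// API might be unresponsive during reboot; a retry wrapper treats this as expected
		return fmt.Errorf("error reading uptime: %w", err)
	}

	if uptimeAfter >= uptimeBefore+elapsed {
		return fmt.Errorf("uptime didn't go down: before %s + %s, after %s", uptimeBefore, elapsed, uptimeAfter)
	}

	return nil
}

func main() {
	// example invocation: reboot request was issued one minute ago
	fmt.Println(checkRebooted(context.Background(), time.Minute, time.Now().Add(-time.Minute)))
}

The per-read timeout keeps a single unresponsive node from blocking the whole retry loop, while the elapsed-time allowance prevents false failures while the node is still counting up its old uptime.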
committed by talos-bot

parent 51112a1d86
commit 6fb55229a2
@@ -57,43 +57,9 @@ func (suite *RebootSuite) TestRebootNodeByNode() {
 	for _, node := range nodes {
 		suite.T().Log("rebooting node", node)
 
-		func(node string) {
-			// timeout for single node reboot
-			ctx, ctxCancel := context.WithTimeout(suite.ctx, 10*time.Minute)
-			defer ctxCancel()
-
-			nodeCtx := client.WithNodes(ctx, node)
-
-			// read uptime before reboot
-			uptimeBefore, err := suite.ReadUptime(nodeCtx)
-			suite.Require().NoError(err)
-
-			suite.Assert().NoError(suite.Client.Reboot(nodeCtx))
-
-			var uptimeAfter float64
-
-			suite.Require().NoError(retry.Constant(10 * time.Minute).Retry(func() error {
-				uptimeAfter, err = suite.ReadUptime(nodeCtx)
-				if err != nil {
-					// API might be unresponsive during reboot
-					return retry.ExpectedError(err)
-				}
-
-				if uptimeAfter >= uptimeBefore {
-					// uptime should go down after reboot
-					return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %f, after %f", uptimeBefore, uptimeAfter))
-				}
-
-				return nil
-			}))
-
-			if suite.Cluster != nil {
-				// without cluster state we can't do deep checks, but basic reboot test still works
-				// NB: using `ctx` here to have client talking to init node by default
-				suite.AssertClusterHealthy(ctx)
-			}
-		}(node)
-
+		suite.AssertRebooted(suite.ctx, node, func(nodeCtx context.Context) error {
+			return suite.Client.Reboot(nodeCtx)
+		}, 10*time.Minute)
 	}
 }
@@ -103,6 +69,9 @@ func (suite *RebootSuite) TestRebootAllNodes() {
 		suite.T().Skip("cluster doesn't support reboots")
 	}
 
+	// offset to account for uptime measuremenet inaccuracy
+	const offset = 2 * time.Second
+
 	nodes := suite.DiscoverNodes()
 	suite.Require().NotEmpty(nodes)
 
@@ -131,6 +100,8 @@ func (suite *RebootSuite) TestRebootAllNodes() {
 		suite.Require().NoError(<-errCh)
 	}
 
+	rebootTimestamp := time.Now()
+
 	allNodesCtx := client.WithNodes(suite.ctx, nodes...)
 
 	suite.Require().NoError(suite.Client.Reboot(allNodesCtx))
@@ -143,20 +114,27 @@ func (suite *RebootSuite) TestRebootAllNodes() {
 				return fmt.Errorf("uptime record not found for %q", node)
 			}
 
-			uptimeBefore := uptimeBeforeInterface.(float64) //nolint: errcheck
+			uptimeBefore := uptimeBeforeInterface.(time.Duration) //nolint: errcheck
 
 			nodeCtx := client.WithNodes(suite.ctx, node)
 
 			return retry.Constant(10 * time.Minute).Retry(func() error {
-				uptimeAfter, err := suite.ReadUptime(nodeCtx)
+				requestCtx, requestCtxCancel := context.WithTimeout(nodeCtx, 5*time.Second)
+				defer requestCtxCancel()
+
+				elapsed := time.Since(rebootTimestamp) - offset
+
+				uptimeAfter, err := suite.ReadUptime(requestCtx)
 				if err != nil {
 					// API might be unresponsive during reboot
 					return retry.ExpectedError(fmt.Errorf("error reading uptime for node %q: %w", node, err))
 				}
 
-				if uptimeAfter >= uptimeBefore {
+				// uptime of the node before it actually reboots still goes up linearly
+				// so we can safely add elapsed time here
+				if uptimeAfter >= uptimeBefore+elapsed {
 					// uptime should go down after reboot
-					return retry.ExpectedError(fmt.Errorf("uptime didn't go down for node %q: before %f, after %f", node, uptimeBefore, uptimeAfter))
+					return retry.ExpectedError(fmt.Errorf("uptime didn't go down for node %q: before %s + %s, after %s", node, uptimeBefore, elapsed, uptimeAfter))
 				}
 
 				return nil
@@ -8,14 +8,11 @@ package api
 
 import (
 	"context"
-	"fmt"
 	"sort"
 	"testing"
 	"time"
 
 	"github.com/talos-systems/talos/internal/integration/base"
-	"github.com/talos-systems/talos/pkg/client"
-	"github.com/talos-systems/talos/pkg/retry"
 )
 
 type ResetSuite struct {
@@ -70,43 +67,13 @@ func (suite *ResetSuite) TestResetNodeByNode() {
 
 		suite.T().Log("Resetting node", node)
 
-		func(node string) {
-			// timeout for single node Reset
-			ctx, ctxCancel := context.WithTimeout(suite.ctx, 5*time.Minute)
-			defer ctxCancel()
-
-			nodeCtx := client.WithNodes(ctx, node)
-
-			// read uptime before Reset
-			uptimeBefore, err := suite.ReadUptime(nodeCtx)
-			suite.Require().NoError(err)
-
-			// force reboot after reset, as this is the only mode we can test
-			suite.Assert().NoError(suite.Client.Reset(nodeCtx, true, true))
-
-			var uptimeAfter float64
-
-			suite.Require().NoError(retry.Constant(10 * time.Minute).Retry(func() error {
-				uptimeAfter, err = suite.ReadUptime(nodeCtx)
-				if err != nil {
-					// API might be unresponsive during reboot
-					return retry.ExpectedError(err)
-				}
-
-				if uptimeAfter >= uptimeBefore {
-					// uptime should go down after Reset, as it reboots the node
-					return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %f, after %f", uptimeBefore, uptimeAfter))
-				}
-
-				return nil
-			}))
-
-			// TODO: there is no good way to assert that node was reset and disk contents were really wiped
-			// NB: using `ctx` here to have client talking to init node by default
-			suite.AssertClusterHealthy(ctx)
-		}(node)
-
+		// uptime should go down after Reset, as it reboots the node
+		suite.AssertRebooted(suite.ctx, node, func(nodeCtx context.Context) error {
+			// force reboot after reset, as this is the only mode we can test
+			return suite.Client.Reset(nodeCtx, true, true)
+		}, 10*time.Minute)
+
+		// TODO: there is no good way to assert that node was reset and disk contents were really wiped
 	}
 }
@@ -21,6 +21,7 @@ import (
 	"github.com/talos-systems/talos/internal/pkg/provision/access"
 	"github.com/talos-systems/talos/pkg/client"
 	"github.com/talos-systems/talos/pkg/client/config"
+	"github.com/talos-systems/talos/pkg/retry"
 )
 
 // APISuite is a base suite for API tests
@@ -115,7 +116,7 @@ func (apiSuite *APISuite) AssertClusterHealthy(ctx context.Context) {
 // ReadUptime reads node uptime.
 //
 // Context provided might have specific node attached for API call.
-func (apiSuite *APISuite) ReadUptime(ctx context.Context) (float64, error) {
+func (apiSuite *APISuite) ReadUptime(ctx context.Context) (time.Duration, error) {
 	// set up a short timeout around uptime read calls to work around
 	// cases when rebooted node doesn't answer for a long time on requests
 	reqCtx, reqCtxCancel := context.WithTimeout(ctx, 10*time.Second)
@@ -150,7 +151,60 @@ func (apiSuite *APISuite) ReadUptime(ctx context.Context) (float64, error) {
 		}
 	}
 
-	return uptime, reader.Close()
+	return time.Duration(uptime * float64(time.Second)), reader.Close()
+}
+
+// AssertRebooted verifies that node got rebooted as result of running some API call.
+//
+// Verification happens via reading uptime of the node.
+func (apiSuite *APISuite) AssertRebooted(ctx context.Context, node string, rebootFunc func(nodeCtx context.Context) error, timeout time.Duration) {
+	// offset to account for uptime measuremenet inaccuracy
+	const offset = 2 * time.Second
+
+	// timeout for single node Reset
+	ctx, ctxCancel := context.WithTimeout(ctx, timeout)
+	defer ctxCancel()
+
+	nodeCtx := client.WithNodes(ctx, node)
+
+	// read uptime before Reset
+	uptimeBefore, err := apiSuite.ReadUptime(nodeCtx)
+	apiSuite.Require().NoError(err)
+
+	apiSuite.Assert().NoError(rebootFunc(nodeCtx))
+
+	// capture current time when API returns
+	rebootTimestamp := time.Now()
+
+	var uptimeAfter time.Duration
+
+	apiSuite.Require().NoError(retry.Constant(timeout).Retry(func() error {
+		requestCtx, requestCtxCancel := context.WithTimeout(nodeCtx, 5*time.Second)
+		defer requestCtxCancel()
+
+		elapsed := time.Since(rebootTimestamp) - offset
+
+		uptimeAfter, err = apiSuite.ReadUptime(requestCtx)
+		if err != nil {
+			// API might be unresponsive during reboot
+			return retry.ExpectedError(err)
+		}
+
+		// uptime of the node before it actually reboots still goes up linearly
+		// so we can safely add elapsed time here
+		if uptimeAfter >= uptimeBefore+elapsed {
+			// uptime should go down after reboot
+			return retry.ExpectedError(fmt.Errorf("uptime didn't go down: before %s + %s, after %s", uptimeBefore, elapsed, uptimeAfter))
+		}
+
+		return nil
+	}))
+
+	if apiSuite.Cluster != nil {
+		// without cluster state we can't do deep checks, but basic reboot test still works
+		// NB: using `ctx` here to have client talking to init node by default
+		apiSuite.AssertClusterHealthy(ctx)
+	}
 }
 
 // TearDownSuite closes Talos API client