Skip to content

Commit 59dac19

Browse files
committed
Add a test for server stabilization (#11128)
1 parent d77a09d commit 59dac19

File tree

5 files changed

+195
-43
lines changed

5 files changed

+195
-43
lines changed

physical/raft/raft.go

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,8 @@ type RaftBackend struct {
145145
// VAULT_RAFT_AUTOPILOT_DISABLE during startup and can't be updated once the
146146
// node is up and running.
147147
disableAutopilot bool
148+
149+
autopilotReconcileInterval time.Duration
148150
}
149151

150152
// LeaderJoinInfo contains information required by a node to join itself as a
@@ -385,7 +387,7 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
385387
if err != nil {
386388
return nil, fmt.Errorf("snapshot_delay does not parse as a duration: %w", err)
387389
}
388-
snap = newSnapshotStoreDelay(snap, delay)
390+
snap = newSnapshotStoreDelay(snap, delay, logger)
389391
}
390392

391393
maxEntrySize := defaultMaxEntrySize
@@ -398,28 +400,40 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
398400
maxEntrySize = uint64(i)
399401
}
400402

403+
var reconcileInterval time.Duration
404+
if interval := conf["autopilot_reconcile_interval"]; interval != "" {
405+
interval, err := time.ParseDuration(interval)
406+
if err != nil {
407+
return nil, fmt.Errorf("autopilot_reconcile_interval does not parse as a duration: %w", err)
408+
}
409+
reconcileInterval = interval
410+
}
411+
401412
return &RaftBackend{
402-
logger: logger,
403-
fsm: fsm,
404-
raftInitCh: make(chan struct{}),
405-
conf: conf,
406-
logStore: log,
407-
stableStore: stable,
408-
snapStore: snap,
409-
dataDir: path,
410-
localID: localID,
411-
permitPool: physical.NewPermitPool(physical.DefaultParallelOperations),
412-
maxEntrySize: maxEntrySize,
413-
followerHeartbeatTicker: time.NewTicker(time.Second),
413+
logger: logger,
414+
fsm: fsm,
415+
raftInitCh: make(chan struct{}),
416+
conf: conf,
417+
logStore: log,
418+
stableStore: stable,
419+
snapStore: snap,
420+
dataDir: path,
421+
localID: localID,
422+
permitPool: physical.NewPermitPool(physical.DefaultParallelOperations),
423+
maxEntrySize: maxEntrySize,
424+
followerHeartbeatTicker: time.NewTicker(time.Second),
425+
autopilotReconcileInterval: reconcileInterval,
414426
}, nil
415427
}
416428

417429
type snapshotStoreDelay struct {
430+
logger log.Logger
418431
wrapped raft.SnapshotStore
419432
delay time.Duration
420433
}
421434

422435
func (s snapshotStoreDelay) Create(version raft.SnapshotVersion, index, term uint64, configuration raft.Configuration, configurationIndex uint64, trans raft.Transport) (raft.SnapshotSink, error) {
436+
s.logger.Trace("delaying before creating snapshot", "delay", s.delay)
423437
time.Sleep(s.delay)
424438
return s.wrapped.Create(version, index, term, configuration, configurationIndex, trans)
425439
}
@@ -434,8 +448,9 @@ func (s snapshotStoreDelay) Open(id string) (*raft.SnapshotMeta, io.ReadCloser,
434448

435449
var _ raft.SnapshotStore = &snapshotStoreDelay{}
436450

437-
func newSnapshotStoreDelay(snap raft.SnapshotStore, delay time.Duration) *snapshotStoreDelay {
451+
func newSnapshotStoreDelay(snap raft.SnapshotStore, delay time.Duration, logger log.Logger) *snapshotStoreDelay {
438452
return &snapshotStoreDelay{
453+
logger: logger,
439454
wrapped: snap,
440455
delay: delay,
441456
}
@@ -666,6 +681,7 @@ func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error {
666681

667682
// Setup the raft config
668683
raftConfig := raft.DefaultConfig()
684+
raftConfig.SnapshotInterval = 5 * time.Second
669685
if err := b.applyConfigSettings(raftConfig); err != nil {
670686
return err
671687
}
@@ -774,6 +790,7 @@ func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error {
774790
}
775791
}
776792

793+
b.logger.Info("creating Raft", "config", fmt.Sprintf("%#v", raftConfig))
777794
raftObj, err := raft.NewRaft(raftConfig, b.fsm.chunker, b.logStore, b.stableStore, b.snapStore, b.raftTransport)
778795
b.fsm.SetNoopRestore(false)
779796
if err != nil {

physical/raft/raft_autopilot.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -670,13 +670,20 @@ func (b *RaftBackend) SetupAutopilot(ctx context.Context, storageConfig *Autopil
670670
b.autopilotConfig.Merge(storageConfig)
671671

672672
// Create the autopilot instance
673-
b.autopilot = autopilot.New(b.raft, newDelegate(b), autopilot.WithLogger(b.logger), autopilot.WithPromoter(b.autopilotPromoter()))
673+
options := []autopilot.Option{
674+
autopilot.WithLogger(b.logger),
675+
autopilot.WithPromoter(b.autopilotPromoter()),
676+
}
677+
if b.autopilotReconcileInterval != 0 {
678+
options = append(options, autopilot.WithReconcileInterval(b.autopilotReconcileInterval))
679+
}
680+
b.autopilot = autopilot.New(b.raft, newDelegate(b), options...)
674681
b.followerStates = followerStates
675682
b.followerHeartbeatTicker = time.NewTicker(1 * time.Second)
676683

677684
b.l.Unlock()
678685

679-
b.logger.Info("starting autopilot", "config", b.autopilotConfig)
686+
b.logger.Info("starting autopilot", "config", b.autopilotConfig, "reconcile_interval", b.autopilotReconcileInterval)
680687
b.autopilot.Start(ctx)
681688

682689
go b.startFollowerHeartbeatTracker()

vault/external_tests/raft/raft_autopilot_test.go

Lines changed: 138 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,23 @@ package rafttests
22

33
import (
44
"context"
5+
"fmt"
56
"math"
67
"testing"
78
"time"
89

9-
"github.com/hashicorp/vault/api"
10-
"github.com/kr/pretty"
11-
10+
"github.com/hashicorp/go-hclog"
1211
autopilot "github.com/hashicorp/raft-autopilot"
13-
14-
"github.com/stretchr/testify/require"
15-
12+
"github.com/hashicorp/vault/api"
1613
"github.com/hashicorp/vault/helper/namespace"
1714
"github.com/hashicorp/vault/helper/testhelpers"
15+
"github.com/hashicorp/vault/helper/testhelpers/teststorage"
1816
"github.com/hashicorp/vault/physical/raft"
17+
"github.com/hashicorp/vault/sdk/helper/strutil"
1918
"github.com/hashicorp/vault/vault"
19+
"github.com/kr/pretty"
20+
testingintf "github.com/mitchellh/go-testing-interface"
21+
"github.com/stretchr/testify/require"
2022
)
2123

2224
func TestRaft_Autopilot_Disable(t *testing.T) {
@@ -208,3 +210,133 @@ func TestRaft_Autopilot_Configuration(t *testing.T) {
208210
vault.TestWaitActive(t, leaderCore.Core)
209211
configCheckFunc(config)
210212
}
213+
214+
// TestRaft_Autopilot_Stabilization_Delay verifies that if a node takes a long
215+
// time to become ready, it doesn't get promoted to voter until then.
216+
func TestRaft_Autopilot_Stabilization_Delay(t *testing.T) {
217+
conf, opts := teststorage.ClusterSetup(nil, nil, teststorage.RaftBackendSetup)
218+
conf.DisableAutopilot = false
219+
opts.InmemClusterLayers = true
220+
opts.KeepStandbysSealed = true
221+
opts.SetupFunc = nil
222+
timeToHealthyCore2 := 5 * time.Second
223+
opts.PhysicalFactory = func(t testingintf.T, coreIdx int, logger hclog.Logger, conf map[string]interface{}) *vault.PhysicalBackendBundle {
224+
config := map[string]interface{}{
225+
"snapshot_threshold": "50",
226+
"trailing_logs": "100",
227+
"autopilot_reconcile_interval": "1s",
228+
}
229+
if coreIdx == 2 {
230+
config["snapshot_delay"] = timeToHealthyCore2.String()
231+
}
232+
return teststorage.MakeRaftBackend(t, coreIdx, logger, config)
233+
}
234+
235+
cluster := vault.NewTestCluster(t, conf, opts)
236+
cluster.Start()
237+
defer cluster.Cleanup()
238+
testhelpers.WaitForActiveNode(t, cluster)
239+
240+
// Check that autopilot execution state is running
241+
client := cluster.Cores[0].Client
242+
state, err := client.Sys().RaftAutopilotState()
243+
require.NotNil(t, state)
244+
require.NoError(t, err)
245+
require.Equal(t, true, state.Healthy)
246+
require.Len(t, state.Servers, 1)
247+
require.Equal(t, "core-0", state.Servers["core-0"].ID)
248+
require.Equal(t, "alive", state.Servers["core-0"].NodeStatus)
249+
require.Equal(t, "leader", state.Servers["core-0"].Status)
250+
251+
_, err = client.Logical().Write("sys/storage/raft/autopilot/configuration", map[string]interface{}{
252+
"server_stabilization_time": "3s",
253+
})
254+
require.NoError(t, err)
255+
256+
config, err := client.Sys().RaftAutopilotConfiguration()
257+
require.NoError(t, err)
258+
259+
// Wait for 110% of the stabilization time to add nodes
260+
stabilizationKickOffWaitDuration := time.Duration(math.Ceil(1.1 * float64(config.ServerStabilizationTime)))
261+
time.Sleep(stabilizationKickOffWaitDuration)
262+
263+
cli := cluster.Cores[0].Client
264+
// Write more keys than snapshot_threshold
265+
for i := 0; i < 250; i++ {
266+
_, err := cli.Logical().Write(fmt.Sprintf("secret/%d", i), map[string]interface{}{
267+
"test": "data",
268+
})
269+
if err != nil {
270+
t.Fatal(err)
271+
}
272+
}
273+
274+
joinFunc := func(core *vault.TestClusterCore) {
275+
_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), []*raft.LeaderJoinInfo{
276+
{
277+
LeaderAPIAddr: client.Address(),
278+
TLSConfig: cluster.Cores[0].TLSConfig,
279+
Retry: true,
280+
},
281+
}, false)
282+
require.NoError(t, err)
283+
time.Sleep(1 * time.Second)
284+
cluster.UnsealCore(t, core)
285+
}
286+
287+
checkState := func(nodeID string, numServers int, allHealthy bool, healthy bool, suffrage string) {
288+
state, err = client.Sys().RaftAutopilotState()
289+
require.NoError(t, err)
290+
require.Equal(t, allHealthy, state.Healthy)
291+
require.Len(t, state.Servers, numServers)
292+
require.Equal(t, healthy, state.Servers[nodeID].Healthy)
293+
require.Equal(t, "alive", state.Servers[nodeID].NodeStatus)
294+
require.Equal(t, suffrage, state.Servers[nodeID].Status)
295+
}
296+
297+
joinFunc(cluster.Cores[1])
298+
checkState("core-1", 2, false, false, "non-voter")
299+
300+
core2shouldBeHealthyAt := time.Now().Add(timeToHealthyCore2)
301+
joinFunc(cluster.Cores[2])
302+
checkState("core-2", 3, false, false, "non-voter")
303+
304+
stabilizationWaitDuration := time.Duration(1.25 * float64(config.ServerStabilizationTime))
305+
deadline := time.Now().Add(stabilizationWaitDuration)
306+
var core1healthy, core2healthy bool
307+
for time.Now().Before(deadline) {
308+
state, err := client.Sys().RaftAutopilotState()
309+
require.NoError(t, err)
310+
core1healthy = state.Servers["core-1"].Healthy
311+
core2healthy = state.Servers["core-2"].Healthy
312+
time.Sleep(1 * time.Second)
313+
}
314+
if !core1healthy || core2healthy {
315+
t.Fatalf("expected health: core1=true and core2=false, got: core1=%v, core2=%v", core1healthy, core2healthy)
316+
}
317+
318+
time.Sleep(2 * time.Second) // wait for reconciliation
319+
state, err = client.Sys().RaftAutopilotState()
320+
require.NoError(t, err)
321+
require.Equal(t, []string{"core-0", "core-1"}, state.Voters)
322+
323+
for time.Now().Before(core2shouldBeHealthyAt) {
324+
state, err := client.Sys().RaftAutopilotState()
325+
require.NoError(t, err)
326+
core2healthy = state.Servers["core-2"].Healthy
327+
time.Sleep(1 * time.Second)
328+
t.Log(core2healthy)
329+
}
330+
331+
deadline = time.Now().Add(10 * time.Second)
332+
for time.Now().Before(deadline) {
333+
state, err = client.Sys().RaftAutopilotState()
334+
if err != nil {
335+
t.Fatal(err)
336+
}
337+
if strutil.EquivalentSlices(state.Voters, []string{"core-0", "core-1", "core-2"}) {
338+
break
339+
}
340+
}
341+
require.Equal(t, state.Voters, []string{"core-0", "core-1", "core-2"})
342+
}

vault/external_tests/raft/raft_test.go

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,29 +5,24 @@ import (
55
"context"
66
"crypto/md5"
77
"fmt"
8+
"github.com/hashicorp/vault/api"
9+
"github.com/hashicorp/vault/sdk/logical"
810
"io/ioutil"
911
"net/http"
1012
"strings"
11-
"sync"
1213
"sync/atomic"
1314
"testing"
1415
"time"
1516

1617
"github.com/hashicorp/go-cleanhttp"
17-
"github.com/hashicorp/go-hclog"
1818
uuid "github.com/hashicorp/go-uuid"
19-
"github.com/hashicorp/vault/api"
2019
credUserpass "github.com/hashicorp/vault/builtin/credential/userpass"
2120
"github.com/hashicorp/vault/helper/namespace"
2221
"github.com/hashicorp/vault/helper/testhelpers"
2322
"github.com/hashicorp/vault/helper/testhelpers/teststorage"
2423
vaulthttp "github.com/hashicorp/vault/http"
2524
"github.com/hashicorp/vault/physical/raft"
26-
"github.com/hashicorp/vault/sdk/helper/logging"
27-
28-
"github.com/hashicorp/vault/sdk/logical"
2925
"github.com/hashicorp/vault/vault"
30-
vaultcluster "github.com/hashicorp/vault/vault/cluster"
3126
"github.com/stretchr/testify/require"
3227
"golang.org/x/net/http2"
3328
)
@@ -55,20 +50,7 @@ func raftCluster(t testing.TB, ropts *RaftClusterOpts) *vault.TestCluster {
5550
var opts = vault.TestClusterOptions{
5651
HandlerFunc: vaulthttp.Handler,
5752
}
58-
opts.Logger = logging.NewVaultLogger(hclog.Trace).Named(t.Name())
59-
60-
if ropts.InmemCluster {
61-
inmemCluster, err := vaultcluster.NewInmemLayerCluster("inmem-cluster", 3, hclog.New(&hclog.LoggerOptions{
62-
Mutex: &sync.Mutex{},
63-
Level: hclog.Trace,
64-
Name: "inmem-cluster",
65-
}))
66-
if err != nil {
67-
t.Fatal(err)
68-
}
69-
opts.ClusterLayers = inmemCluster
70-
}
71-
53+
opts.InmemClusterLayers = ropts.InmemCluster
7254
opts.PhysicalFactoryConfig = ropts.PhysicalFactoryConfig
7355
conf.DisablePerformanceStandby = ropts.DisablePerfStandby
7456

vault/testing.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1081,6 +1081,9 @@ type TestClusterOptions struct {
10811081

10821082
// ClusterLayers are used to override the default cluster connection layer
10831083
ClusterLayers cluster.NetworkLayerSet
1084+
// InmemClusterLayers is a shorthand way of asking for ClusterLayers to be
1085+
// built using the inmem implementation.
1086+
InmemClusterLayers bool
10841087

10851088
// RaftAddressProvider is used to set the raft ServerAddressProvider on
10861089
// each core.
@@ -1561,6 +1564,17 @@ func NewTestCluster(t testing.T, base *CoreConfig, opts *TestClusterOptions) *Te
15611564
testCluster.pubKey = pubKey
15621565
testCluster.priKey = priKey
15631566

1567+
if opts != nil && opts.InmemClusterLayers {
1568+
if opts.ClusterLayers != nil {
1569+
t.Fatalf("cannot specify ClusterLayers when InmemClusterLayers is true")
1570+
}
1571+
inmemCluster, err := cluster.NewInmemLayerCluster("inmem-cluster", numCores, testCluster.Logger.Named("inmem-cluster"))
1572+
if err != nil {
1573+
t.Fatal(err)
1574+
}
1575+
opts.ClusterLayers = inmemCluster
1576+
}
1577+
15641578
// Create cores
15651579
testCluster.cleanupFuncs = []func(){}
15661580
cores := []*Core{}

0 commit comments

Comments (0)