Skip to content

Commit 8fc1d5b

Browse files
pompon0masih
andauthored
fixed autorestart cooldown (#2887)
There was a semantic mismatch between blocksync sending at most 1 message to the restartCh, and WaitForQuitSignals which ignored messages until cooldown has passed. I think that cooldown in blocksync logic was exactly trying to avoid sending messages when WaitForQuitSignals could have ignored them, which is a very fragile way of doing things. This PR removes cooldown from WaitForQuitSignals, simply delegating the cooldown logic to blocksync and makes the signal nonblocking and idempotent. --------- Co-authored-by: Masih H. Derkani <m@derkani.org>
1 parent 243cc2a commit 8fc1d5b

File tree

17 files changed

+114
-211
lines changed

17 files changed

+114
-211
lines changed

sei-cosmos/server/start.go

Lines changed: 20 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ package server
44

55
import (
66
"context"
7+
"errors"
78
"fmt"
89
"net/http"
10+
"sync"
911
//nolint:gosec,G108
1012
_ "net/http/pprof"
1113
"os"
@@ -175,9 +177,6 @@ is performed. Note, when enabled, gRPC will also be automatically enabled.
175177
tracerProviderOptions = []trace.TracerProviderOption{}
176178
}
177179

178-
// amino is needed here for backwards compatibility of REST routes
179-
exitCode := RestartErrorCode
180-
181180
serverCtx.Logger.Info("Creating node metrics provider")
182181
nodeMetricsProvider := node.DefaultMetricsProvider(serverCtx.Config.Instrumentation)(clientCtx.ChainID)
183182

@@ -193,33 +192,21 @@ is performed. Note, when enabled, gRPC will also be automatically enabled.
193192
}
194193
}
195194

196-
restartCoolDownDuration := time.Second * time.Duration(serverCtx.Config.SelfRemediation.RestartCooldownSeconds)
197-
// Set the first restart time to be now - restartCoolDownDuration so that the first restart can trigger whenever
198-
canRestartAfter := time.Now().Add(-restartCoolDownDuration)
199-
200195
serverCtx.Logger.Info("Starting Process")
201196
for {
202-
err = startInProcess(
197+
err := startInProcess(
203198
serverCtx,
204199
clientCtx,
205200
appCreator,
206201
tracerProviderOptions,
207202
nodeMetricsProvider,
208203
apiMetrics,
209-
canRestartAfter,
210204
)
211-
errCode, ok := err.(ErrorCode)
212-
exitCode = errCode.Code
213-
if !ok {
205+
if !errors.Is(err, ErrShouldRestart) {
214206
return err
215207
}
216-
if exitCode != RestartErrorCode {
217-
break
218-
}
219208
serverCtx.Logger.Info("restarting node...")
220-
canRestartAfter = time.Now().Add(restartCoolDownDuration)
221209
}
222-
return nil
223210
},
224211
}
225212

@@ -283,7 +270,6 @@ func startInProcess(
283270
tracerProviderOptions []trace.TracerProviderOption,
284271
nodeMetricsProvider *node.NodeMetrics,
285272
apiMetrics *telemetry.Metrics,
286-
canRestartAfter time.Time,
287273
) error {
288274
cfg := ctx.Config
289275
home := cfg.RootDir
@@ -330,13 +316,20 @@ func startInProcess(
330316
}
331317
app := appCreator(ctx.Logger, db, traceWriter, ctx.Config, ctx.Viper)
332318

333-
var (
334-
tmNode service.Service
335-
restartCh chan struct{}
336-
gRPCOnly = ctx.Viper.GetBool(flagGRPCOnly)
337-
)
319+
gRPCOnly := ctx.Viper.GetBool(flagGRPCOnly)
320+
var tmNode service.Service
338321

339-
restartCh = make(chan struct{})
322+
var restartMtx sync.Mutex
323+
restartCh := make(chan struct{})
324+
restartEvent := func() {
325+
restartMtx.Lock()
326+
defer restartMtx.Unlock()
327+
select {
328+
case <-restartCh:
329+
default:
330+
close(restartCh)
331+
}
332+
}
340333

341334
if gRPCOnly {
342335
ctx.Logger.Info("starting node in gRPC only mode; Tendermint is disabled")
@@ -361,7 +354,7 @@ func startInProcess(
361354
goCtx,
362355
ctx.Config,
363356
ctx.Logger,
364-
restartCh,
357+
restartEvent,
365358
abciclient.NewLocalClient(ctx.Logger, app),
366359
gen,
367360
tracerProviderOptions,
@@ -434,7 +427,7 @@ func startInProcess(
434427
// we do not need to start Rosetta or handle any Tendermint related processes.
435428
if gRPCOnly {
436429
// wait for signal capture and gracefully return
437-
return WaitForQuitSignals(ctx, restartCh, canRestartAfter)
430+
return WaitForQuitSignals(goCtx, restartCh)
438431
}
439432

440433
var rosettaSrv crgserver.Server
@@ -507,5 +500,5 @@ func startInProcess(
507500
}()
508501

509502
// wait for signal capture and gracefully return
510-
return WaitForQuitSignals(ctx, restartCh, canRestartAfter)
503+
return WaitForQuitSignals(goCtx, restartCh)
511504
}

sei-cosmos/server/util.go

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package server
22

33
import (
4+
"context"
45
"errors"
56
"fmt"
67
"io"
@@ -39,8 +40,7 @@ import (
3940
// a command's Context.
4041
const ServerContextKey = sdk.ContextKey("server.context")
4142

42-
// Error code reserved for signalled
43-
const RestartErrorCode = 100
43+
var ErrShouldRestart = errors.New("node should be restarted")
4444

4545
// server context
4646
type Context struct {
@@ -135,7 +135,7 @@ func InterceptConfigs(cmd *cobra.Command) (*tmcfg.Config, error) {
135135
// is used to read and parse the application configuration. Command handlers can
136136
// fetch the server Context to get the Tendermint configuration or to get access
137137
// to Viper.
138-
func InterceptConfigsPreRunHandler(cmd *cobra.Command, customAppConfigTemplate string, customAppConfig interface{}) error {
138+
func InterceptConfigsPreRunHandler(cmd *cobra.Command, customAppConfigTemplate string, customAppConfig any) error {
139139
serverCtx := NewDefaultContext()
140140

141141
// Get the executable name and configure the viper instance so that environmental
@@ -221,7 +221,7 @@ func SetCmdServerContext(cmd *cobra.Command, serverCtx *Context) error {
221221
// configuration file. The Tendermint configuration file is parsed given a root
222222
// Viper object, whereas the application is parsed with the private package-aware
223223
// viperCfg object.
224-
func interceptConfigs(rootViper *viper.Viper, customAppTemplate string, customConfig interface{}) (*tmcfg.Config, error) {
224+
func interceptConfigs(rootViper *viper.Viper, customAppTemplate string, customConfig any) (*tmcfg.Config, error) {
225225
rootDir := rootViper.GetString(flags.FlagHome)
226226
configPath := filepath.Join(rootDir, "config")
227227
tmCfgFile := filepath.Join(configPath, "config.toml")
@@ -408,26 +408,14 @@ func TrapSignal(cleanupFunc func()) {
408408
}
409409

410410
// WaitForQuitSignals waits for SIGINT and SIGTERM and returns.
411-
func WaitForQuitSignals(ctx *Context, restartCh chan struct{}, canRestartAfter time.Time) ErrorCode {
412-
sigs := make(chan os.Signal, 1)
413-
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
414-
if restartCh != nil {
415-
for {
416-
select {
417-
case sig := <-sigs:
418-
return ErrorCode{Code: int(sig.(syscall.Signal)) + 128}
419-
case <-restartCh:
420-
// If it's in the restart cooldown period
421-
if time.Now().Before(canRestartAfter) {
422-
ctx.Logger.Info("Restarting too frequently, can only restart after %s", canRestartAfter)
423-
continue
424-
}
425-
return ErrorCode{Code: RestartErrorCode}
426-
}
427-
}
428-
} else {
429-
sig := <-sigs
430-
return ErrorCode{Code: int(sig.(syscall.Signal)) + 128}
411+
func WaitForQuitSignals(ctx context.Context, restartCh chan struct{}) error {
412+
ctx, cancel := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM)
413+
defer cancel()
414+
select {
415+
case <-ctx.Done():
416+
return nil
417+
case <-restartCh: // blocks forever on a nil channel
418+
return ErrShouldRestart
431419
}
432420
}
433421

sei-cosmos/server/util_test.go

Lines changed: 13 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,13 @@ import (
55
"errors"
66
"fmt"
77
"os"
8-
"os/signal"
98
"path"
109
"path/filepath"
1110
"strings"
1211
"syscall"
1312
"testing"
1413
"time"
1514

16-
"github.com/sei-protocol/sei-chain/sei-tendermint/libs/log"
1715
"github.com/spf13/cobra"
1816
"go.opentelemetry.io/otel/sdk/trace"
1917

@@ -103,8 +101,7 @@ func TestInterceptConfigsPreRunHandlerReadsConfigToml(t *testing.T) {
103101
t.Fatalf("creating config.toml file failed: %v", err)
104102
}
105103

106-
_, err = writer.WriteString(fmt.Sprintf("db-backend = '%s'\n", testDbBackend))
107-
if err != nil {
104+
if _, err := fmt.Fprintf(writer, "db-backend = '%s'\n", testDbBackend); err != nil {
108105
t.Fatalf("Failed writing string to config.toml: %v", err)
109106
}
110107

@@ -144,8 +141,7 @@ func TestInterceptConfigsPreRunHandlerReadsAppToml(t *testing.T) {
144141
t.Fatalf("creating app.toml file failed: %v", err)
145142
}
146143

147-
_, err = writer.WriteString(fmt.Sprintf("halt-time = %d\n", testHaltTime))
148-
if err != nil {
144+
if _, err := fmt.Fprintf(writer, "halt-time = %d\n", testHaltTime); err != nil {
149145
t.Fatalf("Failed writing string to app.toml: %v", err)
150146
}
151147

@@ -308,8 +304,7 @@ func (v precedenceCommon) setAll(t *testing.T, setFlag *string, setEnvVar *strin
308304
t.Fatalf("creating config.toml file failed: %v", err)
309305
}
310306

311-
_, err = writer.WriteString(fmt.Sprintf("[rpc]\nladdr = \"%s\"\n", *setConfigFile))
312-
if err != nil {
307+
if _, err := fmt.Fprintf(writer, "[rpc]\nladdr = \"%s\"\n", *setConfigFile); err != nil {
313308
t.Fatalf("Failed writing string to config.toml: %v", err)
314309
}
315310

@@ -407,63 +402,28 @@ func TestInterceptConfigsWithBadPermissions(t *testing.T) {
407402
}
408403

409404
func TestWaitForQuitSignals(t *testing.T) {
410-
t.Run("WithRestartChannelAndCanRestartAfterNotReached", func(t *testing.T) {
405+
t.Run("WithRestartChannelAndCanRestart", func(t *testing.T) {
411406
restartCh := make(chan struct{})
412407
go func() {
413408
time.Sleep(100 * time.Millisecond)
414409
restartCh <- struct{}{}
415410
}()
416411

417-
go func() {
418-
time.Sleep(200 * time.Millisecond)
419-
syscall.Kill(syscall.Getpid(), syscall.SIGTERM)
420-
}()
421-
422-
errCode := server.WaitForQuitSignals(
423-
&server.Context{Logger: log.NewNopLogger()},
424-
restartCh,
425-
time.Now().Add(500*time.Millisecond),
426-
)
427-
expectedCode := int(syscall.SIGTERM) + 128
428-
if errCode.Code != expectedCode {
429-
t.Errorf("Expected error code %d, got %d", expectedCode, errCode.Code)
430-
}
431-
})
432-
433-
t.Run("WithRestartChannelAndCanRestartAfterReached", func(t *testing.T) {
434-
restartCh := make(chan struct{})
435-
go func() {
436-
time.Sleep(100 * time.Millisecond)
437-
restartCh <- struct{}{}
438-
}()
439-
440-
errCode := server.WaitForQuitSignals(
441-
&server.Context{Logger: log.NewNopLogger()},
442-
restartCh,
443-
time.Now().Add(-100*time.Millisecond),
444-
)
445-
if errCode.Code != server.RestartErrorCode {
446-
t.Errorf("Expected error code %d, got %d", server.RestartErrorCode, errCode.Code)
412+
err := server.WaitForQuitSignals(t.Context(), restartCh)
413+
if !errors.Is(err, server.ErrShouldRestart) {
414+
t.Errorf("Expected ErrShouldRestart, got %v", err)
447415
}
448416
})
449417

450418
t.Run("WithSIGINT", func(t *testing.T) {
451-
sigs := make(chan os.Signal, 1)
452-
signal.Notify(sigs, syscall.SIGINT)
453-
454419
go func() {
455420
time.Sleep(100 * time.Millisecond)
456421
syscall.Kill(syscall.Getpid(), syscall.SIGINT)
457422
}()
458423

459-
errCode := server.WaitForQuitSignals(
460-
&server.Context{Logger: log.NewNopLogger()},
461-
make(chan struct{}),
462-
time.Now(),
463-
)
464-
expectedCode := int(syscall.SIGINT) + 128
465-
if errCode.Code != expectedCode {
466-
t.Errorf("Expected error code %d, got %d", expectedCode, errCode.Code)
424+
err := server.WaitForQuitSignals(t.Context(), make(chan struct{}))
425+
if err != nil {
426+
t.Fatal(err)
467427
}
468428
})
469429

@@ -473,14 +433,9 @@ func TestWaitForQuitSignals(t *testing.T) {
473433
syscall.Kill(syscall.Getpid(), syscall.SIGTERM)
474434
}()
475435

476-
errCode := server.WaitForQuitSignals(
477-
&server.Context{Logger: log.NewNopLogger()},
478-
make(chan struct{}),
479-
time.Now(),
480-
)
481-
expectedCode := int(syscall.SIGTERM) + 128
482-
if errCode.Code != expectedCode {
483-
t.Errorf("Expected error code %d, got %d", expectedCode, errCode.Code)
436+
err := server.WaitForQuitSignals(t.Context(), make(chan struct{}))
437+
if err != nil {
438+
t.Fatal(err)
484439
}
485440
})
486441
}

sei-cosmos/testutil/network/util.go

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ package network
22

33
import (
44
"encoding/json"
5-
"io/ioutil"
5+
"os"
66
"path/filepath"
77
"time"
88

@@ -56,7 +56,7 @@ func startInProcess(cfg Config, val *Validator) error {
5656
val.GoCtx,
5757
tmCfg,
5858
logger,
59-
make(chan struct{}),
59+
func() {},
6060
abciclient.NewLocalClient(logger, app),
6161
defaultGensis,
6262
[]trace.TracerProviderOption{},
@@ -211,15 +211,9 @@ func writeFile(name string, dir string, contents []byte) error {
211211
writePath := filepath.Join(dir)
212212
file := filepath.Join(writePath, name)
213213

214-
err := tmos.EnsureDir(writePath, 0755)
215-
if err != nil {
216-
return err
217-
}
218-
219-
err = ioutil.WriteFile(file, contents, 0644)
220-
if err != nil {
214+
if err := tmos.EnsureDir(writePath, 0755); err != nil {
221215
return err
222216
}
223217

224-
return nil
218+
return os.WriteFile(file, contents, 0644)
225219
}

0 commit comments

Comments
 (0)