From cfb2b16bfe91fb91dda7521f611dd32dd8b7895f Mon Sep 17 00:00:00 2001 From: dsabsay Date: Sat, 17 May 2025 19:28:30 -0700 Subject: [PATCH 1/2] Fix data race in TestRecoverAlertsPostOutage Previously, TestRecoverAlertsPostOutage had a data race: ``` ================== WARNING: DATA RACE Read at 0x00c025bbfc30 by goroutine 56074: github.com/prometheus/prometheus/rules.(*Group).Eval.func1() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/group.go:569 +0xc8a github.com/prometheus/prometheus/rules.(*Group).Eval() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/group.go:666 +0x4e5 github.com/prometheus/prometheus/rules.DefaultEvalIterationFunc() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/manager.go:81 +0x1a5 github.com/cortexproject/cortex/pkg/ruler.ruleGroupIterationFunc() /Users/danielsabsay/git/cortex/pkg/ruler/manager.go:272 +0x4a9 github.com/prometheus/prometheus/rules.(*Group).run() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/group.go:256 +0x6a7 github.com/prometheus/prometheus/rules.(*Manager).Update.func1() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/manager.go:258 +0x11b github.com/prometheus/prometheus/rules.(*Manager).Update.gowrap2() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/manager.go:259 +0x41 Previous write at 0x00c025bbfc30 by goroutine 48: github.com/prometheus/prometheus/rules.(*Group).Eval.func1.2() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/group.go:580 +0x35e runtime.deferreturn() /Users/danielsabsay/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.24.0.darwin-amd64/src/runtime/panic.go:605 +0x5d github.com/prometheus/prometheus/rules.(*Group).Eval() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/group.go:666 +0x4e5 github.com/cortexproject/cortex/pkg/ruler.TestRecoverAlertsPostOutage() /Users/danielsabsay/git/cortex/pkg/ruler/ruler_test.go:2823 +0x2124 github.com/cortexproject/cortex/pkg/ruler.TestRecoverAlertsPostOutage_check_races() /Users/danielsabsay/git/cortex/pkg/ruler/ruler_race_test.go:9 +0x30 testing.tRunner() /Users/danielsabsay/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.24.0.darwin-amd64/src/testing/testing.go:1792 +0x225 testing.(*T).Run.gowrap1() /Users/danielsabsay/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.24.0.darwin-amd64/src/testing/testing.go:1851 +0x44 Goroutine 56074 (running) created at: github.com/prometheus/prometheus/rules.(*Manager).Update() /Users/danielsabsay/git/cortex/vendor/github.com/prometheus/prometheus/rules/manager.go:248 +0x69c github.com/cortexproject/cortex/pkg/ruler.(*DefaultMultiTenantManager).syncRulesToManager() /Users/danielsabsay/git/cortex/pkg/ruler/manager.go:217 +0xa5c github.com/cortexproject/cortex/pkg/ruler.(*DefaultMultiTenantManager).SyncRuleGroups() /Users/danielsabsay/git/cortex/pkg/ruler/manager.go:140 +0x1da github.com/cortexproject/cortex/pkg/ruler.(*Ruler).syncRules() /Users/danielsabsay/git/cortex/pkg/ruler/ruler.go:728 +0x71b github.com/cortexproject/cortex/pkg/ruler.TestRecoverAlertsPostOutage() /Users/danielsabsay/git/cortex/pkg/ruler/ruler_test.go:2781 +0x130b github.com/cortexproject/cortex/pkg/ruler.TestRecoverAlertsPostOutage_check_races() /Users/danielsabsay/git/cortex/pkg/ruler/ruler_race_test.go:9 +0x30 testing.tRunner() /Users/danielsabsay/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.24.0.darwin-amd64/src/testing/testing.go:1792 +0x225 testing.(*T).Run.gowrap1() /Users/danielsabsay/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.24.0.darwin-amd64/src/testing/testing.go:1851 +0x44 Goroutine 48 (running) created at: testing.(*T).Run() /Users/danielsabsay/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.24.0.darwin-amd64/src/testing/testing.go:1851 +0x8f2 testing.runTests.func1() /Users/danielsabsay/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.24.0.darwin-amd64/src/testing/testing.go:2279 +0x85 testing.tRunner() /Users/danielsabsay/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.24.0.darwin-amd64/src/testing/testing.go:1792 +0x225 testing.runTests() /Users/danielsabsay/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.24.0.darwin-amd64/src/testing/testing.go:2277 +0x96c testing.(*M).Run() /Users/danielsabsay/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.24.0.darwin-amd64/src/testing/testing.go:2142 +0xeea main.main() _testmain.go:167 +0x164 ================== ``` As can be seen above, the data race occurs because the rule evaluation is happening from two different goroutines: 1 that is scheduled by the normal ruler manager and 1 that is run by the test itself. The test itself calls Eval() with specific timestamps to test recovery logic. For purposes of this test, we don't want the normal evaluation loop to run at all. This commit injects a no-op GroupEvalIterationFunc into the ruler manager so that the only rule evaluation that happens is run by the test. Signed-off-by: dsabsay --- pkg/ruler/manager.go | 20 ++++++++++++++++---- pkg/ruler/ruler_test.go | 28 ++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index 4d7574cfebb..e3396cf862f 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -64,6 +64,8 @@ type DefaultMultiTenantManager struct { ruleCache map[string][]*promRules.Group ruleCacheMtx sync.RWMutex syncRuleMtx sync.Mutex + + ruleGroupIterationFunc promRules.GroupEvalIterationFunc } func NewDefaultMultiTenantManager(cfg Config, limits RulesLimits, managerFactory ManagerFactory, evalMetrics *RuleEvalMetrics, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) { @@ -122,8 +124,9 @@ func NewDefaultMultiTenantManager(cfg Config, limits RulesLimits, managerFactory Name: "ruler_config_updates_total", Help: "Total number of config updates triggered by a user", }, []string{"user"}), - registry: reg, - logger: logger, + registry: reg, + logger: logger, + ruleGroupIterationFunc: defaultRuleGroupIterationFunc, } if cfg.RulesBackupEnabled() { m.rulesBackupManager = newRulesBackupManager(cfg, logger, reg) @@ -131,6 +134,15 @@ func NewDefaultMultiTenantManager(cfg Config, limits RulesLimits, managerFactory return m, nil } +func NewDefaultMultiTenantManagerWithIterationFunc(iterFunc promRules.GroupEvalIterationFunc, cfg Config, limits RulesLimits, managerFactory ManagerFactory, evalMetrics *RuleEvalMetrics, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) { + manager, err := NewDefaultMultiTenantManager(cfg, limits, managerFactory, evalMetrics, reg, logger) + if err != nil { + return nil, err + } + manager.ruleGroupIterationFunc = iterFunc + return manager, nil +} + func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGroups map[string]rulespb.RuleGroupList) { // this is a safety lock to ensure this method is executed sequentially r.syncRuleMtx.Lock() @@ -214,7 +226,7 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user if (rulesUpdated || externalLabelsUpdated) && existing { r.updateRuleCache(user, manager.RuleGroups()) } - err = manager.Update(r.cfg.EvaluationInterval, files, externalLabels, r.cfg.ExternalURL.String(), ruleGroupIterationFunc) + err = manager.Update(r.cfg.EvaluationInterval, files, externalLabels, r.cfg.ExternalURL.String(), r.ruleGroupIterationFunc) r.deleteRuleCache(user) if err != nil { r.lastReloadSuccessful.WithLabelValues(user).Set(0) @@ -257,7 +269,7 @@ func (r *DefaultMultiTenantManager) createRulesManager(user string, ctx context. return manager } -func ruleGroupIterationFunc(ctx context.Context, g *promRules.Group, evalTimestamp time.Time) { +func defaultRuleGroupIterationFunc(ctx context.Context, g *promRules.Group, evalTimestamp time.Time) { logMessage := []interface{}{ "component", "ruler", "rule_group", g.Name(), diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index ec7eb287c30..f40f0dbf03b 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -340,6 +340,26 @@ func buildRuler(t *testing.T, rulerConfig Config, querierTestConfig *querier.Tes return ruler, manager } +func buildRulerWithIterFunc(t *testing.T, rulerConfig Config, querierTestConfig *querier.TestConfig, store rulestore.RuleStore, rulerAddrMap map[string]*Ruler, ruleGroupIterFunc promRules.GroupEvalIterationFunc) (*Ruler, *DefaultMultiTenantManager) { + engine, queryable, pusher, logger, overrides, reg := testSetup(t, querierTestConfig) + metrics := NewRuleEvalMetrics(rulerConfig, reg) + managerFactory := DefaultTenantManagerFactory(rulerConfig, pusher, queryable, engine, overrides, metrics, reg) + manager, err := NewDefaultMultiTenantManagerWithIterationFunc(ruleGroupIterFunc, rulerConfig, &ruleLimits{}, managerFactory, metrics, reg, log.NewNopLogger()) + require.NoError(t, err) + + ruler, err := newRuler( + rulerConfig, + manager, + reg, + logger, + store, + overrides, + newMockClientsPool(rulerConfig, logger, reg, rulerAddrMap), + ) + require.NoError(t, err) + return ruler, manager +} + func newTestRuler(t *testing.T, rulerConfig Config, store rulestore.RuleStore, querierTestConfig *querier.TestConfig) *Ruler { ruler, _ := buildRuler(t, rulerConfig, querierTestConfig, store, nil) require.NoError(t, services.StartAndAwaitRunning(context.Background(), ruler)) @@ -2776,8 +2796,12 @@ func TestRecoverAlertsPostOutage(t *testing.T) { querier.UseAlwaysQueryable(newEmptyQueryable()), } - // create a ruler but don't start it. instead, we'll evaluate the rule groups manually. - r, _ := buildRuler(t, rulerCfg, &querier.TestConfig{Cfg: querierConfig, Distributor: d, Stores: queryables}, store, nil) + // Define a no-op GroupEvalIterationFunc to avoid races between the scheduled Eval() execution and the evaluations invoked by this test. + evalFunc := func(ctx context.Context, g *promRules.Group, evalTimestamp time.Time) { + return + } + + r, _ := buildRulerWithIterFunc(t, rulerCfg, &querier.TestConfig{Cfg: querierConfig, Distributor: d, Stores: queryables}, store, nil, evalFunc) r.syncRules(context.Background(), rulerSyncReasonInitial) // assert initial state of rule group From d3211c37596c8906f1e46db71628f4aa743f546f Mon Sep 17 00:00:00 2001 From: dsabsay Date: Sat, 17 May 2025 19:59:02 -0700 Subject: [PATCH 2/2] Fix linting error Signed-off-by: dsabsay --- pkg/ruler/ruler_test.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index f40f0dbf03b..61757377027 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -2797,9 +2797,7 @@ func TestRecoverAlertsPostOutage(t *testing.T) { } // Define a no-op GroupEvalIterationFunc to avoid races between the scheduled Eval() execution and the evaluations invoked by this test. - evalFunc := func(ctx context.Context, g *promRules.Group, evalTimestamp time.Time) { - return - } + evalFunc := func(ctx context.Context, g *promRules.Group, evalTimestamp time.Time) {} r, _ := buildRulerWithIterFunc(t, rulerCfg, &querier.TestConfig{Cfg: querierConfig, Distributor: d, Stores: queryables}, store, nil, evalFunc) r.syncRules(context.Background(), rulerSyncReasonInitial)