Alerting: Refactor `Run` of the scheduler (#37157)

* Alerting: Refactor `Run` of the scheduler

A bit of a refactor to make the diff easier to read when we add support
for external Alertmanagers.

We'll introduce another routine that checks the database for
configuration and spawns other routines accordingly.

* Block the wait.

* Fix test
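
As a rough illustration of the follow-up described above, the configuration-checking routine could look something like the sketch below. Everything in it is hypothetical and invented for the example (the orgConfig type, the fetch/apply callbacks, the one-minute poll interval); the real implementation lands in later PRs.

package sketch

import (
	"context"
	"log"
	"time"
)

// orgConfig is a placeholder for a per-organisation Alertmanager configuration.
type orgConfig struct {
	OrgID int64
	Raw   string
}

// configSync polls for the latest configurations and hands each one to apply,
// which would spawn or refresh the corresponding routine. It blocks until the
// context is cancelled.
func configSync(ctx context.Context, fetch func(context.Context) ([]orgConfig, error), apply func(orgConfig)) error {
	ticker := time.NewTicker(time.Minute) // assumed poll interval
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			configs, err := fetch(ctx)
			if err != nil {
				log.Println("failed to fetch configurations:", err)
				continue
			}
			for _, cfg := range configs {
				apply(cfg)
			}
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}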
gotjosh authored 4 years ago, committed by GitHub
commit 442a6677fc (parent 0c804df763)
3 files changed:
* pkg/services/ngalert/ngalert.go (4 changed lines)
* pkg/services/ngalert/schedule/schedule.go (246 changed lines)
* pkg/services/ngalert/schedule/schedule_test.go (2 changed lines)

@@ -110,14 +110,14 @@ func (ng *AlertNG) Init() error {
return nil
}
// Run starts the scheduler.
// Run starts the scheduler and Alertmanager.
func (ng *AlertNG) Run(ctx context.Context) error {
ng.Log.Debug("ngalert starting")
ng.stateManager.Warm()
children, subCtx := errgroup.WithContext(ctx)
children.Go(func() error {
return ng.schedule.Ticker(subCtx)
return ng.schedule.Run(subCtx)
})
children.Go(func() error {
return ng.Alertmanager.Run(subCtx)
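
For readers less familiar with errgroup: the hunk above runs the scheduler and the Alertmanager under one group, so the first routine to fail cancels the shared subCtx and its error is what Run eventually returns from Wait. A minimal, generic sketch of that pattern (not Grafana code):

package sketch

import (
	"context"

	"golang.org/x/sync/errgroup"
)

// runBoth starts two long-running workers that share a context; when either
// returns an error, the other's context is cancelled, and Wait returns the
// first non-nil error once both have exited.
func runBoth(ctx context.Context, scheduler, alertmanager func(context.Context) error) error {
	children, subCtx := errgroup.WithContext(ctx)
	children.Go(func() error { return scheduler(subCtx) })
	children.Go(func() error { return alertmanager(subCtx) })
	return children.Wait()
}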

@@ -25,7 +25,7 @@ var timeNow = time.Now
// ScheduleService handles scheduling
type ScheduleService interface {
Ticker(context.Context) error
Run(context.Context) error
Pause() error
Unpause() error
@@ -41,6 +41,8 @@ type Notifier interface {
}
type schedule struct {
wg sync.WaitGroup
// base tick rate (fastest possible configured check)
baseInterval time.Duration
@@ -67,11 +69,9 @@ type schedule struct {
evaluator eval.Evaluator
ruleStore store.RuleStore
ruleStore store.RuleStore
instanceStore store.InstanceStore
dataService *tsdb.Service
dataService *tsdb.Service
stateManager *state.Manager
@@ -120,30 +120,6 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service, appURL string, st
return &sch
}
func (sch *schedule) overrideCfg(cfg SchedulerCfg) {
sch.clock = cfg.C
sch.baseInterval = cfg.BaseInterval
sch.heartbeat = alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, int64(cfg.BaseInterval.Seconds()))
sch.evalAppliedFunc = cfg.EvalAppliedFunc
sch.stopAppliedFunc = cfg.StopAppliedFunc
}
func (sch *schedule) evalApplied(alertDefKey models.AlertRuleKey, now time.Time) {
if sch.evalAppliedFunc == nil {
return
}
sch.evalAppliedFunc(alertDefKey, now)
}
func (sch *schedule) stopApplied(alertDefKey models.AlertRuleKey) {
if sch.stopAppliedFunc == nil {
return
}
sch.stopAppliedFunc(alertDefKey)
}
func (sch *schedule) Pause() error {
if sch == nil {
return fmt.Errorf("scheduler is not initialised")
@@ -162,94 +138,23 @@ func (sch *schedule) Unpause() error {
return nil
}
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRuleKey, evalCh <-chan *evalContext, stopCh <-chan struct{}) error {
sch.log.Debug("alert rule routine started", "key", key)
evalRunning := false
var attempt int64
var alertRule *models.AlertRule
for {
select {
case ctx := <-evalCh:
if evalRunning {
continue
}
evaluate := func(attempt int64) error {
start := timeNow()
// fetch latest alert rule version
if alertRule == nil || alertRule.Version < ctx.version {
q := models.GetAlertRuleByUIDQuery{OrgID: key.OrgID, UID: key.UID}
err := sch.ruleStore.GetAlertRuleByUID(&q)
if err != nil {
sch.log.Error("failed to fetch alert rule", "key", key)
return err
}
alertRule = q.Result
sch.log.Debug("new alert rule version fetched", "title", alertRule.Title, "key", key, "version", alertRule.Version)
}
condition := models.Condition{
Condition: alertRule.Condition,
OrgID: alertRule.OrgID,
Data: alertRule.Data,
}
results, err := sch.evaluator.ConditionEval(&condition, ctx.now, sch.dataService)
var (
end = timeNow()
tenant = fmt.Sprint(alertRule.OrgID)
dur = end.Sub(start).Seconds()
)
sch.metrics.EvalTotal.WithLabelValues(tenant).Inc()
sch.metrics.EvalDuration.WithLabelValues(tenant).Observe(dur)
if err != nil {
sch.metrics.EvalFailures.WithLabelValues(tenant).Inc()
// consider saving alert instance on error
sch.log.Error("failed to evaluate alert rule", "title", alertRule.Title,
"key", key, "attempt", attempt, "now", ctx.now, "duration", end.Sub(start), "error", err)
return err
}
processedStates := sch.stateManager.ProcessEvalResults(alertRule, results)
sch.saveAlertStates(processedStates)
alerts := FromAlertStateToPostableAlerts(sch.log, processedStates, sch.stateManager, sch.appURL)
sch.log.Debug("sending alerts to notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts)
err = sch.sendAlerts(alerts)
if err != nil {
sch.log.Error("failed to put alerts in the notifier", "count", len(alerts.PostableAlerts), "err", err)
}
return nil
}
func() {
evalRunning = true
defer func() {
evalRunning = false
sch.evalApplied(key, ctx.now)
}()
func (sch *schedule) Run(ctx context.Context) error {
sch.wg.Add(1)
for attempt = 0; attempt < sch.maxAttempts; attempt++ {
err := evaluate(attempt)
if err == nil {
break
}
}
}()
case <-stopCh:
sch.stopApplied(key)
sch.log.Debug("stopping alert rule routine", "key", key)
// interrupt evaluation if it's running
return nil
case <-grafanaCtx.Done():
return grafanaCtx.Err()
go func() {
if err := sch.ruleEvaluationLoop(ctx); err != nil {
sch.log.Error("failure while running the rule evaluation loop", "err", err)
}
}
}()
sch.wg.Wait()
return nil
}
func (sch *schedule) Ticker(grafanaCtx context.Context) error {
dispatcherGroup, ctx := errgroup.WithContext(grafanaCtx)
func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
defer sch.wg.Done()
dispatcherGroup, ctx := errgroup.WithContext(ctx)
for {
select {
case tick := <-sch.heartbeat.C:
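
Because the hunk above interleaves the deleted ruleRoutine lines with the new code, the replacement entry point is easier to read assembled on its own. Pieced together from the added lines (indentation approximate), the new Run is roughly:

// Run registers with the wait group, runs the evaluation loop in its own
// goroutine, and blocks until that loop has finished.
func (sch *schedule) Run(ctx context.Context) error {
	sch.wg.Add(1)

	go func() {
		if err := sch.ruleEvaluationLoop(ctx); err != nil {
			sch.log.Error("failure while running the rule evaluation loop", "err", err)
		}
	}()

	sch.wg.Wait()
	return nil
}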
@@ -320,7 +225,7 @@ func (sch *schedule) Ticker(grafanaCtx context.Context) error {
ruleInfo.stopCh <- struct{}{}
sch.registry.del(key)
}
case <-grafanaCtx.Done():
case <-ctx.Done():
waitErr := dispatcherGroup.Wait()
orgIds, err := sch.instanceStore.FetchOrgIds()
@@ -338,6 +243,92 @@ func (sch *schedule) Ticker(grafanaCtx context.Context) error {
}
}
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRuleKey, evalCh <-chan *evalContext, stopCh <-chan struct{}) error {
sch.log.Debug("alert rule routine started", "key", key)
evalRunning := false
var attempt int64
var alertRule *models.AlertRule
for {
select {
case ctx := <-evalCh:
if evalRunning {
continue
}
evaluate := func(attempt int64) error {
start := timeNow()
// fetch latest alert rule version
if alertRule == nil || alertRule.Version < ctx.version {
q := models.GetAlertRuleByUIDQuery{OrgID: key.OrgID, UID: key.UID}
err := sch.ruleStore.GetAlertRuleByUID(&q)
if err != nil {
sch.log.Error("failed to fetch alert rule", "key", key)
return err
}
alertRule = q.Result
sch.log.Debug("new alert rule version fetched", "title", alertRule.Title, "key", key, "version", alertRule.Version)
}
condition := models.Condition{
Condition: alertRule.Condition,
OrgID: alertRule.OrgID,
Data: alertRule.Data,
}
results, err := sch.evaluator.ConditionEval(&condition, ctx.now, sch.dataService)
var (
end = timeNow()
tenant = fmt.Sprint(alertRule.OrgID)
dur = end.Sub(start).Seconds()
)
sch.metrics.EvalTotal.WithLabelValues(tenant).Inc()
sch.metrics.EvalDuration.WithLabelValues(tenant).Observe(dur)
if err != nil {
sch.metrics.EvalFailures.WithLabelValues(tenant).Inc()
// consider saving alert instance on error
sch.log.Error("failed to evaluate alert rule", "title", alertRule.Title,
"key", key, "attempt", attempt, "now", ctx.now, "duration", end.Sub(start), "error", err)
return err
}
processedStates := sch.stateManager.ProcessEvalResults(alertRule, results)
sch.saveAlertStates(processedStates)
alerts := FromAlertStateToPostableAlerts(sch.log, processedStates, sch.stateManager, sch.appURL)
sch.log.Debug("sending alerts to notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts)
err = sch.sendAlerts(alerts)
if err != nil {
sch.log.Error("failed to put alerts in the notifier", "count", len(alerts.PostableAlerts), "err", err)
}
return nil
}
func() {
evalRunning = true
defer func() {
evalRunning = false
sch.evalApplied(key, ctx.now)
}()
for attempt = 0; attempt < sch.maxAttempts; attempt++ {
err := evaluate(attempt)
if err == nil {
break
}
}
}()
case <-stopCh:
sch.stopApplied(key)
sch.log.Debug("stopping alert rule routine", "key", key)
// interrupt evaluation if it's running
return nil
case <-grafanaCtx.Done():
return grafanaCtx.Err()
}
}
}
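
The routine above is driven purely over channels: the evaluation loop sends an evalContext on evalCh for every tick the rule is due, sends on stopCh when the rule disappears or is paused, and relies on context cancellation for process shutdown. A simplified, self-contained sketch of that protocol (the types and printouts are trimmed placeholders, not the real scheduler wiring):

package main

import (
	"context"
	"fmt"
	"time"
)

// evalRequest stands in for the scheduler's evalContext (now + rule version).
type evalRequest struct {
	now     time.Time
	version int64
}

// ruleWorker mirrors the shape of ruleRoutine: evaluate on request, stop on
// the stop signal, and exit with the context's error on cancellation.
func ruleWorker(ctx context.Context, evalCh <-chan evalRequest, stopCh <-chan struct{}) error {
	for {
		select {
		case req := <-evalCh:
			fmt.Println("evaluating rule version", req.version, "at", req.now)
		case <-stopCh:
			fmt.Println("rule removed or paused; stopping")
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	evalCh := make(chan evalRequest)
	stopCh := make(chan struct{})
	done := make(chan struct{})

	go func() {
		_ = ruleWorker(ctx, evalCh, stopCh)
		close(done)
	}()

	evalCh <- evalRequest{now: time.Now(), version: 1} // one "tick" for this rule
	stopCh <- struct{}{}                               // rule deleted: ask the worker to stop
	<-done
}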
func (sch *schedule) sendAlerts(alerts apimodels.PostableAlerts) error {
return sch.notifier.PutAlerts(alerts)
}
@@ -445,3 +436,30 @@ type evalContext struct {
now time.Time
version int64
}
// overrideCfg is only used on tests.
func (sch *schedule) overrideCfg(cfg SchedulerCfg) {
sch.clock = cfg.C
sch.baseInterval = cfg.BaseInterval
sch.heartbeat = alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, int64(cfg.BaseInterval.Seconds()))
sch.evalAppliedFunc = cfg.EvalAppliedFunc
sch.stopAppliedFunc = cfg.StopAppliedFunc
}
// evalApplied is only used on tests.
func (sch *schedule) evalApplied(alertDefKey models.AlertRuleKey, now time.Time) {
if sch.evalAppliedFunc == nil {
return
}
sch.evalAppliedFunc(alertDefKey, now)
}
// stopApplied is only used on tests.
func (sch *schedule) stopApplied(alertDefKey models.AlertRuleKey) {
if sch.stopAppliedFunc == nil {
return
}
sch.stopAppliedFunc(alertDefKey)
}
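
These hooks exist so the ticker tests can observe the scheduler: a test builds a SchedulerCfg with a controllable clock and callbacks, then swaps it in with overrideCfg. A rough sketch of that usage, using only the fields read by overrideCfg above; the mock clock and the channel used for assertions are assumptions, not the exact code in schedule_test.go:

// Sketch of test setup around overrideCfg. mockClock and evalAppliedCh are
// assumptions made for the example.
evalAppliedCh := make(chan models.AlertRuleKey, 1)

cfg := SchedulerCfg{
	C:            mockClock,        // assumed: a controllable clock used by the heartbeat ticker
	BaseInterval: 10 * time.Second, // assumed test tick rate
	EvalAppliedFunc: func(key models.AlertRuleKey, _ time.Time) {
		evalAppliedCh <- key // record which rule was evaluated
	},
	StopAppliedFunc: func(key models.AlertRuleKey) {},
}
sch.overrideCfg(cfg)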

@@ -158,7 +158,7 @@ func TestAlertingTicker(t *testing.T) {
ctx := context.Background()
go func() {
err := sched.Ticker(ctx)
err := sched.Run(ctx)
require.NoError(t, err)
}()
runtime.Gosched()
