Alerting: Refactor `Run` of the scheduler (#37157)

* Alerting: Refactor `Run` of the scheduler

A bit of a refactor to make the diff easier to read when we add support
for external Alertmanagers.

We'll introduce another routine that checks the database for
configuration and spawns other routines accordingly.

* Block the wait.

* Fix test
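
As a rough illustration of the follow-up described above, the configuration-checking routine could look something like the sketch below. Everything in it is hypothetical and invented for the example (the orgConfig type, the fetch/apply callbacks, the one-minute poll interval); the real implementation lands in later PRs.

package sketch

import (
	"context"
	"log"
	"time"
)

// orgConfig is a placeholder for a per-organisation Alertmanager configuration.
type orgConfig struct {
	OrgID int64
	Raw   string
}

// configSync polls for the latest configurations and hands each one to apply,
// which would spawn or refresh the corresponding routine. It blocks until the
// context is cancelled.
func configSync(ctx context.Context, fetch func(context.Context) ([]orgConfig, error), apply func(orgConfig)) error {
	ticker := time.NewTicker(time.Minute) // assumed poll interval
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			configs, err := fetch(ctx)
			if err != nil {
				log.Println("failed to fetch configurations:", err)
				continue
			}
			for _, cfg := range configs {
				apply(cfg)
			}
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}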
gotjosh authored 4 years ago, committed by GitHub
commit 442a6677fc (parent 0c804df763)
3 files changed:
* pkg/services/ngalert/ngalert.go (4 changed lines)
* pkg/services/ngalert/schedule/schedule.go (246 changed lines)
* pkg/services/ngalert/schedule/schedule_test.go (2 changed lines)

@@ -110,14 +110,14 @@ func (ng *AlertNG) Init() error {
return nil
}
// Run starts the scheduler.
// Run starts the scheduler and Alertmanager.
func (ng *AlertNG) Run(ctx context.Context) error {
ng.Log.Debug("ngalert starting")
ng.stateManager.Warm()
children, subCtx := errgroup.WithContext(ctx)
children.Go(func() error {
return ng.schedule.Ticker(subCtx)
return ng.schedule.Run(subCtx)
})
children.Go(func() error {
return ng.Alertmanager.Run(subCtx)
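
For readers less familiar with errgroup: the hunk above runs the scheduler and the Alertmanager under one group, so the first routine to fail cancels the shared subCtx and its error is what Run eventually returns from Wait. A minimal, generic sketch of that pattern (not Grafana code):

package sketch

import (
	"context"

	"golang.org/x/sync/errgroup"
)

// runBoth starts two long-running workers that share a context; when either
// returns an error, the other's context is cancelled, and Wait returns the
// first non-nil error once both have exited.
func runBoth(ctx context.Context, scheduler, alertmanager func(context.Context) error) error {
	children, subCtx := errgroup.WithContext(ctx)
	children.Go(func() error { return scheduler(subCtx) })
	children.Go(func() error { return alertmanager(subCtx) })
	return children.Wait()
}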

@@ -25,7 +25,7 @@ var timeNow = time.Now
// ScheduleService handles scheduling
type ScheduleService interface {
Ticker(context.Context) error
Run(context.Context) error
Pause() error
Unpause() error
@@ -41,6 +41,8 @@ type Notifier interface {
}
type schedule struct {
wg sync.WaitGroup
// base tick rate (fastest possible configured check)
baseInterval time.Duration
@@ -67,11 +69,9 @@ type schedule struct {
evaluator eval.Evaluator
ruleStore store.RuleStore
ruleStore store.RuleStore
instanceStore store.InstanceStore
dataService *tsdb.Service
dataService *tsdb.Service
stateManager *state.Manager
@@ -120,30 +120,6 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service, appURL string, st
return &sch
}
func (sch *schedule) overrideCfg(cfg SchedulerCfg) {
sch.clock = cfg.C
sch.baseInterval = cfg.BaseInterval
sch.heartbeat = alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, int64(cfg.BaseInterval.Seconds()))
sch.evalAppliedFunc = cfg.EvalAppliedFunc
sch.stopAppliedFunc = cfg.StopAppliedFunc
}
func (sch *schedule) evalApplied(alertDefKey models.AlertRuleKey, now time.Time) {
if sch.evalAppliedFunc == nil {
return
}
sch.evalAppliedFunc(alertDefKey, now)
}
func (sch *schedule) stopApplied(alertDefKey models.AlertRuleKey) {
if sch.stopAppliedFunc == nil {
return
}
sch.stopAppliedFunc(alertDefKey)
}
func (sch *schedule) Pause() error {
if sch == nil {
return fmt.Errorf("scheduler is not initialised")
@@ -162,94 +138,23 @@ func (sch *schedule) Unpause() error {
return nil
}
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRuleKey, evalCh <-chan *evalContext, stopCh <-chan struct{}) error {
sch.log.Debug("alert rule routine started", "key", key)
evalRunning := false
var attempt int64
var alertRule *models.AlertRule
for {
select {
case ctx := <-evalCh:
if evalRunning {
continue
}
evaluate := func(attempt int64) error {
start := timeNow()
// fetch latest alert rule version
if alertRule == nil || alertRule.Version < ctx.version {
q := models.GetAlertRuleByUIDQuery{OrgID: key.OrgID, UID: key.UID}
err := sch.ruleStore.GetAlertRuleByUID(&q)
if err != nil {
sch.log.Error("failed to fetch alert rule", "key", key)
return err
}
alertRule = q.Result
sch.log.Debug("new alert rule version fetched", "title", alertRule.Title, "key", key, "version", alertRule.Version)
}
condition := models.Condition{
Condition: alertRule.Condition,
OrgID: alertRule.OrgID,
Data: alertRule.Data,
}
results, err := sch.evaluator.ConditionEval(&condition, ctx.now, sch.dataService)
var (
end = timeNow()
tenant = fmt.Sprint(alertRule.OrgID)
dur = end.Sub(start).Seconds()
)
sch.metrics.EvalTotal.WithLabelValues(tenant).Inc()
sch.metrics.EvalDuration.WithLabelValues(tenant).Observe(dur)
if err != nil {
sch.metrics.EvalFailures.WithLabelValues(tenant).Inc()
// consider saving alert instance on error
sch.log.Error("failed to evaluate alert rule", "title", alertRule.Title,
"key", key, "attempt", attempt, "now", ctx.now, "duration", end.Sub(start), "error", err)
return err
}
processedStates := sch.stateManager.ProcessEvalResults(alertRule, results)
sch.saveAlertStates(processedStates)
alerts := FromAlertStateToPostableAlerts(sch.log, processedStates, sch.stateManager, sch.appURL)
sch.log.Debug("sending alerts to notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts)
err = sch.sendAlerts(alerts)
if err != nil {
sch.log.Error("failed to put alerts in the notifier", "count", len(alerts.PostableAlerts), "err", err)
}
return nil
}
func() {
evalRunning = true
defer func() {
evalRunning = false
sch.evalApplied(key, ctx.now)
}()
func (sch *schedule) Run(ctx context.Context) error {
sch.wg.Add(1)
for attempt = 0; attempt < sch.maxAttempts; attempt++ {
err := evaluate(attempt)
if err == nil {
break
}
}
}()
case <-stopCh:
sch.stopApplied(key)
sch.log.Debug("stopping alert rule routine", "key", key)
// interrupt evaluation if it's running
return nil
case <-grafanaCtx.Done():
return grafanaCtx.Err()
go func() {
if err := sch.ruleEvaluationLoop(ctx); err != nil {
sch.log.Error("failure while running the rule evaluation loop", "err", err)
}
}
}()
sch.wg.Wait()
return nil
}
func (sch *schedule) Ticker(grafanaCtx context.Context) error {
dispatcherGroup, ctx := errgroup.WithContext(grafanaCtx)
func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
defer sch.wg.Done()
dispatcherGroup, ctx := errgroup.WithContext(ctx)
for {
select {
case tick := <-sch.heartbeat.C:
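
Because the hunk above interleaves the deleted ruleRoutine lines with the new code, the replacement entry point is easier to read assembled on its own. Pieced together from the added lines (indentation approximate), the new Run is roughly:

// Run registers with the wait group, runs the evaluation loop in its own
// goroutine, and blocks until that loop has finished.
func (sch *schedule) Run(ctx context.Context) error {
	sch.wg.Add(1)

	go func() {
		if err := sch.ruleEvaluationLoop(ctx); err != nil {
			sch.log.Error("failure while running the rule evaluation loop", "err", err)
		}
	}()

	sch.wg.Wait()
	return nil
}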
@@ -320,7 +225,7 @@ func (sch *schedule) Ticker(grafanaCtx context.Context) error {
ruleInfo.stopCh <- struct{}{}
sch.registry.del(key)
}
case <-grafanaCtx.Done():
case <-ctx.Done():
waitErr := dispatcherGroup.Wait()
orgIds, err := sch.instanceStore.FetchOrgIds()
@@ -338,6 +243,92 @@ func (sch *schedule) Ticker(grafanaCtx context.Context) error {
}
}
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRuleKey, evalCh <-chan *evalContext, stopCh <-chan struct{}) error {
sch.log.Debug("alert rule routine started", "key", key)
evalRunning := false
var attempt int64
var alertRule *models.AlertRule
for {
select {
case ctx := <-evalCh:
if evalRunning {
continue
}
evaluate := func(attempt int64) error {
start := timeNow()
// fetch latest alert rule version
if alertRule == nil || alertRule.Version < ctx.version {
q := models.GetAlertRuleByUIDQuery{OrgID: key.OrgID, UID: key.UID}
err := sch.ruleStore.GetAlertRuleByUID(&q)
if err != nil {
sch.log.Error("failed to fetch alert rule", "key", key)
return err
}
alertRule = q.Result
sch.log.Debug("new alert rule version fetched", "title", alertRule.Title, "key", key, "version", alertRule.Version)
}
condition := models.Condition{
Condition: alertRule.Condition,
OrgID: alertRule.OrgID,
Data: alertRule.Data,
}
results, err := sch.evaluator.ConditionEval(&condition, ctx.now, sch.dataService)
var (
end = timeNow()
tenant = fmt.Sprint(alertRule.OrgID)
dur = end.Sub(start).Seconds()
)
sch.metrics.EvalTotal.WithLabelValues(tenant).Inc()
sch.metrics.EvalDuration.WithLabelValues(tenant).Observe(dur)
if err != nil {
sch.metrics.EvalFailures.WithLabelValues(tenant).Inc()
// consider saving alert instance on error
sch.log.Error("failed to evaluate alert rule", "title", alertRule.Title,
"key", key, "attempt", attempt, "now", ctx.now, "duration", end.Sub(start), "error", err)
return err
}
processedStates := sch.stateManager.ProcessEvalResults(alertRule, results)
sch.saveAlertStates(processedStates)
alerts := FromAlertStateToPostableAlerts(sch.log, processedStates, sch.stateManager, sch.appURL)
sch.log.Debug("sending alerts to notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts)
err = sch.sendAlerts(alerts)
if err != nil {
sch.log.Error("failed to put alerts in the notifier", "count", len(alerts.PostableAlerts), "err", err)
}
return nil
}
func() {
evalRunning = true
defer func() {
evalRunning = false
sch.evalApplied(key, ctx.now)
}()
for attempt = 0; attempt < sch.maxAttempts; attempt++ {
err := evaluate(attempt)
if err == nil {
break
}
}
}()
case <-stopCh:
sch.stopApplied(key)
sch.log.Debug("stopping alert rule routine", "key", key)
// interrupt evaluation if it's running
return nil
case <-grafanaCtx.Done():
return grafanaCtx.Err()
}
}
}
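
The routine above is driven purely over channels: the evaluation loop sends an evalContext on evalCh for every tick the rule is due, sends on stopCh when the rule disappears or is paused, and relies on context cancellation for process shutdown. A simplified, self-contained sketch of that protocol (the types and printouts are trimmed placeholders, not the real scheduler wiring):

package main

import (
	"context"
	"fmt"
	"time"
)

// evalRequest stands in for the scheduler's evalContext (now + rule version).
type evalRequest struct {
	now     time.Time
	version int64
}

// ruleWorker mirrors the shape of ruleRoutine: evaluate on request, stop on
// the stop signal, and exit with the context's error on cancellation.
func ruleWorker(ctx context.Context, evalCh <-chan evalRequest, stopCh <-chan struct{}) error {
	for {
		select {
		case req := <-evalCh:
			fmt.Println("evaluating rule version", req.version, "at", req.now)
		case <-stopCh:
			fmt.Println("rule removed or paused; stopping")
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	evalCh := make(chan evalRequest)
	stopCh := make(chan struct{})
	done := make(chan struct{})

	go func() {
		_ = ruleWorker(ctx, evalCh, stopCh)
		close(done)
	}()

	evalCh <- evalRequest{now: time.Now(), version: 1} // one "tick" for this rule
	stopCh <- struct{}{}                               // rule deleted: ask the worker to stop
	<-done
}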
func (sch *schedule) sendAlerts(alerts apimodels.PostableAlerts) error {
return sch.notifier.PutAlerts(alerts)
}
@@ -445,3 +436,30 @@ type evalContext struct {
now time.Time
version int64
}
// overrideCfg is only used on tests.
func (sch *schedule) overrideCfg(cfg SchedulerCfg) {
sch.clock = cfg.C
sch.baseInterval = cfg.BaseInterval
sch.heartbeat = alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, int64(cfg.BaseInterval.Seconds()))
sch.evalAppliedFunc = cfg.EvalAppliedFunc
sch.stopAppliedFunc = cfg.StopAppliedFunc
}
// evalApplied is only used on tests.
func (sch *schedule) evalApplied(alertDefKey models.AlertRuleKey, now time.Time) {
if sch.evalAppliedFunc == nil {
return
}
sch.evalAppliedFunc(alertDefKey, now)
}
// stopApplied is only used on tests.
func (sch *schedule) stopApplied(alertDefKey models.AlertRuleKey) {
if sch.stopAppliedFunc == nil {
return
}
sch.stopAppliedFunc(alertDefKey)
}
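
These hooks exist so the ticker tests can observe the scheduler: a test builds a SchedulerCfg with a controllable clock and callbacks, then swaps it in with overrideCfg. A rough sketch of that usage, using only the fields read by overrideCfg above; the mock clock and the channel used for assertions are assumptions, not the exact code in schedule_test.go:

// Sketch of test setup around overrideCfg. mockClock and evalAppliedCh are
// assumptions made for the example.
evalAppliedCh := make(chan models.AlertRuleKey, 1)

cfg := SchedulerCfg{
	C:            mockClock,        // assumed: a controllable clock used by the heartbeat ticker
	BaseInterval: 10 * time.Second, // assumed test tick rate
	EvalAppliedFunc: func(key models.AlertRuleKey, _ time.Time) {
		evalAppliedCh <- key // record which rule was evaluated
	},
	StopAppliedFunc: func(key models.AlertRuleKey) {},
}
sch.overrideCfg(cfg)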

@@ -158,7 +158,7 @@ func TestAlertingTicker(t *testing.T) {
ctx := context.Background()
go func() {
err := sched.Ticker(ctx)
err := sched.Run(ctx)
require.NoError(t, err)
}()
runtime.Gosched()
