@ -25,7 +25,7 @@ var timeNow = time.Now
// ScheduleService handles scheduling
type ScheduleService interface {
Ticker ( context . Context ) error
Run ( context . Context ) error
Pause ( ) error
Unpause ( ) error
@ -41,6 +41,8 @@ type Notifier interface {
}
type schedule struct {
wg sync . WaitGroup
// base tick rate (fastest possible configured check)
baseInterval time . Duration
@ -67,11 +69,9 @@ type schedule struct {
evaluator eval . Evaluator
ruleStore store . RuleStore
ruleStore store . RuleStore
instanceStore store . InstanceStore
dataService * tsdb . Service
dataService * tsdb . Service
stateManager * state . Manager
@ -120,30 +120,6 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service, appURL string, st
return & sch
}
// overrideCfg replaces the scheduler's clock, base interval, heartbeat ticker
// and the eval/stop hooks with the values carried by cfg. The heartbeat is
// rebuilt from cfg.C with a zero initial offset and an interval equal to
// cfg.BaseInterval expressed in whole seconds.
//
// NOTE(review): presumably a test-only helper — confirm against callers.
func (sch *schedule) overrideCfg(cfg SchedulerCfg) {
	intervalSeconds := int64(cfg.BaseInterval.Seconds())

	sch.clock, sch.baseInterval = cfg.C, cfg.BaseInterval
	sch.heartbeat = alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, intervalSeconds)
	sch.evalAppliedFunc, sch.stopAppliedFunc = cfg.EvalAppliedFunc, cfg.StopAppliedFunc
}
// evalApplied forwards (alertDefKey, now) to the optional evalAppliedFunc
// hook. It is a no-op when no hook has been configured.
func (sch *schedule) evalApplied(alertDefKey models.AlertRuleKey, now time.Time) {
	if hook := sch.evalAppliedFunc; hook != nil {
		hook(alertDefKey, now)
	}
}
// stopApplied forwards alertDefKey to the optional stopAppliedFunc hook.
// It is a no-op when no hook has been configured.
func (sch *schedule) stopApplied(alertDefKey models.AlertRuleKey) {
	if hook := sch.stopAppliedFunc; hook != nil {
		hook(alertDefKey)
	}
}
func ( sch * schedule ) Pause ( ) error {
if sch == nil {
return fmt . Errorf ( "scheduler is not initialised" )
@ -162,94 +138,23 @@ func (sch *schedule) Unpause() error {
return nil
}
// ruleRoutine serves the single alert rule identified by key. It loops on a
// select over evalCh (evaluation requests), stopCh (stop signal) and
// grafanaCtx.Done(); it returns nil after a stop signal and grafanaCtx.Err()
// on cancellation.
//
// NOTE(review): in this view the lines of Run (its func header, the goroutine
// that calls ruleEvaluationLoop, and the wg.Wait/return lines) are interleaved
// with this function's body — presumably diff residue. Confirm against the
// real file before changing any code in this span.
func ( sch * schedule ) ruleRoutine ( grafanaCtx context . Context , key models . AlertRuleKey , evalCh <- chan * evalContext , stopCh <- chan struct { } ) error {
sch . log . Debug ( "alert rule routine started" , "key" , key )
evalRunning := false
var attempt int64
// alertRule is cached across evaluation requests and refreshed on demand.
var alertRule * models . AlertRule
for {
select {
case ctx := <- evalCh :
if evalRunning {
continue
}
// evaluate performs one attempt: refresh the cached rule if needed,
// evaluate its condition, record metrics, process and save the resulting
// states, then hand the alerts to the notifier.
evaluate := func ( attempt int64 ) error {
start := timeNow ( )
// fetch latest alert rule version
if alertRule == nil || alertRule . Version < ctx . version {
q := models . GetAlertRuleByUIDQuery { OrgID : key . OrgID , UID : key . UID }
err := sch . ruleStore . GetAlertRuleByUID ( & q )
if err != nil {
// NOTE(review): err itself is not included in this log entry.
sch . log . Error ( "failed to fetch alert rule" , "key" , key )
return err
}
alertRule = q . Result
sch . log . Debug ( "new alert rule version fetched" , "title" , alertRule . Title , "key" , key , "version" , alertRule . Version )
}
condition := models . Condition {
Condition : alertRule . Condition ,
OrgID : alertRule . OrgID ,
Data : alertRule . Data ,
}
results , err := sch . evaluator . ConditionEval ( & condition , ctx . now , sch . dataService )
var (
end = timeNow ( )
tenant = fmt . Sprint ( alertRule . OrgID )
dur = end . Sub ( start ) . Seconds ( )
)
// total and duration are recorded for every attempt, failed or not
sch . metrics . EvalTotal . WithLabelValues ( tenant ) . Inc ( )
sch . metrics . EvalDuration . WithLabelValues ( tenant ) . Observe ( dur )
if err != nil {
sch . metrics . EvalFailures . WithLabelValues ( tenant ) . Inc ( )
// consider saving alert instance on error
sch . log . Error ( "failed to evaluate alert rule" , "title" , alertRule . Title ,
"key" , key , "attempt" , attempt , "now" , ctx . now , "duration" , end . Sub ( start ) , "error" , err )
return err
}
processedStates := sch . stateManager . ProcessEvalResults ( alertRule , results )
sch . saveAlertStates ( processedStates )
alerts := FromAlertStateToPostableAlerts ( sch . log , processedStates , sch . stateManager , sch . appURL )
sch . log . Debug ( "sending alerts to notifier" , "count" , len ( alerts . PostableAlerts ) , "alerts" , alerts . PostableAlerts )
err = sch . sendAlerts ( alerts )
// a notifier failure is only logged; the attempt still returns nil
if err != nil {
sch . log . Error ( "failed to put alerts in the notifier" , "count" , len ( alerts . PostableAlerts ) , "err" , err )
}
return nil
}
func ( ) {
evalRunning = true
defer func ( ) {
evalRunning = false
sch . evalApplied ( key , ctx . now )
} ( )
func ( sch * schedule ) Run ( ctx context . Context ) error {
sch . wg . Add ( 1 )
// retry until an attempt succeeds or sch.maxAttempts is reached
for attempt = 0 ; attempt < sch . maxAttempts ; attempt ++ {
err := evaluate ( attempt )
if err == nil {
break
}
}
} ( )
case <- stopCh :
sch . stopApplied ( key )
sch . log . Debug ( "stopping alert rule routine" , "key" , key )
// interrupt evaluation if it's running
return nil
case <- grafanaCtx . Done ( ) :
return grafanaCtx . Err ( )
go func ( ) {
if err := sch . ruleEvaluationLoop ( ctx ) ; err != nil {
sch . log . Error ( "failure while running the rule evaluation loop" , "err" , err )
}
}
} ( )
sch . wg . Wait ( )
return nil
}
func ( sch * schedule ) Ticker ( grafanaCtx context . Context ) error {
dispatcherGroup , ctx := errgroup . WithContext ( grafanaCtx )
func ( sch * schedule ) ruleEvaluationLoop ( ctx context . Context ) error {
defer sch . wg . Done ( )
dispatcherGroup , ctx := errgroup . WithContext ( ctx )
for {
select {
case tick := <- sch . heartbeat . C :
@ -320,7 +225,7 @@ func (sch *schedule) Ticker(grafanaCtx context.Context) error {
ruleInfo . stopCh <- struct { } { }
sch . registry . del ( key )
}
case <- grafanaC tx. Done ( ) :
case <- c tx. Done ( ) :
waitErr := dispatcherGroup . Wait ( )
orgIds , err := sch . instanceStore . FetchOrgIds ( )
@ -338,6 +243,92 @@ func (sch *schedule) Ticker(grafanaCtx context.Context) error {
}
}
// ruleRoutine is the per-rule worker loop for the alert rule identified by key.
//
// It blocks until one of three things happens:
//   - an evaluation request arrives on evalCh: the rule is evaluated, retrying
//     up to sch.maxAttempts times until an attempt succeeds, and evalApplied
//     is invoked once the request has been handled;
//   - a signal arrives on stopCh: stopApplied is invoked and the routine
//     returns nil;
//   - grafanaCtx is cancelled: the routine returns grafanaCtx.Err().
//
// Evaluations run synchronously on this goroutine, so the next request is
// only received after the previous one (including its retries) has finished.
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRuleKey, evalCh <-chan *evalContext, stopCh <-chan struct{}) error {
	sch.log.Debug("alert rule routine started", "key", key)

	// Cached across evaluation requests; refreshed only when a request asks
	// for a newer version than the cached one.
	var alertRule *models.AlertRule

	// evaluate performs a single evaluation attempt for the given request.
	// It returns a non-nil error when the rule cannot be fetched or its
	// condition cannot be evaluated; a notifier failure is only logged.
	evaluate := func(evalCtx *evalContext, attempt int64) error {
		start := timeNow()

		// fetch latest alert rule version
		if alertRule == nil || alertRule.Version < evalCtx.version {
			q := models.GetAlertRuleByUIDQuery{OrgID: key.OrgID, UID: key.UID}
			if err := sch.ruleStore.GetAlertRuleByUID(&q); err != nil {
				// Include the error so the failure can be diagnosed from the log.
				sch.log.Error("failed to fetch alert rule", "key", key, "err", err)
				return err
			}
			alertRule = q.Result
			sch.log.Debug("new alert rule version fetched", "title", alertRule.Title, "key", key, "version", alertRule.Version)
		}

		condition := models.Condition{
			Condition: alertRule.Condition,
			OrgID:     alertRule.OrgID,
			Data:      alertRule.Data,
		}
		results, err := sch.evaluator.ConditionEval(&condition, evalCtx.now, sch.dataService)

		var (
			end    = timeNow()
			tenant = fmt.Sprint(alertRule.OrgID)
			dur    = end.Sub(start).Seconds()
		)

		// Total and duration are recorded for every attempt, failed or not.
		sch.metrics.EvalTotal.WithLabelValues(tenant).Inc()
		sch.metrics.EvalDuration.WithLabelValues(tenant).Observe(dur)
		if err != nil {
			sch.metrics.EvalFailures.WithLabelValues(tenant).Inc()
			// consider saving alert instance on error
			sch.log.Error("failed to evaluate alert rule", "title", alertRule.Title,
				"key", key, "attempt", attempt, "now", evalCtx.now, "duration", end.Sub(start), "error", err)
			return err
		}

		processedStates := sch.stateManager.ProcessEvalResults(alertRule, results)
		sch.saveAlertStates(processedStates)

		alerts := FromAlertStateToPostableAlerts(sch.log, processedStates, sch.stateManager, sch.appURL)
		sch.log.Debug("sending alerts to notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts)
		if err := sch.sendAlerts(alerts); err != nil {
			// Delivery problems do not fail the attempt, so they are not retried.
			sch.log.Error("failed to put alerts in the notifier", "count", len(alerts.PostableAlerts), "err", err)
		}
		return nil
	}

	for {
		select {
		case evalCtx := <-evalCh:
			// Wrapped in a function so that evalApplied fires through defer
			// once the request has been handled, whatever the outcome.
			func() {
				defer func() {
					sch.evalApplied(key, evalCtx.now)
				}()

				for attempt := int64(0); attempt < sch.maxAttempts; attempt++ {
					if err := evaluate(evalCtx, attempt); err == nil {
						break
					}
				}
			}()
		case <-stopCh:
			sch.stopApplied(key)
			sch.log.Debug("stopping alert rule routine", "key", key)
			// No evaluation can be in flight here: evaluations run
			// synchronously in the case above.
			return nil
		case <-grafanaCtx.Done():
			return grafanaCtx.Err()
		}
	}
}
// sendAlerts hands the given alerts to the configured notifier and returns
// whatever error PutAlerts reports.
func (sch *schedule) sendAlerts(alerts apimodels.PostableAlerts) error {
	err := sch.notifier.PutAlerts(alerts)
	return err
}
@ -445,3 +436,30 @@ type evalContext struct {
now time . Time
version int64
}
// overrideCfg is a test-only helper. It swaps in the clock, base interval and
// eval/stop hooks from cfg, and rebuilds the heartbeat ticker from cfg.C with
// a zero initial offset and an interval of cfg.BaseInterval in whole seconds.
func (sch *schedule) overrideCfg(cfg SchedulerCfg) {
	testClock := cfg.C
	intervalSeconds := int64(cfg.BaseInterval.Seconds())

	sch.clock, sch.baseInterval = testClock, cfg.BaseInterval
	sch.heartbeat = alerting.NewTicker(testClock.Now(), time.Second*0, testClock, intervalSeconds)
	sch.evalAppliedFunc, sch.stopAppliedFunc = cfg.EvalAppliedFunc, cfg.StopAppliedFunc
}
// evalApplied exists for tests: it forwards (alertDefKey, now) to the
// evalAppliedFunc hook when one is configured and does nothing otherwise.
func (sch *schedule) evalApplied(alertDefKey models.AlertRuleKey, now time.Time) {
	if notify := sch.evalAppliedFunc; notify != nil {
		notify(alertDefKey, now)
	}
}
// stopApplied exists for tests: it forwards alertDefKey to the
// stopAppliedFunc hook when one is configured and does nothing otherwise.
func (sch *schedule) stopApplied(alertDefKey models.AlertRuleKey) {
	if notify := sch.stopAppliedFunc; notify != nil {
		notify(alertDefKey)
	}
}