The open and composable observability and data visualization platform. Visualize metrics, logs, and traces from multiple sources like Prometheus, Loki, Elasticsearch, InfluxDB, Postgres and many more.
 
grafana/pkg/services/alerting/engine.go


package alerting

import (
	"fmt"
	"time"

	"github.com/benbjohnson/clock"

	"github.com/grafana/grafana/pkg/log"
	"github.com/grafana/grafana/pkg/services/alerting/alertstates"
)
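
// Engine coordinates alert rule scheduling, job execution, and result handling.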
type Engine struct {
	execQueue       chan *AlertJob
	resultQueue     chan *AlertResult
	clock           clock.Clock
	ticker          *Ticker
	scheduler       Scheduler
	handler         AlertingHandler
	ruleReader      RuleReader
	log             log.Logger
	responseHandler ResultHandler
}
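
// NewEngine returns an engine wired with its default ticker, scheduler,
// handler, rule reader, result handler, and buffered exec/result queues.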
func NewEngine() *Engine {
	e := &Engine{
		ticker:          NewTicker(time.Now(), time.Second*0, clock.New()),
		execQueue:       make(chan *AlertJob, 1000),
		resultQueue:     make(chan *AlertResult, 1000),
		scheduler:       NewScheduler(),
		handler:         NewHandler(),
		ruleReader:      NewRuleReader(),
		log:             log.New("alerting.engine"),
		responseHandler: NewResultHandler(),
	}

	return e
}
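
// Start launches the ticker, dispatcher, and result handler goroutines.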
func (e *Engine) Start() {
	e.log.Info("Starting Alerting Engine")

	go e.alertingTicker()
	go e.execDispatch()
	go e.resultHandler()
}
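
// Stop closes both queues, which ends the dispatcher and result handler loops.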
func (e *Engine) Stop() {
	close(e.execQueue)
	close(e.resultQueue)
}
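
// alertingTicker reloads the rule set from the rule reader every tenth tick
// and lets the scheduler push due jobs onto the exec queue on every tick.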
func (e *Engine) alertingTicker() {
	defer func() {
		if err := recover(); err != nil {
			e.log.Error("Scheduler Panic: stopping alertingTicker", "error", err, "stack", log.Stack(1))
		}
	}()

	tickIndex := 0

	for {
		select {
		case tick := <-e.ticker.C:
			// TEMP SOLUTION: update rules every tenth tick
			if tickIndex%10 == 0 {
				e.scheduler.Update(e.ruleReader.Fetch())
			}

			e.scheduler.Tick(tick, e.execQueue)
			tickIndex++
		}
	}
}
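
// execDispatch drains the exec queue, marking each job as running and
// executing it one at a time.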
func (e *Engine) execDispatch() {
	defer func() {
		if err := recover(); err != nil {
			e.log.Error("Scheduler Panic: stopping executor", "error", err, "stack", log.Stack(1))
		}
	}()

	for job := range e.execQueue {
		log.Trace("Alerting: engine:execDispatch() starting job %s", job.Rule.Name)
		job.Running = true
		e.executeJob(job)
	}
}
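
// executeJob runs a single job through the handler with a five second timeout;
// on timeout it pushes a pending result carrying a timeout error onto the result queue.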
func (e *Engine) executeJob(job *AlertJob) {
	now := time.Now()

	resultChan := make(chan *AlertResult, 1)
	go e.handler.Execute(job, resultChan)

	select {
	case <-time.After(time.Second * 5):
		e.resultQueue <- &AlertResult{
			State:         alertstates.Pending,
			Duration:      float64(time.Since(now).Nanoseconds()) / float64(1000000),
			Error:         fmt.Errorf("Timeout"),
			AlertJob:      job,
			ExeuctionTime: time.Now(),
		}
		e.log.Debug("Job Execution timeout", "alertRuleId", job.Rule.Id)
	case result := <-resultChan:
		result.Duration = float64(time.Since(now).Nanoseconds()) / float64(1000000)
		e.log.Debug("Job Execution done", "timeTakenMs", result.Duration, "ruleId", job.Rule.Id)
		e.resultQueue <- result
	}
}
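
// resultHandler consumes results from the result queue: failed jobs are
// re-queued while they are still retryable, marked critical once the retry
// limit is reached, and successful results reset the retry counter before
// being passed to the response handler.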
func (e *Engine) resultHandler() {
	defer func() {
		if err := recover(); err != nil {
			e.log.Error("Engine Panic, stopping resultHandler", "error", err, "stack", log.Stack(1))
		}
	}()

	for result := range e.resultQueue {
		e.log.Debug("Alert Rule Result", "ruleId", result.AlertJob.Rule.Id, "state", result.State, "value", result.ActualValue, "retry", result.AlertJob.RetryCount)
		result.AlertJob.Running = false

		if result.Error != nil {
			result.AlertJob.IncRetry()

			if result.AlertJob.Retryable() {
				e.log.Error("Alert Rule Result Error", "ruleId", result.AlertJob.Rule.Id, "error", result.Error, "retry", result.AlertJob.RetryCount)
				e.execQueue <- result.AlertJob
			} else {
				e.log.Error("Alert Rule Result Error After Max Retries", "ruleId", result.AlertJob.Rule.Id, "error", result.Error, "retry", result.AlertJob.RetryCount)

				result.State = alertstates.Critical
				result.Description = fmt.Sprintf("Failed to run check after %d retries, Error: %v", maxAlertExecutionRetries, result.Error)
				e.responseHandler.Handle(result)
			}
		} else {
			result.AlertJob.ResetRetry()
			e.responseHandler.Handle(result)
		}
	}
}
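
A minimal usage sketch (hypothetical, not part of engine.go): the file only exports NewEngine, Start, and Stop, so a caller could construct the engine once at startup and close its queues on shutdown. How Grafana itself wires the engine in is not shown here.

package main

import (
	"os"
	"os/signal"

	"github.com/grafana/grafana/pkg/services/alerting"
)

func main() {
	// Build the engine and start its ticker, dispatcher, and result handler goroutines.
	engine := alerting.NewEngine()
	engine.Start()

	// Wait for an interrupt, then close the engine's queues so its loops exit.
	quit := make(chan os.Signal, 1)
	signal.Notify(quit, os.Interrupt)
	<-quit
	engine.Stop()
}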