The open and composable observability and data visualization platform. Visualize metrics, logs, and traces from multiple sources like Prometheus, Loki, Elasticsearch, InfluxDB, Postgres and many more.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
grafana/pkg/services/alerting/engine.go

224 lines
6.3 KiB

package alerting
import (
"context"
"fmt"
"time"
"github.com/opentracing/opentracing-go"
"github.com/opentracing/opentracing-go/ext"
tlog "github.com/opentracing/opentracing-go/log"
"github.com/benbjohnson/clock"
"github.com/grafana/grafana/pkg/log"
"github.com/grafana/grafana/pkg/registry"
"github.com/grafana/grafana/pkg/services/rendering"
"github.com/grafana/grafana/pkg/setting"
"golang.org/x/sync/errgroup"
)
type AlertingService struct {
RenderService rendering.Service `inject:""`
execQueue chan *Job
//clock clock.Clock
ticker *Ticker
scheduler Scheduler
evalHandler EvalHandler
ruleReader RuleReader
log log.Logger
resultHandler ResultHandler
}
func init() {
registry.RegisterService(&AlertingService{})
}
func NewEngine() *AlertingService {
e := &AlertingService{}
e.Init()
return e
}
func (e *AlertingService) IsDisabled() bool {
return !setting.AlertingEnabled || !setting.ExecuteAlerts
}
func (e *AlertingService) Init() error {
e.ticker = NewTicker(time.Now(), time.Second*0, clock.New())
e.execQueue = make(chan *Job, 1000)
e.scheduler = NewScheduler()
e.evalHandler = NewEvalHandler()
e.ruleReader = NewRuleReader()
e.log = log.New("alerting.engine")
e.resultHandler = NewResultHandler(e.RenderService)
return nil
}
func (e *AlertingService) Run(ctx context.Context) error {
alertGroup, ctx := errgroup.WithContext(ctx)
alertGroup.Go(func() error { return e.alertingTicker(ctx) })
alertGroup.Go(func() error { return e.runJobDispatcher(ctx) })
err := alertGroup.Wait()
return err
}
func (e *AlertingService) alertingTicker(grafanaCtx context.Context) error {
defer func() {
if err := recover(); err != nil {
e.log.Error("Scheduler Panic: stopping alertingTicker", "error", err, "stack", log.Stack(1))
}
}()
tickIndex := 0
for {
select {
case <-grafanaCtx.Done():
return grafanaCtx.Err()
case tick := <-e.ticker.C:
// TEMP SOLUTION update rules ever tenth tick
if tickIndex%10 == 0 {
e.scheduler.Update(e.ruleReader.Fetch())
}
e.scheduler.Tick(tick, e.execQueue)
tickIndex++
}
}
}
func (e *AlertingService) runJobDispatcher(grafanaCtx context.Context) error {
dispatcherGroup, alertCtx := errgroup.WithContext(grafanaCtx)
for {
select {
case <-grafanaCtx.Done():
return dispatcherGroup.Wait()
case job := <-e.execQueue:
dispatcherGroup.Go(func() error { return e.processJobWithRetry(alertCtx, job) })
}
}
}
var (
Remove redundancy in variable declarations (golint) This commit fixes the following golint warnings: pkg/api/avatar/avatar.go:229:12: should omit type *http.Client from declaration of var client; it will be inferred from the right-hand side pkg/login/brute_force_login_protection.go:13:26: should omit type time.Duration from declaration of var loginAttemptsWindow; it will be inferred from the right-hand side pkg/metrics/graphitebridge/graphite.go:58:26: should omit type []string from declaration of var metricCategoryPrefix; it will be inferred from the right-hand side pkg/metrics/graphitebridge/graphite.go:69:22: should omit type []string from declaration of var trimMetricPrefix; it will be inferred from the right-hand side pkg/models/alert.go:37:36: should omit type error from declaration of var ErrCannotChangeStateOnPausedAlert; it will be inferred from the right-hand side pkg/models/alert.go:38:36: should omit type error from declaration of var ErrRequiresNewState; it will be inferred from the right-hand side pkg/models/datasource.go:61:28: should omit type map[string]bool from declaration of var knownDatasourcePlugins; it will be inferred from the right-hand side pkg/plugins/update_checker.go:16:13: should omit type http.Client from declaration of var httpClient; it will be inferred from the right-hand side pkg/services/alerting/engine.go:103:24: should omit type time.Duration from declaration of var unfinishedWorkTimeout; it will be inferred from the right-hand side pkg/services/alerting/engine.go:105:19: should omit type time.Duration from declaration of var alertTimeout; it will be inferred from the right-hand side pkg/services/alerting/engine.go:106:19: should omit type int from declaration of var alertMaxAttempts; it will be inferred from the right-hand side pkg/services/alerting/notifier.go:143:23: should omit type map[string]*NotifierPlugin from declaration of var notifierFactories; it will be inferred from the right-hand side pkg/services/alerting/rule.go:136:24: should omit type map[string]ConditionFactory from declaration of var conditionFactories; it will be inferred from the right-hand side pkg/services/alerting/conditions/evaluator.go:12:15: should omit type []string from declaration of var defaultTypes; it will be inferred from the right-hand side pkg/services/alerting/conditions/evaluator.go:13:15: should omit type []string from declaration of var rangedTypes; it will be inferred from the right-hand side pkg/services/alerting/notifiers/opsgenie.go:44:19: should omit type string from declaration of var opsgenieAlertURL; it will be inferred from the right-hand side pkg/services/alerting/notifiers/pagerduty.go:43:23: should omit type string from declaration of var pagerdutyEventApiUrl; it will be inferred from the right-hand side pkg/services/alerting/notifiers/telegram.go:21:17: should omit type string from declaration of var telegramApiUrl; it will be inferred from the right-hand side pkg/services/provisioning/dashboards/config_reader_test.go:11:24: should omit type string from declaration of var simpleDashboardConfig; it will be inferred from the right-hand side pkg/services/provisioning/dashboards/config_reader_test.go:12:24: should omit type string from declaration of var oldVersion; it will be inferred from the right-hand side pkg/services/provisioning/dashboards/config_reader_test.go:13:24: should omit type string from declaration of var brokenConfigs; it will be inferred from the right-hand side pkg/services/provisioning/dashboards/file_reader.go:22:30: should omit type time.Duration from declaration of var checkDiskForChangesInterval; it will be inferred from the right-hand side pkg/services/provisioning/dashboards/file_reader.go:24:23: should omit type error from declaration of var ErrFolderNameMissing; it will be inferred from the right-hand side pkg/services/provisioning/datasources/config_reader_test.go:15:34: should omit type string from declaration of var twoDatasourcesConfig; it will be inferred from the right-hand side pkg/services/provisioning/datasources/config_reader_test.go:16:34: should omit type string from declaration of var twoDatasourcesConfigPurgeOthers; it will be inferred from the right-hand side pkg/services/provisioning/datasources/config_reader_test.go:17:34: should omit type string from declaration of var doubleDatasourcesConfig; it will be inferred from the right-hand side pkg/services/provisioning/datasources/config_reader_test.go:18:34: should omit type string from declaration of var allProperties; it will be inferred from the right-hand side pkg/services/provisioning/datasources/config_reader_test.go:19:34: should omit type string from declaration of var versionZero; it will be inferred from the right-hand side pkg/services/provisioning/datasources/config_reader_test.go:20:34: should omit type string from declaration of var brokenYaml; it will be inferred from the right-hand side pkg/services/sqlstore/stats.go:16:25: should omit type time.Duration from declaration of var activeUserTimeLimit; it will be inferred from the right-hand side pkg/services/sqlstore/migrator/mysql_dialect.go:69:14: should omit type bool from declaration of var hasLen1; it will be inferred from the right-hand side pkg/services/sqlstore/migrator/mysql_dialect.go:70:14: should omit type bool from declaration of var hasLen2; it will be inferred from the right-hand side pkg/services/sqlstore/migrator/postgres_dialect.go:95:14: should omit type bool from declaration of var hasLen1; it will be inferred from the right-hand side pkg/services/sqlstore/migrator/postgres_dialect.go:96:14: should omit type bool from declaration of var hasLen2; it will be inferred from the right-hand side pkg/setting/setting.go:42:15: should omit type string from declaration of var Env; it will be inferred from the right-hand side pkg/setting/setting.go:161:18: should omit type bool from declaration of var LdapAllowSignup; it will be inferred from the right-hand side pkg/setting/setting.go:473:30: should omit type bool from declaration of var skipStaticRootValidation; it will be inferred from the right-hand side pkg/tsdb/interval.go:14:21: should omit type time.Duration from declaration of var defaultMinInterval; it will be inferred from the right-hand side pkg/tsdb/interval.go:15:21: should omit type time.Duration from declaration of var year; it will be inferred from the right-hand side pkg/tsdb/interval.go:16:21: should omit type time.Duration from declaration of var day; it will be inferred from the right-hand side pkg/tsdb/cloudwatch/credentials.go:26:24: should omit type map[string]cache from declaration of var awsCredentialCache; it will be inferred from the right-hand side pkg/tsdb/influxdb/query.go:15:27: should omit type *regexp.Regexp from declaration of var regexpOperatorPattern; it will be inferred from the right-hand side pkg/tsdb/influxdb/query.go:16:27: should omit type *regexp.Regexp from declaration of var regexpMeasurementPattern; it will be inferred from the right-hand side pkg/tsdb/mssql/mssql_test.go:25:14: should omit type string from declaration of var serverIP; it will be inferred from the right-hand side
7 years ago
unfinishedWorkTimeout = time.Second * 5
// TODO: Make alertTimeout and alertMaxAttempts configurable in the config file.
alertTimeout = time.Second * 30
resultHandleTimeout = time.Second * 30
alertMaxAttempts = 3
)
func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
defer func() {
if err := recover(); err != nil {
e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
}
}()
cancelChan := make(chan context.CancelFunc, alertMaxAttempts*2)
attemptChan := make(chan int, 1)
// Initialize with first attemptID=1
attemptChan <- 1
job.Running = true
for {
select {
case <-grafanaCtx.Done():
// In case grafana server context is cancel, let a chance to job processing
// to finish gracefully - by waiting a timeout duration - before forcing its end.
unfinishedWorkTimer := time.NewTimer(unfinishedWorkTimeout)
select {
case <-unfinishedWorkTimer.C:
return e.endJob(grafanaCtx.Err(), cancelChan, job)
case <-attemptChan:
return e.endJob(nil, cancelChan, job)
}
case attemptID, more := <-attemptChan:
if !more {
return e.endJob(nil, cancelChan, job)
}
go e.processJob(attemptID, attemptChan, cancelChan, job)
}
}
}
func (e *AlertingService) endJob(err error, cancelChan chan context.CancelFunc, job *Job) error {
job.Running = false
close(cancelChan)
for cancelFn := range cancelChan {
cancelFn()
}
return err
}
func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancelChan chan context.CancelFunc, job *Job) {
defer func() {
if err := recover(); err != nil {
e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
}
}()
alertCtx, cancelFn := context.WithTimeout(context.Background(), alertTimeout)
cancelChan <- cancelFn
span := opentracing.StartSpan("alert execution")
alertCtx = opentracing.ContextWithSpan(alertCtx, span)
evalContext := NewEvalContext(alertCtx, job.Rule)
evalContext.Ctx = alertCtx
go func() {
defer func() {
if err := recover(); err != nil {
e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
ext.Error.Set(span, true)
span.LogFields(
tlog.Error(fmt.Errorf("%v", err)),
tlog.String("message", "failed to execute alert rule. panic was recovered."),
)
span.Finish()
close(attemptChan)
}
}()
e.evalHandler.Eval(evalContext)
span.SetTag("alertId", evalContext.Rule.Id)
span.SetTag("dashboardId", evalContext.Rule.DashboardId)
span.SetTag("firing", evalContext.Firing)
span.SetTag("nodatapoints", evalContext.NoDataFound)
span.SetTag("attemptID", attemptID)
if evalContext.Error != nil {
ext.Error.Set(span, true)
span.LogFields(
tlog.Error(evalContext.Error),
tlog.String("message", "alerting execution attempt failed"),
)
if attemptID < alertMaxAttempts {
span.Finish()
e.log.Debug("Job Execution attempt triggered retry", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
attemptChan <- (attemptID + 1)
return
}
}
// create new context with timeout for notifications
resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), resultHandleTimeout)
cancelChan <- resultHandleCancelFn
// override the context used for evaluation with a new context for notifications.
// This makes it possible for notifiers to execute when datasources
// dont respond within the timeout limit. We should rewrite this so notifications
// dont reuse the evalContext and get its own context.
evalContext.Ctx = resultHandleCtx
evalContext.Rule.State = evalContext.GetNewState()
e.resultHandler.Handle(evalContext)
span.Finish()
e.log.Debug("Job Execution completed", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
close(attemptChan)
}()
}