The open and composable observability and data visualization platform. Visualize metrics, logs, and traces from multiple sources like Prometheus, Loki, Elasticsearch, InfluxDB, Postgres and many more.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
grafana/pkg/services/alerting/engine.go

290 lines
9.7 KiB

package alerting
import (
"context"
"errors"
"fmt"
"time"
"github.com/benbjohnson/clock"
"github.com/prometheus/client_golang/prometheus"
"go.opentelemetry.io/otel/attribute"
"golang.org/x/sync/errgroup"
"github.com/grafana/grafana/pkg/infra/localcache"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/infra/usagestats"
"github.com/grafana/grafana/pkg/models"
"github.com/grafana/grafana/pkg/services/alerting/metrics"
"github.com/grafana/grafana/pkg/services/dashboards"
"github.com/grafana/grafana/pkg/services/datasources"
Encryption: Refactor securejsondata.SecureJsonData to stop relying on global functions (#38865) * Encryption: Add support to encrypt/decrypt sjd * Add datasources.Service as a proxy to datasources db operations * Encrypt ds.SecureJsonData before calling SQLStore * Move ds cache code into ds service * Fix tlsmanager tests * Fix pluginproxy tests * Remove some securejsondata.GetEncryptedJsonData usages * Add pluginsettings.Service as a proxy for plugin settings db operations * Add AlertNotificationService as a proxy for alert notification db operations * Remove some securejsondata.GetEncryptedJsonData usages * Remove more securejsondata.GetEncryptedJsonData usages * Fix lint errors * Minor fixes * Remove encryption global functions usages from ngalert * Fix lint errors * Minor fixes * Minor fixes * Remove securejsondata.DecryptedValue usage * Refactor the refactor * Remove securejsondata.DecryptedValue usage * Move securejsondata to migrations package * Move securejsondata to migrations package * Minor fix * Fix integration test * Fix integration tests * Undo undesired changes * Fix tests * Add context.Context into encryption methods * Fix tests * Fix tests * Fix tests * Trigger CI * Fix test * Add names to params of encryption service interface * Remove bus from CacheServiceImpl * Add logging * Add keys to logger Co-authored-by: Emil Tullstedt <emil.tullstedt@grafana.com> * Add missing key to logger Co-authored-by: Emil Tullstedt <emil.tullstedt@grafana.com> * Undo changes in markdown files * Fix formatting * Add context to secrets service * Rename decryptSecureJsonData to decryptSecureJsonDataFn * Name args in GetDecryptedValueFn * Add template back to NewAlertmanagerNotifier * Copy GetDecryptedValueFn to ngalert * Add logging to pluginsettings * Fix pluginsettings test Co-authored-by: Tania B <yalyna.ts@gmail.com> Co-authored-by: Emil Tullstedt <emil.tullstedt@grafana.com>
4 years ago
"github.com/grafana/grafana/pkg/services/encryption"
"github.com/grafana/grafana/pkg/services/notifications"
"github.com/grafana/grafana/pkg/services/rendering"
"github.com/grafana/grafana/pkg/setting"
"github.com/grafana/grafana/pkg/tsdb/legacydata"
)
// AlertEngine is the background process that
// schedules alert evaluations and makes sure notifications
// are sent.
type AlertEngine struct {
RenderService rendering.Service
RequestValidator models.PluginRequestValidator
DataService legacydata.RequestHandler
Cfg *setting.Cfg
execQueue chan *Job
ticker *Ticker
scheduler scheduler
evalHandler evalHandler
ruleReader ruleReader
log log.Logger
resultHandler resultHandler
usageStatsService usagestats.Service
tracer tracing.Tracer
AlertStore AlertStore
dashAlertExtractor DashAlertExtractor
dashboardService dashboards.DashboardService
datasourceService datasources.DataSourceService
}
// IsDisabled returns true if the alerting service is disabled for this instance.
func (e *AlertEngine) IsDisabled() bool {
return setting.AlertingEnabled == nil || !*setting.AlertingEnabled || !setting.ExecuteAlerts || e.Cfg.UnifiedAlerting.IsEnabled()
}
// ProvideAlertEngine returns a new AlertEngine.
func ProvideAlertEngine(renderer rendering.Service, requestValidator models.PluginRequestValidator,
dataService legacydata.RequestHandler, usageStatsService usagestats.Service, encryptionService encryption.Internal,
notificationService *notifications.NotificationService, tracer tracing.Tracer, store AlertStore, cfg *setting.Cfg,
dashAlertExtractor DashAlertExtractor, dashboardService dashboards.DashboardService, cacheService *localcache.CacheService, dsService datasources.DataSourceService) *AlertEngine {
e := &AlertEngine{
Cfg: cfg,
RenderService: renderer,
RequestValidator: requestValidator,
DataService: dataService,
usageStatsService: usageStatsService,
tracer: tracer,
AlertStore: store,
dashAlertExtractor: dashAlertExtractor,
dashboardService: dashboardService,
datasourceService: dsService,
}
e.execQueue = make(chan *Job, 1000)
e.scheduler = newScheduler()
e.evalHandler = NewEvalHandler(e.DataService)
e.ruleReader = newRuleReader(store)
e.log = log.New("alerting.engine")
e.resultHandler = newResultHandler(e.RenderService, store, notificationService, encryptionService.GetDecryptedValue)
e.registerUsageMetrics()
return e
}
// Run starts the alerting service background process.
func (e *AlertEngine) Run(ctx context.Context) error {
reg := prometheus.WrapRegistererWithPrefix("legacy_", prometheus.DefaultRegisterer)
e.ticker = NewTicker(clock.New(), 1*time.Second, metrics.NewTickerMetrics(reg))
defer e.ticker.Stop()
alertGroup, ctx := errgroup.WithContext(ctx)
alertGroup.Go(func() error { return e.alertingTicker(ctx) })
alertGroup.Go(func() error { return e.runJobDispatcher(ctx) })
err := alertGroup.Wait()
return err
}
func (e *AlertEngine) alertingTicker(grafanaCtx context.Context) error {
defer func() {
if err := recover(); err != nil {
e.log.Error("Scheduler Panic: stopping alertingTicker", "error", err, "stack", log.Stack(1))
}
}()
tickIndex := 0
for {
select {
case <-grafanaCtx.Done():
return grafanaCtx.Err()
case tick := <-e.ticker.C:
// TEMP SOLUTION update rules ever tenth tick
if tickIndex%10 == 0 {
e.scheduler.Update(e.ruleReader.fetch(grafanaCtx))
}
e.scheduler.Tick(tick, e.execQueue)
tickIndex++
}
}
}
func (e *AlertEngine) runJobDispatcher(grafanaCtx context.Context) error {
dispatcherGroup, alertCtx := errgroup.WithContext(grafanaCtx)
for {
select {
case <-grafanaCtx.Done():
return dispatcherGroup.Wait()
case job := <-e.execQueue:
dispatcherGroup.Go(func() error { return e.processJobWithRetry(alertCtx, job) })
}
}
}
var (
Remove redundancy in variable declarations (golint) This commit fixes the following golint warnings: pkg/api/avatar/avatar.go:229:12: should omit type *http.Client from declaration of var client; it will be inferred from the right-hand side pkg/login/brute_force_login_protection.go:13:26: should omit type time.Duration from declaration of var loginAttemptsWindow; it will be inferred from the right-hand side pkg/metrics/graphitebridge/graphite.go:58:26: should omit type []string from declaration of var metricCategoryPrefix; it will be inferred from the right-hand side pkg/metrics/graphitebridge/graphite.go:69:22: should omit type []string from declaration of var trimMetricPrefix; it will be inferred from the right-hand side pkg/models/alert.go:37:36: should omit type error from declaration of var ErrCannotChangeStateOnPausedAlert; it will be inferred from the right-hand side pkg/models/alert.go:38:36: should omit type error from declaration of var ErrRequiresNewState; it will be inferred from the right-hand side pkg/models/datasource.go:61:28: should omit type map[string]bool from declaration of var knownDatasourcePlugins; it will be inferred from the right-hand side pkg/plugins/update_checker.go:16:13: should omit type http.Client from declaration of var httpClient; it will be inferred from the right-hand side pkg/services/alerting/engine.go:103:24: should omit type time.Duration from declaration of var unfinishedWorkTimeout; it will be inferred from the right-hand side pkg/services/alerting/engine.go:105:19: should omit type time.Duration from declaration of var alertTimeout; it will be inferred from the right-hand side pkg/services/alerting/engine.go:106:19: should omit type int from declaration of var alertMaxAttempts; it will be inferred from the right-hand side pkg/services/alerting/notifier.go:143:23: should omit type map[string]*NotifierPlugin from declaration of var notifierFactories; it will be inferred from the right-hand side pkg/services/alerting/rule.go:136:24: should omit type map[string]ConditionFactory from declaration of var conditionFactories; it will be inferred from the right-hand side pkg/services/alerting/conditions/evaluator.go:12:15: should omit type []string from declaration of var defaultTypes; it will be inferred from the right-hand side pkg/services/alerting/conditions/evaluator.go:13:15: should omit type []string from declaration of var rangedTypes; it will be inferred from the right-hand side pkg/services/alerting/notifiers/opsgenie.go:44:19: should omit type string from declaration of var opsgenieAlertURL; it will be inferred from the right-hand side pkg/services/alerting/notifiers/pagerduty.go:43:23: should omit type string from declaration of var pagerdutyEventApiUrl; it will be inferred from the right-hand side pkg/services/alerting/notifiers/telegram.go:21:17: should omit type string from declaration of var telegramApiUrl; it will be inferred from the right-hand side pkg/services/provisioning/dashboards/config_reader_test.go:11:24: should omit type string from declaration of var simpleDashboardConfig; it will be inferred from the right-hand side pkg/services/provisioning/dashboards/config_reader_test.go:12:24: should omit type string from declaration of var oldVersion; it will be inferred from the right-hand side pkg/services/provisioning/dashboards/config_reader_test.go:13:24: should omit type string from declaration of var brokenConfigs; it will be inferred from the right-hand side pkg/services/provisioning/dashboards/file_reader.go:22:30: should omit type time.Duration from declaration of var checkDiskForChangesInterval; it will be inferred from the right-hand side pkg/services/provisioning/dashboards/file_reader.go:24:23: should omit type error from declaration of var ErrFolderNameMissing; it will be inferred from the right-hand side pkg/services/provisioning/datasources/config_reader_test.go:15:34: should omit type string from declaration of var twoDatasourcesConfig; it will be inferred from the right-hand side pkg/services/provisioning/datasources/config_reader_test.go:16:34: should omit type string from declaration of var twoDatasourcesConfigPurgeOthers; it will be inferred from the right-hand side pkg/services/provisioning/datasources/config_reader_test.go:17:34: should omit type string from declaration of var doubleDatasourcesConfig; it will be inferred from the right-hand side pkg/services/provisioning/datasources/config_reader_test.go:18:34: should omit type string from declaration of var allProperties; it will be inferred from the right-hand side pkg/services/provisioning/datasources/config_reader_test.go:19:34: should omit type string from declaration of var versionZero; it will be inferred from the right-hand side pkg/services/provisioning/datasources/config_reader_test.go:20:34: should omit type string from declaration of var brokenYaml; it will be inferred from the right-hand side pkg/services/sqlstore/stats.go:16:25: should omit type time.Duration from declaration of var activeUserTimeLimit; it will be inferred from the right-hand side pkg/services/sqlstore/migrator/mysql_dialect.go:69:14: should omit type bool from declaration of var hasLen1; it will be inferred from the right-hand side pkg/services/sqlstore/migrator/mysql_dialect.go:70:14: should omit type bool from declaration of var hasLen2; it will be inferred from the right-hand side pkg/services/sqlstore/migrator/postgres_dialect.go:95:14: should omit type bool from declaration of var hasLen1; it will be inferred from the right-hand side pkg/services/sqlstore/migrator/postgres_dialect.go:96:14: should omit type bool from declaration of var hasLen2; it will be inferred from the right-hand side pkg/setting/setting.go:42:15: should omit type string from declaration of var Env; it will be inferred from the right-hand side pkg/setting/setting.go:161:18: should omit type bool from declaration of var LdapAllowSignup; it will be inferred from the right-hand side pkg/setting/setting.go:473:30: should omit type bool from declaration of var skipStaticRootValidation; it will be inferred from the right-hand side pkg/tsdb/interval.go:14:21: should omit type time.Duration from declaration of var defaultMinInterval; it will be inferred from the right-hand side pkg/tsdb/interval.go:15:21: should omit type time.Duration from declaration of var year; it will be inferred from the right-hand side pkg/tsdb/interval.go:16:21: should omit type time.Duration from declaration of var day; it will be inferred from the right-hand side pkg/tsdb/cloudwatch/credentials.go:26:24: should omit type map[string]cache from declaration of var awsCredentialCache; it will be inferred from the right-hand side pkg/tsdb/influxdb/query.go:15:27: should omit type *regexp.Regexp from declaration of var regexpOperatorPattern; it will be inferred from the right-hand side pkg/tsdb/influxdb/query.go:16:27: should omit type *regexp.Regexp from declaration of var regexpMeasurementPattern; it will be inferred from the right-hand side pkg/tsdb/mssql/mssql_test.go:25:14: should omit type string from declaration of var serverIP; it will be inferred from the right-hand side
7 years ago
unfinishedWorkTimeout = time.Second * 5
)
func (e *AlertEngine) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
defer func() {
if err := recover(); err != nil {
e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
}
}()
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts*2)
attemptChan := make(chan int, 1)
// Initialize with first attemptID=1
attemptChan <- 1
job.SetRunning(true)
for {
select {
case <-grafanaCtx.Done():
// In case grafana server context is cancel, let a chance to job processing
// to finish gracefully - by waiting a timeout duration - before forcing its end.
unfinishedWorkTimer := time.NewTimer(unfinishedWorkTimeout)
select {
case <-unfinishedWorkTimer.C:
return e.endJob(grafanaCtx.Err(), cancelChan, job)
case <-attemptChan:
return e.endJob(nil, cancelChan, job)
}
case attemptID, more := <-attemptChan:
if !more {
return e.endJob(nil, cancelChan, job)
}
go e.processJob(attemptID, attemptChan, cancelChan, job)
}
}
}
func (e *AlertEngine) endJob(err error, cancelChan chan context.CancelFunc, job *Job) error {
job.SetRunning(false)
close(cancelChan)
for cancelFn := range cancelChan {
cancelFn()
}
return err
}
func (e *AlertEngine) processJob(attemptID int, attemptChan chan int, cancelChan chan context.CancelFunc, job *Job) {
defer func() {
if err := recover(); err != nil {
e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
}
}()
alertCtx, cancelFn := context.WithTimeout(context.Background(), setting.AlertingEvaluationTimeout)
cancelChan <- cancelFn
alertCtx, span := e.tracer.Start(alertCtx, "alert execution")
evalContext := NewEvalContext(alertCtx, job.Rule, e.RequestValidator, e.AlertStore, e.dashboardService, e.datasourceService)
evalContext.Ctx = alertCtx
go func() {
defer func() {
if err := recover(); err != nil {
e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
span.RecordError(fmt.Errorf("%v", err))
span.AddEvents(
[]string{"error", "message"},
[]tracing.EventValue{
{Str: fmt.Sprintf("%v", err)},
{Str: "failed to execute alert rule. panic was recovered."},
})
span.End()
close(attemptChan)
}
}()
e.evalHandler.Eval(evalContext)
span.SetAttributes("alertId", evalContext.Rule.ID, attribute.Key("alertId").Int64(evalContext.Rule.ID))
span.SetAttributes("dashboardId", evalContext.Rule.DashboardID, attribute.Key("dashboardId").Int64(evalContext.Rule.DashboardID))
span.SetAttributes("firing", evalContext.Firing, attribute.Key("firing").Bool(evalContext.Firing))
span.SetAttributes("nodatapoints", evalContext.NoDataFound, attribute.Key("nodatapoints").Bool(evalContext.NoDataFound))
span.SetAttributes("attemptID", attemptID, attribute.Key("attemptID").Int(attemptID))
if evalContext.Error != nil {
span.RecordError(evalContext.Error)
span.AddEvents(
[]string{"error", "message"},
[]tracing.EventValue{
{Str: fmt.Sprintf("%v", evalContext.Error)},
{Str: "alerting execution attempt failed"},
})
if attemptID < setting.AlertingMaxAttempts {
span.End()
e.log.Debug("Job Execution attempt triggered retry", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.ID, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
attemptChan <- (attemptID + 1)
return
}
}
// create new context with timeout for notifications
resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), setting.AlertingNotificationTimeout)
cancelChan <- resultHandleCancelFn
// override the context used for evaluation with a new context for notifications.
// This makes it possible for notifiers to execute when datasources
// don't respond within the timeout limit. We should rewrite this so notifications
// don't reuse the evalContext and get its own context.
evalContext.Ctx = resultHandleCtx
evalContext.Rule.State = evalContext.GetNewState()
if err := e.resultHandler.handle(evalContext); err != nil {
switch {
case errors.Is(err, context.Canceled):
e.log.Debug("Result handler returned context.Canceled")
case errors.Is(err, context.DeadlineExceeded):
e.log.Debug("Result handler returned context.DeadlineExceeded")
default:
e.log.Error("Failed to handle result", "err", err)
}
}
span.End()
e.log.Debug("Job Execution completed", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.ID, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
close(attemptChan)
}()
}
func (e *AlertEngine) registerUsageMetrics() {
e.usageStatsService.RegisterMetricsFunc(func(ctx context.Context) (map[string]interface{}, error) {
alertingUsageStats, err := e.QueryUsageStats(ctx)
if err != nil {
return nil, err
}
alertingOtherCount := 0
metrics := map[string]interface{}{}
for dsType, usageCount := range alertingUsageStats.DatasourceUsage {
if e.usageStatsService.ShouldBeReported(ctx, dsType) {
metrics[fmt.Sprintf("stats.alerting.ds.%s.count", dsType)] = usageCount
} else {
alertingOtherCount += usageCount
}
}
metrics["stats.alerting.ds.other.count"] = alertingOtherCount
return metrics, nil
})
}