diff --git a/pkg/services/ngalert/metrics/metrics.go b/pkg/services/ngalert/metrics/metrics.go
index 6ecbb4d4fbc..f33b123eabc 100644
--- a/pkg/services/ngalert/metrics/metrics.go
+++ b/pkg/services/ngalert/metrics/metrics.go
@@ -29,8 +29,13 @@ type Metrics struct {
 	*metrics.Alerts
 	AlertState *prometheus.GaugeVec
 	// Registerer is for use by subcomponents which register their own metrics.
-	Registerer      prometheus.Registerer
-	RequestDuration *prometheus.HistogramVec
+	Registerer           prometheus.Registerer
+	RequestDuration      *prometheus.HistogramVec
+	ActiveConfigurations prometheus.Gauge
+	EvalTotal            *prometheus.CounterVec
+	EvalFailures         *prometheus.CounterVec
+	EvalDuration         *prometheus.SummaryVec
+	GroupRules           *prometheus.GaugeVec
 }
 
 func init() {
@@ -68,6 +73,55 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
 			},
 			[]string{"method", "route", "status_code", "backend"},
 		),
+		ActiveConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
+			Namespace: "grafana",
+			Subsystem: "alerting",
+			Name:      "active_configurations",
+			Help:      "The number of active, non-default Alertmanager configurations for Grafana-managed alerts.",
+		}),
+		// TODO: once rule groups support multiple rules, consider partitioning
+		// on rule group as well as tenant, similar to loki|cortex.
+		EvalTotal: promauto.With(r).NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: "grafana",
+				Subsystem: "alerting",
+				Name:      "rule_evaluations_total",
+				Help:      "The total number of rule evaluations.",
+			},
+			[]string{"user"},
+		),
+		// TODO: once rule groups support multiple rules, consider partitioning
+		// on rule group as well as tenant, similar to loki|cortex.
+		EvalFailures: promauto.With(r).NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: "grafana",
+				Subsystem: "alerting",
+				Name:      "rule_evaluation_failures_total",
+				Help:      "The total number of rule evaluation failures.",
+			},
+			[]string{"user"},
+		),
+		EvalDuration: promauto.With(r).NewSummaryVec(
+			prometheus.SummaryOpts{
+				Namespace:  "grafana",
+				Subsystem:  "alerting",
+				Name:       "rule_evaluation_duration_seconds",
+				Help:       "The duration for a rule to execute.",
+				Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
+			},
+			[]string{"user"},
+		),
+		// TODO: once rule groups support multiple rules, consider partitioning
+		// on rule group as well as tenant, similar to loki|cortex.
+		GroupRules: promauto.With(r).NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: "grafana",
+				Subsystem: "alerting",
+				Name:      "rule_group_rules",
+				Help:      "The number of alert rules per tenant.",
+			},
+			[]string{"user"},
+		),
 	}
 }
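Not part of the patch, for reviewer context: the scheduler hunks below exercise these instruments once per rule evaluation. A minimal sketch of that pattern, assuming only the constructor above; recordEval and its signature are hypothetical names for illustration, not code from this change.

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)

// recordEval is a hypothetical helper, not part of this change. It mirrors
// what ruleRoutine does in the scheduler hunks below: count every evaluation,
// observe its duration in seconds, and count failures, all labelled by
// tenant (the rule's org ID).
func recordEval(m *metrics.Metrics, orgID int64, start time.Time, evalErr error) {
	tenant := fmt.Sprint(orgID)
	m.EvalTotal.WithLabelValues(tenant).Inc()
	m.EvalDuration.WithLabelValues(tenant).Observe(time.Since(start).Seconds())
	if evalErr != nil {
		m.EvalFailures.WithLabelValues(tenant).Inc()
	}
}

func main() {
	m := metrics.NewMetrics(prometheus.NewRegistry())
	recordEval(m, 1, time.Now().Add(-250*time.Millisecond), nil)
}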
diff --git a/pkg/services/ngalert/ngalert.go b/pkg/services/ngalert/ngalert.go
index 4f2a1c5558e..5e5afb69c8d 100644
--- a/pkg/services/ngalert/ngalert.go
+++ b/pkg/services/ngalert/ngalert.go
@@ -65,7 +65,11 @@ func (ng *AlertNG) Init() error {
 	ng.stateManager = state.NewManager(ng.Log, ng.Metrics)
 	baseInterval := baseIntervalSeconds * time.Second
 
-	store := &store.DBstore{BaseInterval: baseInterval, DefaultIntervalSeconds: defaultIntervalSeconds, SQLStore: ng.SQLStore}
+	store := &store.DBstore{
+		BaseInterval:           baseInterval,
+		DefaultIntervalSeconds: defaultIntervalSeconds,
+		SQLStore:               ng.SQLStore,
+	}
 
 	var err error
 	ng.Alertmanager, err = notifier.New(ng.Cfg, store, ng.Metrics)
@@ -82,6 +86,7 @@ func (ng *AlertNG) Init() error {
 		InstanceStore: store,
 		RuleStore:     store,
 		Notifier:      ng.Alertmanager,
+		Metrics:       ng.Metrics,
 	}
 	ng.schedule = schedule.NewScheduler(schedCfg, ng.DataService)
diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go
index abf06eaaccd..8710630236e 100644
--- a/pkg/services/ngalert/notifier/alertmanager.go
+++ b/pkg/services/ngalert/notifier/alertmanager.go
@@ -212,6 +212,7 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er
 	if err != nil {
 		return err
 	}
+	am.Metrics.ActiveConfigurations.Set(1)
 
 	return nil
 }
@@ -253,6 +254,12 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error {
 		return fmt.Errorf("unable to reload configuration: %w", err)
 	}
 
+	if q.Result.Default {
+		am.Metrics.ActiveConfigurations.Set(0)
+	} else {
+		am.Metrics.ActiveConfigurations.Set(1)
+	}
+
 	return nil
 }
diff --git a/pkg/services/ngalert/notifier/alertmanager_test.go b/pkg/services/ngalert/notifier/alertmanager_test.go
index b259d1b8426..c6d0cec0665 100644
--- a/pkg/services/ngalert/notifier/alertmanager_test.go
+++ b/pkg/services/ngalert/notifier/alertmanager_test.go
@@ -35,6 +35,7 @@ func setupAMTest(t *testing.T) *Alertmanager {
 		DataPath: dir,
 	}
 
+	m := metrics.NewMetrics(prometheus.NewRegistry())
 	sqlStore := sqlstore.InitTestDB(t)
 	store := &store.DBstore{
 		BaseInterval: 10 * time.Second,
@@ -42,7 +43,7 @@
 		SQLStore: sqlStore,
 	}
 
-	am, err := New(cfg, store, metrics.NewMetrics(prometheus.NewRegistry()))
+	am, err := New(cfg, store, m)
 	require.NoError(t, err)
 	return am
 }
diff --git a/pkg/services/ngalert/schedule/schedule.go b/pkg/services/ngalert/schedule/schedule.go
index b7f2490176f..40862f2fc81 100644
--- a/pkg/services/ngalert/schedule/schedule.go
+++ b/pkg/services/ngalert/schedule/schedule.go
@@ -8,6 +8,7 @@ import (
 	"github.com/benbjohnson/clock"
 	apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
+	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 	"golang.org/x/sync/errgroup"
 
 	"github.com/grafana/grafana/pkg/infra/log"
@@ -39,7 +40,6 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
 	sch.log.Debug("alert rule routine started", "key", key)
 
 	evalRunning := false
-	var start, end time.Time
 	var attempt int64
 	var alertRule *models.AlertRule
 	for {
@@ -50,7 +50,7 @@
 		}
 
 		evaluate := func(attempt int64) error {
-			start = timeNow()
+			start := timeNow()
 
 			// fetch latest alert rule version
 			if alertRule == nil || alertRule.Version < ctx.version {
@@ -70,8 +70,16 @@
 				Data:      alertRule.Data,
 			}
 			results, err := sch.evaluator.ConditionEval(&condition, ctx.now, sch.dataService)
-			end = timeNow()
+			var (
+				end    = timeNow()
+				tenant = fmt.Sprint(alertRule.OrgID)
+				dur    = end.Sub(start).Seconds()
+			)
+
+			sch.metrics.EvalTotal.WithLabelValues(tenant).Inc()
+			sch.metrics.EvalDuration.WithLabelValues(tenant).Observe(dur)
 			if err != nil {
+				sch.metrics.EvalFailures.WithLabelValues(tenant).Inc()
 				// consider saving alert instance on error
 				sch.log.Error("failed to evaluate alert rule", "title", alertRule.Title,
 					"key", key, "attempt", attempt, "now", ctx.now, "duration", end.Sub(start), "error", err)
@@ -153,6 +161,7 @@ type schedule struct {
 	dataService *tsdb.Service
 
 	notifier Notifier
+	metrics  *metrics.Metrics
 }
 
 // SchedulerCfg is the scheduler configuration.
@@ -167,6 +176,7 @@ type SchedulerCfg struct {
 	RuleStore     store.RuleStore
 	InstanceStore store.InstanceStore
 	Notifier      Notifier
+	Metrics       *metrics.Metrics
 }
 
 // NewScheduler returns a new schedule.
@@ -186,6 +196,7 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service) *schedule {
 		instanceStore: cfg.InstanceStore,
 		dataService:   dataService,
 		notifier:      cfg.Notifier,
+		metrics:       cfg.Metrics,
 	}
 	return &sch
 }
diff --git a/pkg/services/ngalert/schedule/schedule_test.go b/pkg/services/ngalert/schedule/schedule_test.go
index b53fb958af6..7a6be93c55c 100644
--- a/pkg/services/ngalert/schedule/schedule_test.go
+++ b/pkg/services/ngalert/schedule/schedule_test.go
@@ -14,6 +14,7 @@ import (
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 	"github.com/grafana/grafana/pkg/services/ngalert/tests"
+	"github.com/prometheus/client_golang/prometheus"
 
 	"github.com/grafana/grafana/pkg/services/ngalert/state"
@@ -104,6 +105,7 @@ func TestWarmStateCache(t *testing.T) {
 		RuleStore:     dbstore,
 		InstanceStore: dbstore,
+		Metrics:       metrics.NewMetrics(prometheus.NewRegistry()),
 	}
 	sched := schedule.NewScheduler(schedCfg, nil)
 	st := state.NewManager(schedCfg.Logger, nilMetrics)
@@ -151,6 +153,7 @@ func TestAlertingTicker(t *testing.T) {
 		RuleStore:     dbstore,
 		InstanceStore: dbstore,
 		Logger:        log.New("ngalert schedule test"),
+		Metrics:       metrics.NewMetrics(prometheus.NewRegistry()),
 	}
 	sched := schedule.NewScheduler(schedCfg, nil)
diff --git a/pkg/services/ngalert/state/cache.go b/pkg/services/ngalert/state/cache.go
index cf847b478e9..6d480bf3917 100644
--- a/pkg/services/ngalert/state/cache.go
+++ b/pkg/services/ngalert/state/cache.go
@@ -149,8 +149,9 @@ func (c *cache) trim() {
 		eval.Error: 0,
 	}
 
-	for _, org := range c.states {
-		for _, rule := range org {
+	for org, orgMap := range c.states {
+		c.metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(len(orgMap)))
+		for _, rule := range orgMap {
 			for _, state := range rule {
 				if len(state.Results) > 100 {
 					newResults := make([]Evaluation, 100)
diff --git a/pkg/services/ngalert/store/alertmanager.go b/pkg/services/ngalert/store/alertmanager.go
index 6ee1c8f92b0..1434ca88e2a 100644
--- a/pkg/services/ngalert/store/alertmanager.go
+++ b/pkg/services/ngalert/store/alertmanager.go
@@ -13,29 +13,21 @@ var (
 	ErrNoAlertmanagerConfiguration = fmt.Errorf("could not find an Alertmanager configuration")
 )
 
-func getLatestAlertmanagerConfiguration(sess *sqlstore.DBSession) (*models.AlertConfiguration, error) {
-	c := &models.AlertConfiguration{}
-	// The ID is already an auto incremental column, using the ID as an order should guarantee the latest.
-	ok, err := sess.Desc("id").Limit(1).Get(c)
-	if err != nil {
-		return nil, err
-	}
-
-	if !ok {
-		return nil, ErrNoAlertmanagerConfiguration
-	}
-
-	return c, nil
-}
-
 // GetLatestAlertmanagerConfiguration returns the latest version of the alertmanager configuration.
 // It returns ErrNoAlertmanagerConfiguration if no configuration is found.
 func (st *DBstore) GetLatestAlertmanagerConfiguration(query *models.GetLatestAlertmanagerConfigurationQuery) error {
 	return st.SQLStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
-		c, err := getLatestAlertmanagerConfiguration(sess)
+		c := &models.AlertConfiguration{}
+		// The ID is an auto-incrementing column, so ordering by it descending guarantees the latest row.
+		ok, err := sess.Desc("id").Limit(1).Get(c)
 		if err != nil {
 			return err
 		}
+
+		if !ok {
+			return ErrNoAlertmanagerConfiguration
+		}
+
 		query.Result = c
 		return nil
 	})
diff --git a/pkg/services/ngalert/store/database.go b/pkg/services/ngalert/store/database.go
index b9b10901f25..933a1c9e56c 100644
--- a/pkg/services/ngalert/store/database.go
+++ b/pkg/services/ngalert/store/database.go
@@ -27,5 +27,5 @@ type DBstore struct {
 	BaseInterval time.Duration
 	// default alert definition interval
 	DefaultIntervalSeconds int64
-	SQLStore               *sqlstore.SQLStore `inject:""`
+	SQLStore               *sqlstore.SQLStore
 }
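Also not in the patch: because these are plain client_golang collectors, tests can read values straight back with the prometheus testutil package. A sketch under that assumption; the test name and placement are hypothetical, and require comes from the stretchr/testify dependency already used by the test files above.

package metrics_test

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"

	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)

// Hypothetical test, not included in this change: reads values straight off
// the collectors to verify the gauge and counter wiring.
func TestMetricsInstrumentation(t *testing.T) {
	m := metrics.NewMetrics(prometheus.NewRegistry())

	m.ActiveConfigurations.Set(1)
	require.Equal(t, float64(1), testutil.ToFloat64(m.ActiveConfigurations))

	m.EvalTotal.WithLabelValues("1").Inc()
	require.Equal(t, float64(1), testutil.ToFloat64(m.EvalTotal.WithLabelValues("1")))
}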