Alerting/ruler metrics (#34144)

* adds active configurations metric

* adds rule evaluation metrics

* adds ruler metrics

* addresses PR feedback
Owen Diehl, 4 years ago, committed by GitHub
parent eb74994b8b
commit 1367f7171e
  1. pkg/services/ngalert/metrics/metrics.go (57)
  2. pkg/services/ngalert/ngalert.go (7)
  3. pkg/services/ngalert/notifier/alertmanager.go (7)
  4. pkg/services/ngalert/notifier/alertmanager_test.go (3)
  5. pkg/services/ngalert/schedule/schedule.go (17)
  6. pkg/services/ngalert/schedule/schedule_test.go (3)
  7. pkg/services/ngalert/state/cache.go (5)
  8. pkg/services/ngalert/store/alertmanager.go (24)
  9. pkg/services/ngalert/store/database.go (2)

@@ -29,8 +29,13 @@ type Metrics struct {
*metrics.Alerts
AlertState *prometheus.GaugeVec
// Registerer is for use by subcomponents which register their own metrics.
Registerer prometheus.Registerer
RequestDuration *prometheus.HistogramVec
Registerer prometheus.Registerer
RequestDuration *prometheus.HistogramVec
ActiveConfigurations prometheus.Gauge
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.SummaryVec
GroupRules *prometheus.GaugeVec
}
func init() {
@@ -68,6 +73,54 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
},
[]string{"method", "route", "status_code", "backend"},
),
ActiveConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "active_configurations",
Help: "The number of active, non default alertmanager configurations for grafana managed alerts",
}),
// TODO: once rule groups support multiple rules, consider partitioning
// on rule group as well as tenant, similar to loki|cortex.
EvalTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "rule_evaluations_total",
Help: "The total number of rule evaluations.",
},
[]string{"user"},
),
// TODO: once rule groups support multiple rules, consider partitioning
// on rule group as well as tenant, similar to loki|cortex.
EvalFailures: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "rule_evaluation_failures_total",
Help: "The total number of rule evaluation failures.",
},
[]string{"user"},
),
EvalDuration: prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: "grafana",
Subsystem: "alerting",
Help: "The duration for a rule to execute.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
},
[]string{"user"},
),
// TODO: once rule groups support multiple rules, consider partitioning
// on rule group as well as tenant, similar to loki|cortex.
GroupRules: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "rule_group_rules",
Help: "The number of rules.",
},
[]string{"user"},
),
}
}
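The rule evaluation vectors above are built with the plain prometheus.New*Vec constructors rather than promauto, so registration has to happen separately; that step is not visible in this hunk, and the helper below is only a hypothetical sketch of what it could look like, assuming the same registerer r and the Metrics value returned by NewMetrics.

package metrics

import "github.com/prometheus/client_golang/prometheus"

// registerRuleMetrics is a hypothetical helper (not in this diff): it attaches
// the vectors created with the plain constructors to the same registerer that
// promauto already used for ActiveConfigurations.
func registerRuleMetrics(r prometheus.Registerer, m *Metrics) {
	r.MustRegister(m.EvalTotal, m.EvalFailures, m.EvalDuration, m.GroupRules)
}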

@@ -65,7 +65,11 @@ func (ng *AlertNG) Init() error {
ng.stateManager = state.NewManager(ng.Log, ng.Metrics)
baseInterval := baseIntervalSeconds * time.Second
store := &store.DBstore{BaseInterval: baseInterval, DefaultIntervalSeconds: defaultIntervalSeconds, SQLStore: ng.SQLStore}
store := &store.DBstore{
BaseInterval: baseInterval,
DefaultIntervalSeconds: defaultIntervalSeconds,
SQLStore: ng.SQLStore,
}
var err error
ng.Alertmanager, err = notifier.New(ng.Cfg, store, ng.Metrics)
@@ -82,6 +86,7 @@ func (ng *AlertNG) Init() error {
InstanceStore: store,
RuleStore: store,
Notifier: ng.Alertmanager,
Metrics: ng.Metrics,
}
ng.schedule = schedule.NewScheduler(schedCfg, ng.DataService)
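The same *metrics.Metrics value is handed to both notifier.New and the scheduler config, and its Registerer field is what the struct comment in the first hunk reserves for subcomponents. A sketch of that pattern, assuming a subcomponent that only holds the shared Metrics (the function and metric name below are illustrative, not part of this change):

package example

import (
	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
	"github.com/prometheus/client_golang/prometheus"
)

// registerSubcomponentGauge shows how a subcomponent that receives the shared
// Metrics can register a collector of its own on the same registry that the
// scheduler and Alertmanager report to. The metric name is illustrative only.
func registerSubcomponentGauge(m *metrics.Metrics) prometheus.Gauge {
	g := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "example_subcomponent_items",
		Help:      "Illustrative gauge registered through Metrics.Registerer.",
	})
	m.Registerer.MustRegister(g)
	return g
}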

@@ -212,6 +212,7 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er
if err != nil {
return err
}
am.Metrics.ActiveConfigurations.Set(1)
return nil
}
@@ -253,6 +254,12 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error {
return fmt.Errorf("unable to reload configuration: %w", err)
}
if q.Result.Default {
am.Metrics.ActiveConfigurations.Set(0)
} else {
am.Metrics.ActiveConfigurations.Set(1)
}
return nil
}
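With these two hunks the gauge is effectively a 0/1 flag: 0 while the stored configuration is the default one, 1 once a custom configuration has been saved or applied. A test-style sketch that asserts just the metric behaviour, assuming only the NewMetrics constructor from the first hunk (it does not drive the Alertmanager code paths themselves and is not part of this PR):

package notifier

import (
	"testing"

	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"
)

// TestActiveConfigurationsGauge is a sketch checking the two values the
// Alertmanager sets: default configuration (0) and custom configuration (1).
func TestActiveConfigurationsGauge(t *testing.T) {
	m := metrics.NewMetrics(prometheus.NewRegistry())

	// SyncAndApplyConfigFromDatabase reports 0 for the default configuration...
	m.ActiveConfigurations.Set(0)
	require.Equal(t, float64(0), testutil.ToFloat64(m.ActiveConfigurations))

	// ...and SaveAndApplyConfig reports 1 once a custom one is applied.
	m.ActiveConfigurations.Set(1)
	require.Equal(t, float64(1), testutil.ToFloat64(m.ActiveConfigurations))
}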

@@ -35,6 +35,7 @@ func setupAMTest(t *testing.T) *Alertmanager {
DataPath: dir,
}
m := metrics.NewMetrics(prometheus.NewRegistry())
sqlStore := sqlstore.InitTestDB(t)
store := &store.DBstore{
BaseInterval: 10 * time.Second,
@@ -42,7 +43,7 @@ func setupAMTest(t *testing.T) *Alertmanager {
SQLStore: sqlStore,
}
am, err := New(cfg, store, metrics.NewMetrics(prometheus.NewRegistry()))
am, err := New(cfg, store, m)
require.NoError(t, err)
return am
}

@@ -8,6 +8,7 @@ import (
"github.com/benbjohnson/clock"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"golang.org/x/sync/errgroup"
"github.com/grafana/grafana/pkg/infra/log"
@@ -39,7 +40,6 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
sch.log.Debug("alert rule routine started", "key", key)
evalRunning := false
var start, end time.Time
var attempt int64
var alertRule *models.AlertRule
for {
@@ -50,7 +50,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
}
evaluate := func(attempt int64) error {
start = timeNow()
start := timeNow()
// fetch latest alert rule version
if alertRule == nil || alertRule.Version < ctx.version {
@@ -70,8 +70,16 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
Data: alertRule.Data,
}
results, err := sch.evaluator.ConditionEval(&condition, ctx.now, sch.dataService)
end = timeNow()
var (
end = timeNow()
tenant = fmt.Sprint(alertRule.OrgID)
dur = end.Sub(start).Seconds()
)
sch.metrics.EvalTotal.WithLabelValues(tenant).Inc()
sch.metrics.EvalDuration.WithLabelValues(tenant).Observe(dur)
if err != nil {
sch.metrics.EvalFailures.WithLabelValues(tenant).Inc()
// consider saving alert instance on error
sch.log.Error("failed to evaluate alert rule", "title", alertRule.Title,
"key", key, "attempt", attempt, "now", ctx.now, "duration", end.Sub(start), "error", err)
@@ -153,6 +161,7 @@ type schedule struct {
dataService *tsdb.Service
notifier Notifier
metrics *metrics.Metrics
}
// SchedulerCfg is the scheduler configuration.
@@ -167,6 +176,7 @@ type SchedulerCfg struct {
RuleStore store.RuleStore
InstanceStore store.InstanceStore
Notifier Notifier
Metrics *metrics.Metrics
}
// NewScheduler returns a new schedule.
@@ -186,6 +196,7 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service) *schedule {
instanceStore: cfg.InstanceStore,
dataService: dataService,
notifier: cfg.Notifier,
metrics: cfg.Metrics,
}
return &sch
}
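The instrumentation added to ruleRoutine follows a standard shape: count every evaluation and record its duration up front, and count failures only on the error path, all labelled by the rule's org ID. A condensed sketch of that shape, with eval and orgID standing in for the real ConditionEval call and alertRule.OrgID (the function below is illustrative, not code from this PR):

package schedule

import (
	"fmt"
	"time"

	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)

// instrumentedEval mirrors the pattern in ruleRoutine: total and duration are
// always recorded, failures only when the evaluation returns an error.
func instrumentedEval(m *metrics.Metrics, orgID int64, eval func() error) error {
	start := time.Now()
	err := eval()

	tenant := fmt.Sprint(orgID)
	m.EvalTotal.WithLabelValues(tenant).Inc()
	m.EvalDuration.WithLabelValues(tenant).Observe(time.Since(start).Seconds())
	if err != nil {
		m.EvalFailures.WithLabelValues(tenant).Inc()
	}
	return err
}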

@@ -14,6 +14,7 @@ import (
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/tests"
"github.com/prometheus/client_golang/prometheus"
"github.com/grafana/grafana/pkg/services/ngalert/state"
@@ -104,6 +105,7 @@ func TestWarmStateCache(t *testing.T) {
RuleStore: dbstore,
InstanceStore: dbstore,
Metrics: metrics.NewMetrics(prometheus.NewRegistry()),
}
sched := schedule.NewScheduler(schedCfg, nil)
st := state.NewManager(schedCfg.Logger, nilMetrics)
@@ -151,6 +153,7 @@ func TestAlertingTicker(t *testing.T) {
RuleStore: dbstore,
InstanceStore: dbstore,
Logger: log.New("ngalert schedule test"),
Metrics: metrics.NewMetrics(prometheus.NewRegistry()),
}
sched := schedule.NewScheduler(schedCfg, nil)

@@ -149,8 +149,9 @@ func (c *cache) trim() {
eval.Error: 0,
}
for _, org := range c.states {
for _, rule := range org {
for org, orgMap := range c.states {
c.metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(len(orgMap)))
for _, rule := range orgMap {
for _, state := range rule {
if len(state.Results) > 100 {
newResults := make([]Evaluation, 100)
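Because the gauge is set inside trim(), grafana_alerting_rule_group_rules is refreshed whenever the state cache is trimmed rather than per evaluation. A standalone sketch of the same loop follows; the nested map types are an assumption, since the full cache definition is not shown in this hunk, and only the inner map's length is used:

package state

import (
	"fmt"

	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)

// reportRuleCounts mirrors the loop added to trim() above: one gauge sample per
// org, set to the number of rules that currently have cached state.
func reportRuleCounts(m *metrics.Metrics, states map[int64]map[string]interface{}) {
	for org, orgMap := range states {
		m.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(len(orgMap)))
	}
}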

@@ -13,29 +13,21 @@ var (
ErrNoAlertmanagerConfiguration = fmt.Errorf("could not find an Alertmanager configuration")
)
func getLatestAlertmanagerConfiguration(sess *sqlstore.DBSession) (*models.AlertConfiguration, error) {
c := &models.AlertConfiguration{}
// The ID is an auto-incrementing column; using it as the order should guarantee the latest.
ok, err := sess.Desc("id").Limit(1).Get(c)
if err != nil {
return nil, err
}
if !ok {
return nil, ErrNoAlertmanagerConfiguration
}
return c, nil
}
// GetLatestAlertmanagerConfiguration returns the latest version of the alertmanager configuration.
// It returns ErrNoAlertmanagerConfiguration if no configuration is found.
func (st *DBstore) GetLatestAlertmanagerConfiguration(query *models.GetLatestAlertmanagerConfigurationQuery) error {
return st.SQLStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
c, err := getLatestAlertmanagerConfiguration(sess)
c := &models.AlertConfiguration{}
// The ID is an auto-incrementing column; using it as the order should guarantee the latest.
ok, err := sess.Desc("id").Limit(1).Get(c)
if err != nil {
return err
}
if !ok {
return ErrNoAlertmanagerConfiguration
}
query.Result = c
return nil
})

@@ -27,5 +27,5 @@ type DBstore struct {
BaseInterval time.Duration
// default alert definition interval
DefaultIntervalSeconds int64
SQLStore *sqlstore.SQLStore `inject:""`
SQLStore *sqlstore.SQLStore
}
