Alerting: Add new metrics and tracing to state manager and scheduler (#71398)

* add metrics and tracing to state manager

* propagate tracer to state manager

* add scheduler metrics

* fix backtesting

* add test for state metrics

* remove StateUpdateCount

* update docs

* metrics can be null

* add tracer to new tests
Author: Yuri Tseretyan (committed by GitHub, 2 years ago)
commit 938e26b59f (parent 90e3f516ff)
Changed files (changed line counts in parentheses):

1. docs/sources/shared/alerts/grafana-managed-alerts.md (21)
2. pkg/services/ngalert/api/api.go (8)
3. pkg/services/ngalert/api/api_testing.go (3)
4. pkg/services/ngalert/api/api_testing_test.go (3)
5. pkg/services/ngalert/backtesting/engine.go (4)
6. pkg/services/ngalert/metrics/scheduler.go (26)
7. pkg/services/ngalert/metrics/state.go (12)
8. pkg/services/ngalert/ngalert.go (2)
9. pkg/services/ngalert/schedule/schedule.go (7)
10. pkg/services/ngalert/schedule/schedule_unit_test.go (144)
11. pkg/services/ngalert/state/manager.go (49)
12. pkg/services/ngalert/state/manager_bench_test.go (7)
13. pkg/services/ngalert/state/manager_private_test.go (2)
14. pkg/services/ngalert/state/manager_test.go (40)

@ -24,12 +24,15 @@ Alerting rules can only query backend data sources with alerting enabled:
The alerting engine publishes some internal metrics about itself. You can read more about how Grafana publishes [internal metrics]({{< relref "../../setup-grafana/set-up-grafana-monitoring/" >}}). See also, [View alert rules and their current state]({{< relref "../../alerting/fundamentals/state-and-health/" >}}).
| Metric Name | Type | Description |
| ------------------------------------------- | --------- | ---------------------------------------------------------------------------------------- |
| `alerting.alerts` | gauge | How many alerts by state |
| `alerting.request_duration_seconds` | histogram | Histogram of requests to the Alerting API |
| `alerting.active_configurations` | gauge | The number of active, non default alertmanager configurations for grafana managed alerts |
| `alerting.rule_evaluations_total` | counter | The total number of rule evaluations |
| `alerting.rule_evaluation_failures_total` | counter | The total number of rule evaluation failures |
| `alerting.rule_evaluation_duration_seconds` | summary | The duration for a rule to execute |
| `alerting.rule_group_rules` | gauge | The number of rules |
| Metric Name | Type | Description |
| --------------------------------------------------- | --------- | ---------------------------------------------------------------------------------------- |
| `alerting.alerts` | gauge | How many alerts by state |
| `alerting.request_duration_seconds` | histogram | Histogram of requests to the Alerting API |
| `alerting.active_configurations` | gauge | The number of active, non default alertmanager configurations for grafana managed alerts |
| `alerting.rule_evaluations_total` | counter | The total number of rule evaluations |
| `alerting.rule_evaluation_failures_total` | counter | The total number of rule evaluation failures |
| `alerting.rule_evaluation_duration_seconds` | histogram | The time to evaluate a rule |
| `alerting.rule_process_evaluation_duration_seconds` | histogram | The time to process the evaluation results for a rule |
| `alerting.rule_send_alerts_duration_seconds` | histogram | The time to send the alerts to Alertmanager |
| `alerting.rule_group_rules` | gauge | The number of rules |
| `alerting.state_calculation_duration_seconds` | histogram | The duration of calculation of a single state |
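
For readers of the table above, a minimal client_golang sketch (not Grafana's actual wiring) of how the `alerting.*` names map to the `grafana_alerting_*` series that Prometheus scrapes: the Namespace `grafana` and Subsystem `alerting` are prepended to each Name, and the registry is served on `/metrics`. The port, handler setup, and sample value below are illustrative assumptions.

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	reg := prometheus.NewRegistry()

	// "alerting.alerts" in the table is exposed as grafana_alerting_alerts:
	// Namespace "grafana" + Subsystem "alerting" + Name "alerts".
	alerts := promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "alerts",
		Help:      "How many alerts by state.",
	}, []string{"state"})
	alerts.WithLabelValues("alerting").Set(3) // one labelled series per state

	// Scraping http://localhost:2112/metrics now shows the series.
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":2112", nil)
}
```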

@ -7,6 +7,7 @@ import (
"github.com/grafana/grafana/pkg/api/routing"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/accesscontrol"
"github.com/grafana/grafana/pkg/services/datasourceproxy"
"github.com/grafana/grafana/pkg/services/datasources"
@ -84,8 +85,8 @@ type API struct {
EvaluatorFactory eval.EvaluatorFactory
FeatureManager featuremgmt.FeatureToggles
Historian Historian
AppUrl *url.URL
Tracer tracing.Tracer
AppUrl *url.URL
// Hooks can be used to replace API handlers for specific paths.
Hooks *Hooks
@ -134,9 +135,10 @@ func (api *API) RegisterAPIEndpoints(m *metrics.API) {
accessControl: api.AccessControl,
evaluator: api.EvaluatorFactory,
cfg: &api.Cfg.UnifiedAlerting,
backtesting: backtesting.NewEngine(api.AppUrl, api.EvaluatorFactory),
backtesting: backtesting.NewEngine(api.AppUrl, api.EvaluatorFactory, api.Tracer),
featureManager: api.FeatureManager,
appUrl: api.AppUrl,
tracer: api.Tracer,
}), m)
api.RegisterConfigurationApiEndpoints(NewConfiguration(
&ConfigSrv{

@ -15,6 +15,7 @@ import (
"github.com/grafana/grafana/pkg/api/response"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/accesscontrol"
contextmodel "github.com/grafana/grafana/pkg/services/contexthandler/model"
"github.com/grafana/grafana/pkg/services/datasources"
@ -39,6 +40,7 @@ type TestingApiSrv struct {
backtesting *backtesting.Engine
featureManager featuremgmt.FeatureToggles
appUrl *url.URL
tracer tracing.Tracer
}
// RouteTestGrafanaRuleConfig returns a list of potential alerts for a given rule configuration. This is intended to be
@ -86,6 +88,7 @@ func (srv TestingApiSrv) RouteTestGrafanaRuleConfig(c *contextmodel.ReqContext,
Clock: clock.New(),
Historian: nil,
MaxStateSaveConcurrency: 1,
Tracer: srv.tracer,
}
manager := state.NewManager(cfg)
includeFolder := !srv.cfg.ReservedLabels.IsReservedLabelDisabled(models.FolderTitleLabel)

@ -10,6 +10,7 @@ import (
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/accesscontrol"
acMock "github.com/grafana/grafana/pkg/services/accesscontrol/mock"
contextmodel "github.com/grafana/grafana/pkg/services/contexthandler/model"
@ -216,6 +217,7 @@ func TestRouteEvalQueries(t *testing.T) {
srv := &TestingApiSrv{
accessControl: ac,
tracer: tracing.InitializeTracerForTest(),
}
response := srv.RouteEvalQueries(rc, definitions.EvalQueriesPayload{
@ -277,5 +279,6 @@ func createTestingApiSrv(t *testing.T, ds *fakes.FakeCacheService, ac *acMock.Mo
accessControl: ac,
evaluator: evaluator,
cfg: config(t),
tracer: tracing.InitializeTracerForTest(),
}
}

@ -12,6 +12,7 @@ import (
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/state"
@ -40,7 +41,7 @@ type Engine struct {
createStateManager func() stateManager
}
func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory) *Engine {
func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory, tracer tracing.Tracer) *Engine {
return &Engine{
evalFactory: evalFactory,
createStateManager: func() stateManager {
@ -52,6 +53,7 @@ func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory) *Engine {
Clock: clock.New(),
Historian: nil,
MaxStateSaveConcurrency: 1,
Tracer: tracer,
}
return state.NewManager(cfg)
},

@ -18,6 +18,8 @@ type Scheduler struct {
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.HistogramVec
ProcessDuration *prometheus.HistogramVec
SendDuration *prometheus.HistogramVec
GroupRules *prometheus.GaugeVec
SchedulePeriodicDuration prometheus.Histogram
SchedulableAlertRules prometheus.Gauge
@ -63,8 +65,28 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_evaluation_duration_seconds",
Help: "The duration for a rule to execute.",
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
Help: "The time to evaluate a rule.",
Buckets: []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
},
[]string{"org"},
),
ProcessDuration: promauto.With(r).NewHistogramVec(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_process_evaluation_duration_seconds",
Help: "The time to process the evaluation results for a rule.",
Buckets: []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
},
[]string{"org"},
),
SendDuration: promauto.With(r).NewHistogramVec(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_send_alerts_duration_seconds",
Help: "The time to send the alerts to Alertmanager.",
Buckets: []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
},
[]string{"org"},
),
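
As context for how these two new HistogramVecs are consumed further down in `ruleRoutine`, a minimal, self-contained sketch of the per-org timing pattern: resolve the org-labelled observer once, then record wall-clock sections with an injected clock. The org ID "1" and the sleep standing in for processing evaluation results are illustrative assumptions, not the scheduler's actual code.

```go
package main

import (
	"fmt"
	"time"

	"github.com/benbjohnson/clock"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	reg := prometheus.NewRegistry()
	processDuration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_process_evaluation_duration_seconds",
		Help:      "The time to process the evaluation results for a rule.",
		Buckets:   []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
	}, []string{"org"})

	// The scheduler injects a clock so unit tests can freeze time,
	// which is why the expected histogram sums in the tests below are 0.
	clk := clock.New()
	observer := processDuration.WithLabelValues("1") // illustrative org ID

	start := clk.Now()
	time.Sleep(5 * time.Millisecond) // stand-in for processing evaluation results
	observer.Observe(clk.Now().Sub(start).Seconds())

	fmt.Println("recorded one sample for org 1")
}
```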

@ -6,7 +6,8 @@ import (
)
type State struct {
AlertState *prometheus.GaugeVec
AlertState *prometheus.GaugeVec
StateUpdateDuration prometheus.Histogram
}
func NewStateMetrics(r prometheus.Registerer) *State {
@ -17,5 +18,14 @@ func NewStateMetrics(r prometheus.Registerer) *State {
Name: "alerts",
Help: "How many alerts by state.",
}, []string{"state"}),
StateUpdateDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "state_calculation_duration_seconds",
Help: "The duration of calculation of a single state.",
Buckets: []float64{0.01, 0.1, 1, 2, 5, 10},
},
),
}
}
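
A minimal sketch of how this histogram is meant to be used together with the nil guard added later in `setNextState` ("metrics can be null" in the commit message): callers such as backtesting construct the state manager without metrics, so the observation is simply skipped. The `stateMetrics` type and `calculateState` helper are hypothetical stand-ins, not the manager's actual code.

```go
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// stateMetrics is a hypothetical stand-in for metrics.State.
type stateMetrics struct {
	StateUpdateDuration prometheus.Histogram
}

// calculateState mimics setNextState: time the work, observe only if metrics exist.
func calculateState(m *stateMetrics) {
	start := time.Now()
	time.Sleep(2 * time.Millisecond) // stand-in for computing a single state transition
	if m != nil {
		m.StateUpdateDuration.Observe(time.Since(start).Seconds())
	}
}

func main() {
	reg := prometheus.NewRegistry()
	m := &stateMetrics{
		StateUpdateDuration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
			Namespace: "grafana",
			Subsystem: "alerting",
			Name:      "state_calculation_duration_seconds",
			Help:      "The duration of calculation of a single state.",
			Buckets:   []float64{0.01, 0.1, 1, 2, 5, 10},
		}),
	}
	calculateState(m)   // metrics configured: duration observed
	calculateState(nil) // metrics not configured (e.g. backtesting): skipped
	fmt.Println("done")
}
```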

@ -221,6 +221,7 @@ func (ng *AlertNG) init() error {
DoNotSaveNormalState: ng.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoNormalState),
MaxStateSaveConcurrency: ng.Cfg.UnifiedAlerting.MaxStateSaveConcurrency,
ApplyNoDataAndErrorToAllStates: ng.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoDataErrorExecution),
Tracer: ng.tracer,
}
stateManager := state.NewManager(cfg)
scheduler := schedule.NewScheduler(schedCfg, stateManager)
@ -268,6 +269,7 @@ func (ng *AlertNG) init() error {
AppUrl: appUrl,
Historian: history,
Hooks: api.NewHooks(ng.Log),
Tracer: ng.tracer,
}
ng.api.RegisterAPIEndpoints(ng.Metrics.GetAPIMetrics())

@ -351,6 +351,8 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
evalTotal := sch.metrics.EvalTotal.WithLabelValues(orgID)
evalDuration := sch.metrics.EvalDuration.WithLabelValues(orgID)
evalTotalFailures := sch.metrics.EvalFailures.WithLabelValues(orgID)
processDuration := sch.metrics.ProcessDuration.WithLabelValues(orgID)
sendDuration := sch.metrics.SendDuration.WithLabelValues(orgID)
notify := func(states []state.StateTransition) {
expiredAlerts := state.FromAlertsStateToStoppedAlert(states, sch.appURL, sch.clock)
@ -423,6 +425,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
logger.Debug("Skip updating the state because the context has been cancelled")
return
}
start = sch.clock.Now()
processedStates := sch.stateManager.ProcessEvalResults(
ctx,
e.scheduledAt,
@ -430,6 +433,9 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
results,
state.GetRuleExtraLabels(e.rule, e.folderTitle, !sch.disableGrafanaFolder),
)
processDuration.Observe(sch.clock.Now().Sub(start).Seconds())
start = sch.clock.Now()
alerts := state.FromStateTransitionToPostableAlerts(processedStates, sch.stateManager, sch.appURL)
span.AddEvents(
[]string{"message", "state_transitions", "alerts_to_send"},
@ -441,6 +447,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
if len(alerts.PostableAlerts) > 0 {
sch.alertsSender.Send(key, alerts)
}
sendDuration.Observe(sch.clock.Now().Sub(start).Seconds())
}
retryIfError := func(f func(attempt int64) error) error {

@ -82,6 +82,7 @@ func TestProcessTicks(t *testing.T) {
Clock: mockedClock,
Historian: &state.FakeHistorian{},
MaxStateSaveConcurrency: 1,
Tracer: testTracer,
}
st := state.NewManager(managerCfg)
@ -453,22 +454,21 @@ func TestSchedule_ruleRoutine(t *testing.T) {
t.Run("it reports metrics", func(t *testing.T) {
// duration metric has 0 values because the mocked clock does not advance
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_evaluation_duration_seconds The duration for a rule to execute.
`# HELP grafana_alerting_rule_evaluation_duration_seconds The time to evaluate a rule.
# TYPE grafana_alerting_rule_evaluation_duration_seconds histogram
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.005"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.025"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.05"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.25"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="2.5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="25"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="50"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="100"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0
grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1
@ -478,9 +478,45 @@ func TestSchedule_ruleRoutine(t *testing.T) {
# HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations.
# TYPE grafana_alerting_rule_evaluations_total counter
grafana_alerting_rule_evaluations_total{org="%[1]d"} 1
# HELP grafana_alerting_rule_process_evaluation_duration_seconds The time to process the evaluation results for a rule.
# TYPE grafana_alerting_rule_process_evaluation_duration_seconds histogram
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_sum{org="%[1]d"} 0
grafana_alerting_rule_process_evaluation_duration_seconds_count{org="%[1]d"} 1
# HELP grafana_alerting_rule_send_alerts_duration_seconds The time to send the alerts to Alertmanager.
# TYPE grafana_alerting_rule_send_alerts_duration_seconds histogram
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="1"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="5"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="10"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="15"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="30"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="60"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="120"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="180"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="240"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="300"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
grafana_alerting_rule_send_alerts_duration_seconds_sum{org="%[1]d"} 0
grafana_alerting_rule_send_alerts_duration_seconds_count{org="%[1]d"} 1
`, rule.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total")
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total", "grafana_alerting_rule_process_evaluation_duration_seconds", "grafana_alerting_rule_send_alerts_duration_seconds")
require.NoError(t, err)
})
})
@ -641,34 +677,69 @@ func TestSchedule_ruleRoutine(t *testing.T) {
t.Run("it should increase failure counter", func(t *testing.T) {
// duration metric has 0 values because the mocked clock does not advance
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_evaluation_duration_seconds The duration for a rule to execute.
# TYPE grafana_alerting_rule_evaluation_duration_seconds histogram
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.005"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.025"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.05"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.25"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="2.5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="25"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="50"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="100"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0
grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1
# HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures.
# TYPE grafana_alerting_rule_evaluation_failures_total counter
grafana_alerting_rule_evaluation_failures_total{org="%[1]d"} 1
# HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations.
# TYPE grafana_alerting_rule_evaluations_total counter
grafana_alerting_rule_evaluations_total{org="%[1]d"} 1
`# HELP grafana_alerting_rule_evaluation_duration_seconds The time to evaluate a rule.
# TYPE grafana_alerting_rule_evaluation_duration_seconds histogram
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0
grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1
# HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures.
# TYPE grafana_alerting_rule_evaluation_failures_total counter
grafana_alerting_rule_evaluation_failures_total{org="%[1]d"} 1
# HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations.
# TYPE grafana_alerting_rule_evaluations_total counter
grafana_alerting_rule_evaluations_total{org="%[1]d"} 1
# HELP grafana_alerting_rule_process_evaluation_duration_seconds The time to process the evaluation results for a rule.
# TYPE grafana_alerting_rule_process_evaluation_duration_seconds histogram
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
grafana_alerting_rule_process_evaluation_duration_seconds_sum{org="%[1]d"} 0
grafana_alerting_rule_process_evaluation_duration_seconds_count{org="%[1]d"} 1
# HELP grafana_alerting_rule_send_alerts_duration_seconds The time to send the alerts to Alertmanager.
# TYPE grafana_alerting_rule_send_alerts_duration_seconds histogram
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="1"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="5"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="10"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="15"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="30"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="60"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="120"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="180"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="240"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="300"} 1
grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
grafana_alerting_rule_send_alerts_duration_seconds_sum{org="%[1]d"} 0
grafana_alerting_rule_send_alerts_duration_seconds_count{org="%[1]d"} 1
`, rule.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total")
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total", "grafana_alerting_rule_process_evaluation_duration_seconds", "grafana_alerting_rule_send_alerts_duration_seconds")
require.NoError(t, err)
})
@ -826,6 +897,7 @@ func setupScheduler(t *testing.T, rs *fakeRulesStore, is *state.FakeInstanceStor
Clock: mockedClock,
Historian: &state.FakeHistorian{},
MaxStateSaveConcurrency: 1,
Tracer: testTracer,
}
st := state.NewManager(managerCfg)

@ -8,8 +8,10 @@ import (
"github.com/benbjohnson/clock"
"github.com/grafana/dskit/concurrency"
"github.com/grafana/grafana-plugin-sdk-go/data"
"go.opentelemetry.io/otel/attribute"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
@ -30,6 +32,7 @@ type AlertInstanceManager interface {
type Manager struct {
log log.Logger
metrics *metrics.State
tracer tracing.Tracer
clock clock.Clock
cache *cache
@ -60,6 +63,8 @@ type ManagerCfg struct {
// ApplyNoDataAndErrorToAllStates makes state manager to apply exceptional results (NoData and Error)
// to all states when corresponding execution in the rule definition is set to either `Alerting` or `OK`
ApplyNoDataAndErrorToAllStates bool
Tracer tracing.Tracer
}
func NewManager(cfg ManagerCfg) *Manager {
@ -76,6 +81,7 @@ func NewManager(cfg ManagerCfg) *Manager {
doNotSaveNormalState: cfg.DoNotSaveNormalState,
maxStateSaveConcurrency: cfg.MaxStateSaveConcurrency,
applyNoDataAndErrorToAllStates: cfg.ApplyNoDataAndErrorToAllStates,
tracer: cfg.Tracer,
}
}
@ -251,18 +257,46 @@ func (st *Manager) ResetStateByRuleUID(ctx context.Context, rule *ngModels.Alert
// ProcessEvalResults updates the current states that belong to a rule with the evaluation results.
// if extraLabels is not empty, those labels will be added to every state. The extraLabels take precedence over rule labels and result labels
func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, results eval.Results, extraLabels data.Labels) []StateTransition {
logger := st.log.FromContext(ctx)
tracingCtx, span := st.tracer.Start(ctx, "alert rule state calculation")
defer span.End()
span.SetAttributes("rule_uid", alertRule.UID, attribute.String("rule_uid", alertRule.UID))
span.SetAttributes("org_id", alertRule.OrgID, attribute.Int64("org_id", alertRule.OrgID))
span.SetAttributes("rule_version", alertRule.Version, attribute.Int64("rule_version", alertRule.Version))
utcTick := evaluatedAt.UTC().Format(time.RFC3339Nano)
span.SetAttributes("tick", utcTick, attribute.String("tick", utcTick))
span.SetAttributes("results", len(results), attribute.Int("tick", len(results)))
logger := st.log.FromContext(tracingCtx)
logger.Debug("State manager processing evaluation results", "resultCount", len(results))
states := st.setNextStateForRule(ctx, alertRule, results, extraLabels, logger)
states := st.setNextStateForRule(tracingCtx, alertRule, results, extraLabels, logger)
span.AddEvents([]string{"message", "state_transitions"},
[]tracing.EventValue{
{Str: "results processed"},
{Num: int64(len(states))},
})
staleStates := st.deleteStaleStatesFromCache(ctx, logger, evaluatedAt, alertRule)
st.deleteAlertStates(ctx, logger, staleStates)
st.deleteAlertStates(tracingCtx, logger, staleStates)
if len(staleStates) > 0 {
span.AddEvents([]string{"message", "state_transitions"},
[]tracing.EventValue{
{Str: "deleted stale states"},
{Num: int64(len(staleStates))},
})
}
st.saveAlertStates(ctx, logger, states...)
st.saveAlertStates(tracingCtx, logger, states...)
span.AddEvents([]string{"message"},
[]tracing.EventValue{
{Str: "updated database"},
})
allChanges := append(states, staleStates...)
if st.historian != nil {
st.historian.Record(ctx, history_model.NewRuleMeta(alertRule, logger), allChanges)
st.historian.Record(tracingCtx, history_model.NewRuleMeta(alertRule, logger), allChanges)
}
return allChanges
}
@ -303,6 +337,7 @@ func (st *Manager) setNextStateForAll(ctx context.Context, alertRule *ngModels.A
// Set the current state based on evaluation results
func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRule, currentState *State, result eval.Result, logger log.Logger) StateTransition {
start := st.clock.Now()
currentState.LastEvaluationTime = result.EvaluatedAt
currentState.EvaluationDuration = result.EvaluationDuration
currentState.Results = append(currentState.Results, Evaluation{
@ -390,6 +425,10 @@ func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRu
PreviousStateReason: oldReason,
}
if st.metrics != nil {
st.metrics.StateUpdateDuration.Observe(st.clock.Now().Sub(start).Seconds())
}
return nextState
}

@ -6,14 +6,16 @@ import (
"testing"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/mock"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/annotations"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/services/ngalert/state/historian"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/mock"
)
func BenchmarkProcessEvalResults(b *testing.B) {
@ -25,6 +27,7 @@ func BenchmarkProcessEvalResults(b *testing.B) {
cfg := state.ManagerCfg{
Historian: hist,
MaxStateSaveConcurrency: 1,
Tracer: tracing.InitializeTracerForTest(),
}
sut := state.NewManager(cfg)
now := time.Now().UTC()

@ -19,6 +19,7 @@ import (
"github.com/grafana/grafana/pkg/expr"
"github.com/grafana/grafana/pkg/infra/log/logtest"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
@ -311,6 +312,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
cfg := ManagerCfg{
Metrics: testMetrics,
Tracer: tracing.InitializeTracerForTest(),
ExternalURL: nil,
InstanceStore: &FakeInstanceStore{},
Images: &NotAvailableImageService{},

@ -1,6 +1,7 @@
package state_test
import (
"bytes"
"context"
"errors"
"fmt"
@ -15,11 +16,13 @@ import (
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/exp/slices"
"github.com/grafana/grafana/pkg/expr"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/annotations"
"github.com/grafana/grafana/pkg/services/annotations/annotationstest"
"github.com/grafana/grafana/pkg/services/dashboards"
@ -198,6 +201,7 @@ func TestWarmStateCache(t *testing.T) {
Clock: clock.NewMock(),
Historian: &state.FakeHistorian{},
MaxStateSaveConcurrency: 1,
Tracer: tracing.InitializeTracerForTest(),
}
st := state.NewManager(cfg)
st.Warm(ctx, dbstore)
@ -234,6 +238,7 @@ func TestDashboardAnnotations(t *testing.T) {
Clock: clock.New(),
Historian: hist,
MaxStateSaveConcurrency: 1,
Tracer: tracing.InitializeTracerForTest(),
}
st := state.NewManager(cfg)
@ -1198,18 +1203,21 @@ func TestProcessEvalResults(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
fakeAnnoRepo := annotationstest.NewFakeAnnotationsRepo()
m := metrics.NewHistorianMetrics(prometheus.NewRegistry())
store := historian.NewAnnotationStore(fakeAnnoRepo, &dashboards.FakeDashboardService{}, m)
hist := historian.NewAnnotationBackend(store, nil, m)
reg := prometheus.NewPedanticRegistry()
stateMetrics := metrics.NewStateMetrics(reg)
metrics := metrics.NewHistorianMetrics(prometheus.NewRegistry())
store := historian.NewAnnotationStore(fakeAnnoRepo, &dashboards.FakeDashboardService{}, metrics)
hist := historian.NewAnnotationBackend(store, nil, metrics)
clk := clock.NewMock()
cfg := state.ManagerCfg{
Metrics: testMetrics.GetStateMetrics(),
Metrics: stateMetrics,
ExternalURL: nil,
InstanceStore: &state.FakeInstanceStore{},
Images: &state.NotAvailableImageService{},
Clock: clk,
Historian: hist,
MaxStateSaveConcurrency: 1,
Tracer: tracing.InitializeTracerForTest(),
}
st := state.NewManager(cfg)
@ -1220,7 +1228,7 @@ func TestProcessEvalResults(t *testing.T) {
slices.SortFunc(evals, func(a, b time.Time) bool {
return a.Before(b)
})
results := 0
for _, evalTime := range evals {
res := tc.evalResults[evalTime]
for i := 0; i < len(res); i++ {
@ -1228,6 +1236,7 @@ func TestProcessEvalResults(t *testing.T) {
}
clk.Set(evalTime)
_ = st.ProcessEvalResults(context.Background(), evalTime, tc.alertRule, res, systemLabels)
results += len(res)
}
states := st.GetStatesForRuleUID(tc.alertRule.OrgID, tc.alertRule.UID)
@ -1278,6 +1287,22 @@ func TestProcessEvalResults(t *testing.T) {
require.Eventuallyf(t, func() bool {
return tc.expectedAnnotations == fakeAnnoRepo.Len()
}, time.Second, 100*time.Millisecond, "%d annotations are present, expected %d. We have %+v", fakeAnnoRepo.Len(), tc.expectedAnnotations, printAllAnnotations(fakeAnnoRepo.Items()))
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_state_calculation_duration_seconds The duration of calculation of a single state.
# TYPE grafana_alerting_state_calculation_duration_seconds histogram
grafana_alerting_state_calculation_duration_seconds_bucket{le="0.01"} %[1]d
grafana_alerting_state_calculation_duration_seconds_bucket{le="0.1"} %[1]d
grafana_alerting_state_calculation_duration_seconds_bucket{le="1"} %[1]d
grafana_alerting_state_calculation_duration_seconds_bucket{le="2"} %[1]d
grafana_alerting_state_calculation_duration_seconds_bucket{le="5"} %[1]d
grafana_alerting_state_calculation_duration_seconds_bucket{le="10"} %[1]d
grafana_alerting_state_calculation_duration_seconds_bucket{le="+Inf"} %[1]d
grafana_alerting_state_calculation_duration_seconds_sum 0
grafana_alerting_state_calculation_duration_seconds_count %[1]d
`, results)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_state_calculation_duration_seconds", "grafana_alerting_state_calculation_total")
require.NoError(t, err)
})
}
@ -1292,6 +1317,7 @@ func TestProcessEvalResults(t *testing.T) {
Clock: clk,
Historian: &state.FakeHistorian{},
MaxStateSaveConcurrency: 1,
Tracer: tracing.InitializeTracerForTest(),
}
st := state.NewManager(cfg)
rule := models.AlertRuleGen()()
@ -1442,6 +1468,7 @@ func TestStaleResultsHandler(t *testing.T) {
Clock: clock.New(),
Historian: &state.FakeHistorian{},
MaxStateSaveConcurrency: 1,
Tracer: tracing.InitializeTracerForTest(),
}
st := state.NewManager(cfg)
st.Warm(ctx, dbstore)
@ -1523,6 +1550,7 @@ func TestStaleResults(t *testing.T) {
Clock: clk,
Historian: &state.FakeHistorian{},
MaxStateSaveConcurrency: 1,
Tracer: tracing.InitializeTracerForTest(),
}
st := state.NewManager(cfg)
@ -1695,6 +1723,7 @@ func TestDeleteStateByRuleUID(t *testing.T) {
Clock: clk,
Historian: &state.FakeHistorian{},
MaxStateSaveConcurrency: 1,
Tracer: tracing.InitializeTracerForTest(),
}
st := state.NewManager(cfg)
st.Warm(ctx, dbstore)
@ -1835,6 +1864,7 @@ func TestResetStateByRuleUID(t *testing.T) {
Clock: clk,
Historian: fakeHistorian,
MaxStateSaveConcurrency: 1,
Tracer: tracing.InitializeTracerForTest(),
}
st := state.NewManager(cfg)
st.Warm(ctx, dbstore)
