mirror of https://github.com/grafana/grafana
Alerting: refactor scheduler and separate notification logic (#48144)
* Introduce AlertsRouter in the sender package, and move all fields and methods related to notifications out of the scheduler to this router. * Introduce a new interface AlertsSender in the schedule package and replace calls to the anonymous function `notify` inside the ruleRoutine with calls to methods of that interface. * Rename interface Scheduler in api package to ExternalAlertmanagerProvider, and replace scheduler with AlertsRouter as the struct that implements the interface. pull/52253/head
parent
ededf1dd6f
commit
a6b1090879
@ -0,0 +1,52 @@ |
||||
// Code generated by mockery v2.10.0. DO NOT EDIT.
|
||||
|
||||
package schedule |
||||
|
||||
import ( |
||||
definitions "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" |
||||
mock "github.com/stretchr/testify/mock" |
||||
|
||||
models "github.com/grafana/grafana/pkg/services/ngalert/models" |
||||
) |
||||
|
||||
// AlertsSenderMock is an autogenerated mock type for the AlertsSender type
|
||||
type AlertsSenderMock struct { |
||||
mock.Mock |
||||
} |
||||
|
||||
type AlertsSenderMock_Expecter struct { |
||||
mock *mock.Mock |
||||
} |
||||
|
||||
func (_m *AlertsSenderMock) EXPECT() *AlertsSenderMock_Expecter { |
||||
return &AlertsSenderMock_Expecter{mock: &_m.Mock} |
||||
} |
||||
|
||||
// Send provides a mock function with given fields: key, alerts
|
||||
func (_m *AlertsSenderMock) Send(key models.AlertRuleKey, alerts definitions.PostableAlerts) { |
||||
_m.Called(key, alerts) |
||||
} |
||||
|
||||
// AlertsSenderMock_Send_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Send'
|
||||
type AlertsSenderMock_Send_Call struct { |
||||
*mock.Call |
||||
} |
||||
|
||||
// Send is a helper method to define mock.On call
|
||||
// - key models.AlertRuleKey
|
||||
// - alerts definitions.PostableAlerts
|
||||
func (_e *AlertsSenderMock_Expecter) Send(key interface{}, alerts interface{}) *AlertsSenderMock_Send_Call { |
||||
return &AlertsSenderMock_Send_Call{Call: _e.mock.On("Send", key, alerts)} |
||||
} |
||||
|
||||
func (_c *AlertsSenderMock_Send_Call) Run(run func(key models.AlertRuleKey, alerts definitions.PostableAlerts)) *AlertsSenderMock_Send_Call { |
||||
_c.Call.Run(func(args mock.Arguments) { |
||||
run(args[0].(models.AlertRuleKey), args[1].(definitions.PostableAlerts)) |
||||
}) |
||||
return _c |
||||
} |
||||
|
||||
func (_c *AlertsSenderMock_Send_Call) Return() *AlertsSenderMock_Send_Call { |
||||
_c.Call.Return() |
||||
return _c |
||||
} |
@ -0,0 +1,255 @@ |
||||
package sender |
||||
|
||||
import ( |
||||
"context" |
||||
"errors" |
||||
"net/url" |
||||
"sync" |
||||
"time" |
||||
|
||||
"github.com/benbjohnson/clock" |
||||
|
||||
"github.com/grafana/grafana/pkg/infra/log" |
||||
"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" |
||||
"github.com/grafana/grafana/pkg/services/ngalert/models" |
||||
"github.com/grafana/grafana/pkg/services/ngalert/notifier" |
||||
"github.com/grafana/grafana/pkg/services/ngalert/store" |
||||
) |
||||
|
||||
// AlertsRouter handles alerts generated during alert rule evaluation.
|
||||
// Based on rule's orgID and the configuration for that organization,
|
||||
// it determines whether an alert needs to be sent to an external Alertmanager and\or internal notifier.Alertmanager
|
||||
//
|
||||
// After creating a AlertsRouter, you must call Run to keep the AlertsRouter's
|
||||
// state synchronized with the alerting configuration.
|
||||
type AlertsRouter struct { |
||||
logger log.Logger |
||||
clock clock.Clock |
||||
adminConfigStore store.AdminConfigurationStore |
||||
|
||||
// Senders help us send alerts to external Alertmanagers.
|
||||
AdminConfigMtx sync.RWMutex |
||||
SendAlertsTo map[int64]models.AlertmanagersChoice |
||||
Senders map[int64]*Sender |
||||
SendersCfgHash map[int64]string |
||||
|
||||
MultiOrgNotifier *notifier.MultiOrgAlertmanager |
||||
|
||||
appURL *url.URL |
||||
disabledOrgs map[int64]struct{} |
||||
adminConfigPollInterval time.Duration |
||||
} |
||||
|
||||
func NewAlertsRouter(multiOrgNotifier *notifier.MultiOrgAlertmanager, store store.AdminConfigurationStore, clk clock.Clock, appURL *url.URL, disabledOrgs map[int64]struct{}, configPollInterval time.Duration) *AlertsRouter { |
||||
d := &AlertsRouter{ |
||||
logger: log.New("alerts-router"), |
||||
clock: clk, |
||||
adminConfigStore: store, |
||||
|
||||
AdminConfigMtx: sync.RWMutex{}, |
||||
Senders: map[int64]*Sender{}, |
||||
SendersCfgHash: map[int64]string{}, |
||||
SendAlertsTo: map[int64]models.AlertmanagersChoice{}, |
||||
|
||||
MultiOrgNotifier: multiOrgNotifier, |
||||
|
||||
appURL: appURL, |
||||
disabledOrgs: disabledOrgs, |
||||
adminConfigPollInterval: configPollInterval, |
||||
} |
||||
return d |
||||
} |
||||
|
||||
// SyncAndApplyConfigFromDatabase looks for the admin configuration in the database
|
||||
// and adjusts the sender(s) and alert handling mechanism accordingly.
|
||||
func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error { |
||||
d.logger.Debug("start of admin configuration sync") |
||||
cfgs, err := d.adminConfigStore.GetAdminConfigurations() |
||||
if err != nil { |
||||
return err |
||||
} |
||||
|
||||
d.logger.Debug("found admin configurations", "count", len(cfgs)) |
||||
|
||||
orgsFound := make(map[int64]struct{}, len(cfgs)) |
||||
d.AdminConfigMtx.Lock() |
||||
for _, cfg := range cfgs { |
||||
_, isDisabledOrg := d.disabledOrgs[cfg.OrgID] |
||||
if isDisabledOrg { |
||||
d.logger.Debug("skipping starting sender for disabled org", "org", cfg.OrgID) |
||||
continue |
||||
} |
||||
|
||||
// Update the Alertmanagers choice for the organization.
|
||||
d.SendAlertsTo[cfg.OrgID] = cfg.SendAlertsTo |
||||
|
||||
orgsFound[cfg.OrgID] = struct{}{} // keep track of the which senders we need to keep.
|
||||
|
||||
existing, ok := d.Senders[cfg.OrgID] |
||||
|
||||
// We have no running sender and no Alertmanager(s) configured, no-op.
|
||||
if !ok && len(cfg.Alertmanagers) == 0 { |
||||
d.logger.Debug("no external alertmanagers configured", "org", cfg.OrgID) |
||||
continue |
||||
} |
||||
// We have no running sender and alerts are handled internally, no-op.
|
||||
if !ok && cfg.SendAlertsTo == models.InternalAlertmanager { |
||||
d.logger.Debug("alerts are handled internally", "org", cfg.OrgID) |
||||
continue |
||||
} |
||||
|
||||
// We have a running sender but no Alertmanager(s) configured, shut it down.
|
||||
if ok && len(cfg.Alertmanagers) == 0 { |
||||
d.logger.Debug("no external alertmanager(s) configured, sender will be stopped", "org", cfg.OrgID) |
||||
delete(orgsFound, cfg.OrgID) |
||||
continue |
||||
} |
||||
|
||||
// We have a running sender, check if we need to apply a new config.
|
||||
if ok { |
||||
if d.SendersCfgHash[cfg.OrgID] == cfg.AsSHA256() { |
||||
d.logger.Debug("sender configuration is the same as the one running, no-op", "org", cfg.OrgID, "alertmanagers", cfg.Alertmanagers) |
||||
continue |
||||
} |
||||
|
||||
d.logger.Debug("applying new configuration to sender", "org", cfg.OrgID, "alertmanagers", cfg.Alertmanagers) |
||||
err := existing.ApplyConfig(cfg) |
||||
if err != nil { |
||||
d.logger.Error("failed to apply configuration", "err", err, "org", cfg.OrgID) |
||||
continue |
||||
} |
||||
d.SendersCfgHash[cfg.OrgID] = cfg.AsSHA256() |
||||
continue |
||||
} |
||||
|
||||
// No sender and have Alertmanager(s) to send to - start a new one.
|
||||
d.logger.Info("creating new sender for the external alertmanagers", "org", cfg.OrgID, "alertmanagers", cfg.Alertmanagers) |
||||
s, err := New() |
||||
if err != nil { |
||||
d.logger.Error("unable to start the sender", "err", err, "org", cfg.OrgID) |
||||
continue |
||||
} |
||||
|
||||
d.Senders[cfg.OrgID] = s |
||||
s.Run() |
||||
|
||||
err = s.ApplyConfig(cfg) |
||||
if err != nil { |
||||
d.logger.Error("failed to apply configuration", "err", err, "org", cfg.OrgID) |
||||
continue |
||||
} |
||||
|
||||
d.SendersCfgHash[cfg.OrgID] = cfg.AsSHA256() |
||||
} |
||||
|
||||
sendersToStop := map[int64]*Sender{} |
||||
|
||||
for orgID, s := range d.Senders { |
||||
if _, exists := orgsFound[orgID]; !exists { |
||||
sendersToStop[orgID] = s |
||||
delete(d.Senders, orgID) |
||||
delete(d.SendersCfgHash, orgID) |
||||
} |
||||
} |
||||
d.AdminConfigMtx.Unlock() |
||||
|
||||
// We can now stop these senders w/o having to hold a lock.
|
||||
for orgID, s := range sendersToStop { |
||||
d.logger.Info("stopping sender", "org", orgID) |
||||
s.Stop() |
||||
d.logger.Info("stopped sender", "org", orgID) |
||||
} |
||||
|
||||
d.logger.Debug("finish of admin configuration sync") |
||||
|
||||
return nil |
||||
} |
||||
|
||||
func (d *AlertsRouter) Send(key models.AlertRuleKey, alerts definitions.PostableAlerts) { |
||||
logger := d.logger.New("rule_uid", key.UID, "org", key.OrgID) |
||||
if len(alerts.PostableAlerts) == 0 { |
||||
logger.Debug("no alerts to notify about") |
||||
return |
||||
} |
||||
// Send alerts to local notifier if they need to be handled internally
|
||||
// or if no external AMs have been discovered yet.
|
||||
var localNotifierExist, externalNotifierExist bool |
||||
if d.SendAlertsTo[key.OrgID] == models.ExternalAlertmanagers && len(d.AlertmanagersFor(key.OrgID)) > 0 { |
||||
logger.Debug("no alerts to put in the notifier") |
||||
} else { |
||||
logger.Debug("sending alerts to local notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts) |
||||
n, err := d.MultiOrgNotifier.AlertmanagerFor(key.OrgID) |
||||
if err == nil { |
||||
localNotifierExist = true |
||||
if err := n.PutAlerts(alerts); err != nil { |
||||
logger.Error("failed to put alerts in the local notifier", "count", len(alerts.PostableAlerts), "err", err) |
||||
} |
||||
} else { |
||||
if errors.Is(err, notifier.ErrNoAlertmanagerForOrg) { |
||||
logger.Debug("local notifier was not found") |
||||
} else { |
||||
logger.Error("local notifier is not available", "err", err) |
||||
} |
||||
} |
||||
} |
||||
|
||||
// Send alerts to external Alertmanager(s) if we have a sender for this organization
|
||||
// and alerts are not being handled just internally.
|
||||
d.AdminConfigMtx.RLock() |
||||
defer d.AdminConfigMtx.RUnlock() |
||||
s, ok := d.Senders[key.OrgID] |
||||
if ok && d.SendAlertsTo[key.OrgID] != models.InternalAlertmanager { |
||||
logger.Debug("sending alerts to external notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts) |
||||
s.SendAlerts(alerts) |
||||
externalNotifierExist = true |
||||
} |
||||
|
||||
if !localNotifierExist && !externalNotifierExist { |
||||
logger.Error("no external or internal notifier - [%d] alerts not delivered", len(alerts.PostableAlerts)) |
||||
} |
||||
} |
||||
|
||||
// AlertmanagersFor returns all the discovered Alertmanager(s) for a particular organization.
|
||||
func (d *AlertsRouter) AlertmanagersFor(orgID int64) []*url.URL { |
||||
d.AdminConfigMtx.RLock() |
||||
defer d.AdminConfigMtx.RUnlock() |
||||
s, ok := d.Senders[orgID] |
||||
if !ok { |
||||
return []*url.URL{} |
||||
} |
||||
return s.Alertmanagers() |
||||
} |
||||
|
||||
// DroppedAlertmanagersFor returns all the dropped Alertmanager(s) for a particular organization.
|
||||
func (d *AlertsRouter) DroppedAlertmanagersFor(orgID int64) []*url.URL { |
||||
d.AdminConfigMtx.RLock() |
||||
defer d.AdminConfigMtx.RUnlock() |
||||
s, ok := d.Senders[orgID] |
||||
if !ok { |
||||
return []*url.URL{} |
||||
} |
||||
|
||||
return s.DroppedAlertmanagers() |
||||
} |
||||
|
||||
// Run starts regular updates of the configuration.
|
||||
func (d *AlertsRouter) Run(ctx context.Context) error { |
||||
for { |
||||
select { |
||||
case <-time.After(d.adminConfigPollInterval): |
||||
if err := d.SyncAndApplyConfigFromDatabase(); err != nil { |
||||
d.logger.Error("unable to sync admin configuration", "err", err) |
||||
} |
||||
case <-ctx.Done(): |
||||
// Stop sending alerts to all external Alertmanager(s).
|
||||
d.AdminConfigMtx.Lock() |
||||
for orgID, s := range d.Senders { |
||||
delete(d.Senders, orgID) // delete before we stop to make sure we don't accept any more alerts.
|
||||
s.Stop() |
||||
} |
||||
d.AdminConfigMtx.Unlock() |
||||
|
||||
return nil |
||||
} |
||||
} |
||||
} |
Loading…
Reference in new issue