grafana/pkg/services/ngalert/state/state.go

package state

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"math"
	"strings"
	"time"

	alertingModels "github.com/grafana/alerting/models"
	"github.com/grafana/grafana-plugin-sdk-go/data"
	prometheusModel "github.com/prometheus/common/model"

	"github.com/grafana/grafana/pkg/apimachinery/errutil"
	"github.com/grafana/grafana/pkg/expr"
	"github.com/grafana/grafana/pkg/infra/log"
	"github.com/grafana/grafana/pkg/services/ngalert/eval"
	"github.com/grafana/grafana/pkg/services/ngalert/models"
	"github.com/grafana/grafana/pkg/services/screenshot"
)

type State struct {
	OrgID        int64
	AlertRuleUID string
	// CacheID is a unique, opaque identifier for the state, and is used to find the state
	// in the state cache. It tends to be derived from the state's labels.
	CacheID data.Fingerprint
	// State represents the current state.
	State eval.State
	// StateReason is a textual description that explains why the state is in its current state.
	StateReason string
	// ResultFingerprint is a hash of labels of the result before it is processed by
	ResultFingerprint data.Fingerprint
	// LatestResult contains the result of the most recent evaluation, if available.
	LatestResult *Evaluation
	// Error is set if the current evaluation returned an error. If error is non-nil results
	// can still contain the results of previous evaluations.
	Error error
	// Image contains an optional image for the state. It tends to be included in notifications
	// as a visualization to show why the alert fired.
	Image *models.Image
	// Annotations contains the annotations from the alert rule. If an annotation is templated
	// then the template is first evaluated to derive the final annotation.
	Annotations map[string]string
	// Labels contain the labels from the query and any custom labels from the alert rule.
	// If a label is templated then the template is first evaluated to derive the final label.
	Labels data.Labels
	// Values contains the values of any instant vectors, reduce and math expressions, or classic
	// conditions.
	Values map[string]float64

	StartsAt time.Time
	// EndsAt is different from the Prometheus EndsAt as EndsAt is updated for both Normal states
	// and states that have been resolved. It cannot be used to determine when a state was resolved.
	EndsAt time.Time
	// ResolvedAt is set when the state is first resolved. That is to say, when the state first transitions
	// from Alerting, NoData, or Error to Normal. It is reset to zero when the state transitions from Normal
	// to any other state.
	ResolvedAt *time.Time

	LastSentAt           *time.Time
	LastEvaluationString string
	LastEvaluationTime   time.Time
	EvaluationDuration   time.Duration
}

func (a *State) GetRuleKey() models.AlertRuleKey {
	return models.AlertRuleKey{
		OrgID: a.OrgID,
		UID:   a.AlertRuleUID,
	}
}

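// GetAlertInstanceKey returns the key that identifies this alert instance: the org ID, the rule
// UID, and a hash of the instance labels.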
func (a *State) GetAlertInstanceKey() (models.AlertInstanceKey, error) {
	instanceLabels := models.InstanceLabels(a.Labels)
	_, labelsHash, err := instanceLabels.StringAndHash()
	if err != nil {
		return models.AlertInstanceKey{}, err
	}
	return models.AlertInstanceKey{RuleOrgID: a.OrgID, RuleUID: a.AlertRuleUID, LabelsHash: labelsHash}, nil
}

// SetAlerting sets the state to Alerting. It changes both the start and end time.
func (a *State) SetAlerting(reason string, startsAt, endsAt time.Time) {
	a.State = eval.Alerting
	a.StateReason = reason
	a.StartsAt = startsAt
	a.EndsAt = endsAt
	a.Error = nil
}

// SetPending sets the state to Pending. It changes both the start and end time.
func (a *State) SetPending(reason string, startsAt, endsAt time.Time) {
	a.State = eval.Pending
	a.StateReason = reason
	a.StartsAt = startsAt
	a.EndsAt = endsAt
	a.Error = nil
}

// SetNoData sets the state to NoData. It changes both the start and end time.
func (a *State) SetNoData(reason string, startsAt, endsAt time.Time) {
	a.State = eval.NoData
	a.StateReason = reason
	a.StartsAt = startsAt
	a.EndsAt = endsAt
	a.Error = nil
}

// SetError sets the state to Error. It changes both the start and end time.
func (a *State) SetError(err error, startsAt, endsAt time.Time) {
	a.State = eval.Error
	a.StateReason = models.StateReasonError
	a.StartsAt = startsAt
	a.EndsAt = endsAt
	a.Error = err
}

// SetNormal sets the state to Normal. It changes both the start and end time.
func (a *State) SetNormal(reason string, startsAt, endsAt time.Time) {
	a.State = eval.Normal
	a.StateReason = reason
	a.StartsAt = startsAt
	a.EndsAt = endsAt
	a.Error = nil
}

// Maintain updates the end time using the most recent evaluation.
func (a *State) Maintain(interval int64, evaluatedAt time.Time) {
	a.EndsAt = nextEndsTime(interval, evaluatedAt)
}

// AddErrorAnnotations adds annotations to the state to indicate that an error occurred.
func (a *State) AddErrorAnnotations(err error, rule *models.AlertRule) {
	if err == nil {
		return
	}
	a.Annotations["Error"] = err.Error()
	// If the evaluation failed because a query returned an error then add the Ref ID and
	// Datasource UID as labels
	var utilError errutil.Error
	if errors.As(a.Error, &utilError) &&
		(errors.Is(a.Error, expr.QueryError) || errors.Is(a.Error, expr.ConversionError)) {
		for _, next := range rule.Data {
			if next.RefID == utilError.PublicPayload["refId"].(string) {
				a.Labels["ref_id"] = next.RefID
				a.Labels["datasource_uid"] = next.DatasourceUID
				break
			}
		}
	}
}

// IsNormalStateWithNoReason returns true if the state is Normal and the reason is empty.
func IsNormalStateWithNoReason(s *State) bool {
	return s.State == eval.Normal && s.StateReason == ""
}

// StateTransition describes the transition from one state to another.
type StateTransition struct {
	*State
	PreviousState       eval.State
	PreviousStateReason string
}

func (c StateTransition) Formatted() string {
	return FormatStateAndReason(c.State.State, c.State.StateReason)
}

func (c StateTransition) PreviousFormatted() string {
	return FormatStateAndReason(c.PreviousState, c.PreviousStateReason)
}

func (c StateTransition) Changed() bool {
	return c.PreviousState != c.State.State || c.PreviousStateReason != c.State.StateReason
}

type StateTransitions []StateTransition

// StaleStates returns the subset of StateTransitions that are stale.
func (c StateTransitions) StaleStates() StateTransitions {
	var result StateTransitions
	for _, t := range c {
		if t.IsStale() {
			result = append(result, t)
		}
	}
	return result
}

type Evaluation struct {
	EvaluationTime  time.Time
	EvaluationState eval.State
	// Values contains the RefID and value of reduce and math expressions.
	// Classic conditions can have different values for the same RefID as they can include multiple conditions.
	// For these, we use the index of the condition in addition to the RefID as the key, e.g. "A0, A1, A2, etc.".
	Values map[string]*float64
	// Condition is the refID specified as the condition in the alerting rule at the time of the evaluation.
	Condition string
}

// NewEvaluationValues returns the values for each RefID in the capture.
func NewEvaluationValues(m map[string]eval.NumberValueCapture) map[string]*float64 {
	result := make(map[string]*float64, len(m))
	for k, v := range m {
		result[k] = v.Value
	}
	return result
}

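// resultNormal handles a Normal evaluation result: if the state is already Normal it is kept,
// otherwise the state transitions to Normal with the given reason.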
func resultNormal(state *State, _ *models.AlertRule, result eval.Result, logger log.Logger, reason string) {
	if state.State == eval.Normal {
		logger.Debug("Keeping state", "state", state.State)
	} else {
		nextEndsAt := result.EvaluatedAt
		logger.Debug("Changing state",
			"previous_state", state.State,
			"next_state", eval.Normal,
			"previous_ends_at", state.EndsAt,
			"next_ends_at", nextEndsAt)
		// Normal states have the same start and end timestamps
		state.SetNormal(reason, nextEndsAt, nextEndsAt)
	}
}

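// resultAlerting handles a firing evaluation result. An Alerting state is maintained, a Pending
// state is promoted to Alerting once the rule's For duration has elapsed, and any other state
// becomes Pending (if the rule has a For duration) or Alerting (if it does not).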
func resultAlerting(state *State, rule *models.AlertRule, result eval.Result, logger log.Logger, reason string) {
	switch state.State {
	case eval.Alerting:
		prevEndsAt := state.EndsAt
		state.Maintain(rule.IntervalSeconds, result.EvaluatedAt)
		logger.Debug("Keeping state",
			"state", state.State,
			"previous_ends_at", prevEndsAt,
			"next_ends_at", state.EndsAt)
	case eval.Pending:
		// If the previous state is Pending then check if the For duration has been observed
		if result.EvaluatedAt.Sub(state.StartsAt) >= rule.For {
			nextEndsAt := nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt)
			logger.Debug("Changing state",
				"previous_state", state.State,
				"next_state", eval.Alerting,
				"previous_ends_at", state.EndsAt,
				"next_ends_at", nextEndsAt)
			state.SetAlerting(reason, result.EvaluatedAt, nextEndsAt)
		}
	default:
		nextEndsAt := nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt)
		if rule.For > 0 {
			// If the alert rule has a For duration that should be observed then the state should be set to Pending
			logger.Debug("Changing state",
				"previous_state", state.State,
				"next_state", eval.Pending,
				"previous_ends_at", state.EndsAt,
				"next_ends_at", nextEndsAt)
			state.SetPending(reason, result.EvaluatedAt, nextEndsAt)
		} else {
			logger.Debug("Changing state",
				"previous_state", state.State,
				"next_state", eval.Alerting,
				"previous_ends_at", state.EndsAt,
				"next_ends_at", nextEndsAt)
			state.SetAlerting(reason, result.EvaluatedAt, nextEndsAt)
		}
	}
}

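// resultError handles an evaluation error according to the rule's ExecErrState: it can fire the
// alert (Alerting), record an Error state, keep the state Normal (OK), or keep the last known
// state (KeepLast).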
func resultError(state *State, rule *models.AlertRule, result eval.Result, logger log.Logger) {
	handlerStr := "resultError"
	switch rule.ExecErrState {
	case models.AlertingErrState:
		logger.Debug("Execution error state is Alerting", "handler", "resultAlerting", "previous_handler", handlerStr)
		resultAlerting(state, rule, result, logger, models.StateReasonError)
		// This is a special case where Alerting and Pending should also have an error and reason
		state.Error = result.Error
	case models.ErrorErrState:
		if state.State == eval.Error {
			prevEndsAt := state.EndsAt
			state.Error = result.Error
			state.AddErrorAnnotations(result.Error, rule)
			state.Maintain(rule.IntervalSeconds, result.EvaluatedAt)
			logger.Debug("Keeping state",
				"state", state.State,
				"previous_ends_at", prevEndsAt,
				"next_ends_at", state.EndsAt)
		} else {
			nextEndsAt := nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt)
			// This is the first occurrence of an error
			logger.Debug("Changing state",
				"previous_state", state.State,
				"next_state", eval.Error,
				"previous_ends_at", state.EndsAt,
				"next_ends_at", nextEndsAt)
			state.SetError(result.Error, result.EvaluatedAt, nextEndsAt)
			state.AddErrorAnnotations(result.Error, rule)
		}
	case models.OkErrState:
		logger.Debug("Execution error state is Normal", "handler", "resultNormal", "previous_handler", handlerStr)
		resultNormal(state, rule, result, logger, "") // TODO: Should we add a reason?
	case models.KeepLastErrState:
		logger := logger.New("previous_handler", handlerStr)
		resultKeepLast(state, rule, result, logger)
	default:
		err := fmt.Errorf("unsupported execution error state: %s", rule.ExecErrState)
		state.SetError(err, state.StartsAt, nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt))
		state.Annotations["Error"] = err.Error()
	}
}

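// resultNoData handles a NoData evaluation result according to the rule's NoDataState: it can fire
// the alert (Alerting), record a NoData state, keep the state Normal (OK), or keep the last known
// state (KeepLast).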
func resultNoData(state *State, rule *models.AlertRule, result eval.Result, logger log.Logger) {
	handlerStr := "resultNoData"
	switch rule.NoDataState {
	case models.Alerting:
		logger.Debug("Execution no data state is Alerting", "handler", "resultAlerting", "previous_handler", handlerStr)
		resultAlerting(state, rule, result, logger, models.StateReasonNoData)
	case models.NoData:
		if state.State == eval.NoData {
			prevEndsAt := state.EndsAt
			state.Maintain(rule.IntervalSeconds, result.EvaluatedAt)
			logger.Debug("Keeping state",
				"state", state.State,
				"previous_ends_at", prevEndsAt,
				"next_ends_at", state.EndsAt)
		} else {
			// This is the first occurrence of no data
			nextEndsAt := nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt)
			logger.Debug("Changing state",
				"previous_state", state.State,
				"next_state", eval.NoData,
				"previous_ends_at", state.EndsAt,
				"next_ends_at", nextEndsAt)
			state.SetNoData("", result.EvaluatedAt, nextEndsAt)
		}
	case models.OK:
		logger.Debug("Execution no data state is Normal", "handler", "resultNormal", "previous_handler", handlerStr)
		resultNormal(state, rule, result, logger, models.StateReasonNoData)
	case models.KeepLast:
		logger := logger.New("previous_handler", handlerStr)
		resultKeepLast(state, rule, result, logger)
	default:
		err := fmt.Errorf("unsupported no data state: %s", rule.NoDataState)
		state.SetError(err, state.StartsAt, nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt))
		state.Annotations["Error"] = err.Error()
	}
}

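// resultKeepLast keeps the alert in its previous state (Alerting, Pending, or Normal) when the
// evaluation returns Error or NoData, recording the actual result and KeepLast as the state reason.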
func resultKeepLast(state *State, rule *models.AlertRule, result eval.Result, logger log.Logger) {
	reason := models.ConcatReasons(result.State.String(), models.StateReasonKeepLast)
	switch state.State {
	case eval.Alerting:
		logger.Debug("Execution keep last state is Alerting", "handler", "resultAlerting")
		resultAlerting(state, rule, result, logger, reason)
	case eval.Pending:
		// respect 'for' setting on rule
		if result.EvaluatedAt.Sub(state.StartsAt) >= rule.For {
			logger.Debug("Execution keep last state is Pending", "handler", "resultAlerting")
			resultAlerting(state, rule, result, logger, reason)
		} else {
			logger.Debug("Ignoring set next state to pending")
		}
	case eval.Normal:
		logger.Debug("Execution keep last state is Normal", "handler", "resultNormal")
		resultNormal(state, rule, result, logger, reason)
	default:
		// this should not happen, add as failsafe
		logger.Debug("Reverting invalid state to normal", "handler", "resultNormal")
		resultNormal(state, rule, result, logger, reason)
	}
}

// NeedsSending returns true if the given state needs to be sent to the Alertmanager.
// Reasons for sending include:
// - The state has been resolved since the last notification.
// - The state is firing and the last notification was sent at least resendDelay ago.
// - The state was resolved within the resolvedRetention period, and the last notification was sent at least resendDelay ago.
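//
// For example, a firing state whose LastSentAt is at least resendDelay before LastEvaluationTime is
// re-sent, a state that was resolved after its last notification is sent immediately, and a Pending
// state is never sent.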
func (a *State) NeedsSending(resendDelay time.Duration, resolvedRetention time.Duration) bool {
	if a.State == eval.Pending {
		// We do not send notifications for pending states.
		return false
	}
	// We should send a notification if the state has been resolved since the last notification.
	if a.ResolvedAt != nil && (a.LastSentAt == nil || a.ResolvedAt.After(*a.LastSentAt)) {
		return true
	}
	// For normal states, we should only be sending if this is a resolved notification or a re-send of the resolved
	// notification within the resolvedRetention period.
	if a.State == eval.Normal && (a.ResolvedAt == nil || a.LastEvaluationTime.Sub(*a.ResolvedAt) > resolvedRetention) {
		return false
	}
	// We should send, and re-send, notifications each time LastSentAt + resendDelay is <= LastEvaluationTime.
	// This can include normal->normal transitions that were resolved in recent past evaluations.
	return a.LastSentAt == nil || !a.LastSentAt.Add(resendDelay).After(a.LastEvaluationTime)
}

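// Equals returns true if two states have the same rule, cache ID, labels, state, timestamps, and
// annotations. It does not compare every field, so two "equal" states can still differ in, for
// example, their latest results or values.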
func (a *State) Equals(b *State) bool {
	return a.AlertRuleUID == b.AlertRuleUID &&
		a.OrgID == b.OrgID &&
		a.CacheID == b.CacheID &&
		a.Labels.String() == b.Labels.String() &&
		a.State.String() == b.State.String() &&
		a.StartsAt == b.StartsAt &&
		a.EndsAt == b.EndsAt &&
		a.LastEvaluationTime == b.LastEvaluationTime &&
		data.Labels(a.Annotations).String() == data.Labels(b.Annotations).String()
}

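// nextEndsTime returns when a state should expire if no further evaluations are seen: four times
// the larger of the evaluation interval and ResendDelay, counted from the evaluation time. For
// example, with a 60-second interval (assuming it is larger than ResendDelay) the state expires
// four minutes after the evaluation.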
func nextEndsTime(interval int64, evaluatedAt time.Time) time.Time {
	ends := ResendDelay
	intv := time.Second * time.Duration(interval)
	if intv > ResendDelay {
		ends = intv
	}
	// Allow for at least two evaluation cycles to pass before expiring, every time.
	// Synchronized with Prometheus:
	// https://github.com/prometheus/prometheus/blob/6a9b3263ffdba5ea8c23e6f9ef69fb7a15b566f8/rules/alerting.go#L493
	return evaluatedAt.Add(4 * ends)
}

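// GetLabels returns a copy of the state's labels with the given label options applied.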
func (a *State) GetLabels(opts ...models.LabelOption) map[string]string {
	labels := a.Labels.Copy()
	for _, opt := range opts {
		opt(labels)
	}
	return labels
}

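// GetLastEvaluationValuesForCondition returns the values from the most recent evaluation whose
// RefID matches the rule's condition (including classic condition keys such as "B0", "B1" for
// condition "B"). Missing values are reported as NaN. It returns nil if there is no latest result.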
func (a *State) GetLastEvaluationValuesForCondition() map[string]float64 {
	if a.LatestResult == nil {
		return nil
	}
	lastResult := *a.LatestResult
	r := make(map[string]float64, len(lastResult.Values))
	for refID, value := range lastResult.Values {
		if strings.Contains(refID, lastResult.Condition) {
			if value != nil {
				r[refID] = *value
				continue
			}
			r[refID] = math.NaN()
		}
	}
	return r
}

// IsStale returns true if the state is stale, meaning that the state is ready to be evicted from the cache.
func (a *State) IsStale() bool {
	return a.StateReason == models.StateReasonMissingSeries
}

// shouldTakeImage returns true if the state has just transitioned to alerting from another state,
// transitioned to alerting in a previous evaluation but does not have a screenshot, or has just
// been resolved.
func shouldTakeImage(state, previousState eval.State, previousImage *models.Image, resolved bool) bool {
	return resolved ||
		state == eval.Alerting && previousState != eval.Alerting ||
		state == eval.Alerting && previousImage == nil
}

// takeImage takes an image for the alert rule. It returns nil if screenshots are disabled or
// the rule is not associated with a dashboard panel.
func takeImage(ctx context.Context, s ImageCapturer, r *models.AlertRule) (*models.Image, error) {
	img, err := s.NewImage(ctx, r)
	if err != nil {
		if errors.Is(err, screenshot.ErrScreenshotsUnavailable) ||
			errors.Is(err, models.ErrNoDashboard) ||
			errors.Is(err, models.ErrNoPanel) {
			return nil, nil
		}
		return nil, err
	}
	return img, nil
}

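// FormatStateAndReason returns the state as a string, with the reason appended in parentheses when
// one is set, e.g. "Normal" or "Alerting (NoData)".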
func FormatStateAndReason(state eval.State, reason string) string {
	s := fmt.Sprintf("%v", state)
	if len(reason) > 0 {
		s += fmt.Sprintf(" (%v)", reason)
	}
	return s
}

// ParseFormattedState parses a state string in the format "state (reason)"
// and returns the state and reason separately.
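// For example, "Alerting (NoData)" parses to the Alerting state with reason "NoData", and "Normal"
// parses to the Normal state with an empty reason.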
func ParseFormattedState(stateStr string) (eval.State, string, error) {
	p := 0
	// walk string until we find a space
	for i, c := range stateStr {
		if c == ' ' {
			p = i
			break
		}
	}
	if p == 0 {
		p = len(stateStr)
	}
	state, err := eval.ParseStateString(stateStr[:p])
	if err != nil {
		return -1, "", err
	}
	if p == len(stateStr) {
		return state, "", nil
	}
	reason := strings.Trim(stateStr[p+1:], "()")
	return state, reason, nil
}

// GetRuleExtraLabels returns a map of built-in labels that should be added to an alert before it is sent to the Alertmanager or its state is cached.
func GetRuleExtraLabels(l log.Logger, rule *models.AlertRule, folderTitle string, includeFolder bool) map[string]string {
	extraLabels := make(map[string]string, 4)
	extraLabels[alertingModels.NamespaceUIDLabel] = rule.NamespaceUID
	extraLabels[prometheusModel.AlertNameLabel] = rule.Title
	extraLabels[alertingModels.RuleUIDLabel] = rule.UID
	if includeFolder {
		extraLabels[models.FolderTitleLabel] = folderTitle
	}
	if len(rule.NotificationSettings) > 0 {
		// Notification settings are defined as a slice to work around xorm behavior.
		// Any items past the first should not exist so we ignore them.
		if len(rule.NotificationSettings) > 1 {
			ignored, _ := json.Marshal(rule.NotificationSettings[1:])
			l.Error("Detected multiple notification settings, which is not supported. Only the first will be applied", "ignored_settings", string(ignored))
		}
		return mergeLabels(extraLabels, rule.NotificationSettings[0].ToLabels())
	}
	return extraLabels
}