The open and composable observability and data visualization platform. Visualize metrics, logs, and traces from multiple sources like Prometheus, Loki, Elasticsearch, InfluxDB, Postgres and many more.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
grafana/pkg/services/ngalert/api/api_alertmanager.go

477 lines
16 KiB

package api
import (
"context"
"errors"
"fmt"
"net/http"
"strconv"
"strings"
"time"
"github.com/go-openapi/strfmt"
alertingNotify "github.com/grafana/alerting/notify"
"github.com/grafana/grafana/pkg/api/response"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/accesscontrol"
contextmodel "github.com/grafana/grafana/pkg/services/contexthandler/model"
Inhouse alerting api (#33129) * init * autogens AM route * POST dashboards/db spec * POST alert-notifications spec * fix description * re inits vendor, updates grafana to master * go mod updates * alerting routes * renames to receivers * prometheus endpoints * align config endpoint with cortex, include templates * Change grafana receiver type * Update receivers.go * rename struct to stop swagger thrashing * add rules API * index html * standalone swagger ui html page * Update README.md * Expose GrafanaManagedAlert properties * Some fixes - /api/v1/rules/{Namespace} should return a map - update ExtendedUpsertAlertDefinitionCommand properties * am alerts routes * rename prom swagger section for clarity, remove example endpoints * Add missing json and yaml tags * folder perms * make folders POST again * fix grafana receiver type * rename fodler->namespace for perms * make ruler json again * PR fixes * silences * fix Ok -> Ack * Add id to POST /api/v1/silences (#9) Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in> * Add POST /api/v1/alerts (#10) Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in> * fix silences * Add testing endpoints * removes grpc replace directives * [wip] starts validation * pkg cleanup * go mod tidy * ignores vendor dir * Change response type for Cortex/Loki alerts * receiver unmarshaling tests * ability to split routes between AM & Grafana * api marshaling & validation * begins work on routing lib * [hack] ignores embedded field in generation * path specific datasource for alerting * align endpoint names with cloud * single route per Alerting config * removes unused routing pkg * regens spec * adds datasource param to ruler/prom route paths * Modifications for supporting migration * Apply suggestions from code review * hack for cleaning circular refs in swagger definition * generates files * minor fixes for prom endpoints * decorate prom apis with required: true where applicable * Revert "generates files" This reverts commit ef7e97558477d79bcad416e043b04dbd04a2c8f7. * removes server autogen * Update imported structs from ngalert * Fix listing rules response * Update github.com/prometheus/common dependency * Update get silence response * Update get silences response * adds ruler validation & backend switching * Fix GET /alertmanager/{DatasourceId}/config/api/v1/alerts response * Distinct gettable and postable grafana receivers * Remove permissions routes * Latest JSON specs * Fix testing routes * inline yaml annotation on apirulenode * yaml test & yamlv3 + comments * Fix yaml annotations for embedded type * Rename DatasourceId path parameter * Implement Backend.String() * backend zero value is a real backend * exports DiscoveryBase * Fix GO initialisms * Silences: Use PostableSilence as the base struct for creating silences * Use type alias instead of struct embedding * More fixes to alertmanager silencing routes * post and spec JSONs * Split rule config to postable/gettable * Fix empty POST /silences payload Recreating the generated JSON specs fixes the issue without further modifications * better yaml unmarshaling for nested yaml docs in cortex-am configs * regens spec * re-adds config.receivers * omitempty to align with prometheus API behavior * Prefix routes with /api * Update Alertmanager models * Make adjustments to follow the Alertmanager API * ruler: add for and annotations to grafana alert (#45) * Modify testing API routes * Fix grafana rule for field type * Move PostableUserConfig validation to this library * Fix PostableUserConfig YAML encoding/decoding * Use common fields for grafana and lotex rules * Add namespace id in GettableGrafanaRule * Apply suggestions from code review * fixup * more changes * Apply suggestions from code review * aligns structure pre merge * fix new imports & tests * updates tooling readme * goimports * lint * more linting!! * revive lint Co-authored-by: Sofia Papagiannaki <papagian@gmail.com> Co-authored-by: Domas <domasx2@gmail.com> Co-authored-by: Sofia Papagiannaki <papagian@users.noreply.github.com> Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: gotjosh <josue@grafana.com> Co-authored-by: David Parrott <stomp.box.yo@gmail.com> Co-authored-by: Kyle Brandt <kyle@grafana.com>
5 years ago
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/notifier"
"github.com/grafana/grafana/pkg/services/ngalert/store"
"github.com/grafana/grafana/pkg/util"
)
const (
defaultTestReceiversTimeout = 15 * time.Second
maxTestReceiversTimeout = 30 * time.Second
)
type AlertmanagerSrv struct {
log log.Logger
ac accesscontrol.AccessControl
mam *notifier.MultiOrgAlertmanager
crypto notifier.Crypto
}
type UnknownReceiverError struct {
UID string
}
func (e UnknownReceiverError) Error() string {
return fmt.Sprintf("unknown receiver: %s", e.UID)
}
func (srv AlertmanagerSrv) RouteGetAMStatus(c *contextmodel.ReqContext) response.Response {
am, errResp := srv.AlertmanagerFor(c.OrgID)
if errResp != nil {
return errResp
}
return response.JSON(http.StatusOK, am.GetStatus())
}
func (srv AlertmanagerSrv) RouteCreateSilence(c *contextmodel.ReqContext, postableSilence apimodels.PostableSilence) response.Response {
err := postableSilence.Validate(strfmt.Default)
if err != nil {
srv.log.Error("silence failed validation", "error", err)
return ErrResp(http.StatusBadRequest, err, "silence failed validation")
}
am, errResp := srv.AlertmanagerFor(c.OrgID)
if errResp != nil {
return errResp
}
action := accesscontrol.ActionAlertingInstanceUpdate
if postableSilence.ID == "" {
action = accesscontrol.ActionAlertingInstanceCreate
}
if !accesscontrol.HasAccess(srv.ac, c)(accesscontrol.EvalPermission(action)) {
errAction := "update"
if postableSilence.ID == "" {
errAction = "create"
}
return ErrResp(http.StatusUnauthorized, fmt.Errorf("user is not authorized to %s silences", errAction), "")
}
silenceID, err := am.CreateSilence(&postableSilence)
if err != nil {
if errors.Is(err, alertingNotify.ErrSilenceNotFound) {
return ErrResp(http.StatusNotFound, err, "")
}
if errors.Is(err, alertingNotify.ErrCreateSilenceBadPayload) {
return ErrResp(http.StatusBadRequest, err, "")
}
return ErrResp(http.StatusInternalServerError, err, "failed to create silence")
}
return response.JSON(http.StatusAccepted, apimodels.PostSilencesOKBody{
SilenceID: silenceID,
})
}
func (srv AlertmanagerSrv) RouteDeleteAlertingConfig(c *contextmodel.ReqContext) response.Response {
am, errResp := srv.AlertmanagerFor(c.OrgID)
if errResp != nil {
return errResp
}
if err := am.SaveAndApplyDefaultConfig(c.Req.Context()); err != nil {
srv.log.Error("unable to save and apply default alertmanager configuration", "error", err)
return ErrResp(http.StatusInternalServerError, err, "failed to save and apply default Alertmanager configuration")
}
return response.JSON(http.StatusAccepted, util.DynMap{"message": "configuration deleted; the default is applied"})
}
func (srv AlertmanagerSrv) RouteDeleteSilence(c *contextmodel.ReqContext, silenceID string) response.Response {
am, errResp := srv.AlertmanagerFor(c.OrgID)
if errResp != nil {
return errResp
}
if err := am.DeleteSilence(silenceID); err != nil {
if errors.Is(err, alertingNotify.ErrSilenceNotFound) {
return ErrResp(http.StatusNotFound, err, "")
}
return ErrResp(http.StatusInternalServerError, err, "")
}
return response.JSON(http.StatusOK, util.DynMap{"message": "silence deleted"})
}
func (srv AlertmanagerSrv) RouteGetAlertingConfig(c *contextmodel.ReqContext) response.Response {
config, err := srv.mam.GetAlertmanagerConfiguration(c.Req.Context(), c.OrgID)
if err != nil {
if errors.Is(err, store.ErrNoAlertmanagerConfiguration) {
return ErrResp(http.StatusNotFound, err, "")
}
return ErrResp(http.StatusInternalServerError, err, err.Error())
}
return response.JSON(http.StatusOK, config)
}
func (srv AlertmanagerSrv) RouteGetAlertingConfigHistory(c *contextmodel.ReqContext) response.Response {
limit := c.QueryInt("limit")
configs, err := srv.mam.GetAppliedAlertmanagerConfigurations(c.Req.Context(), c.OrgID, limit)
if err != nil {
return ErrResp(http.StatusInternalServerError, err, err.Error())
}
return response.JSON(http.StatusOK, configs)
}
func (srv AlertmanagerSrv) RouteGetAMAlertGroups(c *contextmodel.ReqContext) response.Response {
am, errResp := srv.AlertmanagerFor(c.OrgID)
if errResp != nil {
return errResp
}
groups, err := am.GetAlertGroups(
c.QueryBoolWithDefault("active", true),
c.QueryBoolWithDefault("silenced", true),
c.QueryBoolWithDefault("inhibited", true),
c.QueryStrings("filter"),
c.Query("receiver"),
)
if err != nil {
if errors.Is(err, alertingNotify.ErrGetAlertGroupsBadPayload) {
return ErrResp(http.StatusBadRequest, err, "")
}
// any other error here should be an unexpected failure and thus an internal error
return ErrResp(http.StatusInternalServerError, err, "")
}
return response.JSON(http.StatusOK, groups)
}
func (srv AlertmanagerSrv) RouteGetAMAlerts(c *contextmodel.ReqContext) response.Response {
am, errResp := srv.AlertmanagerFor(c.OrgID)
if errResp != nil {
return errResp
}
alerts, err := am.GetAlerts(
c.QueryBoolWithDefault("active", true),
c.QueryBoolWithDefault("silenced", true),
c.QueryBoolWithDefault("inhibited", true),
c.QueryStrings("filter"),
c.Query("receiver"),
)
if err != nil {
if errors.Is(err, alertingNotify.ErrGetAlertsBadPayload) {
return ErrResp(http.StatusBadRequest, err, "")
}
if errors.Is(err, alertingNotify.ErrGetAlertsUnavailable) {
return ErrResp(http.StatusServiceUnavailable, err, "")
}
// any other error here should be an unexpected failure and thus an internal error
return ErrResp(http.StatusInternalServerError, err, "")
}
return response.JSON(http.StatusOK, alerts)
}
func (srv AlertmanagerSrv) RouteGetSilence(c *contextmodel.ReqContext, silenceID string) response.Response {
am, errResp := srv.AlertmanagerFor(c.OrgID)
if errResp != nil {
return errResp
}
gettableSilence, err := am.GetSilence(silenceID)
if err != nil {
if errors.Is(err, alertingNotify.ErrSilenceNotFound) {
return ErrResp(http.StatusNotFound, err, "")
}
// any other error here should be an unexpected failure and thus an internal error
return ErrResp(http.StatusInternalServerError, err, "")
}
return response.JSON(http.StatusOK, gettableSilence)
}
func (srv AlertmanagerSrv) RouteGetSilences(c *contextmodel.ReqContext) response.Response {
am, errResp := srv.AlertmanagerFor(c.OrgID)
if errResp != nil {
return errResp
}
gettableSilences, err := am.ListSilences(c.QueryStrings("filter"))
if err != nil {
if errors.Is(err, alertingNotify.ErrListSilencesBadPayload) {
return ErrResp(http.StatusBadRequest, err, "")
}
// any other error here should be an unexpected failure and thus an internal error
return ErrResp(http.StatusInternalServerError, err, "")
}
return response.JSON(http.StatusOK, gettableSilences)
}
func (srv AlertmanagerSrv) RoutePostGrafanaAlertingConfigHistoryActivate(c *contextmodel.ReqContext, id string) response.Response {
confId, err := strconv.ParseInt(id, 10, 64)
if err != nil {
return ErrResp(http.StatusBadRequest, err, "failed to parse config id")
}
err = srv.mam.ActivateHistoricalConfiguration(c.Req.Context(), c.OrgID, confId)
if err != nil {
var unknownReceiverError notifier.UnknownReceiverError
if errors.As(err, &unknownReceiverError) {
return ErrResp(http.StatusBadRequest, unknownReceiverError, "")
}
var configRejectedError notifier.AlertmanagerConfigRejectedError
if errors.As(err, &configRejectedError) {
return ErrResp(http.StatusBadRequest, configRejectedError, "")
}
if errors.Is(err, store.ErrNoAlertmanagerConfiguration) {
return response.Error(http.StatusNotFound, err.Error(), err)
}
if errors.Is(err, notifier.ErrNoAlertmanagerForOrg) {
return response.Error(http.StatusNotFound, err.Error(), err)
}
if errors.Is(err, notifier.ErrAlertmanagerNotReady) {
return response.Error(http.StatusConflict, err.Error(), err)
}
return ErrResp(http.StatusInternalServerError, err, "")
}
return response.JSON(http.StatusAccepted, util.DynMap{"message": "configuration activated"})
}
func (srv AlertmanagerSrv) RoutePostAlertingConfig(c *contextmodel.ReqContext, body apimodels.PostableUserConfig) response.Response {
currentConfig, err := srv.mam.GetAlertmanagerConfiguration(c.Req.Context(), c.OrgID)
// If a config is present and valid we proceed with the guard, otherwise we
// just bypass the guard which is okay as we are anyway in an invalid state.
if err == nil {
if err := srv.provenanceGuard(currentConfig, body); err != nil {
return ErrResp(http.StatusBadRequest, err, "")
}
}
err = srv.mam.ApplyAlertmanagerConfiguration(c.Req.Context(), c.OrgID, body)
if err == nil {
return response.JSON(http.StatusAccepted, util.DynMap{"message": "configuration created"})
}
var unknownReceiverError notifier.UnknownReceiverError
if errors.As(err, &unknownReceiverError) {
return ErrResp(http.StatusBadRequest, unknownReceiverError, "")
}
var configRejectedError notifier.AlertmanagerConfigRejectedError
if errors.As(err, &configRejectedError) {
return ErrResp(http.StatusBadRequest, configRejectedError, "")
}
if errors.Is(err, notifier.ErrNoAlertmanagerForOrg) {
return response.Error(http.StatusNotFound, err.Error(), err)
}
if errors.Is(err, notifier.ErrAlertmanagerNotReady) {
return response.Error(http.StatusConflict, err.Error(), err)
}
return ErrResp(http.StatusInternalServerError, err, "")
}
func (srv AlertmanagerSrv) RouteGetReceivers(c *contextmodel.ReqContext) response.Response {
am, errResp := srv.AlertmanagerFor(c.OrgID)
if errResp != nil {
return errResp
}
rcvs := am.GetReceivers(c.Req.Context())
return response.JSON(http.StatusOK, rcvs)
}
func (srv AlertmanagerSrv) RoutePostTestReceivers(c *contextmodel.ReqContext, body apimodels.TestReceiversConfigBodyParams) response.Response {
if err := srv.crypto.ProcessSecureSettings(c.Req.Context(), c.OrgID, body.Receivers); err != nil {
var unknownReceiverError UnknownReceiverError
if errors.As(err, &unknownReceiverError) {
return ErrResp(http.StatusBadRequest, err, "")
}
return ErrResp(http.StatusInternalServerError, err, "failed to post process Alertmanager configuration")
}
ctx, cancelFunc, err := contextWithTimeoutFromRequest(
c.Req.Context(),
c.Req,
defaultTestReceiversTimeout,
maxTestReceiversTimeout)
if err != nil {
return ErrResp(http.StatusBadRequest, err, "")
}
defer cancelFunc()
am, errResp := srv.AlertmanagerFor(c.OrgID)
if errResp != nil {
return errResp
}
result, err := am.TestReceivers(ctx, body)
if err != nil {
if errors.Is(err, alertingNotify.ErrNoReceivers) {
return response.Error(http.StatusBadRequest, "", err)
}
return response.Error(http.StatusInternalServerError, "", err)
}
return response.JSON(statusForTestReceivers(result.Receivers), newTestReceiversResult(result))
}
func (srv AlertmanagerSrv) RoutePostTestTemplates(c *contextmodel.ReqContext, body apimodels.TestTemplatesConfigBodyParams) response.Response {
am, errResp := srv.AlertmanagerFor(c.OrgID)
if errResp != nil {
return errResp
}
res, err := am.TestTemplate(c.Req.Context(), body)
if err != nil {
return response.Error(http.StatusInternalServerError, "", err)
}
return response.JSON(http.StatusOK, newTestTemplateResult(res))
}
// contextWithTimeoutFromRequest returns a context with a deadline set from the
// Request-Timeout header in the HTTP request. If the header is absent then the
// context will use the default timeout. The timeout in the Request-Timeout
// header cannot exceed the maximum timeout.
func contextWithTimeoutFromRequest(ctx context.Context, r *http.Request, defaultTimeout, maxTimeout time.Duration) (context.Context, context.CancelFunc, error) {
timeout := defaultTimeout
if s := strings.TrimSpace(r.Header.Get("Request-Timeout")); s != "" {
// the timeout is measured in seconds
v, err := strconv.ParseInt(s, 10, 16)
if err != nil {
return nil, nil, err
}
if d := time.Duration(v) * time.Second; d < maxTimeout {
timeout = d
} else {
return nil, nil, fmt.Errorf("exceeded maximum timeout of %d seconds", maxTimeout)
}
}
ctx, cancelFunc := context.WithTimeout(ctx, timeout)
return ctx, cancelFunc, nil
}
func newTestReceiversResult(r *notifier.TestReceiversResult) apimodels.TestReceiversResult {
v := apimodels.TestReceiversResult{
Alert: apimodels.TestReceiversConfigAlertParams{
Annotations: r.Alert.Annotations,
Labels: r.Alert.Labels,
},
Receivers: make([]apimodels.TestReceiverResult, len(r.Receivers)),
NotifiedAt: r.NotifedAt,
}
for ix, next := range r.Receivers {
configs := make([]apimodels.TestReceiverConfigResult, len(next.Configs))
for jx, config := range next.Configs {
configs[jx].Name = config.Name
configs[jx].UID = config.UID
configs[jx].Status = config.Status
if config.Error != nil {
configs[jx].Error = config.Error.Error()
}
}
v.Receivers[ix].Configs = configs
v.Receivers[ix].Name = next.Name
}
return v
}
// statusForTestReceivers returns the appropriate status code for the response
// for the results.
//
// It returns an HTTP 200 OK status code if notifications were sent to all receivers,
// an HTTP 400 Bad Request status code if all receivers contain invalid configuration,
// an HTTP 408 Request Timeout status code if all receivers timed out when sending
// a test notification or an HTTP 207 Multi Status.
func statusForTestReceivers(v []notifier.TestReceiverResult) int {
var (
numBadRequests int
numTimeouts int
numUnknownErrors int
)
for _, receiver := range v {
for _, next := range receiver.Configs {
if next.Error != nil {
var (
invalidReceiverErr alertingNotify.IntegrationValidationError
receiverTimeoutErr alertingNotify.IntegrationTimeoutError
)
if errors.As(next.Error, &invalidReceiverErr) {
numBadRequests += 1
} else if errors.As(next.Error, &receiverTimeoutErr) {
numTimeouts += 1
} else {
numUnknownErrors += 1
}
}
}
}
if numBadRequests == len(v) {
// if all receivers contain invalid configuration
return http.StatusBadRequest
} else if numTimeouts == len(v) {
// if all receivers contain valid configuration but timed out
return http.StatusRequestTimeout
} else if numBadRequests+numTimeouts+numUnknownErrors > 0 {
return http.StatusMultiStatus
} else {
// all receivers were sent a notification without error
return http.StatusOK
}
}
func newTestTemplateResult(res *notifier.TestTemplatesResults) apimodels.TestTemplatesResults {
apiRes := apimodels.TestTemplatesResults{}
for _, r := range res.Results {
apiRes.Results = append(apiRes.Results, apimodels.TestTemplatesResult{
Name: r.Name,
Text: r.Text,
})
}
for _, e := range res.Errors {
apiRes.Errors = append(apiRes.Errors, apimodels.TestTemplatesErrorResult{
Name: e.Name,
Kind: apimodels.TemplateErrorKind(e.Kind),
Message: e.Error.Error(),
})
}
return apiRes
}
func (srv AlertmanagerSrv) AlertmanagerFor(orgID int64) (Alertmanager, *response.NormalResponse) {
am, err := srv.mam.AlertmanagerFor(orgID)
if err == nil {
return am, nil
}
if errors.Is(err, notifier.ErrNoAlertmanagerForOrg) {
return nil, response.Error(http.StatusNotFound, err.Error(), err)
}
if errors.Is(err, notifier.ErrAlertmanagerNotReady) {
return am, response.Error(http.StatusConflict, err.Error(), err)
}
srv.log.Error("unable to obtain the org's Alertmanager", "error", err)
return nil, response.Error(http.StatusInternalServerError, "unable to obtain org's Alertmanager", err)
}