pull/104112/head
Andres Martinez Gotor 2 months ago
commit 0ee7c64baa
  1. apps/advisor/pkg/app/checkregistry/checkregistry.go (7 lines changed)
  2. apps/advisor/pkg/app/checks/datasourcecheck/check.go (17 lines changed)
  3. apps/advisor/pkg/app/checks/datasourcecheck/check_test.go (2 lines changed)
  4. apps/advisor/pkg/app/checks/plugincheck/check.go (41 lines changed)
  5. apps/advisor/pkg/app/checks/plugincheck/check_test.go (4 lines changed)
  6. docs/sources/alerting/fundamentals/alert-rule-evaluation/state-and-health.md (64 lines changed)
  7. docs/sources/alerting/learn/missing-data.md (2 lines changed)
  8. e2e/various-suite/loki-table-explore-to-dash.spec.ts (2 lines changed)
  9. pkg/plugins/manager/fakes/fakes.go (2 lines changed)
  10. pkg/plugins/repo/ifaces.go (2 lines changed)
  11. pkg/plugins/repo/service.go (4 lines changed)
  12. pkg/plugins/repo/service_test.go (2 lines changed)
  13. pkg/server/server.go (2 lines changed)
  14. pkg/services/provisioning/provisioning.go (13 lines changed)

@ -14,6 +14,7 @@ import (
"github.com/grafana/grafana/pkg/services/pluginsintegration/pluginstore"
"github.com/grafana/grafana/pkg/services/pluginsintegration/provisionedplugins"
"github.com/grafana/grafana/pkg/services/ssosettings"
"github.com/grafana/grafana/pkg/setting"
)
type CheckService interface {
@ -31,13 +32,14 @@ type Service struct {
managedPlugins managedplugins.Manager
provisionedPlugins provisionedplugins.Manager
ssoSettingsSvc ssosettings.Service
GrafanaVersion string
}
func ProvideService(datasourceSvc datasources.DataSourceService, pluginStore pluginstore.Store,
pluginContextProvider *plugincontext.Provider, pluginClient plugins.Client,
updateChecker pluginchecker.PluginUpdateChecker,
pluginRepo repo.Service, pluginPreinstall pluginchecker.Preinstall, managedPlugins managedplugins.Manager,
provisionedPlugins provisionedplugins.Manager, ssoSettingsSvc ssosettings.Service,
provisionedPlugins provisionedplugins.Manager, ssoSettingsSvc ssosettings.Service, settings *setting.Cfg,
) *Service {
return &Service{
datasourceSvc: datasourceSvc,
@ -50,6 +52,7 @@ func ProvideService(datasourceSvc datasources.DataSourceService, pluginStore plu
managedPlugins: managedPlugins,
provisionedPlugins: provisionedPlugins,
ssoSettingsSvc: ssoSettingsSvc,
GrafanaVersion: settings.BuildVersion,
}
}
@ -61,11 +64,13 @@ func (s *Service) Checks() []checks.Check {
s.pluginContextProvider,
s.pluginClient,
s.pluginRepo,
s.GrafanaVersion,
),
plugincheck.New(
s.pluginStore,
s.pluginRepo,
s.updateChecker,
s.GrafanaVersion,
),
authchecks.New(s.ssoSettingsSvc),
}

@ -4,6 +4,7 @@ import (
"context"
"errors"
"fmt"
sysruntime "runtime"
"github.com/grafana/grafana-app-sdk/logging"
"github.com/grafana/grafana-plugin-sdk-go/backend"
@ -30,6 +31,7 @@ type check struct {
PluginContextProvider pluginContextProvider
PluginClient plugins.Client
PluginRepo repo.Service
GrafanaVersion string
}
func New(
@ -38,6 +40,7 @@ func New(
pluginContextProvider pluginContextProvider,
pluginClient plugins.Client,
pluginRepo repo.Service,
grafanaVersion string,
) checks.Check {
return &check{
DatasourceSvc: datasourceSvc,
@ -45,6 +48,7 @@ func New(
PluginContextProvider: pluginContextProvider,
PluginClient: pluginClient,
PluginRepo: pluginRepo,
GrafanaVersion: grafanaVersion,
}
}
@ -83,8 +87,9 @@ func (c *check) Steps() []checks.Step {
PluginClient: c.PluginClient,
},
&missingPluginStep{
PluginStore: c.PluginStore,
PluginRepo: c.PluginRepo,
PluginStore: c.PluginStore,
PluginRepo: c.PluginRepo,
GrafanaVersion: c.GrafanaVersion,
},
}
}
@ -201,8 +206,9 @@ func (s *healthCheckStep) Run(ctx context.Context, log logging.Logger, obj *advi
}
type missingPluginStep struct {
PluginStore pluginstore.Store
PluginRepo repo.Service
PluginStore pluginstore.Store
PluginRepo repo.Service
GrafanaVersion string
}
func (s *missingPluginStep) Title() string {
@ -235,7 +241,8 @@ func (s *missingPluginStep) Run(ctx context.Context, log logging.Logger, obj *ad
Url: fmt.Sprintf("/connections/datasources/edit/%s", ds.UID),
},
}
_, err := s.PluginRepo.PluginInfo(ctx, ds.Type)
compatOpts := repo.NewCompatOpts(s.GrafanaVersion, sysruntime.GOOS, sysruntime.GOARCH)
_, err := s.PluginRepo.PluginInfo(ctx, ds.Type, compatOpts)
if err == nil {
// Plugin is available in the repo
links = append(links, advisor.CheckErrorLink{

@ -260,7 +260,7 @@ type MockPluginRepo struct {
exists bool
}
func (m *MockPluginRepo) PluginInfo(context.Context, string) (*repo.PluginInfo, error) {
func (m *MockPluginRepo) PluginInfo(context.Context, string, repo.CompatOpts) (*repo.PluginInfo, error) {
if !m.exists {
return nil, errors.New("plugin not found")
}

@ -9,7 +9,6 @@ import (
"github.com/grafana/grafana-app-sdk/logging"
advisor "github.com/grafana/grafana/apps/advisor/pkg/apis/advisor/v0alpha1"
"github.com/grafana/grafana/apps/advisor/pkg/app/checks"
"github.com/grafana/grafana/pkg/cmd/grafana-cli/services"
"github.com/grafana/grafana/pkg/plugins/repo"
"github.com/grafana/grafana/pkg/services/pluginsintegration/pluginchecker"
"github.com/grafana/grafana/pkg/services/pluginsintegration/pluginstore"
@ -25,18 +24,21 @@ func New(
pluginStore pluginstore.Store,
pluginRepo repo.Service,
updateChecker pluginchecker.PluginUpdateChecker,
grafanaVersion string,
) checks.Check {
return &check{
PluginStore: pluginStore,
PluginRepo: pluginRepo,
updateChecker: updateChecker,
PluginStore: pluginStore,
PluginRepo: pluginRepo,
GrafanaVersion: grafanaVersion,
updateChecker: updateChecker,
}
}
type check struct {
PluginStore pluginstore.Store
PluginRepo repo.Service
updateChecker pluginchecker.PluginUpdateChecker
PluginStore pluginstore.Store
PluginRepo repo.Service
updateChecker pluginchecker.PluginUpdateChecker
GrafanaVersion string
}
func (c *check) ID() string {
@ -63,19 +65,22 @@ func (c *check) Item(ctx context.Context, id string) (any, error) {
func (c *check) Steps() []checks.Step {
return []checks.Step{
&deprecationStep{
PluginRepo: c.PluginRepo,
updateChecker: c.updateChecker,
PluginRepo: c.PluginRepo,
GrafanaVersion: c.GrafanaVersion,
updateChecker: c.updateChecker,
},
&updateStep{
PluginRepo: c.PluginRepo,
updateChecker: c.updateChecker,
PluginRepo: c.PluginRepo,
GrafanaVersion: c.GrafanaVersion,
updateChecker: c.updateChecker,
},
}
}
type deprecationStep struct {
PluginRepo repo.Service
updateChecker pluginchecker.PluginUpdateChecker
PluginRepo repo.Service
GrafanaVersion string
updateChecker pluginchecker.PluginUpdateChecker
}
func (s *deprecationStep) Title() string {
@ -106,7 +111,8 @@ func (s *deprecationStep) Run(ctx context.Context, log logging.Logger, _ *adviso
}
// Check if plugin is deprecated
i, err := s.PluginRepo.PluginInfo(ctx, p.ID)
compatOpts := repo.NewCompatOpts(s.GrafanaVersion, sysruntime.GOOS, sysruntime.GOARCH)
i, err := s.PluginRepo.PluginInfo(ctx, p.ID, compatOpts)
if err != nil {
// Unable to check deprecation status
return nil, nil
@ -129,8 +135,9 @@ func (s *deprecationStep) Run(ctx context.Context, log logging.Logger, _ *adviso
}
type updateStep struct {
PluginRepo repo.Service
updateChecker pluginchecker.PluginUpdateChecker
PluginRepo repo.Service
GrafanaVersion string
updateChecker pluginchecker.PluginUpdateChecker
}
func (s *updateStep) Title() string {
@ -160,7 +167,7 @@ func (s *updateStep) Run(ctx context.Context, log logging.Logger, _ *advisor.Che
}
// Check if plugin has a newer version available
compatOpts := repo.NewCompatOpts(services.GrafanaVersion, sysruntime.GOOS, sysruntime.GOARCH)
compatOpts := repo.NewCompatOpts(s.GrafanaVersion, sysruntime.GOOS, sysruntime.GOARCH)
info, err := s.PluginRepo.GetPluginArchiveInfo(ctx, p.ID, "", compatOpts)
if err != nil {
// Unable to check updates

@ -164,7 +164,7 @@ func TestRun(t *testing.T) {
managedPlugins := &mockManagedPlugins{managed: tt.pluginManaged}
provisionedPlugins := &mockProvisionedPlugins{provisioned: tt.pluginProvisioned}
updateChecker := pluginchecker.ProvideService(managedPlugins, provisionedPlugins, pluginPreinstall)
check := New(pluginStore, pluginRepo, updateChecker)
check := New(pluginStore, pluginRepo, updateChecker, "12.0.0")
items, err := check.Items(context.Background())
assert.NoError(t, err)
@ -200,7 +200,7 @@ type mockPluginRepo struct {
pluginArchiveInfo map[string]*repo.PluginArchiveInfo
}
func (m *mockPluginRepo) PluginInfo(ctx context.Context, id string) (*repo.PluginInfo, error) {
func (m *mockPluginRepo) PluginInfo(ctx context.Context, id string, compatOpts repo.CompatOpts) (*repo.PluginInfo, error) {
return m.pluginInfo[id], nil
}

@ -19,6 +19,12 @@ labels:
title: State and health of alerts
weight: 109
refs:
evaluation_timeout:
- pattern: /docs/
destination: /docs/grafana/<GRAFANA_VERSION>/setup-grafana/configure-grafana/#evaluation_timeout
max_attempts:
- pattern: /docs/
destination: /docs/grafana/<GRAFANA_VERSION>/setup-grafana/configure-grafana/#max_attempts
pending-period:
- pattern: /docs/grafana/
destination: /docs/grafana/<GRAFANA_VERSION>/alerting/fundamentals/alert-rule-evaluation/#pending-period
@ -44,6 +50,16 @@ refs:
destination: /docs/grafana/<GRAFANA_VERSION>/alerting/fundamentals/notifications/notification-policies/
- pattern: /docs/grafana-cloud/
destination: /docs/grafana-cloud/alerting-and-irm/alerting/fundamentals/notifications/notification-policies/
guide-connectivity-errors:
- pattern: /docs/grafana/
destination: /docs/grafana/<GRAFANA_VERSION>/alerting/learn/connectivity-errors/
- pattern: /docs/grafana-cloud/
destination: /docs/grafana-cloud/alerting-and-irm/alerting/learn/connectivity-errors/
guide-missing-data:
- pattern: /docs/grafana/
destination: /docs/grafana/<GRAFANA_VERSION>/alerting/learn/missing-data/
- pattern: /docs/grafana-cloud/
destination: /docs/grafana-cloud/alerting-and-irm/alerting/learn/missing-data/
---
# State and health of alerts
@ -54,14 +70,14 @@ There are three key components that help you understand how your alerts behave d
An alert instance can be in either of the following states:
| State | Description |
| ------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Normal** | The state of an alert when the condition (threshold) is not met. |
| **Pending** | The state of an alert that has breached the threshold but for less than the [pending period](ref:pending-period). |
| **Alerting** | The state of an alert that has breached the threshold for longer than the [pending period](ref:pending-period). |
| **Recovering** | The state of an alert that has been configured to keep [firing for a duration after it is triggered](ref:keep-firing). |
| **No Data<sup>\*</sup>** | The state of an alert whose query returns no data or all values are null. <br/> An alert in this state generates a new [DatasourceNoData alert](#no-data-and-error-alerts). You can [modify the default behavior of the no data state](#modify-the-no-data-or-error-state). |
| **Error<sup>\*</sup>** | The state of an alert when an error or timeout occurred evaluating the alert rule. <br/> An alert in this state generates a new [DatasourceError alert](#no-data-and-error-alerts). You can [modify the default behavior of the error state](#modify-the-no-data-or-error-state). |
| State | Description |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Normal** | The state of an alert when the condition (threshold) is not met. |
| **Pending** | The state of an alert that has breached the threshold but for less than the [pending period](ref:pending-period). |
| **Alerting** | The state of an alert that has breached the threshold for longer than the [pending period](ref:pending-period). |
| **Recovering** | The state of an alert that has been configured to keep [firing for a duration after it is triggered](ref:keep-firing). |
| **Error<sup>\*</sup>** | The state of an alert when an error or timeout occurred evaluating the alert rule. <br/> You can customize the behavior of the [Error state](#error-state), which by default triggers a different alert. |
| **No Data<sup>\*</sup>** | The state of an alert whose query returns no data or all values are null. <br/> You can customize the behavior of the [No Data state](#no-data-state), which by default triggers a different alert. |
If an alert rule changes (except for updates to annotations, the evaluation interval, or other internal fields), its alert instances reset to the `Normal` state. The alert instance state then updates accordingly during the next evaluation.
@ -79,13 +95,25 @@ Alert instances will be routed for [notifications](ref:notifications) when they
{{< figure src="/media/docs/alerting/alert-rule-evaluation-overview-statediagram-v2.png" alt="A diagram of the alert instance states and when to route their notifications." max-width="750px" >}}
### Stale alert instances (MissingSeries)
### `Error` state
The **Error** state is triggered when the alert rule fails to evaluate its query or queries successfully.
This can occur due to evaluation timeouts (default: `30s`) or three repeated failures when querying the data source. The [`evaluation_timeout`](ref:evaluation_timeout) and [`max_attempts`](ref:max_attempts) options control these settings.
When an alert instance enters the **Error** state, Grafana, by default, triggers a new [`DatasourceError` alert](#no-data-and-error-alerts). You can control this behavior based on the desired outcome of your alert rule in [Modify the `No Data` or `Error` state](#modify-the-no-data-or-error-state).
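For reference, a minimal sketch of where these two options live, assuming the standard `[unified_alerting]` section of `grafana.ini` (values shown are the defaults described above; confirm them against your Grafana version):

```ini
[unified_alerting]
# Maximum time an alert rule evaluation may run before it times out
# and the alert instance transitions to the Error state.
evaluation_timeout = 30s

# Number of evaluation attempts before a query failure is surfaced
# as the Error state.
max_attempts = 3
```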
The `No Data` state occurs when the alert rule query runs successfully but returns no data points at all.
### `No Data` state
An alert instance is considered stale if the query returns data but its dimension or series has disappeared for two evaluation intervals. In this case, the alert instance transitions to the **Normal (MissingSeries)** state as resolved, and is then evicted.
The **No Data** state occurs when the alert rule query runs successfully but returns no data points at all.
The process for handling stale alert instances is as follows:
When an alert instance enters the **No Data** state, Grafana, by default, triggers a new [`DatasourceNoData` alert](#no-data-and-error-alerts). You can control this behavior based on the desired outcome of your alert rule in [Modify the `No Data` or `Error` state](#modify-the-no-data-or-error-state).
### Stale alert instances (MissingSeries)
An alert instance is considered **stale** if the query returns data but its dimension (or series) has disappeared for two evaluation intervals.
In this case, the alert instance transitions to the **Normal (MissingSeries)** state as resolved, and is then evicted. The process for handling stale alert instances is as follows:
1. The alert rule runs and returns data for some label sets.
@ -99,6 +127,14 @@ The process for handling stale alert instances is as follows:
1. The alert instance is removed from the UI.
{{< admonition type="tip" >}}
For common examples and practical guidance on handling **Error**, **No Data**, and **stale** alert scenarios, see the following related guides:
- [Handling connectivity errors](ref:guide-connectivity-errors)
- [Handling missing data](ref:guide-missing-data)
{{< /admonition >}}
### `No Data` and `Error` alerts
When an alert rule evaluation results in a `No Data` or `Error` state, Grafana Alerting immediately creates a new alert instance—skipping the pending period—with the following additional labels:
@ -117,7 +153,7 @@ If the alert rule is configured to send notifications directly to a selected con
These states are supported only for Grafana-managed alert rules.
In [Configure no data and error handling](ref:no-data-and-error-handling), you can change the default behaviour when the evaluation returns no data or an error. You can set the alert instance state to `Alerting`, `Normal`, `Error`, or `Keep Last State`.
In [Configure no data and error handling](ref:no-data-and-error-handling), you can change the default behavior when the evaluation returns no data or an error. You can set the alert instance state to `Alerting`, `Normal`, `Error`, or `Keep Last State`.
{{< figure src="/media/docs/alerting/alert-rule-configure-no-data-and-error-v2.png" alt="A screenshot of the `Configure no data and error handling` option in Grafana Alerting." max-width="500px" >}}
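As an illustration, in alert rule file provisioning these behaviors map to the `noDataState` and `execErrState` fields. The fragment below is a hypothetical sketch, not part of this PR, and omits the query definitions a real rule requires:

```yaml
apiVersion: 1
groups:
  - orgId: 1
    name: example-group
    folder: example-folder
    interval: 1m
    rules:
      - uid: example-rule
        title: Example rule
        condition: C
        data: [] # query/expression definitions omitted for brevity
        for: 5m
        # What to do when the query returns no data:
        noDataState: Alerting
        # What to do when evaluation fails with an error:
        execErrState: OK
```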
@ -134,7 +170,7 @@ To minimize the number of **No Data** or **Error** state alerts received, try th
To minimize timeouts resulting in the **Error** state, reduce the time range to request less data every evaluation cycle.
1. Change the default [evaluation time out](https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#evaluation_timeout). The default is set at 30 seconds. To increase the default evaluation timeout, open a support ticket from the [Cloud Portal](https://grafana.com/docs/grafana-cloud/account-management/support/#grafana-cloud-support-options). Note that this should be a last resort, because it may affect the performance of all alert rules and cause missed evaluations if the timeout is too long.
1. Change the default [evaluation timeout](ref:evaluation_timeout). The default is set at 30 seconds. To increase the default evaluation timeout, open a support ticket from the [Cloud Portal](https://grafana.com/docs/grafana-cloud/account-management/support/#grafana-cloud-support-options). Note that this should be a last resort, because it may affect the performance of all alert rules and cause missed evaluations if the timeout is too long.
1. To reduce multiple notifications from **Error** alerts, define a [notification policy](ref:notification-policies) to handle all related alerts with `alertname=DatasourceError`, and filter and group errors from the same data source using the `datasource_uid` label, as sketched below.
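A sketch of such a policy in Grafana's file provisioning format (the receiver names are placeholders, and it assumes a contact point named `datasource-errors` already exists):

```yaml
apiVersion: 1
policies:
  - orgId: 1
    receiver: default-receiver
    routes:
      # Route all DatasourceError alerts to one place and group them per
      # data source, so each failing data source produces a single
      # grouped notification instead of one per alert rule.
      - receiver: datasource-errors
        object_matchers:
          - ['alertname', '=', 'DatasourceError']
        group_by: ['datasource_uid']
```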

@ -173,7 +173,7 @@ If an alert instance becomes stale, you’ll find in the [alert history](ref:ale
###
### Why doesn’t MissingSeries match No Data behaviour?
### Why doesn’t MissingSeries match No Data behavior?
In dynamic environments — autoscaling groups, ephemeral pods, spot instances — series naturally come and go. **MissingSeries** normally signals infrastructure or deployment changes.

@ -110,7 +110,7 @@ const lokiQueryResult = {
},
};
describe('Loki Query Editor', () => {
describe.skip('Loki Query Editor', () => {
beforeEach(() => {
e2e.flows.login(Cypress.env('USERNAME'), Cypress.env('PASSWORD'));
});

@ -272,7 +272,7 @@ func (r *FakePluginRepo) PluginVersion(ctx context.Context, pluginID, version st
return repo.VersionData{}, nil
}
func (r *FakePluginRepo) PluginInfo(ctx context.Context, pluginID string) (*repo.PluginInfo, error) {
func (r *FakePluginRepo) PluginInfo(ctx context.Context, pluginID string, compatOpts repo.CompatOpts) (*repo.PluginInfo, error) {
return &repo.PluginInfo{}, nil
}

@ -17,7 +17,7 @@ type Service interface {
// PluginVersion will return plugin version based on the requested information.
PluginVersion(ctx context.Context, pluginID, version string, compatOpts CompatOpts) (VersionData, error)
// PluginInfo will return generic plugin information from grafana.com/api/plugins.
PluginInfo(ctx context.Context, pluginID string) (*PluginInfo, error)
PluginInfo(ctx context.Context, pluginID string, compatOpts CompatOpts) (*PluginInfo, error)
}
type CompatOpts struct {

@ -132,7 +132,7 @@ func (m *Manager) grafanaCompatiblePluginVersions(ctx context.Context, pluginID
return v.Versions, nil
}
func (m *Manager) PluginInfo(ctx context.Context, pluginID string) (*PluginInfo, error) {
func (m *Manager) PluginInfo(ctx context.Context, pluginID string, compatOpts CompatOpts) (*PluginInfo, error) {
u, err := url.Parse(m.client.grafanaComAPIURL)
if err != nil {
return nil, err
@ -140,7 +140,7 @@ func (m *Manager) PluginInfo(ctx context.Context, pluginID string) (*PluginInfo,
u.Path = path.Join(u.Path, pluginID)
body, err := m.client.SendReq(ctx, u, CompatOpts{})
body, err := m.client.SendReq(ctx, u, compatOpts)
if err != nil {
return nil, err
}

@ -126,7 +126,7 @@ func TestPluginInfo(t *testing.T) {
BaseURL: srv.URL,
Logger: log.NewTestPrettyLogger(),
})
pi, err := m.PluginInfo(context.Background(), pluginID)
pi, err := m.PluginInfo(context.Background(), pluginID, CompatOpts{})
require.NoError(t, err)
require.Equal(t, 1, pi.ID)
require.Equal(t, pluginID, pi.Slug)
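Outside this diff, every caller of `PluginInfo` now has to supply `CompatOpts` explicitly, as the test above does. A minimal sketch of a consumer of the updated interface (the function and its parameters are illustrative, not part of the PR):

```go
package repoexample

import (
	"context"
	"fmt"
	"runtime"

	"github.com/grafana/grafana/pkg/plugins/repo"
)

// lookupPlugin is a hypothetical consumer of the updated interface:
// PluginInfo now requires CompatOpts, so the grafana.com request carries
// Grafana version and OS/arch context instead of an empty CompatOpts{}.
func lookupPlugin(ctx context.Context, svc repo.Service, grafanaVersion, pluginID string) error {
	compatOpts := repo.NewCompatOpts(grafanaVersion, runtime.GOOS, runtime.GOARCH)
	info, err := svc.PluginInfo(ctx, pluginID, compatOpts)
	if err != nil {
		return fmt.Errorf("plugin %q not available in the catalog: %w", pluginID, err)
	}
	fmt.Printf("found %s (id %d)\n", info.Slug, info.ID)
	return nil
}
```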

@ -131,7 +131,7 @@ func (s *Server) Init() error {
return err
}
return nil
return s.provisioningService.RunInitProvisioners(s.context)
}
// Run initializes and starts services. This will block until all services have

@ -183,21 +183,18 @@ func (ps *ProvisioningServiceImpl) RunInitProvisioners(ctx context.Context) erro
return err
}
err = ps.ProvisionAlerting(ctx)
if err != nil {
ps.log.Error("Failed to provision alerting", "error", err)
return err
}
return nil
}
func (ps *ProvisioningServiceImpl) Run(ctx context.Context) error {
var err error
// run Init Provisioners only once
ps.onceInitProvisioners.Do(func() {
err = ps.RunInitProvisioners(ctx)
// Run Alerting Provisioning only once.
// It can't be initialized at RunInitProvisioners because it
// depends on the Server to be already running and listening
// to /apis endpoints.
err = ps.ProvisionAlerting(ctx)
})
if err != nil {
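For context on the once-guard semantics this change relies on: `sync.Once` runs the function a single time across all calls, so only the first `Run` invocation can observe a provisioning error; later calls are no-ops that return nil. A self-contained sketch of the same pattern (names are hypothetical, not the PR's code):

```go
package main

import (
	"fmt"
	"sync"
)

// onceRunner mirrors the provisioning service's structure: order-sensitive
// setup is deferred to the first Run call and executes exactly once.
type onceRunner struct {
	once sync.Once
}

func (r *onceRunner) Run() error {
	var err error
	r.once.Do(func() {
		// Stand-in for ProvisionAlerting, which must wait until the server
		// is already listening on its /apis endpoints.
		err = provisionAlerting()
	})
	// Subsequent calls return nil: once.Do is a no-op and err stays zero.
	return err
}

func provisionAlerting() error { return nil } // hypothetical stand-in

func main() {
	r := &onceRunner{}
	fmt.Println(r.Run(), r.Run()) // setup runs once; both calls print <nil>
}
```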
