Advisor: Avoid returning an error when creating initial resources (#102545)

3 months ago · 20e171968e
parent c11a37eabb
commit 20e171968e
4 changed files with 90 additions and 39 deletions
--- a/apps/advisor/pkg/app/checkscheduler/checkscheduler.go
+++ b/apps/advisor/pkg/app/checkscheduler/checkscheduler.go
@ -14,6 +14,7 @@ import (
 	advisorv0alpha1 "github.com/grafana/grafana/apps/advisor/pkg/apis/advisor/v0alpha1"
 	"github.com/grafana/grafana/apps/advisor/pkg/app/checkregistry"
 	"github.com/grafana/grafana/apps/advisor/pkg/app/checks"
+	"github.com/grafana/grafana/pkg/infra/log"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/klog/v2"
 )
@ -30,6 +31,7 @@ type Runner struct {
 	evaluationInterval time.Duration
 	maxHistory         int
 	namespace          string
+	log                log.Logger
 }

 // NewRunner creates a new Runner.
@ -66,22 +68,25 @@ func New(cfg app.Config) (app.Runnable, error) {
 		evaluationInterval: evalInterval,
 		maxHistory:         maxHistory,
 		namespace:          namespace,
+		log:                log.New("advisor.checkscheduler"),
 	}, nil
 }

 func (r *Runner) Run(ctx context.Context) error {
 	lastCreated, err := r.checkLastCreated(ctx)
 	if err != nil {
-		return err
-	}
-
-	// do an initial creation if necessary
-	if lastCreated.IsZero() {
-		err = r.createChecks(ctx)
-		if err != nil {
-			klog.Error("Error creating new check reports", "error", err)
-		} else {
-			lastCreated = time.Now()
+		r.log.Error("Error getting last check creation time", "error", err)
+		// Wait for interval to create the next scheduled check
+		lastCreated = time.Now()
+	} else {
+		// do an initial creation if necessary
+		if lastCreated.IsZero() {
+			err = r.createChecks(ctx)
+			if err != nil {
+				klog.Error("Error creating new check reports", "error", err)
+			} else {
+				lastCreated = time.Now()
+			}
 		}
 	}

--- a/apps/advisor/pkg/app/checkscheduler/checkscheduler_test.go
+++ b/apps/advisor/pkg/app/checkscheduler/checkscheduler_test.go
@ -11,28 +11,35 @@ import (
 	"github.com/grafana/grafana-app-sdk/resource"
 	advisorv0alpha1 "github.com/grafana/grafana/apps/advisor/pkg/apis/advisor/v0alpha1"
 	"github.com/grafana/grafana/apps/advisor/pkg/app/checks"
+	"github.com/grafana/grafana/pkg/infra/log"
 	"github.com/stretchr/testify/assert"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )

-func TestRunner_Run_ErrorOnList(t *testing.T) {
-	mockCheckService := &MockCheckService{}
-	mockClient := &MockClient{
-		listFunc: func(ctx context.Context, namespace string, options resource.ListOptions) (resource.ListObject, error) {
-			return nil, errors.New("list error")
-		},
-		createFunc: func(ctx context.Context, id resource.Identifier, obj resource.Object, opts resource.CreateOptions) (resource.Object, error) {
-			return &advisorv0alpha1.Check{}, nil
-		},
-	}
-
-	runner := &Runner{
-		checkRegistry: mockCheckService,
-		client:        mockClient,
-	}
-
-	err := runner.Run(context.Background())
-	assert.Error(t, err)
+func TestRunner_Run(t *testing.T) {
+	t.Run("does not crash when error on list", func(t *testing.T) {
+		mockCheckService := &MockCheckService{}
+		mockClient := &MockClient{
+			listFunc: func(ctx context.Context, namespace string, options resource.ListOptions) (resource.ListObject, error) {
+				return nil, errors.New("list error")
+			},
+			createFunc: func(ctx context.Context, id resource.Identifier, obj resource.Object, opts resource.CreateOptions) (resource.Object, error) {
+				return &advisorv0alpha1.Check{}, nil
+			},
+		}
+
+		runner := &Runner{
+			checkRegistry:      mockCheckService,
+			client:             mockClient,
+			log:                log.NewNopLogger(),
+			evaluationInterval: 1 * time.Hour,
+		}
+
+		ctx, cancel := context.WithCancel(context.Background())
+		cancel()
+		err := runner.Run(ctx)
+		assert.ErrorAs(t, err, &context.Canceled)
+	})
 }

 func TestRunner_checkLastCreated_ErrorOnList(t *testing.T) {
@ -44,6 +51,7 @@ func TestRunner_checkLastCreated_ErrorOnList(t *testing.T) {

 	runner := &Runner{
 		client: mockClient,
+		log:    log.NewNopLogger(),
 	}

 	lastCreated, err := runner.checkLastCreated(context.Background())
@ -68,6 +76,7 @@ func TestRunner_createChecks_ErrorOnCreate(t *testing.T) {
 	runner := &Runner{
 		checkRegistry: mockCheckService,
 		client:        mockClient,
+		log:           log.NewNopLogger(),
 	}

 	err := runner.createChecks(context.Background())
@ -91,6 +100,7 @@ func TestRunner_createChecks_Success(t *testing.T) {
 	runner := &Runner{
 		checkRegistry: mockCheckService,
 		client:        mockClient,
+		log:           log.NewNopLogger(),
 	}

 	err := runner.createChecks(context.Background())
@ -106,6 +116,7 @@ func TestRunner_cleanupChecks_ErrorOnList(t *testing.T) {

 	runner := &Runner{
 		client: mockClient,
+		log:    log.NewNopLogger(),
 	}

 	err := runner.cleanupChecks(context.Background())
@ -126,6 +137,7 @@ func TestRunner_cleanupChecks_WithinMax(t *testing.T) {

 	runner := &Runner{
 		client: mockClient,
+		log:    log.NewNopLogger(),
 	}

 	err := runner.cleanupChecks(context.Background())
@ -155,6 +167,7 @@ func TestRunner_cleanupChecks_ErrorOnDelete(t *testing.T) {
 	runner := &Runner{
 		client:     mockClient,
 		maxHistory: defaultMaxHistory,
+		log:        log.NewNopLogger(),
 	}
 	err := runner.cleanupChecks(context.Background())
 	assert.ErrorContains(t, err, "delete error")
@ -190,6 +203,7 @@ func TestRunner_cleanupChecks_Success(t *testing.T) {
 	runner := &Runner{
 		client:     mockClient,
 		maxHistory: defaultMaxHistory,
+		log:        log.NewNopLogger(),
 	}
 	err := runner.cleanupChecks(context.Background())
 	assert.NoError(t, err)
--- a/apps/advisor/pkg/app/checktyperegisterer/checktyperegisterer.go
+++ b/apps/advisor/pkg/app/checktyperegisterer/checktyperegisterer.go
@ -3,6 +3,7 @@ package checktyperegisterer
 import (
 	"context"
 	"fmt"
+	"time"

 	"github.com/grafana/grafana-app-sdk/app"
 	"github.com/grafana/grafana-app-sdk/k8s"
@ -10,6 +11,7 @@ import (
 	advisorv0alpha1 "github.com/grafana/grafana/apps/advisor/pkg/apis/advisor/v0alpha1"
 	"github.com/grafana/grafana/apps/advisor/pkg/app/checkregistry"
 	"github.com/grafana/grafana/apps/advisor/pkg/app/checks"
+	"github.com/grafana/grafana/pkg/infra/log"
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
@ -21,6 +23,9 @@ type Runner struct {
 	checkRegistry checkregistry.CheckService
 	client        resource.Client
 	namespace     string
+	log           log.Logger
+	retryAttempts int
+	retryDelay    time.Duration
 }

 // NewRunner creates a new Runner.
@ -47,9 +52,32 @@ func New(cfg app.Config) (app.Runnable, error) {
 		checkRegistry: checkRegistry,
 		client:        client,
 		namespace:     namespace,
+		log:           log.New("advisor.checktyperegisterer"),
+		retryAttempts: 3,
+		retryDelay:    time.Second * 5,
 	}, nil
 }

+func (r *Runner) createOrUpdate(ctx context.Context, obj resource.Object) error {
+	id := obj.GetStaticMetadata().Identifier()
+	_, err := r.client.Create(ctx, id, obj, resource.CreateOptions{})
+	if err != nil {
+		if errors.IsAlreadyExists(err) {
+			// Already exists, update
+			r.log.Debug("Check type already exists, updating", "identifier", id)
+			_, err = r.client.Update(ctx, id, obj, resource.UpdateOptions{})
+			if err != nil {
+				// Ignore the error, it's probably due to a race condition
+				r.log.Error("Error updating check type", "error", err)
+			}
+			return nil
+		}
+		return err
+	}
+	r.log.Debug("Check type registered successfully", "identifier", id)
+	return nil
+}
+
 func (r *Runner) Run(ctx context.Context) error {
 	for _, t := range r.checkRegistry.Checks() {
 		steps := t.Steps()
@ -72,19 +100,19 @@ func (r *Runner) Run(ctx context.Context) error {
 				Steps: stepTypes,
 			},
 		}
-		id := obj.GetStaticMetadata().Identifier()
-		_, err := r.client.Create(ctx, id, obj, resource.CreateOptions{})
-		if err != nil {
-			if errors.IsAlreadyExists(err) {
-				// Already exists, update
-				_, err = r.client.Update(ctx, id, obj, resource.UpdateOptions{})
-				if err != nil {
-					return err
+		for i := 0; i < r.retryAttempts; i++ {
+			err := r.createOrUpdate(ctx, obj)
+			if err != nil {
+				r.log.Error("Error creating check type, retrying", "error", err, "attempt", i+1)
+				if i == r.retryAttempts-1 {
+					r.log.Error("Unable to register check type")
 				} else {
-					continue
+					time.Sleep(r.retryDelay)
 				}
+				continue
 			}
-			return err
+			r.log.Debug("Check type registered successfully", "check_type", t.ID())
+			break
 		}
 	}
 	return nil
--- a/apps/advisor/pkg/app/checktyperegisterer/checktyperegisterer_test.go
+++ b/apps/advisor/pkg/app/checktyperegisterer/checktyperegisterer_test.go
@ -9,6 +9,7 @@ import (
 	"github.com/grafana/grafana-app-sdk/resource"
 	advisorv0alpha1 "github.com/grafana/grafana/apps/advisor/pkg/apis/advisor/v0alpha1"
 	"github.com/grafana/grafana/apps/advisor/pkg/app/checks"
+	"github.com/grafana/grafana/pkg/infra/log"
 	k8sErrs "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 )
@ -117,7 +118,10 @@ func TestCheckTypesRegisterer_Run(t *testing.T) {
 					createFunc: tt.createFunc,
 					updateFunc: tt.updateFunc,
 				},
-				namespace: "custom-namespace",
+				namespace:     "custom-namespace",
+				log:           log.New("test"),
+				retryAttempts: 1,
+				retryDelay:    0,
 			}
 			err := r.Run(context.Background())
 			if err != nil {