refactor(blooms): Implement retry mechanisms in planner (#13064)

pull/13139/head
Salva Corts 12 months ago committed by GitHub
parent 757b776de3
commit cd64d6ddbd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 6
      docs/sources/shared/configuration.md
  2. 1
      pkg/bloombuild/planner/config.go
  3. 7
      pkg/bloombuild/planner/metrics.go
  4. 42
      pkg/bloombuild/planner/planner.go
  5. 292
      pkg/bloombuild/planner/planner_test.go
  6. 12
      pkg/validation/limits.go

@ -3433,6 +3433,12 @@ shard_streams:
# CLI flag: -bloom-build.max-builders
[bloom_build_max_builders: <int> | default = 0]
# Experimental. Timeout for a builder to finish a task. If a builder does not
# respond within this time, it is considered failed and the task will be
# requeued. 0 disables the timeout.
# CLI flag: -bloom-build.builder-response-timeout
[bloom_build_builder_response_timeout: <duration> | default = 0s]
# Experimental. Length of the n-grams created when computing blooms from log
# lines.
# CLI flag: -bloom-compactor.ngram-length

@ -40,6 +40,7 @@ type Limits interface {
BloomCreationEnabled(tenantID string) bool
BloomSplitSeriesKeyspaceBy(tenantID string) int
BloomBuildMaxBuilders(tenantID string) int
BuilderResponseTimeout(tenantID string) time.Duration
}
type QueueLimits struct {

@ -24,6 +24,7 @@ type Metrics struct {
connectedBuilders prometheus.GaugeFunc
queueDuration prometheus.Histogram
inflightRequests prometheus.Summary
tasksRequeued prometheus.Counter
taskLost prometheus.Counter
buildStarted prometheus.Counter
@ -66,6 +67,12 @@ func NewMetrics(
MaxAge: time.Minute,
AgeBuckets: 6,
}),
tasksRequeued: promauto.With(r).NewCounter(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsSubsystem,
Name: "tasks_requeued_total",
Help: "Total number of tasks requeued due to not being picked up by a builder.",
}),
taskLost: promauto.With(r).NewCounter(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsSubsystem,

@ -557,6 +557,10 @@ func (p *Planner) BuilderLoop(builder protos.PlannerForBuilder_BuilderLoopServer
for p.isRunningOrStopping() {
item, idx, err := p.tasksQueue.Dequeue(builder.Context(), lastIndex, builderID)
if err != nil {
if errors.Is(err, queue.ErrStopped) {
// Planner is stopping, break the loop and return
break
}
return fmt.Errorf("error dequeuing task: %w", err)
}
lastIndex = idx
@ -582,9 +586,11 @@ func (p *Planner) BuilderLoop(builder protos.PlannerForBuilder_BuilderLoopServer
if err := p.enqueueTask(task); err != nil {
p.metrics.taskLost.Inc()
level.Error(logger).Log("msg", "error re-enqueuing task. this task will be lost", "err", err)
continue
}
return fmt.Errorf("error forwarding task to builder (%s). Task requeued: %w", builderID, err)
p.metrics.tasksRequeued.Inc()
level.Error(logger).Log("msg", "error forwarding task to builder, Task requeued", "err", err)
}
}
@ -607,16 +613,34 @@ func (p *Planner) forwardTaskToBuilder(
return fmt.Errorf("error sending task to builder (%s): %w", builderID, err)
}
// TODO(salvacorts): Implement timeout and retry for builder response.
res, err := builder.Recv()
if err != nil {
return fmt.Errorf("error receiving response from builder (%s): %w", builderID, err)
}
if res.GetError() != "" {
return fmt.Errorf("error processing task in builder (%s): %s", builderID, res.GetError())
// Launch a goroutine to wait for the response from the builder so we can
// wait for a timeout, error or a response from the builder
errCh := make(chan error)
go func() {
// If connection is closed, Recv() will return an error
res, err := builder.Recv()
if err != nil {
err = fmt.Errorf("error receiving response from builder (%s): %w", builderID, err)
}
if res.GetError() != "" {
err = fmt.Errorf("error processing task in builder (%s): %s", builderID, res.GetError())
}
errCh <- err // Error will be nil on successful completion
}()
taskTimeout := p.limits.BuilderResponseTimeout(task.Tenant)
if taskTimeout == 0 {
// If the timeout is 0 (disabled), just wait for the builder to respond
return <-errCh
}
return nil
timeout := time.After(taskTimeout)
select {
case err := <-errCh:
return err
case <-timeout:
return fmt.Errorf("timeout waiting for response from builder (%s)", builderID)
}
}
func (p *Planner) isRunningOrStopping() bool {

@ -333,18 +333,25 @@ func Test_blockPlansForGaps(t *testing.T) {
}
}
func Test_BuilderLoop(t *testing.T) {
const (
nTasks = 100
nBuilders = 10
)
logger := log.NewNopLogger()
limits := &fakeLimits{}
cfg := Config{
PlanningInterval: 1 * time.Hour,
MaxQueuedTasksPerTenant: 10000,
// createTasks builds n identical test tasks for the "fake" tenant so tests
// can enqueue a known amount of work into the planner queue.
func createTasks(n int) []*Task {
	tasks := make([]*Task, n)
	for i := range tasks {
		tasks[i] = NewTask(
			context.Background(), time.Now(),
			protos.NewTask(config.NewDayTable(config.NewDayTime(0), "fake"), "fakeTenant", v1.NewBounds(0, 10), tsdbID(1), nil),
		)
	}
	return tasks
}
func createPlanner(
t *testing.T,
cfg Config,
limits Limits,
logger log.Logger,
) *Planner {
schemaCfg := config.SchemaConfig{
Configs: []config.PeriodConfig{
{
@ -377,77 +384,227 @@ func Test_BuilderLoop(t *testing.T) {
},
}
// Create planner
planner, err := New(cfg, limits, schemaCfg, storageCfg, storage.NewClientMetrics(), nil, logger, prometheus.DefaultRegisterer)
reg := prometheus.NewPedanticRegistry()
planner, err := New(cfg, limits, schemaCfg, storageCfg, storage.ClientMetrics{}, nil, logger, reg)
require.NoError(t, err)
// Start planner
err = services.StartAndAwaitRunning(context.Background(), planner)
require.NoError(t, err)
t.Cleanup(func() {
err := services.StopAndAwaitTerminated(context.Background(), planner)
require.NoError(t, err)
})
return planner
}
// Enqueue tasks
for i := 0; i < nTasks; i++ {
task := NewTask(
context.Background(), time.Now(),
protos.NewTask(config.NewDayTable(config.NewDayTime(0), "fake"), "fakeTenant", v1.NewBounds(0, 10), tsdbID(1), nil),
)
func Test_BuilderLoop(t *testing.T) {
const (
nTasks = 100
nBuilders = 10
)
err = planner.enqueueTask(task)
require.NoError(t, err)
}
for _, tc := range []struct {
name string
limits Limits
expectedBuilderLoopError error
// modifyBuilder should leave the builder in a state where it will not return or return an error
modifyBuilder func(builder *fakeBuilder)
// resetBuilder should reset the builder to a state where it will return no errors
resetBuilder func(builder *fakeBuilder)
}{
{
name: "success",
limits: &fakeLimits{},
expectedBuilderLoopError: errPlannerIsNotRunning,
},
{
name: "error rpc",
limits: &fakeLimits{},
expectedBuilderLoopError: errPlannerIsNotRunning,
modifyBuilder: func(builder *fakeBuilder) {
builder.SetReturnError(true)
},
resetBuilder: func(builder *fakeBuilder) {
builder.SetReturnError(false)
},
},
{
name: "error msg",
limits: &fakeLimits{},
expectedBuilderLoopError: errPlannerIsNotRunning,
modifyBuilder: func(builder *fakeBuilder) {
builder.SetReturnErrorMsg(true)
},
resetBuilder: func(builder *fakeBuilder) {
builder.SetReturnErrorMsg(false)
},
},
{
name: "timeout",
limits: &fakeLimits{
timeout: 1 * time.Second,
},
expectedBuilderLoopError: errPlannerIsNotRunning,
modifyBuilder: func(builder *fakeBuilder) {
builder.SetWait(true)
},
resetBuilder: func(builder *fakeBuilder) {
builder.SetWait(false)
},
},
{
name: "context cancel",
limits: &fakeLimits{},
// Builders cancel the context when they disconnect. We forward this error to the planner.
expectedBuilderLoopError: context.Canceled,
modifyBuilder: func(builder *fakeBuilder) {
builder.CancelContext(true)
},
},
} {
t.Run(tc.name, func(t *testing.T) {
logger := log.NewNopLogger()
//logger := log.NewLogfmtLogger(os.Stdout)
// All tasks should be pending
require.Equal(t, nTasks, planner.totalPendingTasks())
cfg := Config{
PlanningInterval: 1 * time.Hour,
MaxQueuedTasksPerTenant: 10000,
}
planner := createPlanner(t, cfg, tc.limits, logger)
// Create builders and call planner.BuilderLoop
builders := make([]*fakeBuilder, 0, nBuilders)
for i := 0; i < nBuilders; i++ {
builder := newMockBuilder(fmt.Sprintf("builder-%d", i))
builders = append(builders, builder)
// Start planner
err := services.StartAndAwaitRunning(context.Background(), planner)
require.NoError(t, err)
t.Cleanup(func() {
err := services.StopAndAwaitTerminated(context.Background(), planner)
require.NoError(t, err)
})
// Enqueue tasks
tasks := createTasks(nTasks)
for _, task := range tasks {
err = planner.enqueueTask(task)
require.NoError(t, err)
}
go func() {
// We ignore the error since when the planner is stopped,
// the loop will return an error (queue closed)
_ = planner.BuilderLoop(builder)
}()
}
// Create builders and call planner.BuilderLoop
builders := make([]*fakeBuilder, 0, nBuilders)
for i := 0; i < nBuilders; i++ {
builder := newMockBuilder(fmt.Sprintf("builder-%d", i))
builders = append(builders, builder)
go func() {
err = planner.BuilderLoop(builder)
require.ErrorIs(t, err, tc.expectedBuilderLoopError)
}()
}
// Eventually, all tasks should be sent to builders
require.Eventually(t, func() bool {
var receivedTasks int
for _, builder := range builders {
receivedTasks += len(builder.ReceivedTasks())
}
return receivedTasks == nTasks
}, 5*time.Second, 10*time.Millisecond)
// Finally, the queue should be empty
require.Equal(t, 0, planner.totalPendingTasks())
if tc.modifyBuilder != nil {
// Configure builders to return errors
for _, builder := range builders {
tc.modifyBuilder(builder)
}
// Enqueue tasks again
for _, task := range tasks {
err = planner.enqueueTask(task)
require.NoError(t, err)
}
// Tasks should not be consumed
require.Neverf(
t, func() bool {
return planner.totalPendingTasks() == 0
},
5*time.Second, 10*time.Millisecond,
"all tasks were consumed but they should not be",
)
}
if tc.resetBuilder != nil {
// Configure builders to return no errors
for _, builder := range builders {
tc.resetBuilder(builder)
}
// Eventually, all tasks should be sent to builders
require.Eventually(t, func() bool {
var receivedTasks int
for _, builder := range builders {
receivedTasks += len(builder.ReceivedTasks())
}
return receivedTasks == nTasks
}, 15*time.Second, 10*time.Millisecond)
// Finally, the queue should be empty
require.Equal(t, 0, planner.totalPendingTasks())
// Now all tasks should be consumed
require.Eventuallyf(
t, func() bool {
return planner.totalPendingTasks() == 0
},
5*time.Second, 10*time.Millisecond,
"tasks not consumed, pending: %d", planner.totalPendingTasks(),
)
}
})
}
}
// fakeBuilder is a test double for the builder end of the planner's builder
// stream. It records received tasks and can be configured to fail, hang, or
// cancel its context to exercise the planner's retry/requeue logic.
type fakeBuilder struct {
id string
tasks []*protos.Task
grpc.ServerStream
// Failure knobs toggled by tests. NOTE(review): these are plain bools
// read and written from different goroutines without synchronization —
// racy, acceptable only within these tests.
returnError bool
returnErrorMsg bool
wait bool
// ctx stands in for the builder's stream context; canceling it
// simulates a builder disconnect.
ctx context.Context
ctxCancel context.CancelFunc
}
// newMockBuilder returns a fakeBuilder carrying a cancellable context so
// tests can simulate a builder disconnect via CancelContext.
//
// Fix: the rendered text contained a stale `return &fakeBuilder{id: id}`
// ahead of the context setup, making the cancellable-context construction
// unreachable; only the context-carrying construction is kept.
func newMockBuilder(id string) *fakeBuilder {
	ctx, cancel := context.WithCancel(context.Background())
	return &fakeBuilder{
		id:        id,
		ctx:       ctx,
		ctxCancel: cancel,
	}
}
// ReceivedTasks returns the tasks this fake builder has been sent so far.
// NOTE(review): returns the internal slice without copying or locking while
// other goroutines may be appending to it — racy; fine for these tests only.
func (f *fakeBuilder) ReceivedTasks() []*protos.Task {
return f.tasks
}
// SetReturnError makes Recv fail with a transport-style error when b is true.
func (f *fakeBuilder) SetReturnError(b bool) {
f.returnError = b
}
// SetReturnErrorMsg makes Recv succeed but carry an error message in the
// response payload when b is true.
func (f *fakeBuilder) SetReturnErrorMsg(b bool) {
f.returnErrorMsg = b
}
// SetWait makes Recv block (simulating a stuck builder) while b is true.
func (f *fakeBuilder) SetWait(b bool) {
f.wait = b
}
// CancelContext cancels the fake builder's context when b is true,
// simulating a builder disconnect. When b is false it installs a fresh
// cancellable context so the builder can be reused afterwards.
func (f *fakeBuilder) CancelContext(b bool) {
	if !b {
		// Re-arm with a fresh context.
		f.ctx, f.ctxCancel = context.WithCancel(context.Background())
		return
	}
	f.ctxCancel()
}
// Context returns the builder's cancellable context so the planner observes
// cancellation when CancelContext simulates a disconnect.
//
// Fix: the rendered text contained a stale `return context.Background()`
// ahead of `return f.ctx`, leaving the cancellable context unreachable;
// only the f.ctx return is kept.
func (f *fakeBuilder) Context() context.Context {
	return f.ctx
}
func (f *fakeBuilder) Send(req *protos.PlannerToBuilder) error {
if f.ctx.Err() != nil {
// Context was canceled
return f.ctx.Err()
}
task, err := protos.FromProtoTask(req.Task)
if err != nil {
return err
@ -458,12 +615,37 @@ func (f *fakeBuilder) Send(req *protos.PlannerToBuilder) error {
}
// Recv simulates a builder's response to the planner. Behavior is driven by
// the test knobs: returnError fails the call outright, wait blocks until
// cleared (a stuck builder), returnErrorMsg embeds an error in an otherwise
// successful response, and a canceled context surfaces as a receive error —
// matching a closed stream.
func (f *fakeBuilder) Recv() (*protos.BuilderToPlanner, error) {
	if f.returnError {
		return nil, fmt.Errorf("fake error from %s", f.id)
	}

	// Wait until `wait` is false. Poll at fine granularity and honor
	// context cancellation while waiting: the original 1s sleep per check
	// never observed cancellation mid-wait, so a "stuck" builder could not
	// be torn down when its stream was closed.
	for f.wait {
		if f.ctx.Err() != nil {
			return nil, f.ctx.Err()
		}
		time.Sleep(10 * time.Millisecond)
	}

	if f.ctx.Err() != nil {
		// Context was canceled
		return nil, f.ctx.Err()
	}

	var errMsg string
	if f.returnErrorMsg {
		errMsg = fmt.Sprintf("fake error from %s", f.id)
	}

	return &protos.BuilderToPlanner{
		BuilderID: f.id,
		Error:     errMsg,
	}, nil
}
// fakeLimits is a minimal Limits implementation for planner tests.
type fakeLimits struct {
// timeout is returned by BuilderResponseTimeout for every tenant.
timeout time.Duration
}
// BuilderResponseTimeout returns the configured fake timeout regardless of
// tenant. A zero value disables the planner's wait-for-response timeout.
func (f *fakeLimits) BuilderResponseTimeout(_ string) time.Duration {
return f.timeout
}
func (f *fakeLimits) BloomCreationEnabled(_ string) bool {

@ -206,9 +206,10 @@ type Limits struct {
BloomCompactorMaxBlockSize flagext.ByteSize `yaml:"bloom_compactor_max_block_size" json:"bloom_compactor_max_block_size" category:"experimental"`
BloomCompactorMaxBloomSize flagext.ByteSize `yaml:"bloom_compactor_max_bloom_size" json:"bloom_compactor_max_bloom_size" category:"experimental"`
BloomCreationEnabled bool `yaml:"bloom_creation_enabled" json:"bloom_creation_enabled" category:"experimental"`
BloomSplitSeriesKeyspaceBy int `yaml:"bloom_split_series_keyspace_by" json:"bloom_split_series_keyspace_by" category:"experimental"`
BloomBuildMaxBuilders int `yaml:"bloom_build_max_builders" json:"bloom_build_max_builders" category:"experimental"`
BloomCreationEnabled bool `yaml:"bloom_creation_enabled" json:"bloom_creation_enabled" category:"experimental"`
BloomSplitSeriesKeyspaceBy int `yaml:"bloom_split_series_keyspace_by" json:"bloom_split_series_keyspace_by" category:"experimental"`
BloomBuildMaxBuilders int `yaml:"bloom_build_max_builders" json:"bloom_build_max_builders" category:"experimental"`
BuilderResponseTimeout time.Duration `yaml:"bloom_build_builder_response_timeout" json:"bloom_build_builder_response_timeout" category:"experimental"`
BloomNGramLength int `yaml:"bloom_ngram_length" json:"bloom_ngram_length" category:"experimental"`
BloomNGramSkip int `yaml:"bloom_ngram_skip" json:"bloom_ngram_skip" category:"experimental"`
@ -389,6 +390,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
f.BoolVar(&l.BloomCreationEnabled, "bloom-build.enable", false, "Experimental. Whether to create blooms for the tenant.")
f.IntVar(&l.BloomSplitSeriesKeyspaceBy, "bloom-build.split-keyspace-by", 256, "Experimental. Number of splits to create for the series keyspace when building blooms. The series keyspace is split into this many parts to parallelize bloom creation.")
f.IntVar(&l.BloomBuildMaxBuilders, "bloom-build.max-builders", 0, "Experimental. Maximum number of builders to use when building blooms. 0 allows unlimited builders.")
f.DurationVar(&l.BuilderResponseTimeout, "bloom-build.builder-response-timeout", 0, "Experimental. Timeout for a builder to finish a task. If a builder does not respond within this time, it is considered failed and the task will be requeued. 0 disables the timeout.")
_ = l.BloomCompactorMaxBloomSize.Set(defaultBloomCompactorMaxBloomSize)
f.Var(&l.BloomCompactorMaxBloomSize, "bloom-compactor.max-bloom-size",
@ -999,6 +1001,10 @@ func (o *Overrides) BloomBuildMaxBuilders(userID string) int {
return o.getOverridesForUser(userID).BloomBuildMaxBuilders
}
// BuilderResponseTimeout returns the per-tenant timeout the bloom planner
// waits for a builder to report a task result before requeuing the task.
// Zero disables the timeout (see the bloom-build.builder-response-timeout flag).
func (o *Overrides) BuilderResponseTimeout(userID string) time.Duration {
return o.getOverridesForUser(userID).BuilderResponseTimeout
}
// BloomNGramLength returns the per-tenant length of the n-grams created when
// computing blooms from log lines.
func (o *Overrides) BloomNGramLength(userID string) int {
return o.getOverridesForUser(userID).BloomNGramLength
}

Loading…
Cancel
Save