refactor(blooms): Add metrics for per-tenant tasks progress to planner (#13078)

pull/13270/head
Salva Corts 11 months ago committed by GitHub
parent 1d6f8d51fc
commit 9289493b6e
  1. pkg/bloombuild/planner/metrics.go (16 changed lines)
  2. pkg/bloombuild/planner/planner.go (26 changed lines)

@@ -35,7 +35,9 @@ type Metrics struct {
	blocksDeleted prometheus.Counter
	metasDeleted  prometheus.Counter

	tenantsDiscovered    prometheus.Counter
	tenantTasksPlanned   *prometheus.GaugeVec
	tenantTasksCompleted *prometheus.GaugeVec
}
func NewMetrics(
@@ -129,6 +131,18 @@ func NewMetrics(
			Name: "tenants_discovered_total",
			Help: "Number of tenants discovered during the current build iteration",
		}),
		tenantTasksPlanned: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsSubsystem,
			Name:      "tenant_tasks_planned",
			Help:      "Number of tasks planned for a tenant during the current build iteration.",
		}, []string{"tenant"}),
		tenantTasksCompleted: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsSubsystem,
			Name:      "tenant_tasks_completed",
			Help:      "Number of tasks completed for a tenant during the current build iteration.",
		}, []string{"tenant"}),
	}
}
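The two new gauges are keyed by a tenant label, so every tenant gets its own planned/completed pair that is rebuilt on each build iteration. The standalone sketch below is not planner code (the registry, tenant name, and task counts are invented for illustration); it only walks through the lifecycle the rest of this diff implements: reset when the tenant's work is loaded, Add when its tasks are enqueued, Inc as builders finish them.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewRegistry()
	planned := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "tenant_tasks_planned",
		Help: "Number of tasks planned for a tenant during the current build iteration.",
	}, []string{"tenant"})
	completed := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "tenant_tasks_completed",
		Help: "Number of tasks completed for a tenant during the current build iteration.",
	}, []string{"tenant"})
	reg.MustRegister(planned, completed)

	// Start of an iteration: reset both series for the tenant.
	planned.WithLabelValues("tenant-a").Set(0)
	completed.WithLabelValues("tenant-a").Set(0)

	// Planning: four tasks get enqueued for the tenant.
	planned.WithLabelValues("tenant-a").Add(4)

	// Builders report back: every finished task bumps the completed gauge.
	completed.WithLabelValues("tenant-a").Inc()
	completed.WithLabelValues("tenant-a").Inc()

	fmt.Println(testutil.ToFloat64(planned.WithLabelValues("tenant-a")))   // 4
	fmt.Println(testutil.ToFloat64(completed.WithLabelValues("tenant-a"))) // 2
}

On the planner itself the series additionally carry the metricsNamespace and metricsSubsystem prefixes set in NewMetrics, so the exported names are longer than the bare tenant_tasks_planned / tenant_tasks_completed used in this sketch.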

@@ -122,6 +122,8 @@ func (p *Planner) stopping(_ error) error {
}

func (p *Planner) running(ctx context.Context) error {
	go p.trackInflightRequests(ctx)

	// run once at beginning
	if err := p.runOne(ctx); err != nil {
		level.Error(p.logger).Log("msg", "bloom build iteration failed for the first time", "err", err)
@@ -130,9 +132,6 @@ func (p *Planner) running(ctx context.Context) error {
	planningTicker := time.NewTicker(p.cfg.PlanningInterval)
	defer planningTicker.Stop()

	inflightTasksTicker := time.NewTicker(250 * time.Millisecond)
	defer inflightTasksTicker.Stop()

	for {
		select {
		case <-ctx.Done():
@@ -149,6 +148,19 @@ func (p *Planner) running(ctx context.Context) error {
			if err := p.runOne(ctx); err != nil {
				level.Error(p.logger).Log("msg", "bloom build iteration failed", "err", err)
			}
		}
	}
}

func (p *Planner) trackInflightRequests(ctx context.Context) {
	inflightTasksTicker := time.NewTicker(250 * time.Millisecond)
	defer inflightTasksTicker.Stop()

	for {
		select {
		case <-ctx.Done():
			// We just return. Error handling and logging is done in the main loop (running method).
			return

		case <-inflightTasksTicker.C:
			inflight := p.totalPendingTasks()
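The inflight-tasks polling that previously shared the planning select loop now runs in its own goroutine, started with go p.trackInflightRequests(ctx) at the top of running; since runOne executes synchronously inside that select, moving the 250ms sampling out means a long build iteration no longer pauses the inflight metric. Below is a minimal sketch of the same pattern; poller, queueLen, and pendingGauge are invented names, not the planner's types.

package main

import (
	"context"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// poller is a stand-in for the planner: it knows how to count pending work
// and owns a gauge to publish that count.
type poller struct {
	queueLen     func() int
	pendingGauge prometheus.Gauge
}

// trackInflight mirrors the shape of trackInflightRequests: sample every 250ms
// and return silently when the context is cancelled, leaving error handling
// and logging to the main loop.
func (p *poller) trackInflight(ctx context.Context) {
	ticker := time.NewTicker(250 * time.Millisecond)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			p.pendingGauge.Set(float64(p.queueLen()))
		}
	}
}

func main() {
	p := &poller{
		queueLen:     func() int { return 0 },
		pendingGauge: prometheus.NewGauge(prometheus.GaugeOpts{Name: "pending_tasks"}),
	}

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	go p.trackInflight(ctx) // runs alongside the caller's main loop, like the planner does
	<-ctx.Done()
}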
@@ -223,6 +235,7 @@ func (p *Planner) runOne(ctx context.Context) error {
			tenantTableEnqueuedTasks++
		}

		p.metrics.tenantTasksPlanned.WithLabelValues(tt.tenant).Add(float64(tenantTableEnqueuedTasks))
		tasksResultForTenantTable[tt] = tenantTableTaskResults{
			tasksToWait:   tenantTableEnqueuedTasks,
			originalMetas: existingMetas,
@@ -489,6 +502,12 @@ func (p *Planner) loadTenantWork(
			tenantTableWork[table][tenant] = bounds

			// Reset progress tracking metrics for this tenant
			// NOTE(salvacorts): We will reset them multiple times for the same tenant, for each table, but it's not a big deal.
			//                   Alternatively, we can use a Counter instead of a Gauge, but I think a Gauge is easier to reason about.
			p.metrics.tenantTasksPlanned.WithLabelValues(tenant).Set(0)
			p.metrics.tenantTasksCompleted.WithLabelValues(tenant).Set(0)

			level.Debug(p.logger).Log("msg", "loading work for tenant", "table", table, "tenant", tenant, "splitFactor", splitFactor)
		}

		if err := tenants.Err(); err != nil {
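Because loadTenantWork walks tables first and tenants second, the Set(0) calls above run once per (table, tenant) pair, exactly as the NOTE says. The hypothetical helper below (not planner code; names and counts are made up) shows why that repetition is harmless: resetting the same child gauge to zero is idempotent, and the Add calls made later in runOne still accumulate the tenant's planned tasks across all of its tables.

package plannersketch

import "github.com/prometheus/client_golang/prometheus"

// resetThenPlan illustrates the two phases from this diff: repeated resets
// while tenant work is loaded, then per-table accumulation of planned tasks.
func resetThenPlan(planned, completed *prometheus.GaugeVec, tables, tenants []string, tasksPerTable int) {
	// Phase 1 (loadTenantWork): the reset runs once per table for every tenant;
	// setting the same series to zero repeatedly changes nothing.
	for range tables {
		for _, tenant := range tenants {
			planned.WithLabelValues(tenant).Set(0)
			completed.WithLabelValues(tenant).Set(0)
		}
	}

	// Phase 2 (runOne): each tenant table adds the tasks it enqueued, so the
	// gauge ends up holding the tenant's total for the whole iteration.
	for _, tenant := range tenants {
		for range tables {
			planned.WithLabelValues(tenant).Add(float64(tasksPerTable))
		}
	}
}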
@@ -804,6 +823,7 @@ func (p *Planner) BuilderLoop(builder protos.PlannerForBuilder_BuilderLoopServer
			"retries", task.timesEnqueued.Load(),
		)
		p.removePendingTask(task)
		p.metrics.tenantTasksCompleted.WithLabelValues(task.Tenant).Inc()

		// Send the result back to the task. The channel is buffered, so this should not block.
		task.resultsChannel <- result
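The completed gauge is incremented here, right after the planner stops tracking the task with removePendingTask, which makes it directly comparable to tenant_tasks_planned for the same iteration. The helper below is purely illustrative (progressFor is not part of the planner; it reads the gauges back with the client library's testutil package, the way a unit test might).

package plannersketch

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// progressFor returns the fraction of a tenant's planned tasks that have
// completed, guarding against the start-of-iteration case where nothing has
// been planned yet.
func progressFor(planned, completed *prometheus.GaugeVec, tenant string) float64 {
	total := testutil.ToFloat64(planned.WithLabelValues(tenant))
	if total == 0 {
		return 0
	}
	return testutil.ToFloat64(completed.WithLabelValues(tenant)) / total
}

The same ratio can be built on a dashboard by dividing the exported completed series by the planned series for a given tenant label.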
