package scheduler

import (
	"context"
	"errors"
	"flag"
	"fmt"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"

	"github.com/grafana/loki/v3/pkg/blockbuilder/types"
)

const (
	DefaultPriority              = 0 // TODO(owen-d): better determine priority when unknown
	defaultCompletedJobsCapacity = 100
)

type jobQueueMetrics struct {
	pending    prometheus.Gauge
	inProgress prometheus.Gauge
	completed  *prometheus.CounterVec
}

func newJobQueueMetrics(r prometheus.Registerer) *jobQueueMetrics {
	return &jobQueueMetrics{
		pending: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Name: "loki_block_scheduler_pending_jobs",
			Help: "Number of jobs in the block scheduler queue",
		}),
		inProgress: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Name: "loki_block_scheduler_in_progress_jobs",
			Help: "Number of jobs currently being processed",
		}),
		completed: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Name: "loki_block_scheduler_completed_jobs_total",
			Help: "Total number of jobs completed by the block scheduler",
		}, []string{"status"}),
	}
}

// JobWithMetadata wraps a job with additional metadata for tracking its lifecycle
type JobWithMetadata struct {
	*types.Job

	Priority   int
	Status     types.JobStatus
	StartTime  time.Time
	UpdateTime time.Time
}

// NewJobWithMetadata creates a new JobWithMetadata instance
func NewJobWithMetadata(job *types.Job, priority int) *JobWithMetadata {
	return &JobWithMetadata{
		Job:        job,
		Priority:   priority,
		Status:     types.JobStatusPending,
		UpdateTime: time.Now(),
	}
}

// JobQueueConfig configures job lease behavior: how often expired leases are
// checked for, and how long a lease may go without updates before it expires.
type JobQueueConfig struct {
	LeaseExpiryCheckInterval time.Duration `yaml:"lease_expiry_check_interval"`
	LeaseDuration            time.Duration `yaml:"lease_duration"`
}

func (cfg *JobQueueConfig) RegisterFlags(f *flag.FlagSet) {
	f.DurationVar(&cfg.LeaseExpiryCheckInterval, "jobqueue.lease-expiry-check-interval", 1*time.Minute, "Interval to check for expired job leases")
	f.DurationVar(&cfg.LeaseDuration, "jobqueue.lease-duration", 10*time.Minute, "Duration after which a job lease is considered expired if the scheduler receives no updates from builders about the job. Expired jobs are re-enqueued")
}
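// exampleConfig is an illustrative sketch, not part of the original file:
// it shows how JobQueueConfig flags could be registered and parsed on a
// standalone FlagSet, e.g. in a test or a small wiring harness. The function
// name and the FlagSet wiring are assumptions for demonstration only.
func exampleConfig(args []string) (JobQueueConfig, error) {
	var cfg JobQueueConfig
	fs := flag.NewFlagSet("scheduler", flag.ContinueOnError)
	cfg.RegisterFlags(fs) // installs -jobqueue.lease-expiry-check-interval and -jobqueue.lease-duration
	if err := fs.Parse(args); err != nil {
		return JobQueueConfig{}, err
	}
	return cfg, nil
}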
// JobQueue is a thread-safe implementation of a job queue with state tracking
type JobQueue struct {
	cfg JobQueueConfig

	mu         sync.RWMutex
	pending    *PriorityQueue[string, *JobWithMetadata] // Jobs waiting to be processed, ordered by priority
	inProgress map[string]*JobWithMetadata              // Jobs currently being processed
	completed  *CircularBuffer[*JobWithMetadata]        // Last N completed jobs
	statusMap  map[string]types.JobStatus               // Maps job ID to its current status

	logger  log.Logger
	metrics *jobQueueMetrics
}

// NewJobQueue creates a new JobQueue instance
func NewJobQueue(cfg JobQueueConfig, logger log.Logger, reg prometheus.Registerer) *JobQueue {
	return &JobQueue{
		cfg:        cfg,
		pending:    NewPriorityQueue(priorityComparator, jobIDExtractor),
		inProgress: make(map[string]*JobWithMetadata),
		completed:  NewCircularBuffer[*JobWithMetadata](defaultCompletedJobsCapacity),
		statusMap:  make(map[string]types.JobStatus),
		logger:     logger,
		metrics:    newJobQueueMetrics(reg),
	}
}

// RunLeaseExpiryChecker periodically checks for expired job leases and requeues them
func (q *JobQueue) RunLeaseExpiryChecker(ctx context.Context) {
	ticker := time.NewTicker(q.cfg.LeaseExpiryCheckInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			level.Debug(q.logger).Log("msg", "checking for expired job leases")
			if err := q.requeueExpiredJobs(); err != nil {
				level.Error(q.logger).Log("msg", "failed to requeue expired jobs", "err", err)
			}
		case <-ctx.Done():
			return
		}
	}
}

// requeueExpiredJobs checks for jobs that have exceeded their lease duration and requeues them
func (q *JobQueue) requeueExpiredJobs() error {
	// First collect expired jobs while holding the lock
	q.mu.Lock()
	var expiredJobs []*JobWithMetadata
	for id, job := range q.inProgress {
		if time.Since(job.UpdateTime) > q.cfg.LeaseDuration {
			level.Warn(q.logger).Log("msg", "job lease expired, will requeue", "job", id, "update_time", job.UpdateTime, "now", time.Now())
			expiredJobs = append(expiredJobs, job)
		}
	}
	q.mu.Unlock()

	// Then requeue them without holding the lock
	var multiErr error
	for _, job := range expiredJobs {
		// First try to transition from in-progress to expired
		ok, err := q.TransitionState(job.ID(), types.JobStatusInProgress, types.JobStatusExpired)
		if err != nil {
			level.Error(q.logger).Log("msg", "failed to mark job as expired", "job", job.ID(), "err", err)
			multiErr = errors.Join(multiErr, fmt.Errorf("failed to mark job %s as expired: %w", job.ID(), err))
			continue
		}
		if !ok {
			// Job is no longer in progress, someone else must have handled it
			level.Debug(q.logger).Log("msg", "job no longer in progress, skipping expiry", "job", job.ID())
			continue
		}

		// Then re-enqueue it
		_, _, err = q.TransitionAny(job.ID(), types.JobStatusPending, func() (*JobWithMetadata, error) {
			return NewJobWithMetadata(job.Job, job.Priority), nil
		})
		if err != nil {
			level.Error(q.logger).Log("msg", "failed to requeue expired job", "job", job.ID(), "err", err)
			multiErr = errors.Join(multiErr, fmt.Errorf("failed to requeue expired job %s: %w", job.ID(), err))
		}
	}

	return multiErr
}

// priorityComparator compares two jobs by priority (higher priority first)
func priorityComparator(a, b *JobWithMetadata) bool {
	return a.Priority > b.Priority
}

// jobIDExtractor extracts the job ID from a JobWithMetadata
func jobIDExtractor(j *JobWithMetadata) string {
	return j.ID()
}
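// exampleStartQueue is an illustrative sketch, not part of the original file:
// it shows how a caller might construct a JobQueue and run the lease-expiry
// checker in the background. The function name, the config values, and the
// nop logger are assumptions for demonstration; the checker goroutine stops
// when the passed context is cancelled.
func exampleStartQueue(ctx context.Context, reg prometheus.Registerer) *JobQueue {
	cfg := JobQueueConfig{
		LeaseExpiryCheckInterval: time.Minute,
		LeaseDuration:            10 * time.Minute,
	}
	q := NewJobQueue(cfg, log.NewNopLogger(), reg)
	go q.RunLeaseExpiryChecker(ctx) // requeues jobs whose builders stopped reporting progress
	return q
}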
// TransitionState attempts to transition a job from one specific state to another
func (q *JobQueue) TransitionState(jobID string, from, to types.JobStatus) (bool, error) {
	q.mu.Lock()
	defer q.mu.Unlock()

	currentStatus, exists := q.statusMap[jobID]
	if !exists {
		return false, fmt.Errorf("job %s not found", jobID)
	}
	if currentStatus != from {
		return false, fmt.Errorf("job %s is in state %s, not %s", jobID, currentStatus, from)
	}
	return q.transitionLockLess(jobID, to)
}

// TransitionAny transitions a job from any state to the specified state
func (q *JobQueue) TransitionAny(jobID string, to types.JobStatus, createFn func() (*JobWithMetadata, error)) (prevStatus types.JobStatus, found bool, err error) {
	q.mu.Lock()
	defer q.mu.Unlock()

	currentStatus, exists := q.statusMap[jobID]
	// If the job isn't found or has already finished, create a new job
	if finished := currentStatus.IsFinished(); !exists || finished {
		// exception:
		// we're just moving one finished type to another; no need to re-enqueue
		if finished && to.IsFinished() {
			q.statusMap[jobID] = to
			if j, found := q.completed.Lookup(
				func(jwm *JobWithMetadata) bool {
					return jwm.ID() == jobID
				},
			); found {
				j.Status = to
				j.UpdateTime = time.Now()
			}
			return currentStatus, true, nil
		}

		if createFn == nil {
			return types.JobStatusUnknown, false, fmt.Errorf("job %s not found and no creation function provided", jobID)
		}

		if finished {
			level.Debug(q.logger).Log("msg", "creating a copy of already-completed job", "id", jobID, "from", currentStatus, "to", to)
		}

		job, err := createFn()
		if err != nil {
			return types.JobStatusUnknown, false, fmt.Errorf("failed to create job %s: %w", jobID, err)
		}

		// temporarily mark as pending so we can transition it to the target state
		q.statusMap[jobID] = types.JobStatusPending
		q.pending.Push(job)
		q.metrics.pending.Inc()
		level.Debug(q.logger).Log("msg", "created new job", "id", jobID, "status", types.JobStatusPending)

		if _, err := q.transitionLockLess(jobID, to); err != nil {
			return types.JobStatusUnknown, false, err
		}
		return types.JobStatusUnknown, false, nil
	}

	_, err = q.transitionLockLess(jobID, to)
	return currentStatus, true, err
}

// transitionLockLess performs the actual state transition (must be called with lock held)
func (q *JobQueue) transitionLockLess(jobID string, to types.JobStatus) (bool, error) {
	from := q.statusMap[jobID]
	if from == to {
		return false, nil
	}

	var job *JobWithMetadata

	// Remove from current state
	switch from {
	case types.JobStatusPending:
		if j, exists := q.pending.Remove(jobID); exists {
			job = j
			q.metrics.pending.Dec()
		}
	case types.JobStatusInProgress:
		if j, exists := q.inProgress[jobID]; exists {
			job = j
			delete(q.inProgress, jobID)
			q.metrics.inProgress.Dec()
		}
	}

	if job == nil {
		return false, fmt.Errorf("job %s not found in its supposed state %s", jobID, from)
	}

	// Add to new state
	job.Status = to
	job.UpdateTime = time.Now()
	q.statusMap[jobID] = to

	switch to {
	case types.JobStatusPending:
		q.pending.Push(job)
		q.metrics.pending.Inc()
	case types.JobStatusInProgress:
		q.inProgress[jobID] = job
		q.metrics.inProgress.Inc()
		job.StartTime = job.UpdateTime
	case types.JobStatusComplete, types.JobStatusFailed, types.JobStatusExpired:
		q.completed.Push(job)
		q.metrics.completed.WithLabelValues(to.String()).Inc()
		delete(q.statusMap, jobID) // remove from status map so we don't grow indefinitely
	default:
		return false, fmt.Errorf("invalid target state: %s", to)
	}

	level.Debug(q.logger).Log("msg", "transitioned job state", "id", jobID, "from", from, "to", to)
	return true, nil
}

// Exists checks if a job exists and returns its current status
func (q *JobQueue) Exists(jobID string) (types.JobStatus, bool) {
	q.mu.RLock()
	defer q.mu.RUnlock()

	status, exists := q.statusMap[jobID]
	return status, exists
}
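// exampleEnqueue is an illustrative sketch, not part of the original file:
// it shows the idiomatic way to enqueue with this API. There is no dedicated
// Enqueue method, so callers use TransitionAny with a createFn that builds
// the JobWithMetadata when the job is not already tracked. The function name
// is an assumption for demonstration only.
func exampleEnqueue(q *JobQueue, job *types.Job, priority int) error {
	_, _, err := q.TransitionAny(job.ID(), types.JobStatusPending, func() (*JobWithMetadata, error) {
		return NewJobWithMetadata(job, priority), nil
	})
	return err
}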
// Dequeue removes and returns the highest priority pending job
func (q *JobQueue) Dequeue() (*types.Job, bool) {
	q.mu.Lock()
	defer q.mu.Unlock()

	job, ok := q.pending.Peek()
	if !ok {
		return nil, false
	}

	_, err := q.transitionLockLess(job.ID(), types.JobStatusInProgress)
	if err != nil {
		level.Error(q.logger).Log("msg", "failed to transition dequeued job to in progress", "id", job.ID(), "err", err)
		return nil, false
	}
	return job.Job, true
}

// ListPendingJobs returns a list of all pending jobs
func (q *JobQueue) ListPendingJobs() []JobWithMetadata {
	q.mu.RLock()
	defer q.mu.RUnlock()

	// return copies of the jobs since they can change after the lock is released
	jobs := make([]JobWithMetadata, 0, q.pending.Len())
	for _, j := range q.pending.List() {
		cpy := *j.Job
		jobs = append(jobs, JobWithMetadata{
			Job:        &cpy, // force copy
			Priority:   j.Priority,
			Status:     j.Status,
			StartTime:  j.StartTime,
			UpdateTime: j.UpdateTime,
		})
	}

	return jobs
}

// ListInProgressJobs returns a list of all in-progress jobs
func (q *JobQueue) ListInProgressJobs() []JobWithMetadata {
	q.mu.RLock()
	defer q.mu.RUnlock()

	// return copies of the jobs since they can change after the lock is released
	jobs := make([]JobWithMetadata, 0, len(q.inProgress))
	for _, j := range q.inProgress {
		cpy := *j.Job
		jobs = append(jobs, JobWithMetadata{
			Job:        &cpy, // force copy
			Priority:   j.Priority,
			Status:     j.Status,
			StartTime:  j.StartTime,
			UpdateTime: j.UpdateTime,
		})
	}
	return jobs
}

// ListCompletedJobs returns a list of completed jobs
func (q *JobQueue) ListCompletedJobs() []JobWithMetadata {
	q.mu.RLock()
	defer q.mu.RUnlock()

	jobs := make([]JobWithMetadata, 0, q.completed.Len())
	q.completed.Range(func(job *JobWithMetadata) bool {
		cpy := *job.Job
		jobs = append(jobs, JobWithMetadata{
			Job:        &cpy, // force copy
			Priority:   job.Priority,
			Status:     job.Status,
			StartTime:  job.StartTime,
			UpdateTime: job.UpdateTime,
		})
		return true
	})
	return jobs
}

// UpdatePriority updates the priority of a pending job. If the job is not pending,
// returns false to indicate the update was not performed.
func (q *JobQueue) UpdatePriority(id string, priority int) bool {
	q.mu.Lock()
	defer q.mu.Unlock()

	// Check if job is still pending
	if job, ok := q.pending.Lookup(id); ok {
		// nit: we're technically already updating the prio via reference,
		// but that's fine -- we may refactor this eventually to have 3 generic
		// types: (key, value, priority) where value implements a `Priority() T` method.
		job.Priority = priority
		return q.pending.UpdatePriority(id, job)
	}

	// Job is no longer pending (might be in progress, completed, etc)
	return false
}

// Ping updates the last-updated timestamp of a job and returns whether it was found.
// This is useful for keeping jobs alive and preventing lease expiry.
func (q *JobQueue) Ping(id string) bool {
	q.mu.Lock()
	defer q.mu.Unlock()

	if job, ok := q.inProgress[id]; ok {
		job.UpdateTime = time.Now()
		return true
	}
	return false
}
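// exampleWorkerLoop is an illustrative sketch, not part of the original file:
// it shows the consumer side of the queue in-process. A real builder reports
// progress to the scheduler remotely; here, exampleWorkerLoop and processJob
// are hypothetical names used only to demonstrate the Dequeue/Ping/
// TransitionState flow: dequeue a job, ping it periodically so its lease does
// not expire, then report the final status.
func exampleWorkerLoop(ctx context.Context, q *JobQueue, processJob func(*types.Job) error) {
	for ctx.Err() == nil {
		job, ok := q.Dequeue()
		if !ok {
			time.Sleep(time.Second) // nothing pending; poll again shortly
			continue
		}

		done := make(chan error, 1)
		go func() { done <- processJob(job) }()

		// Ping more often than cfg.LeaseDuration so the lease stays fresh.
		ticker := time.NewTicker(time.Minute)
		var err error
	wait:
		for {
			select {
			case err = <-done:
				break wait
			case <-ticker.C:
				q.Ping(job.ID()) // refresh the lease while work is in flight
			case <-ctx.Done():
				ticker.Stop()
				return
			}
		}
		ticker.Stop()

		status := types.JobStatusComplete
		if err != nil {
			status = types.JobStatusFailed
		}
		// TransitionState returns false if the job already left in-progress,
		// e.g. its lease expired and the checker requeued it.
		_, _ = q.TransitionState(job.ID(), types.JobStatusInProgress, status)
	}
}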