The open and composable observability and data visualization platform. Visualize metrics, logs, and traces from multiple sources like Prometheus, Loki, Elasticsearch, InfluxDB, Postgres and many more.
grafana/pkg/services/alerting/scheduler.go

package alerting

import (
	"fmt"
	"time"

	"github.com/Unknwon/log"

	"github.com/grafana/grafana/pkg/services/alerting/alertstates"
)
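
// Scheduler holds the current set of alert jobs keyed by rule id, plus the
// buffered queues used to hand jobs to executors and collect their results.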
type Scheduler struct {
	jobs          map[int64]*AlertJob
	runQueue      chan *AlertJob
	responseQueue chan *AlertResult
}
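
// NewScheduler returns a Scheduler with an empty job map and buffered
// run/response queues (1000 entries each).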
func NewScheduler() *Scheduler {
	return &Scheduler{
		jobs:          make(map[int64]*AlertJob),
		runQueue:      make(chan *AlertJob, 1000),
		responseQueue: make(chan *AlertResult, 1000),
	}
}
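
// dispatch runs the scheduling loop: every second it queues any jobs that are
// due, and every ten seconds it re-fetches the alert rules from reader.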
func (scheduler *Scheduler) dispatch(reader RuleReader) {
	reschedule := time.NewTicker(time.Second * 10)
	secondTicker := time.NewTicker(time.Second)

	scheduler.updateJobs(reader.Fetch)

	for {
		select {
		case <-secondTicker.C:
			scheduler.queueJobs()
		case <-reschedule.C:
			scheduler.updateJobs(reader.Fetch)
		}
	}
}
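
// updateJobs rebuilds the job map from the current rule set, reusing existing
// jobs where possible so in-flight state (Running, RetryCount) is preserved.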
func (scheduler *Scheduler) updateJobs(alertRuleFn func() []AlertRule) {
	log.Debug("Scheduler: UpdateJobs()")

	jobs := make(map[int64]*AlertJob)
	rules := alertRuleFn()

	for i, rule := range rules {
		var job *AlertJob
		if scheduler.jobs[rule.Id] != nil {
			job = scheduler.jobs[rule.Id]
		} else {
			job = &AlertJob{
				Running:    false,
				RetryCount: 0,
			}
		}

		job.Rule = rule
		job.Offset = int64(i)

		jobs[rule.Id] = job
	}

	log.Debug("Scheduler: Selected %d jobs", len(jobs))
	scheduler.jobs = jobs
}
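
// queueJobs puts every job whose frequency divides the current Unix time onto
// the run queue, skipping jobs that are already running.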
func (scheduler *Scheduler) queueJobs() {
	now := time.Now().Unix()

	for _, job := range scheduler.jobs {
		if now%job.Rule.Frequency == 0 && !job.Running {
			log.Info("Scheduler: Putting job on to run queue: %s", job.Rule.Title)
			scheduler.runQueue <- job
		}
	}
}
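
// executor consumes jobs from the run queue, marks them as running, and
// executes them while measuring how long each execution takes.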
func (scheduler *Scheduler) executor(executor Executor) {
	for job := range scheduler.runQueue {
		//log.Info("Executor: queue length %d", len(scheduler.runQueue))
		log.Info("Executor: executing %s", job.Rule.Title)
		job.Running = true
		scheduler.measureAndExecute(executor, job)
	}
}
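
// handleResponses drains the response queue, re-queuing incomplete results up
// to maxRetries times before saving the alert as critical.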
func (scheduler *Scheduler) handleResponses() {
	for response := range scheduler.responseQueue {
		log.Info("Response: alert(%d) status(%s) actual(%v) retry(%d)", response.Id, response.State, response.ActualValue, response.AlertJob.RetryCount)
		response.AlertJob.Running = false

		if response.IsResultIncomplete() {
			response.AlertJob.RetryCount++
			if response.AlertJob.RetryCount < maxRetries {
				scheduler.runQueue <- response.AlertJob
			} else {
				saveState(&AlertResult{
					Id:          response.Id,
					State:       alertstates.Critical,
					Description: fmt.Sprintf("Failed to run check after %d retries", maxRetries),
				})
			}
		} else {
			response.AlertJob.RetryCount = 0
			saveState(response)
		}
	}
}
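
// measureAndExecute runs the job with a five second timeout: on timeout it
// reports a pending result, otherwise it forwards the executor's result with
// the measured duration in milliseconds.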
func (scheduler *Scheduler) measureAndExecute(exec Executor, job *AlertJob) {
	now := time.Now()

	responseChan := make(chan *AlertResult, 1)
	go exec.Execute(job, responseChan)

	select {
	case <-time.After(time.Second * 5):
		scheduler.responseQueue <- &AlertResult{
			Id:       job.Rule.Id,
			State:    alertstates.Pending,
			Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000),
			AlertJob: job,
		}
	case result := <-responseChan:
		result.Duration = float64(time.Since(now).Nanoseconds()) / float64(1000000)
		log.Info("Scheduler: execution took %vms", result.Duration)
		scheduler.responseQueue <- result
	}
}
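
For reference, a minimal sketch of how a caller might wire these loops together. The ruleReader and alertExecutor values below are hypothetical stand-ins for whatever implements the RuleReader and Executor interfaces; this wiring is assumed, not part of this file. Since dispatch, executor, and handleResponses each block forever, each would need its own goroutine:

	scheduler := NewScheduler()
	go scheduler.dispatch(ruleReader)     // ruleReader: assumed RuleReader implementation
	go scheduler.executor(alertExecutor)  // alertExecutor: assumed Executor implementation
	go scheduler.handleResponses()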