rule manager: Moved metric registration to custom registerer which is already available. (#4961)

Signed-off-by: Bartek Plotka <bwplotka@gmail.com>
7 years ago · de213d4a5e
parent 806632790e
commit de213d4a5e
1 changed files with 134 additions and 94 deletions
--- a/rules/manager.go
+++ b/rules/manager.go
@ -53,55 +53,6 @@ const (
 const namespace = "prometheus"

 var (
-	evalDuration = prometheus.NewSummary(
-		prometheus.SummaryOpts{
-			Namespace: namespace,
-			Name:      "rule_evaluation_duration_seconds",
-			Help:      "The duration for a rule to execute.",
-		},
-	)
-	evalFailures = prometheus.NewCounter(
-		prometheus.CounterOpts{
-			Namespace: namespace,
-			Name:      "rule_evaluation_failures_total",
-			Help:      "The total number of rule evaluation failures.",
-		},
-	)
-	evalTotal = prometheus.NewCounter(
-		prometheus.CounterOpts{
-			Namespace: namespace,
-			Name:      "rule_evaluations_total",
-			Help:      "The total number of rule evaluations.",
-		},
-	)
-	iterationDuration = prometheus.NewSummary(prometheus.SummaryOpts{
-		Namespace:  namespace,
-		Name:       "rule_group_duration_seconds",
-		Help:       "The duration of rule group evaluations.",
-		Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001},
-	})
-	iterationsMissed = prometheus.NewCounter(prometheus.CounterOpts{
-		Namespace: namespace,
-		Name:      "rule_group_iterations_missed_total",
-		Help:      "The total number of rule group evaluations missed due to slow rule group evaluation.",
-	})
-	iterationsScheduled = prometheus.NewCounter(prometheus.CounterOpts{
-		Namespace: namespace,
-		Name:      "rule_group_iterations_total",
-		Help:      "The total number of scheduled rule group evaluations, whether executed or missed.",
-	})
-	lastEvaluation = prometheus.NewDesc(
-		prometheus.BuildFQName(namespace, "", "rule_group_last_evaluation_timestamp_seconds"),
-		"The timestamp of the last rule group evaluation in seconds.",
-		[]string{"rule_group"},
-		nil,
-	)
-	lastDuration = prometheus.NewDesc(
-		prometheus.BuildFQName(namespace, "", "rule_group_last_duration_seconds"),
-		"The duration of the last rule group evaluation.",
-		[]string{"rule_group"},
-		nil,
-	)
 	groupInterval = prometheus.NewDesc(
 		prometheus.BuildFQName(namespace, "", "rule_group_interval_seconds"),
 		"The interval of a rule group.",
@ -110,12 +61,85 @@ var (
 	)
 )

-func init() {
-	prometheus.MustRegister(iterationDuration)
-	prometheus.MustRegister(iterationsScheduled)
-	prometheus.MustRegister(iterationsMissed)
-	prometheus.MustRegister(evalFailures)
-	prometheus.MustRegister(evalDuration)
+type metrics struct {
+	evalDuration        prometheus.Summary
+	evalFailures        prometheus.Counter
+	evalTotal           prometheus.Counter
+	iterationDuration   prometheus.Summary
+	iterationsMissed    prometheus.Counter
+	iterationsScheduled prometheus.Counter
+	groupLastEvalTime   *prometheus.GaugeVec
+	groupLastDuration   *prometheus.GaugeVec
+}
+
+func newGroupMetrics(reg prometheus.Registerer) *metrics {
+	m := &metrics{
+		evalDuration: prometheus.NewSummary(
+			prometheus.SummaryOpts{
+				Namespace: namespace,
+				Name:      "rule_evaluation_duration_seconds",
+				Help:      "The duration for a rule to execute.",
+			}),
+		evalFailures: prometheus.NewCounter(
+			prometheus.CounterOpts{
+				Namespace: namespace,
+				Name:      "rule_evaluation_failures_total",
+				Help:      "The total number of rule evaluation failures.",
+			}),
+		evalTotal: prometheus.NewCounter(
+			prometheus.CounterOpts{
+				Namespace: namespace,
+				Name:      "rule_evaluations_total",
+				Help:      "The total number of rule evaluations.",
+			}),
+		iterationDuration: prometheus.NewSummary(prometheus.SummaryOpts{
+			Namespace:  namespace,
+			Name:       "rule_group_duration_seconds",
+			Help:       "The duration of rule group evaluations.",
+			Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001},
+		}),
+		iterationsMissed: prometheus.NewCounter(prometheus.CounterOpts{
+			Namespace: namespace,
+			Name:      "rule_group_iterations_missed_total",
+			Help:      "The total number of rule group evaluations missed due to slow rule group evaluation.",
+		}),
+		iterationsScheduled: prometheus.NewCounter(prometheus.CounterOpts{
+			Namespace: namespace,
+			Name:      "rule_group_iterations_total",
+			Help:      "The total number of scheduled rule group evaluations, whether executed or missed.",
+		}),
+		groupLastEvalTime: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: namespace,
+				Name:      "rule_group_last_evaluation_timestamp_seconds",
+				Help:      "The timestamp of the last rule group evaluation in seconds.",
+			},
+			[]string{"rule_group"},
+		),
+		groupLastDuration: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: namespace,
+				Name:      "rule_group_last_duration_seconds",
+				Help:      "The duration of the last rule group evaluation.",
+			},
+			[]string{"rule_group"},
+		),
+	}
+
+	if reg != nil {
+		reg.MustRegister(
+			m.evalDuration,
+			m.evalFailures,
+			m.evalTotal,
+			m.iterationDuration,
+			m.iterationsMissed,
+			m.iterationsScheduled,
+			m.groupLastEvalTime,
+			m.groupLastDuration,
+		)
+	}
+
+	return m
 }

 // QueryFunc processes PromQL queries.
@ -165,8 +189,12 @@ type Rule interface {
 	// Health returns the current health of the rule.
 	Health() RuleHealth
 	SetEvaluationDuration(time.Duration)
+	// GetEvaluationDuration returns last evaluation duration.
+	// NOTE: Used dynamically by rules.html template.
 	GetEvaluationDuration() time.Duration
 	SetEvaluationTimestamp(time.Time)
+	// GetEvaluationTimestamp returns last evaluation timestamp.
+	// NOTE: Used dynamically by rules.html template.
 	GetEvaluationTimestamp() time.Time
 	// HTMLSnippet returns a human-readable string representation of the rule,
 	// decorated with HTML elements for use the web frontend.
@ -191,10 +219,20 @@ type Group struct {
 	terminated chan struct{}

 	logger log.Logger
+
+	metrics *metrics
 }

 // NewGroup makes a new Group with the given name, options, and rules.
 func NewGroup(name, file string, interval time.Duration, rules []Rule, shouldRestore bool, opts *ManagerOptions) *Group {
+	metrics := opts.metrics
+	if metrics == nil {
+		metrics = newGroupMetrics(opts.Registerer)
+	}
+
+	metrics.groupLastEvalTime.WithLabelValues(groupKey(file, name))
+	metrics.groupLastDuration.WithLabelValues(groupKey(file, name))
+
 	return &Group{
 		name:                 name,
 		file:                 file,
@ -206,6 +244,7 @@ func NewGroup(name, file string, interval time.Duration, rules []Rule, shouldRes
 		done:                 make(chan struct{}),
 		terminated:           make(chan struct{}),
 		logger:               log.With(opts.Logger, "group", name),
+		metrics:              metrics,
 	}
 }

@ -233,15 +272,15 @@ func (g *Group) run(ctx context.Context) {
 	}

 	iter := func() {
-		iterationsScheduled.Inc()
+		g.metrics.iterationsScheduled.Inc()

 		start := time.Now()
 		g.Eval(ctx, evalTimestamp)
 		timeSinceStart := time.Since(start)

-		iterationDuration.Observe(timeSinceStart.Seconds())
-		g.SetEvaluationDuration(timeSinceStart)
-		g.SetEvaluationTimestamp(start)
+		g.metrics.iterationDuration.Observe(timeSinceStart.Seconds())
+		g.setEvaluationDuration(timeSinceStart)
+		g.setEvaluationTimestamp(start)
 	}

 	// The assumption here is that since the ticker was started after having
@ -262,8 +301,8 @@ func (g *Group) run(ctx context.Context) {
 		case <-tick.C:
 			missed := (time.Since(evalTimestamp) / g.interval) - 1
 			if missed > 0 {
-				iterationsMissed.Add(float64(missed))
-				iterationsScheduled.Add(float64(missed))
+				g.metrics.iterationsMissed.Add(float64(missed))
+				g.metrics.iterationsScheduled.Add(float64(missed))
 			}
 			evalTimestamp = evalTimestamp.Add((missed + 1) * g.interval)
 			iter()
@ -284,8 +323,8 @@ func (g *Group) run(ctx context.Context) {
 			case <-tick.C:
 				missed := (time.Since(evalTimestamp) / g.interval) - 1
 				if missed > 0 {
-					iterationsMissed.Add(float64(missed))
-					iterationsScheduled.Add(float64(missed))
+					g.metrics.iterationsMissed.Add(float64(missed))
+					g.metrics.iterationsScheduled.Add(float64(missed))
 				}
 				evalTimestamp = evalTimestamp.Add((missed + 1) * g.interval)
 				iter()
@ -314,25 +353,29 @@ func (g *Group) GetEvaluationDuration() time.Duration {
 	return g.evaluationDuration
 }

-// SetEvaluationDuration sets the time in seconds the last evaluation took.
-func (g *Group) SetEvaluationDuration(dur time.Duration) {
+// setEvaluationDuration sets the time in seconds the last evaluation took.
+func (g *Group) setEvaluationDuration(dur time.Duration) {
+	g.metrics.groupLastDuration.WithLabelValues(groupKey(g.file, g.name)).Set(float64(dur))
+
 	g.mtx.Lock()
 	defer g.mtx.Unlock()
 	g.evaluationDuration = dur
 }

-// SetEvaluationTimestamp updates evaluationTimestamp to the timestamp of when the rule group was last evaluated.
-func (g *Group) SetEvaluationTimestamp(ts time.Time) {
+// GetEvaluationTimestamp returns the time the last evaluation of the rule group took place.
+func (g *Group) GetEvaluationTimestamp() time.Time {
 	g.mtx.Lock()
 	defer g.mtx.Unlock()
-	g.evaluationTimestamp = ts
+	return g.evaluationTimestamp
 }

-// GetEvaluationTimestamp returns the time the last evaluation of the rule group took place.
-func (g *Group) GetEvaluationTimestamp() time.Time {
+// setEvaluationTimestamp updates evaluationTimestamp to the timestamp of when the rule group was last evaluated.
+func (g *Group) setEvaluationTimestamp(ts time.Time) {
+	g.metrics.groupLastEvalTime.WithLabelValues(groupKey(g.file, g.name)).Set(float64(ts.Second()))
+
 	g.mtx.Lock()
 	defer g.mtx.Unlock()
-	return g.evaluationTimestamp
+	g.evaluationTimestamp = ts
 }

 // evalTimestamp returns the immediately preceding consistently slotted evaluation time.
@ -399,12 +442,14 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
 			sp.SetTag("name", rule.Name())
 			defer func(t time.Time) {
 				sp.Finish()
-				evalDuration.Observe(time.Since(t).Seconds())
-				rule.SetEvaluationDuration(time.Since(t))
+
+				since := time.Since(t)
+				g.metrics.evalDuration.Observe(since.Seconds())
+				rule.SetEvaluationDuration(since)
 				rule.SetEvaluationTimestamp(t)
 			}(time.Now())

-			evalTotal.Inc()
+			g.metrics.evalTotal.Inc()

 			vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL)
 			if err != nil {
@ -413,7 +458,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
 				if _, ok := err.(promql.ErrQueryCanceled); !ok {
 					level.Warn(g.logger).Log("msg", "Evaluating rule failed", "rule", rule, "err", err)
 				}
-				evalFailures.Inc()
+				g.metrics.evalFailures.Inc()
 				return
 			}

@ -490,7 +535,11 @@ func (g *Group) RestoreForState(ts time.Time) {
 		level.Error(g.logger).Log("msg", "Failed to get Querier", "err", err)
 		return
 	}
-	defer q.Close()
+	defer func() {
+		if err := q.Close(); err != nil {
+			level.Error(g.logger).Log("msg", "Failed to close Querier", "err", err)
+		}
+	}()

 	for _, rule := range g.Rules() {
 		alertRule, ok := rule.(*AlertingRule)
@ -634,20 +683,29 @@ type ManagerOptions struct {
 	OutageTolerance time.Duration
 	ForGracePeriod  time.Duration
 	ResendDelay     time.Duration
+
+	metrics *metrics
 }

 // NewManager returns an implementation of Manager, ready to be started
 // by calling the Run method.
 func NewManager(o *ManagerOptions) *Manager {
+	if o.metrics == nil {
+		o.metrics = newGroupMetrics(o.Registerer)
+	}
+
 	m := &Manager{
 		groups: map[string]*Group{},
 		opts:   o,
 		block:  make(chan struct{}),
 		logger: o.Logger,
 	}
+
 	if o.Registerer != nil {
 		o.Registerer.MustRegister(m)
 	}
+
+	o.metrics.iterationsMissed.Inc()
 	return m
 }

@ -825,29 +883,11 @@ func (m *Manager) AlertingRules() []*AlertingRule {

 // Describe implements prometheus.Collector.
 func (m *Manager) Describe(ch chan<- *prometheus.Desc) {
-	ch <- lastEvaluation
-	ch <- lastDuration
 	ch <- groupInterval
 }

 // Collect implements prometheus.Collector.
 func (m *Manager) Collect(ch chan<- prometheus.Metric) {
-	for _, g := range m.RuleGroups() {
-		lastEvaluationTime := g.GetEvaluationTimestamp()
-		lastEvaluationTimestamp := math.Inf(-1)
-		if !lastEvaluationTime.IsZero() {
-			lastEvaluationTimestamp = float64(lastEvaluationTime.UnixNano()) / 1e9
-		}
-		key := groupKey(g.file, g.name)
-		ch <- prometheus.MustNewConstMetric(lastEvaluation,
-			prometheus.GaugeValue,
-			lastEvaluationTimestamp,
-			key)
-		ch <- prometheus.MustNewConstMetric(lastDuration,
-			prometheus.GaugeValue,
-			g.GetEvaluationDuration().Seconds(),
-			key)
-	}
 	for _, g := range m.RuleGroups() {
 		ch <- prometheus.MustNewConstMetric(groupInterval,
 			prometheus.GaugeValue,