package ruler

import (
	"context"
	"errors"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/promql"
	"github.com/prometheus/prometheus/rules"
	"github.com/prometheus/prometheus/storage"

	"github.com/grafana/loki/pkg/querier/series"
	"github.com/grafana/loki/pkg/util"
)

const (
	AlertForStateMetricName = "ALERTS_FOR_STATE"
	statusSuccess           = "success"
	statusFailure           = "failure"
)
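
// ForStateMetric returns the synthetic ALERTS_FOR_STATE labelset for the given
// alert: the base labels with __name__ and alertname overridden.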
func ForStateMetric(base labels.Labels, alertName string) labels.Labels {
	b := labels.NewBuilder(base)
	b.Set(labels.MetricName, AlertForStateMetricName)
	b.Set(labels.AlertName, alertName)
	return b.Labels(nil)
}
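
// memstoreMetrics instruments the MemStore: restore evaluations by status, the
// number of in-memory samples, and cache hits on those samples.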
type memstoreMetrics struct {
	evaluations *prometheus.CounterVec
	samples     prometheus.Gauge       // in memory samples
	cacheHits   *prometheus.CounterVec // cache hits on in memory samples
}
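
// newMemstoreMetrics registers the MemStore metrics with the given registerer.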
func newMemstoreMetrics(r prometheus.Registerer) *memstoreMetrics {
	return &memstoreMetrics{
		evaluations: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Namespace: "loki",
			Name:      "ruler_memory_for_state_evaluations_total",
		}, []string{"status", "tenant"}),
		samples: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Namespace: "loki",
			Name:      "ruler_memory_samples",
		}),
		cacheHits: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Namespace: "loki",
			Name:      "ruler_memory_for_state_cache_hits_total",
		}, []string{"tenant"}),
	}
}
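
// RuleIter exposes the alerting rules tracked by a rule manager; it is the
// minimal interface the MemStore needs to look up rules and their hold durations.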
type RuleIter interface {
	AlertingRules() []*rules.AlertingRule
}
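
// MemStore is an in-memory, per-tenant store of synthetic ALERTS_FOR_STATE
// samples. It implements storage.Queryable so the rule manager can restore
// alert "for" state from it, generating samples on demand via queryFunc
// instead of reading them from a real TSDB.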
type MemStore struct {
	mtx       sync.Mutex
	userID    string
	queryFunc rules.QueryFunc
	metrics   *memstoreMetrics
	mgr       RuleIter
	logger    log.Logger
	rules     map[string]*RuleCache

	initiated       chan struct{}
	done            chan struct{}
	cleanupInterval time.Duration
}
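
// NewMemStore returns a MemStore for the given tenant. The store is inert (its
// Querier blocks) until Start is called with a RuleIter.
//
// A minimal wiring sketch, with hypothetical reg/queryFunc/ruleManager values
// (the real ruler constructs these elsewhere in the package):
//
//	store := NewMemStore("tenant-1", queryFunc, newMemstoreMetrics(reg), 5*time.Minute, logger)
//	store.Start(ruleManager) // ruleManager must implement RuleIter
//	defer store.Stop()
//	q, _ := store.Querier(ctx, mint, maxt) // used by the rule manager's RestoreForState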
func NewMemStore(userID string, queryFunc rules.QueryFunc, metrics *memstoreMetrics, cleanupInterval time.Duration, logger log.Logger) *MemStore {
	s := &MemStore{
		userID:          userID,
		metrics:         metrics,
		queryFunc:       queryFunc,
		logger:          log.With(logger, "subcomponent", "MemStore", "user", userID),
		cleanupInterval: cleanupInterval,
		rules:           make(map[string]*RuleCache),

		initiated: make(chan struct{}), // blocks execution until Start() is called
		done:      make(chan struct{}),
	}
	return s
}

// Calling Start will set the RuleIter, unblock the MemStore, and start the run() function in a separate goroutine.
func (m *MemStore) Start(iter RuleIter) {
	m.mgr = iter
	close(m.initiated)
	go m.run()
}
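
// Stop evicts every cached series, decrementing the in-memory sample gauge, and
// marks the store done. It is a no-op if Start was never called and is safe to
// call more than once.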
func (m *MemStore) Stop() {
	select {
	case <-m.initiated:
	default:
		// If initiated is blocked, the MemStore has yet to start: easy no-op.
		return
	}

	// Need to nil all series & decrement gauges
	m.mtx.Lock()
	defer m.mtx.Unlock()

	select {
	// ensures Stop() is idempotent
	case <-m.done:
		return
	default:
		for ruleKey, cache := range m.rules {
			// Force cleanup of all samples older than time.Now (all of them).
			_ = cache.CleanupOldSamples(time.Now())
			delete(m.rules, ruleKey)
		}
		close(m.done)
	}
}

// run periodically cleans up old series/samples to ensure memory consumption doesn't grow unbounded.
func (m *MemStore) run() {
	<-m.initiated
	t := time.NewTicker(m.cleanupInterval)
	for {
		select {
		case <-m.done:
			t.Stop()
			return
		case <-t.C:
			m.mtx.Lock()
			holdDurs := make(map[string]time.Duration)
			for _, rule := range m.mgr.AlertingRules() {
				holdDurs[rule.Name()] = rule.HoldDuration()
			}

			for ruleKey, cache := range m.rules {
				dur, ok := holdDurs[ruleKey]

				// rule is no longer being tracked, remove it
				if !ok {
					_ = cache.CleanupOldSamples(time.Now())
					delete(m.rules, ruleKey)
					continue
				}

				// trim older samples out of tracking bounds, doubled to buffer.
				if empty := cache.CleanupOldSamples(time.Now().Add(-2 * dur)); empty {
					delete(m.rules, ruleKey)
				}
			}

			m.mtx.Unlock()
		}
	}
}

// Querier implements storage.Queryable. It is only called with the desired ts as maxtime. Mint is
// parameterized via the outage tolerance, but since we're synthetically generating these,
// we only care about the desired time.
func (m *MemStore) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
	<-m.initiated
	return &memStoreQuerier{
		ts:       util.TimeFromMillis(maxt),
		MemStore: m,
		ctx:      ctx,
	}, nil
}
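
// memStoreQuerier is a storage.Querier pinned to a single timestamp (the maxt
// passed to Querier); its Select is resolved against the parent MemStore's
// per-rule caches.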
type memStoreQuerier struct {
	ts  time.Time
	ctx context.Context
	*MemStore
}

// Select implements storage.Querier but takes advantage of the fact that it's only called when restoring for state
// in order to look up & cache previous rule evaluations. This results in a sort of synthetic metric store.
func (m *memStoreQuerier) Select(sortSeries bool, params *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet {
	b := labels.NewBuilder(nil)
	var ruleKey string
	for _, matcher := range matchers {
		// Since Select is only called to restore the for state of an alert, we can deduce two things:
		// 1) The matchers will all be in the form {foo="bar"}. This means we can construct the cache entry from these matchers.
		// 2) The alertname label value can be used to discover the rule this query is associated with.
		b.Set(matcher.Name, matcher.Value)
		if matcher.Name == labels.AlertName && matcher.Type == labels.MatchEqual {
			ruleKey = matcher.Value
		}
	}
	ls := b.Labels(nil)
	if ruleKey == "" {
		level.Error(m.logger).Log("msg", "Select called in an unexpected fashion without alertname or ALERTS_FOR_STATE labels")
		return storage.NoopSeriesSet()
	}

	var rule *rules.AlertingRule

	// go fetch the rule via the alertname
	for _, x := range m.mgr.AlertingRules() {
		if x.Name() == ruleKey {
			rule = x
			break
		}
	}

	// should not happen
	if rule == nil {
		level.Error(m.logger).Log("msg", "failure trying to restore for state for untracked alerting rule", "name", ruleKey)
		return storage.NoopSeriesSet()
	}

	level.Debug(m.logger).Log("msg", "restoring for state via evaluation", "rule", ruleKey)

	m.mtx.Lock()
	defer m.mtx.Unlock()
	cache, ok := m.rules[ruleKey]

	// No results are cached for this rule at all yet; create its cache.
	if !ok {
		cache = NewRuleCache(m.metrics)
		m.rules[ruleKey] = cache
	}

	smpl, cached := cache.Get(m.ts, ls)
	if cached {
		m.metrics.cacheHits.WithLabelValues(m.userID).Inc()
		level.Debug(m.logger).Log("msg", "result cached", "rule", ruleKey)
		// The result is cached but the desired series is not in it, so it isn't considered active.
		if smpl == nil {
			return storage.NoopSeriesSet()
		}

		// If the labelset is cached we can consider it active. Return the for state sample active immediately.
		return series.NewConcreteSeriesSet(
			[]storage.Series{
				series.NewConcreteSeries(smpl.Metric, []model.SamplePair{
					{Timestamp: model.Time(util.TimeToMillis(m.ts)), Value: model.SampleValue(smpl.V)},
				}),
			},
		)
	}

	// see if the alert condition had any inhabitants at ts-forDuration. We can assume it's still firing because
	// that's the only condition under which this is queried (via RestoreForState).
	checkTime := m.ts.Add(-rule.HoldDuration())
	vec, err := m.queryFunc(m.ctx, rule.Query().String(), checkTime)
	if err != nil {
		level.Info(m.logger).Log("msg", "error querying for rule", "rule", ruleKey, "err", err.Error())
		m.metrics.evaluations.WithLabelValues(statusFailure, m.userID).Inc()
		return storage.NoopSeriesSet()
	}
	m.metrics.evaluations.WithLabelValues(statusSuccess, m.userID).Inc()
	level.Debug(m.logger).Log("msg", "rule state successfully restored", "rule", ruleKey, "len", len(vec))

	// translate the result into the ALERTS_FOR_STATE series for caching,
	// considered active & written at the timestamp requested
	forStateVec := make(promql.Vector, 0, len(vec))
	for _, smpl := range vec {
		ts := util.TimeToMillis(m.ts)

		forStateVec = append(forStateVec, promql.Sample{
			Metric: ForStateMetric(smpl.Metric, rule.Name()),
			Point: promql.Point{
				T: ts,
				V: float64(checkTime.Unix()),
			},
		})
	}

	// cache the result of the evaluation at this timestamp
	cache.Set(m.ts, forStateVec)

	// Finally return the series if it exists.
	// Calling cache.Get leverages the existing code to return only a single sample.
	smpl, ok = cache.Get(m.ts, ls)
	if !ok || smpl == nil {
		return storage.NoopSeriesSet()
	}
	// If the labelset is cached we can consider it active. Return the for state sample active immediately.
	return series.NewConcreteSeriesSet(
		[]storage.Series{
			series.NewConcreteSeries(smpl.Metric, []model.SamplePair{
				{Timestamp: model.Time(util.TimeToMillis(m.ts)), Value: model.SampleValue(smpl.V)},
			}),
		},
	)
}

// LabelValues returns all potential values for a label name.
func (*memStoreQuerier) LabelValues(name string, matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
	return nil, nil, errors.New("unimplemented")
}

// LabelNames returns all the unique label names present in the block in sorted order.
func (*memStoreQuerier) LabelNames(matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
	return nil, nil, errors.New("unimplemented")
}

// Close releases the resources of the Querier.
func (*memStoreQuerier) Close() error { return nil }
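
// RuleCache holds the synthetic ALERTS_FOR_STATE samples produced for a single
// rule, keyed first by evaluation timestamp and then by labelset hash.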
type RuleCache struct {
	mtx     sync.Mutex
	metrics *memstoreMetrics
	data    map[int64]map[uint64]promql.Sample
}
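
// NewRuleCache returns an empty RuleCache which reports its sample count via
// the shared memstore metrics.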
func NewRuleCache(metrics *memstoreMetrics) *RuleCache {
	return &RuleCache{
		data:    make(map[int64]map[uint64]promql.Sample),
		metrics: metrics,
	}
}
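
// Set caches every sample in vec under the given timestamp, keyed by labelset
// hash, and adds len(vec) to the in-memory sample gauge.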
func (c *RuleCache) Set(ts time.Time, vec promql.Vector) {
	c.mtx.Lock()
	defer c.mtx.Unlock()
	tsMap, ok := c.data[ts.UnixNano()]
	if !ok {
		tsMap = make(map[uint64]promql.Sample)
		c.data[ts.UnixNano()] = tsMap
	}

	for _, sample := range vec {
		tsMap[sample.Metric.Hash()] = sample
	}
	c.metrics.samples.Add(float64(len(vec)))
}

// Get returns the sample cached for ls at ts, if any. The boolean is true
// whenever results for that timestamp are cached at all, even if ls itself has
// no sample in them.
func (c *RuleCache) Get(ts time.Time, ls labels.Labels) (*promql.Sample, bool) {
	c.mtx.Lock()
	defer c.mtx.Unlock()

	match, ok := c.data[ts.UnixNano()]
	if !ok {
		return nil, false
	}

	smp, ok := match[ls.Hash()]
	if !ok {
		return nil, true
	}
	return &smp, true
}

// CleanupOldSamples removes samples that are outside of the rule's `For` duration.
// It reports whether the cache is empty afterwards.
func (c *RuleCache) CleanupOldSamples(olderThan time.Time) (empty bool) {
	c.mtx.Lock()
	defer c.mtx.Unlock()

	ns := olderThan.UnixNano()

	// This could be more efficient (logarithmic instead of linear)
	for ts, tsMap := range c.data {
		if ts < ns {
			delete(c.data, ts)
			c.metrics.samples.Add(-float64(len(tsMap)))
		}
	}
	return len(c.data) == 0
}