loki/pkg/querier/queryrange/log_result_cache.go

package queryrange

import (
    "context"
    "fmt"
    "net/http"
    "time"

    "github.com/go-kit/log"
    "github.com/go-kit/log/level"
    "github.com/gogo/protobuf/proto"
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
    "github.com/prometheus/common/model"
    "github.com/weaveworks/common/httpgrpc"
    "golang.org/x/sync/errgroup"

    "github.com/grafana/dskit/tenant"

    "github.com/grafana/loki/pkg/loghttp"
    "github.com/grafana/loki/pkg/logproto"
    "github.com/grafana/loki/pkg/logqlmodel/stats"
    "github.com/grafana/loki/pkg/querier/queryrange/queryrangebase"
    "github.com/grafana/loki/pkg/storage/chunk/cache"
    "github.com/grafana/loki/pkg/util/validation"
)

// LogResultCacheMetrics is the metrics wrapper used in the log result cache.
type LogResultCacheMetrics struct {
    CacheHit  prometheus.Counter
    CacheMiss prometheus.Counter
}

// NewLogResultCacheMetrics creates metrics to be used in log result cache.
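// The counters are exposed as loki_query_frontend_log_result_cache_hit_total
// and loki_query_frontend_log_result_cache_miss_total.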
func NewLogResultCacheMetrics(registerer prometheus.Registerer) *LogResultCacheMetrics {
    return &LogResultCacheMetrics{
        CacheHit: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
            Namespace: "loki",
            Name:      "query_frontend_log_result_cache_hit_total",
        }),
        CacheMiss: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
            Namespace: "loki",
            Name:      "query_frontend_log_result_cache_miss_total",
        }),
    }
}

// NewLogResultCache creates a new log result cache middleware.
// Currently it caches only empty filter query results, since those are cheap
// and safe to cache. Non-empty results are harder to reuse because of the limit
// query parameter and the size of the response; the cache could be extended to
// them in the future.
// See https://docs.google.com/document/d/1_mACOpxdWZ5K0cIedaja5gzMbv-m0lUVazqZd2O4mEU/edit
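//
// A minimal wiring sketch (handler and variable names below are illustrative,
// not part of this file):
//
//    mw := NewLogResultCache(logger, limits, resultsCache, nil, nil)
//    handler := mw.Wrap(downstream) // downstream is any queryrangebase.Handler
//    resp, err := handler.Do(ctx, req)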
func NewLogResultCache(logger log.Logger, limits Limits, cache cache.Cache, shouldCache queryrangebase.ShouldCacheFn, metrics *LogResultCacheMetrics) queryrangebase.Middleware {
    if metrics == nil {
        metrics = NewLogResultCacheMetrics(nil)
    }
    return queryrangebase.MiddlewareFunc(func(next queryrangebase.Handler) queryrangebase.Handler {
        return &logResultCache{
            next:        next,
            limits:      limits,
            cache:       cache,
            logger:      logger,
            shouldCache: shouldCache,
            metrics:     metrics,
        }
    })
}

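// logResultCache consults the cache in Do and falls back to the next handler
// on a miss, on partial coverage, or on any cache error.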
type logResultCache struct {
    next        queryrangebase.Handler
    limits      Limits
    cache       cache.Cache
    shouldCache queryrangebase.ShouldCacheFn
    metrics     *LogResultCacheMetrics
    logger      log.Logger
}

func (l *logResultCache) Do(ctx context.Context, req queryrangebase.Request) (queryrangebase.Response, error) {
    tenantIDs, err := tenant.TenantIDs(ctx)
    if err != nil {
        return nil, httpgrpc.Errorf(http.StatusBadRequest, err.Error())
    }
    if l.shouldCache != nil && !l.shouldCache(req) {
        return l.next.Do(ctx, req)
    }
    // Never cache results that include data newer than max_cache_freshness:
    // recent data may still change as it is ingested.
    maxCacheFreshness := validation.MaxDurationPerTenant(tenantIDs, l.limits.MaxCacheFreshness)
    maxCacheTime := int64(model.Now().Add(-maxCacheFreshness))
    if req.GetEnd() > maxCacheTime {
        return l.next.Do(ctx, req)
    }
    lokiReq, ok := req.(*LokiRequest)
    if !ok {
        return nil, httpgrpc.Errorf(http.StatusInternalServerError, "invalid request type %T", req)
    }
    interval := validation.SmallestPositiveNonZeroDurationPerTenant(tenantIDs, l.limits.QuerySplitDuration)
    // Skip caching if the split interval is unset.
    if interval == 0 {
        return l.next.Do(ctx, req)
    }
    // The first subquery might not be aligned.
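    // Aligning the start down to the split interval means every request whose
    // start falls in the same window maps to the same cache key. For example,
    // with a 1h interval, start times of 10:05 and 10:47 both align to 10:00.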
    alignedStart := time.Unix(0, lokiReq.GetStartTs().UnixNano()-(lokiReq.GetStartTs().UnixNano()%interval.Nanoseconds()))
    // Generate the cache key from the tenant(s), the query, the split interval,
    // and the index of the interval-sized bucket the aligned start falls into.
    cacheKey := fmt.Sprintf("log:%s:%s:%d:%d", tenant.JoinTenantIDs(tenantIDs), req.GetQuery(), interval.Nanoseconds(), alignedStart.UnixNano()/interval.Nanoseconds())
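    // An illustrative key (all values made up): log:fake:{app="foo"} |= "err":1800000000000:945123
    // for tenant "fake", a 30m interval (1800000000000ns), and bucket index 945123.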
    _, buff, _, err := l.cache.Fetch(ctx, []string{cache.HashKey(cacheKey)})
    if err != nil {
        level.Warn(l.logger).Log("msg", "error fetching cache", "err", err, "cacheKey", cacheKey)
        return l.next.Do(ctx, req)
    }
    // We expect at most one key to be found.
    if len(buff) > 1 {
        level.Warn(l.logger).Log("msg", "unexpected length of cache return values", "buff", len(buff))
        return l.next.Do(ctx, req)
    }
    if len(buff) == 0 {
        // cache miss
        return l.handleMiss(ctx, cacheKey, lokiReq)
    }
    // cache hit
    var cachedRequest LokiRequest
    err = proto.Unmarshal(buff[0], &cachedRequest)
    if err != nil {
        level.Warn(l.logger).Log("msg", "error unmarshalling request from cache", "err", err)
        return l.next.Do(ctx, req)
    }
    return l.handleHit(ctx, cacheKey, &cachedRequest, lokiReq)
}

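// handleMiss runs the request downstream. If the result is empty, it marshals
// the request itself into the cache: the stored request records the time range
// now known to be empty for this query.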
func (l *logResultCache) handleMiss(ctx context.Context, cacheKey string, req *LokiRequest) (queryrangebase.Response, error) {
    l.metrics.CacheMiss.Inc()
    level.Debug(l.logger).Log("msg", "cache miss", "key", cacheKey)
    resp, err := l.next.Do(ctx, req)
    if err != nil {
        return nil, err
    }
    lokiRes, ok := resp.(*LokiResponse)
    if !ok {
        return nil, fmt.Errorf("unexpected response type %T", resp)
    }
    // At the moment we only cache empty results.
    if !isEmpty(lokiRes) {
        return resp, nil
    }
    data, err := proto.Marshal(req)
    if err != nil {
        level.Warn(l.logger).Log("msg", "error marshalling request", "err", err)
        return resp, nil
    }
    // Store the request; it records the empty time range for this cache key.
    err = l.cache.Store(ctx, []string{cache.HashKey(cacheKey)}, [][]byte{data})
    if err != nil {
        level.Warn(l.logger).Log("msg", "error storing cache", "err", err)
    }
    return resp, nil
}

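// handleHit answers the request from the cached empty range, concurrently
// fetching any uncovered portion at the start and/or end. When the freshly
// fetched portions are also empty, the cached range is extended to cover them.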
func (l *logResultCache) handleHit(ctx context.Context, cacheKey string, cachedRequest *LokiRequest, lokiReq *LokiRequest) (queryrangebase.Response, error) {
    l.metrics.CacheHit.Inc()
    // we start with an empty response
    result := emptyResponse(cachedRequest)
    // If the cached request covers the whole requested time range,
    // we can return the cached (empty) result as-is.
    if !lokiReq.GetStartTs().Before(cachedRequest.GetStartTs()) && !lokiReq.GetEndTs().After(cachedRequest.GetEndTs()) {
        return result, nil
    }
    // Otherwise we may be missing data at the start and/or the end,
    // so fetch whatever is missing.
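    //
    //   requested: |----------------------------|
    //   cached:            [------------]
    //              ^~~~~~~~^            ^~~~~~~~^
    //              start gap             end gap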
    var (
        startRequest, endRequest *LokiRequest
        startResp, endResp       *LokiResponse
        updateCache              bool
    )
    g, ctx := errgroup.WithContext(ctx)
    // if we're missing data at the start, fetch from the request start to the cached start.
    if lokiReq.GetStartTs().Before(cachedRequest.GetStartTs()) {
        g.Go(func() error {
            startRequest = lokiReq.WithStartEndTime(lokiReq.GetStartTs(), cachedRequest.GetStartTs())
            resp, err := l.next.Do(ctx, startRequest)
            if err != nil {
                return err
            }
            // ok is declared per goroutine to avoid a data race between the two fetches.
            var ok bool
            startResp, ok = resp.(*LokiResponse)
            if !ok {
                return fmt.Errorf("unexpected response type %T", resp)
            }
            return nil
        })
    }
    // if we're missing data at the end, fetch from the cached end to the request end.
    if lokiReq.GetEndTs().After(cachedRequest.GetEndTs()) {
        g.Go(func() error {
            endRequest = lokiReq.WithStartEndTime(cachedRequest.GetEndTs(), lokiReq.GetEndTs())
            resp, err := l.next.Do(ctx, endRequest)
            if err != nil {
                return err
            }
            var ok bool
            endResp, ok = resp.(*LokiResponse)
            if !ok {
                return fmt.Errorf("unexpected response type %T", resp)
            }
            return nil
        })
    }
    if err := g.Wait(); err != nil {
        return nil, err
    }
    // If the start fetch came back empty, extend the cached range to cover it
    // and mark the cache entry for update; otherwise merge its streams into the
    // result (bailing out on a non-success status).
    if startResp != nil {
        if isEmpty(startResp) {
            cachedRequest = cachedRequest.WithStartEndTime(startRequest.GetStartTs(), cachedRequest.GetEndTs())
            updateCache = true
        } else {
            if startResp.Status != loghttp.QueryStatusSuccess {
                return startResp, nil
            }
            result = mergeLokiResponse(startResp, result)
        }
    }
    // Same for the end fetch: extend the cached range if empty, merge otherwise.
    if endResp != nil {
        if isEmpty(endResp) {
            cachedRequest = cachedRequest.WithStartEndTime(cachedRequest.GetStartTs(), endRequest.GetEndTs())
            updateCache = true
        } else {
            if endResp.Status != loghttp.QueryStatusSuccess {
                return endResp, nil
            }
            result = mergeLokiResponse(endResp, result)
        }
    }
    // The cached empty range grew at the start and/or the end, so update the cache entry.
    if updateCache {
        data, err := proto.Marshal(cachedRequest)
        if err != nil {
            // Degrade gracefully, as in handleMiss: we already have a valid result.
            level.Warn(l.logger).Log("msg", "error marshalling request", "err", err)
            return result, nil
        }
        // Store the extended request, which records the wider empty range.
        err = l.cache.Store(ctx, []string{cache.HashKey(cacheKey)}, [][]byte{data})
        if err != nil {
            level.Warn(l.logger).Log("msg", "error storing cache", "err", err)
        }
    }
    return result, nil
}
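
// isEmpty reports whether lokiRes succeeded and contains no streams. A failed
// response is never considered empty, so errors are never cached.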
func isEmpty(lokiRes *LokiResponse) bool {
    return lokiRes.Status == loghttp.QueryStatusSuccess && len(lokiRes.Data.Result) == 0
}
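
// emptyResponse builds a successful response with zero streams, mirroring the
// request's direction, limit, and API version.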
func emptyResponse(lokiReq *LokiRequest) *LokiResponse {
    return &LokiResponse{
        Status:     loghttp.QueryStatusSuccess,
        Statistics: stats.Result{},
        Direction:  lokiReq.Direction,
        Limit:      lokiReq.Limit,
        Version:    uint32(loghttp.GetVersion(lokiReq.Path)),
        Data: LokiData{
            ResultType: loghttp.ResultTypeStream,
            Result:     []logproto.Stream{},
        },
    }
}