fix(log results cache): compose empty response based on the request (#11657)

**What this PR does / why we need it**: Log results cache when handling a hit composes an empty response based on the cached request. But the limit or direction fields in the cached request need not match with the current request being served. This causes the log results cache to return a response with incorrect limit. This incorrect limit could then get applied when merging responses upstream (split by interval mw for ex.) This pr fixes this by composing the response based on the request being served. I also thought about updating the cache key to include both limit and direction to have a clear separation, but I left it as is for the following reason: if a time range contains no log lines, that result would not change irrespective of a different limit or direction **Which issue(s) this PR fixes**: Fixes #<issue number> **Special notes for your reviewer**: **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [ ] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](d10549e3ec) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](0d4416a4b0)
2 years ago · e915efc7f8
parent e7b9455327
commit e915efc7f8
3 changed files with 53 additions and 2 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -55,6 +55,7 @@
 * [11551](https://github.com/grafana/loki/pull/11551) **dannykopping** Do not reflect label names in request metrics' "route" label.
 * [11601](https://github.com/grafana/loki/pull/11601) **dannykopping** Ruler: Fixed a panic that can be caused by concurrent read-write access of tenant configs when there are a large amount of rules.
 * [11606](https://github.com/grafana/loki/pull/11606) **dannykopping** Fixed regression adding newlines to HTTP error response bodies which may break client integrations.
+* [11657](https://github.com/grafana/loki/pull/11657) **ashwanthgoli** Log results cache: compose empty response based on the request being served to avoid returning incorrect limit or direction.

 ##### Changes

--- a/pkg/querier/queryrange/log_result_cache.go
+++ b/pkg/querier/queryrange/log_result_cache.go
@ -106,7 +106,8 @@ func (l *logResultCache) Do(ctx context.Context, req queryrangebase.Request) (qu

 	interval := validation.SmallestPositiveNonZeroDurationPerTenant(tenantIDs, l.limits.QuerySplitDuration)
 	// skip caching by if interval is unset
-	if interval == 0 {
+	// skip caching when limit is 0 as it would get registerted as empty result in the cache even if that time range contains log lines.
+	if interval == 0 || lokiReq.Limit == 0 {
 		return l.next.Do(ctx, req)
 	}
 	// The first subquery might not be aligned.
@ -181,7 +182,7 @@ func (l *logResultCache) handleMiss(ctx context.Context, cacheKey string, req *L
 func (l *logResultCache) handleHit(ctx context.Context, cacheKey string, cachedRequest *LokiRequest, lokiReq *LokiRequest) (queryrangebase.Response, error) {
 	l.metrics.CacheHit.Inc()
 	// we start with an empty response
-	result := emptyResponse(cachedRequest)
+	result := emptyResponse(lokiReq)
 	// if the request is the same and cover the whole time range,
 	// we can just return the cached result.
 	if cachedRequest.StartTs.UnixNano() <= lokiReq.StartTs.UnixNano() && cachedRequest.EndTs.UnixNano() >= lokiReq.EndTs.UnixNano() {
--- a/pkg/querier/queryrange/log_result_cache_test.go
+++ b/pkg/querier/queryrange/log_result_cache_test.go
@ -580,6 +580,54 @@ func Test_LogResultNonOverlappingCache(t *testing.T) {
 	fake.AssertExpectations(t)
 }

+func Test_LogResultCacheDifferentLimit(t *testing.T) {
+	var (
+		ctx = user.InjectOrgID(context.Background(), "foo")
+		lrc = NewLogResultCache(
+			log.NewNopLogger(),
+			fakeLimits{
+				splitDuration: map[string]time.Duration{"foo": time.Minute},
+			},
+			cache.NewMockCache(),
+			nil,
+			nil,
+			nil,
+		)
+	)
+
+	req1 := &LokiRequest{
+		StartTs: time.Unix(0, time.Minute.Nanoseconds()),
+		EndTs:   time.Unix(0, 2*time.Minute.Nanoseconds()),
+		Limit:   entriesLimit,
+	}
+
+	req2 := &LokiRequest{
+		StartTs: time.Unix(0, time.Minute.Nanoseconds()),
+		EndTs:   time.Unix(0, 2*time.Minute.Nanoseconds()),
+		Limit:   10,
+	}
+
+	fake := newFakeResponse([]mockResponse{
+		{
+			RequestResponse: queryrangebase.RequestResponse{
+				Request:  req1,
+				Response: emptyResponse(req1),
+			},
+		},
+	})
+
+	h := lrc.Wrap(fake)
+
+	resp, err := h.Do(ctx, req1)
+	require.NoError(t, err)
+	require.Equal(t, emptyResponse(req1), resp)
+	resp, err = h.Do(ctx, req2)
+	require.NoError(t, err)
+	require.Equal(t, emptyResponse(req2), resp)
+
+	fake.AssertExpectations(t)
+}
+
 func TestExtractLokiResponse(t *testing.T) {
 	for _, tc := range []struct {
 		name           string
@ -677,6 +725,7 @@ func newFakeResponse(responses []mockResponse) fakeResponse {
 	for _, r := range responses {
 		m.On("Do", mock.Anything, r.Request).Return(r.Response, r.err).Once()
 	}
+
 	return fakeResponse{
 		Mock: m,
 	}