Querier/Ruler: add histogram to track fetched chunk size distribution (#8682)

**What this PR does / why we need it**:
We are looking at making some changes to the chunk caching strategy, and
we need this data to know what size chunks we're typically requesting.
For example, if the overwhelming majority of chunks are small, we may
decide to cache only small chunks to decrease the number of requests to
the object store; we could fit more chunks in the cache this way, which may
have a positive performance impact.
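
As a rough illustration (not part of this PR) of how the new histogram could inform that decision: once `loki_chunk_fetcher_fetched_size_bytes` is scraped by Prometheus, the share of fetches at or below a given bucket boundary can be computed directly from the `_bucket` series. The sketch below uses the Prometheus Go client's HTTP API to ask what fraction of chunks fetched from the object store over the last hour were at most 64KiB; the Prometheus address and the exact `le` label formatting are assumptions.

```go
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Assumption: a Prometheus server scraping Loki is reachable here.
	client, err := api.NewClient(api.Config{Address: "http://localhost:9090"})
	if err != nil {
		log.Fatalf("creating Prometheus client: %v", err)
	}
	promAPI := v1.NewAPI(client)

	// Fraction of chunks fetched from object storage over the last hour whose
	// compressed size was <= 64KiB (the 64*1024 bucket added in this PR).
	// The le="65536" value assumes the client's default float formatting.
	query := `sum(increase(loki_chunk_fetcher_fetched_size_bytes_bucket{source="store",le="65536"}[1h]))` +
		` / sum(increase(loki_chunk_fetcher_fetched_size_bytes_count{source="store"}[1h]))`

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	result, warnings, err := promAPI.Query(ctx, query, time.Now())
	if err != nil {
		log.Fatalf("querying Prometheus: %v", err)
	}
	if len(warnings) > 0 {
		log.Printf("warnings: %v", warnings)
	}
	fmt.Printf("fraction of store-fetched chunks <= 64KiB: %v\n", result)
}
```

The `source` label distinguishes chunks served from the chunk cache from those fetched from object storage, so the same query with `source="cache"` would show the distribution of cache hits instead.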
Files changed:
* CHANGELOG.md (+1)
* pkg/storage/chunk/fetcher/fetcher.go (+16)

CHANGELOG.md
@@ -6,6 +6,7 @@
 ##### Enhancements
+* [8682](https://github.com/grafana/loki/pull/8682) **dannykopping**: Add fetched chunk size distribution metric `loki_chunk_fetcher_fetched_size_bytes`.
 * [8532](https://github.com/grafana/loki/pull/8532) **justcompile**: Adds Storage Class option to S3 objects
 * [7951](https://github.com/grafana/loki/pull/7951) **MichelHollands**: Add a count template function to line_format and label_format.
 * [7380](https://github.com/grafana/loki/pull/7380) **liguozhong**: metrics query: range vector support streaming agg when no overlap.

pkg/storage/chunk/fetcher/fetcher.go
@@ -38,6 +38,15 @@ var (
         Name:      "cache_corrupt_chunks_total",
         Help:      "Total count of corrupt chunks found in cache.",
     })
+    chunkFetchedSize = promauto.NewHistogramVec(prometheus.HistogramOpts{
+        Namespace: "loki",
+        Subsystem: "chunk_fetcher",
+        Name:      "fetched_size_bytes",
+        Help:      "Compressed chunk size distribution fetched from storage.",
+        // TODO: expand these buckets if we ever make larger chunks
+        // TODO: consider adding `chunk_target_size` to this list in case users set very large chunk sizes
+        Buckets: []float64{128, 1024, 16 * 1024, 64 * 1024, 128 * 1024, 256 * 1024, 512 * 1024, 1024 * 1024, 1.5 * 1024 * 1024, 2 * 1024 * 1024, 4 * 1024 * 1024},
+    }, []string{"source"})
 )

 const chunkDecodeParallelism = 16
@@ -173,6 +182,11 @@ func (c *Fetcher) FetchChunks(ctx context.Context, chunks []chunk.Chunk, keys []
     if err != nil {
         level.Warn(log).Log("msg", "error fetching from cache", "err", err)
     }
+
+    for _, buf := range cacheBufs {
+        chunkFetchedSize.WithLabelValues("cache").Observe(float64(len(buf)))
+    }
+
     fromCache, missing, err := c.processCacheResponse(ctx, chunks, cacheHits, cacheBufs)
     if err != nil {
         level.Warn(log).Log("msg", "error process response from cache", "err", err)
@@ -188,6 +202,8 @@ func (c *Fetcher) FetchChunks(ctx context.Context, chunks []chunk.Chunk, keys []
     var bytes int
     for _, c := range fromStorage {
         bytes += c.Size()
+        chunkFetchedSize.WithLabelValues("store").Observe(float64(c.Size()))
     }
     st := stats.FromContext(ctx)

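For completeness, here is a minimal, self-contained sketch (not Loki code) of the same `promauto.NewHistogramVec` pattern the diff introduces: it registers an equivalent histogram against a throwaway registry, feeds it a few synthetic chunk sizes for the `cache` and `store` sources, and prints the resulting cumulative bucket counts so you can see the `_bucket` series the real metric will expose. The sample sizes are illustrative.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	// Throwaway registry so this sketch doesn't touch the default one.
	reg := prometheus.NewRegistry()

	// Same shape and buckets as the metric added in fetcher.go.
	chunkFetchedSize := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
		Namespace: "loki",
		Subsystem: "chunk_fetcher",
		Name:      "fetched_size_bytes",
		Help:      "Compressed chunk size distribution fetched from storage.",
		Buckets:   []float64{128, 1024, 16 * 1024, 64 * 1024, 128 * 1024, 256 * 1024, 512 * 1024, 1024 * 1024, 1.5 * 1024 * 1024, 2 * 1024 * 1024, 4 * 1024 * 1024},
	}, []string{"source"})

	// Synthetic observations standing in for real chunk fetches.
	for _, size := range []float64{900, 48 * 1024, 200 * 1024} {
		chunkFetchedSize.WithLabelValues("cache").Observe(size)
	}
	chunkFetchedSize.WithLabelValues("store").Observe(1.2 * 1024 * 1024)

	// Gather and print the cumulative bucket counts per label set.
	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		for _, m := range mf.GetMetric() {
			fmt.Println(mf.GetName(), m.GetLabel())
			for _, b := range m.GetHistogram().GetBucket() {
				fmt.Printf("  le=%g count=%d\n", b.GetUpperBound(), b.GetCumulativeCount())
			}
		}
	}
}
```

Because the buckets are cumulative, a single pass over the printed counts is enough to read off how many observed chunks fall at or below each boundary, which is exactly the data the caching decision described above needs.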