// loki/pkg/querier/queryrange/queryrangebase/querysharding_test.go


package queryrangebase

import (
"context"
"fmt"
"math"
"runtime"
"testing"
"time"

"github.com/go-kit/log"
"github.com/pkg/errors"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/promql"
"github.com/prometheus/prometheus/promql/parser"
"github.com/prometheus/prometheus/storage"
"github.com/stretchr/testify/require"

"github.com/grafana/loki/pkg/logproto"
"github.com/grafana/loki/pkg/storage/chunk"
"github.com/grafana/loki/pkg/util"
)
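
// TestQueryshardingMiddleware exercises the sharding middleware end to end:
// an unparseable query must surface an error, a downstream error must
// propagate, and an already-embedded query must pass through unchanged.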
func TestQueryshardingMiddleware(t *testing.T) {
var testExpr = []struct {
name string
next Handler
input Request
ctx context.Context
expected *PrometheusResponse
err bool
override func(*testing.T, Handler)
}{
{
name: "invalid query error",
// if the query parses correctly, force it to succeed
next: mockHandlerWith(&PrometheusResponse{
Status: "",
Data: PrometheusData{
ResultType: string(parser.ValueTypeVector),
Result: []SampleStream{},
},
ErrorType: "",
Error: "",
}, nil),
input: &PrometheusRequest{Query: "^GARBAGE"},
ctx: context.Background(),
expected: nil,
err: true,
},
{
name: "downstream err",
next: mockHandlerWith(nil, errors.Errorf("some err")),
input: defaultReq(),
ctx: context.Background(),
expected: nil,
err: true,
},
{
name: "successful trip",
next: mockHandlerWith(sampleMatrixResponse(), nil),
override: func(t *testing.T, handler Handler) {
// Pre-encode the query so the middleware doesn't try to re-shard it; we're only testing that it passes through correctly.
qry := defaultReq().WithQuery(
`__embedded_queries__{__cortex_queries__="{\"Concat\":[\"http_requests_total{cluster=\\\"prod\\\"}\"]}"}`,
)
out, err := handler.Do(context.Background(), qry)
require.Nil(t, err)
require.Equal(t, string(parser.ValueTypeMatrix), out.(*PrometheusResponse).Data.ResultType)
require.Equal(t, sampleMatrixResponse(), out)
},
},
}
for _, c := range testExpr {
t.Run(c.name, func(t *testing.T) {
engine := promql.NewEngine(promql.EngineOpts{
Logger: log.NewNopLogger(),
Reg: nil,
MaxSamples: 1000,
Timeout: time.Minute,
})
handler := NewQueryShardMiddleware(
log.NewNopLogger(),
engine,
ShardingConfigs{
{
RowShards: 3,
},
},
PrometheusCodec,
0,
nil,
nil,
).Wrap(c.next)
// escape hatch for custom tests
if c.override != nil {
c.override(t, handler)
return
}
out, err := handler.Do(c.ctx, c.input)
if c.err {
require.NotNil(t, err)
} else {
require.Nil(t, err)
require.Equal(t, c.expected, out)
}
})
}
}
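
// sampleMatrixResponse returns a fixed two-series matrix response used as the
// canned downstream result in the passthrough test above.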
func sampleMatrixResponse() *PrometheusResponse {
return &PrometheusResponse{
Status: StatusSuccess,
Data: PrometheusData{
ResultType: string(parser.ValueTypeMatrix),
Result: []SampleStream{
{
Labels: []logproto.LabelAdapter{
{Name: "a", Value: "a1"},
{Name: "b", Value: "b1"},
},
Samples: []logproto.LegacySample{
{
TimestampMs: 5,
Value: 1,
},
{
TimestampMs: 10,
Value: 2,
},
},
},
{
Labels: []logproto.LabelAdapter{
{Name: "a", Value: "a1"},
{Name: "b", Value: "b1"},
},
Samples: []logproto.LegacySample{
{
TimestampMs: 5,
Value: 8,
},
{
TimestampMs: 10,
Value: 9,
},
},
},
},
},
}
}
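
// mockHandlerWith returns a Handler that yields the given response and error,
// short-circuiting with the context's error if the context is already done.
// For example:
//
//	h := mockHandlerWith(nil, errors.New("boom"))
//	_, err := h.Do(context.Background(), defaultReq()) // returns the "boom" error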
func mockHandlerWith(resp *PrometheusResponse, err error) Handler {
return HandlerFunc(func(ctx context.Context, req Request) (Response, error) {
if expired := ctx.Err(); expired != nil {
return nil, expired
}
return resp, err
})
}
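
// defaultReq returns a minimal range request; tests override its fields (or
// its query) as needed.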
func defaultReq() *PrometheusRequest {
return &PrometheusRequest{
Path: "/query_range",
Start: 0,
End: 10,
Step: 5,
Timeout: time.Minute,
Query: `sum(rate(http_requests_total{}[5m]))`,
}
}
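
// TestShardingConfigs_ValidRange checks that ValidRange rejects an empty
// config set, a request that starts before the first config, and a request
// that spans multiple configs, and that it otherwise selects the single
// config whose period covers the request.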
func TestShardingConfigs_ValidRange(t *testing.T) {
reqWith := func(start, end string) *PrometheusRequest {
r := defaultReq()
if start != "" {
r.Start = int64(parseDate(start))
}
if end != "" {
r.End = int64(parseDate(end))
}
return r
}
var testExpr = []struct {
name string
confs ShardingConfigs
req *PrometheusRequest
expected chunk.PeriodConfig
err error
}{
{
name: "0 ln configs fail",
confs: ShardingConfigs{},
req: defaultReq(),
err: errInvalidShardingRange,
},
{
name: "request starts before beginning config",
confs: ShardingConfigs{
{
From: chunk.DayTime{Time: parseDate("2019-10-16")},
RowShards: 1,
},
},
req: reqWith("2019-10-15", ""),
err: errInvalidShardingRange,
},
{
name: "request spans multiple configs",
confs: ShardingConfigs{
{
From: chunk.DayTime{Time: parseDate("2019-10-16")},
RowShards: 1,
},
{
From: chunk.DayTime{Time: parseDate("2019-11-16")},
RowShards: 2,
},
},
req: reqWith("2019-10-15", "2019-11-17"),
err: errInvalidShardingRange,
},
{
name: "selects correct config ",
confs: ShardingConfigs{
{
From: chunk.DayTime{Time: parseDate("2019-10-16")},
RowShards: 1,
},
{
From: chunk.DayTime{Time: parseDate("2019-11-16")},
RowShards: 2,
},
{
From: chunk.DayTime{Time: parseDate("2019-12-16")},
RowShards: 3,
},
},
req: reqWith("2019-11-20", "2019-11-25"),
expected: chunk.PeriodConfig{
From: chunk.DayTime{Time: parseDate("2019-11-16")},
RowShards: 2,
},
},
}
for _, c := range testExpr {
t.Run(c.name, func(t *testing.T) {
out, err := c.confs.ValidRange(c.req.Start, c.req.End)
if c.err != nil {
require.EqualError(t, err, c.err.Error())
} else {
require.Nil(t, err)
require.Equal(t, c.expected, out)
}
})
}
}
func parseDate(in string) model.Time {
t, err := time.Parse("2006-01-02", in)
if err != nil {
panic(err)
}
// model.Time is milliseconds since epoch, so convert from the parsed
// time's nanoseconds rather than casting them directly.
return model.TimeFromUnixNano(t.UnixNano())
}
// mappingValidator can be injected into a middleware chain to assert that the incoming query matches the expected mapped query before it is passed downstream
type mappingValidator struct {
expected string
next Handler
}
func (v *mappingValidator) Do(ctx context.Context, req Request) (Response, error) {
expr, err := parser.ParseExpr(req.GetQuery())
if err != nil {
return nil, err
}
if v.expected != expr.String() {
return nil, errors.Errorf("bad query mapping: expected [%s], got [%s]", v.expected, expr.String())
}
return v.next.Do(ctx, req)
}
// approximatelyEquals asserts that two responses are approximately equal, comparing sample values rounded to 6 decimal places
func approximatelyEquals(t *testing.T, a, b *PrometheusResponse) {
require.Equal(t, a.Status, b.Status)
if a.Status != StatusSuccess {
return
}
as, err := ResponseToSamples(a)
require.Nil(t, err)
bs, err := ResponseToSamples(b)
require.Nil(t, err)
require.Equal(t, len(as), len(bs))
for i := 0; i < len(as); i++ {
a := as[i]
b := bs[i]
require.Equal(t, a.Labels, b.Labels)
require.Equal(t, len(a.Samples), len(b.Samples))
for j := 0; j < len(a.Samples); j++ {
aSample := &a.Samples[j]
aSample.Value = math.Round(aSample.Value*1e6) / 1e6
bSample := &b.Samples[j]
bSample.Value = math.Round(bSample.Value*1e6) / 1e6
}
require.Equal(t, a, b)
}
}
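
// TestQueryshardingCorrectness verifies both halves of sharding: the query is
// mapped to the expected sharded AST, and executing the sharded plan returns
// (approximately) the same result as running the original query downstream.
// Sharded legs are serialized as a JSON {"Concat":[...]} payload carried in
// the __cortex_queries__ label of the synthetic __embedded_queries__ metric,
// e.g.:
//
//	__embedded_queries__{__cortex_queries__="{\"Concat\":[\"<leg 0>\",\"<leg 1>\"]}"}
//
// The engine, start, end, step, and shardAwareQueryable fixtures are assumed
// to be shared helpers defined elsewhere in this test package.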
func TestQueryshardingCorrectness(t *testing.T) {
shardFactor := 2
req := &PrometheusRequest{
Path: "/query_range",
Start: util.TimeToMillis(start),
End: util.TimeToMillis(end),
Step: int64(step) / int64(time.Second),
}
for _, tc := range []struct {
desc string
query string
mapped string
}{
{
desc: "fully encoded histogram_quantile",
query: `histogram_quantile(0.5, rate(bar1{baz="blip"}[30s]))`,
mapped: `__embedded_queries__{__cortex_queries__="{\"Concat\":[\"histogram_quantile(0.5, rate(bar1{baz=\\\"blip\\\"}[30s]))\"]}"}`,
},
{
desc: "entire query with shard summer",
query: `sum by (foo,bar) (min_over_time(bar1{baz="blip"}[1m]))`,
mapped: `sum by(foo, bar) (__embedded_queries__{__cortex_queries__="{\"Concat\":[\"sum by(foo, bar, __cortex_shard__) (min_over_time(bar1{__cortex_shard__=\\\"0_of_2\\\",baz=\\\"blip\\\"}[1m]))\",\"sum by(foo, bar, __cortex_shard__) (min_over_time(bar1{__cortex_shard__=\\\"1_of_2\\\",baz=\\\"blip\\\"}[1m]))\"]}"})`,
},
{
desc: "shard one leg encode the other",
query: "sum(rate(bar1[1m])) or rate(bar1[1m])",
mapped: `sum without(__cortex_shard__) (__embedded_queries__{__cortex_queries__="{\"Concat\":[\"sum by(__cortex_shard__) (rate(bar1{__cortex_shard__=\\\"0_of_2\\\"}[1m]))\",\"sum by(__cortex_shard__) (rate(bar1{__cortex_shard__=\\\"1_of_2\\\"}[1m]))\"]}"}) or __embedded_queries__{__cortex_queries__="{\"Concat\":[\"rate(bar1[1m])\"]}"}`,
},
{
desc: "should skip encoding leaf scalar/strings",
query: `histogram_quantile(0.5, sum(rate(cortex_cache_value_size_bytes_bucket[5m])) by (le))`,
mapped: `histogram_quantile(0.5, sum by(le) (__embedded_queries__{__cortex_queries__="{\"Concat\":[\"sum by(le, __cortex_shard__) (rate(cortex_cache_value_size_bytes_bucket{__cortex_shard__=\\\"0_of_2\\\"}[5m]))\",\"sum by(le, __cortex_shard__) (rate(cortex_cache_value_size_bytes_bucket{__cortex_shard__=\\\"1_of_2\\\"}[5m]))\"]}"}))`,
},
{
desc: "ensure sharding sub aggregations are skipped to avoid non-associative series merging across shards",
query: `sum(
count(
count(
bar1
) by (drive,instance)
) by (instance)
)`,
mapped: `__embedded_queries__{__cortex_queries__="{\"Concat\":[\"sum(count by(instance) (count by(drive, instance) (bar1)))\"]}"}`,
},
} {
t.Run(tc.desc, func(t *testing.T) {
shardingConf := ShardingConfigs{
chunk.PeriodConfig{
Schema: "v10",
RowShards: uint32(shardFactor),
},
}
shardingware := NewQueryShardMiddleware(
log.NewNopLogger(),
engine,
// ensure that all requests are shard compatible
shardingConf,
PrometheusCodec,
0,
nil,
nil,
)
downstream := &downstreamHandler{
engine: engine,
queryable: shardAwareQueryable,
}
assertionMWare := MiddlewareFunc(func(next Handler) Handler {
return &mappingValidator{
expected: tc.mapped,
next: next,
}
})
mapperware := MiddlewareFunc(func(next Handler) Handler {
return newASTMapperware(shardingConf, next, log.NewNopLogger(), nil)
})
r := req.WithQuery(tc.query)
// ensure the expected ast mapping occurs
_, err := MergeMiddlewares(mapperware, assertionMWare).Wrap(downstream).Do(context.Background(), r)
require.Nil(t, err)
shardedRes, err := shardingware.Wrap(downstream).Do(context.Background(), r)
require.Nil(t, err)
res, err := downstream.Do(context.Background(), r)
require.Nil(t, err)
approximatelyEquals(t, res.(*PrometheusResponse), shardedRes.(*PrometheusResponse))
})
}
}
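
// TestShardSplitting verifies the lookback split: only the part of a request
// older than the lookback window (relative to now) is sharded, so a request
// that overlaps or falls entirely inside the window must bypass sharding,
// while the merged response must match the unsharded result either way.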
func TestShardSplitting(t *testing.T) {
for _, tc := range []struct {
desc string
lookback time.Duration
shouldShard bool
}{
{
desc: "older than lookback",
lookback: -1, // a negative lookback ensures the entire query doesn't cross the sharding boundary and can safely be sharded.
shouldShard: true,
},
{
desc: "overlaps lookback",
lookback: end.Sub(start) / 2, // intersect the request causing it to avoid sharding
shouldShard: false,
},
{
desc: "newer than lookback",
lookback: end.Sub(start) + 1,
shouldShard: false,
},
} {
t.Run(tc.desc, func(t *testing.T) {
req := &PrometheusRequest{
Path: "/query_range",
Start: util.TimeToMillis(start),
End: util.TimeToMillis(end),
Step: int64(step) / int64(time.Second),
Query: "sum(rate(bar1[1m]))",
}
shardingware := NewQueryShardMiddleware(
log.NewNopLogger(),
engine,
// ensure that all requests are shard compatible
ShardingConfigs{
chunk.PeriodConfig{
Schema: "v10",
RowShards: uint32(2),
},
},
PrometheusCodec,
tc.lookback,
nil,
nil,
)
downstream := &downstreamHandler{
engine: engine,
queryable: shardAwareQueryable,
}
handler := shardingware.Wrap(downstream).(*shardSplitter)
handler.now = func() time.Time { return end } // make the split cut the request in half (don't use time.Now)
var didShard bool
old := handler.shardingware
handler.shardingware = HandlerFunc(func(ctx context.Context, req Request) (Response, error) {
didShard = true
return old.Do(ctx, req)
})
resp, err := handler.Do(context.Background(), req)
require.Nil(t, err)
require.Equal(t, tc.shouldShard, didShard)
unaltered, err := downstream.Do(context.Background(), req)
require.Nil(t, err)
approximatelyEquals(t, unaltered.(*PrometheusResponse), resp.(*PrometheusResponse))
})
}
}
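
// BenchmarkQuerySharding measures sharded execution across shard factors
// (powers of two, capped at half the CPU count), with and without an
// artificial per-series delay from the mock queryable.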
func BenchmarkQuerySharding(b *testing.B) {
var shards []uint32
// max out at half the available CPU cores to minimize noisy-neighbor issues while benchmarking
for shard := 1; shard <= runtime.NumCPU()/2; shard = shard * 2 {
shards = append(shards, uint32(shard))
}
for _, tc := range []struct {
labelBuckets int
labels []string
samplesPerSeries int
query string
desc string
}{
// Ensure you have enough cores to run these benchmarks without blocking:
// we want to simulate parallel computation, and queueing skews the results.
// no-group
{
labelBuckets: 16,
labels: []string{"a", "b", "c"},
samplesPerSeries: 100,
query: `sum(rate(http_requests_total[5m]))`,
desc: "sum nogroup",
},
// sum by
{
labelBuckets: 16,
labels: []string{"a", "b", "c"},
samplesPerSeries: 100,
query: `sum by(a) (rate(http_requests_total[5m]))`,
desc: "sum by",
},
// sum without
{
labelBuckets: 16,
labels: []string{"a", "b", "c"},
samplesPerSeries: 100,
query: `sum without (a) (rate(http_requests_total[5m]))`,
desc: "sum without",
},
} {
for _, delayPerSeries := range []time.Duration{
0,
time.Millisecond / 10,
} {
engine := promql.NewEngine(promql.EngineOpts{
Logger: log.NewNopLogger(),
Reg: nil,
MaxSamples: 100000000,
Timeout: time.Minute,
})
queryable := NewMockShardedQueryable(
tc.samplesPerSeries,
tc.labels,
tc.labelBuckets,
delayPerSeries,
)
downstream := &downstreamHandler{
engine: engine,
queryable: queryable,
}
var (
start int64
end = int64(1000 * tc.samplesPerSeries)
step = (end - start) / 1000
)
req := &PrometheusRequest{
Path: "/query_range",
Start: start,
End: end,
Step: step,
Timeout: time.Minute,
Query: tc.query,
}
for _, shardFactor := range shards {
shardingware := NewQueryShardMiddleware(
log.NewNopLogger(),
engine,
// ensure that all requests are shard compatible
ShardingConfigs{
chunk.PeriodConfig{
Schema: "v10",
RowShards: shardFactor,
},
},
PrometheusCodec,
0,
nil,
nil,
).Wrap(downstream)
b.Run(
fmt.Sprintf(
"desc:[%s]---shards:[%d]---series:[%.0f]---delayPerSeries:[%s]---samplesPerSeries:[%d]",
tc.desc,
shardFactor,
math.Pow(float64(tc.labelBuckets), float64(len(tc.labels))),
delayPerSeries,
tc.samplesPerSeries,
),
func(b *testing.B) {
for n := 0; n < b.N; n++ {
_, err := shardingware.Do(
context.Background(),
req,
)
if err != nil {
b.Fatal(err.Error())
}
}
},
)
}
fmt.Println()
}
fmt.Print("--------------------------------\n\n")
}
}
type downstreamHandler struct {
engine *promql.Engine
queryable storage.Queryable
}
func (h *downstreamHandler) Do(ctx context.Context, r Request) (Response, error) {
qry, err := h.engine.NewRangeQuery(
h.queryable,
r.GetQuery(),
util.TimeFromMillis(r.GetStart()),
util.TimeFromMillis(r.GetEnd()),
time.Duration(r.GetStep())*time.Millisecond,
)
if err != nil {
return nil, err
}
res := qry.Exec(ctx)
extracted, err := FromResult(res)
if err != nil {
return nil, err
}
return &PrometheusResponse{
Status: StatusSuccess,
Data: PrometheusData{
ResultType: string(res.Value.Type()),
Result: extracted,
},
}, nil
}