Label selector optimizations (#8763)

This PR introduces the following label selector optimizations:
- any selector that is `=~<literal>` is converted to an
`Equals/NotEquals` matcher
- Use the greedy->non-greedy regex improvements that exist in line and
label filters
- Never run `.*` label matchers.

Prometheus regex matchers are fully anchored against the whole label value
(the pattern is effectively wrapped in `^(?:...)$`). That means matchers of
the form `=~<literal_value>` are just testing equality.

Benchmarks -- this is just a quick bench to demonstrate equality
matchers are faster than their equivalent regex matchers:
```
BenchmarkMatcherTypes/regex-8           27694834                53.74 ns/op            0 B/op          0 allocs/op
BenchmarkMatcherTypes/equals-8          627045850                2.153 ns/op           0 B/op          0 allocs/op
```
pull/8857/head
Travis Patterson 3 years ago committed by GitHub
parent cb8a7440d4
commit 5a08a6bcb9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 49
      pkg/logql/log/filter.go
  2. 2
      pkg/logql/matchers_test.go
  3. 14
      pkg/logql/shardmapper_test.go
  4. 38
      pkg/logql/syntax/ast.go
  5. 2
      pkg/logql/syntax/parser_test.go
  6. 2
      pkg/loki/runtime_config_test.go
  7. 4
      pkg/storage/store_test.go
  8. 4
      pkg/util/matchers.go
  9. 35
      pkg/util/regex.go

@ -3,6 +3,7 @@ package log
import (
"bytes"
"fmt"
"github.com/grafana/loki/pkg/util"
"unicode"
"unicode/utf8"
@ -442,7 +443,7 @@ func parseRegexpFilter(re string, match bool, isLabel bool) (Filterer, error) {
// attempt to improve regex with tricks
f, ok := simplify(reg, isLabel)
if !ok {
allNonGreedy(reg)
util.AllNonGreedy(reg)
regex := reg.String()
if isLabel {
// label regexes are anchored to
@ -457,25 +458,6 @@ func parseRegexpFilter(re string, match bool, isLabel bool) (Filterer, error) {
return newNotFilter(f), nil
}
// allNonGreedy rewrites every greedy quantifier (`.*`, `.+`, ...) in the given
// expressions into its non-greedy form (`.*?`, `.+?`). This is only safe
// because matching is done via `Match`, which merely reports whether a match
// exists; non-greedy quantifiers let it return sooner since the longest match
// is never needed. Note that prepending `(?U)` or compiling with the
// `NonGreedy` flag is not enough: those toggle greediness, so they would also
// turn an explicit `.*?` back into a greedy `.*`.
func allNonGreedy(regs ...*syntax.Regexp) {
	// Capture groups are irrelevant for filtering; drop them first.
	clearCapture(regs...)
	for _, node := range regs {
		switch node.Op {
		case syntax.OpStar, syntax.OpPlus:
			node.Flags |= syntax.NonGreedy
		case syntax.OpCapture, syntax.OpConcat, syntax.OpAlternate:
			// Recurse into composite nodes to reach nested quantifiers.
			allNonGreedy(node.Sub...)
		}
	}
}
// simplify a regexp expression by replacing it, when possible, with a succession of literal filters.
// For example `(foo|bar)` will be replaced by `containsFilter(foo) or containsFilter(bar)`
func simplify(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
@ -485,13 +467,13 @@ func simplify(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
case syntax.OpConcat:
return simplifyConcat(reg, nil)
case syntax.OpCapture:
clearCapture(reg)
util.ClearCapture(reg)
return simplify(reg, isLabel)
case syntax.OpLiteral:
if isLabel {
return newEqualFilter([]byte(string(reg.Rune)), isCaseInsensitive(reg)), true
return newEqualFilter([]byte(string(reg.Rune)), util.IsCaseInsensitive(reg)), true
}
return newContainsFilter([]byte(string(reg.Rune)), isCaseInsensitive(reg)), true
return newContainsFilter([]byte(string(reg.Rune)), util.IsCaseInsensitive(reg)), true
case syntax.OpStar:
if reg.Sub[0].Op == syntax.OpAnyCharNotNL {
return TrueFilter, true
@ -506,23 +488,10 @@ func simplify(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
return nil, false
}
func isCaseInsensitive(reg *syntax.Regexp) bool {
return (reg.Flags & syntax.FoldCase) != 0
}
// clearCapture removes capture operation as they are not used for filtering.
func clearCapture(regs ...*syntax.Regexp) {
for _, r := range regs {
if r.Op == syntax.OpCapture {
*r = *r.Sub[0]
}
}
}
// simplifyAlternate simplifies, when possible, alternate regexp expressions such as:
// (foo|bar) or (foo|(bar|buzz)).
func simplifyAlternate(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
clearCapture(reg.Sub...)
util.ClearCapture(reg.Sub...)
// attempt to simplify the first leg
f, ok := simplify(reg.Sub[0], isLabel)
if !ok {
@ -545,7 +514,7 @@ func simplifyAlternate(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
// Or a literal and alternates operation (see simplifyConcatAlternate), which represent a multiplication of alternates.
// Anything else is rejected.
func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) {
clearCapture(reg.Sub...)
util.ClearCapture(reg.Sub...)
// remove empty match as we don't need them for filtering
i := 0
for _, r := range reg.Sub {
@ -574,7 +543,7 @@ func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) {
}
literals++
baseLiteral = append(baseLiteral, []byte(string(sub.Rune))...)
baseLiteralIsCaseInsensitive = isCaseInsensitive(sub)
baseLiteralIsCaseInsensitive = util.IsCaseInsensitive(sub)
continue
}
// if we have an alternate we must also have a base literal to apply the concatenation with.
@ -613,7 +582,7 @@ func simplifyConcatAlternate(reg *syntax.Regexp, literal []byte, curr Filterer,
// and alternate expression is marked as case insensitive. For example, for the original expression
// f|f(?i)oo the extracted expression would be "f (?:)|(?i:OO)" i.e. f with empty match
// and fOO. For fOO, we can't initialize containsFilter with caseInsensitve variable as either true or false
isAltCaseInsensitive := isCaseInsensitive(alt)
isAltCaseInsensitive := util.IsCaseInsensitive(alt)
if !baseLiteralIsCaseInsensitive && isAltCaseInsensitive {
return nil, false
}

@ -32,7 +32,7 @@ func Test_match(t *testing.T) {
{mustMatcher(labels.MatchEqual, "a", "1")},
{
mustMatcher(labels.MatchEqual, "b", "2"),
mustMatcher(labels.MatchRegexp, "c", "3"),
mustMatcher(labels.MatchEqual, "c", "3"),
mustMatcher(labels.MatchNotEqual, "d", "4"),
},
},

@ -257,11 +257,11 @@ func TestMappingStrings(t *testing.T) {
},
{
in: `avg(avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m]))`,
out: `avg(avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m]))`,
out: `avg(avg_over_time({job=~"myapps(?-s:.)*?"} |= "stats" | json busy="utilization" | unwrap busy [5m]))`,
},
{
in: `avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m])`,
out: `avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m])`,
out: `avg_over_time({job=~"myapps(?-s:.)*?"} |= "stats" | json busy="utilization" | unwrap busy [5m])`,
},
// should be noop if VectorExpr
{
@ -271,28 +271,28 @@ func TestMappingStrings(t *testing.T) {
{
// or exprs aren't shardable
in: `count_over_time({a=~".+"}[1s]) or count_over_time({a=~".+"}[1s])`,
out: `(downstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>ordownstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>)`,
out: `(downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>ordownstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>)`,
},
{
// vector() exprs aren't shardable
in: `sum(count_over_time({a=~".+"}[1s]) + vector(1))`,
out: `sum((downstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>+vector(1.000000)))`,
out: `sum((downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>+vector(1.000000)))`,
},
{
// on() is never shardable as it can mutate labels
in: `sum(count_over_time({a=~".+"}[1s]) * on () count_over_time({a=~".+"}[1s]))`,
out: `sum((downstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>*on()downstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>))`,
out: `sum((downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>*on()downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>))`,
},
{
// ignoring(<non-empty-labels>) is never shardable as it can mutate labels
in: `sum(count_over_time({a=~".+"}[1s]) * ignoring (foo) count_over_time({a=~".+"}[1s]))`,
out: `sum((downstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>*ignoring(foo)downstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>))`,
out: `sum((downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>*ignoring(foo)downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>))`,
},
{
// ignoring () doesn't mutate labels and therefore can be shardable
// as long as the operation is shardable
in: `sum(count_over_time({a=~".+"}[1s]) * ignoring () count_over_time({a=~".+"}[1s]))`,
out: `sum(downstream<sum((count_over_time({a=~".+"}[1s])*count_over_time({a=~".+"}[1s]))),shard=0_of_2>++downstream<sum((count_over_time({a=~".+"}[1s])*count_over_time({a=~".+"}[1s]))),shard=1_of_2>)`,
out: `sum(downstream<sum((count_over_time({a=~"(?-s:.)+?"}[1s])*count_over_time({a=~"(?-s:.)+?"}[1s]))),shard=0_of_2>++downstream<sum((count_over_time({a=~"(?-s:.)+?"}[1s])*count_over_time({a=~"(?-s:.)+?"}[1s]))),shard=1_of_2>)`,
},
} {
t.Run(tc.in, func(t *testing.T) {

@ -2,6 +2,7 @@ package syntax
import (
"fmt"
"github.com/grafana/loki/pkg/util"
"math"
"regexp"
"strconv"
@ -16,6 +17,7 @@ import (
"github.com/grafana/loki/pkg/logql/log"
"github.com/grafana/loki/pkg/logql/log/logfmt"
"github.com/grafana/loki/pkg/logqlmodel"
"github.com/grafana/regexp/syntax"
)
// Expr is the root expression which can be a SampleExpr or LogSelectorExpr
@ -616,6 +618,10 @@ func (l *LogfmtExpressionParser) String() string {
}
func mustNewMatcher(t labels.MatchType, n, v string) *labels.Matcher {
if t == labels.MatchRegexp || t == labels.MatchNotRegexp {
return simplifyRegexMatcher(t, n, v)
}
m, err := labels.NewMatcher(t, n, v)
if err != nil {
panic(logqlmodel.NewParseError(err.Error(), 0, 0))
@ -623,6 +629,38 @@ func mustNewMatcher(t labels.MatchType, n, v string) *labels.Matcher {
return m
}
// simplifyRegexMatcher rewrites a (not-)regex matcher into a cheaper form when
// possible: a regex that is just a literal becomes an equality matcher, and
// any remaining regex has its greedy quantifiers made non-greedy.
// It panics with a ParseError when the value is not a valid regex, matching
// the behavior callers of mustNewMatcher expect.
func simplifyRegexMatcher(typ labels.MatchType, name, value string) *labels.Matcher {
	parsed, err := syntax.Parse(value, syntax.Perl)
	if err != nil {
		panic(logqlmodel.NewParseError(err.Error(), 0, 0))
	}
	parsed = parsed.Simplify()
	if m, ok := simplify(typ, name, value, parsed); ok {
		return m
	}
	// Not reducible to an equality matcher; still make quantifiers non-greedy
	// since matching only needs match/no-match, never the longest match.
	util.AllNonGreedy(parsed)
	return labels.MustNewMatcher(typ, name, parsed.String())
}
// simplify returns an equality (or inequality, for MatchNotRegexp) matcher
// when the parsed regex is a plain case-sensitive literal, since Prometheus
// regex matchers are fully anchored and a literal regex is therefore just an
// equality test. The boolean result reports whether the rewrite applied.
func simplify(typ labels.MatchType, name, value string, reg *syntax.Regexp) (*labels.Matcher, bool) {
	switch reg.Op {
	case syntax.OpLiteral:
		// Case-insensitive literals still need regex evaluation.
		if util.IsCaseInsensitive(reg) {
			return nil, false
		}
		t := labels.MatchEqual
		if typ == labels.MatchNotRegexp {
			t = labels.MatchNotEqual
		}
		// Use the parsed literal runes rather than the raw input: the raw
		// value may contain regex syntax (e.g. `fo{1}o` or `f(?:oo)`) that
		// simplifies to a literal without being byte-equal to it.
		return labels.MustNewMatcher(t, name, string(reg.Rune)), true
	}
	return nil, false
}
func mustNewFloat(s string) float64 {
n, err := strconv.ParseFloat(s, 64)
if err != nil {

@ -3414,7 +3414,7 @@ func TestNoOpLabelToString(t *testing.T) {
logExpr := `{container_name="app"} | foo=~".*"`
l, err := ParseLogSelector(logExpr, false)
require.NoError(t, err)
require.Equal(t, logExpr, l.String())
require.Equal(t, `{container_name="app"} | foo=~"(?-s:.)*?"`, l.String())
stages, err := l.(*PipelineExpr).MultiStages.stages()
require.NoError(t, err)

@ -52,7 +52,7 @@ overrides:
}},
{Period: model.Duration(24 * time.Hour), Priority: 5, Selector: `{namespace="bar", cluster=~"fo.*|b.+|[1-2]"}`, Matchers: []*labels.Matcher{
labels.MustNewMatcher(labels.MatchEqual, "namespace", "bar"),
labels.MustNewMatcher(labels.MatchRegexp, "cluster", "fo.*|b.+|[1-2]"),
labels.MustNewMatcher(labels.MatchRegexp, "cluster", "fo(?-s:.)*?|b(?-s:.)+?|[1-2]"),
}},
}, overrides.StreamRetention("29"))
}

@ -955,7 +955,7 @@ func Test_store_decodeReq_Matchers(t *testing.T) {
"unsharded",
newQuery("{foo=~\"ba.*\"}", from, from.Add(6*time.Millisecond), nil, nil),
[]*labels.Matcher{
labels.MustNewMatcher(labels.MatchRegexp, "foo", "ba.*"),
labels.MustNewMatcher(labels.MatchRegexp, "foo", "ba(?-s:.)*?"),
labels.MustNewMatcher(labels.MatchEqual, labels.MetricName, "logs"),
},
},
@ -969,7 +969,7 @@ func Test_store_decodeReq_Matchers(t *testing.T) {
nil,
),
[]*labels.Matcher{
labels.MustNewMatcher(labels.MatchRegexp, "foo", "ba.*"),
labels.MustNewMatcher(labels.MatchRegexp, "foo", "ba(?-s:.)*?"),
labels.MustNewMatcher(labels.MatchEqual, labels.MetricName, "logs"),
labels.MustNewMatcher(
labels.MatchEqual,

@ -14,6 +14,10 @@ func SplitFiltersAndMatchers(allMatchers []*labels.Matcher) (filters, matchers [
// the index, we should ignore this matcher to fetch all possible chunks
// and then filter on the matcher after the chunks have been fetched.
if matcher.Matches("") {
// Always skip matches that match everything
if matcher.Type == labels.MatchRegexp && matcher.Value == ".*" {
continue
}
filters = append(filters, matcher)
} else {
matchers = append(matchers, matcher)

@ -0,0 +1,35 @@
package util
import "github.com/grafana/regexp/syntax"
func IsCaseInsensitive(reg *syntax.Regexp) bool {
return (reg.Flags & syntax.FoldCase) != 0
}
// AllNonGreedy rewrites every greedy quantifier (`.*`, `.+`, ...) in the given
// expressions into its non-greedy form (`.*?`, `.+?`). This is only safe
// because matching is done via `Match`, which merely reports whether a match
// exists; non-greedy quantifiers let it return sooner since the longest match
// is never needed. Note that prepending `(?U)` or compiling with the
// `NonGreedy` flag is not enough: those toggle greediness, so they would also
// turn an explicit `.*?` back into a greedy `.*`.
func AllNonGreedy(regs ...*syntax.Regexp) {
	// Capture groups are irrelevant for filtering; drop them first.
	ClearCapture(regs...)
	for _, node := range regs {
		switch node.Op {
		case syntax.OpStar, syntax.OpPlus:
			node.Flags |= syntax.NonGreedy
		case syntax.OpCapture, syntax.OpConcat, syntax.OpAlternate:
			// Recurse into composite nodes to reach nested quantifiers.
			AllNonGreedy(node.Sub...)
		}
	}
}
// ClearCapture removes capture operation as they are not used for filtering.
func ClearCapture(regs ...*syntax.Regexp) {
for _, r := range regs {
if r.Op == syntax.OpCapture {
*r = *r.Sub[0]
}
}
}
Loading…
Cancel
Save