Label selector optimizations (#8763)

This PR introduces the following label selector optimizations:
- any selector that is `=~<literal>` is converted to an
`Equals/NotEquals` matcher
- Use the greedy->non-greedy regex improvements that exist in line and
label filters
- Never run `.*` label matchers.

Prometheus regex matchers are fully anchored against the whole label value
(the pattern is effectively wrapped in `^(?:...)$`). That means matchers of
the form `=~<literal_value>` are just testing equality.

Benchmarks -- this is just a quick bench to demonstrate equality
matchers are faster than their equivalent regex matchers:
```
BenchmarkMatcherTypes/regex-8           27694834                53.74 ns/op            0 B/op          0 allocs/op
BenchmarkMatcherTypes/equals-8          627045850                2.153 ns/op           0 B/op          0 allocs/op
```
pull/8857/head
Travis Patterson 3 years ago committed by GitHub
parent cb8a7440d4
commit 5a08a6bcb9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 49
      pkg/logql/log/filter.go
  2. 2
      pkg/logql/matchers_test.go
  3. 14
      pkg/logql/shardmapper_test.go
  4. 38
      pkg/logql/syntax/ast.go
  5. 2
      pkg/logql/syntax/parser_test.go
  6. 2
      pkg/loki/runtime_config_test.go
  7. 4
      pkg/storage/store_test.go
  8. 4
      pkg/util/matchers.go
  9. 35
      pkg/util/regex.go

@ -3,6 +3,7 @@ package log
import (
"bytes"
"fmt"
"github.com/grafana/loki/pkg/util"
"unicode"
"unicode/utf8"
@ -442,7 +443,7 @@ func parseRegexpFilter(re string, match bool, isLabel bool) (Filterer, error) {
// attempt to improve regex with tricks
f, ok := simplify(reg, isLabel)
if !ok {
allNonGreedy(reg)
util.AllNonGreedy(reg)
regex := reg.String()
if isLabel {
// label regexes are anchored to
@ -457,25 +458,6 @@ func parseRegexpFilter(re string, match bool, isLabel bool) (Filterer, error) {
return newNotFilter(f), nil
}
// allNonGreedy rewrites every greedy quantifier (`.*`, `.+`, ...) in the given
// expressions into its non-greedy form (`.*?`, `.+?`). This is only safe
// because matching is done via `Match`, which merely reports whether a match
// exists; non-greedy quantifiers let it return sooner since the longest match
// is never needed. Note that prepending `(?U)` or compiling with the
// `NonGreedy` flag is not enough: those toggle greediness, so they would also
// turn an explicit `.*?` back into a greedy `.*`.
func allNonGreedy(regs ...*syntax.Regexp) {
	// Capture groups are irrelevant for filtering; drop them first.
	clearCapture(regs...)
	for _, node := range regs {
		switch node.Op {
		case syntax.OpStar, syntax.OpPlus:
			node.Flags |= syntax.NonGreedy
		case syntax.OpCapture, syntax.OpConcat, syntax.OpAlternate:
			// Recurse into composite nodes to reach nested quantifiers.
			allNonGreedy(node.Sub...)
		}
	}
}
// simplify a regexp expression by replacing it, when possible, with a succession of literal filters.
// For example `(foo|bar)` will be replaced by `containsFilter(foo) or containsFilter(bar)`
func simplify(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
@ -485,13 +467,13 @@ func simplify(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
case syntax.OpConcat:
return simplifyConcat(reg, nil)
case syntax.OpCapture:
clearCapture(reg)
util.ClearCapture(reg)
return simplify(reg, isLabel)
case syntax.OpLiteral:
if isLabel {
return newEqualFilter([]byte(string(reg.Rune)), isCaseInsensitive(reg)), true
return newEqualFilter([]byte(string(reg.Rune)), util.IsCaseInsensitive(reg)), true
}
return newContainsFilter([]byte(string(reg.Rune)), isCaseInsensitive(reg)), true
return newContainsFilter([]byte(string(reg.Rune)), util.IsCaseInsensitive(reg)), true
case syntax.OpStar:
if reg.Sub[0].Op == syntax.OpAnyCharNotNL {
return TrueFilter, true
@ -506,23 +488,10 @@ func simplify(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
return nil, false
}
func isCaseInsensitive(reg *syntax.Regexp) bool {
return (reg.Flags & syntax.FoldCase) != 0
}
// clearCapture removes capture operation as they are not used for filtering.
func clearCapture(regs ...*syntax.Regexp) {
for _, r := range regs {
if r.Op == syntax.OpCapture {
*r = *r.Sub[0]
}
}
}
// simplifyAlternate simplifies, when possible, alternate regexp expressions such as:
// (foo|bar) or (foo|(bar|buzz)).
func simplifyAlternate(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
clearCapture(reg.Sub...)
util.ClearCapture(reg.Sub...)
// attempt to simplify the first leg
f, ok := simplify(reg.Sub[0], isLabel)
if !ok {
@ -545,7 +514,7 @@ func simplifyAlternate(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
// Or a literal and alternates operation (see simplifyConcatAlternate), which represent a multiplication of alternates.
// Anything else is rejected.
func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) {
clearCapture(reg.Sub...)
util.ClearCapture(reg.Sub...)
// remove empty match as we don't need them for filtering
i := 0
for _, r := range reg.Sub {
@ -574,7 +543,7 @@ func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) {
}
literals++
baseLiteral = append(baseLiteral, []byte(string(sub.Rune))...)
baseLiteralIsCaseInsensitive = isCaseInsensitive(sub)
baseLiteralIsCaseInsensitive = util.IsCaseInsensitive(sub)
continue
}
// if we have an alternate we must also have a base literal to apply the concatenation with.
@ -613,7 +582,7 @@ func simplifyConcatAlternate(reg *syntax.Regexp, literal []byte, curr Filterer,
// and alternate expression is marked as case insensitive. For example, for the original expression
// f|f(?i)oo the extracted expression would be "f (?:)|(?i:OO)" i.e. f with empty match
// and fOO. For fOO, we can't initialize containsFilter with caseInsensitve variable as either true or false
isAltCaseInsensitive := isCaseInsensitive(alt)
isAltCaseInsensitive := util.IsCaseInsensitive(alt)
if !baseLiteralIsCaseInsensitive && isAltCaseInsensitive {
return nil, false
}

@ -32,7 +32,7 @@ func Test_match(t *testing.T) {
{mustMatcher(labels.MatchEqual, "a", "1")},
{
mustMatcher(labels.MatchEqual, "b", "2"),
mustMatcher(labels.MatchRegexp, "c", "3"),
mustMatcher(labels.MatchEqual, "c", "3"),
mustMatcher(labels.MatchNotEqual, "d", "4"),
},
},

@ -257,11 +257,11 @@ func TestMappingStrings(t *testing.T) {
},
{
in: `avg(avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m]))`,
out: `avg(avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m]))`,
out: `avg(avg_over_time({job=~"myapps(?-s:.)*?"} |= "stats" | json busy="utilization" | unwrap busy [5m]))`,
},
{
in: `avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m])`,
out: `avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m])`,
out: `avg_over_time({job=~"myapps(?-s:.)*?"} |= "stats" | json busy="utilization" | unwrap busy [5m])`,
},
// should be noop if VectorExpr
{
@ -271,28 +271,28 @@ func TestMappingStrings(t *testing.T) {
{
// or exprs aren't shardable
in: `count_over_time({a=~".+"}[1s]) or count_over_time({a=~".+"}[1s])`,
out: `(downstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>ordownstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>)`,
out: `(downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>ordownstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>)`,
},
{
// vector() exprs aren't shardable
in: `sum(count_over_time({a=~".+"}[1s]) + vector(1))`,
out: `sum((downstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>+vector(1.000000)))`,
out: `sum((downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>+vector(1.000000)))`,
},
{
// on() is never shardable as it can mutate labels
in: `sum(count_over_time({a=~".+"}[1s]) * on () count_over_time({a=~".+"}[1s]))`,
out: `sum((downstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>*on()downstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>))`,
out: `sum((downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>*on()downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>))`,
},
{
// ignoring(<non-empty-labels>) is never shardable as it can mutate labels
in: `sum(count_over_time({a=~".+"}[1s]) * ignoring (foo) count_over_time({a=~".+"}[1s]))`,
out: `sum((downstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>*ignoring(foo)downstream<count_over_time({a=~".+"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~".+"}[1s]),shard=1_of_2>))`,
out: `sum((downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>*ignoring(foo)downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=0_of_2>++downstream<count_over_time({a=~"(?-s:.)+?"}[1s]),shard=1_of_2>))`,
},
{
// ignoring () doesn't mutate labels and therefore can be shardable
// as long as the operation is shardable
in: `sum(count_over_time({a=~".+"}[1s]) * ignoring () count_over_time({a=~".+"}[1s]))`,
out: `sum(downstream<sum((count_over_time({a=~".+"}[1s])*count_over_time({a=~".+"}[1s]))),shard=0_of_2>++downstream<sum((count_over_time({a=~".+"}[1s])*count_over_time({a=~".+"}[1s]))),shard=1_of_2>)`,
out: `sum(downstream<sum((count_over_time({a=~"(?-s:.)+?"}[1s])*count_over_time({a=~"(?-s:.)+?"}[1s]))),shard=0_of_2>++downstream<sum((count_over_time({a=~"(?-s:.)+?"}[1s])*count_over_time({a=~"(?-s:.)+?"}[1s]))),shard=1_of_2>)`,
},
} {
t.Run(tc.in, func(t *testing.T) {

@ -2,6 +2,7 @@ package syntax
import (
"fmt"
"github.com/grafana/loki/pkg/util"
"math"
"regexp"
"strconv"
@ -16,6 +17,7 @@ import (
"github.com/grafana/loki/pkg/logql/log"
"github.com/grafana/loki/pkg/logql/log/logfmt"
"github.com/grafana/loki/pkg/logqlmodel"
"github.com/grafana/regexp/syntax"
)
// Expr is the root expression which can be a SampleExpr or LogSelectorExpr
@ -616,6 +618,10 @@ func (l *LogfmtExpressionParser) String() string {
}
func mustNewMatcher(t labels.MatchType, n, v string) *labels.Matcher {
if t == labels.MatchRegexp || t == labels.MatchNotRegexp {
return simplifyRegexMatcher(t, n, v)
}
m, err := labels.NewMatcher(t, n, v)
if err != nil {
panic(logqlmodel.NewParseError(err.Error(), 0, 0))
@ -623,6 +629,38 @@ func mustNewMatcher(t labels.MatchType, n, v string) *labels.Matcher {
return m
}
// simplifyRegexMatcher rewrites a (not-)regex matcher into a cheaper form when
// possible: a regex that is just a literal becomes an equality matcher, and
// any remaining regex has its greedy quantifiers made non-greedy.
// It panics with a ParseError when the value is not a valid regex, matching
// the behavior callers of mustNewMatcher expect.
func simplifyRegexMatcher(typ labels.MatchType, name, value string) *labels.Matcher {
	parsed, err := syntax.Parse(value, syntax.Perl)
	if err != nil {
		panic(logqlmodel.NewParseError(err.Error(), 0, 0))
	}
	parsed = parsed.Simplify()
	if m, ok := simplify(typ, name, value, parsed); ok {
		return m
	}
	// Not reducible to an equality matcher; still make quantifiers non-greedy
	// since matching only needs match/no-match, never the longest match.
	util.AllNonGreedy(parsed)
	return labels.MustNewMatcher(typ, name, parsed.String())
}
// simplify returns an equality (or inequality, for MatchNotRegexp) matcher
// when the parsed regex is a plain case-sensitive literal, since Prometheus
// regex matchers are fully anchored and a literal regex is therefore just an
// equality test. The boolean result reports whether the rewrite applied.
func simplify(typ labels.MatchType, name, value string, reg *syntax.Regexp) (*labels.Matcher, bool) {
	switch reg.Op {
	case syntax.OpLiteral:
		// Case-insensitive literals still need regex evaluation.
		if util.IsCaseInsensitive(reg) {
			return nil, false
		}
		t := labels.MatchEqual
		if typ == labels.MatchNotRegexp {
			t = labels.MatchNotEqual
		}
		// Use the parsed literal runes rather than the raw input: the raw
		// value may contain regex syntax (e.g. `fo{1}o` or `f(?:oo)`) that
		// simplifies to a literal without being byte-equal to it.
		return labels.MustNewMatcher(t, name, string(reg.Rune)), true
	}
	return nil, false
}
func mustNewFloat(s string) float64 {
n, err := strconv.ParseFloat(s, 64)
if err != nil {

@ -3414,7 +3414,7 @@ func TestNoOpLabelToString(t *testing.T) {
logExpr := `{container_name="app"} | foo=~".*"`
l, err := ParseLogSelector(logExpr, false)
require.NoError(t, err)
require.Equal(t, logExpr, l.String())
require.Equal(t, `{container_name="app"} | foo=~"(?-s:.)*?"`, l.String())
stages, err := l.(*PipelineExpr).MultiStages.stages()
require.NoError(t, err)

@ -52,7 +52,7 @@ overrides:
}},
{Period: model.Duration(24 * time.Hour), Priority: 5, Selector: `{namespace="bar", cluster=~"fo.*|b.+|[1-2]"}`, Matchers: []*labels.Matcher{
labels.MustNewMatcher(labels.MatchEqual, "namespace", "bar"),
labels.MustNewMatcher(labels.MatchRegexp, "cluster", "fo.*|b.+|[1-2]"),
labels.MustNewMatcher(labels.MatchRegexp, "cluster", "fo(?-s:.)*?|b(?-s:.)+?|[1-2]"),
}},
}, overrides.StreamRetention("29"))
}

@ -955,7 +955,7 @@ func Test_store_decodeReq_Matchers(t *testing.T) {
"unsharded",
newQuery("{foo=~\"ba.*\"}", from, from.Add(6*time.Millisecond), nil, nil),
[]*labels.Matcher{
labels.MustNewMatcher(labels.MatchRegexp, "foo", "ba.*"),
labels.MustNewMatcher(labels.MatchRegexp, "foo", "ba(?-s:.)*?"),
labels.MustNewMatcher(labels.MatchEqual, labels.MetricName, "logs"),
},
},
@ -969,7 +969,7 @@ func Test_store_decodeReq_Matchers(t *testing.T) {
nil,
),
[]*labels.Matcher{
labels.MustNewMatcher(labels.MatchRegexp, "foo", "ba.*"),
labels.MustNewMatcher(labels.MatchRegexp, "foo", "ba(?-s:.)*?"),
labels.MustNewMatcher(labels.MatchEqual, labels.MetricName, "logs"),
labels.MustNewMatcher(
labels.MatchEqual,

@ -14,6 +14,10 @@ func SplitFiltersAndMatchers(allMatchers []*labels.Matcher) (filters, matchers [
// the index, we should ignore this matcher to fetch all possible chunks
// and then filter on the matcher after the chunks have been fetched.
if matcher.Matches("") {
// Always skip matches that match everything
if matcher.Type == labels.MatchRegexp && matcher.Value == ".*" {
continue
}
filters = append(filters, matcher)
} else {
matchers = append(matchers, matcher)

@ -0,0 +1,35 @@
package util
import "github.com/grafana/regexp/syntax"
func IsCaseInsensitive(reg *syntax.Regexp) bool {
return (reg.Flags & syntax.FoldCase) != 0
}
// AllNonGreedy rewrites every greedy quantifier (`.*`, `.+`, ...) in the given
// expressions into its non-greedy form (`.*?`, `.+?`). This is only safe
// because matching is done via `Match`, which merely reports whether a match
// exists; non-greedy quantifiers let it return sooner since the longest match
// is never needed. Note that prepending `(?U)` or compiling with the
// `NonGreedy` flag is not enough: those toggle greediness, so they would also
// turn an explicit `.*?` back into a greedy `.*`.
func AllNonGreedy(regs ...*syntax.Regexp) {
	// Capture groups are irrelevant for filtering; drop them first.
	ClearCapture(regs...)
	for _, node := range regs {
		switch node.Op {
		case syntax.OpStar, syntax.OpPlus:
			node.Flags |= syntax.NonGreedy
		case syntax.OpCapture, syntax.OpConcat, syntax.OpAlternate:
			// Recurse into composite nodes to reach nested quantifiers.
			AllNonGreedy(node.Sub...)
		}
	}
}
// ClearCapture removes capture operation as they are not used for filtering.
func ClearCapture(regs ...*syntax.Regexp) {
for _, r := range regs {
if r.Op == syntax.OpCapture {
*r = *r.Sub[0]
}
}
}
Loading…
Cancel
Save