From a8aa3f68cb46289baf912b22e427b1a18a520bcc Mon Sep 17 00:00:00 2001 From: Salva Corts Date: Fri, 1 Mar 2024 17:27:26 +0100 Subject: [PATCH] feat: Bloom filter regexes (#12096) --- pkg/logql/log/filter.go | 225 +++++++++++++++++----- pkg/storage/bloom/v1/bloom_tester.go | 94 ++++++++- pkg/storage/bloom/v1/bloom_tester_test.go | 43 ++++- 3 files changed, 312 insertions(+), 50 deletions(-) diff --git a/pkg/logql/log/filter.go b/pkg/logql/log/filter.go index 7860ad88df..7117b77805 100644 --- a/pkg/logql/log/filter.go +++ b/pkg/logql/log/filter.go @@ -6,20 +6,59 @@ import ( "unicode" "unicode/utf8" - "github.com/grafana/loki/pkg/util" - "github.com/grafana/regexp" "github.com/grafana/regexp/syntax" + "github.com/grafana/loki/pkg/util" "github.com/prometheus/prometheus/model/labels" ) +// Checker is an interface that matches against the input line or regexp. +type Checker interface { + Test(line []byte, caseInsensitive bool, equal bool) bool + TestRegex(reg *regexp.Regexp) bool +} + +// Matcher is an interface to match log lines against a Checker. +// This works in the opposite direction of Filterer. Whereas Filterer.Filter +// checks if an input log line satisfies the filter, Matcher.Matches checks if +// a filter satisfies an input log line (or regexp). +type Matcher interface { + Matches(test Checker) bool +} + // Filterer is a interface to filter log lines. 
type Filterer interface { Filter(line []byte) bool ToStage() Stage } +type MatcherFilterer interface { + Matcher + Filterer +} + +type wrapper struct { + Filterer + Matcher +} + +func (w wrapper) IsMatcher() bool { + return w.Matcher != nil +} + +func (w wrapper) IsFilterer() bool { + return w.Filterer != nil +} + +func WrapFilterer(f Filterer) MatcherFilterer { + return wrapper{Filterer: f} +} + +func WrapMatcher(m Matcher) MatcherFilterer { + return wrapper{Matcher: m} +} + // LineFilterFunc is a syntax sugar for creating line filter from a function type FiltererFunc func(line []byte) bool @@ -32,9 +71,36 @@ type trueFilter struct{} func (trueFilter) Filter(_ []byte) bool { return true } func (trueFilter) ToStage() Stage { return NoopStage } +// Matches implements Matcher +func (trueFilter) Matches(_ Checker) bool { return true } + // TrueFilter is a filter that returns and matches all log lines whatever their content. var TrueFilter = trueFilter{} +func isTrueFilter(f MatcherFilterer) bool { + if f == TrueFilter { + return true + } + + if _, ok := f.(trueFilter); ok { + return true + } + + if wrap, ok := f.(wrapper); ok { + if wrap.IsFilterer() { + if _, ok = wrap.Filterer.(trueFilter); ok { + return true + } + } + // Otherwise, it's a matcher + if _, ok = wrap.Matcher.(trueFilter); ok { + return true + } + } + + return false +} + type existsFilter struct{} func (e existsFilter) Filter(line []byte) bool { @@ -49,15 +115,18 @@ func (e existsFilter) ToStage() Stage { } } +// Matches implements Matcher +func (e existsFilter) Matches(_ Checker) bool { return true } + // ExistsFilter is a filter that returns and matches when a line has any characters. 
var ExistsFilter = existsFilter{} type notFilter struct { - Filterer + MatcherFilterer } func (n notFilter) Filter(line []byte) bool { - return !n.Filterer.Filter(line) + return !n.MatcherFilterer.Filter(line) } func (n notFilter) ToStage() Stage { @@ -68,29 +137,33 @@ func (n notFilter) ToStage() Stage { } } +func (n notFilter) Matches(test Checker) bool { + return !n.MatcherFilterer.Matches(test) +} + // NewNotFilter creates a new filter which matches only if the base filter doesn't match. // If the base filter is a `or` it will recursively simplify with `and` operations. -func NewNotFilter(base Filterer) Filterer { +func NewNotFilter(base MatcherFilterer) MatcherFilterer { // not(a|b) = not(a) and not(b) , and operation can't benefit from this optimization because both legs always needs to be executed. if or, ok := base.(orFilter); ok { return NewAndFilter(NewNotFilter(or.left), NewNotFilter(or.right)) } - return notFilter{Filterer: base} + return notFilter{MatcherFilterer: base} } type andFilter struct { - left Filterer - right Filterer + left MatcherFilterer + right MatcherFilterer } // NewAndFilter creates a new filter which matches only if left and right matches. -func NewAndFilter(left Filterer, right Filterer) Filterer { +func NewAndFilter(left MatcherFilterer, right MatcherFilterer) MatcherFilterer { // Make sure we take care of panics in case a nil or noop filter is passed. 
- if right == nil || right == TrueFilter { + if right == nil || isTrueFilter(right) { return left } - if left == nil || left == TrueFilter { + if left == nil || isTrueFilter(left) { return right } @@ -112,6 +185,10 @@ func (a andFilter) ToStage() Stage { } } +func (a andFilter) Matches(test Checker) bool { + return a.left.Matches(test) && a.right.Matches(test) +} + type andFilters struct { filters []Filterer } @@ -123,7 +200,7 @@ func NewAndFilters(filters []Filterer) Filterer { n := 0 for _, filter := range filters { // Make sure we take care of panics in case a nil or noop filter is passed. - if !(filter == nil || filter == TrueFilter) { + if !(filter == nil || isTrueFilter(WrapFilterer(filter))) { switch c := filter.(type) { case *containsFilter: // Start accumulating contains filters. @@ -190,17 +267,17 @@ func (a andFilters) ToStage() Stage { } type orFilter struct { - left Filterer - right Filterer + left MatcherFilterer + right MatcherFilterer } // newOrFilter creates a new filter which matches only if left or right matches. -func newOrFilter(left Filterer, right Filterer) Filterer { - if left == nil || left == TrueFilter { +func newOrFilter(left MatcherFilterer, right MatcherFilterer) MatcherFilterer { + if left == nil || isTrueFilter(left) { return right } - if right == nil || right == TrueFilter { + if right == nil || isTrueFilter(right) { return left } @@ -210,14 +287,19 @@ func newOrFilter(left Filterer, right Filterer) Filterer { } } -// ChainOrFilter is a syntax sugar to chain multiple `or` filters. (1 or many) -func ChainOrFilter(curr, new Filterer) Filterer { +// ChainOrMatcherFilterer is a syntax sugar to chain multiple `or` filters. (1 or many) +func ChainOrMatcherFilterer(curr, new MatcherFilterer) MatcherFilterer { if curr == nil { return new } return newOrFilter(curr, new) } +// ChainOrFilter is a syntax sugar to chain multiple `or` filters. 
(1 or many) +func ChainOrFilter(curr, new Filterer) Filterer { + return ChainOrMatcherFilterer(WrapFilterer(curr), WrapFilterer(new)) +} + func (a orFilter) Filter(line []byte) bool { return a.left.Filter(line) || a.right.Filter(line) } @@ -230,6 +312,11 @@ func (a orFilter) ToStage() Stage { } } +// Matches implements Matcher +func (a orFilter) Matches(test Checker) bool { + return a.left.Matches(test) || a.right.Matches(test) +} + type regexpFilter struct { *regexp.Regexp @@ -238,7 +325,7 @@ type regexpFilter struct { // newRegexpFilter creates a new line filter for a given regexp. // If match is false the filter is the negation of the regexp. -func newRegexpFilter(re string, orig string, match bool) (Filterer, error) { +func newRegexpFilter(re string, orig string, match bool) (MatcherFilterer, error) { reg, err := regexp.Compile(re) if err != nil { return nil, err @@ -262,6 +349,10 @@ func (r regexpFilter) ToStage() Stage { } } +func (r regexpFilter) Matches(test Checker) bool { + return test.TestRegex(r.Regexp) +} + func (r regexpFilter) String() string { return r.orig } @@ -287,11 +378,15 @@ func (l equalFilter) ToStage() Stage { } } +func (l equalFilter) Matches(test Checker) bool { + return test.Test(l.match, l.caseInsensitive, true) +} + func (l equalFilter) String() string { return string(l.match) } -func newEqualFilter(match []byte, caseInsensitive bool) Filterer { +func newEqualFilter(match []byte, caseInsensitive bool) MatcherFilterer { return equalFilter{match, caseInsensitive} } @@ -359,12 +454,17 @@ func (l containsFilter) ToStage() Stage { } } +// Matches implements Matcher +func (l containsFilter) Matches(test Checker) bool { + return test.Test(l.match, l.caseInsensitive, false) +} + func (l containsFilter) String() string { return string(l.match) } // newContainsFilter creates a contains filter that checks if a log line contains a match. 
-func newContainsFilter(match []byte, caseInsensitive bool) Filterer { +func newContainsFilter(match []byte, caseInsensitive bool) MatcherFilterer { if len(match) == 0 { return TrueFilter } @@ -406,6 +506,15 @@ func (f containsAllFilter) ToStage() Stage { } } +func (f containsAllFilter) Matches(test Checker) bool { + for _, m := range f.matches { + if !test.Test(m.match, m.caseInsensitive, false) { + return false + } + } + return true +} + // NewFilter creates a new line filter from a match string and type. func NewFilter(match string, mt labels.MatchType) (Filterer, error) { switch mt { @@ -440,7 +549,7 @@ func NewLabelFilter(match string, mt labels.MatchType) (Filterer, error) { // parseRegexpFilter parses a regexp and attempt to simplify it with only literal filters. // If not possible it will returns the original regexp filter. -func parseRegexpFilter(re string, match bool, isLabel bool) (Filterer, error) { +func parseRegexpFilter(re string, match bool, isLabel bool) (MatcherFilterer, error) { reg, err := syntax.Parse(re, syntax.Perl) if err != nil { return nil, err @@ -448,7 +557,7 @@ func parseRegexpFilter(re string, match bool, isLabel bool) (Filterer, error) { reg = reg.Simplify() // attempt to improve regex with tricks - f, ok := simplify(reg, isLabel) + filter, ok := defaultRegexSimplifier.Simplify(reg, isLabel) if !ok { util.AllNonGreedy(reg) regex := reg.String() @@ -459,28 +568,52 @@ func parseRegexpFilter(re string, match bool, isLabel bool) (Filterer, error) { } return newRegexpFilter(regex, re, match) } + if match { - return f, nil + return filter, nil + } + return NewNotFilter(filter), nil +} + +type Simplifier interface { + Simplify(reg *syntax.Regexp, isLabel bool) (Filterer, bool) +} + +type NewMatcherFiltererFunc func(match []byte, caseInsensitive bool) MatcherFilterer + +type RegexSimplifier struct { + newContainsFilter NewMatcherFiltererFunc + newEqualFilter NewMatcherFiltererFunc +} + +var defaultRegexSimplifier = 
NewRegexSimplifier(newContainsFilter, newEqualFilter) + +func NewRegexSimplifier( + newContainsFilter NewMatcherFiltererFunc, + newEqualFilter NewMatcherFiltererFunc, +) *RegexSimplifier { + return &RegexSimplifier{ + newContainsFilter: newContainsFilter, + newEqualFilter: newEqualFilter, } - return NewNotFilter(f), nil } -// simplify a regexp expression by replacing it, when possible, with a succession of literal filters. +// Simplify a regexp expression by replacing it, when possible, with a succession of literal filters. // For example `(foo|bar)` will be replaced by `containsFilter(foo) or containsFilter(bar)` -func simplify(reg *syntax.Regexp, isLabel bool) (Filterer, bool) { +func (s *RegexSimplifier) Simplify(reg *syntax.Regexp, isLabel bool) (MatcherFilterer, bool) { switch reg.Op { case syntax.OpAlternate: - return simplifyAlternate(reg, isLabel) + return s.simplifyAlternate(reg, isLabel) case syntax.OpConcat: - return simplifyConcat(reg, nil) + return s.simplifyConcat(reg, nil) case syntax.OpCapture: util.ClearCapture(reg) - return simplify(reg, isLabel) + return s.Simplify(reg, isLabel) case syntax.OpLiteral: if isLabel { - return newEqualFilter([]byte(string(reg.Rune)), util.IsCaseInsensitive(reg)), true + return s.newEqualFilter([]byte(string(reg.Rune)), util.IsCaseInsensitive(reg)), true } - return newContainsFilter([]byte(string(reg.Rune)), util.IsCaseInsensitive(reg)), true + return s.newContainsFilter([]byte(string(reg.Rune)), util.IsCaseInsensitive(reg)), true case syntax.OpStar: if reg.Sub[0].Op == syntax.OpAnyCharNotNL { return TrueFilter, true @@ -497,16 +630,16 @@ func simplify(reg *syntax.Regexp, isLabel bool) (Filterer, bool) { // simplifyAlternate simplifies, when possible, alternate regexp expressions such as: // (foo|bar) or (foo|(bar|buzz)). 
-func simplifyAlternate(reg *syntax.Regexp, isLabel bool) (Filterer, bool) { +func (s *RegexSimplifier) simplifyAlternate(reg *syntax.Regexp, isLabel bool) (MatcherFilterer, bool) { util.ClearCapture(reg.Sub...) // attempt to simplify the first leg - f, ok := simplify(reg.Sub[0], isLabel) + f, ok := s.Simplify(reg.Sub[0], isLabel) if !ok { return nil, false } // merge the rest of the legs for i := 1; i < len(reg.Sub); i++ { - f2, ok := simplify(reg.Sub[i], isLabel) + f2, ok := s.Simplify(reg.Sub[i], isLabel) if !ok { return nil, false } @@ -520,7 +653,7 @@ func simplifyAlternate(reg *syntax.Regexp, isLabel bool) (Filterer, bool) { // which is a literalFilter. // Or a literal and alternates operation (see simplifyConcatAlternate), which represent a multiplication of alternates. // Anything else is rejected. -func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) { +func (s *RegexSimplifier) simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (MatcherFilterer, bool) { util.ClearCapture(reg.Sub...) // remove empty match as we don't need them for filtering i := 0 @@ -538,7 +671,7 @@ func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) { return nil, false } - var curr Filterer + var curr MatcherFilterer var ok bool literals := 0 var baseLiteralIsCaseInsensitive bool @@ -555,7 +688,7 @@ func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) { } // if we have an alternate we must also have a base literal to apply the concatenation with. if sub.Op == syntax.OpAlternate && baseLiteral != nil { - if curr, ok = simplifyConcatAlternate(sub, baseLiteral, curr, baseLiteralIsCaseInsensitive); !ok { + if curr, ok = s.simplifyConcatAlternate(sub, baseLiteral, curr, baseLiteralIsCaseInsensitive); !ok { return nil, false } continue @@ -573,7 +706,7 @@ func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) { // if we have only a concat with literals. 
if baseLiteral != nil { - return newContainsFilter(baseLiteral, baseLiteralIsCaseInsensitive), true + return s.newContainsFilter(baseLiteral, baseLiteralIsCaseInsensitive), true } return nil, false @@ -583,7 +716,7 @@ func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) { // A concat alternate is found when a concat operation has a sub alternate and is preceded by a literal. // For instance bar|b|buzz is expressed as b(ar|(?:)|uzz) => b concat alternate(ar,(?:),uzz). // (?:) being an OpEmptyMatch and b being the literal to concat all alternates (ar,(?:),uzz) with. -func simplifyConcatAlternate(reg *syntax.Regexp, literal []byte, curr Filterer, baseLiteralIsCaseInsensitive bool) (Filterer, bool) { +func (s *RegexSimplifier) simplifyConcatAlternate(reg *syntax.Regexp, literal []byte, curr MatcherFilterer, baseLiteralIsCaseInsensitive bool) (MatcherFilterer, bool) { for _, alt := range reg.Sub { // we should not consider the case where baseLiteral is not marked as case insensitive // and alternate expression is marked as case insensitive. For example, for the original expression @@ -595,25 +728,25 @@ func simplifyConcatAlternate(reg *syntax.Regexp, literal []byte, curr Filterer, } switch alt.Op { case syntax.OpEmptyMatch: - curr = ChainOrFilter(curr, newContainsFilter(literal, baseLiteralIsCaseInsensitive)) + curr = ChainOrMatcherFilterer(curr, s.newContainsFilter(literal, baseLiteralIsCaseInsensitive)) case syntax.OpLiteral: // concat the root literal with the alternate one. altBytes := []byte(string(alt.Rune)) altLiteral := make([]byte, 0, len(literal)+len(altBytes)) altLiteral = append(altLiteral, literal...) altLiteral = append(altLiteral, altBytes...) 
- curr = ChainOrFilter(curr, newContainsFilter(altLiteral, baseLiteralIsCaseInsensitive)) + curr = ChainOrMatcherFilterer(curr, s.newContainsFilter(altLiteral, baseLiteralIsCaseInsensitive)) case syntax.OpConcat: - f, ok := simplifyConcat(alt, literal) + f, ok := s.simplifyConcat(alt, literal) if !ok { return nil, false } - curr = ChainOrFilter(curr, f) + curr = ChainOrMatcherFilterer(curr, f) case syntax.OpStar: if alt.Sub[0].Op != syntax.OpAnyCharNotNL { return nil, false } - curr = ChainOrFilter(curr, newContainsFilter(literal, baseLiteralIsCaseInsensitive)) + curr = ChainOrMatcherFilterer(curr, s.newContainsFilter(literal, baseLiteralIsCaseInsensitive)) default: return nil, false } diff --git a/pkg/storage/bloom/v1/bloom_tester.go b/pkg/storage/bloom/v1/bloom_tester.go index 19f9f8d557..450be9339c 100644 --- a/pkg/storage/bloom/v1/bloom_tester.go +++ b/pkg/storage/bloom/v1/bloom_tester.go @@ -1,8 +1,11 @@ package v1 import ( + "github.com/grafana/regexp" + regexpsyntax "github.com/grafana/regexp/syntax" "github.com/prometheus/prometheus/model/labels" + "github.com/grafana/loki/pkg/logql/log" "github.com/grafana/loki/pkg/logql/syntax" "github.com/grafana/loki/pkg/storage/bloom/v1/filter" ) @@ -59,14 +62,80 @@ func simpleFilterToBloomTest(b NGramBuilder, filter syntax.LineFilter) BloomTest } return test case labels.MatchRegexp, labels.MatchNotRegexp: - // TODO(salvacorts): Simplify regex similarly to how it's done at pkg/logql/log/filter.go (`simplify` function) - // Ideally we want to extract the simplify logic into pkg/util/regex.go - return MatchAll + reg, err := regexpsyntax.Parse(filter.Match, regexpsyntax.Perl) + if err != nil { + // TODO: log error + return MatchAll + } + reg = reg.Simplify() + + simplifier := log.NewRegexSimplifier(newStringFilterFunc(b), newStringFilterFunc(b)) + matcher, ok := simplifier.Simplify(reg, false) + if !ok { + // If the regex simplifier fails, we default to MatchAll + return MatchAll + } + + var test BloomTest = 
matcherFilterWrapper{filter: matcher} + if filter.Ty == labels.MatchNotRegexp { + test = newNotTest(test) + } + return test default: return MatchAll } } +type bloomCheckerWrapper struct { + bloom filter.Checker +} + +// Test implements the log.Checker interface +func (b bloomCheckerWrapper) Test(line []byte, _ bool, _ bool) bool { + return b.bloom.Test(line) +} + +// TestRegex implements the log.Checker interface +func (b bloomCheckerWrapper) TestRegex(_ *regexp.Regexp) bool { + // We won't support regexes in bloom filters so we just return true + return true +} + +type logCheckerWrapper struct { + checker log.Checker +} + +// Test implements the filter.Checker interface +func (l logCheckerWrapper) Test(data []byte) bool { + return l.checker.Test(data, true, false) +} + +type matcherFilterWrapper struct { + filter log.Matcher +} + +func (m matcherFilterWrapper) Matches(bloom filter.Checker) bool { + return m.filter.Matches(bloomCheckerWrapper{bloom}) +} + +func (m matcherFilterWrapper) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefixLen int) bool { + return m.filter.Matches(bloomCheckerWrapper{prefixedChecker{ + checker: bloom, + buf: buf, + prefixLen: prefixLen, + }}) +} + +type prefixedChecker struct { + checker filter.Checker + buf []byte + prefixLen int +} + +func (p prefixedChecker) Test(data []byte) bool { + return p.checker.Test(append(p.buf[:p.prefixLen], data...)) +} + type matchAllTest struct{} var MatchAll = matchAllTest{} @@ -101,6 +170,7 @@ func newStringTest(b NGramBuilder, search string) stringTest { return test } +// Matches implements the BloomTest interface func (b stringTest) Matches(bloom filter.Checker) bool { for _, ngram := range b.ngrams { if !bloom.Test(ngram) { @@ -110,6 +180,7 @@ func (b stringTest) Matches(bloom filter.Checker) bool { return true } +// MatchesWithPrefixBuf implements the BloomTest interface func (b stringTest) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefixLen int) bool { for _, ngram := range 
b.ngrams { buf = append(buf[:prefixLen], ngram...) @@ -120,6 +191,23 @@ func (b stringTest) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefi return true } +type stringMatcherFilter struct { + test stringTest +} + +// Matches implements the log.Matcher interface +func (b stringMatcherFilter) Matches(test log.Checker) bool { + return b.test.Matches(logCheckerWrapper{test}) +} + +func newStringFilterFunc(b NGramBuilder) log.NewMatcherFiltererFunc { + return func(match []byte, caseInsensitive bool) log.MatcherFilterer { + return log.WrapMatcher(stringMatcherFilter{ + test: newStringTest(b, string(match)), + }) + } +} + type notTest struct { BloomTest } diff --git a/pkg/storage/bloom/v1/bloom_tester_test.go b/pkg/storage/bloom/v1/bloom_tester_test.go index d99984f7b8..c873887aca 100644 --- a/pkg/storage/bloom/v1/bloom_tester_test.go +++ b/pkg/storage/bloom/v1/bloom_tester_test.go @@ -112,7 +112,48 @@ func TestFiltersToBloomTests(t *testing.T) { bloom: fakeBloom{"foo", "bar", "baz", "fuzz"}, expectMatch: true, }, - // TODO: test regexes + { + name: "regex match all star", + query: `{app="fake"} |~ ".*"`, + bloom: fakeBloom{"foo", "bar"}, + expectMatch: true, + }, + { + name: "regex match all plus", + query: `{app="fake"} |~ ".+"`, + bloom: fakeBloom{"foo", "bar"}, + expectMatch: true, + }, + { + name: "regex match none", + query: `{app="fake"} !~ ".*"`, + bloom: fakeBloom{"foo", "bar"}, + expectMatch: false, + }, + { + name: "regex match", + query: `{app="fake"} |~ "nope|.*foo.*"`, + bloom: fakeBloom{"foo", "bar"}, + expectMatch: true, + }, + { + name: "regex no match", + query: `{app="fake"} !~ "nope|.*foo.*"`, + bloom: fakeBloom{"foo", "bar"}, + expectMatch: false, + }, + { + name: "complex regex match", + query: `{app="fake"} |~ "(nope|.*not.*|.*foo.*)" or "(no|ba)" !~ "noz.*" or "(nope|not)"`, + bloom: fakeBloom{"foo", "bar", "baz", "fuzz"}, + expectMatch: true, + }, + { + name: "complex regex no match", + query: `{app="fake"} |~ "(nope|.*not.*|.*foo.*)" 
or "(no|ba)" !~ "noz.*"`, + bloom: fakeBloom{"foo", "bar", "baz", "fuzz", "noz"}, + expectMatch: false, + }, } { t.Run(tc.name, func(t *testing.T) { expr, err := syntax.ParseExpr(tc.query)