feat: Bloom filter regexes (#12096)

pull/12107/head
Salva Corts 1 year ago committed by GitHub
parent 90bcaea724
commit a8aa3f68cb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 225
      pkg/logql/log/filter.go
  2. 94
      pkg/storage/bloom/v1/bloom_tester.go
  3. 43
      pkg/storage/bloom/v1/bloom_tester_test.go

@ -6,20 +6,59 @@ import (
"unicode"
"unicode/utf8"
"github.com/grafana/loki/pkg/util"
"github.com/grafana/regexp"
"github.com/grafana/regexp/syntax"
"github.com/grafana/loki/pkg/util"
"github.com/prometheus/prometheus/model/labels"
)
// Checker is an interface that matches against the input line or regexp.
//
// Test receives a literal together with the caseInsensitive and equal flags of
// the filter that calls it (equalFilter passes equal=true, containsFilter and
// containsAllFilter pass equal=false). TestRegex receives the compiled regexp
// of a regexpFilter.
type Checker interface {
Test(line []byte, caseInsensitive bool, equal bool) bool
TestRegex(reg *regexp.Regexp) bool
}
// Matcher is an interface to match log lines against a Checker.
// This works in the opposite direction of Filterer. Whereas Filterer.Filter
// checks if an input log line satisfies the filter, Matcher.Matches checks if
// a filter satisfies an input log line (or regexp).
type Matcher interface {
Matches(test Checker) bool
}
// Filterer is an interface to filter log lines.
type Filterer interface {
// Filter reports whether the given log line satisfies the filter.
Filter(line []byte) bool
// ToStage returns a Stage for this filter.
ToStage() Stage
}
// MatcherFilterer is the union of the Matcher and Filterer interfaces.
type MatcherFilterer interface {
Matcher
Filterer
}
// wrapper adapts a value that implements only Filterer or only Matcher to the
// MatcherFilterer interface. WrapFilterer and WrapMatcher each populate a
// single embedded field and leave the other nil, so calling a method promoted
// from the nil field panics; use IsMatcher / IsFilterer to check first.
type wrapper struct {
Filterer
Matcher
}
// IsMatcher reports whether the wrapper holds a non-nil Matcher.
func (w wrapper) IsMatcher() bool {
return w.Matcher != nil
}
// IsFilterer reports whether the wrapper holds a non-nil Filterer.
func (w wrapper) IsFilterer() bool {
return w.Filterer != nil
}
// WrapFilterer adapts f to the MatcherFilterer interface.
//
// A nil f yields a nil MatcherFilterer so that the `== nil` guards of callers
// keep working; wrapping nil would produce a non-nil value whose Filter method
// panics. If f already implements MatcherFilterer it is returned unchanged,
// which preserves its own Matches implementation and its concrete type for
// type-based checks. Otherwise f is wrapped; the result has no Matcher, so
// Matches must not be called on it.
func WrapFilterer(f Filterer) MatcherFilterer {
	if f == nil {
		return nil
	}
	if mf, ok := f.(MatcherFilterer); ok {
		return mf
	}
	return wrapper{Filterer: f}
}
// WrapMatcher adapts m to the MatcherFilterer interface. The returned value
// has no Filterer, so only Matches may be called on it.
func WrapMatcher(m Matcher) MatcherFilterer {
return wrapper{Matcher: m}
}
// FiltererFunc is a syntax sugar for creating line filter from a function
type FiltererFunc func(line []byte) bool
@ -32,9 +71,36 @@ type trueFilter struct{}
// Filter implements Filterer; it accepts every line.
func (trueFilter) Filter(_ []byte) bool { return true }
// ToStage implements Filterer; it returns NoopStage.
func (trueFilter) ToStage() Stage { return NoopStage }
// Matches implements Matcher; it reports true for any Checker.
func (trueFilter) Matches(_ Checker) bool { return true }
// TrueFilter is a filter that returns and matches all log lines whatever their content.
var TrueFilter = trueFilter{}
// isTrueFilter reports whether f is the trueFilter, either directly or held in
// the Filterer or Matcher slot of a wrapper. A nil f is not a true filter.
func isTrueFilter(f MatcherFilterer) bool {
	switch v := f.(type) {
	case trueFilter:
		return true
	case wrapper:
		// A type assertion on a nil embedded interface simply fails, so both
		// slots can be probed without checking which one is populated.
		if _, isTrue := v.Filterer.(trueFilter); isTrue {
			return true
		}
		_, isTrue := v.Matcher.(trueFilter)
		return isTrue
	default:
		return false
	}
}
type existsFilter struct{}
func (e existsFilter) Filter(line []byte) bool {
@ -49,15 +115,18 @@ func (e existsFilter) ToStage() Stage {
}
}
// Matches implements Matcher. It reports true for any Checker without
// consulting it.
func (e existsFilter) Matches(_ Checker) bool { return true }
// ExistsFilter is a filter that returns and matches when a line has any characters.
var ExistsFilter = existsFilter{}
type notFilter struct {
Filterer
MatcherFilterer
}
func (n notFilter) Filter(line []byte) bool {
return !n.Filterer.Filter(line)
return !n.MatcherFilterer.Filter(line)
}
func (n notFilter) ToStage() Stage {
@ -68,29 +137,33 @@ func (n notFilter) ToStage() Stage {
}
}
// Matches implements Matcher by negating the result of the wrapped
// MatcherFilterer's Matches.
func (n notFilter) Matches(test Checker) bool {
return !n.MatcherFilterer.Matches(test)
}
// NewNotFilter creates a new filter which matches only if the base filter doesn't match.
// If the base filter is a `or` it will recursively simplify with `and` operations.
func NewNotFilter(base Filterer) Filterer {
func NewNotFilter(base MatcherFilterer) MatcherFilterer {
// not(a|b) = not(a) and not(b) , and operation can't benefit from this optimization because both legs always needs to be executed.
if or, ok := base.(orFilter); ok {
return NewAndFilter(NewNotFilter(or.left), NewNotFilter(or.right))
}
return notFilter{Filterer: base}
return notFilter{MatcherFilterer: base}
}
type andFilter struct {
left Filterer
right Filterer
left MatcherFilterer
right MatcherFilterer
}
// NewAndFilter creates a new filter which matches only if left and right matches.
func NewAndFilter(left Filterer, right Filterer) Filterer {
func NewAndFilter(left MatcherFilterer, right MatcherFilterer) MatcherFilterer {
// Make sure we take care of panics in case a nil or noop filter is passed.
if right == nil || right == TrueFilter {
if right == nil || isTrueFilter(right) {
return left
}
if left == nil || left == TrueFilter {
if left == nil || isTrueFilter(left) {
return right
}
@ -112,6 +185,10 @@ func (a andFilter) ToStage() Stage {
}
}
// Matches implements Matcher. It reports true only if both legs match; the
// right leg is not evaluated when the left one does not match.
func (a andFilter) Matches(test Checker) bool {
return a.left.Matches(test) && a.right.Matches(test)
}
type andFilters struct {
filters []Filterer
}
@ -123,7 +200,7 @@ func NewAndFilters(filters []Filterer) Filterer {
n := 0
for _, filter := range filters {
// Make sure we take care of panics in case a nil or noop filter is passed.
if !(filter == nil || filter == TrueFilter) {
if !(filter == nil || isTrueFilter(WrapFilterer(filter))) {
switch c := filter.(type) {
case *containsFilter:
// Start accumulating contains filters.
@ -190,17 +267,17 @@ func (a andFilters) ToStage() Stage {
}
type orFilter struct {
left Filterer
right Filterer
left MatcherFilterer
right MatcherFilterer
}
// newOrFilter creates a new filter which matches only if left or right matches.
func newOrFilter(left Filterer, right Filterer) Filterer {
if left == nil || left == TrueFilter {
func newOrFilter(left MatcherFilterer, right MatcherFilterer) MatcherFilterer {
if left == nil || isTrueFilter(left) {
return right
}
if right == nil || right == TrueFilter {
if right == nil || isTrueFilter(right) {
return left
}
@ -210,14 +287,19 @@ func newOrFilter(left Filterer, right Filterer) Filterer {
}
}
// ChainOrFilter is a syntax sugar to chain multiple `or` filters. (1 or many)
func ChainOrFilter(curr, new Filterer) Filterer {
// ChainOrMatcherFilterer chains new onto curr with an `or` (1 or many).
// When there is no current filter yet, new is returned as is.
func ChainOrMatcherFilterer(curr, new MatcherFilterer) MatcherFilterer {
	if curr != nil {
		return newOrFilter(curr, new)
	}
	return new
}
// ChainOrFilter is a syntax sugar to chain multiple `or` filters. (1 or many)
//
// nil operands are resolved here, before wrapping, so that they can never
// reach the `or` construction as a non-nil wrapper around a nil Filterer,
// which would panic as soon as Filter is called on it. With a nil operand the
// other one is returned unchanged.
func ChainOrFilter(curr, new Filterer) Filterer {
	if curr == nil {
		return new
	}
	if new == nil {
		return curr
	}
	return ChainOrMatcherFilterer(WrapFilterer(curr), WrapFilterer(new))
}
func (a orFilter) Filter(line []byte) bool {
return a.left.Filter(line) || a.right.Filter(line)
}
@ -230,6 +312,11 @@ func (a orFilter) ToStage() Stage {
}
}
// Matches implements Matcher. It reports true if either leg matches; the
// right leg is not evaluated when the left one already matches.
func (a orFilter) Matches(test Checker) bool {
return a.left.Matches(test) || a.right.Matches(test)
}
type regexpFilter struct {
*regexp.Regexp
@ -238,7 +325,7 @@ type regexpFilter struct {
// newRegexpFilter creates a new line filter for a given regexp.
// If match is false the filter is the negation of the regexp.
func newRegexpFilter(re string, orig string, match bool) (Filterer, error) {
func newRegexpFilter(re string, orig string, match bool) (MatcherFilterer, error) {
reg, err := regexp.Compile(re)
if err != nil {
return nil, err
@ -262,6 +349,10 @@ func (r regexpFilter) ToStage() Stage {
}
}
// Matches implements Matcher by handing the compiled regexp to the Checker's
// TestRegex.
func (r regexpFilter) Matches(test Checker) bool {
return test.TestRegex(r.Regexp)
}
func (r regexpFilter) String() string {
return r.orig
}
@ -287,11 +378,15 @@ func (l equalFilter) ToStage() Stage {
}
}
// Matches implements Matcher. It tests the filter's literal against the
// Checker with equal=true.
func (l equalFilter) Matches(test Checker) bool {
return test.Test(l.match, l.caseInsensitive, true)
}
func (l equalFilter) String() string {
return string(l.match)
}
func newEqualFilter(match []byte, caseInsensitive bool) Filterer {
func newEqualFilter(match []byte, caseInsensitive bool) MatcherFilterer {
return equalFilter{match, caseInsensitive}
}
@ -359,12 +454,17 @@ func (l containsFilter) ToStage() Stage {
}
}
// Matches implements Matcher. It tests the filter's literal against the
// Checker with equal=false.
func (l containsFilter) Matches(test Checker) bool {
return test.Test(l.match, l.caseInsensitive, false)
}
func (l containsFilter) String() string {
return string(l.match)
}
// newContainsFilter creates a contains filter that checks if a log line contains a match.
func newContainsFilter(match []byte, caseInsensitive bool) Filterer {
func newContainsFilter(match []byte, caseInsensitive bool) MatcherFilterer {
if len(match) == 0 {
return TrueFilter
}
@ -406,6 +506,15 @@ func (f containsAllFilter) ToStage() Stage {
}
}
// Matches implements Matcher. Every literal held by the filter must pass the
// Checker's Test (with equal=false); evaluation stops at the first one that
// does not. A filter without literals matches.
func (f containsAllFilter) Matches(test Checker) bool {
	for i := range f.matches {
		literal := f.matches[i]
		if passed := test.Test(literal.match, literal.caseInsensitive, false); !passed {
			return false
		}
	}
	return true
}
// NewFilter creates a new line filter from a match string and type.
func NewFilter(match string, mt labels.MatchType) (Filterer, error) {
switch mt {
@ -440,7 +549,7 @@ func NewLabelFilter(match string, mt labels.MatchType) (Filterer, error) {
// parseRegexpFilter parses a regexp and attempts to simplify it with only literal filters.
// If that is not possible, it returns the original regexp filter.
func parseRegexpFilter(re string, match bool, isLabel bool) (Filterer, error) {
func parseRegexpFilter(re string, match bool, isLabel bool) (MatcherFilterer, error) {
reg, err := syntax.Parse(re, syntax.Perl)
if err != nil {
return nil, err
@ -448,7 +557,7 @@ func parseRegexpFilter(re string, match bool, isLabel bool) (Filterer, error) {
reg = reg.Simplify()
// attempt to improve regex with tricks
f, ok := simplify(reg, isLabel)
filter, ok := defaultRegexSimplifier.Simplify(reg, isLabel)
if !ok {
util.AllNonGreedy(reg)
regex := reg.String()
@ -459,28 +568,52 @@ func parseRegexpFilter(re string, match bool, isLabel bool) (Filterer, error) {
}
return newRegexpFilter(regex, re, match)
}
if match {
return f, nil
return filter, nil
}
return NewNotFilter(filter), nil
}
// Simplifier turns a parsed regexp into a Filterer. The boolean result
// presumably reports whether the simplification was possible, mirroring how
// parseRegexpFilter treats the result of RegexSimplifier.Simplify.
//
// NOTE(review): RegexSimplifier.Simplify returns (MatcherFilterer, bool), not
// (Filterer, bool), so *RegexSimplifier does not satisfy this interface as
// declared — confirm the intended return type.
type Simplifier interface {
Simplify(reg *syntax.Regexp, isLabel bool) (Filterer, bool)
}
// NewMatcherFiltererFunc builds a MatcherFilterer for a literal match and its
// case-insensitivity flag. newContainsFilter and newEqualFilter have this shape.
type NewMatcherFiltererFunc func(match []byte, caseInsensitive bool) MatcherFilterer
// RegexSimplifier simplifies regexps into literal filters, using pluggable
// constructors for the contains and equal filters it produces.
type RegexSimplifier struct {
// newContainsFilter builds the filter used for "contains" literals.
newContainsFilter NewMatcherFiltererFunc
// newEqualFilter builds the filter used for "equal" literals.
newEqualFilter NewMatcherFiltererFunc
}
// defaultRegexSimplifier simplifies regexps into this package's own contains
// and equal filters; parseRegexpFilter uses it.
var defaultRegexSimplifier = NewRegexSimplifier(newContainsFilter, newEqualFilter)
func NewRegexSimplifier(
newContainsFilter NewMatcherFiltererFunc,
newEqualFilter NewMatcherFiltererFunc,
) *RegexSimplifier {
return &RegexSimplifier{
newContainsFilter: newContainsFilter,
newEqualFilter: newEqualFilter,
}
return NewNotFilter(f), nil
}
// simplify a regexp expression by replacing it, when possible, with a succession of literal filters.
// Simplify a regexp expression by replacing it, when possible, with a succession of literal filters.
// For example `(foo|bar)` will be replaced by `containsFilter(foo) or containsFilter(bar)`
func simplify(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
func (s *RegexSimplifier) Simplify(reg *syntax.Regexp, isLabel bool) (MatcherFilterer, bool) {
switch reg.Op {
case syntax.OpAlternate:
return simplifyAlternate(reg, isLabel)
return s.simplifyAlternate(reg, isLabel)
case syntax.OpConcat:
return simplifyConcat(reg, nil)
return s.simplifyConcat(reg, nil)
case syntax.OpCapture:
util.ClearCapture(reg)
return simplify(reg, isLabel)
return s.Simplify(reg, isLabel)
case syntax.OpLiteral:
if isLabel {
return newEqualFilter([]byte(string(reg.Rune)), util.IsCaseInsensitive(reg)), true
return s.newEqualFilter([]byte(string(reg.Rune)), util.IsCaseInsensitive(reg)), true
}
return newContainsFilter([]byte(string(reg.Rune)), util.IsCaseInsensitive(reg)), true
return s.newContainsFilter([]byte(string(reg.Rune)), util.IsCaseInsensitive(reg)), true
case syntax.OpStar:
if reg.Sub[0].Op == syntax.OpAnyCharNotNL {
return TrueFilter, true
@ -497,16 +630,16 @@ func simplify(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
// simplifyAlternate simplifies, when possible, alternate regexp expressions such as:
// (foo|bar) or (foo|(bar|buzz)).
func simplifyAlternate(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
func (s *RegexSimplifier) simplifyAlternate(reg *syntax.Regexp, isLabel bool) (MatcherFilterer, bool) {
util.ClearCapture(reg.Sub...)
// attempt to simplify the first leg
f, ok := simplify(reg.Sub[0], isLabel)
f, ok := s.Simplify(reg.Sub[0], isLabel)
if !ok {
return nil, false
}
// merge the rest of the legs
for i := 1; i < len(reg.Sub); i++ {
f2, ok := simplify(reg.Sub[i], isLabel)
f2, ok := s.Simplify(reg.Sub[i], isLabel)
if !ok {
return nil, false
}
@ -520,7 +653,7 @@ func simplifyAlternate(reg *syntax.Regexp, isLabel bool) (Filterer, bool) {
// which is a literalFilter.
// Or a literal and alternates operation (see simplifyConcatAlternate), which represent a multiplication of alternates.
// Anything else is rejected.
func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) {
func (s *RegexSimplifier) simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (MatcherFilterer, bool) {
util.ClearCapture(reg.Sub...)
// remove empty match as we don't need them for filtering
i := 0
@ -538,7 +671,7 @@ func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) {
return nil, false
}
var curr Filterer
var curr MatcherFilterer
var ok bool
literals := 0
var baseLiteralIsCaseInsensitive bool
@ -555,7 +688,7 @@ func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) {
}
// if we have an alternate we must also have a base literal to apply the concatenation with.
if sub.Op == syntax.OpAlternate && baseLiteral != nil {
if curr, ok = simplifyConcatAlternate(sub, baseLiteral, curr, baseLiteralIsCaseInsensitive); !ok {
if curr, ok = s.simplifyConcatAlternate(sub, baseLiteral, curr, baseLiteralIsCaseInsensitive); !ok {
return nil, false
}
continue
@ -573,7 +706,7 @@ func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) {
// if we have only a concat with literals.
if baseLiteral != nil {
return newContainsFilter(baseLiteral, baseLiteralIsCaseInsensitive), true
return s.newContainsFilter(baseLiteral, baseLiteralIsCaseInsensitive), true
}
return nil, false
@ -583,7 +716,7 @@ func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) {
// A concat alternate is found when a concat operation has a sub alternate and is preceded by a literal.
// For instance bar|b|buzz is expressed as b(ar|(?:)|uzz) => b concat alternate(ar,(?:),uzz).
// (?:) being an OpEmptyMatch and b being the literal to concat all alternates (ar,(?:),uzz) with.
func simplifyConcatAlternate(reg *syntax.Regexp, literal []byte, curr Filterer, baseLiteralIsCaseInsensitive bool) (Filterer, bool) {
func (s *RegexSimplifier) simplifyConcatAlternate(reg *syntax.Regexp, literal []byte, curr MatcherFilterer, baseLiteralIsCaseInsensitive bool) (MatcherFilterer, bool) {
for _, alt := range reg.Sub {
// we should not consider the case where baseLiteral is not marked as case insensitive
// and alternate expression is marked as case insensitive. For example, for the original expression
@ -595,25 +728,25 @@ func simplifyConcatAlternate(reg *syntax.Regexp, literal []byte, curr Filterer,
}
switch alt.Op {
case syntax.OpEmptyMatch:
curr = ChainOrFilter(curr, newContainsFilter(literal, baseLiteralIsCaseInsensitive))
curr = ChainOrMatcherFilterer(curr, s.newContainsFilter(literal, baseLiteralIsCaseInsensitive))
case syntax.OpLiteral:
// concat the root literal with the alternate one.
altBytes := []byte(string(alt.Rune))
altLiteral := make([]byte, 0, len(literal)+len(altBytes))
altLiteral = append(altLiteral, literal...)
altLiteral = append(altLiteral, altBytes...)
curr = ChainOrFilter(curr, newContainsFilter(altLiteral, baseLiteralIsCaseInsensitive))
curr = ChainOrMatcherFilterer(curr, s.newContainsFilter(altLiteral, baseLiteralIsCaseInsensitive))
case syntax.OpConcat:
f, ok := simplifyConcat(alt, literal)
f, ok := s.simplifyConcat(alt, literal)
if !ok {
return nil, false
}
curr = ChainOrFilter(curr, f)
curr = ChainOrMatcherFilterer(curr, f)
case syntax.OpStar:
if alt.Sub[0].Op != syntax.OpAnyCharNotNL {
return nil, false
}
curr = ChainOrFilter(curr, newContainsFilter(literal, baseLiteralIsCaseInsensitive))
curr = ChainOrMatcherFilterer(curr, s.newContainsFilter(literal, baseLiteralIsCaseInsensitive))
default:
return nil, false
}

@ -1,8 +1,11 @@
package v1
import (
"github.com/grafana/regexp"
regexpsyntax "github.com/grafana/regexp/syntax"
"github.com/prometheus/prometheus/model/labels"
"github.com/grafana/loki/pkg/logql/log"
"github.com/grafana/loki/pkg/logql/syntax"
"github.com/grafana/loki/pkg/storage/bloom/v1/filter"
)
@ -59,14 +62,80 @@ func simpleFilterToBloomTest(b NGramBuilder, filter syntax.LineFilter) BloomTest
}
return test
case labels.MatchRegexp, labels.MatchNotRegexp:
// TODO(salvacorts): Simplify regex similarly to how it's done at pkg/logql/log/filter.go (`simplify` function)
// Ideally we want to extract the simplify logic into pkg/util/regex.go
return MatchAll
reg, err := regexpsyntax.Parse(filter.Match, regexpsyntax.Perl)
if err != nil {
// TODO: log error
return MatchAll
}
reg = reg.Simplify()
simplifier := log.NewRegexSimplifier(newStringFilterFunc(b), newStringFilterFunc(b))
matcher, ok := simplifier.Simplify(reg, false)
if !ok {
// If the regex simplifier fails, we default to MatchAll
return MatchAll
}
var test BloomTest = matcherFilterWrapper{filter: matcher}
if filter.Ty == labels.MatchNotRegexp {
test = newNotTest(test)
}
return test
default:
return MatchAll
}
}
// bloomCheckerWrapper adapts a filter.Checker (a bloom) to the log.Checker
// interface.
type bloomCheckerWrapper struct {
bloom filter.Checker
}
// Test implements the log.Checker interface. The caseInsensitive and equal
// flags are ignored; line is passed to the underlying bloom as is.
func (b bloomCheckerWrapper) Test(line []byte, _ bool, _ bool) bool {
return b.bloom.Test(line)
}
// TestRegex implements the log.Checker interface. It always reports true and
// never inspects the regexp.
func (b bloomCheckerWrapper) TestRegex(_ *regexp.Regexp) bool {
// We won't support regexes in bloom filters so we just return true
return true
}
// logCheckerWrapper adapts a log.Checker to the filter.Checker interface.
type logCheckerWrapper struct {
checker log.Checker
}
// Test implements the filter.Checker interface. It always calls the wrapped
// log.Checker with caseInsensitive=true and equal=false.
func (l logCheckerWrapper) Test(data []byte) bool {
return l.checker.Test(data, true, false)
}
// matcherFilterWrapper exposes a log.Matcher through the Matches and
// MatchesWithPrefixBuf methods; simpleFilterToBloomTest assigns it to a
// BloomTest.
type matcherFilterWrapper struct {
filter log.Matcher
}
// Matches runs the wrapped log.Matcher against the bloom, presenting the
// bloom to it as a log.Checker.
func (m matcherFilterWrapper) Matches(bloom filter.Checker) bool {
	checker := bloomCheckerWrapper{bloom: bloom}
	return m.filter.Matches(checker)
}
// MatchesWithPrefixBuf runs the wrapped log.Matcher against the bloom, with
// every tested value prepended by the first prefixLen bytes of buf.
func (m matcherFilterWrapper) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefixLen int) bool {
	prefixed := prefixedChecker{
		checker:   bloom,
		buf:       buf,
		prefixLen: prefixLen,
	}
	return m.filter.Matches(bloomCheckerWrapper{bloom: prefixed})
}
// prefixedChecker is a filter.Checker that prepends a fixed prefix to every
// value before testing it against the wrapped checker.
type prefixedChecker struct {
// checker is the underlying checker that receives the prefixed data.
checker filter.Checker
// buf holds the prefix in its first prefixLen bytes; Test appends to
// buf[:prefixLen], so bytes after the prefix may be overwritten.
buf []byte
// prefixLen is the number of leading bytes of buf that form the prefix.
prefixLen int
}
// Test implements filter.Checker. It appends data to the prefix held in
// buf[:prefixLen] and tests the result against the wrapped checker. The
// append may write into buf's backing array past the prefix.
func (p prefixedChecker) Test(data []byte) bool {
	prefixed := append(p.buf[:p.prefixLen], data...)
	return p.checker.Test(prefixed)
}
type matchAllTest struct{}
var MatchAll = matchAllTest{}
@ -101,6 +170,7 @@ func newStringTest(b NGramBuilder, search string) stringTest {
return test
}
// Matches implements the BloomTest interface
func (b stringTest) Matches(bloom filter.Checker) bool {
for _, ngram := range b.ngrams {
if !bloom.Test(ngram) {
@ -110,6 +180,7 @@ func (b stringTest) Matches(bloom filter.Checker) bool {
return true
}
// MatchesWithPrefixBuf implements the BloomTest interface
func (b stringTest) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefixLen int) bool {
for _, ngram := range b.ngrams {
buf = append(buf[:prefixLen], ngram...)
@ -120,6 +191,23 @@ func (b stringTest) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefi
return true
}
// stringMatcherFilter exposes a stringTest as a log.Matcher.
type stringMatcherFilter struct {
test stringTest
}
// Matches implements the log.Matcher interface. The log.Checker is presented
// to the underlying stringTest as a filter.Checker.
func (b stringMatcherFilter) Matches(test log.Checker) bool {
	adapter := logCheckerWrapper{checker: test}
	return b.test.Matches(adapter)
}
// newStringFilterFunc returns a log.NewMatcherFiltererFunc that builds, for
// each literal, a matcher backed by a stringTest created with b. The
// caseInsensitive flag passed to the returned function is ignored.
func newStringFilterFunc(b NGramBuilder) log.NewMatcherFiltererFunc {
	return func(match []byte, _ bool) log.MatcherFilterer {
		matcher := stringMatcherFilter{test: newStringTest(b, string(match))}
		return log.WrapMatcher(matcher)
	}
}
type notTest struct {
BloomTest
}

@ -112,7 +112,48 @@ func TestFiltersToBloomTests(t *testing.T) {
bloom: fakeBloom{"foo", "bar", "baz", "fuzz"},
expectMatch: true,
},
// TODO: test regexes
{
name: "regex match all star",
query: `{app="fake"} |~ ".*"`,
bloom: fakeBloom{"foo", "bar"},
expectMatch: true,
},
{
name: "regex match all plus",
query: `{app="fake"} |~ ".+"`,
bloom: fakeBloom{"foo", "bar"},
expectMatch: true,
},
{
name: "regex match none",
query: `{app="fake"} !~ ".*"`,
bloom: fakeBloom{"foo", "bar"},
expectMatch: false,
},
{
name: "regex match",
query: `{app="fake"} |~ "nope|.*foo.*"`,
bloom: fakeBloom{"foo", "bar"},
expectMatch: true,
},
{
name: "regex no match",
query: `{app="fake"} !~ "nope|.*foo.*"`,
bloom: fakeBloom{"foo", "bar"},
expectMatch: false,
},
{
name: "complex regex match",
query: `{app="fake"} |~ "(nope|.*not.*|.*foo.*)" or "(no|ba)" !~ "noz.*" or "(nope|not)"`,
bloom: fakeBloom{"foo", "bar", "baz", "fuzz"},
expectMatch: true,
},
{
name: "complex regex no match",
query: `{app="fake"} |~ "(nope|.*not.*|.*foo.*)" or "(no|ba)" !~ "noz.*"`,
bloom: fakeBloom{"foo", "bar", "baz", "fuzz", "noz"},
expectMatch: false,
},
} {
t.Run(tc.name, func(t *testing.T) {
expr, err := syntax.ParseExpr(tc.query)

Loading…
Cancel
Save