chore(storage/bloom): support simplifiable regexp matchers (#14622)

This adds support for basic regexps which can be simplified into a sequence of
OR matchers, such as:

* `key=~"value"` becomes `key="value"`.
* `key=~"value1|value2"` becomes `key="value1" or key="value2"`.
* `key=~".+"` checks for the presence of `key`. This is currently the only way
  to check if a key exists.

Only the cases above are "officially" supported. However, we technically
support basic concatenations and character classes due to how regexp/syntax
parses and simplifies expressions such as `value1|value2` into `value[12]`.
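
For illustration, a minimal standalone sketch (not part of this change) of that
factoring, using the standard library `regexp/syntax`, which mirrors the
Parse/Simplify API of the vendored `github.com/grafana/regexp/syntax` package
used by the extractor:

```go
package main

import (
	"fmt"
	"regexp/syntax"
)

func main() {
	// Parse with the same flags the extractor uses (Perl syntax).
	re, err := syntax.Parse(`value1|value2`, syntax.Perl)
	if err != nil {
		panic(err)
	}

	// The parser factors out the shared "value" prefix and Simplify keeps
	// that shape, leaving a concatenation of the literal "value" and a
	// character class covering '1' and '2' (printed as value[1-2]).
	fmt.Println(re.Simplify().String())
}
```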

To prevent unbounded cardinality, we limit regexp expansion to 200 matchers;
otherwise a regexp like `value[0-9][0-9][0-9][0-9]` would expand into 10,000
matchers (too many!).

Closes grafana/loki-private#1106.

Co-authored-by: J Stickler <julie.stickler@grafana.com>
Robert Fratto 7 months ago committed by GitHub
parent 7b53f20f70
commit 8eca826795
6 changed files:

  1. docs/sources/query/query_accceleration.md (5)
  2. pkg/bloomgateway/processor_test.go (6)
  3. pkg/storage/bloom/v1/ast_extractor.go (202)
  4. pkg/storage/bloom/v1/ast_extractor_test.go (157)
  5. pkg/storage/bloom/v1/bloom_tester.go (83)
  6. pkg/storage/bloom/v1/bloom_tester_test.go (10)

@ -26,6 +26,11 @@ If [bloom filters][] are enabled, you can write LogQL queries using [structured
Queries will be accelerated for any [label filter expression][] that satisfies _all_ of the following criteria:
* The label filter expression uses **string equality**, such as `| key="value"`.
* `or` and `and` operators can be used to match multiple values, such as `| detected_level="error" or detected_level="warn"`.
* _Basic_ regular expressions are automatically simplified into a supported expression:
  * `| key=~"value"` is converted to `| key="value"`.
  * `| key=~"value1|value2"` is converted to `| key="value1" or key="value2"`.
  * `| key=~".+"` checks for existence of `key`. `.*` is not supported.
* The label filter expression is querying for structured metadata and not a stream label.
* The label filter expression is placed before any [parser expression][], [labels format expression][], [drop labels expression][], or [keep labels expression][].

@ -141,7 +141,7 @@ func TestProcessor(t *testing.T) {
}
matchers := []v1.LabelMatcher{
v1.PlainLabelMatcher{
v1.KeyValueMatcher{
Key: "trace_id",
Value: "nomatch",
},
@ -191,7 +191,7 @@ func TestProcessor(t *testing.T) {
day: config.NewDayTime(truncateDay(now)),
}
matchers := []v1.LabelMatcher{
v1.PlainLabelMatcher{
v1.KeyValueMatcher{
Key: "trace_id",
Value: "nomatch",
},
@ -238,7 +238,7 @@ func TestProcessor(t *testing.T) {
day: config.NewDayTime(truncateDay(now)),
}
matchers := []v1.LabelMatcher{
v1.PlainLabelMatcher{
v1.KeyValueMatcher{
Key: "trace_id",
Value: "nomatch",
},

@ -1,12 +1,24 @@
package v1
import (
regexsyn "github.com/grafana/regexp/syntax"
"github.com/prometheus/prometheus/model/labels"
"github.com/grafana/loki/v3/pkg/logql/log"
"github.com/grafana/loki/v3/pkg/logql/syntax"
"github.com/grafana/loki/v3/pkg/util"
)
// Simplifiable regexp expressions can quickly expand into very high
// cardinality; we limit the number of matchers to prevent this. However,
// since bloom tests are relatively cheap to test, we can afford to be a little
// generous while still preventing excessive cardinality.
//
// For example, the regex `[0-9]` expands to 10 matchers (0, 1, .. 9), while
// `[0-9][0-9][0-9]` expands to 1000 matchers (000, 001, .., 999).
const maxRegexMatchers = 200
// LabelMatcher represents bloom tests for key-value pairs, mapped from
// LabelFilterExprs from the AST.
type LabelMatcher interface{ isLabelMatcher() }
@ -15,9 +27,13 @@ type LabelMatcher interface{ isLabelMatcher() }
// mapped. Bloom tests for UnsupportedLabelMatchers must always pass.
type UnsupportedLabelMatcher struct{}
// PlainLabelMatcher represents a direct key-value matcher. Bloom tests
// must only pass if the key-value pair exists in the bloom.
type PlainLabelMatcher struct{ Key, Value string }
// KeyValueMatcher represents a direct key-value matcher. Bloom tests must only
// pass if the key-value pair exists in the bloom.
type KeyValueMatcher struct{ Key, Value string }
// KeyMatcher represents a key matcher. Bloom tests must only pass if the key
// exists in the bloom.
type KeyMatcher struct{ Key string }
// OrLabelMatcher represents a logical OR test. Bloom tests must only pass if
// one of the Left or Right label matcher bloom tests pass.
@ -54,21 +70,27 @@ func buildLabelMatcher(filter log.LabelFilterer) LabelMatcher {
switch filter := filter.(type) {
case *log.LineFilterLabelFilter:
if filter.Type != labels.MatchEqual {
return UnsupportedLabelMatcher{}
if filter.Type == labels.MatchEqual {
return KeyValueMatcher{
Key: filter.Name,
Value: filter.Value,
}
} else if filter.Type == labels.MatchRegexp {
reg, err := regexsyn.Parse(filter.Value, regexsyn.Perl)
if err != nil {
return UnsupportedLabelMatcher{}
}
return buildSimplifiedRegexMatcher(filter.Name, reg.Simplify())
}
return PlainLabelMatcher{
Key: filter.Name,
Value: filter.Value,
}
return UnsupportedLabelMatcher{}
case *log.StringLabelFilter:
if filter.Type != labels.MatchEqual {
return UnsupportedLabelMatcher{}
}
return PlainLabelMatcher{
return KeyValueMatcher{
Key: filter.Name,
Value: filter.Value,
}
@ -89,11 +111,169 @@ func buildLabelMatcher(filter log.LabelFilterer) LabelMatcher {
}
}
// buildSimplifiedRegexMatcher builds a simplified label matcher from a regex.
// reg may be mutated.
func buildSimplifiedRegexMatcher(key string, reg *regexsyn.Regexp) LabelMatcher {
switch reg.Op {
case regexsyn.OpAlternate:
util.ClearCapture(reg)
left := buildSimplifiedRegexMatcher(key, reg.Sub[0])
if len(reg.Sub) == 1 {
// This shouldn't be possible (even `warn|` has two subexpressions, where
// the latter matches an empty string), but we have a length check here
// anyway just to avoid a potential panic.
return left
}
for _, sub := range reg.Sub[1:] {
right := buildSimplifiedRegexMatcher(key, sub)
left = OrLabelMatcher{Left: left, Right: right}
}
return left
case regexsyn.OpConcat:
// OpConcat checks for the concatenation of two or more subexpressions. For
// example, value1|value2 simplifies to value[12], with the two
// subexpressions value and [12].
//
// We expand subexpressions back out into full matchers where possible, so
// value[12] becomes value1 OR value2, and value[1-9] becomes value1 OR
// value2 .. OR value9.
util.ClearCapture(reg)
matchers, ok := expandSubexpr(reg)
if !ok || len(matchers) == 0 {
return UnsupportedLabelMatcher{}
}
var left LabelMatcher = KeyValueMatcher{Key: key, Value: matchers[0]}
for _, matcher := range matchers[1:] {
right := KeyValueMatcher{Key: key, Value: matcher}
left = OrLabelMatcher{Left: left, Right: right}
}
return left
case regexsyn.OpCapture:
util.ClearCapture(reg)
return buildSimplifiedRegexMatcher(key, reg)
case regexsyn.OpLiteral:
return KeyValueMatcher{
Key: key,
Value: string(reg.Rune),
}
case regexsyn.OpPlus:
if reg.Sub[0].Op == regexsyn.OpAnyChar || reg.Sub[0].Op == regexsyn.OpAnyCharNotNL { // .+
return KeyMatcher{Key: key}
}
return UnsupportedLabelMatcher{}
default:
return UnsupportedLabelMatcher{}
}
}
func expandSubexpr(reg *regexsyn.Regexp) (prefixes []string, ok bool) {
switch reg.Op {
case regexsyn.OpAlternate:
util.ClearCapture(reg)
for _, sub := range reg.Sub {
subPrefixes, ok := expandSubexpr(sub)
if !ok {
return nil, false
} else if len(prefixes)+len(subPrefixes) > maxRegexMatchers {
return nil, false
}
prefixes = append(prefixes, subPrefixes...)
}
return prefixes, true
case regexsyn.OpCharClass:
// OpCharClass stores ranges of characters, so [12] is stored as the rune
// range []rune{'1', '2'}, while [15] is represented as []rune{'1', '1', '5',
// '5'}.
//
// To expand OpCharClass, we iterate over each pair of runes.
if len(reg.Rune)%2 != 0 {
// Invalid regexp; sequences should be even.
return nil, false
}
for i := 0; i < len(reg.Rune); i += 2 {
start, end := reg.Rune[i+0], reg.Rune[i+1]
for r := start; r <= end; r++ {
prefixes = append(prefixes, string(r))
if len(prefixes) > maxRegexMatchers {
return nil, false
}
}
}
return prefixes, true
case regexsyn.OpConcat:
if len(reg.Sub) == 0 {
return nil, false
}
// We get the prefixes for each subexpression and then iteratively combine
// them together.
//
// For the regexp [12][34]value (which concatenates [12], [34], and value):
//
// 1. We get the prefixes for [12], which are 1 and 2.
// 2. We get the prefixes for [34], which are 3 and 4.
// 3. We add the prefixes together to get 13, 14, 23, and 24.
// 4. We get the prefixes for value, which is value.
// 5. Finally, we add the prefixes together to get 13value, 14value, 23value, and 24value.
curPrefixes, ok := expandSubexpr(reg.Sub[0])
if !ok {
return nil, false
}
for _, sub := range reg.Sub[1:] {
subPrefixes, ok := expandSubexpr(sub)
if !ok {
return nil, false
} else if len(curPrefixes)*len(subPrefixes) > maxRegexMatchers {
return nil, false
}
newPrefixes := make([]string, 0, len(curPrefixes)*len(subPrefixes))
for _, curPrefix := range curPrefixes {
for _, subPrefix := range subPrefixes {
newPrefixes = append(newPrefixes, curPrefix+subPrefix)
}
}
curPrefixes = newPrefixes
}
return curPrefixes, true
case regexsyn.OpCapture:
util.ClearCapture(reg)
return expandSubexpr(reg)
case regexsyn.OpLiteral:
prefixes = append(prefixes, string(reg.Rune))
return prefixes, true
default:
return nil, false
}
}
//
// Implement marker types:
//
func (UnsupportedLabelMatcher) isLabelMatcher() {}
func (PlainLabelMatcher) isLabelMatcher() {}
func (KeyValueMatcher) isLabelMatcher() {}
func (KeyMatcher) isLabelMatcher() {}
func (OrLabelMatcher) isLabelMatcher() {}
func (AndLabelMatcher) isLabelMatcher() {}

@ -20,7 +20,7 @@ func TestExtractLabelMatchers(t *testing.T) {
name: "basic label matcher",
input: `{app="foo"} | key="value"`,
expect: []v1.LabelMatcher{
v1.PlainLabelMatcher{Key: "key", Value: "value"},
v1.KeyValueMatcher{Key: "key", Value: "value"},
},
},
@ -29,8 +29,8 @@ func TestExtractLabelMatchers(t *testing.T) {
input: `{app="foo"} | key1="value1" or key2="value2"`,
expect: []v1.LabelMatcher{
v1.OrLabelMatcher{
Left: v1.PlainLabelMatcher{Key: "key1", Value: "value1"},
Right: v1.PlainLabelMatcher{Key: "key2", Value: "value2"},
Left: v1.KeyValueMatcher{Key: "key1", Value: "value1"},
Right: v1.KeyValueMatcher{Key: "key2", Value: "value2"},
},
},
},
@ -40,8 +40,8 @@ func TestExtractLabelMatchers(t *testing.T) {
input: `{app="foo"} | key1="value1" and key2="value2"`,
expect: []v1.LabelMatcher{
v1.AndLabelMatcher{
Left: v1.PlainLabelMatcher{Key: "key1", Value: "value1"},
Right: v1.PlainLabelMatcher{Key: "key2", Value: "value2"},
Left: v1.KeyValueMatcher{Key: "key1", Value: "value1"},
Right: v1.KeyValueMatcher{Key: "key2", Value: "value2"},
},
},
},
@ -50,14 +50,136 @@ func TestExtractLabelMatchers(t *testing.T) {
name: "multiple label matchers",
input: `{app="foo"} | key1="value1" | key2="value2"`,
expect: []v1.LabelMatcher{
v1.PlainLabelMatcher{Key: "key1", Value: "value1"},
v1.PlainLabelMatcher{Key: "key2", Value: "value2"},
v1.KeyValueMatcher{Key: "key1", Value: "value1"},
v1.KeyValueMatcher{Key: "key2", Value: "value2"},
},
},
{
name: "unsupported label matchers",
name: "basic regex matcher",
input: `{app="foo"} | key1=~"value1"`,
expect: []v1.LabelMatcher{
v1.KeyValueMatcher{Key: "key1", Value: "value1"},
},
},
{
name: "regex matcher short", // simplifies to value[15].
input: `{app="foo"} | key1=~"value1|value5"`,
expect: []v1.LabelMatcher{
v1.OrLabelMatcher{
v1.KeyValueMatcher{Key: "key1", Value: "value1"},
v1.KeyValueMatcher{Key: "key1", Value: "value5"},
},
},
},
{
name: "regex matcher range",
input: `{app="foo"} | key1=~"value[0-9]"`,
expect: []v1.LabelMatcher{
buildOrMatchers(
v1.KeyValueMatcher{Key: "key1", Value: "value0"},
v1.KeyValueMatcher{Key: "key1", Value: "value1"},
v1.KeyValueMatcher{Key: "key1", Value: "value2"},
v1.KeyValueMatcher{Key: "key1", Value: "value3"},
v1.KeyValueMatcher{Key: "key1", Value: "value4"},
v1.KeyValueMatcher{Key: "key1", Value: "value5"},
v1.KeyValueMatcher{Key: "key1", Value: "value6"},
v1.KeyValueMatcher{Key: "key1", Value: "value7"},
v1.KeyValueMatcher{Key: "key1", Value: "value8"},
v1.KeyValueMatcher{Key: "key1", Value: "value9"},
),
},
},
{
name: "regex matcher ignore high cardinality",
input: `{app="foo"} | key1=~"value[0-9][0-9][0-9]"`, // This would expand to 1000 matchers. Too many!
expect: []v1.LabelMatcher{
v1.UnsupportedLabelMatcher{},
},
},
{
name: "regex matcher",
input: `{app="foo"} | key1=~"value123|value456"`,
expect: []v1.LabelMatcher{
v1.OrLabelMatcher{
v1.KeyValueMatcher{Key: "key1", Value: "value123"},
v1.KeyValueMatcher{Key: "key1", Value: "value456"},
},
},
},
{
name: "regex multiple expands",
input: `{app="foo"} | detected_level=~"debug|info|warn|error"`,
expect: []v1.LabelMatcher{
buildOrMatchers(
v1.KeyValueMatcher{Key: "detected_level", Value: "debug"},
v1.KeyValueMatcher{Key: "detected_level", Value: "info"},
v1.KeyValueMatcher{Key: "detected_level", Value: "warn"},
v1.KeyValueMatcher{Key: "detected_level", Value: "error"},
),
},
},
{
name: "regex matcher with ignored capture groups",
input: `{app="foo"} | key1=~"value1|(value2)"`,
expect: []v1.LabelMatcher{
v1.OrLabelMatcher{
v1.KeyValueMatcher{Key: "key1", Value: "value1"},
v1.KeyValueMatcher{Key: "key1", Value: "value2"},
},
},
},
{
name: "advanced regex matcher",
input: `{app="foo"} | key1=~"(warn|info[0-3])"`,
expect: []v1.LabelMatcher{
v1.OrLabelMatcher{
v1.KeyValueMatcher{Key: "key1", Value: "warn"},
buildOrMatchers(
v1.KeyValueMatcher{Key: "key1", Value: "info0"},
v1.KeyValueMatcher{Key: "key1", Value: "info1"},
v1.KeyValueMatcher{Key: "key1", Value: "info2"},
v1.KeyValueMatcher{Key: "key1", Value: "info3"},
),
},
},
},
{
name: "regex .+ matcher",
input: `{app="foo"} | key1=~".+"`,
expect: []v1.LabelMatcher{
v1.KeyMatcher{Key: "key1"},
},
},
{
// This should also be unsupported for suffix or substring regexes.
name: "regex .+ prefix matcher",
input: `{app="foo"} | key1=~".+foo"`,
expect: []v1.LabelMatcher{
v1.UnsupportedLabelMatcher{},
},
},
{
name: "regex .* matcher",
input: `{app="foo"} | key1=~".*"`,
expect: []v1.LabelMatcher{
v1.UnsupportedLabelMatcher{},
},
},
{
name: "unsupported label matchers",
input: `{app="foo"} | key1!="value1"`,
expect: []v1.LabelMatcher{
v1.UnsupportedLabelMatcher{},
},
@ -73,6 +195,23 @@ func TestExtractLabelMatchers(t *testing.T) {
}
}
func buildOrMatchers(matchers ...v1.LabelMatcher) v1.LabelMatcher {
if len(matchers) == 1 {
return matchers[0]
}
left := matchers[0]
for _, right := range matchers[1:] {
left = v1.OrLabelMatcher{
Left: left,
Right: right,
}
}
return left
}
func TestExtractLabelMatchers_IgnoreAfterParse(t *testing.T) {
tt := []struct {
name string
@ -92,7 +231,7 @@ func TestExtractLabelMatchers_IgnoreAfterParse(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
fullInput := fmt.Sprintf(`{app="foo"} | key1="value1" | %s | key2="value2"`, tc.expr)
expect := []v1.LabelMatcher{
v1.PlainLabelMatcher{Key: "key1", Value: "value1"},
v1.KeyValueMatcher{Key: "key1", Value: "value1"},
// key2="value2" should be ignored following tc.expr
}

@ -119,8 +119,11 @@ func matcherToBloomTest(matcher LabelMatcher) BloomTest {
case UnsupportedLabelMatcher:
return matchAllTest{}
case PlainLabelMatcher:
return newStringMatcherTest(matcher)
case KeyValueMatcher:
return newKeyValueMatcherTest(matcher)
case KeyMatcher:
return newKeyMatcherTest(matcher)
case OrLabelMatcher:
return newOrTest(
@ -140,15 +143,15 @@ func matcherToBloomTest(matcher LabelMatcher) BloomTest {
}
}
type stringMatcherTest struct {
matcher PlainLabelMatcher
type keyValueMatcherTest struct {
matcher KeyValueMatcher
}
func newStringMatcherTest(matcher PlainLabelMatcher) stringMatcherTest {
return stringMatcherTest{matcher: matcher}
func newKeyValueMatcherTest(matcher KeyValueMatcher) keyValueMatcherTest {
return keyValueMatcherTest{matcher: matcher}
}
func (sm stringMatcherTest) Matches(series labels.Labels, bloom filter.Checker) bool {
func (kvm keyValueMatcherTest) Matches(series labels.Labels, bloom filter.Checker) bool {
// TODO(rfratto): reintroduce the use of a shared tokenizer here to avoid
// desyncing between how tokens are passed during building vs passed during
// querying.
@ -159,24 +162,24 @@ func (sm stringMatcherTest) Matches(series labels.Labels, bloom filter.Checker)
// 2. It should be possible to test for just the key
var (
combined = fmt.Sprintf("%s=%s", sm.matcher.Key, sm.matcher.Value)
combined = fmt.Sprintf("%s=%s", kvm.matcher.Key, kvm.matcher.Value)
rawCombined = unsafe.Slice(unsafe.StringData(combined), len(combined))
)
return sm.match(series, bloom, rawCombined)
return kvm.match(series, bloom, rawCombined)
}
func (sm stringMatcherTest) MatchesWithPrefixBuf(series labels.Labels, bloom filter.Checker, buf []byte, prefixLen int) bool {
func (kvm keyValueMatcherTest) MatchesWithPrefixBuf(series labels.Labels, bloom filter.Checker, buf []byte, prefixLen int) bool {
var (
combined = fmt.Sprintf("%s=%s", sm.matcher.Key, sm.matcher.Value)
combined = fmt.Sprintf("%s=%s", kvm.matcher.Key, kvm.matcher.Value)
prefixedCombined = appendToBuf(buf, prefixLen, combined)
)
return sm.match(series, bloom, prefixedCombined)
return kvm.match(series, bloom, prefixedCombined)
}
// match returns true if the series matches the matcher or is in the bloom filter.
func (sm stringMatcherTest) match(series labels.Labels, bloom filter.Checker, combined []byte) bool {
func (kvm keyValueMatcherTest) match(series labels.Labels, bloom filter.Checker, combined []byte) bool {
// If we don't have the series labels, we cannot disambiguate which labels come from the series in which case
// we may filter out chunks for queries like `{env="prod"} | env="prod"` if env=prod is not structured metadata
if len(series) == 0 {
@ -186,8 +189,8 @@ func (sm stringMatcherTest) match(series labels.Labels, bloom filter.Checker, co
// It's in the series if the key is set and has the same value.
// By checking val != "" we handle `{env="prod"} | user=""`.
val := series.Get(sm.matcher.Key)
inSeries := val != "" && val == sm.matcher.Value
val := series.Get(kvm.matcher.Key)
inSeries := val != "" && val == kvm.matcher.Value
inBloom := bloom.Test(combined)
return inSeries || inBloom
@ -199,3 +202,53 @@ func appendToBuf(buf []byte, prefixLen int, str string) []byte {
rawString := unsafe.Slice(unsafe.StringData(str), len(str))
return append(buf[:prefixLen], rawString...)
}
type keyMatcherTest struct {
matcher KeyMatcher
}
func newKeyMatcherTest(matcher KeyMatcher) keyMatcherTest {
return keyMatcherTest{matcher: matcher}
}
func (km keyMatcherTest) Matches(series labels.Labels, bloom filter.Checker) bool {
// TODO(rfratto): reintroduce the use of a shared tokenizer here to avoid
// desyncing between how tokens are passed during building vs passed during
// querying.
//
// For a shared tokenizer to be ergonomic:
//
// 1. A prefix shouldn't be required until MatchesWithPrefixBuf is called
// 2. It should be possible to test for just the key
var (
key = km.matcher.Key
rawKey = unsafe.Slice(unsafe.StringData(key), len(key))
)
return km.match(series, bloom, rawKey)
}
func (km keyMatcherTest) MatchesWithPrefixBuf(series labels.Labels, bloom filter.Checker, buf []byte, prefixLen int) bool {
var (
key = km.matcher.Key
prefixedKey = appendToBuf(buf, prefixLen, key)
)
return km.match(series, bloom, prefixedKey)
}
// match returns true if the series matches the matcher or is in the bloom
// filter.
func (km keyMatcherTest) match(series labels.Labels, bloom filter.Checker, key []byte) bool {
// If we don't have the series labels, we cannot disambiguate which labels come from the series in which case
// we may filter out chunks for queries like `{env="prod"} | env="prod"` if env=prod is not structured metadata
if len(series) == 0 {
level.Warn(util_log.Logger).Log("msg", "series has no labels, cannot filter out chunks")
return true
}
inSeries := series.Get(km.matcher.Key) != ""
inBloom := bloom.Test(key)
return inSeries || inBloom
}
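
For context, a minimal standalone sketch (not part of the diff) of the two
token shapes these tests hash into the bloom filter: the key-value test probes
for the combined `key=value` string, while the new key test behind `key=~".+"`
probes for the key alone. The values below are hypothetical:

```go
package main

import "fmt"

func main() {
	// Hypothetical structured metadata pair, for illustration only.
	key, value := "trace_id", "abc123"

	// keyValueMatcherTest builds its bloom token as "key=value"...
	kvToken := fmt.Sprintf("%s=%s", key, value)

	// ...while keyMatcherTest (backing key=~".+") uses just the key bytes,
	// so it only asserts that some value for the key was indexed.
	keyToken := key

	fmt.Println(kvToken, keyToken) // trace_id=abc123 trace_id
}
```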

@ -116,6 +116,16 @@ func TestLabelMatchersToBloomTest(t *testing.T) {
query: `{app="fake"} | trace_id="exists_1" and trace_id="noexist"`,
match: false,
},
{
name: "presence test pass",
query: `{app="fake"} | trace_id=~".+"`,
match: true,
},
{
name: "presence test fail",
query: `{app="fake"} | noexist=~".+"`,
match: false,
},
}
for _, tc := range tt {
