Improve case insensitive search to avoid allocations. (#4394)

* Improve case-insensitive search to avoid allocations.

```
❯ benchcmp  before.txt after.txt
benchmark                                            old ns/op     new ns/op     delta
Benchmark_LineFilter/default_true_(?i)foo-16         2400          2233          -6.96%
Benchmark_LineFilter/simplified_true_(?i)foo-16      201           228           +13.13%
Benchmark_LineFilter/default_false_(?i)foo-16        2443          2376          -2.74%
Benchmark_LineFilter/simplified_false_(?i)foo-16     185           231           +24.96%

benchmark                                            old allocs     new allocs     delta
Benchmark_LineFilter/default_true_(?i)foo-16         0              0              +0.00%
Benchmark_LineFilter/simplified_true_(?i)foo-16      1              0              -100.00%
Benchmark_LineFilter/default_false_(?i)foo-16        0              0              +0.00%
Benchmark_LineFilter/simplified_false_(?i)foo-16     1              0              -100.00%

benchmark                                            old bytes     new bytes     delta
Benchmark_LineFilter/default_true_(?i)foo-16         0             0             +0.00%
Benchmark_LineFilter/simplified_true_(?i)foo-16      128           0             -100.00%
Benchmark_LineFilter/default_false_(?i)foo-16        0             0             +0.00%
Benchmark_LineFilter/simplified_false_(?i)foo-16     128           0             -100.00%
```

It's not much, but across billions of lines it makes a big difference.

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* typo

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* Simplified version

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* Cleanup

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>
pull/4803/head
Cyril Tovena 4 years ago committed by GitHub
parent 8a914b5c2f
commit dc222dc98d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 48
      pkg/logql/log/filter.go
  2. 23
      pkg/logql/log/filter_test.go

@ -5,6 +5,8 @@ import (
"fmt"
"regexp"
"regexp/syntax"
"unicode"
"unicode/utf8"
"github.com/prometheus/prometheus/pkg/labels"
)
@ -242,11 +244,47 @@ type containsFilter struct {
caseInsensitive bool
}
func (l containsFilter) Filter(line []byte) bool {
if l.caseInsensitive {
line = bytes.ToLower(line)
func (l *containsFilter) Filter(line []byte) bool {
if !l.caseInsensitive {
return bytes.Contains(line, l.match)
}
return bytes.Contains(line, l.match)
if len(l.match) == 0 {
return true
}
if len(l.match) > len(line) {
return false
}
j := 0
for len(line) > 0 {
// ascii fast case
if c := line[0]; c < utf8.RuneSelf {
if c == l.match[j] || c+'a'-'A' == l.match[j] {
j++
if j == len(l.match) {
return true
}
line = line[1:]
continue
}
line = line[1:]
j = 0
continue
}
// unicode slow case
lr, lwid := utf8.DecodeRune(line)
mr, mwid := utf8.DecodeRune(l.match[j:])
if lr == mr || mr == unicode.To(unicode.LowerCase, lr) {
j += mwid
if j == len(l.match) {
return true
}
line = line[lwid:]
continue
}
line = line[lwid:]
j = 0
}
return false
}
func (l containsFilter) ToStage() Stage {
@ -269,7 +307,7 @@ func newContainsFilter(match []byte, caseInsensitive bool) Filterer {
if caseInsensitive {
match = bytes.ToLower(match)
}
return containsFilter{
return &containsFilter{
match: match,
caseInsensitive: caseInsensitive,
}

@ -10,6 +10,7 @@ import (
func Test_SimplifiedRegex(t *testing.T) {
fixtures := []string{
"foo", "foobar", "bar", "foobuzz", "buzz", "f", " ", "fba", "foofoofoo", "b", "foob", "bfoo", "FoO",
"foo, 世界", allunicode(), "fooÏbar",
}
for _, test := range []struct {
re string
@ -53,6 +54,8 @@ func Test_SimplifiedRegex(t *testing.T) {
{".*||||", true, TrueFilter, true},
{"", true, TrueFilter, true},
{"(?i)foo", true, newContainsFilter([]byte("foo"), true), true},
{"(?i)界", true, newContainsFilter([]byte("界"), true), true},
{"(?i)ïB", true, newContainsFilter([]byte("ïB"), true), true},
// regex we are not supporting.
{"[a-z]+foo", true, nil, false},
@ -95,6 +98,14 @@ func Test_SimplifiedRegex(t *testing.T) {
}
}
// allunicode returns a string holding the UTF-8 encoding of every code point
// from U+0000 through U+10FFFF, in ascending order.
//
// Each code point is converted with string(rune(i)), so values that are not
// valid runes (the surrogate range U+D800-U+DFFF) are emitted as U+FFFD, as
// the Go conversion rules specify. Appending byte(i) instead would keep only
// the low 8 bits of each code point and produce mostly invalid UTF-8.
func allunicode() string {
	const maxRune = 0x10FFFF
	// 4 bytes is the longest UTF-8 encoding, so this capacity is an upper bound.
	b := make([]byte, 0, 4*(maxRune+1))
	for i := 0; i <= maxRune; i++ {
		b = append(b, string(rune(i))...)
	}
	return string(b)
}
func Test_TrueFilter(t *testing.T) {
empty := []byte("")
for _, test := range []struct {
@ -169,13 +180,21 @@ func benchmarkRegex(b *testing.B, re, line string, match bool) {
b.ResetTimer()
b.Run(fmt.Sprintf("default_%v_%s", match, re), func(b *testing.B) {
for i := 0; i < b.N; i++ {
m = d.Filter(l)
for j := 0; j < 1e6; j++ {
m = d.Filter(l)
}
}
})
b.Run(fmt.Sprintf("simplified_%v_%s", match, re), func(b *testing.B) {
for i := 0; i < b.N; i++ {
m = s.Filter(l)
for j := 0; j < 1e6; j++ {
m = s.Filter(l)
}
}
})
res = m
}
func Test_rune(t *testing.T) {
require.True(t, newContainsFilter([]byte("foo"), true).Filter([]byte("foo")))
}

Loading…
Cancel
Save