feat: Add utf8 support to Pattern Lexer (#13085)

pull/13083/head
benclive authored 12 months ago, committed by GitHub
parent 21dd4afdc7
commit f6f8babf83
7 changed files:

1. pkg/logql/log/pattern/lexer.go (5 changes)
2. pkg/logql/log/pattern/lexer.rl (16 changes)
3. pkg/logql/log/pattern/lexer.rl.go (362 changes)
4. pkg/logql/log/pattern/lexer_test.go (1 change)
5. pkg/logql/log/pattern/parser_test.go (5 changes)
6. pkg/logql/log/pattern/pattern_test.go (21 changes)
7. pkg/pattern/drain/drain_test.go (16 changes)

pkg/logql/log/pattern/lexer.go
@@ -1,5 +1,7 @@
 package pattern
 
+import "unicode/utf8"
+
 type lexer struct {
     data      []byte
     p, pe, cs int
@@ -57,6 +59,7 @@ func (lex *lexer) identifier(out *exprSymType) (int, error) {
 
 // nolint
 func (lex *lexer) literal(out *exprSymType) (int, error) {
-    out.literal = rune(lex.data[lex.ts])
+    decoded, _ := utf8.DecodeRune(lex.data[lex.ts:lex.te])
+    out.literal = decoded
     return LITERAL, nil
 }
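The lexer.go change above is the heart of the fix: `rune(lex.data[lex.ts])` promoted a single byte to a rune, so any multi-byte UTF-8 character was truncated to its lead byte, while `utf8.DecodeRune` decodes the whole `ts:te` token span (its second return value, the byte width, is discarded here). A minimal standalone sketch of the difference (illustrative only, not code from this PR):

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	tok := []byte("🤷") // one rune, four UTF-8 bytes: F0 9F A4 B7
	// Old behaviour: promote only the first byte -> U+00F0 ('ð'), not the emoji.
	fmt.Printf("%q\n", rune(tok[0]))
	// New behaviour: decode the full byte span of the token.
	r, size := utf8.DecodeRune(tok)
	fmt.Printf("%q (%d bytes)\n", r, size) // '🤷' (4 bytes)
}
```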

pkg/logql/log/pattern/lexer.rl
@@ -13,11 +13,25 @@ package pattern
     }
 }%%
 
+%%{
+    utf8 = (
+        0x00..0x7F |
+        0xC2..0xDF 0x80..0xBF |
+        0xE0 0xA0..0xBF 0x80..0xBF |
+        0xE1..0xEC 0x80..0xBF 0x80..0xBF |
+        0xED 0x80..0x9F 0x80..0xBF |
+        0xEE..0xEF 0x80..0xBF 0x80..0xBF |
+        0xF0 0x90..0xBF 0x80..0xBF 0x80..0xBF |
+        0xF1..0xF3 0x80..0xBF 0x80..0xBF 0x80..0xBF |
+        0xF4 0x80..0x8F 0x80..0xBF 0x80..0xBF
+    );
+}%%
+
 const LEXER_ERROR = 0
 
 %%{
     identifier = '<' (alpha| '_') (alnum | '_' )* '>';
-    literal = any;
+    literal = utf8;
 }%%
 
 func (lex *lexer) Lex(out *exprSymType) int {
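The `utf8` machine added above is the standard grammar for well-formed UTF-8 byte sequences (Unicode Table 3-7 / RFC 3629): starting the two-byte row at 0xC2 and special-casing the 0xE0/0xF0 rows rules out overlong encodings, the 0xED row stops at 0x9F to exclude UTF-16 surrogates, and the 0xF4 row caps code points at U+10FFFF. A hand-rolled Go cross-check of the same ranges against the standard library (a sketch for illustration; `acceptUTF8` is not from the repo):

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

// acceptUTF8 mirrors the Ragel ranges above: it returns the length of one
// well-formed UTF-8 sequence at the start of b, or 0 if there is none.
func acceptUTF8(b []byte) int {
	in := func(x, lo, hi byte) bool { return lo <= x && x <= hi }
	switch {
	case len(b) >= 1 && b[0] <= 0x7F: // 0x00..0x7F: ASCII
		return 1
	case len(b) >= 2 && in(b[0], 0xC2, 0xDF) && in(b[1], 0x80, 0xBF):
		return 2 // 0xC2 lower bound forbids overlong 2-byte forms
	case len(b) >= 3 && b[0] == 0xE0 && in(b[1], 0xA0, 0xBF) && in(b[2], 0x80, 0xBF):
		return 3 // 0xE0 row: 0xA0 lower bound forbids overlong 3-byte forms
	case len(b) >= 3 && in(b[0], 0xE1, 0xEC) && in(b[1], 0x80, 0xBF) && in(b[2], 0x80, 0xBF):
		return 3
	case len(b) >= 3 && b[0] == 0xED && in(b[1], 0x80, 0x9F) && in(b[2], 0x80, 0xBF):
		return 3 // 0xED row: 0x9F upper bound excludes surrogates U+D800..U+DFFF
	case len(b) >= 3 && in(b[0], 0xEE, 0xEF) && in(b[1], 0x80, 0xBF) && in(b[2], 0x80, 0xBF):
		return 3
	case len(b) >= 4 && b[0] == 0xF0 && in(b[1], 0x90, 0xBF) && in(b[2], 0x80, 0xBF) && in(b[3], 0x80, 0xBF):
		return 4 // 0xF0 row: 0x90 lower bound forbids overlong 4-byte forms
	case len(b) >= 4 && in(b[0], 0xF1, 0xF3) && in(b[1], 0x80, 0xBF) && in(b[2], 0x80, 0xBF) && in(b[3], 0x80, 0xBF):
		return 4
	case len(b) >= 4 && b[0] == 0xF4 && in(b[1], 0x80, 0x8F) && in(b[2], 0x80, 0xBF) && in(b[3], 0x80, 0xBF):
		return 4 // 0xF4 row: caps code points at U+10FFFF
	}
	return 0
}

func main() {
	// Every valid rune's encoding should be accepted with the right width.
	for _, r := range []rune{'a', 'é', '▶', '🤷'} {
		buf := make([]byte, 4)
		n := utf8.EncodeRune(buf, r)
		fmt.Printf("%c: width %d, accepted %v\n", r, n, acceptUTF8(buf[:n]) == n)
	}
	// Surrogate halves and overlong encodings are rejected:
	fmt.Println(acceptUTF8([]byte{0xED, 0xA0, 0x80})) // 0 (UTF-8-encoded U+D800)
	fmt.Println(acceptUTF8([]byte{0xC0, 0xAF}))       // 0 (overlong '/')
}
```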

pkg/logql/log/pattern/lexer.rl.go (generated by Ragel from lexer.rl)
@@ -1,251 +1,271 @@
 //line pkg/logql/log/pattern/lexer.rl:1
 package pattern
 
 //line pkg/logql/log/pattern/lexer.rl.go:7
 var _pattern_actions []byte = []byte{
-    0, 1, 0, 1, 1, 1, 2, 1, 3,
-    1, 4, 1, 5, 1, 6,
+    0, 1, 0, 1, 1, 1, 2, 1, 3,
+    1, 4, 1, 5, 1, 6,
 }
 
 var _pattern_key_offsets []byte = []byte{
-    0, 8, 9,
+    0, 0, 8, 10, 12, 14, 16, 18,
+    20, 22, 37,
 }
 
 var _pattern_trans_keys []byte = []byte{
-    62, 95, 48, 57, 65, 90, 97, 122,
-    60, 95, 65, 90, 97, 122,
+    62, 95, 48, 57, 65, 90, 97, 122,
+    128, 191, 160, 191, 128, 191, 128, 159,
+    144, 191, 128, 191, 128, 143, 60, 224,
+    237, 240, 244, 128, 193, 194, 223, 225,
+    239, 241, 243, 245, 255, 95, 65, 90,
+    97, 122,
 }
 
 var _pattern_single_lengths []byte = []byte{
-    2, 1, 1,
+    0, 2, 0, 0, 0, 0, 0, 0,
+    0, 5, 1,
 }
 
 var _pattern_range_lengths []byte = []byte{
-    3, 0, 2,
+    0, 3, 1, 1, 1, 1, 1, 1,
+    1, 5, 2,
 }
 
 var _pattern_index_offsets []byte = []byte{
-    0, 6, 8,
+    0, 0, 6, 8, 10, 12, 14, 16,
+    18, 20, 31,
 }
 
+var _pattern_indicies []byte = []byte{
+    2, 1, 1, 1, 1, 0, 3, 4,
+    5, 4, 5, 4, 5, 4, 6, 4,
+    6, 4, 6, 4, 7, 8, 9, 10,
+    12, 4, 5, 6, 11, 4, 3, 1,
+    1, 1, 13,
+}
+
 var _pattern_trans_targs []byte = []byte{
-    1, 0, 0, 0, 0, 1, 2, 1,
-    0, 0, 0, 1, 1, 1,
+    9, 1, 9, 9, 0, 2, 4, 10,
+    3, 5, 6, 7, 8, 9,
 }
 
 var _pattern_trans_actions []byte = []byte{
-    7, 0, 0, 0, 0, 13, 5, 9,
-    0, 0, 0, 11, 13, 11,
+    13, 0, 7, 9, 0, 0, 0, 5,
+    0, 0, 0, 0, 0, 11,
 }
 
 var _pattern_to_state_actions []byte = []byte{
-    0, 1, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 1, 0,
 }
 
 var _pattern_from_state_actions []byte = []byte{
-    0, 3, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 3, 0,
 }
 
 var _pattern_eof_trans []byte = []byte{
-    13, 0, 14,
+    0, 1, 0, 0, 0, 0, 0, 0,
+    0, 0, 14,
 }
 
-const pattern_start int = 1
-const pattern_first_final int = 1
-const pattern_error int = -1
-const pattern_en_main int = 1
+const pattern_start int = 9
+const pattern_first_final int = 9
+const pattern_error int = 0
+const pattern_en_main int = 9
 
-//line pkg/logql/log/pattern/lexer.rl:14
+//line pkg/logql/log/pattern/lexer.rl:28
 const LEXER_ERROR = 0
 
-//line pkg/logql/log/pattern/lexer.rl:21
+//line pkg/logql/log/pattern/lexer.rl:35
 func (lex *lexer) Lex(out *exprSymType) int {
-    eof := lex.pe
-    tok := 0
+    eof := lex.pe
+    tok := 0
 
-//line pkg/logql/log/pattern/lexer.rl.go:77
+//line pkg/logql/log/pattern/lexer.rl.go:100
     {
-    var _klen int
-    var _trans int
-    var _acts int
-    var _nacts uint
-    var _keys int
-    if (lex.p) == (lex.pe) {
-        goto _test_eof
-    }
-_resume:
-    _acts = int(_pattern_from_state_actions[lex.cs])
-    _nacts = uint(_pattern_actions[_acts])
-    _acts++
-    for ; _nacts > 0; _nacts-- {
-        _acts++
-        switch _pattern_actions[_acts-1] {
-        case 1:
-//line NONE:1
-            lex.ts = (lex.p)
-
-//line pkg/logql/log/pattern/lexer.rl.go:97
-        }
-    }
+    var _klen int
+    var _trans int
+    var _acts int
+    var _nacts uint
+    var _keys int
+    if ( lex.p) == ( lex.pe) {
+        goto _test_eof
+    }
+    if lex.cs == 0 {
+        goto _out
+    }
+_resume:
+    _acts = int(_pattern_from_state_actions[ lex.cs])
+    _nacts = uint(_pattern_actions[_acts]); _acts++
+    for ; _nacts > 0; _nacts-- {
+        _acts++
+        switch _pattern_actions[_acts - 1] {
+        case 1:
+//line NONE:1
+            lex.ts = ( lex.p)
+
+//line pkg/logql/log/pattern/lexer.rl.go:123
+        }
+    }
-    _keys = int(_pattern_key_offsets[lex.cs])
-    _trans = int(_pattern_index_offsets[lex.cs])
-    _klen = int(_pattern_single_lengths[lex.cs])
-    if _klen > 0 {
-        _lower := int(_keys)
-        var _mid int
-        _upper := int(_keys + _klen - 1)
-        for {
-            if _upper < _lower {
-                break
-            }
-            _mid = _lower + ((_upper - _lower) >> 1)
-            switch {
-            case lex.data[(lex.p)] < _pattern_trans_keys[_mid]:
-                _upper = _mid - 1
-            case lex.data[(lex.p)] > _pattern_trans_keys[_mid]:
-                _lower = _mid + 1
-            default:
-                _trans += int(_mid - int(_keys))
-                goto _match
-            }
-        }
-        _keys += _klen
-        _trans += _klen
-    }
-    _klen = int(_pattern_range_lengths[lex.cs])
-    if _klen > 0 {
-        _lower := int(_keys)
-        var _mid int
-        _upper := int(_keys + (_klen << 1) - 2)
-        for {
-            if _upper < _lower {
-                break
-            }
-            _mid = _lower + (((_upper - _lower) >> 1) & ^1)
-            switch {
-            case lex.data[(lex.p)] < _pattern_trans_keys[_mid]:
-                _upper = _mid - 2
-            case lex.data[(lex.p)] > _pattern_trans_keys[_mid+1]:
-                _lower = _mid + 2
-            default:
-                _trans += int((_mid - int(_keys)) >> 1)
-                goto _match
-            }
-        }
-        _trans += _klen
-    }
-_match:
-_eof_trans:
-    lex.cs = int(_pattern_trans_targs[_trans])
-    if _pattern_trans_actions[_trans] == 0 {
-        goto _again
-    }
+    _keys = int(_pattern_key_offsets[ lex.cs])
+    _trans = int(_pattern_index_offsets[ lex.cs])
+    _klen = int(_pattern_single_lengths[ lex.cs])
+    if _klen > 0 {
+        _lower := int(_keys)
+        var _mid int
+        _upper := int(_keys + _klen - 1)
+        for {
+            if _upper < _lower {
+                break
+            }
+            _mid = _lower + ((_upper - _lower) >> 1)
+            switch {
+            case lex.data[( lex.p)] < _pattern_trans_keys[_mid]:
+                _upper = _mid - 1
+            case lex.data[( lex.p)] > _pattern_trans_keys[_mid]:
+                _lower = _mid + 1
+            default:
+                _trans += int(_mid - int(_keys))
+                goto _match
+            }
+        }
+        _keys += _klen
+        _trans += _klen
+    }
+    _klen = int(_pattern_range_lengths[ lex.cs])
+    if _klen > 0 {
+        _lower := int(_keys)
+        var _mid int
+        _upper := int(_keys + (_klen << 1) - 2)
+        for {
+            if _upper < _lower {
+                break
+            }
+            _mid = _lower + (((_upper - _lower) >> 1) & ^1)
+            switch {
+            case lex.data[( lex.p)] < _pattern_trans_keys[_mid]:
+                _upper = _mid - 2
+            case lex.data[( lex.p)] > _pattern_trans_keys[_mid + 1]:
+                _lower = _mid + 2
+            default:
+                _trans += int((_mid - int(_keys)) >> 1)
+                goto _match
+            }
+        }
+        _trans += _klen
+    }
+_match:
+    _trans = int(_pattern_indicies[_trans])
+_eof_trans:
+    lex.cs = int(_pattern_trans_targs[_trans])
+    if _pattern_trans_actions[_trans] == 0 {
+        goto _again
+    }
-    _acts = int(_pattern_trans_actions[_trans])
-    _nacts = uint(_pattern_actions[_acts])
-    _acts++
-    for ; _nacts > 0; _nacts-- {
-        _acts++
-        switch _pattern_actions[_acts-1] {
-        case 2:
-//line NONE:1
-            lex.te = (lex.p) + 1
-        case 3:
-//line pkg/logql/log/pattern/lexer.rl:30
-            lex.te = (lex.p) + 1
-            {
-                tok = lex.handle(lex.identifier(out))
-                (lex.p)++
-                goto _out
-            }
-        case 4:
-//line pkg/logql/log/pattern/lexer.rl:31
-            lex.te = (lex.p) + 1
-            {
-                tok = lex.handle(lex.literal(out))
-                (lex.p)++
-                goto _out
-            }
-        case 5:
-//line pkg/logql/log/pattern/lexer.rl:31
-            lex.te = (lex.p)
-            (lex.p)--
-            {
-                tok = lex.handle(lex.literal(out))
-                (lex.p)++
-                goto _out
-            }
-        case 6:
-//line pkg/logql/log/pattern/lexer.rl:31
-            (lex.p) = (lex.te) - 1
-            {
-                tok = lex.handle(lex.literal(out))
-                (lex.p)++
-                goto _out
-            }
-//line pkg/logql/log/pattern/lexer.rl.go:191
-        }
-    }
+    _acts = int(_pattern_trans_actions[_trans])
+    _nacts = uint(_pattern_actions[_acts]); _acts++
+    for ; _nacts > 0; _nacts-- {
+        _acts++
+        switch _pattern_actions[_acts-1] {
+        case 2:
+//line NONE:1
+            lex.te = ( lex.p)+1
+        case 3:
+//line pkg/logql/log/pattern/lexer.rl:44
+            lex.te = ( lex.p)+1
+            { tok = lex.handle(lex.identifier(out)); ( lex.p)++; goto _out
+            }
+        case 4:
+//line pkg/logql/log/pattern/lexer.rl:45
+            lex.te = ( lex.p)+1
+            { tok = lex.handle(lex.literal(out)); ( lex.p)++; goto _out
+            }
+        case 5:
+//line pkg/logql/log/pattern/lexer.rl:45
+            lex.te = ( lex.p)
+            ( lex.p)--
+            { tok = lex.handle(lex.literal(out)); ( lex.p)++; goto _out
+            }
+        case 6:
+//line pkg/logql/log/pattern/lexer.rl:45
+            ( lex.p) = ( lex.te) - 1
+            { tok = lex.handle(lex.literal(out)); ( lex.p)++; goto _out
+            }
+//line pkg/logql/log/pattern/lexer.rl.go:218
+        }
+    }
-_again:
-    _acts = int(_pattern_to_state_actions[lex.cs])
-    _nacts = uint(_pattern_actions[_acts])
-    _acts++
-    for ; _nacts > 0; _nacts-- {
-        _acts++
-        switch _pattern_actions[_acts-1] {
-        case 0:
-//line NONE:1
-            lex.ts = 0
-
-//line pkg/logql/log/pattern/lexer.rl.go:205
-        }
-    }
-    (lex.p)++
-    if (lex.p) != (lex.pe) {
-        goto _resume
-    }
-    _test_eof:
-    {
-    }
-    if (lex.p) == eof {
-        if _pattern_eof_trans[lex.cs] > 0 {
-            _trans = int(_pattern_eof_trans[lex.cs] - 1)
-            goto _eof_trans
-        }
-    }
-    _out:
-    {
-    }
+_again:
+    _acts = int(_pattern_to_state_actions[ lex.cs])
+    _nacts = uint(_pattern_actions[_acts]); _acts++
+    for ; _nacts > 0; _nacts-- {
+        _acts++
+        switch _pattern_actions[_acts-1] {
+        case 0:
+//line NONE:1
+            lex.ts = 0
+
+//line pkg/logql/log/pattern/lexer.rl.go:232
+        }
+    }
+    if lex.cs == 0 {
+        goto _out
+    }
+    ( lex.p)++
+    if ( lex.p) != ( lex.pe) {
+        goto _resume
+    }
+    _test_eof: {}
+    if ( lex.p) == eof {
+        if _pattern_eof_trans[ lex.cs] > 0 {
+            _trans = int(_pattern_eof_trans[ lex.cs] - 1)
+            goto _eof_trans
+        }
+    }
+    _out: {}
     }
 
-//line pkg/logql/log/pattern/lexer.rl:35
-    return tok
+//line pkg/logql/log/pattern/lexer.rl:49
+    return tok;
 }
 
-func (lex *lexer) init() {
-//line pkg/logql/log/pattern/lexer.rl.go:233
+func (lex *lexer) init() {
+//line pkg/logql/log/pattern/lexer.rl.go:263
     {
-    lex.cs = pattern_start
-    lex.ts = 0
-    lex.te = 0
-    lex.act = 0
+    lex.cs = pattern_start
+    lex.ts = 0
+    lex.te = 0
+    lex.act = 0
     }
-//line pkg/logql/log/pattern/lexer.rl:43
+//line pkg/logql/log/pattern/lexer.rl:57
 }

pkg/logql/log/pattern/lexer_test.go
@@ -18,6 +18,7 @@ func Test_Lex(t *testing.T) {
 		{`<_1foo>`, []int{IDENTIFIER}},
 		{`<_1foo> bar <buzz>`, []int{IDENTIFIER, LITERAL, LITERAL, LITERAL, LITERAL, LITERAL, IDENTIFIER}},
 		{`<1foo>`, []int{LITERAL, LITERAL, LITERAL, LITERAL, LITERAL, LITERAL}},
+		{`▶`, []int{LITERAL}},
 	} {
 		tc := tc
 		t.Run(tc.input, func(t *testing.T) {
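The new test case pins down the user-visible change: a multi-byte character now produces exactly one LITERAL token instead of one token per byte (previously `literal = any;` matched single bytes). A quick illustration of the byte/rune distinction the lexer now respects (standalone sketch, not the repo's lexer API):

```go
package main

import "fmt"

func main() {
	s := "a▶b"
	fmt.Println(len(s)) // 5: byte length ('▶' is 3 bytes)
	runes := 0
	for range s { // ranging over a string decodes one rune per iteration
		runes++
	}
	fmt.Println(runes) // 3: the lexer should now emit 3 LITERALs, not 5
}
```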

pkg/logql/log/pattern/parser_test.go
@@ -47,6 +47,11 @@ func Test_Parse(t *testing.T) {
 			expr{capture("ip"), literals(" - "), capture("user"), literals(" ["), capture("_"), literals(`] "`), capture("method"), literals(" "), capture("path"), literals(" "), capture('_'), literals(`" `), capture("status"), literals(" "), capture("size"), literals(" "), capture("url"), literals(" "), capture("user_agent")},
 			nil,
 		},
+		{
+			"▶",
+			expr{literals("▶")},
+			nil,
+		},
 	} {
 		tc := tc
 		actual, err := parseExpr(tc.input)

pkg/logql/log/pattern/pattern_test.go
@@ -1,6 +1,7 @@
 package pattern
 
 import (
+	"bytes"
 	"fmt"
 	"testing"
@@ -151,6 +152,26 @@ var fixtures = []struct {
 		[]string{"POST", "/api/v1/locations", "204", "154", "0", "226", "100", "10.0.35.28", "nsq2http", "tcp://10.0.2.1:80"},
 		true,
 	},
+	{
+		// UTF-8: Matches a unicode character
+		`unicode <emoji> character`,
+		`unicode 🤷 character`,
+		[]string{`🤷`},
+		true,
+	},
+	{
+		// UTF-8: Parses unicode character as literal
+		"unicode ▶ <what>",
+		"unicode ▶ character",
+		[]string{"character"},
+		true,
+	},
 }
 
+func Test_BytesIndexUnicode(t *testing.T) {
+	data := []byte("Hello ▶ World")
+	index := bytes.Index(data, []byte("▶"))
+	require.Equal(t, 6, index)
+}
+
 func Test_matcher_Matches(t *testing.T) {
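Test_BytesIndexUnicode documents that offsets stay byte-based even though literals are now decoded as runes: `bytes.Index` reports 6 because `"Hello "` is six bytes, and `▶` itself occupies three more. A small sketch of the arithmetic (standard library only):

```go
package main

import (
	"bytes"
	"fmt"
	"unicode/utf8"
)

func main() {
	data := []byte("Hello ▶ World")
	fmt.Println(bytes.Index(data, []byte("▶")))     // 6: byte offset, not rune offset
	fmt.Println(utf8.RuneLen('▶'))                  // 3: U+25B6 encodes as E2 96 B6
	fmt.Println(bytes.Index(data, []byte("World"))) // 10 = 6 + 3 + 1 (the space)
}
```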

pkg/pattern/drain/drain_test.go
@@ -508,6 +508,19 @@ func TestDrain_TrainGeneratesPatternsMatchableByLokiPatternFilter(t *testing.T)
 				` test 4 test test`,
 			},
 		},
+		{
+			name:  "Unicode characters are matchable",
+			drain: New(DefaultConfig(), nil),
+			inputLines: []string{
+				`13:25:18.033470 ▶ INFO route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_max_999 0.00 1717075518`,
+				`13:25:18.033422 ▶ INFO route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_max_99 0.00 1717075518`,
+				`13:25:18.033394 ▶ INFO route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_max_95 0.00 1717075518`,
+				`13:25:18.033364 ▶ INFO route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_max_75 0.00 1717075518`,
+				`13:25:18.033335 ▶ INFO route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_max_50 0.00 1717075518`,
+				`13:25:18.033304 ▶ INFO route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_std 0.00 1717075518`,
+				`13:25:18.033281 ▶ INFO route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_mean 0.00 1717075518`,
+			},
+		},
 	}
 	for _, tt := range tests {
 		tt := tt
@@ -523,7 +536,8 @@ func TestDrain_TrainGeneratesPatternsMatchableByLokiPatternFilter(t *testing.T)
 			for _, line := range tt.inputLines {
 				passes := matcher.Test([]byte(line))
-				require.Truef(t, passes, `Line %q should match extracted pattern`, line)
+				require.Truef(t, passes, "Line should match extracted pattern: \nPatt[%q] \nLine[%q]", cluster.String(), line)
 			}
 		})
 	}
