loki/pkg/storage/bloom/v1/tokenizer_test.go


package v1

import (
    "testing"

    "github.com/stretchr/testify/require"

    "github.com/grafana/loki/pkg/logproto"
)

const BigFile = "../../../logql/sketch/testdata/war_peace.txt"
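
// A note on semantics, inferred from the test cases below:
// NewNGramTokenizer(n, skip) emits n-grams of runes and advances skip+1 rune
// positions between emitted tokens, so (3, 0) over "abcdefg" yields every
// trigram while (3, 1) yields "abc", "cde", "efg". Inputs shorter than n
// runes yield no tokens.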
func TestNGramIterator(t *testing.T) {
    var (
        three      = NewNGramTokenizer(3, 0)
        threeSkip1 = NewNGramTokenizer(3, 1)
        threeSkip3 = NewNGramTokenizer(3, 3)
        four       = NewNGramTokenizer(4, 0)
    )

    for _, tc := range []struct {
        desc  string
        t     *NGramTokenizer
        input string
        exp   []string
    }{
        {
            t:     three,
            input: "",
            exp:   []string{},
        },
        {
            t:     three,
            input: "ab",
            exp:   []string{},
        },
        {
            t:     three,
            input: "abcdefg",
            exp:   []string{"abc", "bcd", "cde", "def", "efg"},
        },
        {
            t:     threeSkip1,
            input: "abcdefg",
            exp:   []string{"abc", "cde", "efg"},
        },
        {
            t:     threeSkip3,
            input: "abcdefgh",
            exp:   []string{"abc", "efg"},
        },
        {
            t:     three,
            input: "日本語",
            exp:   []string{"日本語"},
        },
        {
            t:     four,
            input: "日本語日本語",
            exp: []string{
                "日本語日",
                "本語日本",
                "語日本語",
            },
        },
    } {
        t.Run(tc.desc, func(t *testing.T) {
            itr := tc.t.Tokens(tc.input)
            for _, exp := range tc.exp {
                require.True(t, itr.Next())
                require.Equal(t, exp, string(itr.At()))
            }
            require.False(t, itr.Next())
        })
    }
}
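
// NewPrefixedTokenIter, as exercised below, wraps a token iterator and
// prepends a fixed byte prefix to every token it yields; the cases use the
// literal prefix "0123".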
func TestPrefixedIterator(t *testing.T) {
    var (
        three = NewNGramTokenizer(3, 0)
    )

    for _, tc := range []struct {
        desc  string
        input string
        exp   []string
    }{
        {
            input: "",
            exp:   []string{},
        },
        {
            input: "ab",
            exp:   []string{},
        },
        {
            input: "abcdefg",
            exp:   []string{"0123abc", "0123bcd", "0123cde", "0123def", "0123efg"},
        },
        {
            input: "日本語",
            exp:   []string{"0123日本語"},
        },
    } {
        prefix := []byte("0123")
        t.Run(tc.desc, func(t *testing.T) {
            itr := NewPrefixedTokenIter(prefix, len(prefix), three.Tokens(tc.input))
            for _, exp := range tc.exp {
                require.True(t, itr.Next())
                require.Equal(t, exp, string(itr.At()))
            }
            require.False(t, itr.Next())
        })
    }
}
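
// lorem is a small, repetitive corpus fed to BenchmarkTokens below.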
const lorem = `
lorum ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et dolore magna
aliqua ut enim ad minim veniam quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat
duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur
sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est
laborum ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et dolore magna
aliqua ut enim ad minim veniam quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat
duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur
sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est
`
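
// BenchmarkTokens compares raw iteration cost over the lorem corpus for plain
// and chunk-prefixed n-gram iterators. prefixedToken, defined elsewhere in
// this package, is assumed here to return a reusable buffer pre-seeded with a
// chunk-reference prefix, plus that prefix's length.
// Run with: go test -bench=BenchmarkTokens ./pkg/storage/bloom/v1/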
func BenchmarkTokens(b *testing.B) {
    var (
        v2Three      = NewNGramTokenizer(3, 0)
        v2ThreeSkip1 = NewNGramTokenizer(3, 1)
    )

    type impl struct {
        desc string
        f    func()
    }
    type tc struct {
        desc  string
        impls []impl
    }
    for _, tc := range []tc{
        {
            desc: "three",
            impls: []impl{
                {
                    desc: "v2",
                    f: func() {
                        itr := v2Three.Tokens(lorem)
                        for itr.Next() {
                            _ = itr.At()
                        }
                    },
                },
            },
        },
        {
            desc: "threeSkip1",
            impls: []impl{
                {
                    desc: "v2",
                    f: func() {
                        itr := v2ThreeSkip1.Tokens(lorem)
                        for itr.Next() {
                            _ = itr.At()
                        }
                    },
                },
            },
        },
        {
            desc: "threeChunk",
            impls: []impl{
                {
                    desc: "v2",
                    f: func() func() {
                        buf, prefixLn := prefixedToken(v2Three.N, logproto.ChunkRef{})
                        return func() {
                            itr := NewPrefixedTokenIter(buf, prefixLn, v2Three.Tokens(lorem))
                            for itr.Next() {
                                _ = itr.At()
                            }
                        }
                    }(),
                },
            },
        },
        {
            desc: "threeSkip1Chunk",
            impls: []impl{
                {
                    desc: "v2",
                    f: func() func() {
                        buf, prefixLn := prefixedToken(v2Three.N, logproto.ChunkRef{})
                        return func() {
                            itr := NewPrefixedTokenIter(buf, prefixLn, v2ThreeSkip1.Tokens(lorem))
                            for itr.Next() {
                                _ = itr.At()
                            }
                        }
                    }(),
                },
            },
        },
    } {
        b.Run(tc.desc, func(b *testing.B) {
            for _, impl := range tc.impls {
                b.Run(impl.desc, func(b *testing.B) {
                    for i := 0; i < b.N; i++ {
                        impl.f()
                    }
                })
            }
        })
    }
}