mirror of https://github.com/grafana/loki
Initial loki bloom creation library (#10957)
**What this PR does / why we need it**:
Optimization and benchmarking of various LRUs that sit in front of the bloom filters.
Initial creation of a library that can be used to create blooms from chunks.
**Which issue(s) this PR fixes**:
Fixes #<issue number>
**Special notes for your reviewer**:
**Checklist**
- [ ] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**)
- [ ] Documentation added
- [ ] Tests updated
- [ ] `CHANGELOG.md` updated
- [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label
- [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md`
- [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](d10549e3ec)
---------
Co-authored-by: Owen Diehl <ow.diehl@gmail.com>
@@ -0,0 +1,111 @@
```go
package v1

import (
	"context"
	"math"
	"time"

	"github.com/go-kit/log/level"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/loki/pkg/chunkenc"
	"github.com/grafana/loki/pkg/logproto"
	"github.com/grafana/loki/pkg/logql/log"
	"github.com/grafana/loki/pkg/storage/bloom/v1/filter"
	"github.com/grafana/loki/pkg/storage/chunk"
	util_log "github.com/grafana/loki/pkg/util/log"
)

type metrics struct{}

/*
BloomTokenizer is a utility that converts either Loki chunks or individual lines into tokens.
These tokens are n-grams, representing adjacent letters, and are used to populate a bloom filter.
https://en.wikipedia.org/wiki/Bloom_filter
Bloom filters are utilized for faster lookups of log lines.
*/
type BloomTokenizer struct {
	metrics *metrics

	lineTokenizer    Tokenizer
	chunkIDTokenizer *WrappedTokenizer
	cache            map[string]interface{}
}

const CacheSize = 150000

// NewBloomTokenizer returns a new instance of the Bloom Tokenizer.
// Warning: the tokens returned use the same byte slice to reduce allocations. This has three consequences:
// 1) The token slices generated must not be mutated externally.
// 2) The token slices must not be used after the next call to `Tokens()`, as that call repopulates them.
// 3) This is not thread safe.
func NewBloomTokenizer(reg prometheus.Registerer) (*BloomTokenizer, error) {
	t := &BloomTokenizer{
		metrics: newMetrics(reg),
	}
	t.cache = make(map[string]interface{}, CacheSize)
	t.lineTokenizer = NewNGramTokenizer(4, 5, 0) // default to 4-grams, no skip
	t.chunkIDTokenizer = ChunkIDTokenizer(t.lineTokenizer)

	level.Info(util_log.Logger).Log("msg", "created bloom tokenizer")

	return t, nil
}

func (bt *BloomTokenizer) SetLineTokenizer(t Tokenizer) {
	bt.lineTokenizer = t
	bt.chunkIDTokenizer = ChunkIDTokenizer(bt.lineTokenizer)
}

// TODO: Something real here with metrics
func newMetrics(r prometheus.Registerer) *metrics {
	return &metrics{}
}

func clearCache(cache map[string]interface{}) {
	for k := range cache {
		delete(cache, k)
	}
}

// PopulateSBF tokenizes every line of every chunk and adds the resulting
// n-grams (both chunk-prefixed and raw) to the given scalable bloom filter.
func (bt *BloomTokenizer) PopulateSBF(sbf *filter.ScalableBloomFilter, chunks []chunk.Chunk) {
	clearCache(bt.cache)
	for idx := range chunks {
		lc := chunks[idx].Data.(*chunkenc.Facade).LokiChunk()
		bt.chunkIDTokenizer.Reinit(chunks[idx].ChunkRef)

		// TODO: error handling
		itr, _ := lc.Iterator(
			context.Background(),
			time.Unix(0, 0), // TODO: Parameterize/better handle the timestamps?
			time.Unix(0, math.MaxInt64),
			logproto.FORWARD,
			log.NewNoopPipeline().ForStream(chunks[idx].Metric),
		)

		for itr.Next() && itr.Error() == nil {
			toks := bt.chunkIDTokenizer.Tokens(itr.Entry().Line)

			for _, tok := range toks {
				if tok.Key != nil {
					str := string(tok.Key)
					_, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters
					if !found {
						bt.cache[str] = nil

						sbf.TestAndAdd(tok.Key)

						if len(bt.cache) > CacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity of log lines near each other
							clearCache(bt.cache)
						}
					}
				}
			}
		}
	} // for each chunk
}

// TokenizeLine tokenizes a single line with the plain line tokenizer (no chunk prefix).
func (bt *BloomTokenizer) TokenizeLine(line string) []Token {
	return bt.lineTokenizer.Tokens(line)
}
```
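The file above exposes two entry points: `PopulateSBF`, which walks chunks, and `TokenizeLine`, which tokenizes a single line. Below is a minimal sketch of driving the tokenizer on its own, assuming the package is importable as `github.com/grafana/loki/pkg/storage/bloom/v1` (inferred from the `filter` subpackage path; the path is not stated in this diff):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"

	v1 "github.com/grafana/loki/pkg/storage/bloom/v1"
)

func main() {
	bt, err := v1.NewBloomTokenizer(prometheus.NewRegistry())
	if err != nil {
		panic(err)
	}

	// TokenizeLine returns the 4-gram tokens for a single line. Per the
	// warning on NewBloomTokenizer, the returned slice (and its Keys) is
	// reused on the next call, so copy anything that needs to be kept.
	for _, tok := range bt.TokenizeLine("error connecting to db") {
		fmt.Printf("%q\n", tok.Key)
	}
}
```

Anything built on `TokenizeLine` has to respect the reuse warning on `NewBloomTokenizer`: copy token keys before the next call if they need to outlive it.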
@@ -0,0 +1,30 @@
```go
package v1

import (
	"fmt"
	"testing"

	"github.com/prometheus/client_golang/prometheus"
)

// BenchmarkMapClear measures resetting the tokenizer cache by deleting every
// key in place (what clearCache does).
func BenchmarkMapClear(b *testing.B) {
	bt, _ := NewBloomTokenizer(prometheus.DefaultRegisterer)
	for i := 0; i < b.N; i++ {
		for k := 0; k < CacheSize; k++ {
			bt.cache[fmt.Sprint(k)] = k
		}

		clearCache(bt.cache)
	}
}

// BenchmarkNewMap measures resetting the cache by allocating a fresh map instead.
func BenchmarkNewMap(b *testing.B) {
	bt, _ := NewBloomTokenizer(prometheus.DefaultRegisterer)
	for i := 0; i < b.N; i++ {
		for k := 0; k < CacheSize; k++ {
			bt.cache[fmt.Sprint(k)] = k
		}

		bt.cache = make(map[string]interface{}, CacheSize)
	}
}
```
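These two benchmarks compare the strategies for resetting the tokenizer's dedupe cache: deleting every key in place, as `clearCache` does, versus allocating a fresh map. A standalone, rough comparison of the same two strategies, independent of the PR's types (sizes arbitrary), might look like this:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	const n = 150000 // mirrors CacheSize above
	m := make(map[string]interface{}, n)
	fill := func() {
		for i := 0; i < n; i++ {
			m[fmt.Sprint(i)] = nil
		}
	}

	// Strategy 1: clear in place. This exact range-delete pattern is
	// recognized by the Go compiler and lowered to a single runtime map
	// clear, so the existing buckets are kept and reused.
	fill()
	start := time.Now()
	for k := range m {
		delete(m, k)
	}
	fmt.Println("clear in place:   ", time.Since(start))

	// Strategy 2: drop the map and allocate a new one. The old buckets
	// become garbage and fresh ones are allocated up front.
	fill()
	start = time.Now()
	m = make(map[string]interface{}, n)
	fmt.Println("allocate new map: ", time.Since(start))
	_ = m
}
```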
@@ -0,0 +1,162 @@
```go
package v1

import (
	"encoding/binary"
	"unicode/utf8"

	"github.com/grafana/loki/pkg/logproto"
)

type Token struct {
	Key []byte
}

type Tokenizer interface {
	Tokens(line string) []Token
	GetSkip() int
	GetMin() int
	GetMax() int
}

const TokenBufferSize = 4096
const TokenKeySize = 132

type NgramTokenizer struct {
	// [min,max) exclusivity
	min, max, skip      int
	buffers             [][]rune // circular buffers used for ngram generation
	runeBuffer          []byte   // buffer used for token generation
	internalTokenBuffer []Token  // circular buffer for tokens
}

/*
N-grams (https://en.wikipedia.org/wiki/N-gram) are a series of 'n' adjacent characters in a string.
These will be utilized for the bloom filters to allow for fuzzy searching.
*/
func NewNGramTokenizer(min, max, skip int) *NgramTokenizer {
	capacity := max - min
	t := &NgramTokenizer{
		min:                 min,
		max:                 max,
		skip:                skip,
		buffers:             make([][]rune, capacity),
		runeBuffer:          make([]byte, 0, max*4),
		internalTokenBuffer: make([]Token, 0, TokenBufferSize),
	}

	for i := range t.buffers {
		t.buffers[i] = make([]rune, t.min+i)
	}

	for i := 0; i < cap(t.internalTokenBuffer); i++ {
		t.internalTokenBuffer = append(t.internalTokenBuffer, Token{Key: make([]byte, 0, TokenKeySize)})
	}

	return t
}

func (t *NgramTokenizer) GetSkip() int {
	return t.skip
}

func (t *NgramTokenizer) GetMin() int {
	return t.min
}

func (t *NgramTokenizer) GetMax() int {
	return t.max
}

// Tokens returns every n-gram of the line for each length in [min,max), honoring
// the configured skip. The returned slice and its Keys are reused on the next call.
func (t *NgramTokenizer) Tokens(line string) []Token {
	var i int // rune index (not position that is measured in the range loop)
	numToks := 0
	for _, r := range line {
		// j is the index of the buffer to use
		for j := 0; j < (t.max - t.min); j++ {
			// n is the length of the ngram
			n := j + t.min
			// pos is the position in the buffer to overwrite
			pos := i % n
			t.buffers[j][pos] = r

			if i >= n-1 && (i+1-n)%(t.skip+1) == 0 {
				t.runeBuffer = reassemble(t.buffers[j], (i+1)%n, t.runeBuffer)
				if numToks >= cap(t.internalTokenBuffer) || numToks == len(t.internalTokenBuffer) {
					t.internalTokenBuffer = append(t.internalTokenBuffer, Token{Key: make([]byte, 0, TokenKeySize)})
				}
				t.internalTokenBuffer[numToks].Key = t.internalTokenBuffer[numToks].Key[:0]
				t.internalTokenBuffer[numToks].Key = append(t.internalTokenBuffer[numToks].Key, t.runeBuffer...)
				numToks++
			}
		}
		i++
	}
	return t.internalTokenBuffer[0:numToks]
}

// reassemble copies the circular rune buffer, starting at pos, into result as UTF-8 bytes.
func reassemble(buf []rune, pos int, result []byte) []byte {
	result = result[:0] // Reset the result slice
	for i := 0; i < len(buf); i++ {
		cur := (pos + i) % len(buf)
		result = utf8.AppendRune(result, buf[cur])
	}
	return result
}

// chunkIDTransformer prepends prefix to the token key in place: it appends the
// prefix and then the original key to the same backing array, then re-slices
// past the original key, so the result reads prefix+key.
func chunkIDTransformer(tok Token, prefix []byte) Token {
	tok.Key = append(append(tok.Key, prefix...), tok.Key...)[len(tok.Key):]
	return tok
}

type WrappedTokenizer struct {
	t           Tokenizer
	tokenBuffer []Token
	prefix      []byte
	i64buf      []byte
	i32buf      []byte
}

// Tokens returns, for every token produced by the wrapped tokenizer, both a
// chunk-prefixed copy and the original token.
func (w *WrappedTokenizer) Tokens(line string) []Token {
	w.tokenBuffer = w.tokenBuffer[:0] // Reset the result slice
	toks := w.t.Tokens(line)
	for _, tok := range toks {
		w.tokenBuffer = append(w.tokenBuffer, chunkIDTransformer(tok, w.prefix), tok)
	}

	return w.tokenBuffer
}

func (w *WrappedTokenizer) GetSkip() int {
	return w.t.GetSkip()
}

func (w *WrappedTokenizer) GetMin() int {
	return w.t.GetMin()
}

func (w *WrappedTokenizer) GetMax() int {
	return w.t.GetMax()
}

func ChunkIDTokenizer(t Tokenizer) *WrappedTokenizer {
	p := make([]byte, 0, 256)
	return &WrappedTokenizer{
		t:           t,
		tokenBuffer: make([]Token, 0, TokenBufferSize),
		prefix:      p,
		i64buf:      make([]byte, binary.MaxVarintLen64),
		i32buf:      make([]byte, 4),
	}
}

// Reinit rebuilds the prefix (from, through, checksum) for a new chunk so the
// same WrappedTokenizer can be reused across chunks without reallocating.
func (w *WrappedTokenizer) Reinit(chk logproto.ChunkRef) {
	w.prefix = w.prefix[:0]

	binary.PutVarint(w.i64buf, int64(chk.From))
	w.prefix = append(w.prefix, w.i64buf...)
	binary.PutVarint(w.i64buf, int64(chk.Through))
	w.prefix = append(w.prefix, w.i64buf...)
	binary.LittleEndian.PutUint32(w.i32buf, chk.Checksum)
	w.prefix = append(w.prefix, w.i32buf...)
}
```
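As a worked example of the tokenizer above: a 4-gram tokenizer with no skip turns a line into every window of four adjacent runes, and the chunk-ID wrapper emits each of those twice (prefixed and raw). A minimal sketch, again assuming the `github.com/grafana/loki/pkg/storage/bloom/v1` import path:

```go
package main

import (
	"fmt"

	"github.com/grafana/loki/pkg/logproto"
	v1 "github.com/grafana/loki/pkg/storage/bloom/v1"
)

func main() {
	// min=4, max=5, skip=0: only 4-grams, emitted at every position.
	t := v1.NewNGramTokenizer(4, 5, 0)

	for _, tok := range t.Tokens("abcdef") {
		fmt.Printf("%s ", tok.Key) // prints: abcd bcde cdef
	}
	fmt.Println()

	// The wrapped tokenizer emits each n-gram twice: once prefixed with the
	// chunk reference set via Reinit, and once as-is.
	w := v1.ChunkIDTokenizer(t)
	w.Reinit(logproto.ChunkRef{})        // zero-valued ref, purely for illustration
	fmt.Println(len(w.Tokens("abcdef"))) // 6: three 4-grams, prefixed + raw
}
```

Because the returned `Token` slice and its `Key` buffers are reused between calls to `Tokens`, callers copy whatever they need to retain, which is what `PopulateSBF` does by interning keys as strings in its cache.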
@@ -0,0 +1,206 @@
```go
package main

import (
	"encoding/binary"
	"strconv"
	"testing"

	"github.com/stretchr/testify/require"
)

// Benchmarks and tests for the candidate LRU/set implementations that sit in
// front of the bloom filters; the implementations are defined elsewhere in
// this package.
var num = 1000000

func BenchmarkLRU1Put(b *testing.B) {
	cache := NewLRUCache(num)
	for i := 0; i < b.N; i++ {
		cache.Put(strconv.Itoa(i))
	}
}

func BenchmarkLRU1Get(b *testing.B) {
	cache := NewLRUCache(num)
	for i := 0; i < num; i++ {
		cache.Put(strconv.Itoa(i))
	}
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		cache.Get(strconv.Itoa(i))
	}
}

func BenchmarkLRU2Put(b *testing.B) {
	cache := NewLRUCache2(num)
	for i := 0; i < b.N; i++ {
		cache.Put(strconv.Itoa(i))
	}
}

func BenchmarkLRU2Get(b *testing.B) {
	cache := NewLRUCache2(num)
	for i := 0; i < num; i++ {
		cache.Put(strconv.Itoa(i))
	}
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		cache.Get(strconv.Itoa(i))
	}
}

func BenchmarkLRU4Put(b *testing.B) {
	cache := NewLRUCache4(num)
	for i := 0; i < b.N; i++ {
		cache.Put([]byte(strconv.Itoa(i)))
	}
}

func BenchmarkLRU4Get(b *testing.B) {
	cache := NewLRUCache4(num)
	for i := 0; i < num; i++ {
		cache.Put([]byte(strconv.Itoa(i)))
	}
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		cache.Get([]byte(strconv.Itoa(i)))
	}
}

func TestByteSet(t *testing.T) {
	set := NewByteSet(30)
	set.Put("fooa")
	set.PutBytes([]byte("foob"))
	for _, tc := range []struct {
		desc  string
		input string
		exp   bool
	}{
		{
			desc:  "test string put",
			input: "fooa",
			exp:   true,
		},
		{
			desc:  "test byte put",
			input: "foob",
			exp:   true,
		},
	} {
		t.Run(tc.desc, func(t *testing.T) {
			require.Equal(t, tc.exp, set.Get(tc.input))
		})
	}
}

func TestByteKeyLRUCache(t *testing.T) {
	set := NewByteKeyLRUCache(30)
	set.Put(NewFourByteKeyFromSlice([]byte("fooa")))
	for _, tc := range []struct {
		desc  string
		input string
		exp   bool
	}{
		{
			desc:  "test valid",
			input: "fooa",
			exp:   true,
		},
		{
			desc:  "test not valid",
			input: "foob",
			exp:   false,
		},
	} {
		t.Run(tc.desc, func(t *testing.T) {
			require.Equal(t, tc.exp, set.Get(NewFourByteKeyFromSlice([]byte(tc.input))))
		})
	}
}

func TestLRUCache5(t *testing.T) {
	set := NewLRUCache5(30)
	set.Put("fooa")
	for _, tc := range []struct {
		desc  string
		input string
		exp   bool
	}{
		{
			desc:  "test valid",
			input: "fooa",
			exp:   true,
		},
		{
			desc:  "test not valid",
			input: "foob",
			exp:   false,
		},
	} {
		t.Run(tc.desc, func(t *testing.T) {
			require.Equal(t, tc.exp, set.Get(tc.input))
		})
	}
}

func BenchmarkLRU5Put(b *testing.B) {
	cache := NewLRUCache5(num)
	for i := 0; i < b.N; i++ {
		cache.Put(strconv.Itoa(i))
	}
}

func BenchmarkLRU5Get(b *testing.B) {
	cache := NewLRUCache5(num)
	for i := 0; i < num; i++ {
		cache.Put(strconv.Itoa(i))
	}
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		cache.Get(strconv.Itoa(i))
	}
}

func BenchmarkByteKeyLRUCacheSet(b *testing.B) {
	buf := make([]byte, 26)
	cache := NewByteKeyLRUCache(num)
	for i := 0; i < b.N; i++ {
		binary.LittleEndian.PutUint64(buf, uint64(i))

		cache.Put(NewThirtyOneByteKeyFromSlice(buf))
	}
}

func BenchmarkByteKeyLRUCacheGet(b *testing.B) {
	buf := make([]byte, 26)

	cache := NewByteKeyLRUCache(num)
	for i := 0; i < b.N; i++ {
		binary.LittleEndian.PutUint64(buf, uint64(i))

		cache.Put(NewThirtyOneByteKeyFromSlice(buf))
	}
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		binary.LittleEndian.PutUint64(buf, uint64(i))

		cache.Get(NewThirtyOneByteKeyFromSlice(buf))
	}
}

func BenchmarkByteSetPut(b *testing.B) {
	cache := NewByteSet(num)
	for i := 0; i < b.N; i++ {
		cache.Put(strconv.Itoa(i))
	}
}

func BenchmarkByteSetGet(b *testing.B) {
	cache := NewByteSet(num)
	for i := 0; i < b.N; i++ {
		cache.Put(strconv.Itoa(i))
	}
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		cache.Get(strconv.Itoa(i))
	}
}
```
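The `NewLRUCache*`, `NewByteSet`, and `NewByteKeyLRUCache` constructors exercised here are defined elsewhere in the PR, not in this hunk. Purely as an illustration of the `Put`/`Get` shape these benchmarks assume (a string-keyed cache with membership-style `Get`), a minimal LRU sketch over the standard `container/list` might look like this; it is not the PR's implementation:

```go
package main

import "container/list"

// lruSketch is an illustrative fixed-capacity LRU keyed by string.
type lruSketch struct {
	capacity int
	ll       *list.List               // front = most recently used
	items    map[string]*list.Element // key -> list element holding the key
}

func newLRUSketch(capacity int) *lruSketch {
	return &lruSketch{
		capacity: capacity,
		ll:       list.New(),
		items:    make(map[string]*list.Element, capacity),
	}
}

// Put inserts the key (or refreshes it) and evicts the least recently used
// entry once the capacity is exceeded.
func (c *lruSketch) Put(key string) {
	if el, ok := c.items[key]; ok {
		c.ll.MoveToFront(el)
		return
	}
	c.items[key] = c.ll.PushFront(key)
	if c.ll.Len() > c.capacity {
		oldest := c.ll.Back()
		c.ll.Remove(oldest)
		delete(c.items, oldest.Value.(string))
	}
}

// Get reports membership and marks the key as recently used.
func (c *lruSketch) Get(key string) bool {
	el, ok := c.items[key]
	if ok {
		c.ll.MoveToFront(el)
	}
	return ok
}

func main() {
	c := newLRUSketch(2)
	c.Put("a")
	c.Put("b")
	c.Put("c")                                  // evicts "a"
	println(c.Get("a"), c.Get("b"), c.Get("c")) // false true true
}
```

The benchmarks above then come down to how cheaply such a structure absorbs `Put`s at capacity and answers `Get`s, which is what decides which variant fronts the bloom filter.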
@@ -1,255 +0,0 @@
This hunk removes the earlier prototype tokenizer (package `main`), which the `v1` tokenizer above supersedes:

```go
package main

import (
	"encoding/binary"
	"unicode/utf8"

	"github.com/grafana/loki/pkg/logproto"
)

type Token struct {
	Key   []byte
	Value string
}

type Tokenizer interface {
	Tokens(line string) []Token
	getSkip() int
	getMin() int
	getMax() int
}

/*
type logfmtTokenizer struct {
	parser *log.LogfmtParser
	lbls   *log.LabelsBuilder
}

func (t *logfmtTokenizer) Tokens(line string) []Token {
	t.lbls.Reset()
	t.parser.Process(0, []byte(line), t.lbls)
	ls := t.lbls.LabelsResult().Labels()
	res := make([]Token, 0, len(ls))
	for _, l := range ls {
		res = append(res, Token{Key: l.Name, Value: l.Value})
	}
	return res
}

func newLogfmtTokenizer() *logfmtTokenizer {
	return &logfmtTokenizer{
		// non strict, allow empty values
		parser: log.NewLogfmtParser(false, true),
		lbls:   log.NewBaseLabelsBuilder().ForLabels(nil, 0),
	}
}
*/

type ngramTokenizer struct {
	// [min,max) exclusivity
	min, max, skip      int
	buffers             [][]rune // circular buffers used for ngram generation
	runeBuffer          []byte   // buffer used for token generation
	tokenBuffer         []Token  // buffer used for holding tokens that is returned
	internalTokenBuffer []Token  // circular buffer for tokens
}

func newNGramTokenizer(min, max, skip int) *ngramTokenizer {
	capacity := max - min
	t := &ngramTokenizer{
		min:     min,
		max:     max,
		skip:    skip,
		buffers: make([][]rune, capacity),
	}
	for i := t.min; i < t.max; i++ {
		t.buffers[i-t.min] = make([]rune, i)
	}
	t.runeBuffer = make([]byte, 0, max*4)
	t.tokenBuffer = make([]Token, 0, 1024)
	t.internalTokenBuffer = make([]Token, 0, 1024)
	for i := 0; i < cap(t.internalTokenBuffer); i++ {
		tok := Token{}
		tok.Key = make([]byte, 0, 132)
		t.internalTokenBuffer = append(t.internalTokenBuffer, tok)
	}

	return t
}

func (t *ngramTokenizer) getSkip() int {
	return t.skip
}

func (t *ngramTokenizer) getMin() int {
	return t.min
}

func (t *ngramTokenizer) getMax() int {
	return t.max
}

func (t *ngramTokenizer) Tokens(line string) []Token {
	t.tokenBuffer = t.tokenBuffer[:0] // Reset the result slice
	var i int                         // rune index (not position that is measured in the range loop)
	numToks := 0
	for _, r := range line {
		// j is the index of the buffer to use
		for j := 0; j < (t.max - t.min); j++ {
			// n is the length of the ngram
			n := j + t.min
			// pos is the position in the buffer to overwrite
			pos := i % n
			t.buffers[j][pos] = r

			if i >= n-1 && (i+1-n)%(t.skip+1) == 0 {
				t.runeBuffer = reassemble(t.buffers[j], (i+1)%n, t.runeBuffer)
				if numToks >= cap(t.internalTokenBuffer) || numToks == len(t.internalTokenBuffer) {
					tok := Token{}
					tok.Key = make([]byte, 0, 132)
					t.internalTokenBuffer = append(t.internalTokenBuffer, tok)
				}
				t.internalTokenBuffer[numToks].Key = t.internalTokenBuffer[numToks].Key[:0]
				t.internalTokenBuffer[numToks].Key = append(t.internalTokenBuffer[numToks].Key, t.runeBuffer...)
				t.internalTokenBuffer[numToks].Value = string(t.internalTokenBuffer[numToks].Key)
				numToks++
			}
		}
		i++
	}
	t.tokenBuffer = append(t.tokenBuffer, t.internalTokenBuffer[:numToks]...)
	return t.tokenBuffer
}

func (t *ngramTokenizer) OldTokens(line string) []Token {
	t.tokenBuffer = t.tokenBuffer[:0] // Reset the result slice
	var i int                         // rune index (not position that is measured in the range loop)
	for _, r := range line {
		// j is the index of the buffer to use
		for j := 0; j < (t.max - t.min); j++ {
			// n is the length of the ngram
			n := j + t.min
			// pos is the position in the buffer to overwrite
			pos := i % n
			t.buffers[j][pos] = r

			if i >= n-1 && (i+1-n)%(t.skip+1) == 0 {
				t.runeBuffer = reassemble(t.buffers[j], (i+1)%n, t.runeBuffer)
				b := Token{}
				b.Key = make([]byte, 0, 132) // TODO: Yeah, that's too big, but I didn't feel like doing the math at the end of the day
				b.Key = append(b.Key, t.runeBuffer...)
				b.Value = string(b.Key)
				t.tokenBuffer = append(t.tokenBuffer, b)
			}
		}
		i++
	}
	return t.tokenBuffer
}

func reassemble(buf []rune, pos int, result []byte) []byte {
	result = result[:0] // Reset the result slice
	for i := 0; i < len(buf); i++ {
		cur := (pos + i) % len(buf)
		result = utf8.AppendRune(result, buf[cur])
	}
	return result
}

type WrappedTokenizer struct {
	t           Tokenizer
	f           func(Token) Token
	tokenBuffer []Token
	prefix      []byte
	i64buf      []byte
	i32buf      []byte
}

func (w *WrappedTokenizer) Tokens(line string) []Token {
	w.tokenBuffer = w.tokenBuffer[:0] // Reset the result slice
	toks := w.t.Tokens(line)
	for _, tok := range toks {
		w.tokenBuffer = append(w.tokenBuffer, w.f(tok))
	}
	return append(w.tokenBuffer, toks...)
}

func (w *WrappedTokenizer) getSkip() int {
	return w.t.getSkip()
}

func (w *WrappedTokenizer) getMin() int {
	return w.t.getMin()
}

func (w *WrappedTokenizer) getMax() int {
	return w.t.getMax()
}

func ChunkIDTokenizer(chk logproto.ChunkRef, t Tokenizer) *WrappedTokenizer {
	p := make([]byte, 0, 256)
	i64buf := make([]byte, binary.MaxVarintLen64)
	i32buf := make([]byte, 4)

	binary.PutVarint(i64buf, int64(chk.From))
	p = append(p, i64buf...)
	p = append(p, 58) // ':'
	binary.PutVarint(i64buf, int64(chk.Through))
	p = append(p, i64buf...)
	p = append(p, 58) // ':'
	binary.LittleEndian.PutUint32(i32buf, chk.Checksum)
	p = append(p, i32buf...)
	p = append(p, 58) // ':'

	return &WrappedTokenizer{
		t: t,
		f: func(tok Token) Token {
			tok.Key = append(append(tok.Key, p...), tok.Key...)[len(tok.Key):]
			tok.Value = string(tok.Key)
			return tok
		},
		tokenBuffer: make([]Token, 0, 1024),
		prefix:      p,
		i64buf:      i64buf,
		i32buf:      i32buf,
	}
}

func ChunkIDTokenizerHalfInit(t Tokenizer) *WrappedTokenizer {
	p := make([]byte, 0, 256)
	return &WrappedTokenizer{
		t:           t,
		tokenBuffer: make([]Token, 0, 1024),
		prefix:      p,
		i64buf:      make([]byte, binary.MaxVarintLen64),
		i32buf:      make([]byte, 4),
	}
}

func (w *WrappedTokenizer) reinit(chk logproto.ChunkRef) {
	w.prefix = w.prefix[:0]

	binary.PutVarint(w.i64buf, int64(chk.From))
	w.prefix = append(w.prefix, w.i64buf...)
	w.prefix = append(w.prefix, 58) // ':'
	binary.PutVarint(w.i64buf, int64(chk.Through))
	w.prefix = append(w.prefix, w.i64buf...)
	w.prefix = append(w.prefix, 58) // ':'
	binary.LittleEndian.PutUint32(w.i32buf, chk.Checksum)
	w.prefix = append(w.prefix, w.i32buf...)
	w.prefix = append(w.prefix, 58) // ':'

	w.f = func(tok Token) Token {
		tok.Key = append(append(tok.Key, w.prefix...), tok.Key...)[len(tok.Key):]
		tok.Value = string(tok.Key)
		return tok
	}
}
```