loki/pkg/pattern/tokenization/tokenization.go


package tokenization

import (
	"bytes"
	"unsafe"
)

const placeholderEndOfLine = "<...>"

// Outside of quoted strings, these are the delimiters between tokens. However,
// they are not going to be a part of the tokens themselves and will be replaced
// by spaces in the actual log template.
var delimiters = [256]bool{0: true, '\t': true, '\n': true, '\v': true, '\f': true, '\r': true, ' ': true}
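
// For illustration (a hand-traced example, not exhaustive): given these
// delimiters and the quote handling in handleNextToken() below, a line like
// `level=info msg="user logged in"` should split into the two tokens
// `level=info` and `msg="user logged in"`, because the spaces inside the
// double quotes are not treated as delimiters.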

type tokenizer struct {
	// Input
	rawLine   []byte
	maxTokens int

	// State and position iterators
	buf        []byte
	tpos       int
	tokenCount int
	maybeJSON  bool

	// Result values; the values in the `tokens` slice reference `line` and
	// shouldn't allocate new memory.
	line   string
	tokens []string
}

func (t *tokenizer) countOrSaveToken(endTokenPos, skip int) {
	if t.tokens != nil {
		// Intentionally written like this and not with append(), so this can
		// panic if we ever exceed the preallocated slice size, since that means
		// we have a nasty bug in handleNextToken() below.
		t.tokens[t.tokenCount] = t.line[t.tpos:endTokenPos]
	}
	t.tokenCount++
	t.tpos = endTokenPos + skip
}

func (t *tokenizer) handleNextToken() bool {
	escaped := false
	var c, curQuoteChar byte
	curQuotePos := -1
	lineLen := len(t.line)
	for p := t.tpos; p < lineLen; p++ {
		c = t.line[p]
		switch {
		// If the previous character was a backslash, we ignore the next
		// character, unless it is a non-token delimiter (space, tab, etc.)
		// outside of a quoted string.
		case escaped:
			if curQuotePos < 0 && delimiters[c] {
				t.countOrSaveToken(p, 1)
				return true
			} else {
				escaped = false
			}
		// If we weren't already escaped and we encounter a backslash, toggle
		// the escaped flag and ignore the current byte.
		case c == '\\':
			escaped = true
		// Non-ASCII / UTF-8 / invalid character; consider it a part of the
		// current token, for lack of a better efficient option...
		case c > 127:
			// Intentionally blank, continues to the next char
		// If we are currently in a quoted part of the string, the current
		// character is also part of the current token. The only special case
		// here is if the current character is a matching quote, which means
		// we'll no longer be quoted.
		case curQuotePos >= 0:
			if c == curQuoteChar { // end quote
				curQuotePos = -1
			}
		// If we encounter a quote character and we were not already in a
		// quoted part of the line, mark that we are now in a quote of that
		// type.
		case c == '"' || c == '\'' || c == '`':
			curQuoteChar = c
			curQuotePos = p
		// If we encounter a delimiter outside of a quote, count or save the
		// token and skip the delimiter.
		case delimiters[c]:
			t.countOrSaveToken(p, 1)
			return true
		// Handle likely JSON object keys that have been serialized without
		// spaces. For example, something like this:
		// `{"context":{"taskId":1},"message":"starting",...}`
		//
		// If the line starts or ends with curly braces, we consider it might
		// be a JSON log and try to detect the `":` part of the message that
		// isn't followed by a delimiter character. If we find that pattern, we
		// consider everything up to and including the `:` character as a
		// separate token.
		//
		// Similarly, we try to detect the `,"` pattern and also split the
		// token before the comma. The `p > t.tpos` check is crucial here,
		// because it ensures that we're not at the start of a token, i.e.
		// there wasn't a delimiter right before the comma.
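		//
		// As a hand-traced illustration of these two rules (worked out by
		// hand, so treat it as an approximation): the line
		// `{"context":{"taskId":1},"message":"starting"}` should split into
		// the tokens `{"context":`, `{"taskId":`, `1}`, `,"message":` and
		// `"starting"}`.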
		case t.maybeJSON && p > t.tpos && (c == ':' || c == ',') && p+1 < lineLen:
			if c == ':' && t.line[p-1] == '"' && !delimiters[t.line[p+1]] {
				t.countOrSaveToken(p+1, 0)
				return true
			}
			if c == ',' && t.line[p+1] == '"' {
				t.countOrSaveToken(p, 0)
				return true
			}
		}
		// By default we do nothing, simply advance one character forward
		// because the current character was a part of the current token.
	}
	// We have an unterminated quote at position `curQuotePos`. To handle this
	// edge case somewhat gracefully, we can emit everything up to that
	// unterminated quote and the quote itself as a single token, and continue
	// fairly normally from there.
	if curQuotePos > 0 {
		t.countOrSaveToken(curQuotePos+1, 0)
		return true
	}
	if t.tpos < len(t.line) {
		t.countOrSaveToken(len(t.line), 0)
		return true
	}
	return false
}
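
// A hand-traced note on the unterminated-quote fallback above (an
// illustration worked out by hand, not produced by running the code): for a
// line like `foo"bar baz`, the opening quote is never closed, so the first
// call should emit `foo"` as a single token, and the following calls should
// then tokenize `bar` and `baz` normally.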

// This function is called twice! The first time it counts the tokens but
// doesn't save them. Afterwards we allocate the tokens return slice with
// exactly the correct capacity and we call it again, this time to save them.
func (t *tokenizer) process() {
	// We want to handle the end of the string as a single token, so we start
	// the loop from 1.
	for i := 1; i < t.maxTokens; i++ {
		if !t.handleNextToken() {
			break
		}
	}
	if t.tpos >= len(t.line) {
		return
	}
	// We ran out of the maxTokens budget before consuming the whole line, so
	// add one last token containing placeholderEndOfLine to signal that.
	if t.tokens != nil {
		t.tokens[t.tokenCount] = placeholderEndOfLine
	}
	t.tokenCount++
	t.tpos += len(placeholderEndOfLine)
}
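
// For example (a hand-traced sketch, assuming maxTokens were set to 3): the
// line `a b c d e` should produce the tokens ["a", "b", "<...>"] -- two real
// tokens plus the placeholder marking that the line was cut short.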

func (t *tokenizer) tokenize() []string {
	t.buf = Preprocess(t.rawLine)

	// We use unsafe to convert buf to a string without any new allocations.
	// This is safe because t.buf won't be modified anywhere else from now on.
	t.line = unsafe.String(unsafe.SliceData(t.buf), len(t.buf))

	if len(t.line) >= 2 && (t.line[0] == '{' || t.line[len(t.line)-1] == '}') {
		t.maybeJSON = true
	}

	t.process()

	// If we have super long lines (more than twice the size we need to get the
	// maxTokens we want), copy just the part we need so the tokens don't hold a
	// reference to the original huge []byte slice.
	if t.tokenCount == t.maxTokens && t.tpos*2 < len(t.line) {
		t.line = string(t.buf[0 : t.tpos+1])
	}

	t.tokens = make([]string, t.tokenCount) // intentionally like this, see comment in countOrSaveToken()
	t.tokenCount = 0
	t.tpos = 0
	t.process()

	return t.tokens
}
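
// Note on the zero-copy conversion above: a plain `string(t.buf)` conversion
// would copy the whole buffer, while unsafe.String reinterprets the bytes in
// place. The trade-off is that the resulting tokens alias t.buf, which is why
// very long truncated lines get their short prefix re-copied into a regular
// string before the second pass.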

func PreprocessAndTokenize(content []byte) []string {
	content = bytes.TrimSpace(content)

	t := tokenizer{rawLine: content, maxTokens: 100} // TODO: parametrize maxTokens

	return t.tokenize()
}
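
// Illustrative usage (a hand-written sketch; the exact tokens depend on what
// Preprocess rewrites, which is defined elsewhere in this package):
//
//	tokens := PreprocessAndTokenize([]byte(` level=info msg="hello world" `))
//	// Assuming Preprocess leaves this particular line unchanged, the result
//	// should be the two tokens `level=info` and `msg="hello world"`.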