diff --git a/pkg/pattern/tokenization/replacer.go b/pkg/pattern/tokenization/replacer.go
deleted file mode 100644
index cc725cb5b9..0000000000
--- a/pkg/pattern/tokenization/replacer.go
+++ /dev/null
@@ -1,900 +0,0 @@
-package tokenization
-
-import (
-	"bytes"
-	"slices"
-)
-
-var (
-	placeholderNumber = []byte("<NUM>")
-	placeholderHex = []byte("<HEX>")
-	placeholderUUID = []byte("<UUID>")
-	placeholderTimestamp = []byte("<TIMESTAMP>")
-	placeholderDuration = []byte("<DURATION>")
-	placeholderBytesize = []byte("<BYTESIZE>")
-	placeholderIP = []byte("<IP>")
-)
-
-// Integer numbers after these words won't be replaced by `<NUM>`.
-var numPreservingKeys = [][]byte{[]byte("status"), []byte("status_code"), []byte("httpStatus")}
-
-const (
-	maxYear = 2100 // anything above that is unlikely to be a timestamp...
-	minHexLen = 12 // 48 bits min for free-standing hex strings (e.g. "0123456789ab") or 40 bits for "0xABCDEF1234" strings
-)
-
-var boundaryChars = [256]bool{}
-
-func init() {
-	for i := 0; i < 128; i++ { // for now, we don't consider non-ASCII chars as boundary
-		if i < '0' || (i > '9' && i < 'A') || (i > 'Z' && i < 'a') || i > 'z' {
-			boundaryChars[i] = true
-		}
-	}
-
-	// We need these keys sorted by ascending length to efficiently compare them.
-	slices.SortStableFunc(numPreservingKeys, func(a, b []byte) int {
-		return len(a) - len(b)
-	})
-}
-
-type replacer struct {
-	source, dest []byte
-
-	// This is the last position that was copied to the destination buffer.
-	tail int
-	// This is the current position we are working with.
-	cur int
-	// This is essentially used for lookahead operations.
-	head int
-
-	// Here's some ASCII art to visualize that (the long string of dashes
-	// represents the log line):
-	//
-	// 0 `tail` `cur` `head` `len(source)`
-	// |---------------------------------------------|------------------------------------------|---------------------|------------------------------------------|
-
-	// A somewhat hacky solution that allows us to preserve specific numeric
-	// values we care about, like status and status_code.
-	preserveNextNumbers bool
-}
-
-// commit advances the current position marker to the lookahead position, i.e.
-// we are committing to consume everything we've looked ahead so far.
-func (r *replacer) commit() {
-	r.cur = r.head
-}
-
-func (r *replacer) resetHead() {
-	r.head = r.cur
-}
-
-func (r *replacer) resetHeadExpr() bool {
-	r.head = r.cur
-	return true // useful when included in boolean expressions with advance()
-}
-
-func (r *replacer) backtrack() {
-	r.head--
-}
-
-func (r *replacer) consumeUpToCurrent() {
-	r.tail = r.cur
-}
-
-// advanceUintRet returns the actual value of the number we read, as well as its
-// length. The value might overflow for large numbers, so it's also important to
-// check the length!
-func (r *replacer) advanceUintRet() (val uint, length uint) {
-	var c byte
-	for ; r.head < len(r.source); r.head++ {
-		c = r.source[r.head]
-		if c < '0' || '9' < c {
-			break
-		}
-		length++
-		val = val*10 + uint(c-'0') // TODO: use bitwise trick?
- } - - return val, length -} - -func (r *replacer) advanceUintOrRangeRet(lc, hc byte) (length uint) { - var c byte - for ; r.head < len(r.source); r.head++ { - c = r.source[r.head] - if !(('0' <= c && c <= '9') || (lc <= c && c <= hc)) { - break - } - length++ - } - - return length -} - -func (r *replacer) advanceUint() bool { - _, l := r.advanceUintRet() - return l > 0 -} - -func (r *replacer) advanceUintUpTo(val uint, length uint) bool { - foundVal, foundLength := r.advanceUintRet() - return foundLength > 0 && foundVal <= val && foundLength <= length -} - -func (r *replacer) advanceYear() bool { - return r.advanceUintUpTo(maxYear, 4) -} - -func (r *replacer) advanceChar(c byte) bool { - if r.head < len(r.source) && r.source[r.head] == c { - r.head++ - return true - } - return false -} - -func (r *replacer) peekNextIsBoundary() bool { - if r.head >= len(r.source) { - return true // we are at the end of the line - } - return boundaryChars[r.source[r.head]] -} - -func (r *replacer) peekFirstNonInt() (c byte) { - for i := r.head; i < len(r.source); i++ { - c = r.source[i] - if c < '0' || '9' < c { - return c - } - } - return 0 // we can return the 0 value here! -} - -func (r *replacer) peekNext4() (result [4]byte) { - overhead := len(r.source) - r.head - switch { - case overhead > 3: - result[3] = r.source[r.head+3] - fallthrough - case overhead > 2: - result[2] = r.source[r.head+2] - fallthrough - case overhead > 1: - result[1] = r.source[r.head+1] - fallthrough - case overhead > 0: - result[0] = r.source[r.head+0] - } - return result -} - -func (r *replacer) advanceTimeZoneLetters() bool { - UCLetters := 0 - for { - nc, ok := r.advance() - if !ok { - break - } - if nc < 'A' || nc > 'Z' { - r.backtrack() - break - } - UCLetters++ - } - - return UCLetters >= 2 && UCLetters <= 5 -} - -func (r *replacer) advanceNumericTimeZone() bool { - // See https://en.wikipedia.org/wiki/List_of_tz_database_time_zones - return r.advanceOneOf('+', '-') && r.advanceUintUpTo(14000, 5) -} - -// helper for handleWeirdTimestamp() -func (r *replacer) advanceStringOrNumericTimeZone(isOptional bool) bool { - curHead := r.head - if r.advanceChar(' ') && r.advanceNumericTimeZone() { - return true - } - r.head = curHead - if r.advanceChar(' ') && r.advanceTimeZoneLetters() { - return true - } - r.head = curHead - return isOptional -} - -func (r *replacer) advanceOneOf(chars ...byte) bool { - if r.head >= len(r.source) { - return false - } - c := r.source[r.head] - - for _, ec := range chars { - if c == ec { - r.head++ - return true - } - } - return false -} - -func (r *replacer) advanceTime(secondsOptional bool) bool { - return r.advanceUintUpTo(24, 2) && r.advanceChar(':') && r.advanceUintUpTo(60, 2) && (secondsOptional || (r.advanceChar(':') && r.advanceUintUpTo(60, 2))) -} - -// Mon|Tue|Wed|Thu|Fri|Sat|Sun -func (r *replacer) advanceDayOfTheWeek() bool { - c1, ok1 := r.advance() - c2, ok2 := r.advance() - c3, ok3 := r.advance() - if !ok1 || !ok2 || !ok3 { - return false - } - return (c1 == 'S' && ((c2 == 'a' && c3 == 't') || (c2 == 'u' && c3 == 'n'))) || // Sat, Sun - (c1 == 'M' && c2 == 'o' && c3 == 'n') || - (c1 == 'T' && c2 == 'u' && c3 == 'e') || - (c1 == 'W' && c2 == 'e' && c3 == 'd') || - (c1 == 'T' && c2 == 'h' && c3 == 'u') || - (c1 == 'F' && c2 == 'r' && c3 == 'i') -} - -// Jan|Feb|Mar|Apr|May|Jul|Jun|Aug|Sep|Oct|Nov|Dec -func (r *replacer) advanceMonthName() bool { - c1, ok1 := r.advance() - c2, ok2 := r.advance() - c3, ok3 := r.advance() - if !ok1 || !ok2 || !ok3 { - return false - } - - return (c1 == 
'J' && ((c2 == 'u' && (c3 == 'n' || c3 == 'l')) || // Jun, Jul
-		(c2 == 'a' && c3 == 'n'))) || // Jan
-		(c1 == 'M' && c2 == 'a' && (c3 == 'r' || c3 == 'y')) || // Mar, May
-		(c2 == 'e' && ((c1 == 'F' && c3 == 'b') || (c1 == 'S' && c3 == 'p') || (c1 == 'D' && c3 == 'c'))) || // Feb, Sep, Dec
-		(c1 == 'A' && ((c2 == 'p' && c3 == 'r') || // Apr
-			(c2 == 'u' && c3 == 'g'))) || // Aug
-		(c1 == 'O' && c2 == 'c' && c3 == 't') || // Oct
-		(c1 == 'N' && c2 == 'o' && c3 == 'v') // Nov
-}
-
-// Check if we are in the middle of a UUID, exactly after the first 8 characters
-// and the dash after them, e.g. after "abcd0123-":
-func (r *replacer) advanceUUIDAfterFirstDash(lc, hc byte) (endsWithBoundary bool) {
-	return (r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') &&
-		(r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') &&
-		(r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') &&
-		(r.advanceUintOrRangeRet(lc, hc) == 12) && r.peekNextIsBoundary()
-}
-
-// Only moves the head forward if it successfully matches a duration
-func (r *replacer) advanceDuration() (matched bool) {
-	curHead := r.head
-
-	var secsLen int
-	n := r.peekNext4()
-	if n[0] == 'h' {
-		r.head++
-		if boundaryChars[n[1]] {
-			return true // e.g. "1h", "123h"
-		}
-		if !r.advanceUintUpTo(60, 2) {
-			goto restore
-		}
-		n = r.peekNext4()
-	}
-	if n[0] == 'm' && (boundaryChars[n[1]] || n[1] != 's') { // we don't want to match 'ms' here
-		r.head++
-		if boundaryChars[n[1]] {
-			return true // e.g. "2h21m", "121m"
-		}
-		if !(r.advanceUintUpTo(60, 2) && ((r.advanceChar('.') && r.advanceUint()) || true)) {
-			goto restore
-		}
-		n = r.peekNext4()
-	}
-
-	if n[0] == 's' && boundaryChars[n[1]] {
-		secsLen = 1
-	} else if n[1] == 's' && (n[0] == 'm' || n[0] == 'n' || n[0] == 'u') && boundaryChars[n[2]] {
-		secsLen = 2
-	} else if n[2] == 's' && ((n[0] == 0xC2 && n[1] == 0xB5) || (n[0] == 0xCE && n[1] == 0xBC)) && boundaryChars[n[3]] {
-		// This checks for the unicode "µs" (U+00B5 = micro symbol) and "μs" (U+03BC = Greek letter mu)
-		secsLen = 3
-	} else {
-		goto restore
-	}
-
-	r.head += secsLen
-	return true
-
-restore: // should be faster than a defer
-	r.head = curHead
-	return false
-}
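
To make the accepted duration shapes concrete, here is a hypothetical in-package example test (the function name and its placement in a _test.go file are illustrative assumptions, not part of the original sources); the inputs and expected placeholders are borrowed from this package's own test cases further below:

package tokenization

import "fmt"

// ExamplePreprocess_durations feeds a few duration shapes through Preprocess;
// advanceDuration above is what matches them.
func ExamplePreprocess_durations() {
	for _, s := range []string{"1h", "2h21m", "31m", "3.31ms", "127µs", "123h21m3.4124561s"} {
		fmt.Println(string(Preprocess([]byte(s))))
	}
	// Output:
	// <DURATION>
	// <DURATION>
	// <DURATION>
	// <DURATION>
	// <DURATION>
	// <DURATION>
}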
-// 'b' and 'B' are not present here because of the way we check for byte size
-// units below. If they were present, then suffixes like 'Bb', 'bb', etc. would
-// be considered valid byte sizes. Also, only integer numbers are accepted as
-// valid bytesizes in bytes, so we handle bytes with special cases instead.
-var byteSizes = [256]bool{'k': true, 'K': true, 'm': true, 'M': true, 'g': true, 'G': true, 't': true, 'T': true, 'p': true, 'P': true}
-
-// Only moves the head forward if it successfully matches a byte size
-func (r *replacer) advanceBytesize(c1 byte) (matched bool) {
-	if !byteSizes[c1] {
-		return false
-	}
-	n := r.peekNext4()
-
-	var unitLen int // not counting the first character c1, which is already advanced to
-	if (n[0] == 'b' || n[0] == 'B') && boundaryChars[n[1]] {
-		unitLen = 1
-	} else if n[0] == 'i' && (n[1] == 'b' || n[1] == 'B') && boundaryChars[n[2]] {
-		unitLen = 2
-	} else if ((n[0] == 'b' && n[1] == 'p' && n[2] == 's') || (n[0] == 'b' && n[1] == 'i' && n[2] == 't')) && boundaryChars[n[3]] {
-		unitLen = 3
-	}
-
-	if unitLen > 0 {
-		r.head += unitLen
-		return true
-	}
-	return false
-}
-
-func (r *replacer) advanceSpacedBytesize(canBeBytes bool) (matched bool) {
-	// Get the next character after the space
-	c1, hasNext := r.advance()
-	if !hasNext {
-		return false
-	}
-	if canBeBytes && (c1 == 'b' || c1 == 'B') && r.peekNextIsBoundary() {
-		return true
-	}
-	if r.advanceBytesize(c1) {
-		return true
-	}
-	r.backtrack()
-	return false
-}
-
-func (r *replacer) advance() (c byte, advanced bool) {
-	if r.head >= len(r.source) {
-		return 0, false
-	}
-
-	c = r.source[r.head]
-	r.head++
-	return c, true
-}
-
-func (r *replacer) emitNumber() {
-	r.commit()
-	r.dest = append(r.dest, placeholderNumber...)
-	r.consumeUpToCurrent()
-}
-
-func (r *replacer) emitNumberOrCopyText(hasMinusPrefix bool) {
-	r.commit()
-	if !r.preserveNextNumbers {
-		r.dest = append(r.dest, placeholderNumber...)
-		r.consumeUpToCurrent()
-	} else {
-		r.maybeEmitDash(hasMinusPrefix)
-		r.copyUpToCurrent()
-	}
-}
-
-func (r *replacer) emit(hasMinusPrefix bool, placeholder []byte) {
-	r.commit()
-	r.maybeEmitDash(hasMinusPrefix)
-	r.dest = append(r.dest, placeholder...)
-	r.consumeUpToCurrent()
-}
-
-func (r *replacer) maybeEmitDash(hasMinusPrefix bool) {
-	// This minus was actually a dash, so we just copy it to the result
-	if hasMinusPrefix {
-		r.dest = append(r.dest, '-')
-	}
-}
-
-func (r *replacer) copyUpToCurrent() {
-	r.dest = append(r.dest, r.source[r.tail:r.cur]...)
-	r.consumeUpToCurrent()
-}
-
-func (r *replacer) handleHexOrUnit(hasMinusPrefix bool, n1, l1 uint, c1 byte) (endsWithBoundary bool) {
-	// Special case that is likely a hex string of the format "0x<HEX>", but we
-	// don't know whether the rest is upper case or lower case yet.
-	zeroHex := false
-	if n1 == 0 && l1 == 1 && c1 == 'x' {
-		zeroHex = true // these cannot be the start of a UUID
-		c1 = r.peekFirstNonInt()
-	}
-
-	// Special case, this might be a byte size
-	if (c1 == 'b' || c1 == 'B') && r.peekNextIsBoundary() {
-		// We do not subsume a minus sign - byte sizes are unlikely to be
-		// negative, it's more likely this is a dash as a part of a range
-		r.emit(hasMinusPrefix, placeholderBytesize)
-		return true
-	}
-
-	// Maybe we are at the start of a hex string, either something like
-	// "[0-9]+[a-f]", "[0-9]+[A-F]", or "0x". We support both lower and upper
-	// case letters, but to avoid false positives, we want hex replacements to
-	// happen only if the strings are fully lower case or fully upper case.
-	if 'a' <= c1 && c1 <= 'f' {
-		return r.handleHex(hasMinusPrefix, l1+1, 'a', 'f', !zeroHex)
-	} else if 'A' <= c1 && c1 <= 'F' {
-		return r.handleHex(hasMinusPrefix, l1+1, 'A', 'F', !zeroHex)
-	} else if zeroHex {
-		// Well, it probably wasn't a zero-hex after all, or it contained only
-		// digits, so try to handle that or emit what we absorbed
-		_, l2 := r.advanceUintRet()
-		if l2+2 >= minHexLen { // We consider "0x" to be part of the hex string when comparing with minHexLen
-			r.emit(hasMinusPrefix, placeholderHex)
-		} else {
-			r.maybeEmitDash(hasMinusPrefix)
-			r.commit()
-			r.copyUpToCurrent()
-		}
-		return r.peekNextIsBoundary()
-	}
-
-	return r.handlePotentialUnitWithDecimal(hasMinusPrefix, c1)
-}
-
-func (r *replacer) handleHex(hasMinusPrefix bool, l1 uint, lc, hc byte, canBeUUID bool) (endsWithBoundary bool) {
-	totalLen := l1 + r.advanceUintOrRangeRet(lc, hc)
-	r.commit()
-
-	if totalLen == 8 && canBeUUID {
-		// We might be at the first part of a UUID, right before the first dash
-		if r.advanceChar('-') && r.advanceUUIDAfterFirstDash(lc, hc) {
-			r.emit(hasMinusPrefix, placeholderUUID)
-			return true
-		}
-		r.resetHead()
-	}
-
-	if totalLen >= minHexLen && r.peekNextIsBoundary() {
-		r.emit(hasMinusPrefix, placeholderHex)
-		return true
-	}
-
-	r.copyUpToCurrent()
-	return r.peekNextIsBoundary()
-}
-
-func (r *replacer) handlePotentialUnitWithDecimal(hasMinusPrefix bool, c1 byte) (endsWithBoundary bool) {
-	if r.advanceBytesize(c1) {
-		// We do not subsume a minus sign - byte sizes are unlikely to be
-		// negative, it's more likely this is a dash as a part of a range
-		r.emit(hasMinusPrefix, placeholderBytesize)
-		return true
-	}
-
-	r.backtrack()
-	if r.advanceDuration() {
-		// We subsume hasMinusPrefix, since durations can be negative
-		r.emit(false, placeholderDuration)
-		return true
-	}
-
-	// We couldn't match anything, so just copy what existed.
-	r.maybeEmitDash(hasMinusPrefix)
-	r.copyUpToCurrent()
-	return false
-}
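
A companion sketch for the hex and UUID rules above (again a hypothetical example test, assumed to live in a _test.go file of this package): candidates must be at least minHexLen hex characters and uniformly cased to be replaced, while mixed-case strings are deliberately left alone. Expectations mirror the package's tests:

package tokenization

import "fmt"

func ExamplePreprocess_hexAndUUIDs() {
	for _, s := range []string{
		"deadbeef1337",                         // >= minHexLen chars, uniformly lower case
		"dEaDbeef1337",                         // mixed case is preserved to limit false positives
		"123e4567-e89b-12d3-a456-426614174000", // 8-4-4-4-12 hex groups
	} {
		fmt.Println(string(Preprocess([]byte(s))))
	}
	// Output:
	// <HEX>
	// dEaDbeef1337
	// <UUID>
}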
-func (r *replacer) handleNumberWithDecimal(hasMinusPrefix bool, n1 uint, l1 uint) (endsWithBoundary bool) {
-	n2, l2 := r.advanceUintRet()
-	if l2 == 0 {
-		// The dot wasn't followed by another number, so emit everything before it.
-		r.backtrack()
-		r.emitNumberOrCopyText(hasMinusPrefix)
-		return false
-	}
-
-	// See if the number after the decimal is also followed by a boundary
-	b2, hasNext := r.advance()
-
-	// We are at the end of the string, which is a boundary, so replace
-	// everything up to now with a number
-	if !hasNext {
-		r.emitNumber() // this also subsumes any minus sign we had
-		return true
-	}
-
-	// The decimal number isn't followed by a boundary char (which includes
-	// things like '.', ':', '/', etc.), so the part after the decimal is either
-	// not a real number, or it's some sort of a unit that can support decimals
-	// like size (e.g. 12KB, 3MiB) or durations (e.g. 3.5124s), etc.
-	if !boundaryChars[b2] {
-		return r.handlePotentialUnitWithDecimal(hasMinusPrefix, b2)
-	}
-
-	// This can be a byte size with a space, e.g. "3.14 GiB"
-	if b2 == ' ' && r.advanceSpacedBytesize(false) {
-		// We do not subsume a minus sign - byte sizes are unlikely to be
-		// negative, it's more likely this is a dash as a part of a range
-		r.emit(hasMinusPrefix, placeholderBytesize)
-		return true
-	}
-
-	// We have a decimal number followed by a non-dot boundary, so this is not
-	// an IP or a version number or anything like that.
-	if b2 != '.' {
-		r.backtrack()
-		r.emitNumber()
-		return true
-	}
-
-	n3, l3 := r.advanceUintRet()
-	if l3 == 0 || !r.peekNextIsBoundary() {
-		// The second dot wasn't followed by another number and a boundary, so
-		// emit just the first number.
-		r.resetHead()
-		r.emitNumber()
-		return true
-	}
-
-	// We have "<NUM>.<NUM>.<NUM>" at this point, so we either have something
-	// like a version number, or an IP address, but certainly not a simple
-	// decimal number we can just emit.
-	r.commit()
-
-	// Check if this is an IP address...
-	if n1 <= 255 && l1 <= 3 && n2 <= 255 && l2 <= 3 && n3 <= 255 && l3 <= 3 && r.advanceChar('.') && r.advanceUintUpTo(255, 3) && r.peekNextIsBoundary() {
-		r.emit(hasMinusPrefix, placeholderIP)
-		return true
-	}
-
-	// This wasn't an IP after all, so just emit "<NUM>.<NUM>.<NUM>". We don't
-	// want to assume this is a simple decimal number because of the second dot,
-	// e.g. this might be something like a version number.
-	r.resetHead()
-	r.maybeEmitDash(hasMinusPrefix) // we preserve the dashes here, since it's likely not a minus
-	r.dest = append(r.dest, placeholderNumber...)
-	r.dest = append(r.dest, '.')
-	r.dest = append(r.dest, placeholderNumber...)
-	r.dest = append(r.dest, '.')
-	r.dest = append(r.dest, placeholderNumber...)
-	r.consumeUpToCurrent()
-	return true
-}
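
The three outcomes of handleNumberWithDecimal (plain decimal, IP address, version-like string) can be sketched the same way; this hypothetical example test uses values from the playground tests at the bottom of this diff:

package tokenization

import "fmt"

func ExamplePreprocess_decimalsAndIPs() {
	for _, s := range []string{"121113.21231", "123.0.1.123", "1.0.0"} {
		fmt.Println(string(Preprocess([]byte(s))))
	}
	// Output:
	// <NUM>
	// <IP>
	// <NUM>.<NUM>.<NUM>
}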
"2020-09-30T00:00:59.9999+03:00" - r.commit() - } else if r.resetHeadExpr() && r.advanceChar(' ') && r.advanceNumericTimeZone() && r.advanceChar(' ') && r.advanceTimeZoneLetters() && r.peekNextIsBoundary() { - // e.g. "2023-09-05 23:20:28.030285153 +0000 UTC" - r.commit() - } else { - r.resetHead() - } - r.emit(hasMinusPrefix, placeholderTimestamp) - return true -} - -func (r *replacer) handleNumberStart(hasMinusPrefix bool) (endsWithBoundary bool) { - r.resetHead() // keep the head pos in sync with the current pos - // We know we were at a digit due to how handleNumber() is called - n1, l1 := r.advanceUintRet() - r.commit() // we will consume this one way or another for sure - - // See if the number is followed by a boundary - b1, hasNext := r.advance() - - switch { - // We are at the end of the string, which is a boundary, replace evertything - // up to now with a number - case !hasNext: - r.emitNumberOrCopyText(hasMinusPrefix) // this also may subsume any minus sign we had - return true - - // The number isn't followed by a boundary char (which include things like - // '.', ' ', '/', etc.), so it's either not a real number or date, or it's - // some sort of a unit like a duration (e.g. 3h5m2s), size, (e.g. 12KB, - // 3MiB), etc. - case !boundaryChars[b1]: - return r.handleHexOrUnit(hasMinusPrefix, n1, l1, b1) - - // We have a decimal point, so this can either be a plain number, a unit - // that can have a float value, or an IP address. - case b1 == '.': - return r.handleNumberWithDecimal(hasMinusPrefix, n1, l1) - - // This might be a timestamp that looks like '2024-04-01...' or - // `2024/03/27...`; timestamps in the far future are not supported :) - case n1 <= maxYear && l1 <= 4 && (b1 == '-' || b1 == '/'): - return r.handleSaneTimestamp(hasMinusPrefix, n1, b1) - - // This might be a byte size with a space, e.g. "2 b", "3 GiB" - case b1 == ' ' && r.advanceSpacedBytesize(true): - r.emit(hasMinusPrefix, placeholderBytesize) - return true - - // Weird RFC822 dates like "02 Jan 06 15:04 MST" - case n1 <= 31 && l1 <= 2 && b1 == ' ': - if r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(true) && r.advanceStringOrNumericTimeZone(false) { - r.commit() - r.emit(hasMinusPrefix, placeholderTimestamp) - return true - } - // if not, go to default handler after switch statement - - // It could be a UUID that starts with 8 digits - case l1 == 8 && b1 == '-': - if r.advanceUUIDAfterFirstDash('a', 'f') || (r.resetHeadExpr() && r.advanceChar('-') && r.advanceUUIDAfterFirstDash('A', 'F')) { - r.emit(hasMinusPrefix, placeholderUUID) - return true - } - // if not, go to default handler after switch statement - } - - // Number with an unknown boundary - emit the number and leave the boundary - // for the following passes. 
-func (r *replacer) handleNumberStart(hasMinusPrefix bool) (endsWithBoundary bool) {
-	r.resetHead() // keep the head pos in sync with the current pos
-	// We know we were at a digit due to how handleNumber() is called
-	n1, l1 := r.advanceUintRet()
-	r.commit() // we will consume this one way or another for sure
-
-	// See if the number is followed by a boundary
-	b1, hasNext := r.advance()
-
-	switch {
-	// We are at the end of the string, which is a boundary, so replace
-	// everything up to now with a number
-	case !hasNext:
-		r.emitNumberOrCopyText(hasMinusPrefix) // this also may subsume any minus sign we had
-		return true
-
-	// The number isn't followed by a boundary char (which includes things like
-	// '.', ' ', '/', etc.), so it's either not a real number or date, or it's
-	// some sort of a unit like a duration (e.g. 3h5m2s) or size (e.g. 12KB,
-	// 3MiB), etc.
-	case !boundaryChars[b1]:
-		return r.handleHexOrUnit(hasMinusPrefix, n1, l1, b1)
-
-	// We have a decimal point, so this can either be a plain number, a unit
-	// that can have a float value, or an IP address.
-	case b1 == '.':
-		return r.handleNumberWithDecimal(hasMinusPrefix, n1, l1)
-
-	// This might be a timestamp that looks like '2024-04-01...' or
-	// `2024/03/27...`; timestamps in the far future are not supported :)
-	case n1 <= maxYear && l1 <= 4 && (b1 == '-' || b1 == '/'):
-		return r.handleSaneTimestamp(hasMinusPrefix, n1, b1)
-
-	// This might be a byte size with a space, e.g. "2 b", "3 GiB"
-	case b1 == ' ' && r.advanceSpacedBytesize(true):
-		r.emit(hasMinusPrefix, placeholderBytesize)
-		return true
-
-	// Weird RFC822 dates like "02 Jan 06 15:04 MST"
-	case n1 <= 31 && l1 <= 2 && b1 == ' ':
-		if r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(true) && r.advanceStringOrNumericTimeZone(false) {
-			r.commit()
-			r.emit(hasMinusPrefix, placeholderTimestamp)
-			return true
-		}
-		// if not, go to default handler after switch statement
-
-	// It could be a UUID that starts with 8 digits
-	case l1 == 8 && b1 == '-':
-		if r.advanceUUIDAfterFirstDash('a', 'f') || (r.resetHeadExpr() && r.advanceChar('-') && r.advanceUUIDAfterFirstDash('A', 'F')) {
-			r.emit(hasMinusPrefix, placeholderUUID)
-			return true
-		}
-		// if not, go to default handler after switch statement
-	}
-
-	// Number with an unknown boundary - emit the number and leave the boundary
-	// for the following passes.
-	r.resetHead()
-	r.emitNumberOrCopyText(hasMinusPrefix)
-	return true
-}
-
-var longDayNames = [...][]byte{
-	[]byte("Sunday"),
-	[]byte("Monday"),
-	[]byte("Tuesday"),
-	[]byte("Wednesday"),
-	[]byte("Thursday"),
-	[]byte("Friday"),
-	[]byte("Saturday"),
-}
-
-func (r *replacer) handleWeirdTimestamp() (endsWithBoundary bool) {
-	r.resetHead()
-
-	if r.advanceDayOfTheWeek() {
-		r.commit() // we will always consume this
-
-		// RFC1123 and RFC1123Z, e.g.:
-		// - "Mon, 02 Jan 2006 15:04:05 MST"
-		// - "Mon, 02 Jan 2006 15:04:05 -0700"
-		if r.advanceChar(',') && r.advanceChar(' ') && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(false) {
-			r.emit(false, placeholderTimestamp)
-			return true
-		}
-		r.resetHead()
-
-		// ANSIC, UnixDate, RubyDate, e.g.:
-		// - "Mon Jan 2 15:04:05 2006"
-		// - "Mon Jan 2 15:04:05 MST 2006"
-		// - "Mon Jan 02 15:04:05 -0700 2006"
-		if r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && (r.advanceChar(' ') || true) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(true) && r.advanceChar(' ') && r.advanceYear() {
-			r.emit(false, placeholderTimestamp)
-			return true
-		}
-		r.resetHead()
-
-		// Linux, e.g.
-		// - "Mon 2 Jan 15:04:05 MST 2006"
-		// - "Tue 23 Jan 15:04:05 -0700 2023"
-		if r.advanceChar(' ') && (r.advanceChar(' ') || true) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(false) && r.advanceChar(' ') && r.advanceYear() {
-			r.emit(false, placeholderTimestamp)
-			return true
-		}
-		r.resetHead()
-
-		// RFC850, e.g.
- // - "Monday, 02-Jan-06 15:04:05 MST" - backtrackedSlice := r.source[r.head-3:] - var matchedDay []byte - for _, dw := range longDayNames { - if bytes.HasPrefix(backtrackedSlice, dw) { - matchedDay = dw - break - } - } - if matchedDay != nil { - r.head += len(matchedDay) - 3 - if r.advanceChar(',') && r.advanceChar(' ') && r.advanceUintUpTo(31, 2) && r.advanceChar('-') && r.advanceMonthName() && r.advanceChar('-') && r.advanceUintUpTo(99, 2) && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(true) { - r.emit(false, placeholderTimestamp) - return true - } - } - - r.cur -= 3 // unconsume - r.resetHead() - return false - } - r.resetHead() - - if r.advanceMonthName() { - r.commit() // provisionally consume this - - // Linux journald logs and others similar like this: - // - Feb 29 23:00:14 - // - Apr-10 23:43:46.807 - // - Jul 1 00:21:28 - if (r.advanceChar('-') || (r.advanceChar(' ') && (r.advanceChar(' ') || true))) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceTime(false) { - r.commit() - // This is already a timestamp, but let's try to match subseconds as well - if r.advanceChar('.') && r.advanceUint() { - r.commit() - } else { - r.resetHead() - } - r.emit(false, placeholderTimestamp) - return true - } - r.cur -= 3 // unconsume - r.resetHead() - return false - } - r.resetHead() - - return false -} - -func (r *replacer) wasNumPreservingKeyword() bool { - for _, key := range numPreservingKeys { - pPos := r.cur - 1 - len(key) - if pPos < -1 { - return false // all subsequent keys are longer - } - if pPos != -1 && !boundaryChars[r.source[pPos]] { - continue - } - if bytes.HasPrefix(r.source[pPos+1:], key) { - return true - } - } - return false -} - -func (r *replacer) replaceWithPlaceholders() { - lineLen := len(r.source) - - var c byte - onBoundary := true - for r.cur = 0; r.cur < lineLen; r.cur++ { - c = r.source[r.cur] - switch { - // If we're currently not at a boundary, the only thing we need to check - // is whether the current char would create a boundary in the next iteration. - case !onBoundary: - onBoundary = boundaryChars[c] - - // We weren't at a boundary and now we are, so check if we match one - // of the special keywords that will preserve numbers - if onBoundary { - r.preserveNextNumbers = r.wasNumPreservingKeyword() - } - - // If we've reached this far, it means we're currently at a boundary! - - // A lot of very complex logic if we encounter a number at a boundary, - // so we move that to a helper function. - case '0' <= c && c <= '9': - r.copyUpToCurrent() - onBoundary = r.handleNumberStart(false) - - // Handle negative numbers, potentially - case c == '-': - next := r.cur + 1 - // This might be a number, a date, an IP address, etc. So we don't - // know if this is a minus sign to replace or a dash to copy yet. - if next < lineLen && '0' <= r.source[next] && r.source[next] <= '9' { - // Copy everything before the dash, but mark it as consumed. - r.copyUpToCurrent() - r.cur++ - r.consumeUpToCurrent() - onBoundary = r.handleNumberStart(true) - } else { - onBoundary = true - } - - // Try to match weird timestamps. They require a lot of remaining - // length, generally start with a capitalized day of the week or month - // name (1 upper case letter followed by 2 lower case letters). 
-func (r *replacer) wasNumPreservingKeyword() bool {
-	for _, key := range numPreservingKeys {
-		pPos := r.cur - 1 - len(key)
-		if pPos < -1 {
-			return false // all subsequent keys are longer
-		}
-		if pPos != -1 && !boundaryChars[r.source[pPos]] {
-			continue
-		}
-		if bytes.HasPrefix(r.source[pPos+1:], key) {
-			return true
-		}
-	}
-	return false
-}
-
-func (r *replacer) replaceWithPlaceholders() {
-	lineLen := len(r.source)
-
-	var c byte
-	onBoundary := true
-	for r.cur = 0; r.cur < lineLen; r.cur++ {
-		c = r.source[r.cur]
-		switch {
-		// If we're currently not at a boundary, the only thing we need to check
-		// is whether the current char would create a boundary in the next iteration.
-		case !onBoundary:
-			onBoundary = boundaryChars[c]
-
-			// We weren't at a boundary and now we are, so check if we match one
-			// of the special keywords that will preserve numbers
-			if onBoundary {
-				r.preserveNextNumbers = r.wasNumPreservingKeyword()
-			}
-
-		// If we've reached this far, it means we're currently at a boundary!
-
-		// There's a lot of very complex logic if we encounter a number at a
-		// boundary, so we move that to a helper function.
-		case '0' <= c && c <= '9':
-			r.copyUpToCurrent()
-			onBoundary = r.handleNumberStart(false)
-
-		// Handle negative numbers, potentially
-		case c == '-':
-			next := r.cur + 1
-			// This might be a number, a date, an IP address, etc. So we don't
-			// know if this is a minus sign to replace or a dash to copy yet.
-			if next < lineLen && '0' <= r.source[next] && r.source[next] <= '9' {
-				// Copy everything before the dash, but mark it as consumed.
-				r.copyUpToCurrent()
-				r.cur++
-				r.consumeUpToCurrent()
-				onBoundary = r.handleNumberStart(true)
-			} else {
-				onBoundary = true
-			}
-
-		// Try to match weird timestamps. They require a lot of remaining
-		// length, generally start with a capitalized day of the week or month
-		// name (1 upper case letter followed by 2 lower case letters).
-		//
-		// We are basically looking for something that may match this here:
-		// Mon|Tue|Wed|Thu|Fri|Sat|Sun|Jan|Feb|Mar|Apr|May|Jul|Jun|Aug|Sep|Oct|Nov|Dec
-		//
-		// The detailed check would be performed by the actual handler:
-		case 'A' <= c && c <= 'W' && lineLen-r.cur >= 14 &&
-			'a' <= r.source[r.cur+1] && r.source[r.cur+1] <= 'u' &&
-			'b' <= r.source[r.cur+2] && r.source[r.cur+2] <= 'y':
-			r.copyUpToCurrent()
-			onBoundary = r.handleWeirdTimestamp()
-
-		// This could be the start of a lower case hex string:
-		case 'a' <= c && c <= 'f':
-			r.copyUpToCurrent()
-			r.resetHead()
-			onBoundary = r.handleHex(false, 0, 'a', 'f', true)
-
-		// This could be the start of an upper case hex string:
-		case 'A' <= c && c <= 'F':
-			r.copyUpToCurrent()
-			r.resetHead()
-			onBoundary = r.handleHex(false, 0, 'A', 'F', true)
-
-		// If we haven't actually matched anything, update whether we're still
-		// on a boundary character and continue onto the next one.
-		default:
-			onBoundary = boundaryChars[c]
-		}
-	}
-
-	if r.cur > r.tail {
-		r.dest = append(r.dest, r.source[r.tail:]...)
-		r.consumeUpToCurrent()
-	}
-}
-
-func Preprocess(content []byte) []byte {
-	// ~floor(120%), to allow for some expansion from replacements, hopefully
-	// without needing to allocate more memory
-	r := replacer{source: content, dest: make([]byte, 0, len(content)*120/100)}
-	r.replaceWithPlaceholders()
-	return r.dest
-}
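
Putting the whole replacer together, a hypothetical end-to-end example test (assumed in-package _test.go placement; the log fragments are taken from the benchmark and test data later in this diff):

package tokenization

import "fmt"

func ExamplePreprocess() {
	line := "duration=13.926955ms status=200 total_bytes=219kB traceID=4b93729ff3efabd0"
	fmt.Println(string(Preprocess([]byte(line))))
	// Output:
	// duration=<DURATION> status=200 total_bytes=<BYTESIZE> traceID=<HEX>
}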
diff --git a/pkg/pattern/tokenization/tokenization.go b/pkg/pattern/tokenization/tokenization.go
deleted file mode 100644
index 52b239d049..0000000000
--- a/pkg/pattern/tokenization/tokenization.go
+++ /dev/null
@@ -1,198 +0,0 @@
-package tokenization
-
-import (
-	"unsafe"
-)
-
-const placeholderEndOfLine = "<...>"
-
-// Outside of quoted strings, these are the delimiters between tokens. However,
-// they are not going to be a part of the tokens themselves and will be replaced
-// by spaces in the actual log template.
-var delimiters = [256]bool{0: true, '\t': true, '\n': true, '\v': true, '\f': true, '\r': true, ' ': true}
-
-type tokenizer struct {
-	// Input
-	rawLine []byte
-	maxTokens int
-
-	// State and position iterators
-	buf []byte
-	tpos int
-	tokenCount int
-	maybeJSON bool
-
-	// Result values; the values in the `tokens` slice reference `line` and
-	// shouldn't allocate new memory.
-	line string
-	tokens []string
-}
-
-func (t *tokenizer) countOrSaveToken(endTokenPos int) {
-	if t.tokens != nil {
-		// Intentionally written like this and not with append(), so this can
-		// panic if we ever exceed the preallocated slice size, since that means
-		// we have a nasty bug in handleNextToken() below.
-		t.tokens[t.tokenCount] = t.line[t.tpos:endTokenPos]
-	}
-	t.tokenCount++
-	t.tpos = endTokenPos
-}
-
-func (t *tokenizer) handleNextToken() bool {
-	escaped := false
-	var c, curQuoteChar byte
-	curQuotePos := -1
-
-	lineLen := len(t.line)
-	for p := t.tpos; p < lineLen; p++ {
-		c = t.line[p]
-		switch {
-
-		// If the previous character was a backslash, we ignore the next
-		// character, unless it was a non-token delimiter (space, tab, etc.)
-		// outside of a quoted string.
-		case escaped:
-			if curQuotePos < 0 && delimiters[c] {
-				t.countOrSaveToken(p + 1)
-				return true
-			} else {
-				escaped = false
-			}
-
-		// If we weren't already escaped and we encounter a backslash, toggle
-		// the escaped flag and ignore the current byte.
-		case c == '\\':
-			escaped = true
-
-		// Non-ASCII / UTF8 / invalid character, consider it a part of the
-		// current token, for lack of a better efficient option...
-		case c > 127:
-			// Intentionally blank, continues to the next char
-
-		// If we are currently in a quoted part of the string, the current
-		// character is also part of the current token. The only special case
-		// here is if the current character is a matching quote, that means
-		// we'll no longer be quoted.
-		case curQuotePos >= 0:
-			if c == curQuoteChar { // end quote
-				curQuotePos = -1
-			}
-
-		// If we encounter a quote character and we were not already in a quoted
-		// part of the line, mark that we are now in a quote of that type.
-		case c == '"' || c == '\'' || c == '`':
-			curQuoteChar = c
-			curQuotePos = p
-
-		// If we encounter a delimiter outside of a quote, count or save the
-		// token and skip the delimiter.
-		case delimiters[c]:
-			t.countOrSaveToken(p + 1)
-			return true
-
-		// Handle likely JSON object keys that have been serialized without
-		// spaces. For example, something like this:
-		// `{"context":{"taskId":1},"message":"starting",...}`
-		//
-		// If the line starts or ends with curly braces, we consider it might be
-		// a JSON log and try to detect the `":` part of the message that isn't
-		// followed by a delimiter character. If we find that pattern, we
-		// consider everything up to and including the `:` character as a
-		// separate token.
-		//
-		// Similarly, we try to detect the `,"` pattern and also split a token
-		// before the comma value. The `p > t.tpos` check is crucial here,
-		// because it ensures that we're not at the start of a token, i.e. there
-		// wasn't a delimiter right before the comma.
-		case t.maybeJSON && p > t.tpos && (c == ':' || c == ',') && p+1 < lineLen:
-			if c == ':' && t.line[p-1] == '"' && !delimiters[t.line[p+1]] {
-				t.countOrSaveToken(p + 1)
-				return true
-			}
-			if c == ',' && t.line[p+1] == '"' {
-				t.countOrSaveToken(p)
-				return true
-			}
-		}
-
-		// By default we do nothing, simply advance one character forward
-		// because the current character was a part of the current token.
-	}
-
-	// We have an unterminated single quote at position `curQuotePos`. To handle
-	// this edge case somewhat gracefully, we can emit everything up to that
-	// unterminated quote and the quote itself as a single token, and continue
-	// fairly normally from there.
-	if curQuotePos > 0 {
-		t.countOrSaveToken(curQuotePos + 1)
-		return true
-	}
-
-	if t.tpos < len(t.line) {
-		t.countOrSaveToken(len(t.line))
-		return true
-	}
-
-	return false
-}
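
The JSON key-splitting behavior described in the comments above is easiest to see on a concrete line; a hypothetical in-package example test (function name and _test.go placement are assumptions) with a shortened version of the JSON.stringify test case from this diff:

package tokenization

import "fmt"

func ExamplePreprocessAndTokenize_json() {
	line := `{"context":{"taskId":1},"message":"starting task ID 1","sequence":0}`
	for _, token := range PreprocessAndTokenize([]byte(line)) {
		fmt.Println(token)
	}
	// Output:
	// {"context":
	// {"taskId":
	// <NUM>}
	// ,"message":
	// "starting task ID <NUM>"
	// ,"sequence":
	// <NUM>}
}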
-
-// This function is called twice! The first time it counts the tokens but
-// doesn't save them. Afterwards we allocate the tokens return slice with
-// exactly the correct capacity and we call it again, this time to save them.
-func (t *tokenizer) process() {
-	// We want to handle the end of the string as a single token, so we start
-	// the loop from 1.
-	for i := 1; i < t.maxTokens; i++ {
-		if !t.handleNextToken() {
-			break
-		}
-	}
-
-	if t.tpos >= len(t.line) {
-		return
-	}
-
-	// We have a token count greater than or equal to maxTokens, so add one
-	// last token containing placeholderEndOfLine to signal that.
-	if t.tokens != nil {
-		t.tokens[t.tokenCount] = placeholderEndOfLine
-	}
-	t.tokenCount++
-	t.tpos += len(placeholderEndOfLine)
-}
-
-func (t *tokenizer) tokenize() []string {
-	t.buf = Preprocess(t.rawLine)
-
-	// We use unsafe to convert buf to a string without any new allocations.
-	// This is safe because t.buf won't be used or referenced anywhere else
-	// besides here from now on.
-	t.line = unsafe.String(unsafe.SliceData(t.buf), len(t.buf))
-
-	if len(t.line) >= 2 && (t.line[0] == '{' || t.line[len(t.line)-1] == '}') {
-		t.maybeJSON = true
-	}
-
-	t.process()
-
-	// If we have super long lines (more than twice the size we need to get the
-	// maxTokens we want), copy just the part we need so the tokens don't hold a
-	// reference to the original huge []byte slice.
-	if t.tokenCount == t.maxTokens && t.tpos*2 < len(t.line) {
-		t.line = string(t.buf[0 : t.tpos+1])
-	}
-
-	t.tokens = make([]string, t.tokenCount) // intentionally like this, see comment in countOrSaveToken()
-	t.tokenCount = 0
-	t.tpos = 0
-	t.process()
-
-	return t.tokens
-}
-
-func PreprocessAndTokenize(content []byte) []string {
-	t := tokenizer{rawLine: content, maxTokens: 100} // TODO: parametrize maxTokens
-
-	return t.tokenize()
-}
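
A final usage sketch for the public entry point (hypothetical example test, assumed in-package): note that tokens keep their trailing delimiter, which is why %q printing is used here. The line fragments come from the logfmt test case below:

package tokenization

import "fmt"

func ExamplePreprocessAndTokenize() {
	line := "level=info ts=2023-09-06T00:59:59.982171323Z duration=13.926955ms status=200"
	fmt.Printf("%q\n", PreprocessAndTokenize([]byte(line)))
	// Output:
	// ["level=info " "ts=<TIMESTAMP> " "duration=<DURATION> " "status=200"]
}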
- []string{"\n", "foo\t", " ", " ", "bar ", "baz\r", "\n"}, - }, - { - "ends single char C", - []string{"ends ", "single ", "char ", "C"}, - }, - { - "0 1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z Z Y X W V U T S R Q P O N M L K J I H G F E D C B A z y x w v u t s r q p o n m l k j i h g f e d c b a 9 8 7 6 5 4 3 2 1 0", - // The tail end of strings longer than maxTokens is returned as a single token - []string{" ", " ", " ", " ", " ", " ", " ", " ", " ", " ", "a ", "b ", "c ", "d ", "e ", "f ", "g ", "h ", "i ", "j ", "k ", "l ", "m ", "n ", "o ", "p ", "q ", "r ", "s ", "t ", "u ", "v ", "w ", "x ", "y ", "z ", "A ", "B ", "C ", "D ", "E ", "F ", "G ", "H ", "I ", "J ", "K ", "L ", "M ", "N ", "O ", "P ", "Q ", "R ", "S ", "T ", "U ", "V ", "W ", "X ", "Y ", "Z ", "Z ", "Y ", "X ", "W ", "V ", "U ", "T ", "S ", "R ", "Q ", "P ", "O ", "N ", "M ", "L ", "K ", "J ", "I ", "H ", "G ", "F ", "E ", "D ", "C ", "B ", "A ", "z ", "y ", "x ", "w ", "v ", "u ", "t ", "s ", "r ", "q ", "p ", "<...>"}, - }, - { - `a "quoted string"`, - []string{"a ", `"quoted string"`}, - }, - { - `a "quoted string with \" escapes!"`, - []string{"a ", `"quoted string with \" escapes!"`}, - }, - { - `a 'singly quoted string"'`, - []string{"a ", `'singly quoted string"'`}, - }, - { - `a 'singly quoted string" \''`, - []string{"a ", `'singly quoted string" \''`}, - }, - { - `a 'singly quoted string" \\\''`, - []string{"a ", `'singly quoted string" \\\''`}, - }, - { - `a'twisted excappe\\' "with an unterminated quote" 'at_the_end`, - []string{`a'twisted excappe\\' `, `"with an unterminated quote" `, `'`, "at_the_end"}, - }, - { - `a "quoted string 'inception'"!` + "`woot`'much`wow'", - []string{"a ", `"quoted string 'inception'"!` + "`woot`'much`wow'"}, - }, - { - `unterminated"quote`, - []string{`unterminated"`, `quote`}, - }, - { - "`mix`" + ` "and" 'match'`, - []string{"`mix` ", `"and" `, `'match'`}, - }, - { - "`mix`" + ` "and" 'match'`, - []string{"`mix` ", `"and" `, `'match'`}, - }, - { - `{"json":"uninterrupted \"logline\"","foo":"bar"}`, - []string{`{"json":`, `"uninterrupted \"logline\""`, `,"foo":`, `"bar"}`}, - }, - { - `{"weirdo1":`, - []string{`{"weirdo1":`}, - }, - { - // This and the corner case below are not optimal, though they're - // unlikely to be encountered in the real world, so it's hopefully a - // nice tradeoff for the performance of not needing a full JSON parser. - `{"weirdo2":}`, - []string{`{"weirdo2":`, `}`}, - }, - { - `,"weirdo3":}`, - []string{`,"weirdo3":`, `}`}, - }, - { - // We deliberately do treat "escaped" whitespaces outside of quotes as - // delimiters, i.e. whitespaces outside of strings cannot be escaped. - `weird\ escape`, - []string{`weird\ `, `escape`}, - }, - { - "-3.14-foo 0.0.0.0/24-0.0.0.1-255.255.255.255-256.255.255.255 1337-ber 0.12-ber n0tnumb3er 12faux -123.0.1.123 -123 -1231.11 333. 123.456. 123.45-", - []string{"-foo ", "/---... ", "-ber ", "-ber ", "n0tnumb3er ", "12faux ", "- ", " ", " ", ". ", ".. 
", "-"}, - }, - { - "2022-12-31 12:12:31 3022-12-31 12:12:31-Jul 1 00:21:28", - []string{" ", "-- ", "::-"}, - }, - { - "2022/12/01 12:12:31 - 2022/13/32 12:12:31", - []string{" ", "- ", "// ", "::"}, - }, - { - "UUIDS: 123e4567-e89b-12d3-a456-426614174000, 550E8400-E29B-41D4-A716-446655440000, -00000000-0000-0000-0000-000000000000, 12345678-dead-beef-1337-000000000000 {c6ad1a63-10b5-460e-ab2c-05c13604539d} ''''", - []string{"UUIDS: ", ", ", ", ", "-, ", " ", "{} ", "''<>''"}, - }, - // Mixed case UUID and hex strings are ignored, to limit false positives - { - "Not UUIDS: 123e4567-E89B-12d3-a456-426614174000, 1234567-dead-beef-1337-00000000000a", - []string{"Not ", "UUIDS: ", "123e4567-E89B-12d3-a456-, ", "-dead-beef--"}, - }, - { - "Hexes: 0x0123456789 0xabcdef0123 deadbeef1337-ABCDEF0123456?0123456789ab:FFFFAAAAFFFF Curses: 0x012345678 dEaDbeef1337 abcdefabcde ABCDEFABCDE 0xASDFASDFASDF abcdef0123456NOT", - []string{"Hexes: ", " ", " ", "-?: ", "Curses: ", "0x012345678 ", "dEaDbeef1337 ", "abcdefabcde ", "ABCDEFABCDE ", "0xASDFASDFASDF ", "abcdef0123456NOT"}, - }, - { - "30546354_3313121680 0_123_456_789 foo_123", - []string{"_ ", "___ ", "foo_"}, - }, - { - `3.31ms/1h2m|-12h2m6.1s 31m "165m2.1s(6h0m12.05us)" -451325.31µs 6m23μs 123h21m3.4124561s/0s/-0.0123ms`, - []string{"/| ", " ", `"()" `, " ", " ", "//"}, - }, - { - // Invalid duration values - "3.31.1ms 3h121m3.4124561s 1h0.12s 100usa 0.12msa", - []string{". ", "3h121m3. ", "1h0. ", "100usa ", "0.12msa"}, - }, - { - // We only consider integers to be valid bytesizes in bytes (0.2B doesn't make sense) - "2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit 5 B;124.1 KB/3b - 2b or 2 BeNot 13.37 b 3 b", - []string{ - " ", "- ", " ", " ", ": ", - ";/ ", "- ", " ", "or ", " ", "BeNot ", " ", "b ", ""}, - }, - { - `status=123 status_code:500 status 200 status="-1" status_code:"404" httpStatus=200`, - []string{"status=123 ", "status_code:500 ", "status ", "200 ", `status="-1" `, `status_code:"404" `, "httpStatus=200"}, - }, - { - `status_code_foo=123 status_code:500.1 status 2023-09-06T00:59:59.98 status:"404KiB"`, - []string{"status_code_foo= ", "status_code: ", "status ", " ", `status:""`}, - }, -} - -// TODO: add these to the benchmark tests -var tokenizationRealisticTestCases = []tokenizationTestCase{ - // logfmt from loki, metrics.go with a lot of number values of various types - { - `level=info ts=2023-09-06T00:59:59.982171323Z caller=metrics.go:160 component=frontend org_id=29 traceID=4b93729ff3efabd0 latency=fast query="{stream=\"stdout\",pod=\"loki-canary-nl54q\"} " query_hash=1280418884 query_type=limited range_type=range length=20s start_delta=2h54m30.690801022s end_delta=2h54m10.690801238s step=1s duration=13.926955ms status=200 limit=1000 returned_lines=0 throughput=16MB total_bytes=219kB total_bytes_non_indexed_labels=2.1kB lines_per_second=14935 total_lines=208 post_filter_lines=208 total_entries=41 store_chunks_download_time=1.592805ms queue_time=127µs splits=0 shards=0 chunk_refs_fetch_time=3.599883ms cache_chunk_req=1 cache_chunk_hit=1 cache_chunk_bytes_stored=0 cache_chunk_bytes_fetched=480079 cache_chunk_download_time=1.307396ms cache_index_req=0 cache_index_hit=0 cache_index_download_time=0s cache_stats_results_req=1 cache_stats_results_hit=1 cache_stats_results_download_time=361.913µs cache_result_req=0 cache_result_hit=0 cache_result_download_time=0s token_id=gcom-1234`, - - []string{ - "level=info ", "ts= ", "caller=metrics.go: ", "component=frontend ", "org_id= ", "traceID= ", "latency=fast ", 
`query="{stream=\"stdout\",pod=\"loki-canary-nl54q\"} " `, "query_hash= ", "query_type=limited ", "range_type=range ", "length= ", "start_delta= ", "end_delta= ", "step= ", "duration= ", "status=200 ", "limit= ", "returned_lines= ", "throughput= ", "total_bytes= ", "total_bytes_non_indexed_labels= ", "lines_per_second= ", "total_lines= ", "post_filter_lines= ", "total_entries= ", "store_chunks_download_time= ", "queue_time= ", "splits= ", "shards= ", "chunk_refs_fetch_time= ", "cache_chunk_req= ", "cache_chunk_hit= ", "cache_chunk_bytes_stored= ", "cache_chunk_bytes_fetched= ", "cache_chunk_download_time= ", "cache_index_req= ", "cache_index_hit= ", "cache_index_download_time= ", "cache_stats_results_req= ", "cache_stats_results_hit= ", "cache_stats_results_download_time= ", "cache_result_req= ", "cache_result_hit= ", "cache_result_download_time= ", "token_id=gcom-", - }, - }, - // logfmt from loki, with string multi-word messages - { - `level=debug ts=2023-09-06T00:59:59.98214402Z caller=shard_resolver.go:114 bytes=205kB chunks=2 streams=2 entries=200 msg="queried index" type=single matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}" duration=9.498885ms from=2023-09-06T00:48:53.138Z through=2023-09-06T00:49:43.138Z length=50s`, - []string{ - "level=debug ", "ts= ", "caller=shard_resolver.go: ", "bytes= ", "chunks= ", "streams= ", "entries= ", `msg="queried index" `, "type=single ", `matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}" `, "duration= ", "from= ", "through= ", "length=", - }, - }, - // tricky loki distributor message: - { - `level=debug ts=2024-07-12T12:25:06.175464934Z caller=push.go:146 org_id=29 traceID=7af4f918eab1c80f msg="push request parsed" path=/loki/api/v1/push contentType=application/x-protobuf contentEncoding= bodySize="8.8 kB" streams=11 entries=43 streamLabelsSize="3.4 kB" entriesSize="19 kB" structuredMetadataSize="71 B" totalSize="22 kB" mostRecentLagMs=167 adaptiveLogsDroppedLines=10 adaptiveLogsDroppedSize=4965 adaptiveLogsMatchedLines=37`, - []string{ - "level=debug ", "ts= ", "caller=push.go: ", "org_id= ", "traceID= ", `msg="push request parsed" `, "path=/loki/api/v1/push ", "contentType=application/x-protobuf ", "contentEncoding= ", `bodySize="" `, "streams= ", "entries= ", `streamLabelsSize="" `, `entriesSize="" `, `structuredMetadataSize="" `, `totalSize="" `, "mostRecentLagMs= ", "adaptiveLogsDroppedLines= ", "adaptiveLogsDroppedSize= ", "adaptiveLogsMatchedLines=", - }, - }, - // random JSON logs - { - `{"timestamp": "2022-12-23T12:34:56Z", "level": "debug", "message": "Server starting", "server_id": "abcdefghij", "start_time": "2022-12-23T12:30:00Z"}`, - []string{`{"timestamp": `, `"", `, `"level": `, `"debug", `, `"message": `, `"Server starting", `, `"server_id": `, `"abcdefghij", `, `"start_time": `, `""}`}, - }, - { - // JSON logs without spaces between elements, like how JavaScript's JSON.Stringify() produces: - `{"context":{"taskId":1},"message":"starting task ID 1","sequence":0,"time":1506776210000,"version":"1.0.0"}`, - []string{`{"context":`, `{"taskId":`, `}`, `,"message":`, `"starting task ID "`, `,"sequence":`, ``, `,"time":`, ``, `,"version":`, `".."}`}, - }, - // Android logs from https://github.com/logpai/logparser/blob/main/data/loghub_2k/Android/Android_2k.log - { - // This test case has an unterminated double quote - // - // TOFIX: - // - timestamp is not correctly detected - // - empty "" token? 
- `03-17 16:13:40.345 1702 14638 D PowerManagerService: release:lock=166121161, flg=0x0, tag="RILJ_ACK_WL", name=com.android.phone", ws=null, uid=1001, pid=2626`, - []string{"- ", ":: ", " ", " ", " ", "D ", "PowerManagerService: ", "release:lock=, ", "flg=0x0, ", `tag="RILJ_ACK_WL", `, `name=com.android.phone"`, ", ", "ws=null, ", "uid=, ", "pid="}, - }, - { - // TOFIX: - // - timestamp is not correctly detected - // - empty "" tokens - `03-17 16:13:47.518 1702 8671 D ActivityManager: Skipping, withExcluded: false, tr.intent:Intent { act=android.intent.action.VIEW dat=file:///storage/emulated/0/Tencent/QQfile_recv/b.apk typ=application/vnd.android.package-archive flg=0x10800000 cmp=com.android.packageinstaller/.PackageInstallerActivity (has extras) }`, - []string{ - "- ", ":: ", " ", " ", " ", " ", "D ", "ActivityManager: ", "Skipping, ", "withExcluded: ", "false, ", "tr.intent:Intent ", "{ ", "act=android.intent.action.VIEW ", "dat=file:///storage/emulated//Tencent/QQfile_recv/b.apk ", "typ=application/vnd.android.package-archive ", "flg=0x10800000 ", "cmp=com.android.packageinstaller/.PackageInstallerActivity ", "(has ", "extras) ", "}", - }, - }, - // Apache logs from https://github.com/logpai/logparser/blob/main/data/loghub_2k/Apache/Apache_2k.log - { - `[Mon Dec 05 13:16:27 2005] [notice] jk2_init() Found child 5877 in scoreboard slot 9`, - []string{"[] ", "[notice] ", "jk2_init() ", "Found ", "child ", " ", "in ", "scoreboard ", "slot ", ""}, - }, - { - `[Mon Dec 05 19:14:11 2005] [notice] workerEnv.init() ok /etc/httpd/conf/workers2.properties`, - []string{"[] ", "[notice] ", "workerEnv.init() ", "ok ", "/etc/httpd/conf/workers2.properties"}, - }, - // nginx logs by running `docker run -p 80:80 -v $(pwd):/usr/share/nginx/html nginx` locally - { - `2024/03/27 14:31:42 [error] 29#29: *1 directory index of "/usr/share/nginx/html/" is forbidden, client: 172.17.0.1, server: localhost, request: "GET / HTTP/1.1", host: "127.0.0.1"`, - []string{" ", "[error] ", "#: ", "* ", "directory ", "index ", "of ", "\"/usr/share/nginx/html/\" ", "is ", "forbidden, ", "client: ", ", ", "server: ", "localhost, ", "request: ", `"GET / HTTP/", `, "host: ", "\"\""}, - }, - { - // TOFIX: - // - probably not all numbers should be replaced with , e.g. 
for "*1", "(2:", "HTTP/1.1" it's definitely a worse UX - `2024/03/27 14:34:37 [error] 29#29: *1 open() "/usr/share/nginx/html/test url with spaces" failed (2: No such file or directory), client: 172.17.0.1, server: localhost, request: "GET /test%20url%20with%20spaces HTTP/1.1", host: "127.0.0.1" - 172.17.0.1 - - [31/Mar/2024:14:34:37 +0000] "GET /test%20url%20with%20spaces HTTP/1.1" 404 153 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0" "-"`, - []string{" ", "[error] ", "#: ", "* ", "open() ", `"/usr/share/nginx/html/test url with spaces" `, "failed ", "(: ", "No ", "such ", "file ", "or ", "directory), ", "client: ", ", ", "server: ", "localhost, ", "request: ", `"GET /test%20url%20with%20spaces HTTP/", `, "host: ", "\"\"\n", "\t", "\t", " ", "- ", "- ", "[] ", `"GET /test%20url%20with%20spaces HTTP/" `, " ", " ", "\"-\" ", `"Mozilla/ (X11; Linux x86_; rv:) Gecko/ Firefox/" `, `"-"`}, - }, - // Linux systemd (journalctl) logs - { - `Mar 27 11:52:21 hostname systemd-networkd[2043]: enp6s0: LLDP Rx: Invoking callback for 'refreshed' event.`, - []string{" ", "hostname ", "systemd-networkd[]: ", "enp6s0: ", "LLDP ", "Rx: ", "Invoking ", "callback ", "for ", "'refreshed' ", "event."}, - }, - { - `Feb 29 23:00:14 nixos dbus-daemon[11432]: [system] Activating via systemd: service name='org.opensuse.Snapper' unit='snapperd.service' requested by ':1.324' (uid=0 pid=22089 comm="/nix/store/7rgimysvkczzyiaq4fkfymyjad4vbd9c-snappe" label="kernel")`, - []string{" ", "nixos ", "dbus-daemon[]: ", "[system] ", "Activating ", "via ", "systemd: ", "service ", "name='org.opensuse.Snapper' ", "unit='snapperd.service' ", "requested ", "by ", "':' ", "(uid= ", "pid= ", "comm=\"/nix/store/7rgimysvkczzyiaq4fkfymyjad4vbd9c-snappe\" ", "label=\"kernel\")"}, - }, - // Random slack logs: - { - `Apr-10 23:43:46.807 [API-Q] (T02S4RCS0) c37dfd20-1712781826.804 conversations.suggestions is ACTIVE`, - []string{" ", "[API-Q] ", "(T02S4RCS0) ", "c37dfd20- ", "conversations.suggestions ", "is ", "ACTIVE"}, - }, - { - `Apr-11 00:01:57.743 [DEVICE-PERMISSIONS-MA] Permissions saved to local storage: {"permissions":{"microphone":"granted","camera":"prompt","screen":"prompt"},"timestamp":1712782917742}`, - []string{" ", "[DEVICE-PERMISSIONS-MA] ", "Permissions ", "saved ", "to ", "local ", "storage: ", `{"permissions":`, `{"microphone":`, `"granted"`, `,"camera":`, `"prompt"`, `,"screen":`, `"prompt"}`, `,"timestamp":`, `}`}, - }, - // Another weird log from loki: - { - `ts=2023-09-06T00:59:59.900879737Z caller=spanlogger.go:86 user=29 level=debug msg="querying ingester" params="selector={stream=\"stdout\", pod=\"loki-canary-t98wq\"}, direction=BACKWARD, start=2023-09-05 23:20:28.030285153 +0000 UTC, end=2023-09-05 23:20:48.030285153 +0000 UTC, limit=1000, shards="`, - []string{"ts= ", "caller=spanlogger.go: ", "user= ", "level=debug ", `msg="querying ingester" `, `params="selector={stream=\"stdout\", pod=\"loki-canary-t98wq\"}, direction=BACKWARD, start=, end=, limit=, shards="`}, - }, - // {``, []string{}}, -} - -func getTimestampTests(t *testing.T) []tokenizationTestCase { - timeStamps := []string{ - "2023-01-03T15:04:05.999999999Z", - "2024-02-29T11:12:03.33+02:00", - "2022-03-02T23:59:59Z", - "2024-04-01T01:33:59+03:00", - "2021-05-07T00:00:00-04:30", - "1999-06-12T13:37:31.123+03:00", - "2005-07-17T12:00:00Z", - "1988-08-03T00:00:00Z", - "2020-09-30T00:00:59.9999+03:00", - "2034-10-31T12:34:56.7890+11:23", - "2034-11-01T01:23:45.67890Z", - "2025-12-31T01:23:45.67890Z", - } - - timestampConst := 
-	layouts := []struct {
-		layout string
-		expResult []string
-	}{
-		// TOFIX: all of the commented out lines.
-
-		//{time.Layout, []string{timestampConst}},
-		{time.ANSIC, []string{timestampConst}},
-		{time.UnixDate, []string{timestampConst}},
-		{"Mon _2 Jan 15:04:05 MST 2006", []string{timestampConst}}, // linux `date`
-		{time.RubyDate, []string{timestampConst}},
-		{time.RFC822, []string{timestampConst}},
-		{time.RFC822Z, []string{timestampConst}},
-		{time.RFC850, []string{timestampConst}},
-		{time.RFC1123, []string{timestampConst}},
-		{time.RFC1123Z, []string{timestampConst}},
-		{time.RFC3339, []string{timestampConst}},
-		{time.RFC3339Nano, []string{timestampConst}},
-		//{time.Stamp, []string{timestampConst}},
-		//{time.StampMilli, []string{timestampConst}},
-		//{time.StampMicro, []string{timestampConst}},
-		//{time.StampNano, []string{timestampConst}},
-		{time.DateTime, []string{timestampConst}},
-
-		// TOFIX: maybe consider these timestamps?
-		{time.DateOnly, []string{"<NUM>-<NUM>-<NUM>"}},
-		{time.TimeOnly, []string{"<NUM>:<NUM>:<NUM>"}},
-	}
-
-	result := make([]tokenizationTestCase, 0, len(timeStamps)*len(layouts))
-	for _, tss := range timeStamps {
-		ts, err := time.Parse(time.RFC3339Nano, tss)
-		require.NoError(t, err)
-		for _, layout := range layouts {
-			result = append(result, tokenizationTestCase{
-				line: ts.Format(layout.layout),
-				expResult: layout.expResult,
-			})
-		}
-	}
-	return result
-}
-
-type tokenizationTestCasePack struct {
-	name string
-	cases []tokenizationTestCase
-}
-
-func TestTokenization(t *testing.T) {
-	packs := []tokenizationTestCasePack{
-		{"cornerTestCases", tokenizationCornerTestCases},
-		{"timestamps", getTimestampTests(t)},
-		{"realisticTestCases", tokenizationRealisticTestCases},
-	}
-
-	for i, pack := range packs {
-		pack := pack
-		t.Run(fmt.Sprintf("test-pack-%d-%s", i, pack.name), func(t *testing.T) {
-			for j, tc := range pack.cases {
-				tc := tc
-				t.Run(fmt.Sprintf("case-%d", j), func(t *testing.T) {
-					result := PreprocessAndTokenize([]byte(tc.line))
-					assert.Equal(
-						t, tc.expResult, result,
-						fmt.Sprintf("Log line: %q\nActual result slice: %#v", tc.line, result),
-					)
-				})
-			}
-		})
-	}
-}
-
-func TestTokenizationMemcpy(t *testing.T) {
-	buf := make([]byte, 0, 1000)
-	for i := 0; i < 300; i++ {
-		buf = append(buf, 'a', ' ')
-	}
-	tokenizer := tokenizer{rawLine: buf, maxTokens: 10}
-	tokenizer.tokenize()
-	require.Less(t, len(tokenizer.line), 100)
-}
-
-// Useful for running single test cases in isolation
-func TestTokenizationPlayground(t *testing.T) {
-	tc := tokenizationTestCase{
-		"foo 121113.21231 bar 123.0.1.123 -123 -1231.11",
-		[]string{"foo ", "<NUM> ", "bar ", "<IP> ", "<NUM> ", "<NUM>"},
-	}
-	result := PreprocessAndTokenize([]byte(tc.line))
-	assert.Equal(
-		t, tc.expResult, result,
-		fmt.Sprintf("Log line: %q\nActual result slice: %#v", tc.line, result),
-	)
-}
-
-func TestReplacementPlayground(t *testing.T) {
-	line := "Jul 1 00:21:28"
-	exp := "<TIMESTAMP>"
-	r := replacer{source: []byte(line)}
-	r.replaceWithPlaceholders()
-
-	assert.Equal(
-		t, exp, string(r.dest),
-		fmt.Sprintf("Log line: %q", line),
-	)
-}
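
Finally, the numPreservingKeys behavior exercised by the status test cases above can be summarized in one more hypothetical in-package example test (function name and placement are assumptions): integers directly after the listed keys survive, while other integers are replaced.

package tokenization

import "fmt"

func ExamplePreprocess_statusCodes() {
	fmt.Println(string(Preprocess([]byte("status=123 status_code:500 code=500"))))
	// Output:
	// status=123 status_code:500 code=<NUM>
}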