mirror of https://github.com/grafana/loki
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
900 lines
27 KiB
900 lines
27 KiB
package tokenization
|
|
|
|
import (
|
|
"bytes"
|
|
"slices"
|
|
)
|
|
|
|
// Placeholder tokens that are appended to the destination buffer in place of
// the variable parts of a log line recognized by the replacer.
var (
	// placeholderNumber stands in for integer and decimal numbers.
	placeholderNumber = []byte("<NUM>")
	// placeholderHex stands in for long hexadecimal strings.
	placeholderHex = []byte("<HEX>")
	// placeholderUUID stands in for UUIDs.
	placeholderUUID = []byte("<UUID>")
	// placeholderTimestamp stands in for dates and timestamps.
	placeholderTimestamp = []byte("<TIMESTAMP>")
	// placeholderDuration stands in for durations such as "3h5m2s".
	placeholderDuration = []byte("<DURATION>")
	// placeholderBytesize stands in for byte sizes such as "12KB" or "3 MiB".
	placeholderBytesize = []byte("<BYTESIZE>")
	// placeholderIP stands in for dotted IPv4 addresses.
	placeholderIP = []byte("<IP>")
)
|
|
|
|
// numPreservingKeys lists the keywords after which integer numbers are copied
// verbatim instead of being replaced by `<NUM>`.
var numPreservingKeys = [][]byte{
	[]byte("status"),
	[]byte("status_code"),
	[]byte("httpStatus"),
}
|
|
|
|
const (
	// maxYear is the largest number still accepted as a year; anything above
	// that is unlikely to be a timestamp.
	maxYear = 2100

	// minHexLen is the minimum length of a hex string for it to be replaced:
	// 48 bits for free-standing hex strings (e.g. "0123456789ab"), or 40 bits
	// for "0xABCDEF1234"-style strings, where the "0x" prefix is counted as
	// part of the length.
	minHexLen = 12
)
|
|
|
|
// boundaryChars is a lookup table indexed by byte value; an entry is true when
// that byte separates tokens. It is populated by init().
var boundaryChars [256]bool
|
|
|
|
func init() {
|
|
for i := 0; i < 128; i++ { // for now, we don't consider non-ASCII chars as boundary
|
|
if i < '0' || (i > '9' && i < 'A') || (i > 'Z' && i < 'a') || i > 'z' {
|
|
boundaryChars[i] = true
|
|
}
|
|
}
|
|
|
|
// We need these keys sorted in an ascending length to efficiently compare them.
|
|
slices.SortStableFunc(numPreservingKeys, func(a, b []byte) int {
|
|
return len(a) - len(b)
|
|
})
|
|
}
|
|
|
|
// replacer holds the state of one pass over a single log line: the input
// (source), the output built so far (dest) and three positions into source.
//
// Here's some ASCII art to visualize the positions (the long string of dashes
// visualizes the log line):
//
//	0 <already in dest, copied or replaced > `tail` <about to be copied or replaced> `cur` <lookahead> `head` <remaining unprocessed log> `len(source)`
//	|---------------------------------------------|------------------------------------------|---------------------|------------------------------------------|
type replacer struct {
	source []byte
	dest   []byte

	// tail is the last position that was copied to the destination buffer.
	tail int
	// cur is the current position we are working with.
	cur int
	// head is the lookahead position, used for lookahead operations.
	head int

	// preserveNextNumbers is a somewhat hacky solution that allows us to
	// preserve specific numeric values we care about, like status and
	// status_code.
	preserveNextNumbers bool
}
|
|
|
|
// commit advances the current position marker to the lookahead position, i.e.
|
|
// we are committing to consume everything we've looked ahead so far.
|
|
func (r *replacer) commit() {
|
|
r.cur = r.head
|
|
}
|
|
|
|
func (r *replacer) resetHead() {
|
|
r.head = r.cur
|
|
}
|
|
|
|
func (r *replacer) resetHeadExpr() bool {
|
|
r.head = r.cur
|
|
return true // useful when included in boolean expressions with advance()
|
|
}
|
|
|
|
func (r *replacer) backtrack() {
|
|
r.head--
|
|
}
|
|
|
|
func (r *replacer) consumeUpToCurrent() {
|
|
r.tail = r.cur
|
|
}
|
|
|
|
// advanceUintRet returns the actual value of the number we read, as well as its
|
|
// length. The value might overflow for large numbers, so it's also important to
|
|
// check the length!
|
|
func (r *replacer) advanceUintRet() (val uint, length uint) {
|
|
var c byte
|
|
for ; r.head < len(r.source); r.head++ {
|
|
c = r.source[r.head]
|
|
if c < '0' || '9' < c {
|
|
break
|
|
}
|
|
length++
|
|
val = val*10 + uint(c-'0') // TODO: use bitwise trick?
|
|
}
|
|
|
|
return val, length
|
|
}
|
|
|
|
func (r *replacer) advanceUintOrRangeRet(lc, hc byte) (length uint) {
|
|
var c byte
|
|
for ; r.head < len(r.source); r.head++ {
|
|
c = r.source[r.head]
|
|
if !(('0' <= c && c <= '9') || (lc <= c && c <= hc)) {
|
|
break
|
|
}
|
|
length++
|
|
}
|
|
|
|
return length
|
|
}
|
|
|
|
func (r *replacer) advanceUint() bool {
|
|
_, l := r.advanceUintRet()
|
|
return l > 0
|
|
}
|
|
|
|
func (r *replacer) advanceUintUpTo(val uint, length uint) bool {
|
|
foundVal, foundLength := r.advanceUintRet()
|
|
return foundLength > 0 && foundVal <= val && foundLength <= length
|
|
}
|
|
|
|
func (r *replacer) advanceYear() bool {
|
|
return r.advanceUintUpTo(maxYear, 4)
|
|
}
|
|
|
|
func (r *replacer) advanceChar(c byte) bool {
|
|
if r.head < len(r.source) && r.source[r.head] == c {
|
|
r.head++
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (r *replacer) peekNextIsBoundary() bool {
|
|
if r.head >= len(r.source) {
|
|
return true // we are at the end of the line
|
|
}
|
|
return boundaryChars[r.source[r.head]]
|
|
}
|
|
|
|
func (r *replacer) peekFirstNonInt() (c byte) {
|
|
for i := r.head; i < len(r.source); i++ {
|
|
c = r.source[i]
|
|
if c < '0' || '9' < c {
|
|
return c
|
|
}
|
|
}
|
|
return 0 // we can return the 0 value here!
|
|
}
|
|
|
|
func (r *replacer) peekNext4() (result [4]byte) {
|
|
overhead := len(r.source) - r.head
|
|
switch {
|
|
case overhead > 3:
|
|
result[3] = r.source[r.head+3]
|
|
fallthrough
|
|
case overhead > 2:
|
|
result[2] = r.source[r.head+2]
|
|
fallthrough
|
|
case overhead > 1:
|
|
result[1] = r.source[r.head+1]
|
|
fallthrough
|
|
case overhead > 0:
|
|
result[0] = r.source[r.head+0]
|
|
}
|
|
return result
|
|
}
|
|
|
|
func (r *replacer) advanceTimeZoneLetters() bool {
|
|
UCLetters := 0
|
|
for {
|
|
nc, ok := r.advance()
|
|
if !ok {
|
|
break
|
|
}
|
|
if nc < 'A' || nc > 'Z' {
|
|
r.backtrack()
|
|
break
|
|
}
|
|
UCLetters++
|
|
}
|
|
|
|
return UCLetters >= 2 && UCLetters <= 5
|
|
}
|
|
|
|
func (r *replacer) advanceNumericTimeZone() bool {
|
|
// See https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
|
|
return r.advanceOneOf('+', '-') && r.advanceUintUpTo(14000, 5)
|
|
}
|
|
|
|
// helper for handleWeirdTimestamp()
|
|
func (r *replacer) advanceStringOrNumericTimeZone(isOptional bool) bool {
|
|
curHead := r.head
|
|
if r.advanceChar(' ') && r.advanceNumericTimeZone() {
|
|
return true
|
|
}
|
|
r.head = curHead
|
|
if r.advanceChar(' ') && r.advanceTimeZoneLetters() {
|
|
return true
|
|
}
|
|
r.head = curHead
|
|
return isOptional
|
|
}
|
|
|
|
func (r *replacer) advanceOneOf(chars ...byte) bool {
|
|
if r.head >= len(r.source) {
|
|
return false
|
|
}
|
|
c := r.source[r.head]
|
|
|
|
for _, ec := range chars {
|
|
if c == ec {
|
|
r.head++
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (r *replacer) advanceTime(secondsOptional bool) bool {
|
|
return r.advanceUintUpTo(24, 2) && r.advanceChar(':') && r.advanceUintUpTo(60, 2) && (secondsOptional || (r.advanceChar(':') && r.advanceUintUpTo(60, 2)))
|
|
}
|
|
|
|
// Mon|Tue|Wed|Thu|Fri|Sat|Sun
|
|
func (r *replacer) advanceDayOfTheWeek() bool {
|
|
c1, ok1 := r.advance()
|
|
c2, ok2 := r.advance()
|
|
c3, ok3 := r.advance()
|
|
if !ok1 || !ok2 || !ok3 {
|
|
return false
|
|
}
|
|
return (c1 == 'S' && ((c2 == 'a' && c3 == 't') || (c2 == 'u' && c3 == 'n'))) || // Sat, Sun
|
|
(c1 == 'M' && c2 == 'o' && c3 == 'n') ||
|
|
(c1 == 'T' && c2 == 'u' && c3 == 'e') ||
|
|
(c1 == 'W' && c2 == 'e' && c3 == 'd') ||
|
|
(c1 == 'T' && c2 == 'h' && c3 == 'u') ||
|
|
(c1 == 'F' && c2 == 'r' && c3 == 'i')
|
|
}
|
|
|
|
// Jan|Feb|Mar|Apr|May|Jul|Jun|Aug|Sep|Oct|Nov|Dec
|
|
func (r *replacer) advanceMonthName() bool {
|
|
c1, ok1 := r.advance()
|
|
c2, ok2 := r.advance()
|
|
c3, ok3 := r.advance()
|
|
if !ok1 || !ok2 || !ok3 {
|
|
return false
|
|
}
|
|
|
|
return (c1 == 'J' && ((c2 == 'u' && (c3 == 'n' || c3 == 'l')) || // Jun, Jul
|
|
(c2 == 'a' && c3 == 'n'))) || // Jan
|
|
(c1 == 'M' && c2 == 'a' && (c3 == 'r' || c3 == 'y')) || // Mar, May
|
|
(c2 == 'e' && ((c1 == 'F' && c3 == 'b') || (c1 == 'S' && c3 == 'p') || (c1 == 'D' && c3 == 'c'))) || // Feb, Sep, Dec
|
|
(c1 == 'A' && ((c2 == 'p' && c3 == 'r') || // Apr
|
|
(c2 == 'u' && c3 == 'g'))) || // Aug
|
|
(c1 == 'O' && c2 == 'c' && c3 == 't') || // Oct
|
|
(c1 == 'N' && c2 == 'o' && c3 == 'v') // Nov
|
|
}
|
|
|
|
// Check if we in the middle of an UUID, exactly after the first 8 characters
|
|
// and the dash after them, e.g after "abcd0123-":
|
|
func (r *replacer) advanceUUIDAfterFirstDash(lc, hc byte) (endsWithBoundary bool) {
|
|
return (r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') &&
|
|
(r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') &&
|
|
(r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') &&
|
|
(r.advanceUintOrRangeRet(lc, hc) == 12) && r.peekNextIsBoundary()
|
|
}
|
|
|
|
// Only moves the head forward if it successfully matches a duration
|
|
func (r *replacer) advanceDuration() (matched bool) {
|
|
curHead := r.head
|
|
|
|
var secsLen int
|
|
n := r.peekNext4()
|
|
if n[0] == 'h' {
|
|
r.head++
|
|
if boundaryChars[n[1]] {
|
|
return true // e.g. "1h", "123h"
|
|
}
|
|
if !r.advanceUintUpTo(60, 2) {
|
|
goto restore
|
|
}
|
|
n = r.peekNext4()
|
|
}
|
|
if n[0] == 'm' && (boundaryChars[n[1]] || n[1] != 's') { // we don't want to match 'ms' here
|
|
r.head++
|
|
if boundaryChars[n[1]] {
|
|
return true // e.g. "2h21m", "121m"
|
|
}
|
|
if !(r.advanceUintUpTo(60, 2) && ((r.advanceChar('.') && r.advanceUint()) || true)) {
|
|
goto restore
|
|
}
|
|
n = r.peekNext4()
|
|
}
|
|
|
|
if n[0] == 's' && boundaryChars[n[1]] {
|
|
secsLen = 1
|
|
} else if n[1] == 's' && (n[0] == 'm' || n[0] == 'n' || n[0] == 'u') && boundaryChars[n[2]] {
|
|
secsLen = 2
|
|
} else if n[2] == 's' && ((n[0] == 0xC2 && n[1] == 0xB5) || (n[0] == 0xCE && n[1] == 0xBC)) && boundaryChars[n[3]] {
|
|
// This checks for the unicode "µs" (U+00B5 = micro symbol) and "μs" (U+03BC = Greek letter mu)
|
|
secsLen = 3
|
|
} else {
|
|
goto restore
|
|
}
|
|
|
|
r.head += secsLen
|
|
return true
|
|
|
|
restore: // should be faster than a defer
|
|
r.head = curHead
|
|
return false
|
|
}
|
|
|
|
// byteSizes marks the characters that can start a byte size unit with a
// prefix (kilo, mega, giga, tera, peta; in either case).
//
// 'b' and 'B' are not present here because of the way we check for byte size
// units below. If they were present, then suffixes like 'Bb', 'bb', etc. would
// be considered valid byte sizes. Also, only integer numbers are accepted as
// valid bytesizes in bytes, so we handle bytes with special cases instead.
var byteSizes = [256]bool{
	'k': true, 'K': true,
	'm': true, 'M': true,
	'g': true, 'G': true,
	't': true, 'T': true,
	'p': true, 'P': true,
}
|
|
|
|
// Only moves the head forward if it successfully matches a duration
|
|
func (r *replacer) advanceBytesize(c1 byte) (matched bool) {
|
|
if !byteSizes[c1] {
|
|
return false
|
|
}
|
|
n := r.peekNext4()
|
|
|
|
var unitLen int // not counting the first character c1, which is already advanced to
|
|
if (n[0] == 'b' || n[0] == 'B') && boundaryChars[n[1]] {
|
|
unitLen = 1
|
|
} else if n[0] == 'i' && (n[1] == 'b' || n[1] == 'B') && boundaryChars[n[2]] {
|
|
unitLen = 2
|
|
} else if ((n[0] == 'b' && n[1] == 'p' && n[2] == 's') || (n[0] == 'b' && n[1] == 'i' && n[2] == 't')) && boundaryChars[n[3]] {
|
|
unitLen = 3
|
|
}
|
|
|
|
if unitLen > 0 {
|
|
r.head += unitLen
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (r *replacer) advanceSpacedBytesize(canBeBytes bool) (matched bool) {
|
|
// Get the next character after the space
|
|
c1, hasNext := r.advance()
|
|
if !hasNext {
|
|
return false
|
|
}
|
|
if canBeBytes && (c1 == 'b' || c1 == 'B') && r.peekNextIsBoundary() {
|
|
return true
|
|
}
|
|
if r.advanceBytesize(c1) {
|
|
return true
|
|
}
|
|
r.backtrack()
|
|
return false
|
|
}
|
|
|
|
func (r *replacer) advance() (c byte, advanced bool) {
|
|
if r.head >= len(r.source) {
|
|
return 0, false
|
|
}
|
|
|
|
c = r.source[r.head]
|
|
r.head++
|
|
return c, true
|
|
}
|
|
|
|
func (r *replacer) emitNumber() {
|
|
r.commit()
|
|
r.dest = append(r.dest, placeholderNumber...)
|
|
r.consumeUpToCurrent()
|
|
}
|
|
|
|
func (r *replacer) emitNumberOrCopyText(hasMinusPrefix bool) {
|
|
r.commit()
|
|
if !r.preserveNextNumbers {
|
|
r.dest = append(r.dest, placeholderNumber...)
|
|
r.consumeUpToCurrent()
|
|
} else {
|
|
r.maybeEmitDash(hasMinusPrefix)
|
|
r.copyUpToCurrent()
|
|
}
|
|
}
|
|
|
|
func (r *replacer) emit(hasMinusPrefix bool, placeholder []byte) {
|
|
r.commit()
|
|
r.maybeEmitDash(hasMinusPrefix)
|
|
r.dest = append(r.dest, placeholder...)
|
|
r.consumeUpToCurrent()
|
|
}
|
|
|
|
func (r *replacer) maybeEmitDash(hasMinusPrefix bool) {
|
|
// This minus was actually a dash, so we just copy it to the result
|
|
if hasMinusPrefix {
|
|
r.dest = append(r.dest, '-')
|
|
}
|
|
}
|
|
|
|
func (r *replacer) copyUpToCurrent() {
|
|
r.dest = append(r.dest, r.source[r.tail:r.cur]...)
|
|
r.consumeUpToCurrent()
|
|
}
|
|
|
|
func (r *replacer) handleHexOrUnit(hasMinusPrefix bool, n1, l1 uint, c1 byte) (endsWithBoundary bool) {
|
|
// Special case that is likely a hex string of the format "0x", but we don't
|
|
// know whether the rest is upper case or lower case yet.
|
|
zeroHex := false
|
|
if n1 == 0 && l1 == 1 && c1 == 'x' {
|
|
zeroHex = true // these cannot be the start of an UUID
|
|
c1 = r.peekFirstNonInt()
|
|
}
|
|
|
|
// Special case, this might be a byte size
|
|
if (c1 == 'b' || c1 == 'B') && r.peekNextIsBoundary() {
|
|
// We do not subsume a minus sign - byte sizes are unlikely to be
|
|
// negative, it's more likely this is a dash as a part of a range
|
|
r.emit(hasMinusPrefix, placeholderBytesize)
|
|
return true
|
|
}
|
|
|
|
// Maybe we are at the start of a hex string, either something like
|
|
// "[0-9]+[a-f]", "[0-9]+[A-F]", or "0x". We support both lower and upper
|
|
// case letters, but to avoid false positives, we want hex replacements to
|
|
// happen only if the strings are fully lower case or fully upper case.
|
|
if 'a' <= c1 && c1 <= 'f' {
|
|
return r.handleHex(hasMinusPrefix, l1+1, 'a', 'f', !zeroHex)
|
|
} else if 'A' <= c1 && c1 <= 'F' {
|
|
return r.handleHex(hasMinusPrefix, l1+1, 'A', 'F', !zeroHex)
|
|
} else if zeroHex {
|
|
// Well, it probably wasn't a zero-hex after all, or it contained only
|
|
// digits, so try to handle that or emit what we absorbed
|
|
_, l2 := r.advanceUintRet()
|
|
if l2+2 >= minHexLen { // We consider "0x" to be part of the hex string when comparing with minHexLen
|
|
r.emit(hasMinusPrefix, placeholderHex)
|
|
} else {
|
|
r.maybeEmitDash(hasMinusPrefix)
|
|
r.commit()
|
|
r.copyUpToCurrent()
|
|
}
|
|
return r.peekNextIsBoundary()
|
|
}
|
|
|
|
return r.handlePotentialUnitWithDecimal(hasMinusPrefix, c1)
|
|
}
|
|
|
|
func (r *replacer) handleHex(hasMinusPrefix bool, l1 uint, lc, hc byte, canBeUUID bool) (endsWithBoundary bool) {
|
|
totalLen := l1 + r.advanceUintOrRangeRet(lc, hc)
|
|
r.commit()
|
|
|
|
if totalLen == 8 && canBeUUID {
|
|
// We might be at the first part of a UUID, right before the first dash
|
|
if r.advanceChar('-') && r.advanceUUIDAfterFirstDash(lc, hc) {
|
|
r.emit(hasMinusPrefix, placeholderUUID)
|
|
return true
|
|
}
|
|
r.resetHead()
|
|
}
|
|
|
|
if totalLen >= minHexLen && r.peekNextIsBoundary() {
|
|
r.emit(hasMinusPrefix, placeholderHex)
|
|
return true
|
|
}
|
|
|
|
r.copyUpToCurrent()
|
|
return r.peekNextIsBoundary()
|
|
}
|
|
|
|
func (r *replacer) handlePotentialUnitWithDecimal(hasMinusPrefix bool, c1 byte) (endsWithBoundary bool) {
|
|
if r.advanceBytesize(c1) {
|
|
// We do not subsume a minus sign - byte sizes are unlikely to be
|
|
// negative, it's more likely this is a dash as a part of a range
|
|
r.emit(hasMinusPrefix, placeholderBytesize)
|
|
return true
|
|
}
|
|
|
|
r.backtrack()
|
|
if r.advanceDuration() {
|
|
// We subsume hasMinusPrefix, since durations can be negative
|
|
r.emit(false, placeholderDuration)
|
|
return true
|
|
}
|
|
|
|
// We couldn't match anything, so just copy what existed.
|
|
r.maybeEmitDash(hasMinusPrefix)
|
|
r.copyUpToCurrent()
|
|
return false
|
|
}
|
|
|
|
func (r *replacer) handleNumberWithDecimal(hasMinusPrefix bool, n1 uint, l1 uint) (endsWithBoundary bool) {
|
|
n2, l2 := r.advanceUintRet()
|
|
if l2 == 0 {
|
|
// The dot wasn't followed by another number, so emit everything before it.
|
|
r.backtrack()
|
|
r.emitNumberOrCopyText(hasMinusPrefix)
|
|
return false
|
|
}
|
|
|
|
// See if the number after the decimal is also followed by a boundary
|
|
b2, hasNext := r.advance()
|
|
|
|
// We are at the end of the string, which is a boundary, replace evertything
|
|
// up to now with a number
|
|
if !hasNext {
|
|
r.emitNumber() // this also subsumes any minus sign we had
|
|
return true
|
|
}
|
|
|
|
// The decimal number isn't followed by a boundary char (which include
|
|
// things like '.', ':', '/', etc.), so the part after the decimal is either
|
|
// not a real number, or it's some sort of a unit that can support decimals
|
|
// like size (e.g. 12KB, 3MiB) or durations (e.g. 3.5124s), etc.
|
|
if !boundaryChars[b2] {
|
|
return r.handlePotentialUnitWithDecimal(hasMinusPrefix, b2)
|
|
}
|
|
|
|
// This can be a byte size with a space, e.g. "3.14 GiB"
|
|
if b2 == ' ' && r.advanceSpacedBytesize(false) {
|
|
// We do not subsume a minus sign - byte sizes are unlikely to be
|
|
// negative, it's more likely this is a dash as a part of a range
|
|
r.emit(hasMinusPrefix, placeholderBytesize)
|
|
return true
|
|
}
|
|
|
|
// We have a decimal number followed by a non-dot boundary, so this is not
|
|
// an IP or a version number or anything like that.
|
|
if b2 != '.' {
|
|
r.backtrack()
|
|
r.emitNumber()
|
|
return true
|
|
}
|
|
|
|
n3, l3 := r.advanceUintRet()
|
|
if l3 == 0 || !r.peekNextIsBoundary() {
|
|
// The second dot wasn't followed by another number and a boundary, so
|
|
// emit just the first number.
|
|
r.resetHead()
|
|
r.emitNumber()
|
|
return true
|
|
}
|
|
|
|
// We have "<NUM>.<NUM>.<NUM>" at this point, so we either have something
|
|
// like a version number, or an IP address, but certainly not a simple
|
|
// decimal number we can just emit.
|
|
r.commit()
|
|
|
|
// Check if this is an IP address...
|
|
if n1 <= 255 && l1 <= 3 && n2 <= 255 && l2 <= 3 && n3 <= 255 && l3 <= 3 && r.advanceChar('.') && r.advanceUintUpTo(255, 3) && r.peekNextIsBoundary() {
|
|
r.emit(hasMinusPrefix, placeholderIP)
|
|
return true
|
|
}
|
|
|
|
// This wasn't an IP after all, so just emit "<NUM>.<NUM>.<NUM>". We don't
|
|
// want to assume this is a simple decimal number because of the second dot,
|
|
// e.g. this might be something like a version number.
|
|
r.resetHead()
|
|
r.maybeEmitDash(hasMinusPrefix) // we preserve the dashes here, since it's likely not a minus
|
|
r.dest = append(r.dest, placeholderNumber...)
|
|
r.dest = append(r.dest, '.')
|
|
r.dest = append(r.dest, placeholderNumber...)
|
|
r.dest = append(r.dest, '.')
|
|
r.dest = append(r.dest, placeholderNumber...)
|
|
r.consumeUpToCurrent()
|
|
return true
|
|
}
|
|
|
|
func (r *replacer) handleSaneTimestamp(hasMinusPrefix bool, n1 uint, delim byte) (endsWithBoundary bool) {
|
|
if r.advanceUintUpTo(12, 2) && r.advanceChar(delim) && r.advanceUintUpTo(31, 2) && r.advanceOneOf('T', ' ') && r.advanceTime(false) {
|
|
r.commit()
|
|
// continue down to parsing sub-second and timezone values
|
|
} else if r.resetHeadExpr() && n1 <= 31 && r.advanceChar(delim) && r.advanceMonthName() && r.advanceChar(delim) && r.advanceYear() && r.advanceChar(':') && r.advanceTime(false) && r.advanceChar(' ') && r.advanceNumericTimeZone() && r.peekNextIsBoundary() {
|
|
// We might not have a sane timestamp, but apparently a not-so-sane
|
|
// timestamp format first, e.g. "27/Mar/2024:14:34:37 +0000"...
|
|
r.emit(hasMinusPrefix, placeholderTimestamp)
|
|
return true
|
|
} else {
|
|
// Apparently that wasn't it either, we probably just have a dash or
|
|
// forward slash after a number, so we backtrack and emit the number.
|
|
r.resetHead()
|
|
r.emitNumberOrCopyText(hasMinusPrefix)
|
|
return true
|
|
}
|
|
|
|
// We have a date that looks like `YY[YY]-MM-DD hh:mm:ss` or
|
|
// `YY[YY]/MM/DDZhh:mm:ss`
|
|
r.commit() // we want to keep this
|
|
|
|
// Now we need to also check for sub-second precision and time zones:
|
|
c, canAdvance := r.advance()
|
|
if !canAdvance {
|
|
// End of the string
|
|
r.emit(hasMinusPrefix, placeholderTimestamp)
|
|
return true
|
|
}
|
|
if c == '.' {
|
|
_, intl := r.advanceUintRet()
|
|
if intl == 0 {
|
|
// No sub-second precision, the dot was not part of the timestamp
|
|
r.backtrack()
|
|
r.emit(hasMinusPrefix, placeholderTimestamp)
|
|
return true
|
|
}
|
|
|
|
// We had sub-second precision, capture that too
|
|
r.commit()
|
|
|
|
// Get the next char to see if we have time zone
|
|
c, canAdvance = r.advance()
|
|
if !canAdvance {
|
|
// We are at the end of the sting after we captured the
|
|
// sub-second value.
|
|
r.emit(hasMinusPrefix, placeholderTimestamp)
|
|
return true
|
|
}
|
|
}
|
|
|
|
if c == 'Z' || c == 'z' {
|
|
//UTC string, nothing to do after that
|
|
r.emit(hasMinusPrefix, placeholderTimestamp)
|
|
return true
|
|
}
|
|
// See https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
|
|
if (c == '+' || c == '-') && r.advanceUintUpTo(14, 2) && r.advanceChar(':') && r.advanceUintUpTo(60, 2) {
|
|
// e.g. "2020-09-30T00:00:59.9999+03:00"
|
|
r.commit()
|
|
} else if r.resetHeadExpr() && r.advanceChar(' ') && r.advanceNumericTimeZone() && r.advanceChar(' ') && r.advanceTimeZoneLetters() && r.peekNextIsBoundary() {
|
|
// e.g. "2023-09-05 23:20:28.030285153 +0000 UTC"
|
|
r.commit()
|
|
} else {
|
|
r.resetHead()
|
|
}
|
|
r.emit(hasMinusPrefix, placeholderTimestamp)
|
|
return true
|
|
}
|
|
|
|
func (r *replacer) handleNumberStart(hasMinusPrefix bool) (endsWithBoundary bool) {
|
|
r.resetHead() // keep the head pos in sync with the current pos
|
|
// We know we were at a digit due to how handleNumber() is called
|
|
n1, l1 := r.advanceUintRet()
|
|
r.commit() // we will consume this one way or another for sure
|
|
|
|
// See if the number is followed by a boundary
|
|
b1, hasNext := r.advance()
|
|
|
|
switch {
|
|
// We are at the end of the string, which is a boundary, replace evertything
|
|
// up to now with a number
|
|
case !hasNext:
|
|
r.emitNumberOrCopyText(hasMinusPrefix) // this also may subsume any minus sign we had
|
|
return true
|
|
|
|
// The number isn't followed by a boundary char (which include things like
|
|
// '.', ' ', '/', etc.), so it's either not a real number or date, or it's
|
|
// some sort of a unit like a duration (e.g. 3h5m2s), size, (e.g. 12KB,
|
|
// 3MiB), etc.
|
|
case !boundaryChars[b1]:
|
|
return r.handleHexOrUnit(hasMinusPrefix, n1, l1, b1)
|
|
|
|
// We have a decimal point, so this can either be a plain number, a unit
|
|
// that can have a float value, or an IP address.
|
|
case b1 == '.':
|
|
return r.handleNumberWithDecimal(hasMinusPrefix, n1, l1)
|
|
|
|
// This might be a timestamp that looks like '2024-04-01...' or
|
|
// `2024/03/27...`; timestamps in the far future are not supported :)
|
|
case n1 <= maxYear && l1 <= 4 && (b1 == '-' || b1 == '/'):
|
|
return r.handleSaneTimestamp(hasMinusPrefix, n1, b1)
|
|
|
|
// This might be a byte size with a space, e.g. "2 b", "3 GiB"
|
|
case b1 == ' ' && r.advanceSpacedBytesize(true):
|
|
r.emit(hasMinusPrefix, placeholderBytesize)
|
|
return true
|
|
|
|
// Weird RFC822 dates like "02 Jan 06 15:04 MST"
|
|
case n1 <= 31 && l1 <= 2 && b1 == ' ':
|
|
if r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(true) && r.advanceStringOrNumericTimeZone(false) {
|
|
r.commit()
|
|
r.emit(hasMinusPrefix, placeholderTimestamp)
|
|
return true
|
|
}
|
|
// if not, go to default handler after switch statement
|
|
|
|
// It could be a UUID that starts with 8 digits
|
|
case l1 == 8 && b1 == '-':
|
|
if r.advanceUUIDAfterFirstDash('a', 'f') || (r.resetHeadExpr() && r.advanceChar('-') && r.advanceUUIDAfterFirstDash('A', 'F')) {
|
|
r.emit(hasMinusPrefix, placeholderUUID)
|
|
return true
|
|
}
|
|
// if not, go to default handler after switch statement
|
|
}
|
|
|
|
// Number with an unknown boundary - emit the number and leave the boundary
|
|
// for the following passes.
|
|
r.resetHead()
|
|
r.emitNumberOrCopyText(hasMinusPrefix)
|
|
return true
|
|
}
|
|
|
|
// longDayNames holds the full English names of the days of the week. The
// first 3 letters of each name are its abbreviated form.
var longDayNames = [7][]byte{
	[]byte("Sunday"),
	[]byte("Monday"),
	[]byte("Tuesday"),
	[]byte("Wednesday"),
	[]byte("Thursday"),
	[]byte("Friday"),
	[]byte("Saturday"),
}
|
|
|
|
func (r *replacer) handleWeirdTimestamp() (endsWithBoundary bool) {
|
|
r.resetHead()
|
|
|
|
if r.advanceDayOfTheWeek() {
|
|
r.commit() // we will always consume this
|
|
|
|
// RFC1123 and RFC1123Z, e.g.:
|
|
// - "Mon, 02 Jan 2006 15:04:05 MST"
|
|
// - "Mon, 02 Jan 2006 15:04:05 -0700"
|
|
if r.advanceChar(',') && r.advanceChar(' ') && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(false) {
|
|
r.emit(false, placeholderTimestamp)
|
|
return true
|
|
}
|
|
r.resetHead()
|
|
|
|
// ANSIC, UnixDatem, RubyDate e.g
|
|
// - "Mon Jan 2 15:04:05 2006"
|
|
// - "Mon Jan 2 15:04:05 MST 2006"
|
|
// - "Mon Jan 02 15:04:05 -0700 2006"
|
|
if r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && (r.advanceChar(' ') || true) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(true) && r.advanceChar(' ') && r.advanceYear() {
|
|
r.emit(false, placeholderTimestamp)
|
|
return true
|
|
}
|
|
r.resetHead()
|
|
|
|
// Linux, e.g.
|
|
// - "Mon 2 Jan 15:04:05 MST 2006"
|
|
// - "Tue 23 Jan 15:04:05 -0700 2023"
|
|
if r.advanceChar(' ') && (r.advanceChar(' ') || true) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(false) && r.advanceChar(' ') && r.advanceYear() {
|
|
r.emit(false, placeholderTimestamp)
|
|
return true
|
|
}
|
|
r.resetHead()
|
|
|
|
// RFC850, e.g.
|
|
// - "Monday, 02-Jan-06 15:04:05 MST"
|
|
backtrackedSlice := r.source[r.head-3:]
|
|
var matchedDay []byte
|
|
for _, dw := range longDayNames {
|
|
if bytes.HasPrefix(backtrackedSlice, dw) {
|
|
matchedDay = dw
|
|
break
|
|
}
|
|
}
|
|
if matchedDay != nil {
|
|
r.head += len(matchedDay) - 3
|
|
if r.advanceChar(',') && r.advanceChar(' ') && r.advanceUintUpTo(31, 2) && r.advanceChar('-') && r.advanceMonthName() && r.advanceChar('-') && r.advanceUintUpTo(99, 2) && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(true) {
|
|
r.emit(false, placeholderTimestamp)
|
|
return true
|
|
}
|
|
}
|
|
|
|
r.cur -= 3 // unconsume
|
|
r.resetHead()
|
|
return false
|
|
}
|
|
r.resetHead()
|
|
|
|
if r.advanceMonthName() {
|
|
r.commit() // provisionally consume this
|
|
|
|
// Linux journald logs and others similar like this:
|
|
// - Feb 29 23:00:14
|
|
// - Apr-10 23:43:46.807
|
|
// - Jul 1 00:21:28
|
|
if (r.advanceChar('-') || (r.advanceChar(' ') && (r.advanceChar(' ') || true))) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceTime(false) {
|
|
r.commit()
|
|
// This is already a timestamp, but let's try to match subseconds as well
|
|
if r.advanceChar('.') && r.advanceUint() {
|
|
r.commit()
|
|
} else {
|
|
r.resetHead()
|
|
}
|
|
r.emit(false, placeholderTimestamp)
|
|
return true
|
|
}
|
|
r.cur -= 3 // unconsume
|
|
r.resetHead()
|
|
return false
|
|
}
|
|
r.resetHead()
|
|
|
|
return false
|
|
}
|
|
|
|
func (r *replacer) wasNumPreservingKeyword() bool {
|
|
for _, key := range numPreservingKeys {
|
|
pPos := r.cur - 1 - len(key)
|
|
if pPos < -1 {
|
|
return false // all subsequent keys are longer
|
|
}
|
|
if pPos != -1 && !boundaryChars[r.source[pPos]] {
|
|
continue
|
|
}
|
|
if bytes.HasPrefix(r.source[pPos+1:], key) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (r *replacer) replaceWithPlaceholders() {
|
|
lineLen := len(r.source)
|
|
|
|
var c byte
|
|
onBoundary := true
|
|
for r.cur = 0; r.cur < lineLen; r.cur++ {
|
|
c = r.source[r.cur]
|
|
switch {
|
|
// If we're currently not at a boundary, the only thing we need to check
|
|
// is whether the current char would create a boundary in the next iteration.
|
|
case !onBoundary:
|
|
onBoundary = boundaryChars[c]
|
|
|
|
// We weren't at a boundary and now we are, so check if we match one
|
|
// of the special keywords that will preserve numbers
|
|
if onBoundary {
|
|
r.preserveNextNumbers = r.wasNumPreservingKeyword()
|
|
}
|
|
|
|
// If we've reached this far, it means we're currently at a boundary!
|
|
|
|
// A lot of very complex logic if we encounter a number at a boundary,
|
|
// so we move that to a helper function.
|
|
case '0' <= c && c <= '9':
|
|
r.copyUpToCurrent()
|
|
onBoundary = r.handleNumberStart(false)
|
|
|
|
// Handle negative numbers, potentially
|
|
case c == '-':
|
|
next := r.cur + 1
|
|
// This might be a number, a date, an IP address, etc. So we don't
|
|
// know if this is a minus sign to replace or a dash to copy yet.
|
|
if next < lineLen && '0' <= r.source[next] && r.source[next] <= '9' {
|
|
// Copy everything before the dash, but mark it as consumed.
|
|
r.copyUpToCurrent()
|
|
r.cur++
|
|
r.consumeUpToCurrent()
|
|
onBoundary = r.handleNumberStart(true)
|
|
} else {
|
|
onBoundary = true
|
|
}
|
|
|
|
// Try to match weird timestamps. They require a lot of remaining
|
|
// length, generally start with a capitalized day of the week or month
|
|
// name (1 upper case letter followed by 2 lower case letters).
|
|
//
|
|
// We are basically looking for something that may match this here:
|
|
// Mon|Tue|Wed|Thu|Fri|Sat|Sun|Jan|Feb|Mar|Apr|May|Jul|Jun|Aug|Sep|Oct|Nov|Dec
|
|
//
|
|
// The detailed check would be performed by the actual handler:
|
|
case 'A' <= c && c <= 'W' && lineLen-r.cur >= 14 &&
|
|
'a' <= r.source[r.cur+1] && r.source[r.cur+1] <= 'u' &&
|
|
'b' <= r.source[r.cur+2] && r.source[r.cur+2] <= 'y':
|
|
r.copyUpToCurrent()
|
|
onBoundary = r.handleWeirdTimestamp()
|
|
|
|
// This could be the start of an lower case hex string:
|
|
case 'a' <= c && c <= 'f':
|
|
r.copyUpToCurrent()
|
|
r.resetHead()
|
|
onBoundary = r.handleHex(false, 0, 'a', 'f', true)
|
|
|
|
// This could be the start of an upper case hex string:
|
|
case 'A' <= c && c <= 'F':
|
|
r.copyUpToCurrent()
|
|
r.resetHead()
|
|
onBoundary = r.handleHex(false, 0, 'A', 'F', true)
|
|
|
|
// If we haven't actually matched anything, update whether we're still
|
|
// on a boundary character and continue onto the next one.
|
|
default:
|
|
onBoundary = boundaryChars[c]
|
|
}
|
|
}
|
|
|
|
if r.cur > r.tail {
|
|
r.dest = append(r.dest, r.source[r.tail:]...)
|
|
r.consumeUpToCurrent()
|
|
}
|
|
}
|
|
|
|
func Preprocess(content []byte) []byte {
|
|
// ~floor(120%), to allow for some expansion from replacements, hopefully
|
|
// without needing to allocate more memory
|
|
r := replacer{source: content, dest: make([]byte, 0, len(content)*120/100)}
|
|
r.replaceWithPlaceholders()
|
|
return r.dest
|
|
}
|
|
|