feat: Generic logline placeholder replacement and tokenization (#12799)

This code allows us to preprocess generic log lines and replace highly variable dynamic data (timestamps, IPs, numbers, UUIDs, hex values, byte sizes, and durations) with static placeholders, for easier pattern extraction and more efficient, user-friendly matching by the Drain algorithm.
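
Here is a minimal usage sketch of the placeholder replacement. The input values and expected placeholders are taken from the test cases added in this change; the surrounding program and the module import path are illustrative assumptions, not part of the diff:

```go
package main

import (
	"fmt"

	// Assumed import path for the new package introduced by this change.
	"github.com/grafana/loki/v3/pkg/pattern/tokenization"
)

func main() {
	// Values borrowed from the metrics.go and nginx test cases below.
	line := []byte(`ts=2023-09-06T00:59:59.982171323Z traceID=4b93729ff3efabd0 ` +
		`duration=13.926955ms throughput=16MB client=172.17.0.1`)

	fmt.Println(string(tokenization.Preprocess(line)))
	// Expected output, per the included tests:
	//   ts=<TIMESTAMP> traceID=<HEX> duration=<DURATION> throughput=<BYTESIZE> client=<IP>
}
```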

Additionally, there is logic that splits generic log lines into discrete tokens that can be used with Drain, producing better results than naively splitting the logs on every whitespace character. The tokenizer tracks quotes and emits a quoted string as part of a single token. It also handles likely JSON logs that contain no whitespace at all, by trying to split them on `{"key":value}` pairs (without actually parsing the JSON).
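
And a similar hedged sketch of the tokenizer's quote and JSON handling; the inputs and expected tokens are copied verbatim from the included test cases, while the wrapper program and import path are again assumptions:

```go
package main

import (
	"fmt"

	// Assumed import path for the new package introduced by this change.
	"github.com/grafana/loki/v3/pkg/pattern/tokenization"
)

func main() {
	// Quoted strings (including escaped quotes) are kept inside a single token.
	fmt.Println(tokenization.PreprocessAndTokenize([]byte(`a "quoted string with \" escapes!"`)))
	// expected tokens: ["a", `"quoted string with \" escapes!"`]

	// Whitespace-free JSON is split on `":` and `,"` boundaries without being parsed.
	fmt.Println(tokenization.PreprocessAndTokenize([]byte(`{"json":"uninterrupted \"logline\"","foo":"bar"}`)))
	// expected tokens: [`{"json":`, `"uninterrupted \"logline\""`, `,"foo":`, `"bar"}`]
}
```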

All of this is done without using regular expressions and without actually parsing the log lines in any specific format. That's why it is very efficient in terms of CPU usage and allocations, and it should handle all log formats, as well as unformatted logs, equally well.
Author: Ned Andreev (committed by GitHub)
parent 151d0a58ac
commit 40479029d7
4 changed files with 1,462 additions:
  +859  pkg/pattern/tokenization/replacer.go
  +201  pkg/pattern/tokenization/tokenization.go
   +26  pkg/pattern/tokenization/tokenization_benchmark_test.go
  +376  pkg/pattern/tokenization/tokenization_test.go

pkg/pattern/tokenization/replacer.go
@@ -0,0 +1,859 @@
package tokenization
import (
"bytes"
"slices"
)
var (
placeholderNumber = []byte("<NUM>")
placeholderHex = []byte("<HEX>")
placeholderUUID = []byte("<UUID>")
placeholderTimestamp = []byte("<TIMESTAMP>")
placeholderDuration = []byte("<DURATION>")
placeholderBytesize = []byte("<BYTESIZE>")
placeholderIP = []byte("<IP>")
)
// Integer numbers after these words won't be replaced by `<NUM>`.
var numPreservingKeys = [][]byte{[]byte("status"), []byte("status_code"), []byte("httpStatus")}
const (
maxYear = 2100 // anything above that is unlikely to be a timestamp...
minHexLen = 12 // 48 bits min for free-standing hex strings (e.g. "0123456789ab") or 40 bits for "0xABCDEF1234" strings
)
var boundaryChars = [256]bool{}
func init() {
for i := 0; i < 128; i++ { // for now, we don't consider non-ASCII chars as boundary
if i < '0' || (i > '9' && i < 'A') || (i > 'Z' && i < 'a') || i > 'z' {
boundaryChars[i] = true
}
}
// We need these keys sorted by ascending length to compare them efficiently.
slices.SortStableFunc(numPreservingKeys, func(a, b []byte) int {
return len(a) - len(b)
})
}
type replacer struct {
source, dest []byte
// This is the last position that was copied to the destination buffer.
tail int
// This is the current position we are working with
cur int
// This is essentially used for lookahead operations
head int
// Here's some ASCII art to visualize that (the long string of dashes
// represents the log line):
//
// 0 <already in dest, copied or replaced > `tail` <about to be copied or replaced> `cur` <lookahead> `head` <remaining unprocessed log> `len(source)`
// |---------------------------------------------|------------------------------------------|---------------------|------------------------------------------|
// A somewhat hacky solution that allows us to preserve specific numeric
// values we care about, like status and status_code.
preserveNextNumbers bool
}
// commit advances the current position marker to the lookahead position, i.e.
// we are committing to consume everything we've looked ahead so far.
func (r *replacer) commit() {
r.cur = r.head
}
func (r *replacer) resetHead() {
r.head = r.cur
}
func (r *replacer) resetHeadExpr() bool {
r.head = r.cur
return true // useful when included in boolean expressions with advance()
}
func (r *replacer) backtrack() {
r.head--
}
func (r *replacer) consumeUpToCurrent() {
r.tail = r.cur
}
// advanceUintRet returns the actual value of the number we read, as well as its
// length. The value might overflow for large numbers, so it's also important to
// check the length!
func (r *replacer) advanceUintRet() (val uint, length uint) {
var c byte
for ; r.head < len(r.source); r.head++ {
c = r.source[r.head]
if c < '0' || '9' < c {
break
}
length++
val = val*10 + uint(c-'0') // TODO: use bitwise trick?
}
return val, length
}
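// advanceUintOrRangeRet advances the head over characters that are either
// decimal digits or in the inclusive range [lc, hc] (e.g. 'a'..'f' for
// lower-case hex) and returns how many characters were consumed.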
func (r *replacer) advanceUintOrRangeRet(lc, hc byte) (length uint) {
var c byte
for ; r.head < len(r.source); r.head++ {
c = r.source[r.head]
if !(('0' <= c && c <= '9') || (lc <= c && c <= hc)) {
break
}
length++
}
return length
}
func (r *replacer) advanceUint() bool {
_, l := r.advanceUintRet()
return l > 0
}
func (r *replacer) advanceUintUpTo(val uint, length uint) bool {
foundVal, foundLength := r.advanceUintRet()
return foundLength > 0 && foundVal <= val && foundLength <= length
}
func (r *replacer) advanceYear() bool {
return r.advanceUintUpTo(maxYear, 4)
}
func (r *replacer) advanceChar(c byte) bool {
if r.head < len(r.source) && r.source[r.head] == c {
r.head++
return true
}
return false
}
func (r *replacer) peekNextIsBoundary() bool {
if r.head >= len(r.source) {
return true // we are at the end of the line
}
return boundaryChars[r.source[r.head]]
}
func (r *replacer) peekFirstNonInt() (c byte) {
for i := r.head; i < len(r.source); i++ {
c = r.source[i]
if c < '0' || '9' < c {
return c
}
}
return 0 // we can return the 0 value here!
}
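// peekNext4 returns the next (up to) 4 bytes after the head without advancing
// it; positions past the end of the source are left as zero bytes.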
func (r *replacer) peekNext4() (result [4]byte) {
overhead := len(r.source) - r.head
switch {
case overhead > 3:
result[3] = r.source[r.head+3]
fallthrough
case overhead > 2:
result[2] = r.source[r.head+2]
fallthrough
case overhead > 1:
result[1] = r.source[r.head+1]
fallthrough
case overhead > 0:
result[0] = r.source[r.head+0]
}
return result
}
func (r *replacer) advanceTimeZoneLetters() bool {
UCLetters := 0
for {
nc, ok := r.advance()
if !ok {
break
}
if nc < 'A' || nc > 'Z' {
r.backtrack()
break
}
UCLetters++
}
return UCLetters >= 2 && UCLetters <= 5
}
func (r *replacer) advanceNumericTimeZone() bool {
// See https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
return r.advanceOneOf('+', '-') && r.advanceUintUpTo(14000, 5)
}
// helper for handleWeirdTimestamp()
func (r *replacer) advanceStringOrNumericTimeZone(isOptional bool) bool {
curHead := r.head
if r.advanceChar(' ') && r.advanceNumericTimeZone() {
return true
}
r.head = curHead
if r.advanceChar(' ') && r.advanceTimeZoneLetters() {
return true
}
r.head = curHead
return isOptional
}
func (r *replacer) advanceOneOf(chars ...byte) bool {
if r.head >= len(r.source) {
return false
}
c := r.source[r.head]
for _, ec := range chars {
if c == ec {
r.head++
return true
}
}
return false
}
func (r *replacer) advanceTime(secondsOptional bool) bool {
return r.advanceUintUpTo(24, 2) && r.advanceChar(':') && r.advanceUintUpTo(60, 2) && (secondsOptional || (r.advanceChar(':') && r.advanceUintUpTo(60, 2)))
}
// Mon|Tue|Wed|Thu|Fri|Sat|Sun
func (r *replacer) advanceDayOfTheWeek() bool {
c1, ok1 := r.advance()
c2, ok2 := r.advance()
c3, ok3 := r.advance()
if !ok1 || !ok2 || !ok3 {
return false
}
return (c1 == 'S' && ((c2 == 'a' && c3 == 't') || (c2 == 'u' && c3 == 'n'))) || // Sat, Sun
(c1 == 'M' && c2 == 'o' && c3 == 'n') ||
(c1 == 'T' && c2 == 'u' && c3 == 'e') ||
(c1 == 'W' && c2 == 'e' && c3 == 'd') ||
(c1 == 'T' && c2 == 'h' && c3 == 'u') ||
(c1 == 'F' && c2 == 'r' && c3 == 'i')
}
// Jan|Feb|Mar|Apr|May|Jul|Jun|Aug|Sep|Oct|Nov|Dec
func (r *replacer) advanceMonthName() bool {
c1, ok1 := r.advance()
c2, ok2 := r.advance()
c3, ok3 := r.advance()
if !ok1 || !ok2 || !ok3 {
return false
}
return (c1 == 'J' && ((c2 == 'u' && (c3 == 'n' || c3 == 'l')) || // Jun, Jul
(c2 == 'a' && c3 == 'n'))) || // Jan
(c1 == 'M' && c2 == 'a' && (c3 == 'r' || c3 == 'y')) || // Mar, May
(c2 == 'e' && ((c1 == 'F' && c3 == 'b') || (c1 == 'S' && c3 == 'p') || (c1 == 'D' && c3 == 'c'))) || // Feb, Sep, Dec
(c1 == 'A' && ((c2 == 'p' && c3 == 'r') || // Apr
(c2 == 'u' && c3 == 'g'))) || // Aug
(c1 == 'O' && c2 == 'c' && c3 == 't') || // Oct
(c1 == 'N' && c2 == 'o' && c3 == 'v') // Nov
}
// Check if we are in the middle of a UUID, exactly after the first 8 characters
// and the dash after them, e.g. after "abcd0123-":
func (r *replacer) advanceUUIDAfterFirstDash(lc, hc byte) (endsWithBoundary bool) {
return (r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') &&
(r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') &&
(r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') &&
(r.advanceUintOrRangeRet(lc, hc) == 12) && r.peekNextIsBoundary()
}
// Only moves the head forward if it successfully matches a duration
func (r *replacer) advanceDuration() (matched bool) {
curHead := r.head
var secsLen int
n := r.peekNext4()
if n[0] == 'h' {
r.head++
if boundaryChars[n[1]] {
return true // e.g. "1h", "123h"
}
if !r.advanceUintUpTo(60, 2) {
goto restore
}
n = r.peekNext4()
}
if n[0] == 'm' && (boundaryChars[n[1]] || n[1] != 's') { // we don't want to match 'ms' here
r.head++
if boundaryChars[n[1]] {
return true // e.g. "2h21m", "121m"
}
if !(r.advanceUintUpTo(60, 2) && ((r.advanceChar('.') && r.advanceUint()) || true)) {
goto restore
}
n = r.peekNext4()
}
if n[0] == 's' && boundaryChars[n[1]] {
secsLen = 1
} else if n[1] == 's' && (n[0] == 'm' || n[0] == 'n' || n[0] == 'u') && boundaryChars[n[2]] {
secsLen = 2
} else if n[2] == 's' && ((n[0] == 0xC2 && n[1] == 0xB5) || (n[0] == 0xCE && n[1] == 0xBC)) && boundaryChars[n[3]] {
// This checks for the unicode "µs" (U+00B5 = micro symbol) and "μs" (U+03BC = Greek letter mu)
secsLen = 3
} else {
goto restore
}
r.head += secsLen
return true
restore: // should be faster than a defer
r.head = curHead
return false
}
var byteSizes = [256]bool{'k': true, 'K': true, 'm': true, 'M': true, 'g': true, 'G': true, 't': true, 'T': true, 'p': true, 'P': true}
// Only moves the head forward if it successfully matches a byte size
func (r *replacer) advanceBytesize(c1 byte) (matched bool) {
if !byteSizes[c1] {
return false
}
n := r.peekNext4()
var unitLen int // not counting the first character c1, which is already advanced to
if (n[0] == 'b' || n[0] == 'B') && boundaryChars[n[1]] {
unitLen = 1
} else if n[0] == 'i' && (n[1] == 'b' || n[1] == 'B') && boundaryChars[n[2]] {
unitLen = 2
} else if ((n[0] == 'b' && n[1] == 'p' && n[2] == 's') || (n[0] == 'b' && n[1] == 'i' && n[2] == 't')) && boundaryChars[n[3]] {
unitLen = 3
}
if unitLen > 0 {
r.head += unitLen
return true
}
return false
}
func (r *replacer) advance() (c byte, advanced bool) {
if r.head >= len(r.source) {
return 0, false
}
c = r.source[r.head]
r.head++
return c, true
}
func (r *replacer) emitNumber() {
r.commit()
r.dest = append(r.dest, placeholderNumber...)
r.consumeUpToCurrent()
}
func (r *replacer) emitNumberOrCopyText(hasMinusPrefix bool) {
r.commit()
if !r.preserveNextNumbers {
r.dest = append(r.dest, placeholderNumber...)
r.consumeUpToCurrent()
} else {
r.maybeEmitDash(hasMinusPrefix)
r.copyUpToCurrent()
}
}
func (r *replacer) emit(hasMinusPrefix bool, placeholder []byte) {
r.commit()
r.maybeEmitDash(hasMinusPrefix)
r.dest = append(r.dest, placeholder...)
r.consumeUpToCurrent()
}
func (r *replacer) maybeEmitDash(hasMinusPrefix bool) {
// This minus was actually a dash, so we just copy it to the result
if hasMinusPrefix {
r.dest = append(r.dest, '-')
}
}
func (r *replacer) copyUpToCurrent() {
r.dest = append(r.dest, r.source[r.tail:r.cur]...)
r.consumeUpToCurrent()
}
func (r *replacer) handleHexOrUnit(hasMinusPrefix bool, n1, l1 uint, c1 byte) (endsWithBoundary bool) {
// Special case that is likely a hex string of the format "0x", but we don't
// know whether the rest is upper case or lower case yet.
zeroHex := false
if n1 == 0 && l1 == 1 && c1 == 'x' {
zeroHex = true // these cannot be the start of a UUID
c1 = r.peekFirstNonInt()
}
// Maybe we are at the start of a hex string, either something like
// "[0-9]+[a-f]", "[0-9]+[A-F]", or "0x". We support both lower and upper
// case letters, but to avoid false positives, we want hex replacements to
// happen only if the strings are fully lower case or fully upper case.
if 'a' <= c1 && c1 <= 'f' {
return r.handleHex(hasMinusPrefix, l1+1, 'a', 'f', !zeroHex)
} else if 'A' <= c1 && c1 <= 'F' {
return r.handleHex(hasMinusPrefix, l1+1, 'A', 'F', !zeroHex)
} else if zeroHex {
// Well, it probably wasn't a zero-hex after all, or it contained only
// digits, so try to handle that or emit what we absorbed
_, l2 := r.advanceUintRet()
if l2+2 >= minHexLen { // We consider "0x" to be part of the hex string when comparing with minHexLen
r.emit(hasMinusPrefix, placeholderHex)
} else {
r.maybeEmitDash(hasMinusPrefix)
r.commit()
r.copyUpToCurrent()
}
return r.peekNextIsBoundary()
}
return r.handlePotentialUnitWithDecimal(hasMinusPrefix, c1)
}
func (r *replacer) handleHex(hasMinusPrefix bool, l1 uint, lc, hc byte, canBeUUID bool) (endsWithBoundary bool) {
totalLen := l1 + r.advanceUintOrRangeRet(lc, hc)
r.commit()
if totalLen == 8 && canBeUUID {
// We might be at the first part of a UUID, right before the first dash
if r.advanceChar('-') && r.advanceUUIDAfterFirstDash(lc, hc) {
r.emit(hasMinusPrefix, placeholderUUID)
return true
}
r.resetHead()
}
if totalLen >= minHexLen && r.peekNextIsBoundary() {
r.emit(hasMinusPrefix, placeholderHex)
return true
}
r.copyUpToCurrent()
return r.peekNextIsBoundary()
}
func (r *replacer) handlePotentialUnitWithDecimal(hasMinusPrefix bool, c1 byte) (endsWithBoundary bool) {
if r.advanceBytesize(c1) {
// We do not subsume a minus sign - byte sizes are unlikely to be
// negative, it's more likely this is a dash as a part of a range
r.emit(hasMinusPrefix, placeholderBytesize)
return true
}
r.backtrack()
if r.advanceDuration() {
// We subsume hasMinusPrefix, since durations can be negative
r.emit(false, placeholderDuration)
return true
}
// We couldn't match anything, so just copy what existed.
r.maybeEmitDash(hasMinusPrefix)
r.copyUpToCurrent()
return false
}
func (r *replacer) handleNumberWithDecimal(hasMinusPrefix bool, n1 uint, l1 uint) (endsWithBoundary bool) {
n2, l2 := r.advanceUintRet()
if l2 == 0 {
// The dot wasn't followed by another number, so emit everything before it.
r.backtrack()
r.emitNumberOrCopyText(hasMinusPrefix)
return false
}
// See if the number after the decimal is also followed by a boundary
b2, hasNext := r.advance()
// We are at the end of the string, which is a boundary, so replace everything
// up to now with a number
if !hasNext {
r.emitNumber() // this also subsumes any minus sign we had
return true
}
// The decimal number isn't followed by a boundary char (which includes
// things like '.', ':', '/', etc.), so the part after the decimal is either
// not a real number, or it's some sort of a unit that can support decimals
// like size (e.g. 12KB, 3MiB) or durations (e.g. 3.5124s), etc.
if !boundaryChars[b2] {
return r.handlePotentialUnitWithDecimal(hasMinusPrefix, b2)
}
// We have a decimal number followed by a non-dot boundary, so this is not
// an IP or a version number or anything like that.
if b2 != '.' {
r.backtrack()
r.emitNumber()
return true
}
n3, l3 := r.advanceUintRet()
if l3 == 0 || !r.peekNextIsBoundary() {
// The second dot wasn't followed by another number and a boundary, so
// emit just the first number.
r.resetHead()
r.emitNumber()
return true
}
// We have "<NUM>.<NUM>.<NUM>" at this point, so we either have something
// like a version number, or an IP address, but certainly not a simple
// decimal number we can just emit.
r.commit()
// Check if this is an IP address...
if n1 <= 255 && l1 <= 3 && n2 <= 255 && l2 <= 3 && n3 <= 255 && l3 <= 3 && r.advanceChar('.') && r.advanceUintUpTo(255, 3) && r.peekNextIsBoundary() {
r.emit(hasMinusPrefix, placeholderIP)
return true
}
// This wasn't an IP after all, so just emit "<NUM>.<NUM>.<NUM>". We don't
// want to assume this is a simple decimal number because of the second dot,
// e.g. this might be something like a version number.
r.resetHead()
r.maybeEmitDash(hasMinusPrefix) // we preserve the dashes here, since it's likely not a minus
r.dest = append(r.dest, placeholderNumber...)
r.dest = append(r.dest, '.')
r.dest = append(r.dest, placeholderNumber...)
r.dest = append(r.dest, '.')
r.dest = append(r.dest, placeholderNumber...)
r.consumeUpToCurrent()
return true
}
func (r *replacer) handleSaneTimestamp(hasMinusPrefix bool, n1 uint, delim byte) (endsWithBoundary bool) {
if r.advanceUintUpTo(12, 2) && r.advanceChar(delim) && r.advanceUintUpTo(31, 2) && r.advanceOneOf('T', ' ') && r.advanceTime(false) {
r.commit()
// continue down to parsing sub-second and timezone values
} else if r.resetHeadExpr() && n1 <= 31 && r.advanceChar(delim) && r.advanceMonthName() && r.advanceChar(delim) && r.advanceYear() && r.advanceChar(':') && r.advanceTime(false) && r.advanceChar(' ') && r.advanceNumericTimeZone() && r.peekNextIsBoundary() {
// We might not have a sane timestamp, but apparently a not-so-sane
// timestamp format first, e.g. "27/Mar/2024:14:34:37 +0000"...
r.emit(hasMinusPrefix, placeholderTimestamp)
return true
} else {
// Apparently that wasn't it either, we probably just have a dash or
// forward slash after a number, so we backtrack and emit the number.
r.resetHead()
r.emitNumberOrCopyText(hasMinusPrefix)
return true
}
// We have a date that looks like `YY[YY]-MM-DD hh:mm:ss` or
// `YY[YY]/MM/DDThh:mm:ss`
r.commit() // we want to keep this
// Now we need to also check for sub-second precision and time zones:
c, canAdvance := r.advance()
if !canAdvance {
// End of the string
r.emit(hasMinusPrefix, placeholderTimestamp)
return true
}
if c == '.' {
_, intl := r.advanceUintRet()
if intl == 0 {
// No sub-second precision, the dot was not part of the timestamp
r.backtrack()
r.emit(hasMinusPrefix, placeholderTimestamp)
return true
}
// We had sub-second precision, capture that too
r.commit()
// Get the next char to see if we have time zone
c, canAdvance = r.advance()
if !canAdvance {
// We are at the end of the string after we captured the
// sub-second value.
r.emit(hasMinusPrefix, placeholderTimestamp)
return true
}
}
if c == 'Z' || c == 'z' {
// UTC string, nothing to do after that
r.emit(hasMinusPrefix, placeholderTimestamp)
return true
}
// See https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
if (c == '+' || c == '-') && r.advanceUintUpTo(14, 2) && r.advanceChar(':') && r.advanceUintUpTo(60, 2) {
// e.g. "2020-09-30T00:00:59.9999+03:00"
r.commit()
} else if r.resetHeadExpr() && r.advanceChar(' ') && r.advanceNumericTimeZone() && r.advanceChar(' ') && r.advanceTimeZoneLetters() && r.peekNextIsBoundary() {
// e.g. "2023-09-05 23:20:28.030285153 +0000 UTC"
r.commit()
} else {
r.resetHead()
}
r.emit(hasMinusPrefix, placeholderTimestamp)
return true
}
func (r *replacer) handleNumberStart(hasMinusPrefix bool) (endsWithBoundary bool) {
r.resetHead() // keep the head pos in sync with the current pos
// We know we were at a digit due to how handleNumberStart() is called
n1, l1 := r.advanceUintRet()
r.commit() // we will consume this one way or another for sure
// See if the number is followed by a boundary
b1, hasNext := r.advance()
switch {
// We are at the end of the string, which is a boundary, so replace everything
// up to now with a number
case !hasNext:
r.emitNumberOrCopyText(hasMinusPrefix) // this also may subsume any minus sign we had
return true
// The number isn't followed by a boundary char (which includes things like
// '.', ' ', '/', etc.), so it's either not a real number or date, or it's
// some sort of a unit like a duration (e.g. 3h5m2s), size, (e.g. 12KB,
// 3MiB), etc.
case !boundaryChars[b1]:
return r.handleHexOrUnit(hasMinusPrefix, n1, l1, b1)
// We have a decimal point, so this can either be a plain number, a unit
// that can have a float value, or an IP address.
case b1 == '.':
return r.handleNumberWithDecimal(hasMinusPrefix, n1, l1)
// This might be a timestamp that looks like '2024-04-01...' or
// `2024/03/27...`; timestamps in the far future are not supported :)
case n1 <= maxYear && l1 <= 4 && (b1 == '-' || b1 == '/'):
return r.handleSaneTimestamp(hasMinusPrefix, n1, b1)
// Weird RFC822 dates like "02 Jan 06 15:04 MST"
case n1 <= 31 && l1 <= 2 && b1 == ' ':
if r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(true) && r.advanceStringOrNumericTimeZone(false) {
r.commit()
r.emit(hasMinusPrefix, placeholderTimestamp)
return true
}
// if not, go to default handler after switch statement
// It could be a UUID that starts with 8 digits
case l1 == 8 && b1 == '-':
if r.advanceUUIDAfterFirstDash('a', 'f') || (r.resetHeadExpr() && r.advanceChar('-') && r.advanceUUIDAfterFirstDash('A', 'F')) {
r.emit(hasMinusPrefix, placeholderUUID)
return true
}
// if not, go to default handler after switch statement
}
// Number with an unknown boundary - emit the number and leave the boundary
// for the following passes.
r.resetHead()
r.emitNumberOrCopyText(hasMinusPrefix)
return true
}
var longDayNames = [...][]byte{
[]byte("Sunday"),
[]byte("Monday"),
[]byte("Tuesday"),
[]byte("Wednesday"),
[]byte("Thursday"),
[]byte("Friday"),
[]byte("Saturday"),
}
func (r *replacer) handleWeirdTimestamp() (endsWithBoundary bool) {
r.resetHead()
if r.advanceDayOfTheWeek() {
r.commit() // we will always consume this
// RFC1123 and RFC1123Z, e.g.:
// - "Mon, 02 Jan 2006 15:04:05 MST"
// - "Mon, 02 Jan 2006 15:04:05 -0700"
if r.advanceChar(',') && r.advanceChar(' ') && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(false) {
r.emit(false, placeholderTimestamp)
return true
}
r.resetHead()
// ANSIC, UnixDate, RubyDate, e.g.
// - "Mon Jan 2 15:04:05 2006"
// - "Mon Jan 2 15:04:05 MST 2006"
// - "Mon Jan 02 15:04:05 -0700 2006"
if r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && (r.advanceChar(' ') || true) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(true) && r.advanceChar(' ') && r.advanceYear() {
r.emit(false, placeholderTimestamp)
return true
}
r.resetHead()
// Linux, e.g.
// - "Mon 2 Jan 15:04:05 MST 2006"
// - "Tue 23 Jan 15:04:05 -0700 2023"
if r.advanceChar(' ') && (r.advanceChar(' ') || true) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(false) && r.advanceChar(' ') && r.advanceYear() {
r.emit(false, placeholderTimestamp)
return true
}
r.resetHead()
// RFC850, e.g.
// - "Monday, 02-Jan-06 15:04:05 MST"
backtrackedSlice := r.source[r.head-3:]
var matchedDay []byte
for _, dw := range longDayNames {
if bytes.HasPrefix(backtrackedSlice, dw) {
matchedDay = dw
break
}
}
if matchedDay != nil {
r.head += len(matchedDay) - 3
if r.advanceChar(',') && r.advanceChar(' ') && r.advanceUintUpTo(31, 2) && r.advanceChar('-') && r.advanceMonthName() && r.advanceChar('-') && r.advanceUintUpTo(99, 2) && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(true) {
r.emit(false, placeholderTimestamp)
return true
}
}
r.cur -= 3 // unconsume
r.resetHead()
return false
}
r.resetHead()
if r.advanceMonthName() {
r.commit() // provisionally consume this
// Linux journald logs and others similar like this:
// - Feb 29 23:00:14
// - Apr-10 23:43:46.807
// - Jul 1 00:21:28
if (r.advanceChar('-') || (r.advanceChar(' ') && (r.advanceChar(' ') || true))) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceTime(false) {
r.commit()
// This is already a timestamp, but let's try to match subseconds as well
if r.advanceChar('.') && r.advanceUint() {
r.commit()
} else {
r.resetHead()
}
r.emit(false, placeholderTimestamp)
return true
}
r.cur -= 3 // unconsume
r.resetHead()
return false
}
r.resetHead()
return false
}
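// wasNumPreservingKeyword reports whether the bytes immediately before the
// current position match one of numPreservingKeys (e.g. "status"), either at
// the very start of the line or right after a boundary character.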
func (r *replacer) wasNumPreservingKeyword() bool {
for _, key := range numPreservingKeys {
pPos := r.cur - 1 - len(key)
if pPos < -1 {
return false // all subsequent keys are longer
}
if pPos != -1 && !boundaryChars[r.source[pPos]] {
continue
}
if bytes.HasPrefix(r.source[pPos+1:], key) {
return true
}
}
return false
}
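// replaceWithPlaceholders makes a single pass over the source line and copies
// it to dest, substituting numbers, hex strings, UUIDs, IPs, timestamps,
// durations and byte sizes with their static placeholders.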
func (r *replacer) replaceWithPlaceholders() {
lineLen := len(r.source)
var c byte
onBoundary := true
for r.cur = 0; r.cur < lineLen; r.cur++ {
c = r.source[r.cur]
switch {
// If we're currently not at a boundary, the only thing we need to check
// is whether the current char would create a boundary in the next iteration.
case !onBoundary:
onBoundary = boundaryChars[c]
// We weren't at a boundary and now we are, so check if we match one
// of the special keywords that will preserve numbers
if onBoundary {
r.preserveNextNumbers = r.wasNumPreservingKeyword()
}
// If we've reached this far, it means we're currently at a boundary!
// A lot of very complex logic if we encounter a number at a boundary,
// so we move that to a helper function.
case '0' <= c && c <= '9':
r.copyUpToCurrent()
onBoundary = r.handleNumberStart(false)
// Handle negative numbers, potentially
case c == '-':
next := r.cur + 1
// This might be a number, a date, an IP address, etc. So we don't
// know if this is a minus sign to replace or a dash to copy yet.
if next < lineLen && '0' <= r.source[next] && r.source[next] <= '9' {
// Copy everything before the dash, but mark it as consumed.
r.copyUpToCurrent()
r.cur++
r.consumeUpToCurrent()
onBoundary = r.handleNumberStart(true)
} else {
onBoundary = true
}
// Try to match weird timestamps. They require a lot of remaining
// length, generally start with a capitalized day of the week or month
// name (1 upper case letter followed by 2 lower case letters).
//
// We are basically looking for something that may match this here:
// Mon|Tue|Wed|Thu|Fri|Sat|Sun|Jan|Feb|Mar|Apr|May|Jul|Jun|Aug|Sep|Oct|Nov|Dec
//
// The detailed check would be performed by the actual handler:
case 'A' <= c && c <= 'W' && lineLen-r.cur >= 14 &&
'a' <= r.source[r.cur+1] && r.source[r.cur+1] <= 'u' &&
'b' <= r.source[r.cur+2] && r.source[r.cur+2] <= 'y':
r.copyUpToCurrent()
onBoundary = r.handleWeirdTimestamp()
// This could be the start of a lower case hex string:
case 'a' <= c && c <= 'f':
r.copyUpToCurrent()
r.resetHead()
onBoundary = r.handleHex(false, 0, 'a', 'f', true)
// This could be the start of an upper case hex string:
case 'A' <= c && c <= 'F':
r.copyUpToCurrent()
r.resetHead()
onBoundary = r.handleHex(false, 0, 'A', 'F', true)
// If we haven't actually matched anything, update whether we're still
// on a boundary character and continue onto the next one.
default:
onBoundary = boundaryChars[c]
}
}
if r.cur > r.tail {
r.dest = append(r.dest, r.source[r.tail:]...)
r.consumeUpToCurrent()
}
}
func Preprocess(content []byte) []byte {
// Allocate ~120% of the input length, to allow for some expansion from
// replacements, hopefully without needing to allocate more memory
r := replacer{source: content, dest: make([]byte, 0, len(content)*120/100)}
r.replaceWithPlaceholders()
return r.dest
}

pkg/pattern/tokenization/tokenization.go
@@ -0,0 +1,201 @@
package tokenization
import (
"bytes"
"unsafe"
)
const placeholderEndOfLine = "<...>"
// Outside of quoted strings, these are the delimiters between tokens. However,
// they are not going to be a part of the tokens themselves and will be replaced
// by spaces in the actual log template.
var delimiters = [256]bool{0: true, '\t': true, '\n': true, '\v': true, '\f': true, '\r': true, ' ': true}
type tokenizer struct {
// Input
rawLine []byte
maxTokens int
// State and position iterators
buf []byte
tpos int
tokenCount int
maybeJSON bool
// Result values; the strings in the `tokens` slice reference `line` and
// shouldn't allocate new memory.
line string
tokens []string
}
func (t *tokenizer) countOrSaveToken(endTokenPos, skip int) {
if t.tokens != nil {
// Intentionally written like this and not with append(), so this can
// panic if we ever exceed the preallocated slice size, since that means
// we have a nasty bug in handleNextToken() below.
t.tokens[t.tokenCount] = t.line[t.tpos:endTokenPos]
}
t.tokenCount++
t.tpos = endTokenPos + skip
}
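// handleNextToken finds the end of the next token, honoring quotes, escapes
// and likely-JSON key/value boundaries, and counts or saves it. It returns
// false only when the end of the line has already been reached.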
func (t *tokenizer) handleNextToken() bool {
escaped := false
var c, curQuoteChar byte
curQuotePos := -1
lineLen := len(t.line)
for p := t.tpos; p < lineLen; p++ {
c = t.line[p]
switch {
// If the previous character was a backslash, we ignore the next
// character, unless it was a non-token delimiter (space, tab, etc.)
// outside of a quoted string.
case escaped:
if curQuotePos < 0 && delimiters[c] {
t.countOrSaveToken(p, 1)
return true
} else {
escaped = false
}
// If we weren't already escaped and we encounter a backslash, toggle
// the escaped flag and ignore the current byte.
case c == '\\':
escaped = true
// Non-ASCII / UTF8 / invalid character, consider it a part of the
// current token, for lack of a better efficient option...
case c > 127:
// Intentionally blank, continues to the next char
// If we are currently in a quoted part of the string, the current
// character is also part of the current token. The only special case
// here is if the current character is a matching quote, that means
// we'll no longer be quoted.
case curQuotePos >= 0:
if c == curQuoteChar { // end quote
curQuotePos = -1
}
// If we encounter a quote character and we were not already in a quoted
// part of the line, mark that we are now in a quote of that type.
case c == '"' || c == '\'' || c == '`':
curQuoteChar = c
curQuotePos = p
// If we encounter a delimiter outside of a quote, count or save the
// token and skip the delimiter.
case delimiters[c]:
t.countOrSaveToken(p, 1)
return true
// Handle likely JSON object keys that have been serialized without
// spaces. For example, something like this:
// `{"context":{"taskId":1},"message":"starting",...}`
//
// If the line starts or ends with curly braces, we consider it might be
// a JSON log and try to detect the `":` part of the message that isn't
// followed by a delimiter character. If we find that pattern, we
// consider everything up to and including the `:` character as a
// separate token.
//
// Similarly, we try to detect the `,"` pattern and also split a token
// before the comma value. The `p > t.tpos` check is crucial here,
// because it ensures that we're not at the start of a token, i.e. there
// wasn't a delimiter right before the comma.
case t.maybeJSON && p > t.tpos && (c == ':' || c == ',') && p+1 < lineLen:
if c == ':' && t.line[p-1] == '"' && !delimiters[t.line[p+1]] {
t.countOrSaveToken(p+1, 0)
return true
}
if c == ',' && t.line[p+1] == '"' {
t.countOrSaveToken(p, 0)
return true
}
}
// By default we do nothing, simply advance one character forward
// because the current character was a part of the current token.
}
// We have an unterminated single quote at position `curQuotePos`. To handle
// this edge case somewhat gracefully, we can emit everything up to that
// unterminated quote and the quote itself as a single token, and continue
// fairly normally from there.
if curQuotePos > 0 {
t.countOrSaveToken(curQuotePos+1, 0)
return true
}
if t.tpos < len(t.line) {
t.countOrSaveToken(len(t.line), 0)
return true
}
return false
}
// This function is called twice! The first time it counts the tokens but
// doesn't save them. Afterwards we allocate the tokens return slice with
// exactly the correct capacity and we call it again, this time to save them.
func (t *tokenizer) process() {
// We want to handle the end of the string as a single token, so we start
// the loop from 1.
for i := 1; i < t.maxTokens; i++ {
if !t.handleNextToken() {
break
}
}
if t.tpos >= len(t.line) {
return
}
// We have a token count greater than or equal to maxTokens, so add one last
// token containing placeholderEndOfLine to signal that.
if t.tokens != nil {
t.tokens[t.tokenCount] = placeholderEndOfLine
}
t.tokenCount++
t.tpos += len(placeholderEndOfLine)
}
func (t *tokenizer) tokenize() []string {
t.buf = Preprocess(t.rawLine)
// We use unsafe to convert buf to a string without any new allocations.
// This is safe because t.buf is never modified after this point, so the
// string contents cannot change underneath us.
t.line = unsafe.String(unsafe.SliceData(t.buf), len(t.buf))
if len(t.line) >= 2 && (t.line[0] == '{' || t.line[len(t.line)-1] == '}') {
t.maybeJSON = true
}
t.process()
// If we have super long lines (more than twice the size we need to get the
// maxTokens we want), copy just the part we need so the tokens don't hold a
// reference to the original huge []byte slice.
if t.tokenCount == t.maxTokens && t.tpos*2 < len(t.line) {
t.line = string(t.buf[0 : t.tpos+1])
}
t.tokens = make([]string, t.tokenCount) // intentionally like this, see comment in countOrSaveToken()
t.tokenCount = 0
t.tpos = 0
t.process()
return t.tokens
}
func PreprocessAndTokenize(content []byte) []string {
content = bytes.TrimSpace(content)
t := tokenizer{rawLine: content, maxTokens: 100} // TODO: parametrize maxTokens
return t.tokenize()
}

pkg/pattern/tokenization/tokenization_benchmark_test.go
@@ -0,0 +1,26 @@
package tokenization
import (
"fmt"
"testing"
)
var sampleLogBenchCount = 0
func BenchmarkTokenizationTestCases(b *testing.B) {
for i, tc := range tokenizationRealisticTestCases {
line := []byte(tc.line)
b.Run(fmt.Sprintf("test-case-%d", i), func(b *testing.B) {
for i := 0; i < b.N; i++ {
PreprocessAndTokenize(line)
}
})
}
}
func BenchmarkTokenizationPlayground(b *testing.B) {
line := []byte(`level=info ts=2023-09-06T00:59:59.982171323Z caller=metrics.go:160 component=frontend org_id=29 traceID=4b93729ff3efabd0 latency=fast query="{stream=\"stdout\",pod=\"loki-canary-nl54q\"} " query_hash=1280418884 query_type=limited range_type=range length=20s start_delta=2h54m30.690801022s end_delta=2h54m10.690801238s step=1s duration=13.926955ms status=200 limit=1000 returned_lines=0 throughput=16MB total_bytes=219kB total_bytes_non_indexed_labels=2.1kB lines_per_second=14935 total_lines=208 post_filter_lines=208 total_entries=41 store_chunks_download_time=1.592805ms queue_time=127µs splits=0 shards=0 chunk_refs_fetch_time=3.599883ms cache_chunk_req=1 cache_chunk_hit=1 cache_chunk_bytes_stored=0 cache_chunk_bytes_fetched=480079 cache_chunk_download_time=1.307396ms cache_index_req=0 cache_index_hit=0 cache_index_download_time=0s cache_stats_results_req=1 cache_stats_results_hit=1 cache_stats_results_download_time=361.913µs cache_result_req=0 cache_result_hit=0 cache_result_download_time=0s token_id=gcom-1234`)
for i := 0; i < b.N; i++ {
PreprocessAndTokenize(line)
}
}

pkg/pattern/tokenization/tokenization_test.go
@@ -0,0 +1,376 @@
package tokenization
import (
"fmt"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
type tokenizationTestCase struct {
line string
expResult []string
}
var tokenizationCornerTestCases = []tokenizationTestCase{
{
"",
[]string{},
},
{
" foo ",
[]string{"foo"},
},
{
"foo bar baz",
[]string{"foo", "bar", "baz"},
},
{
"\nfoo\t bar baz\r\n",
// TOFIX: remove empty tokens?
[]string{"foo", "", "", "bar", "baz"},
},
{
"ends single char C",
[]string{"ends", "single", "char", "C"},
},
{
"0 1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z Z Y X W V U T S R Q P O N M L K J I H G F E D C B A z y x w v u t s r q p o n m l k j i h g f e d c b a 9 8 7 6 5 4 3 2 1 0",
// The tail end of strings longer than maxTokens is returned as a single token
[]string{"<NUM>", "<NUM>", "<NUM>", "<NUM>", "<NUM>", "<NUM>", "<NUM>", "<NUM>", "<NUM>", "<NUM>", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "Z", "Y", "X", "W", "V", "U", "T", "S", "R", "Q", "P", "O", "N", "M", "L", "K", "J", "I", "H", "G", "F", "E", "D", "C", "B", "A", "z", "y", "x", "w", "v", "u", "t", "s", "r", "q", "p", "<...>"},
},
{
`a "quoted string"`,
[]string{"a", `"quoted string"`},
},
{
`a "quoted string with \" escapes!"`,
[]string{"a", `"quoted string with \" escapes!"`},
},
{
`a 'singly quoted string"'`,
[]string{"a", `'singly quoted string"'`},
},
{
`a 'singly quoted string" \''`,
[]string{"a", `'singly quoted string" \''`},
},
{
`a 'singly quoted string" \\\''`,
[]string{"a", `'singly quoted string" \\\''`},
},
{
`a'twisted excappe\\' "with an unterminated quote" 'at_the_end`,
[]string{`a'twisted excappe\\'`, `"with an unterminated quote"`, `'`, "at_the_end"},
},
{
`a "quoted string 'inception'"!` + "`woot`'much`wow'",
[]string{"a", `"quoted string 'inception'"!` + "`woot`'much`wow'"},
},
{
`unterminated"quote`,
[]string{`unterminated"`, `quote`},
},
{
"`mix`" + ` "and" 'match'`,
[]string{"`mix`", `"and"`, `'match'`},
},
{
"`mix`" + ` "and" 'match'`,
[]string{"`mix`", `"and"`, `'match'`},
},
{
`{"json":"uninterrupted \"logline\"","foo":"bar"}`,
[]string{`{"json":`, `"uninterrupted \"logline\""`, `,"foo":`, `"bar"}`},
},
{
`{"weirdo1":`,
[]string{`{"weirdo1":`},
},
{
// This and the corner case below are not optimal, though they're
// unlikely to be encountered in the real world, so it's hopefully a
// nice tradeoff for the performance of not needing a full JSON parser.
`{"weirdo2":}`,
[]string{`{"weirdo2":`, `}`},
},
{
`,"weirdo3":}`,
[]string{`,"weirdo3":`, `}`},
},
{
// We deliberately do treat "escaped" whitespaces outside of quotes as
// delimiters, i.e. whitespaces outside of strings cannot be escaped.
`weird\ escape`,
[]string{`weird\`, `escape`},
},
{
"-3.14-foo 0.0.0.0/24-0.0.0.1-255.255.255.255-256.255.255.255 1337-ber 0.12-ber n0tnumb3er 12faux -123.0.1.123 -123 -1231.11 333. 123.456. 123.45-",
[]string{"<NUM>-foo", "<IP>/<NUM>-<IP>-<IP>-<NUM>.<NUM>.<NUM>.<NUM>", "<NUM>-ber", "<NUM>-ber", "n0tnumb3er", "12faux", "-<IP>", "<NUM>", "<NUM>", "<NUM>.", "<NUM>.<NUM>.", "<NUM>-"},
},
{
"2022-12-31 12:12:31 3022-12-31 12:12:31-Jul 1 00:21:28",
[]string{"<TIMESTAMP>", "<NUM>-<NUM>-<NUM>", "<NUM>:<NUM>:<NUM>-<TIMESTAMP>"},
},
{
"2022/12/01 12:12:31 - 2022/13/32 12:12:31",
[]string{"<TIMESTAMP>", "-", "<NUM>/<NUM>/<NUM>", "<NUM>:<NUM>:<NUM>"},
},
{
"UUIDS: 123e4567-e89b-12d3-a456-426614174000, 550E8400-E29B-41D4-A716-446655440000, -00000000-0000-0000-0000-000000000000, 12345678-dead-beef-1337-000000000000 {c6ad1a63-10b5-460e-ab2c-05c13604539d} ''<A3AE4842-E9AA-4C27-9509-772DB3CC3190>''",
[]string{"UUIDS:", "<UUID>,", "<UUID>,", "-<UUID>,", "<UUID>", "{<UUID>}", "''<<UUID>>''"},
},
// Mixed case UUID and hex strings are ignored, to limit false positives
{
"Not UUIDS: 123e4567-E89B-12d3-a456-426614174000, 1234567-dead-beef-1337-00000000000a",
[]string{"Not", "UUIDS:", "123e4567-E89B-12d3-a456-<NUM>,", "<NUM>-dead-beef-<NUM>-<HEX>"},
},
{
"Hexes: 0x0123456789 0xabcdef0123 deadbeef1337-ABCDEF0123456?0123456789ab:FFFFAAAAFFFF Curses: 0x012345678 dEaDbeef1337 abcdefabcde ABCDEFABCDE 0xASDFASDFASDF abcdef0123456NOT",
[]string{"Hexes:", "<HEX>", "<HEX>", "<HEX>-<HEX>?<HEX>:<HEX>", "Curses:", "0x012345678", "dEaDbeef1337", "abcdefabcde", "ABCDEFABCDE", "0xASDFASDFASDF", "abcdef0123456NOT"},
},
{
"30546354_3313121680 0_123_456_789 foo_123",
[]string{"<NUM>_<NUM>", "<NUM>_<NUM>_<NUM>_<NUM>", "foo_<NUM>"},
},
{
`3.31ms/1h2m|-12h2m6.1s 31m "165m2.1s(6h0m12.05us)" -451325.31µs 6m23μs 123h21m3.4124561s/0s/-0.0123ms`,
[]string{"<DURATION>/<DURATION>|<DURATION>", "<DURATION>", `"<DURATION>(<DURATION>)"`, "<DURATION>", "<DURATION>", "<DURATION>/<DURATION>/<DURATION>"},
},
{
// Invalid duration values
"3.31.1ms 3h121m3.4124561s 1h0.12s 100usa 0.12msa",
[]string{"<NUM>.<DURATION>", "3h121m3.<DURATION>", "1h0.<DURATION>", "100usa", "0.12msa"},
},
{
"2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit",
[]string{"<BYTESIZE>", "<BYTESIZE>-<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>:<BYTESIZE>"},
},
{
`status=123 status_code:500 status 200 status="-1" status_code:"404" httpStatus=200`,
[]string{"status=123", "status_code:500", "status", "200", `status="-1"`, `status_code:"404"`, "httpStatus=200"},
},
{
`status_code_foo=123 status_code:500.1 status 2023-09-06T00:59:59.98 status:"404KiB"`,
[]string{"status_code_foo=<NUM>", "status_code:<NUM>", "status", "<TIMESTAMP>", `status:"<BYTESIZE>"`},
},
}
// TODO: add these to the benchmark tests
var tokenizationRealisticTestCases = []tokenizationTestCase{
// logfmt from loki, metrics.go with a lot of number values of various types
{
`level=info ts=2023-09-06T00:59:59.982171323Z caller=metrics.go:160 component=frontend org_id=29 traceID=4b93729ff3efabd0 latency=fast query="{stream=\"stdout\",pod=\"loki-canary-nl54q\"} " query_hash=1280418884 query_type=limited range_type=range length=20s start_delta=2h54m30.690801022s end_delta=2h54m10.690801238s step=1s duration=13.926955ms status=200 limit=1000 returned_lines=0 throughput=16MB total_bytes=219kB total_bytes_non_indexed_labels=2.1kB lines_per_second=14935 total_lines=208 post_filter_lines=208 total_entries=41 store_chunks_download_time=1.592805ms queue_time=127µs splits=0 shards=0 chunk_refs_fetch_time=3.599883ms cache_chunk_req=1 cache_chunk_hit=1 cache_chunk_bytes_stored=0 cache_chunk_bytes_fetched=480079 cache_chunk_download_time=1.307396ms cache_index_req=0 cache_index_hit=0 cache_index_download_time=0s cache_stats_results_req=1 cache_stats_results_hit=1 cache_stats_results_download_time=361.913µs cache_result_req=0 cache_result_hit=0 cache_result_download_time=0s token_id=gcom-1234`,
[]string{
"level=info", "ts=<TIMESTAMP>", "caller=metrics.go:<NUM>", "component=frontend", "org_id=<NUM>", "traceID=<HEX>", "latency=fast", `query="{stream=\"stdout\",pod=\"loki-canary-nl54q\"} "`, "query_hash=<NUM>", "query_type=limited", "range_type=range", "length=<DURATION>", "start_delta=<DURATION>", "end_delta=<DURATION>", "step=<DURATION>", "duration=<DURATION>", "status=200", "limit=<NUM>", "returned_lines=<NUM>", "throughput=<BYTESIZE>", "total_bytes=<BYTESIZE>", "total_bytes_non_indexed_labels=<BYTESIZE>", "lines_per_second=<NUM>", "total_lines=<NUM>", "post_filter_lines=<NUM>", "total_entries=<NUM>", "store_chunks_download_time=<DURATION>", "queue_time=<DURATION>", "splits=<NUM>", "shards=<NUM>", "chunk_refs_fetch_time=<DURATION>", "cache_chunk_req=<NUM>", "cache_chunk_hit=<NUM>", "cache_chunk_bytes_stored=<NUM>", "cache_chunk_bytes_fetched=<NUM>", "cache_chunk_download_time=<DURATION>", "cache_index_req=<NUM>", "cache_index_hit=<NUM>", "cache_index_download_time=<DURATION>", "cache_stats_results_req=<NUM>", "cache_stats_results_hit=<NUM>", "cache_stats_results_download_time=<DURATION>", "cache_result_req=<NUM>", "cache_result_hit=<NUM>", "cache_result_download_time=<DURATION>", "token_id=gcom-<NUM>",
},
},
// logfmt from loki, with string multi-word messages
{
`level=debug ts=2023-09-06T00:59:59.98214402Z caller=shard_resolver.go:114 bytes=205kB chunks=2 streams=2 entries=200 msg="queried index" type=single matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}" duration=9.498885ms from=2023-09-06T00:48:53.138Z through=2023-09-06T00:49:43.138Z length=50s`,
[]string{
"level=debug", "ts=<TIMESTAMP>", "caller=shard_resolver.go:<NUM>", "bytes=<BYTESIZE>", "chunks=<NUM>", "streams=<NUM>", "entries=<NUM>", `msg="queried index"`, "type=single", `matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}"`, "duration=<DURATION>", "from=<TIMESTAMP>", "through=<TIMESTAMP>", "length=<DURATION>",
},
},
// random JSON logs
{
`{"timestamp": "2022-12-23T12:34:56Z", "level": "debug", "message": "Server starting", "server_id": "abcdefghij", "start_time": "2022-12-23T12:30:00Z"}`,
[]string{`{"timestamp":`, `"<TIMESTAMP>",`, `"level":`, `"debug",`, `"message":`, `"Server starting",`, `"server_id":`, `"abcdefghij",`, `"start_time":`, `"<TIMESTAMP>"}`},
},
{
// JSON logs without spaces between elements, like how JavaScript's JSON.Stringify() produces:
`{"context":{"taskId":1},"message":"starting task ID 1","sequence":0,"time":1506776210000,"version":"1.0.0"}`,
[]string{`{"context":`, `{"taskId":`, `<NUM>}`, `,"message":`, `"starting task ID <NUM>"`, `,"sequence":`, `<NUM>`, `,"time":`, `<NUM>`, `,"version":`, `"<NUM>.<NUM>.<NUM>"}`},
},
// Android logs from https://github.com/logpai/logparser/blob/main/data/loghub_2k/Android/Android_2k.log
{
// This test case has an unterminated double quote
//
// TOFIX:
// - timestamp is not correctly detected
// - empty "" token?
`03-17 16:13:40.345 1702 14638 D PowerManagerService: release:lock=166121161, flg=0x0, tag="RILJ_ACK_WL", name=com.android.phone", ws=null, uid=1001, pid=2626`,
[]string{"<NUM>-<NUM>", "<NUM>:<NUM>:<NUM>", "", "<NUM>", "<NUM>", "D", "PowerManagerService:", "release:lock=<NUM>,", "flg=0x0,", `tag="RILJ_ACK_WL",`, `name=com.android.phone"`, ",", "ws=null,", "uid=<NUM>,", "pid=<NUM>"},
},
{
// TOFIX:
// - timestamp is not correctly detected
// - empty "" tokens
`03-17 16:13:47.518 1702 8671 D ActivityManager: Skipping, withExcluded: false, tr.intent:Intent { act=android.intent.action.VIEW dat=file:///storage/emulated/0/Tencent/QQfile_recv/b.apk typ=application/vnd.android.package-archive flg=0x10800000 cmp=com.android.packageinstaller/.PackageInstallerActivity (has extras) }`,
[]string{
"<NUM>-<NUM>", "<NUM>:<NUM>:<NUM>", "", "<NUM>", "", "<NUM>", "D", "ActivityManager:", "Skipping,", "withExcluded:", "false,", "tr.intent:Intent", "{", "act=android.intent.action.VIEW", "dat=file:///storage/emulated/<NUM>/Tencent/QQfile_recv/b.apk", "typ=application/vnd.android.package-archive", "flg=0x10800000", "cmp=com.android.packageinstaller/.PackageInstallerActivity", "(has", "extras)", "}",
},
},
// Apache logs from https://github.com/logpai/logparser/blob/main/data/loghub_2k/Apache/Apache_2k.log
{
`[Mon Dec 05 13:16:27 2005] [notice] jk2_init() Found child 5877 in scoreboard slot 9`,
[]string{"[<TIMESTAMP>]", "[notice]", "jk2_init()", "Found", "child", "<NUM>", "in", "scoreboard", "slot", "<NUM>"},
},
{
`[Mon Dec 05 19:14:11 2005] [notice] workerEnv.init() ok /etc/httpd/conf/workers2.properties`,
[]string{"[<TIMESTAMP>]", "[notice]", "workerEnv.init()", "ok", "/etc/httpd/conf/workers2.properties"},
},
// nginx logs by running `docker run -p 80:80 -v $(pwd):/usr/share/nginx/html nginx` locally
{
`2024/03/27 14:31:42 [error] 29#29: *1 directory index of "/usr/share/nginx/html/" is forbidden, client: 172.17.0.1, server: localhost, request: "GET / HTTP/1.1", host: "127.0.0.1"`,
[]string{"<TIMESTAMP>", "[error]", "<NUM>#<NUM>:", "*<NUM>", "directory", "index", "of", "\"/usr/share/nginx/html/\"", "is", "forbidden,", "client:", "<IP>,", "server:", "localhost,", "request:", `"GET / HTTP/<NUM>",`, "host:", "\"<IP>\""},
},
{
// TOFIX:
// - probably not all numbers should be replaced with <NUM>, e.g. for "*1", "(2:", "HTTP/1.1" it's definitely a worse UX
`2024/03/27 14:34:37 [error] 29#29: *1 open() "/usr/share/nginx/html/test url with spaces" failed (2: No such file or directory), client: 172.17.0.1, server: localhost, request: "GET /test%20url%20with%20spaces HTTP/1.1", host: "127.0.0.1"
172.17.0.1 - - [31/Mar/2024:14:34:37 +0000] "GET /test%20url%20with%20spaces HTTP/1.1" 404 153 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0" "-"`,
[]string{"<TIMESTAMP>", "[error]", "<NUM>#<NUM>:", "*<NUM>", "open()", `"/usr/share/nginx/html/test url with spaces"`, "failed", "(<NUM>:", "No", "such", "file", "or", "directory),", "client:", "<IP>,", "server:", "localhost,", "request:", `"GET /test%20url%20with%20spaces HTTP/<NUM>",`, "host:", "\"<IP>\"", "", "", "<IP>", "-", "-", "[<TIMESTAMP>]", `"GET /test%20url%20with%20spaces HTTP/<NUM>"`, "<NUM>", "<NUM>", "\"-\"", `"Mozilla/<NUM> (X11; Linux x86_<NUM>; rv:<NUM>) Gecko/<NUM> Firefox/<NUM>"`, `"-"`},
},
// Linux systemd (journalctl) logs
{
`Mar 27 11:52:21 hostname systemd-networkd[2043]: enp6s0: LLDP Rx: Invoking callback for 'refreshed' event.`,
[]string{"<TIMESTAMP>", "hostname", "systemd-networkd[<NUM>]:", "enp6s0:", "LLDP", "Rx:", "Invoking", "callback", "for", "'refreshed'", "event."},
},
{
`Feb 29 23:00:14 nixos dbus-daemon[11432]: [system] Activating via systemd: service name='org.opensuse.Snapper' unit='snapperd.service' requested by ':1.324' (uid=0 pid=22089 comm="/nix/store/7rgimysvkczzyiaq4fkfymyjad4vbd9c-snappe" label="kernel")`,
[]string{"<TIMESTAMP>", "nixos", "dbus-daemon[<NUM>]:", "[system]", "Activating", "via", "systemd:", "service", "name='org.opensuse.Snapper'", "unit='snapperd.service'", "requested", "by", "':<NUM>'", "(uid=<NUM>", "pid=<NUM>", "comm=\"/nix/store/7rgimysvkczzyiaq4fkfymyjad4vbd9c-snappe\"", "label=\"kernel\")"},
},
// Random slack logs:
{
`Apr-10 23:43:46.807 [API-Q] (T02S4RCS0) c37dfd20-1712781826.804 conversations.suggestions is ACTIVE`,
[]string{"<TIMESTAMP>", "[API-Q]", "(T02S4RCS0)", "c37dfd20-<NUM>", "conversations.suggestions", "is", "ACTIVE"},
},
{
`Apr-11 00:01:57.743 [DEVICE-PERMISSIONS-MA] Permissions saved to local storage: {"permissions":{"microphone":"granted","camera":"prompt","screen":"prompt"},"timestamp":1712782917742}`,
[]string{"<TIMESTAMP>", "[DEVICE-PERMISSIONS-MA]", "Permissions", "saved", "to", "local", "storage:", `{"permissions":`, `{"microphone":`, `"granted"`, `,"camera":`, `"prompt"`, `,"screen":`, `"prompt"}`, `,"timestamp":`, `<NUM>}`},
},
// Another weird log from loki:
{
`ts=2023-09-06T00:59:59.900879737Z caller=spanlogger.go:86 user=29 level=debug msg="querying ingester" params="selector={stream=\"stdout\", pod=\"loki-canary-t98wq\"}, direction=BACKWARD, start=2023-09-05 23:20:28.030285153 +0000 UTC, end=2023-09-05 23:20:48.030285153 +0000 UTC, limit=1000, shards="`,
[]string{"ts=<TIMESTAMP>", "caller=spanlogger.go:<NUM>", "user=<NUM>", "level=debug", `msg="querying ingester"`, `params="selector={stream=\"stdout\", pod=\"loki-canary-t98wq\"}, direction=BACKWARD, start=<TIMESTAMP>, end=<TIMESTAMP>, limit=<NUM>, shards="`},
},
// {``, []string{}},
}
func getTimestampTests(t *testing.T) []tokenizationTestCase {
timeStamps := []string{
"2023-01-03T15:04:05.999999999Z",
"2024-02-29T11:12:03.33+02:00",
"2022-03-02T23:59:59Z",
"2024-04-01T01:33:59+03:00",
"2021-05-07T00:00:00-04:30",
"1999-06-12T13:37:31.123+03:00",
"2005-07-17T12:00:00Z",
"1988-08-03T00:00:00Z",
"2020-09-30T00:00:59.9999+03:00",
"2034-10-31T12:34:56.7890+11:23",
"2034-11-01T01:23:45.67890Z",
"2025-12-31T01:23:45.67890Z",
}
timestampConst := string(placeholderTimestamp)
layouts := []struct {
layout string
expResult []string
}{
// TOFIX: all of the commented out lines.
//{time.Layout, []string{timestampConst}},
{time.ANSIC, []string{timestampConst}},
{time.UnixDate, []string{timestampConst}},
{"Mon _2 Jan 15:04:05 MST 2006", []string{timestampConst}}, // linux `date`
{time.RubyDate, []string{timestampConst}},
{time.RFC822, []string{timestampConst}},
{time.RFC822Z, []string{timestampConst}},
{time.RFC850, []string{timestampConst}},
{time.RFC1123, []string{timestampConst}},
{time.RFC1123Z, []string{timestampConst}},
{time.RFC3339, []string{timestampConst}},
{time.RFC3339Nano, []string{timestampConst}},
//{time.Stamp, []string{timestampConst}},
//{time.StampMilli, []string{timestampConst}},
//{time.StampMicro, []string{timestampConst}},
//{time.StampNano, []string{timestampConst}},
{time.DateTime, []string{timestampConst}},
// TOFIX: maybe consider these timestamps?
{time.DateOnly, []string{"<NUM>-<NUM>-<NUM>"}},
{time.TimeOnly, []string{"<NUM>:<NUM>:<NUM>"}},
}
result := make([]tokenizationTestCase, 0, len(timeStamps)*len(layouts))
for _, tss := range timeStamps {
ts, err := time.Parse(time.RFC3339Nano, tss)
require.NoError(t, err)
for _, layout := range layouts {
result = append(result, tokenizationTestCase{
line: ts.Format(layout.layout),
expResult: layout.expResult,
})
}
}
return result
}
type tokenizationTestCasePack struct {
name string
cases []tokenizationTestCase
}
func TestTokenization(t *testing.T) {
packs := []tokenizationTestCasePack{
{"cornerTestCases", tokenizationCornerTestCases},
{"timestamps", getTimestampTests(t)},
{"realisticTestCases", tokenizationRealisticTestCases},
}
for i, pack := range packs {
pack := pack
t.Run(fmt.Sprintf("test-pack-%d-%s", i, pack.name), func(t *testing.T) {
for j, tc := range pack.cases {
tc := tc
t.Run(fmt.Sprintf("case-%d", j), func(t *testing.T) {
result := PreprocessAndTokenize([]byte(tc.line))
assert.Equal(
t, tc.expResult, result,
fmt.Sprintf("Log line: %q\nActual result slice: %#v", tc.line, result),
)
})
}
})
}
}
func TestTokenizationMemcpy(t *testing.T) {
buf := make([]byte, 0, 1000)
for i := 0; i < 300; i++ {
buf = append(buf, 'a', ' ')
}
tokenizer := tokenizer{rawLine: buf, maxTokens: 10}
tokenizer.tokenize()
require.Less(t, len(tokenizer.line), 100)
}
// Useful for running single test cases in isolation
func TestTokenizationPlayground(t *testing.T) {
tc := tokenizationTestCase{
"foo 121113.21231 bar 123.0.1.123 -123 -1231.11",
[]string{"foo", "<NUM>", "bar", "<IP>", "<NUM>", "<NUM>"},
}
result := PreprocessAndTokenize([]byte(tc.line))
assert.Equal(
t, tc.expResult, result,
fmt.Sprintf("Log line: %q\nActual result slice: %#v", tc.line, result),
)
}
func TestReplacementPlayground(t *testing.T) {
line := "Jul 1 00:21:28"
exp := "<TIMESTAMP>"
r := replacer{source: []byte(line)}
r.replaceWithPlaceholders()
assert.Equal(
t, exp, string(r.dest),
fmt.Sprintf("Log line: %q", line),
)
}