@ -21,53 +21,54 @@ var tokenizationCornerTestCases = []tokenizationTestCase{
} ,
{
" foo " ,
[ ] string { "foo" } ,
// TOFIX: maybe this is not exactly what we want? should we try to combine empty space into a single token?
[ ] string { " " , "foo " , " " } ,
} ,
{
"foo bar baz" ,
[ ] string { "foo" , "bar" , "baz" } ,
[ ] string { "foo " , "bar " , "baz" } ,
} ,
{
"\nfoo\t bar baz\r\n" ,
// TOFIX: remove empty tokens ?
[ ] string { "foo" , "" , "" , "bar" , "baz" } ,
// TOFIX: same as above - should we combine empty space into a single token ?
[ ] string { "\n" , " foo\t " , " " , " " , "bar " , "baz\r" , "\n " } ,
} ,
{
"ends single char C" ,
[ ] string { "ends" , "single" , "char" , "C" } ,
[ ] string { "ends " , "single " , "char " , "C" } ,
} ,
{
"0 1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z Z Y X W V U T S R Q P O N M L K J I H G F E D C B A z y x w v u t s r q p o n m l k j i h g f e d c b a 9 8 7 6 5 4 3 2 1 0" ,
// The tail end of strings longer than maxTokens is returned as a single token
[ ] string { "<NUM>" , "<NUM>" , "<NUM>" , "<NUM>" , "<NUM>" , "<NUM>" , "<NUM>" , "<NUM>" , "<NUM>" , "<NUM>" , "a" , "b" , "c" , "d" , "e" , "f" , "g" , "h" , "i" , "j" , "k" , "l" , "m" , "n" , "o" , "p" , "q" , "r" , "s" , "t" , "u" , "v" , "w" , "x" , "y" , "z" , "A" , "B" , "C" , "D" , "E" , "F" , "G" , "H" , "I" , "J" , "K" , "L" , "M" , "N" , "O" , "P" , "Q" , "R" , "S" , "T" , "U" , "V" , "W" , "X" , "Y" , "Z" , "Z" , "Y" , "X" , "W" , "V" , "U" , "T" , "S" , "R" , "Q" , "P" , "O" , "N" , "M" , "L" , "K" , "J" , "I" , "H" , "G" , "F" , "E" , "D" , "C" , "B" , "A" , "z" , "y" , "x" , "w" , "v" , "u" , "t" , "s" , "r" , "q" , "p" , "<...>" } ,
[ ] string { "<NUM> " , "<NUM> " , "<NUM> " , "<NUM> " , "<NUM> " , "<NUM> " , "<NUM> " , "<NUM> " , "<NUM> " , "<NUM> " , "a " , "b " , "c " , "d " , "e " , "f " , "g " , "h " , "i " , "j " , "k " , "l " , "m " , "n " , "o " , "p " , "q " , "r " , "s " , "t " , "u " , "v " , "w " , "x " , "y " , "z " , "A " , "B " , "C " , "D " , "E " , "F " , "G " , "H " , "I " , "J " , "K " , "L " , "M " , "N " , "O " , "P " , "Q " , "R " , "S " , "T " , "U " , "V " , "W " , "X " , "Y " , "Z " , "Z " , "Y " , "X " , "W " , "V " , "U " , "T " , "S " , "R " , "Q " , "P " , "O " , "N " , "M " , "L " , "K " , "J " , "I " , "H " , "G " , "F " , "E " , "D " , "C " , "B " , "A " , "z " , "y " , "x " , "w " , "v " , "u " , "t " , "s " , "r " , "q " , "p " , "<...>" } ,
} ,
{
` a "quoted string" ` ,
[ ] string { "a" , ` "quoted string" ` } ,
[ ] string { "a " , ` "quoted string" ` } ,
} ,
{
` a "quoted string with \" escapes!" ` ,
[ ] string { "a" , ` "quoted string with \" escapes!" ` } ,
[ ] string { "a " , ` "quoted string with \" escapes!" ` } ,
} ,
{
` a 'singly quoted string"' ` ,
[ ] string { "a" , ` 'singly quoted string"' ` } ,
[ ] string { "a " , ` 'singly quoted string"' ` } ,
} ,
{
` a 'singly quoted string" \'' ` ,
[ ] string { "a" , ` 'singly quoted string" \'' ` } ,
[ ] string { "a " , ` 'singly quoted string" \'' ` } ,
} ,
{
` a 'singly quoted string" \\\'' ` ,
[ ] string { "a" , ` 'singly quoted string" \\\'' ` } ,
[ ] string { "a " , ` 'singly quoted string" \\\'' ` } ,
} ,
{
` a'twisted excappe\\' "with an unterminated quote" 'at_the_end ` ,
[ ] string { ` a'twisted excappe\\' ` , ` "with an unterminated quote" ` , ` ' ` , "at_the_end" } ,
[ ] string { ` a'twisted excappe\\' ` , ` "with an unterminated quote" ` , ` ' ` , "at_the_end" } ,
} ,
{
` a "quoted string 'inception'"! ` + "`woot`'much`wow'" ,
[ ] string { "a" , ` "quoted string 'inception'"! ` + "`woot`'much`wow'" } ,
[ ] string { "a " , ` "quoted string 'inception'"! ` + "`woot`'much`wow'" } ,
} ,
{
` unterminated"quote ` ,
@ -75,11 +76,11 @@ var tokenizationCornerTestCases = []tokenizationTestCase{
} ,
{
"`mix`" + ` "and" 'match' ` ,
[ ] string { "`mix`" , ` "and" ` , ` 'match' ` } ,
[ ] string { "`mix` " , ` "and" ` , ` 'match' ` } ,
} ,
{
"`mix`" + ` "and" 'match' ` ,
[ ] string { "`mix`" , ` "and" ` , ` 'match' ` } ,
[ ] string { "`mix` " , ` "and" ` , ` 'match' ` } ,
} ,
{
` { "json":"uninterrupted \"logline\"","foo":"bar"} ` ,
@ -104,60 +105,60 @@ var tokenizationCornerTestCases = []tokenizationTestCase{
// We deliberately do treat "escaped" whitespaces outside of quotes as
// delimiters, i.e. whitespaces outside of strings cannot be escaped.
` weird\ escape ` ,
[ ] string { ` weird\ ` , ` escape ` } ,
[ ] string { ` weird\ ` , ` escape ` } ,
} ,
{
"-3.14-foo 0.0.0.0/24-0.0.0.1-255.255.255.255-256.255.255.255 1337-ber 0.12-ber n0tnumb3er 12faux -123.0.1.123 -123 -1231.11 333. 123.456. 123.45-" ,
[ ] string { "<NUM>-foo" , "<IP>/<NUM>-<IP>-<IP>-<NUM>.<NUM>.<NUM>.<NUM>" , "<NUM>-ber" , "<NUM>-ber" , "n0tnumb3er" , "12faux" , "-<IP>" , "<NUM>" , "<NUM>" , "<NUM>." , "<NUM>.<NUM>." , "<NUM>-" } ,
[ ] string { "<NUM>-foo " , "<IP>/<NUM>-<IP>-<IP>-<NUM>.<NUM>.<NUM>.<NUM> " , "<NUM>-ber " , "<NUM>-ber " , "n0tnumb3er " , "12faux " , "-<IP> " , "<NUM> " , "<NUM> " , "<NUM>. " , "<NUM>.<NUM>. " , "<NUM>-" } ,
} ,
{
"2022-12-31 12:12:31 3022-12-31 12:12:31-Jul 1 00:21:28" ,
[ ] string { "<TIMESTAMP>" , "<NUM>-<NUM>-<NUM>" , "<NUM>:<NUM>:<NUM>-<TIMESTAMP>" } ,
[ ] string { "<TIMESTAMP> " , "<NUM>-<NUM>-<NUM> " , "<NUM>:<NUM>:<NUM>-<TIMESTAMP>" } ,
} ,
{
"2022/12/01 12:12:31 - 2022/13/32 12:12:31" ,
[ ] string { "<TIMESTAMP>" , "-" , "<NUM>/<NUM>/<NUM>" , "<NUM>:<NUM>:<NUM>" } ,
[ ] string { "<TIMESTAMP> " , "- " , "<NUM>/<NUM>/<NUM> " , "<NUM>:<NUM>:<NUM>" } ,
} ,
{
"UUIDS: 123e4567-e89b-12d3-a456-426614174000, 550E8400-E29B-41D4-A716-446655440000, -00000000-0000-0000-0000-000000000000, 12345678-dead-beef-1337-000000000000 {c6ad1a63-10b5-460e-ab2c-05c13604539d} ''<A3AE4842-E9AA-4C27-9509-772DB3CC3190>''" ,
[ ] string { "UUIDS:" , "<UUID>," , "<UUID>," , "-<UUID>," , "<UUID>" , "{<UUID>}" , "''<<UUID>>''" } ,
[ ] string { "UUIDS: " , "<UUID>, " , "<UUID>, " , "-<UUID>, " , "<UUID> " , "{<UUID>} " , "''<<UUID>>''" } ,
} ,
// Mixed case UUID and hex strings are ignored, to limit false positives
{
"Not UUIDS: 123e4567-E89B-12d3-a456-426614174000, 1234567-dead-beef-1337-00000000000a" ,
[ ] string { "Not" , "UUIDS:" , "123e4567-E89B-12d3-a456-<NUM>," , "<NUM>-dead-beef-<NUM>-<HEX>" } ,
[ ] string { "Not " , "UUIDS: " , "123e4567-E89B-12d3-a456-<NUM>, " , "<NUM>-dead-beef-<NUM>-<HEX>" } ,
} ,
{
"Hexes: 0x0123456789 0xabcdef0123 deadbeef1337-ABCDEF0123456?0123456789ab:FFFFAAAAFFFF Curses: 0x012345678 dEaDbeef1337 abcdefabcde ABCDEFABCDE 0xASDFASDFASDF abcdef0123456NOT" ,
[ ] string { "Hexes:" , "<HEX>" , "<HEX>" , "<HEX>-<HEX>?<HEX>:<HEX>" , "Curses:" , "0x012345678" , "dEaDbeef1337" , "abcdefabcde" , "ABCDEFABCDE" , "0xASDFASDFASDF" , "abcdef0123456NOT" } ,
[ ] string { "Hexes: " , "<HEX> " , "<HEX> " , "<HEX>-<HEX>?<HEX>:<HEX> " , "Curses: " , "0x012345678 " , "dEaDbeef1337 " , "abcdefabcde " , "ABCDEFABCDE " , "0xASDFASDFASDF " , "abcdef0123456NOT" } ,
} ,
{
"30546354_3313121680 0_123_456_789 foo_123" ,
[ ] string { "<NUM>_<NUM>" , "<NUM>_<NUM>_<NUM>_<NUM>" , "foo_<NUM>" } ,
[ ] string { "<NUM>_<NUM> " , "<NUM>_<NUM>_<NUM>_<NUM> " , "foo_<NUM>" } ,
} ,
{
` 3.31ms/1h2m|-12h2m6.1s 31m "165m2.1s(6h0m12.05us)" -451325.31µs 6m23μs 123h21m3.4124561s/0s/-0.0123ms ` ,
[ ] string { "<DURATION>/<DURATION>|<DURATION>" , "<DURATION>" , ` "<DURATION>(<DURATION>)" ` , "<DURATION>" , "<DURATION>" , "<DURATION>/<DURATION>/<DURATION>" } ,
[ ] string { "<DURATION>/<DURATION>|<DURATION> " , "<DURATION> " , ` "<DURATION>(<DURATION>)" ` , "<DURATION> " , "<DURATION> " , "<DURATION>/<DURATION>/<DURATION>" } ,
} ,
{
// Invalid duration values
"3.31.1ms 3h121m3.4124561s 1h0.12s 100usa 0.12msa" ,
[ ] string { "<NUM>.<DURATION>" , "3h121m3.<DURATION>" , "1h0.<DURATION>" , "100usa" , "0.12msa" } ,
[ ] string { "<NUM>.<DURATION> " , "3h121m3.<DURATION> " , "1h0.<DURATION> " , "100usa " , "0.12msa" } ,
} ,
{
// We only consider integers to be valid bytesizes in bytes (0.2B doesn't make sense)
"2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit 5 B;124.1 KB/3b - 2b or 2 BeNot 13.37 b 3 b" ,
[ ] string {
"<BYTESIZE>" , "<BYTESIZE>-<BYTESIZE>" , "<BYTESIZE>" , "<BYTESIZE>" , "<BYTESIZE>:<BYTESIZE>" ,
"<BYTESIZE>;<BYTESIZE>/<BYTESIZE>" , "-" , "<BYTESIZE>" , "or" , "<NUM>" , "BeNot" , "<NUM>" , "b" , "<BYTESIZE>" } ,
"<BYTESIZE> " , "<BYTESIZE>-<BYTESIZE> " , "<BYTESIZE> " , "<BYTESIZE> " , "<BYTESIZE>:<BYTESIZE> " ,
"<BYTESIZE>;<BYTESIZE>/<BYTESIZE> " , "- " , "<BYTESIZE> " , "or " , "<NUM> " , "BeNot " , "<NUM> " , "b " , "<BYTESIZE>" } ,
} ,
{
` status=123 status_code:500 status 200 status="-1" status_code:"404" httpStatus=200 ` ,
[ ] string { "status=123" , "status_code:500" , "status" , "200" , ` status="-1" ` , ` status_code:"404" ` , "httpStatus=200" } ,
[ ] string { "status=123 " , "status_code:500 " , "status " , "200 " , ` status="-1" ` , ` status_code:"404" ` , "httpStatus=200" } ,
} ,
{
` status_code_foo=123 status_code:500.1 status 2023-09-06T00:59:59.98 status:"404KiB" ` ,
[ ] string { "status_code_foo=<NUM>" , "status_code:<NUM>" , "status" , "<TIMESTAMP>" , ` status:"<BYTESIZE>" ` } ,
[ ] string { "status_code_foo=<NUM> " , "status_code:<NUM> " , "status " , "<TIMESTAMP> " , ` status:"<BYTESIZE>" ` } ,
} ,
}
@ -168,27 +169,27 @@ var tokenizationRealisticTestCases = []tokenizationTestCase{
` level=info ts=2023-09-06T00:59:59.982171323Z caller=metrics.go:160 component=frontend org_id=29 traceID=4b93729ff3efabd0 latency=fast query=" { stream=\"stdout\",pod=\"loki-canary-nl54q\"} " query_hash=1280418884 query_type=limited range_type=range length=20s start_delta=2h54m30.690801022s end_delta=2h54m10.690801238s step=1s duration=13.926955ms status=200 limit=1000 returned_lines=0 throughput=16MB total_bytes=219kB total_bytes_non_indexed_labels=2.1kB lines_per_second=14935 total_lines=208 post_filter_lines=208 total_entries=41 store_chunks_download_time=1.592805ms queue_time=127µs splits=0 shards=0 chunk_refs_fetch_time=3.599883ms cache_chunk_req=1 cache_chunk_hit=1 cache_chunk_bytes_stored=0 cache_chunk_bytes_fetched=480079 cache_chunk_download_time=1.307396ms cache_index_req=0 cache_index_hit=0 cache_index_download_time=0s cache_stats_results_req=1 cache_stats_results_hit=1 cache_stats_results_download_time=361.913µs cache_result_req=0 cache_result_hit=0 cache_result_download_time=0s token_id=gcom-1234 ` ,
[ ] string {
"level=info" , "ts=<TIMESTAMP>" , "caller=metrics.go:<NUM>" , "component=frontend" , "org_id=<NUM>" , "traceID=<HEX>" , "latency=fast" , ` query=" { stream=\"stdout\",pod=\"loki-canary-nl54q\"} " ` , "query_hash=<NUM>" , "query_type=limited" , "range_type=range" , "length=<DURATION>" , "start_delta=<DURATION>" , "end_delta=<DURATION>" , "step=<DURATION>" , "duration=<DURATION>" , "status=200" , "limit=<NUM>" , "returned_lines=<NUM>" , "throughput=<BYTESIZE>" , "total_bytes=<BYTESIZE>" , "total_bytes_non_indexed_labels=<BYTESIZE>" , "lines_per_second=<NUM>" , "total_lines=<NUM>" , "post_filter_lines=<NUM>" , "total_entries=<NUM>" , "store_chunks_download_time=<DURATION>" , "queue_time=<DURATION>" , "splits=<NUM>" , "shards=<NUM>" , "chunk_refs_fetch_time=<DURATION>" , "cache_chunk_req=<NUM>" , "cache_chunk_hit=<NUM>" , "cache_chunk_bytes_stored=<NUM>" , "cache_chunk_bytes_fetched=<NUM>" , "cache_chunk_download_time=<DURATION>" , "cache_index_req=<NUM>" , "cache_index_hit=<NUM>" , "cache_index_download_time=<DURATION>" , "cache_stats_results_req=<NUM>" , "cache_stats_results_hit=<NUM>" , "cache_stats_results_download_time=<DURATION>" , "cache_result_req=<NUM>" , "cache_result_hit=<NUM>" , "cache_result_download_time=<DURATION>" , "token_id=gcom-<NUM>" ,
"level=info " , "ts=<TIMESTAMP> " , "caller=metrics.go:<NUM> " , "component=frontend " , "org_id=<NUM> " , "traceID=<HEX> " , "latency=fast " , ` query=" { stream=\"stdout\",pod=\"loki-canary-nl54q\"} " ` , "query_hash=<NUM> " , "query_type=limited " , "range_type=range " , "length=<DURATION> " , "start_delta=<DURATION> " , "end_delta=<DURATION> " , "step=<DURATION> " , "duration=<DURATION> " , "status=200 " , "limit=<NUM> " , "returned_lines=<NUM> " , "throughput=<BYTESIZE> " , "total_bytes=<BYTESIZE> " , "total_bytes_non_indexed_labels=<BYTESIZE> " , "lines_per_second=<NUM> " , "total_lines=<NUM> " , "post_filter_lines=<NUM> " , "total_entries=<NUM> " , "store_chunks_download_time=<DURATION> " , "queue_time=<DURATION> " , "splits=<NUM> " , "shards=<NUM> " , "chunk_refs_fetch_time=<DURATION> " , "cache_chunk_req=<NUM> " , "cache_chunk_hit=<NUM> " , "cache_chunk_bytes_stored=<NUM> " , "cache_chunk_bytes_fetched=<NUM> " , "cache_chunk_download_time=<DURATION> " , "cache_index_req=<NUM> " , "cache_index_hit=<NUM> " , "cache_index_download_time=<DURATION> " , "cache_stats_results_req=<NUM> " , "cache_stats_results_hit=<NUM> " , "cache_stats_results_download_time=<DURATION> " , "cache_result_req=<NUM> " , "cache_result_hit=<NUM> " , "cache_result_download_time=<DURATION> " , "token_id=gcom-<NUM>" ,
} ,
} ,
// logfmt from loki, with string multi-word messages
{
` level=debug ts=2023-09-06T00:59:59.98214402Z caller=shard_resolver.go:114 bytes=205kB chunks=2 streams=2 entries=200 msg="queried index" type=single matchers=" { stream=\"stdout\", pod=\"loki-canary-v75j4\"}" duration=9.498885ms from=2023-09-06T00:48:53.138Z through=2023-09-06T00:49:43.138Z length=50s ` ,
[ ] string {
"level=debug" , "ts=<TIMESTAMP>" , "caller=shard_resolver.go:<NUM>" , "bytes=<BYTESIZE>" , "chunks=<NUM>" , "streams=<NUM>" , "entries=<NUM>" , ` msg="queried index" ` , "type=single" , ` matchers=" { stream=\"stdout\", pod=\"loki-canary-v75j4\"}" ` , "duration=<DURATION>" , "from=<TIMESTAMP>" , "through=<TIMESTAMP>" , "length=<DURATION>" ,
"level=debug " , "ts=<TIMESTAMP> " , "caller=shard_resolver.go:<NUM> " , "bytes=<BYTESIZE> " , "chunks=<NUM> " , "streams=<NUM> " , "entries=<NUM> " , ` msg="queried index" ` , "type=single " , ` matchers=" { stream=\"stdout\", pod=\"loki-canary-v75j4\"}" ` , "duration=<DURATION> " , "from=<TIMESTAMP> " , "through=<TIMESTAMP> " , "length=<DURATION>" ,
} ,
} ,
// tricky loki distributor message:
{
` level=debug ts=2024-07-12T12:25:06.175464934Z caller=push.go:146 org_id=29 traceID=7af4f918eab1c80f msg="push request parsed" path=/loki/api/v1/push contentType=application/x-protobuf contentEncoding= bodySize="8.8 kB" streams=11 entries=43 streamLabelsSize="3.4 kB" entriesSize="19 kB" structuredMetadataSize="71 B" totalSize="22 kB" mostRecentLagMs=167 adaptiveLogsDroppedLines=10 adaptiveLogsDroppedSize=4965 adaptiveLogsMatchedLines=37 ` ,
[ ] string {
"level=debug" , "ts=<TIMESTAMP>" , "caller=push.go:<NUM>" , "org_id=<NUM>" , "traceID=<HEX>" , ` msg="push request parsed" ` , "path=/loki/api/v1/push" , "contentType=application/x-protobuf" , "contentEncoding=" , ` bodySize="<BYTESIZE>" ` , "streams=<NUM>" , "entries=<NUM>" , ` streamLabelsSize="<BYTESIZE>" ` , ` entriesSize="<BYTESIZE>" ` , ` structuredMetadataSize="<BYTESIZE>" ` , ` totalSize="<BYTESIZE>" ` , "mostRecentLagMs=<NUM>" , "adaptiveLogsDroppedLines=<NUM>" , "adaptiveLogsDroppedSize=<NUM>" , "adaptiveLogsMatchedLines=<NUM>" ,
"level=debug " , "ts=<TIMESTAMP> " , "caller=push.go:<NUM> " , "org_id=<NUM> " , "traceID=<HEX> " , ` msg="push request parsed" ` , "path=/loki/api/v1/push " , "contentType=application/x-protobuf " , "contentEncoding= " , ` bodySize="<BYTESIZE>" ` , "streams=<NUM> " , "entries=<NUM> " , ` streamLabelsSize="<BYTESIZE>" ` , ` entriesSize="<BYTESIZE>" ` , ` structuredMetadataSize="<BYTESIZE>" ` , ` totalSize="<BYTESIZE>" ` , "mostRecentLagMs=<NUM> " , "adaptiveLogsDroppedLines=<NUM> " , "adaptiveLogsDroppedSize=<NUM> " , "adaptiveLogsMatchedLines=<NUM>" ,
} ,
} ,
// random JSON logs
{
` { "timestamp": "2022-12-23T12:34:56Z", "level": "debug", "message": "Server starting", "server_id": "abcdefghij", "start_time": "2022-12-23T12:30:00Z"} ` ,
[ ] string { ` { "timestamp": ` , ` "<TIMESTAMP>", ` , ` "level": ` , ` "debug", ` , ` "message": ` , ` "Server starting", ` , ` "server_id": ` , ` "abcdefghij", ` , ` "start_time": ` , ` "<TIMESTAMP>"} ` } ,
[ ] string { ` { "timestamp": ` , ` "<TIMESTAMP>", ` , ` "level": ` , ` "debug", ` , ` "message": ` , ` "Server starting", ` , ` "server_id": ` , ` "abcdefghij", ` , ` "start_time": ` , ` "<TIMESTAMP>"} ` } ,
} ,
{
// JSON logs without spaces between elements, like how JavaScript's JSON.Stringify() produces:
@ -203,7 +204,7 @@ var tokenizationRealisticTestCases = []tokenizationTestCase{
// - timestamp is not correctly detected
// - empty "" token?
` 03-17 16:13:40.345 1702 14638 D PowerManagerService: release:lock=166121161, flg=0x0, tag="RILJ_ACK_WL", name=com.android.phone", ws=null, uid=1001, pid=2626 ` ,
[ ] string { "<NUM>-<NUM>" , "<NUM>:<NUM>:<NUM>" , "" , "<NUM>" , "<NUM>" , "D" , "PowerManagerService:" , "release:lock=<NUM>," , "flg=0x0," , ` tag="RILJ_ACK_WL", ` , ` name=com.android.phone" ` , "," , "ws=null," , "uid=<NUM>," , "pid=<NUM>" } ,
[ ] string { "<NUM>-<NUM> " , "<NUM>:<NUM>:<NUM> " , " " , "<NUM> " , "<NUM> " , "D " , "PowerManagerService: " , "release:lock=<NUM>, " , "flg=0x0, " , ` tag="RILJ_ACK_WL", ` , ` name=com.android.phone" ` , ", " , "ws=null, " , "uid=<NUM>, " , "pid=<NUM>" } ,
} ,
{
// TOFIX:
@ -211,52 +212,52 @@ var tokenizationRealisticTestCases = []tokenizationTestCase{
// - empty "" tokens
` 03-17 16:13:47.518 1702 8671 D ActivityManager: Skipping, withExcluded: false, tr.intent:Intent { act=android.intent.action.VIEW dat=file:///storage/emulated/0/Tencent/QQfile_recv/b.apk typ=application/vnd.android.package-archive flg=0x10800000 cmp=com.android.packageinstaller/.PackageInstallerActivity (has extras) } ` ,
[ ] string {
"<NUM>-<NUM>" , "<NUM>:<NUM>:<NUM>" , "" , "<NUM>" , "" , "<NUM>" , "D" , "ActivityManager:" , "Skipping," , "withExcluded:" , "false," , "tr.intent:Intent" , "{" , "act=android.intent.action.VIEW" , "dat=file:///storage/emulated/<NUM>/Tencent/QQfile_recv/b.apk" , "typ=application/vnd.android.package-archive" , "flg=0x10800000" , "cmp=com.android.packageinstaller/.PackageInstallerActivity" , "(has" , "extras)" , "}" ,
"<NUM>-<NUM> " , "<NUM>:<NUM>:<NUM> " , " " , "<NUM> " , " " , "<NUM> " , "D " , "ActivityManager: " , "Skipping, " , "withExcluded: " , "false, " , "tr.intent:Intent " , "{ " , "act=android.intent.action.VIEW " , "dat=file:///storage/emulated/<NUM>/Tencent/QQfile_recv/b.apk " , "typ=application/vnd.android.package-archive " , "flg=0x10800000 " , "cmp=com.android.packageinstaller/.PackageInstallerActivity " , "(has " , "extras) " , "}" ,
} ,
} ,
// Apache logs from https://github.com/logpai/logparser/blob/main/data/loghub_2k/Apache/Apache_2k.log
{
` [Mon Dec 05 13:16:27 2005] [notice] jk2_init() Found child 5877 in scoreboard slot 9 ` ,
[ ] string { "[<TIMESTAMP>]" , "[notice]" , "jk2_init()" , "Found" , "child" , "<NUM>" , "in" , "scoreboard" , "slot" , "<NUM>" } ,
[ ] string { "[<TIMESTAMP>] " , "[notice] " , "jk2_init() " , "Found " , "child " , "<NUM> " , "in " , "scoreboard " , "slot " , "<NUM>" } ,
} ,
{
` [Mon Dec 05 19:14:11 2005] [notice] workerEnv.init() ok /etc/httpd/conf/workers2.properties ` ,
[ ] string { "[<TIMESTAMP>]" , "[notice]" , "workerEnv.init()" , "ok" , "/etc/httpd/conf/workers2.properties" } ,
[ ] string { "[<TIMESTAMP>] " , "[notice] " , "workerEnv.init() " , "ok " , "/etc/httpd/conf/workers2.properties" } ,
} ,
// nginx logs by running `docker run -p 80:80 -v $(pwd):/usr/share/nginx/html nginx` locally
{
` 2024/03/27 14:31:42 [error] 29#29: *1 directory index of "/usr/share/nginx/html/" is forbidden, client: 172.17.0.1, server: localhost, request: "GET / HTTP/1.1", host: "127.0.0.1" ` ,
[ ] string { "<TIMESTAMP>" , "[error]" , "<NUM>#<NUM>:" , "*<NUM>" , "directory" , "index" , "of" , "\"/usr/share/nginx/html/\"" , "is" , "forbidden," , "client:" , "<IP>," , "server:" , "localhost," , "request:" , ` "GET / HTTP/<NUM>", ` , "host:" , "\"<IP>\"" } ,
[ ] string { "<TIMESTAMP> " , "[error] " , "<NUM>#<NUM>: " , "*<NUM> " , "directory " , "index " , "of " , "\"/usr/share/nginx/html/\" " , "is " , "forbidden, " , "client: " , "<IP>, " , "server: " , "localhost, " , "request: " , ` "GET / HTTP/<NUM>", ` , "host: " , "\"<IP>\"" } ,
} ,
{
// TOFIX:
// - probably not all numbers should be replaced with <NUM>, e.g. for "*1", "(2:", "HTTP/1.1" it's definitely a worse UX
` 2024 / 03 / 27 14 : 34 : 37 [ error ] 29 # 29 : * 1 open ( ) "/usr/share/nginx/html/test url with spaces" failed ( 2 : No such file or directory ) , client : 172.17 .0 .1 , server : localhost , request : "GET /test%20url%20with%20spaces HTTP/1.1" , host : "127.0.0.1"
172.17 .0 .1 - - [ 31 / Mar / 2024 : 14 : 34 : 37 + 0000 ] "GET /test%20url%20with%20spaces HTTP/1.1" 404 153 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0" "-" ` ,
[ ] string { "<TIMESTAMP>" , "[error]" , "<NUM>#<NUM>:" , "*<NUM>" , "open()" , ` "/usr/share/nginx/html/test url with spaces" ` , "failed" , "(<NUM>:" , "No" , "such" , "file" , "or" , "directory)," , "client:" , "<IP>," , "server:" , "localhost," , "request:" , ` "GET /test%20url%20with%20spaces HTTP/<NUM>", ` , "host:" , "\"<IP>\"" , "" , "" , "<IP>" , "-" , "-" , "[<TIMESTAMP>]" , ` "GET /test%20url%20with%20spaces HTTP/<NUM>" ` , "<NUM>" , "<NUM>" , "\"-\"" , ` "Mozilla/<NUM> (X11; Linux x86_<NUM>; rv:<NUM>) Gecko/<NUM> Firefox/<NUM>" ` , ` "-" ` } ,
[ ] string { "<TIMESTAMP> " , "[error] " , "<NUM>#<NUM>: " , "*<NUM> " , "open() " , ` "/usr/share/nginx/html/test url with spaces" ` , "failed " , "(<NUM>: " , "No " , "such " , "file " , "or " , "directory), " , "client: " , "<IP>, " , "server: " , "localhost, " , "request: " , ` "GET /test%20url%20with%20spaces HTTP/<NUM>", ` , "host: " , "\"<IP>\"\n " , "\t " , "\t " , "<IP> " , "- " , "- " , "[<TIMESTAMP>] " , ` "GET /test%20url%20with%20spaces HTTP/<NUM>" ` , "<NUM> " , "<NUM> " , "\"-\" " , ` "Mozilla/<NUM> (X11; Linux x86_<NUM>; rv:<NUM>) Gecko/<NUM> Firefox/<NUM>" ` , ` "-" ` } ,
} ,
// Linux systemd (journalctl) logs
{
` Mar 27 11:52:21 hostname systemd-networkd[2043]: enp6s0: LLDP Rx: Invoking callback for 'refreshed' event. ` ,
[ ] string { "<TIMESTAMP>" , "hostname" , "systemd-networkd[<NUM>]:" , "enp6s0:" , "LLDP" , "Rx:" , "Invoking" , "callback" , "for" , "'refreshed'" , "event." } ,
[ ] string { "<TIMESTAMP> " , "hostname " , "systemd-networkd[<NUM>]: " , "enp6s0: " , "LLDP " , "Rx: " , "Invoking " , "callback " , "for " , "'refreshed' " , "event." } ,
} ,
{
` Feb 29 23:00:14 nixos dbus-daemon[11432]: [system] Activating via systemd: service name='org.opensuse.Snapper' unit='snapperd.service' requested by ':1.324' (uid=0 pid=22089 comm="/nix/store/7rgimysvkczzyiaq4fkfymyjad4vbd9c-snappe" label="kernel") ` ,
[ ] string { "<TIMESTAMP>" , "nixos" , "dbus-daemon[<NUM>]:" , "[system]" , "Activating" , "via" , "systemd:" , "service" , "name='org.opensuse.Snapper'" , "unit='snapperd.service'" , "requested" , "by" , "':<NUM>'" , "(uid=<NUM>" , "pid=<NUM>" , "comm=\"/nix/store/7rgimysvkczzyiaq4fkfymyjad4vbd9c-snappe\"" , "label=\"kernel\")" } ,
[ ] string { "<TIMESTAMP> " , "nixos " , "dbus-daemon[<NUM>]: " , "[system] " , "Activating " , "via " , "systemd: " , "service " , "name='org.opensuse.Snapper' " , "unit='snapperd.service' " , "requested " , "by " , "':<NUM>' " , "(uid=<NUM> " , "pid=<NUM> " , "comm=\"/nix/store/7rgimysvkczzyiaq4fkfymyjad4vbd9c-snappe\" " , "label=\"kernel\")" } ,
} ,
// Random slack logs:
{
` Apr-10 23:43:46.807 [API-Q] (T02S4RCS0) c37dfd20-1712781826.804 conversations.suggestions is ACTIVE ` ,
[ ] string { "<TIMESTAMP>" , "[API-Q]" , "(T02S4RCS0)" , "c37dfd20-<NUM>" , "conversations.suggestions" , "is" , "ACTIVE" } ,
[ ] string { "<TIMESTAMP> " , "[API-Q] " , "(T02S4RCS0) " , "c37dfd20-<NUM> " , "conversations.suggestions " , "is " , "ACTIVE" } ,
} ,
{
` Apr-11 00:01:57.743 [DEVICE-PERMISSIONS-MA] Permissions saved to local storage: { "permissions": { "microphone":"granted","camera":"prompt","screen":"prompt"},"timestamp":1712782917742} ` ,
[ ] string { "<TIMESTAMP>" , "[DEVICE-PERMISSIONS-MA]" , "Permissions" , "saved" , "to" , "local" , "storage:" , ` { "permissions": ` , ` { "microphone": ` , ` "granted" ` , ` ,"camera": ` , ` "prompt" ` , ` ,"screen": ` , ` "prompt"} ` , ` ,"timestamp": ` , ` <NUM>} ` } ,
[ ] string { "<TIMESTAMP> " , "[DEVICE-PERMISSIONS-MA] " , "Permissions " , "saved " , "to " , "local " , "storage: " , ` { "permissions": ` , ` { "microphone": ` , ` "granted" ` , ` ,"camera": ` , ` "prompt" ` , ` ,"screen": ` , ` "prompt"} ` , ` ,"timestamp": ` , ` <NUM>} ` } ,
} ,
// Another weird log from loki:
{
` ts=2023-09-06T00:59:59.900879737Z caller=spanlogger.go:86 user=29 level=debug msg="querying ingester" params="selector= { stream=\"stdout\", pod=\"loki-canary-t98wq\"}, direction=BACKWARD, start=2023-09-05 23:20:28.030285153 +0000 UTC, end=2023-09-05 23:20:48.030285153 +0000 UTC, limit=1000, shards=" ` ,
[ ] string { "ts=<TIMESTAMP>" , "caller=spanlogger.go:<NUM>" , "user=<NUM>" , "level=debug" , ` msg="querying ingester" ` , ` params="selector= { stream=\"stdout\", pod=\"loki-canary-t98wq\"}, direction=BACKWARD, start=<TIMESTAMP>, end=<TIMESTAMP>, limit=<NUM>, shards=" ` } ,
[ ] string { "ts=<TIMESTAMP> " , "caller=spanlogger.go:<NUM> " , "user=<NUM> " , "level=debug " , ` msg="querying ingester" ` , ` params="selector= { stream=\"stdout\", pod=\"loki-canary-t98wq\"}, direction=BACKWARD, start=<TIMESTAMP>, end=<TIMESTAMP>, limit=<NUM>, shards=" ` } ,
} ,
// {``, []string{}},
}
@ -364,7 +365,7 @@ func TestTokenizationMemcpy(t *testing.T) {
func TestTokenizationPlayground ( t * testing . T ) {
tc := tokenizationTestCase {
"foo 121113.21231 bar 123.0.1.123 -123 -1231.11" ,
[ ] string { "foo" , "<NUM>" , "bar" , "<IP>" , "<NUM>" , "<NUM>" } ,
[ ] string { "foo " , "<NUM> " , "bar " , "<IP> " , "<NUM> " , "<NUM>" } ,
}
result := PreprocessAndTokenize ( [ ] byte ( tc . line ) )
assert . Equal (