mirror of https://github.com/postgres/postgres
A security patch changed the pg_mblen*() calls today, so close the coverage gap now. Test that buffer overrun is avoided when pg_mblen*() requires more bytes than remain in the input.

This does not cover the calls in dict_thesaurus.c or in dict_synonym.c. That code is straightforward. To change that code's input, one must have access to modify installed OS files, so low-privilege users are not a threat. Testing it would likewise require changing the installed share/postgresql/tsearch_data, which was enough of an obstacle that no such test was written.

Security: CVE-2026-2006
Backpatch-through: 14
Co-authored-by: Thomas Munro <thomas.munro@gmail.com>
Co-authored-by: Noah Misch <noah@leadboat.com>
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>

REL_16_STABLE
parent
d837fb0292
commit
4c08960d97
@ -0,0 +1,8 @@ |
||||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset |
||||
\if :skip_test |
||||
\quit |
||||
\endif |
||||
-- Index 50 translations of the word "Mathematics" |
||||
CREATE TEMP TABLE mb (s text); |
||||
\copy mb from 'data/trgm_utf8.data' |
||||
CREATE INDEX ON mb USING gist(s gist_trgm_ops); |
||||
@ -0,0 +1,3 @@ |
||||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset |
||||
\if :skip_test |
||||
\quit |
||||
@ -0,0 +1,9 @@ |
||||
-- This script feeds multibyte text to the trigram opclass, so it is skipped
-- unless the server encoding is UTF8.
SELECT NOT (getdatabaseencoding() = 'UTF8') AS skip_test \gset
\if :skip_test
    \quit
\endif

-- Load 50 translations of the word "Mathematics" and build a GiST trigram
-- index over them.
CREATE TEMPORARY TABLE mb (s text);
\copy mb FROM 'data/trgm_utf8.data'
CREATE INDEX ON mb USING gist (s gist_trgm_ops);
||||
@ -0,0 +1,401 @@ |
||||
/* skip test if not UTF8 server encoding */ |
||||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset |
||||
\if :skip_test |
||||
\quit |
||||
\endif |
||||
\getenv libdir PG_LIBDIR |
||||
\getenv dlsuffix PG_DLSUFFIX |
||||
\set regresslib :libdir '/regress' :dlsuffix |
||||
CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text |
||||
AS :'regresslib' LANGUAGE C STRICT; |
||||
CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea |
||||
AS :'regresslib' LANGUAGE C STRICT; |
||||
CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int |
||||
AS :'regresslib' LANGUAGE C STRICT; |
||||
CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[] |
||||
AS :'regresslib' LANGUAGE C STRICT; |
||||
CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text |
||||
AS :'regresslib' LANGUAGE C STRICT; |
||||
CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean |
||||
AS :'regresslib' LANGUAGE C STRICT; |
||||
CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text); |
||||
INSERT INTO regress_encoding |
||||
VALUES ('café', |
||||
'caf' || test_bytea_to_text('\xc3'), |
||||
'café' || test_bytea_to_text('\x00') || 'dcba', |
||||
'caf' || test_bytea_to_text('\xc300') || 'dcba'); |
||||
SELECT good, truncated, with_nul FROM regress_encoding; |
||||
good | truncated | with_nul |
||||
------+-----------+---------- |
||||
café | caf | café |
||||
(1 row) |
||||
|
||||
SELECT length(good) FROM regress_encoding; |
||||
length |
||||
-------- |
||||
4 |
||||
(1 row) |
||||
|
||||
SELECT substring(good, 3, 1) FROM regress_encoding; |
||||
substring |
||||
----------- |
||||
f |
||||
(1 row) |
||||
|
||||
SELECT substring(good, 4, 1) FROM regress_encoding; |
||||
substring |
||||
----------- |
||||
é |
||||
(1 row) |
||||
|
||||
SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding; |
||||
regexp_replace |
||||
---------------- |
||||
é |
||||
(1 row) |
||||
|
||||
SELECT reverse(good) FROM regress_encoding; |
||||
reverse |
||||
--------- |
||||
éfac |
||||
(1 row) |
||||
|
||||
-- invalid short mb character = error |
||||
SELECT length(truncated) FROM regress_encoding; |
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3 |
||||
SELECT substring(truncated, 1, 1) FROM regress_encoding; |
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3 |
||||
SELECT reverse(truncated) FROM regress_encoding; |
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3 |
||||
-- invalid short mb character = silently dropped |
||||
SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; |
||||
regexp_replace |
||||
---------------- |
||||
caf |
||||
(1 row) |
||||
|
||||
-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string |
||||
-- contains NUL at a character boundary position, some functions treat it as a |
||||
-- character while others treat it as a terminator, as implementation details. |
||||
-- NUL = terminator |
||||
SELECT length(with_nul) FROM regress_encoding; |
||||
length |
||||
-------- |
||||
4 |
||||
(1 row) |
||||
|
||||
SELECT substring(with_nul, 3, 1) FROM regress_encoding; |
||||
substring |
||||
----------- |
||||
f |
||||
(1 row) |
||||
|
||||
SELECT substring(with_nul, 4, 1) FROM regress_encoding; |
||||
substring |
||||
----------- |
||||
é |
||||
(1 row) |
||||
|
||||
SELECT substring(with_nul, 5, 1) FROM regress_encoding; |
||||
substring |
||||
----------- |
||||
|
||||
(1 row) |
||||
|
||||
SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding; |
||||
convert_to |
||||
------------ |
||||
\x |
||||
(1 row) |
||||
|
||||
SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding; |
||||
regexp_replace |
||||
---------------- |
||||
é |
||||
(1 row) |
||||
|
||||
-- NUL = character |
||||
SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding; |
||||
with_nul | reverse | reverse |
||||
----------+---------+--------- |
||||
café | abcd | café |
||||
(1 row) |
||||
|
||||
-- If a corrupted string contains NUL in the tail bytes of a multibyte |
||||
-- character (invalid in all encodings), it is considered part of the |
||||
-- character for length purposes. An error will only be raised in code paths |
||||
-- that convert or verify encodings. |
||||
SELECT length(truncated_with_nul) FROM regress_encoding; |
||||
length |
||||
-------- |
||||
8 |
||||
(1 row) |
||||
|
||||
SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding; |
||||
substring |
||||
----------- |
||||
f |
||||
(1 row) |
||||
|
||||
SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding; |
||||
substring |
||||
----------- |
||||
|
||||
(1 row) |
||||
|
||||
SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding; |
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3 0x00 |
||||
SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding; |
||||
substring |
||||
----------- |
||||
d |
||||
(1 row) |
||||
|
||||
SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding; |
||||
?column? |
||||
---------- |
||||
t |
||||
(1 row) |
||||
|
||||
SELECT reverse(truncated_with_nul) FROM regress_encoding; |
||||
reverse |
||||
--------- |
||||
abcd |
||||
(1 row) |
||||
|
||||
-- unbounded: sequence would overrun the string! |
||||
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3) |
||||
FROM regress_encoding; |
||||
test_mblen_func |
||||
----------------- |
||||
2 |
||||
(1 row) |
||||
|
||||
-- condition detected when using the length/range variants |
||||
SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3) |
||||
FROM regress_encoding; |
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3 |
||||
SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3) |
||||
FROM regress_encoding; |
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3 |
||||
-- unbounded: sequence would overrun the string, if the terminator were really |
||||
-- the end of it |
||||
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3) |
||||
FROM regress_encoding; |
||||
test_mblen_func |
||||
----------------- |
||||
2 |
||||
(1 row) |
||||
|
||||
SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3) |
||||
FROM regress_encoding; |
||||
test_mblen_func |
||||
----------------- |
||||
2 |
||||
(1 row) |
||||
|
||||
-- condition detected when using the cstr variants |
||||
SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3) |
||||
FROM regress_encoding; |
||||
ERROR: invalid byte sequence for encoding "UTF8": 0xc3 |
||||
DROP TABLE regress_encoding; |
||||
-- mb<->wchar conversions |
||||
CREATE FUNCTION test_encoding(encoding text, description text, input bytea) |
||||
RETURNS VOID LANGUAGE plpgsql AS |
||||
$$ |
||||
DECLARE |
||||
prefix text; |
||||
len int; |
||||
wchars int[]; |
||||
round_trip bytea; |
||||
result text; |
||||
BEGIN |
||||
prefix := rpad(encoding || ' ' || description || ':', 28); |
||||
|
||||
-- XXX could also test validation, length functions and include client |
||||
-- only encodings with these test cases |
||||
|
||||
IF test_valid_server_encoding(encoding) THEN |
||||
wchars := test_text_to_wchars(encoding, test_bytea_to_text(input)); |
||||
round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars)); |
||||
if input = round_trip then |
||||
result := 'OK'; |
||||
elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then |
||||
result := 'truncated'; |
||||
else |
||||
result := 'failed'; |
||||
end if; |
||||
RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result; |
||||
END IF; |
||||
END; |
||||
$$; |
||||
-- No validation is done on the encoding itself, just the length to avoid |
||||
-- overruns, so some of the byte sequences below are bogus. They cover |
||||
-- all code branches, server encodings only for now. |
||||
CREATE TABLE encoding_tests (encoding text, description text, input bytea); |
||||
INSERT INTO encoding_tests VALUES |
||||
-- LATIN1, other single-byte encodings |
||||
('LATIN1', 'ASCII', 'a'), |
||||
('LATIN1', 'extended', '\xe9'), |
||||
-- EUC_JP, EUC_JIS_2004, EUR_KR (for the purposes of wchar conversion): |
||||
-- 2 8e (CS2, not used by EUR_KR but arbitrarily considered to have EUC_JP length) |
||||
-- 3 8f (CS3, not used by EUR_KR but arbitrarily considered to have EUC_JP length) |
||||
-- 2 80..ff (CS1) |
||||
('EUC_JP', 'ASCII', 'a'), |
||||
('EUC_JP', 'CS1, short', '\x80'), |
||||
('EUC_JP', 'CS1', '\x8002'), |
||||
('EUC_JP', 'CS2, short', '\x8e'), |
||||
('EUC_JP', 'CS2', '\x8e02'), |
||||
('EUC_JP', 'CS3, short', '\x8f'), |
||||
('EUC_JP', 'CS3, short', '\x8f02'), |
||||
('EUC_JP', 'CS3', '\x8f0203'), |
||||
-- EUC_CN |
||||
-- 3 8e (CS2, not used but arbitrarily considered to have length 3) |
||||
-- 3 8f (CS3, not used but arbitrarily considered to have length 3) |
||||
-- 2 80..ff (CS1) |
||||
('EUC_CN', 'ASCII', 'a'), |
||||
('EUC_CN', 'CS1, short', '\x80'), |
||||
('EUC_CN', 'CS1', '\x8002'), |
||||
('EUC_CN', 'CS2, short', '\x8e'), |
||||
('EUC_CN', 'CS2, short', '\x8e02'), |
||||
('EUC_CN', 'CS2', '\x8e0203'), |
||||
('EUC_CN', 'CS3, short', '\x8f'), |
||||
('EUC_CN', 'CS3, short', '\x8f02'), |
||||
('EUC_CN', 'CS3', '\x8f0203'), |
||||
-- EUC_TW: |
||||
-- 4 8e (CS2) |
||||
-- 3 8f (CS3, not used but arbitrarily considered to have length 3) |
||||
-- 2 80..ff (CS1) |
||||
('EUC_TW', 'ASCII', 'a'), |
||||
('EUC_TW', 'CS1, short', '\x80'), |
||||
('EUC_TW', 'CS1', '\x8002'), |
||||
('EUC_TW', 'CS2, short', '\x8e'), |
||||
('EUC_TW', 'CS2, short', '\x8e02'), |
||||
('EUC_TW', 'CS2, short', '\x8e0203'), |
||||
('EUC_TW', 'CS2', '\x8e020304'), |
||||
('EUC_TW', 'CS3, short', '\x8f'), |
||||
('EUC_TW', 'CS3, short', '\x8f02'), |
||||
('EUC_TW', 'CS3', '\x8f0203'), |
||||
-- UTF8 |
||||
-- 2 c0..df |
||||
-- 3 e0..ef |
||||
-- 4 f0..f7 (but maximum real codepoint U+10ffff has f4) |
||||
-- 5 f8..fb (not supported) |
||||
-- 6 fc..fd (not supported) |
||||
('UTF8', 'ASCII', 'a'), |
||||
('UTF8', '2 byte, short', '\xdf'), |
||||
('UTF8', '2 byte', '\xdf82'), |
||||
('UTF8', '3 byte, short', '\xef'), |
||||
('UTF8', '3 byte, short', '\xef82'), |
||||
('UTF8', '3 byte', '\xef8283'), |
||||
('UTF8', '4 byte, short', '\xf7'), |
||||
('UTF8', '4 byte, short', '\xf782'), |
||||
('UTF8', '4 byte, short', '\xf78283'), |
||||
('UTF8', '4 byte', '\xf7828384'), |
||||
('UTF8', '5 byte, unsupported', '\xfb'), |
||||
('UTF8', '5 byte, unsupported', '\xfb82'), |
||||
('UTF8', '5 byte, unsupported', '\xfb8283'), |
||||
('UTF8', '5 byte, unsupported', '\xfb828384'), |
||||
('UTF8', '5 byte, unsupported', '\xfb82838485'), |
||||
('UTF8', '6 byte, unsupported', '\xfd'), |
||||
('UTF8', '6 byte, unsupported', '\xfd82'), |
||||
('UTF8', '6 byte, unsupported', '\xfd8283'), |
||||
('UTF8', '6 byte, unsupported', '\xfd828384'), |
||||
('UTF8', '6 byte, unsupported', '\xfd82838485'), |
||||
('UTF8', '6 byte, unsupported', '\xfd8283848586'), |
||||
-- MULE_INTERNAL |
||||
-- 2 81..8d LC1 |
||||
-- 3 90..99 LC2 |
||||
('MULE_INTERNAL', 'ASCII', 'a'), |
||||
('MULE_INTERNAL', 'LC1, short', '\x81'), |
||||
('MULE_INTERNAL', 'LC1', '\x8182'), |
||||
('MULE_INTERNAL', 'LC2, short', '\x90'), |
||||
('MULE_INTERNAL', 'LC2, short', '\x9082'), |
||||
('MULE_INTERNAL', 'LC2', '\x908283'); |
||||
SELECT COUNT(test_encoding(encoding, description, input)) > 0 |
||||
FROM encoding_tests; |
||||
NOTICE: LATIN1 ASCII: \x61 -> {97} -> \x61 = OK |
||||
NOTICE: LATIN1 extended: \xe9 -> {233} -> \xe9 = OK |
||||
NOTICE: EUC_JP ASCII: \x61 -> {97} -> \x61 = OK |
||||
NOTICE: EUC_JP CS1, short: \x80 -> {} -> \x = truncated |
||||
NOTICE: EUC_JP CS1: \x8002 -> {32770} -> \x8002 = OK |
||||
NOTICE: EUC_JP CS2, short: \x8e -> {} -> \x = truncated |
||||
NOTICE: EUC_JP CS2: \x8e02 -> {36354} -> \x8e02 = OK |
||||
NOTICE: EUC_JP CS3, short: \x8f -> {} -> \x = truncated |
||||
NOTICE: EUC_JP CS3, short: \x8f02 -> {} -> \x = truncated |
||||
NOTICE: EUC_JP CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK |
||||
NOTICE: EUC_CN ASCII: \x61 -> {97} -> \x61 = OK |
||||
NOTICE: EUC_CN CS1, short: \x80 -> {} -> \x = truncated |
||||
NOTICE: EUC_CN CS1: \x8002 -> {32770} -> \x8002 = OK |
||||
NOTICE: EUC_CN CS2, short: \x8e -> {} -> \x = truncated |
||||
NOTICE: EUC_CN CS2, short: \x8e02 -> {} -> \x = truncated |
||||
NOTICE: EUC_CN CS2: \x8e0203 -> {9306627} -> \x8e0203 = OK |
||||
NOTICE: EUC_CN CS3, short: \x8f -> {} -> \x = truncated |
||||
NOTICE: EUC_CN CS3, short: \x8f02 -> {} -> \x = truncated |
||||
NOTICE: EUC_CN CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK |
||||
NOTICE: EUC_TW ASCII: \x61 -> {97} -> \x61 = OK |
||||
NOTICE: EUC_TW CS1, short: \x80 -> {} -> \x = truncated |
||||
NOTICE: EUC_TW CS1: \x8002 -> {32770} -> \x8002 = OK |
||||
NOTICE: EUC_TW CS2, short: \x8e -> {} -> \x = truncated |
||||
NOTICE: EUC_TW CS2, short: \x8e02 -> {} -> \x = truncated |
||||
NOTICE: EUC_TW CS2, short: \x8e0203 -> {} -> \x = truncated |
||||
NOTICE: EUC_TW CS2: \x8e020304 -> {-1912470780} -> \x8e020304 = OK |
||||
NOTICE: EUC_TW CS3, short: \x8f -> {} -> \x = truncated |
||||
NOTICE: EUC_TW CS3, short: \x8f02 -> {} -> \x = truncated |
||||
NOTICE: EUC_TW CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK |
||||
NOTICE: UTF8 ASCII: \x61 -> {97} -> \x61 = OK |
||||
NOTICE: UTF8 2 byte, short: \xdf -> {} -> \x = truncated |
||||
NOTICE: UTF8 2 byte: \xdf82 -> {1986} -> \xdf82 = OK |
||||
NOTICE: UTF8 3 byte, short: \xef -> {} -> \x = truncated |
||||
NOTICE: UTF8 3 byte, short: \xef82 -> {} -> \x = truncated |
||||
NOTICE: UTF8 3 byte: \xef8283 -> {61571} -> \xef8283 = OK |
||||
NOTICE: UTF8 4 byte, short: \xf7 -> {} -> \x = truncated |
||||
NOTICE: UTF8 4 byte, short: \xf782 -> {} -> \x = truncated |
||||
NOTICE: UTF8 4 byte, short: \xf78283 -> {} -> \x = truncated |
||||
NOTICE: UTF8 4 byte: \xf7828384 -> {1843396} -> \xf7828384 = OK |
||||
NOTICE: UTF8 5 byte, unsupported: \xfb -> {251} -> \xc3bb = failed |
||||
NOTICE: UTF8 5 byte, unsupported: \xfb82 -> {251,130} -> \xc3bbc282 = failed |
||||
NOTICE: UTF8 5 byte, unsupported: \xfb8283 -> {251,130,131} -> \xc3bbc282c283 = failed |
||||
NOTICE: UTF8 5 byte, unsupported: \xfb828384 -> {251,130,131,132} -> \xc3bbc282c283c284 = failed |
||||
NOTICE: UTF8 5 byte, unsupported: \xfb82838485 -> {251,130,131,132,133} -> \xc3bbc282c283c284c285 = failed |
||||
NOTICE: UTF8 6 byte, unsupported: \xfd -> {253} -> \xc3bd = failed |
||||
NOTICE: UTF8 6 byte, unsupported: \xfd82 -> {253,130} -> \xc3bdc282 = failed |
||||
NOTICE: UTF8 6 byte, unsupported: \xfd8283 -> {253,130,131} -> \xc3bdc282c283 = failed |
||||
NOTICE: UTF8 6 byte, unsupported: \xfd828384 -> {253,130,131,132} -> \xc3bdc282c283c284 = failed |
||||
NOTICE: UTF8 6 byte, unsupported: \xfd82838485 -> {253,130,131,132,133} -> \xc3bdc282c283c284c285 = failed |
||||
NOTICE: UTF8 6 byte, unsupported: \xfd8283848586 -> {253,130,131,132,133,134} -> \xc3bdc282c283c284c285c286 = failed |
||||
NOTICE: MULE_INTERNAL ASCII: \x61 -> {97} -> \x61 = OK |
||||
NOTICE: MULE_INTERNAL LC1, short: \x81 -> {} -> \x = truncated |
||||
NOTICE: MULE_INTERNAL LC1: \x8182 -> {8454274} -> \x8182 = OK |
||||
NOTICE: MULE_INTERNAL LC2, short: \x90 -> {} -> \x = truncated |
||||
NOTICE: MULE_INTERNAL LC2, short: \x9082 -> {} -> \x = truncated |
||||
NOTICE: MULE_INTERNAL LC2: \x908283 -> {9470595} -> \x908283 = OK |
||||
?column? |
||||
---------- |
||||
t |
||||
(1 row) |
||||
|
||||
DROP TABLE encoding_tests; |
||||
DROP FUNCTION test_encoding; |
||||
DROP FUNCTION test_text_to_wchars; |
||||
DROP FUNCTION test_mblen_func; |
||||
DROP FUNCTION test_bytea_to_text; |
||||
DROP FUNCTION test_text_to_bytea; |
||||
-- substring slow path: multi-byte escape char vs. multi-byte pattern char. |
||||
SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7'); |
||||
substring |
||||
----------- |
||||
|
||||
(1 row) |
||||
|
||||
-- Levenshtein distance metric: exercise character length cache. |
||||
SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); |
||||
ERROR: column "real§_name" does not exist |
||||
LINE 1: SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); |
||||
^ |
||||
HINT: Perhaps you meant to reference the column "x.real_name". |
||||
-- JSON errcontext: truncate long data. |
||||
SELECT repeat(U&'\00A7', 30)::json; |
||||
ERROR: invalid input syntax for type json |
||||
DETAIL: Token "§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§" is invalid. |
||||
CONTEXT: JSON data, line 1: ...§§§§§§§§§§§§§§§§§§§§§§§§ |
||||
@ -0,0 +1,4 @@ |
||||
/* skip test if not UTF8 server encoding */ |
||||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset |
||||
\if :skip_test |
||||
\quit |
||||
@ -0,0 +1,16 @@ |
||||
-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent |
||||
-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all |
||||
-- of EUC_KR, also run the test in UTF8. |
||||
SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset |
||||
\if :skip_test |
||||
\quit |
||||
\endif |
||||
-- Exercise is_multibyte_char_in_char (non-UTF8) slow path. |
||||
SELECT POSITION( |
||||
convert_from('\xbcf6c7d0', 'EUC_KR') IN |
||||
convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR')); |
||||
position |
||||
---------- |
||||
5 |
||||
(1 row) |
||||
|
||||
@ -0,0 +1,6 @@ |
||||
-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent |
||||
-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all |
||||
-- of EUC_KR, also run the test in UTF8. |
||||
SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset |
||||
\if :skip_test |
||||
\quit |
||||
@ -0,0 +1,228 @@ |
||||
/* This script is only run when the server encoding is UTF8. */
SELECT NOT (getdatabaseencoding() = 'UTF8') AS skip_test \gset
\if :skip_test
    \quit
\endif

-- Assemble the path of the regress shared library from the PG_LIBDIR and
-- PG_DLSUFFIX environment variables.
\getenv libdir PG_LIBDIR
\getenv dlsuffix PG_DLSUFFIX
\set regresslib :libdir '/regress' :dlsuffix

-- C-language helper functions, all loaded from that library.
CREATE FUNCTION test_bytea_to_text(bytea)
    RETURNS text
    LANGUAGE C STRICT
    AS :'regresslib';

CREATE FUNCTION test_text_to_bytea(text)
    RETURNS bytea
    LANGUAGE C STRICT
    AS :'regresslib';

CREATE FUNCTION test_mblen_func(text, text, text, int)
    RETURNS int
    LANGUAGE C STRICT
    AS :'regresslib';

CREATE FUNCTION test_text_to_wchars(text, text)
    RETURNS int[]
    LANGUAGE C STRICT
    AS :'regresslib';

CREATE FUNCTION test_wchars_to_text(text, int[])
    RETURNS text
    LANGUAGE C STRICT
    AS :'regresslib';

CREATE FUNCTION test_valid_server_encoding(text)
    RETURNS boolean
    LANGUAGE C STRICT
    AS :'regresslib';
||||
|
||||
|
||||
-- Fixture: a single row holding one well-formed string and three deliberately
-- corrupted ones.  The corrupted values are assembled from raw bytes with
-- test_bytea_to_text():
--   good                'café'
--   truncated           'caf' followed by the lone byte 0xc3
--   with_nul            'café', then a 0x00 byte, then 'dcba'
--   truncated_with_nul  'caf', then the bytes 0xc3 0x00, then 'dcba'
CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text);

-- Name the target columns explicitly so each value is visibly tied to the
-- column it populates instead of relying on declaration order.
INSERT INTO regress_encoding (good, truncated, with_nul, truncated_with_nul)
    VALUES ('café',
            'caf' || test_bytea_to_text('\xc3'),
            'café' || test_bytea_to_text('\x00') || 'dcba',
            'caf' || test_bytea_to_text('\xc300') || 'dcba');
||||
|
||||
-- Show the fixture values as the client receives them.
SELECT good, truncated, with_nul
  FROM regress_encoding;

-- Baseline: character-aware functions applied to the well-formed value.
SELECT length(good)
  FROM regress_encoding;
SELECT substring(good, 3, 1)
  FROM regress_encoding;
SELECT substring(good, 4, 1)
  FROM regress_encoding;
SELECT regexp_replace(good, '^caf(.)$', '\1')
  FROM regress_encoding;
SELECT reverse(good)
  FROM regress_encoding;

-- A multibyte character cut short at the end of the string is reported as an
-- error by these functions ...
SELECT length(truncated)
  FROM regress_encoding;
SELECT substring(truncated, 1, 1)
  FROM regress_encoding;
SELECT reverse(truncated)
  FROM regress_encoding;
-- ... whereas here the incomplete character is silently dropped.
SELECT regexp_replace(truncated, '^caf(.)$', '\1')
  FROM regress_encoding;

-- PostgreSQL doesn't allow strings to contain NUL.  When a corrupted string
-- nevertheless has a NUL at a character boundary, it is an implementation
-- detail whether a given function treats it as a terminator or as a character.

-- Functions for which the NUL acts as a terminator.
SELECT length(with_nul)
  FROM regress_encoding;
SELECT substring(with_nul, 3, 1)
  FROM regress_encoding;
SELECT substring(with_nul, 4, 1)
  FROM regress_encoding;
SELECT substring(with_nul, 5, 1)
  FROM regress_encoding;
SELECT convert_to(substring(with_nul, 5, 1), 'UTF8')
  FROM regress_encoding;
SELECT regexp_replace(with_nul, '^caf(.)$', '\1')
  FROM regress_encoding;
-- Functions for which the NUL acts as an ordinary character.
SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul))
  FROM regress_encoding;

-- A NUL among the tail bytes of a multibyte character is invalid in all
-- encodings.  For length purposes it is counted as part of that character;
-- only code paths that convert or verify encodings raise an error.

SELECT length(truncated_with_nul)
  FROM regress_encoding;
SELECT substring(truncated_with_nul, 3, 1)
  FROM regress_encoding;
SELECT substring(truncated_with_nul, 4, 1)
  FROM regress_encoding;
SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8')
  FROM regress_encoding;
SELECT substring(truncated_with_nul, 5, 1)
  FROM regress_encoding;
SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300')
  FROM regress_encoding;
SELECT reverse(truncated_with_nul)
  FROM regress_encoding;

-- Unbounded variant: the reported sequence length would overrun the string!
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3)
  FROM regress_encoding;

-- The length/range variants detect that condition.
SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3)
  FROM regress_encoding;
SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3)
  FROM regress_encoding;

-- Unbounded variants again: the sequence would overrun the string, if the
-- terminator were really the end of it.
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3)
  FROM regress_encoding;
SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3)
  FROM regress_encoding;

-- The cstr variant detects that condition.
SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3)
  FROM regress_encoding;

-- The fixture is not needed past this point.
DROP TABLE regress_encoding;
||||
|
||||
-- mb<->wchar conversions |
||||
-- test_encoding(encoding, description, input)
--
-- Round-trips "input" through the wide-character conversion helpers for the
-- named encoding (test_text_to_wchars, then test_wchars_to_text) and reports
-- the outcome in a NOTICE:
--   OK         the round trip reproduced the input bytes exactly
--   truncated  the round trip produced a strictly shorter prefix of the input
--   failed     anything else
-- Nothing is reported when test_valid_server_encoding() returns false for
-- the encoding.  The function itself returns no value.
CREATE FUNCTION test_encoding(encoding text, description text, input bytea)
RETURNS VOID LANGUAGE plpgsql AS
$$
DECLARE
    prefix text;
    wchars int[];
    round_trip bytea;
    result text;
BEGIN
    -- Fixed-width label so that the NOTICE lines line up.
    prefix := rpad(encoding || ' ' || description || ':', 28);

    -- XXX could also test validation, length functions and include client
    -- only encodings with these test cases

    IF test_valid_server_encoding(encoding) THEN
        wchars := test_text_to_wchars(encoding, test_bytea_to_text(input));
        round_trip := test_text_to_bytea(test_wchars_to_text(encoding, wchars));
        IF input = round_trip THEN
            result := 'OK';
        ELSIF length(input) > length(round_trip) AND round_trip = substr(input, 1, length(round_trip)) THEN
            result := 'truncated';
        ELSE
            result := 'failed';
        END IF;
        RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result;
    END IF;
END;
$$;
||||
-- Test cases for test_encoding(): one row per (encoding, description, input
-- bytes) combination.
--
-- No validation is done on the encoding itself, just the length to avoid
-- overruns, so some of the byte sequences below are bogus.  They cover
-- all code branches, server encodings only for now.
--
-- In the per-encoding notes, the first number is the sequence length and the
-- value after it is the lead byte (or lead byte range) that selects it.
CREATE TABLE encoding_tests (encoding text, description text, input bytea);

-- The column list is spelled out so that each value is visibly tied to its
-- column rather than to the table's declaration order.
INSERT INTO encoding_tests (encoding, description, input) VALUES
    -- LATIN1, other single-byte encodings
    ('LATIN1', 'ASCII', 'a'),
    ('LATIN1', 'extended', '\xe9'),
    -- EUC_JP, EUC_JIS_2004, EUC_KR (for the purposes of wchar conversion):
    -- 2 8e (CS2, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
    -- 3 8f (CS3, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
    -- 2 80..ff (CS1)
    ('EUC_JP', 'ASCII', 'a'),
    ('EUC_JP', 'CS1, short', '\x80'),
    ('EUC_JP', 'CS1', '\x8002'),
    ('EUC_JP', 'CS2, short', '\x8e'),
    ('EUC_JP', 'CS2', '\x8e02'),
    ('EUC_JP', 'CS3, short', '\x8f'),
    ('EUC_JP', 'CS3, short', '\x8f02'),
    ('EUC_JP', 'CS3', '\x8f0203'),
    -- EUC_CN
    -- 3 8e (CS2, not used but arbitrarily considered to have length 3)
    -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
    -- 2 80..ff (CS1)
    ('EUC_CN', 'ASCII', 'a'),
    ('EUC_CN', 'CS1, short', '\x80'),
    ('EUC_CN', 'CS1', '\x8002'),
    ('EUC_CN', 'CS2, short', '\x8e'),
    ('EUC_CN', 'CS2, short', '\x8e02'),
    ('EUC_CN', 'CS2', '\x8e0203'),
    ('EUC_CN', 'CS3, short', '\x8f'),
    ('EUC_CN', 'CS3, short', '\x8f02'),
    ('EUC_CN', 'CS3', '\x8f0203'),
    -- EUC_TW:
    -- 4 8e (CS2)
    -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
    -- 2 80..ff (CS1)
    ('EUC_TW', 'ASCII', 'a'),
    ('EUC_TW', 'CS1, short', '\x80'),
    ('EUC_TW', 'CS1', '\x8002'),
    ('EUC_TW', 'CS2, short', '\x8e'),
    ('EUC_TW', 'CS2, short', '\x8e02'),
    ('EUC_TW', 'CS2, short', '\x8e0203'),
    ('EUC_TW', 'CS2', '\x8e020304'),
    ('EUC_TW', 'CS3, short', '\x8f'),
    ('EUC_TW', 'CS3, short', '\x8f02'),
    ('EUC_TW', 'CS3', '\x8f0203'),
    -- UTF8
    -- 2 c0..df
    -- 3 e0..ef
    -- 4 f0..f7 (but maximum real codepoint U+10ffff has f4)
    -- 5 f8..fb (not supported)
    -- 6 fc..fd (not supported)
    ('UTF8', 'ASCII', 'a'),
    ('UTF8', '2 byte, short', '\xdf'),
    ('UTF8', '2 byte', '\xdf82'),
    ('UTF8', '3 byte, short', '\xef'),
    ('UTF8', '3 byte, short', '\xef82'),
    ('UTF8', '3 byte', '\xef8283'),
    ('UTF8', '4 byte, short', '\xf7'),
    ('UTF8', '4 byte, short', '\xf782'),
    ('UTF8', '4 byte, short', '\xf78283'),
    ('UTF8', '4 byte', '\xf7828384'),
    ('UTF8', '5 byte, unsupported', '\xfb'),
    ('UTF8', '5 byte, unsupported', '\xfb82'),
    ('UTF8', '5 byte, unsupported', '\xfb8283'),
    ('UTF8', '5 byte, unsupported', '\xfb828384'),
    ('UTF8', '5 byte, unsupported', '\xfb82838485'),
    ('UTF8', '6 byte, unsupported', '\xfd'),
    ('UTF8', '6 byte, unsupported', '\xfd82'),
    ('UTF8', '6 byte, unsupported', '\xfd8283'),
    ('UTF8', '6 byte, unsupported', '\xfd828384'),
    ('UTF8', '6 byte, unsupported', '\xfd82838485'),
    ('UTF8', '6 byte, unsupported', '\xfd8283848586'),
    -- MULE_INTERNAL
    -- 2 81..8d LC1
    -- 3 90..99 LC2
    ('MULE_INTERNAL', 'ASCII', 'a'),
    ('MULE_INTERNAL', 'LC1, short', '\x81'),
    ('MULE_INTERNAL', 'LC1', '\x8182'),
    ('MULE_INTERNAL', 'LC2, short', '\x90'),
    ('MULE_INTERNAL', 'LC2, short', '\x9082'),
    ('MULE_INTERNAL', 'LC2', '\x908283');
||||
|
||||
-- Run test_encoding() over every test case.  The per-case outcome is reported
-- by the function itself through NOTICE messages; the aggregate only serves
-- to evaluate the function once per row and collapse the result to one line.
SELECT COUNT(test_encoding(encoding, description, input)) > 0
FROM encoding_tests;

-- Clean up.  All six C helper functions declared at the top of this script
-- are dropped here, including test_wchars_to_text and
-- test_valid_server_encoding, which were previously left behind.
DROP TABLE encoding_tests;
DROP FUNCTION test_encoding;
DROP FUNCTION test_text_to_wchars;
DROP FUNCTION test_wchars_to_text;
DROP FUNCTION test_valid_server_encoding;
DROP FUNCTION test_mblen_func;
DROP FUNCTION test_bytea_to_text;
DROP FUNCTION test_text_to_bytea;
||||
|
||||
|
||||
-- substring slow path: multi-byte escape char vs. multi-byte pattern char. |
-- The subject string is plain ASCII, while both the pattern (U+00AC) and the
-- escape character (U+00A7) are non-ASCII.
||||
SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7'); |
||||
-- Levenshtein distance metric: exercise character length cache. |
-- The column reference deliberately differs from the real column name
-- (real_name) by an inserted U+00A7, so this statement is expected to fail.
-- NOTE(review): presumably the "perhaps you meant" hint lookup is what
-- reaches the Levenshtein code -- confirm.
||||
SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); |
||||
-- JSON errcontext: truncate long data. |
-- Thirty repetitions of U+00A7 are not valid JSON, so the cast is expected to
-- raise an error; the point of interest is how the long input is shortened in
-- the error context.
||||
SELECT repeat(U&'\00A7', 30)::json; |
||||
@ -0,0 +1,12 @@ |
||||
-- EUC_KR was chosen for this test as perhaps the most prevalent multibyte
-- encoding other than UTF8, as of 2026-01.  UTF8 can represent all of EUC_KR,
-- so the test runs in UTF8 databases as well and is skipped everywhere else.
SELECT NOT (getdatabaseencoding() IN ('EUC_KR', 'UTF8')) AS skip_test \gset
\if :skip_test
    \quit
\endif

-- Exercise is_multibyte_char_in_char (non-UTF8) slow path.
SELECT POSITION(convert_from('\xbcf6c7d0', 'EUC_KR')
                IN convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR'));
||||
Loading…
Reference in new issue