diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index dbecd7160d6..d8ea7f45bcc 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -133,6 +133,7 @@ static text *text_substring(Datum str, int32 start, int32 length, bool length_not_specified); +static int pg_mbcharcliplen_chars(const char *mbstr, int len, int limit); static text *text_overlay(text *t1, text *t2, int sp, int sl); static int text_position(text *t1, text *t2, Oid collid); static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state); @@ -586,7 +587,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) int32 S = start; /* start position */ int32 S1; /* adjusted start position */ int32 L1; /* adjusted substring length */ - int32 E; /* end position */ + int32 E; /* end position, exclusive */ /* * SQL99 says S can be zero or negative (which we don't document), but we @@ -684,11 +685,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) else { /* - * A zero or negative value for the end position can happen if the - * start was negative or one. SQL99 says to return a zero-length - * string. + * Ending at position 1, exclusive, obviously yields an empty + * string. A zero or negative value can happen if the start was + * negative or one. SQL99 says to return a zero-length string. */ - if (E < 1) + if (E <= 1) return cstring_to_text(""); /* @@ -698,11 +699,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) L1 = E - S1; /* - * Total slice size in bytes can't be any longer than the start - * position plus substring length times the encoding max length. - * If that overflows, we can just use -1. + * Total slice size in bytes can't be any longer than the + * inclusive end position times the encoding max length. If that + * overflows, we can just use -1. */ - if (pg_mul_s32_overflow(E, eml, &slice_size)) + if (pg_mul_s32_overflow(E - 1, eml, &slice_size)) slice_size = -1; } @@ -725,9 +726,17 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) return cstring_to_text(""); } - /* Now we can get the actual length of the slice in MB characters */ - slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice), - slice_len); + /* + * Now we can get the actual length of the slice in MB characters, + * stopping at the end of the substring. Continuing beyond the + * substring end could find an incomplete character attributable + * solely to DatumGetTextPSlice() chopping in the middle of a + * character, and it would be superfluous work at best. + */ + slice_strlen = + (slice_size == -1 ? + pg_mbstrlen_with_len(VARDATA_ANY(slice), slice_len) : + pg_mbcharcliplen_chars(VARDATA_ANY(slice), slice_len, E - 1)); /* * Check that the start position wasn't > slice_strlen. If so, SQL99 @@ -782,6 +791,35 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) return NULL; } +/* + * pg_mbcharcliplen_chars - + * Mirror pg_mbcharcliplen(), except return value unit is chars, not bytes. + * + * This mirrors all the dubious historical behavior, so it's static to + * discourage proliferation. The assertions are specific to the one caller. + */ +static int +pg_mbcharcliplen_chars(const char *mbstr, int len, int limit) +{ + int nch = 0; + int l; + + Assert(len > 0); + Assert(limit > 0); + Assert(pg_database_encoding_max_length() > 1); + + while (len > 0 && *mbstr) + { + l = pg_mblen_with_len(mbstr, len); + nch++; + if (nch == limit) + break; + len -= l; + mbstr += l; + } + return nch; +} + /* * textoverlay * Replace specified substring of first string with second diff --git a/src/test/regress/expected/encoding.out b/src/test/regress/expected/encoding.out index ea1f38cff41..cac1cb74782 100644 --- a/src/test/regress/expected/encoding.out +++ b/src/test/regress/expected/encoding.out @@ -63,7 +63,13 @@ SELECT reverse(good) FROM regress_encoding; -- invalid short mb character = error SELECT length(truncated) FROM regress_encoding; ERROR: invalid byte sequence for encoding "UTF8": 0xc3 -SELECT substring(truncated, 1, 1) FROM regress_encoding; +SELECT substring(truncated, 1, 3) FROM regress_encoding; + substring +----------- + caf +(1 row) + +SELECT substring(truncated, 1, 4) FROM regress_encoding; ERROR: invalid byte sequence for encoding "UTF8": 0xc3 SELECT reverse(truncated) FROM regress_encoding; ERROR: invalid byte sequence for encoding "UTF8": 0xc3 @@ -375,7 +381,43 @@ NOTICE: MULE_INTERNAL LC2: \x908283 -> {9470595} -> \x908283 = OK t (1 row) +-- substring fetches a slice of a toasted value; unused tail of that slice is +-- an incomplete char (bug #19406) +CREATE TABLE toast_3b_utf8 (c text); +INSERT INTO toast_3b_utf8 VALUES (repeat(U&'\2026', 4000)); +SELECT SUBSTRING(c FROM 1 FOR 1) FROM toast_3b_utf8; + substring +----------- + … +(1 row) + +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; + substring +----------- + +(1 row) + +-- diagnose incomplete char iff within the substring +UPDATE toast_3b_utf8 SET c = c || test_bytea_to_text('\xe280'); +SELECT SUBSTRING(c FROM 4000 FOR 1) FROM toast_3b_utf8; + substring +----------- + … +(1 row) + +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; +ERROR: invalid byte sequence for encoding "UTF8": 0xe2 0x80 +-- substring needing last byte of its slice_size +ALTER TABLE toast_3b_utf8 RENAME TO toast_4b_utf8; +UPDATE toast_4b_utf8 SET c = repeat(U&'\+01F680', 3000); +SELECT SUBSTRING(c FROM 3000 FOR 1) FROM toast_4b_utf8; + substring +----------- + 🚀 +(1 row) + DROP TABLE encoding_tests; +DROP TABLE toast_4b_utf8; DROP FUNCTION test_encoding; DROP FUNCTION test_text_to_wchars; DROP FUNCTION test_mblen_func; diff --git a/src/test/regress/sql/encoding.sql b/src/test/regress/sql/encoding.sql index b9543c0cb32..782f90f0d62 100644 --- a/src/test/regress/sql/encoding.sql +++ b/src/test/regress/sql/encoding.sql @@ -40,7 +40,8 @@ SELECT reverse(good) FROM regress_encoding; -- invalid short mb character = error SELECT length(truncated) FROM regress_encoding; -SELECT substring(truncated, 1, 1) FROM regress_encoding; +SELECT substring(truncated, 1, 3) FROM regress_encoding; +SELECT substring(truncated, 1, 4) FROM regress_encoding; SELECT reverse(truncated) FROM regress_encoding; -- invalid short mb character = silently dropped SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; @@ -212,7 +213,23 @@ INSERT INTO encoding_tests VALUES SELECT COUNT(test_encoding(encoding, description, input)) > 0 FROM encoding_tests; +-- substring fetches a slice of a toasted value; unused tail of that slice is +-- an incomplete char (bug #19406) +CREATE TABLE toast_3b_utf8 (c text); +INSERT INTO toast_3b_utf8 VALUES (repeat(U&'\2026', 4000)); +SELECT SUBSTRING(c FROM 1 FOR 1) FROM toast_3b_utf8; +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; +-- diagnose incomplete char iff within the substring +UPDATE toast_3b_utf8 SET c = c || test_bytea_to_text('\xe280'); +SELECT SUBSTRING(c FROM 4000 FOR 1) FROM toast_3b_utf8; +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; +-- substring needing last byte of its slice_size +ALTER TABLE toast_3b_utf8 RENAME TO toast_4b_utf8; +UPDATE toast_4b_utf8 SET c = repeat(U&'\+01F680', 3000); +SELECT SUBSTRING(c FROM 3000 FOR 1) FROM toast_4b_utf8; + DROP TABLE encoding_tests; +DROP TABLE toast_4b_utf8; DROP FUNCTION test_encoding; DROP FUNCTION test_text_to_wchars; DROP FUNCTION test_mblen_func;