Further improvement of make_greater_string.

Make sure that it considers all the possibilities that the old code did, rather than trying only one possibility per character position. To keep the runtime in bounds, tweak the character incrementers so that they do not try every possible multibyte character code. Remove unnecessary logic to restore the old character value on failure. Additional comment and formatting cleanup.
14 years ago · eb5834d5af
parent fae54e4a16
commit eb5834d5af
2 changed files with 183 additions and 188 deletions
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -5701,13 +5701,23 @@ byte_increment(unsigned char *ptr, int len)
 * and "9" is seen as largest by the collation, and append that to the given
 * prefix before trying to find a string that compares as larger.
 *
- * If we max out the righthand byte, truncate off the last character
- * and start incrementing the next.  For example, if "z" were the last
- * character in the sort order, then we could produce "foo" as a
- * string greater than "fonz".
+ * To search for a greater string, we repeatedly "increment" the rightmost
+ * character, using an encoding-specific character incrementer function.
+ * When it's no longer possible to increment the last character, we truncate
+ * off that character and start incrementing the next-to-rightmost.
+ * For example, if "z" were the last character in the sort order, then we
+ * could produce "foo" as a string greater than "fonz".
 *
 * This could be rather slow in the worst case, but in most cases we
 * won't have to try more than one or two strings before succeeding.
+ *
+ * Note that it's important for the character incrementer not to be too anal
+ * about producing every possible character code, since in some cases the only
+ * way to get a larger string is to increment a previous character position.
+ * So we don't want to spend too much time trying every possible character
+ * code at the last position.  A good rule of thumb is to be sure that we
+ * don't try more than 256*K values for a K-byte character (and definitely
+ * not 256^K, which is what an exhaustive search would approach).
 */
 Const *
 make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
@@ -5779,17 +5789,19 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
 		}
 	}

+	/* Select appropriate character-incrementer function */
 	if (datatype == BYTEAOID)
-		charinc = &byte_increment;
+		charinc = byte_increment;
 	else
 		charinc = pg_database_encoding_character_incrementer();

+	/* And search ... */
 	while (len > 0)
 	{
-		int		charlen;
+		int			charlen;
 		unsigned char *lastchar;
-		Const	   *workstr_const;

+		/* Identify the last character --- for bytea, just the last byte */
 		if (datatype == BYTEAOID)
 			charlen = 1;
 		else
@@ -5799,9 +5811,15 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
 		/*
 		 * Try to generate a larger string by incrementing the last character
 		 * (for BYTEA, we treat each byte as a character).
+		 *
+		 * Note: the incrementer function is expected to return true if it's
+		 * generated a valid-per-the-encoding new character, otherwise false.
+		 * The contents of the character on false return are unspecified.
 		 */
-		if (charinc(lastchar, charlen))
+		while (charinc(lastchar, charlen))
 		{
+			Const	   *workstr_const;
+
 			if (datatype == BYTEAOID)
 				workstr_const = string_to_bytea_const(workstr, len);
 			else
@@ -5825,7 +5843,8 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
 		}

 		/*
-		 * Truncate off the last character or byte.
+		 * No luck here, so truncate off the last character and try to
+		 * increment the next one.
 		 */
 		len -= charlen;
 		workstr[len] = '\0';
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -1337,85 +1337,78 @@ pg_utf8_islegal(const unsigned char *source, int length)
 #ifndef FRONTEND

 /*
- * Generic character increment function.
+ * Generic character incrementer function.
 *
 * Not knowing anything about the properties of the encoding in use, we just
- * keep incrementing the last byte until pg_verifymbstr() likes the result,
- * or we run out of values to try.
- *
- * Like all character-increment functions, we must restore the original input
- * string on failure.
+ * keep incrementing the last byte until we get a validly-encoded result,
+ * or we run out of values to try.  We don't bother to try incrementing
+ * higher-order bytes, so there's no growth in runtime for wider characters.
+ * (If we did try to do that, we'd need to consider the likelihood that 255
+ * is not a valid final byte in the encoding.)
 */
 static bool
 pg_generic_charinc(unsigned char *charptr, int len)
 {
- 	unsigned char *lastchar = (unsigned char *) (charptr + len - 1);
- 	unsigned char savelastchar = *lastchar;
- 	const char *const_charptr = (const char *)charptr;
- 
- 	while (*lastchar < (unsigned char) 255)
- 	{
- 		(*lastchar)++;
- 		if (!pg_verifymbstr(const_charptr, len, true))
- 			continue;
- 		return true;
- 	}
- 
- 	*lastchar = savelastchar;
- 	return false;
+	unsigned char *lastbyte = charptr + len - 1;
+	mbverifier	mbverify;
+
+	/* We can just invoke the character verifier directly. */
+	mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
+
+	while (*lastbyte < (unsigned char) 255)
+	{
+		(*lastbyte)++;
+		if ((*mbverify) (charptr, len) == len)
+			return true;
+	}
+
+	return false;
 }

 /*
- * UTF-8 character increment function.
+ * UTF-8 character incrementer function.
 *
 * For a one-byte character less than 0x7F, we just increment the byte.
 *
 * For a multibyte character, every byte but the first must fall between 0x80
 * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
- * the last byte that's not already at its maximum value, and set any following
- * bytes back to 0x80.  If we can't find a byte that's less than the maximum
- * allowable vale, we simply fail.  We also have some special-case logic to
- * skip regions used for surrogate pair handling, as those should not occur in
- * valid UTF-8.
+ * the last byte that's not already at its maximum value.  If we can't find a
+ * byte that's less than the maximum allowable value, we simply fail.  We also
+ * need some special-case logic to skip regions used for surrogate pair
+ * handling, as those should not occur in valid UTF-8.
 *
- * Like all character-increment functions, we must restore the original input
- * string on failure.
+ * Note that we don't reset lower-order bytes back to their minimums, since
+ * we can't afford to make an exhaustive search (see make_greater_string).
 */
 static bool
 pg_utf8_increment(unsigned char *charptr, int length)
 {
- 	unsigned char a;
- 	unsigned char bak[4];
+	unsigned char a;
 	unsigned char limit;

- 	switch (length)
- 	{
- 		default:
- 			/* reject lengths 5 and 6 for now */
- 			return false;
- 		case 4:
-			bak[3] = charptr[3];
- 			a = charptr[3];
- 			if (a < 0xBF)
- 			{
- 				charptr[3]++;
- 				break;
- 			}
- 			charptr[3] = 0x80;
- 			/* FALL THRU */
- 		case 3:
-			bak[2] = charptr[2];
- 			a = charptr[2];
- 			if (a < 0xBF)
- 			{
- 				charptr[2]++;
- 				break;
- 			}
- 			charptr[2] = 0x80;
- 			/* FALL THRU */
- 		case 2:
-			bak[1] = charptr[1];
- 			a = charptr[1];
+	switch (length)
+	{
+		default:
+			/* reject lengths 5 and 6 for now */
+			return false;
+		case 4:
+			a = charptr[3];
+			if (a < 0xBF)
+			{
+				charptr[3]++;
+				break;
+			}
+			/* FALL THRU */
+		case 3:
+			a = charptr[2];
+			if (a < 0xBF)
+			{
+				charptr[2]++;
+				break;
+			}
+			/* FALL THRU */
+		case 2:
+			a = charptr[1];
 			switch (*charptr)
 			{
 				case 0xED:
@@ -1430,147 +1423,126 @@ pg_utf8_increment(unsigned char *charptr, int length)
 			}
 			if (a < limit)
 			{
- 				charptr[1]++;
- 				break;
- 			}
- 			charptr[1] = 0x80;
- 			/* FALL THRU */
- 		case 1:
-			bak[0] = *charptr;
- 			a = *charptr;
- 			if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
-			{
-				/* Restore original string. */
-				memcpy(charptr, bak, length);
- 				return false;
- 			}
- 			charptr[0]++;
- 			break;
- 	}
+				charptr[1]++;
+				break;
+			}
+			/* FALL THRU */
+		case 1:
+			a = *charptr;
+			if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
+				return false;
+			charptr[0]++;
+			break;
+	}

- 	return true;
+	return true;
 }

 /*
- * EUC-JP character increment function.
+ * EUC-JP character incrementer function.
 *
- * If the sequence starts with SS2(0x8e), it must be a two-byte sequence
- * representing JIS X 0201 characters with the second byte ranges between
- * 0xa1 and 0xde.  We just increment the last byte if it's less than 0xde,
- * and otherwise rewrite whole the sequence to 0xa1 0xa1.
+ * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
+ * representing JIS X 0201 characters with the second byte ranging between
+ * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
+ * and otherwise rewrite the whole sequence to 0xa1 0xa1.
 *
- * If the sequence starts with SS3(0x8f), it must be a three-byte sequence
- * which the last two bytes ranges between 0xa1 and 0xfe.  The last byte
- * is incremented, carrying overflow to the second-to-last byte.
+ * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
+ * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
+ * is incremented if possible, otherwise the second-to-last byte.
 *
- * If the sequence starts with the values other than the aboves and its MSB
+ * If the sequence starts with a value other than the above and its MSB
 * is set, it must be a two-byte sequence representing JIS X 0208 characters
- * with both bytes ranges between 0xa1 and 0xfe.  The last byte is incremented,
- * carrying overflow to the second-to-last byte.
+ * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
+ * incremented if possible, otherwise the second-to-last byte.
 *
- * Otherwise the sequence is consists of single byte representing ASCII
- * characters. It is incremented up to 0x7f.
- *    
- * Only three EUC-JP byte sequences shown below - which have no character
- * allocated - make this function to fail in spite of its validity: 0x7f,
- * 0xfe 0xfe, 0x8f 0xfe 0xfe.
+ * Otherwise, the sequence is a single-byte ASCII character. It is
+ * incremented up to 0x7f.
 */
 static bool
 pg_eucjp_increment(unsigned char *charptr, int length)
 {
- 	unsigned char bak[3];
- 	unsigned char c1, c2;
- 	signed int i;
+	unsigned char c1,
+				c2;
+	int			i;

- 	c1 = *charptr;
+	c1 = *charptr;

- 	switch (c1)
- 	{
- 		case SS2:	/* JIS X 0201 */
- 			if (length != 2)
+	switch (c1)
+	{
+		case SS2:				/* JIS X 0201 */
+			if (length != 2)
 				return false;

- 			c2 = charptr[1];
-
- 			if (c2 > 0xde)
- 				charptr[0] = charptr[1] = 0xa1;
- 			else if (c2 < 0xa1)
- 				charptr[1] = 0xa1;
- 			else
- 				charptr[1]++;
+			c2 = charptr[1];

- 			break;
+			if (c2 >= 0xdf)
+				charptr[0] = charptr[1] = 0xa1;
+			else if (c2 < 0xa1)
+				charptr[1] = 0xa1;
+			else
+				charptr[1]++;
+			break;

- 		case SS3:	/* JIS X 0212 */
- 			if (length != 3)
+		case SS3:				/* JIS X 0212 */
+			if (length != 3)
 				return false;

- 			for (i = 2; i > 0; i--)
- 			{
-				bak[i] = charptr[i];
- 				c2 = charptr[i];
- 				if (c2 < 0xa1)
- 				{
- 					charptr[i] = 0xa1;
- 					return true;
- 				}
- 				else if (c2 < 0xfe)
- 				{
- 					charptr[i]++;
- 					break;
- 				}
- 				charptr[i] = 0xa1;
- 			}
-
- 			if (i == 0)	  /* Out of 3-byte code region */
- 			{
-				charptr[1] = bak[1];
-				charptr[2] = bak[2];
- 				return false;
- 			}
- 			break;
-
- 		default:
- 			if (IS_HIGHBIT_SET(c1))	 /* JIS X 0208? */
- 			{
- 				if (length != 2)
+			for (i = 2; i > 0; i--)
+			{
+				c2 = charptr[i];
+				if (c2 < 0xa1)
+				{
+					charptr[i] = 0xa1;
+					return true;
+				}
+				else if (c2 < 0xfe)
+				{
+					charptr[i]++;
+					return true;
+				}
+			}
+
+			/* Out of 3-byte code region */
+			return false;
+
+		default:
+			if (IS_HIGHBIT_SET(c1))		/* JIS X 0208? */
+			{
+				if (length != 2)
+					return false;
+
+				for (i = 1; i >= 0; i--)
+				{
+					c2 = charptr[i];
+					if (c2 < 0xa1)
+					{
+						charptr[i] = 0xa1;
+						return true;
+					}
+					else if (c2 < 0xfe)
+					{
+						charptr[i]++;
+						return true;
+					}
+				}
+
+				/* Out of 2 byte code region */
+				return false;
+			}
+			else
+			{	/* ASCII, single byte */
+				if (c1 > 0x7e)
 					return false;
+				(*charptr)++;
+			}
+			break;
+	}

- 				for (i = 1 ; i >= 0 ; i--)	/* i must be signed */
- 				{
-					bak[i] = charptr[i];
- 					c2 = charptr[i];
- 					if (c2 < 0xa1)
- 					{
- 						charptr[i] = 0xa1;
- 						return true;
- 					}
- 					else if (c2 < 0xfe)
- 					{
- 						charptr[i]++;
- 						break;
- 					}
- 					charptr[i] = 0xa1;
- 				}
-
- 				if (i < 0)	/* Out of 2 byte code region */
- 				{
- 					charptr[0] = bak[0];
- 					charptr[1] = bak[1];
- 					return false;
- 				}
- 			}
- 			else
- 			{	/* ASCII, single byte */
- 				if (c1 > 0x7e)
- 					return false;
- 				(*charptr)++;
- 			}
- 	}
-
- 	return true;
+	return true;
 }
-#endif
+
+#endif /* !FRONTEND */
+

 /*
 *-------------------------------------------------------------------
@@ -1697,19 +1669,23 @@ pg_database_encoding_max_length(void)
 }

 /*
- * give the character incrementer for the encoding for the current database
+ * get the character incrementer for the encoding for the current database
 */
 mbcharacter_incrementer
 pg_database_encoding_character_incrementer(void)
 {
+	/*
+	 * Eventually it might be best to add a field to pg_wchar_table[],
+	 * but for now we just use a switch.
+	 */
 	switch (GetDatabaseEncoding())
 	{
 		case PG_UTF8:
 			return pg_utf8_increment;
-			
+
 		case PG_EUC_JP:
 			return pg_eucjp_increment;
-			
+
 		default:
 			return pg_generic_charinc;
 	}
@@ -1908,4 +1884,4 @@ report_untranslatable_char(int src_encoding, int dest_encoding,
 			 pg_enc2name_tbl[dest_encoding].name)));
 }

-#endif
+#endif /* !FRONTEND */