Optimization for lower(), upper(), casefold() functions.

Improve performance and reduce table sizes for case mapping. The main case mapping table stores only 16-bit offsets, which can be used to look up the mapped code point in any of the case tables (fold, lower, upper, or title case). Simple case pairs point to the same offsets. Generate a function in generate-unicode_case_table.pl that consists of a nested branches to test for specific codepoint ranges that determine the offset in the main table. Other approaches were considered, such as representing these ranges as another structure (rather than branches in a generated function), or a different approach such as a radix tree, or perfect hashing. The author implemented and tested these alternatives and settled on the generated branches. Author: Alexander Borisov <lex.borisov@gmail.com> Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi> Discussion: https://postgr.es/m/7cac7e66-9a3b-4e3f-a997-42aa0c401f80%40gmail.com
10 months ago · 27bdec0684
parent c3953226a0
commit 27bdec0684
3 changed files with 13671 additions and 3215 deletions
--- a/src/common/unicode/generate-unicode_case_table.pl
+++ b/src/common/unicode/generate-unicode_case_table.pl
@ -231,26 +231,24 @@ while (my $line = <$FH>)
 close $FH;

 # assign sequential array indexes to the special mappings
-my $special_idx = 0;
+# 0 is reserved for NULL
+my $special_idx = 1;
 foreach my $code (sort { $a <=> $b } (keys %special))
 {
 	$special{$code}{Index} = $special_idx++;
 }

+# determine size of array
+my $num_special = scalar(keys %special) + 1;
+
+die
+  "special case map contains $num_special entries which cannot be represented in uint8"
+  if ($num_special > 256);
+
 # Start writing out the output files
 open my $OT, '>', $output_table_file
  or die "Could not open output file $output_table_file: $!\n";

-# determine size of array given that codepoints <= 0x80 are dense and
-# the rest of the entries are sparse
-my $num_simple = 0x80;
-foreach my $code (sort { $a <=> $b } (keys %simple))
-{
-	$num_simple++ unless $code < 0x80;
-}
-
-my $num_special = scalar(keys %special) + 1;
-
 print $OT <<"EOS";
 /*-------------------------------------------------------------------------
 *
@ -298,24 +296,17 @@ typedef enum

 typedef struct
 {
-	pg_wchar	codepoint;		/* Unicode codepoint */
 	int16		conditions;
 	pg_wchar	map[NCaseKind][MAX_CASE_EXPANSION];
 } pg_special_case;

-typedef struct
-{
-	pg_wchar	codepoint;		/* Unicode codepoint */
-	pg_wchar	simplemap[NCaseKind];
-	const pg_special_case *special_case;
-} pg_case_map;
-
 /*
 * Special case mappings that aren't representable in the simple map.
 * Entries are referenced from simple_case_map.
 */
 static const pg_special_case special_case[$num_special] =
 {
+	{0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}},
 EOS

 foreach my $code (sort { $a <=> $b } (keys %special))
@ -332,23 +323,46 @@ foreach my $code (sort { $a <=> $b } (keys %special))
 	  (map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
 	my $fold = join ", ",
 	  (map { sprintf "0x%06x", $_ } @{ $special{$code}{Foldcase} });
-	printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
-	printf $OT "{{%s}, {%s}, {%s}, {%s}}},\n", $lower, $title, $upper, $fold;
+	printf $OT "\t{%s, ", $special{$code}{Conditions};
+	printf $OT
+	  "{[CaseLower] = {%s},[CaseTitle] = {%s},[CaseUpper] = {%s},[CaseFold] = {%s}}},\n",
+	  $lower, $title, $upper, $fold;
 }
+print $OT "};\n";

-print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
-print $OT <<"EOS";
-};
-
-/*
- * Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
- * sparse for higher codepoints (requiring scan or binary search).
- */
-static const pg_case_map case_map[$num_simple] =
+# Separate maps for each case form, starting with the reserved entry
+# at index 0. The first element is the result code point, and the
+# second element is the input code point (which is not ultimately
+# stored in the C array, it's just there as a comment).
+my %map = (
+	lower => [ [ 0, -1 ] ],
+	title => [ [ 0, -1 ] ],
+	upper => [ [ 0, -1 ] ],
+	fold => [ [ 0, -1 ] ],
+	special => [ [ 0, -1 ] ]);
+
+
+# Current index into the map arrays above.
+my $index = 1;
+
+# Sets of case forms/variations. Simple case pairs have the same set
+# of case forms, e.g. the letters 'a' and 'A' both lowercase to 'a';
+# both uppercase to 'A', etc. By tracking unique sets using a hash, we
+# cut the size needed for the maps in half (some characters are
+# exceptions, so it's not exactly half). The key is an array of all
+# case forms, and the value is an index into the maps.
+my %case_forms;
+
+# Perl doesn't allow arrays as hash keys, so we need to transform the
+# set of case forms to a scalar.
+sub get_hash_key
 {
-EOS
+	return join ",", @_;
+}

-printf $OT "\t/* begin dense entries for codepoints < 0x80 */\n";
+# Create map entries for all codepoints < 0x80, so that the caller can
+# have a fast-path lookup without needing to go through the main
+# table.
 for (my $code = 0; $code < 0x80; $code++)
 {
 	my $lc = ($simple{$code}{Simple_Lowercase} || $code);
@ -358,26 +372,310 @@ for (my $code = 0; $code < 0x80; $code++)

 	die "unexpected special case for code $code"
 	  if defined $special{$code};
-	printf $OT
-	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, NULL},\n",
-	  $code, $lc, $tc, $uc, $fc;
+
+	push @{ $map{lower} }, [ $lc, $code ];
+	push @{ $map{title} }, [ $tc, $code ];
+	push @{ $map{upper} }, [ $uc, $code ];
+	push @{ $map{fold} }, [ $fc, $code ];
+	push @{ $map{special} }, [ 0, $code ];
+
+	my $key = get_hash_key($lc, $tc, $uc, $fc, 0);
+
+	$simple{$code}{Index} = $index;
+	$case_forms{$key} = $index++;
 }
-printf $OT "\n";

-printf $OT "\t/* begin sparse entries for codepoints >= 0x80 */\n";
+# Create map entries for all characters >= 0x80 that have case
+# mappings (any character with a special case mapping also has an
+# entry in %simple).
 foreach my $code (sort { $a <=> $b } (keys %simple))
 {
 	next unless $code >= 0x80;    # already output above

-	my $map = $simple{$code};
-	my $special_case = "NULL";
+	my $entry = $simple{$code};
+	my $special_case = 0;
 	if (exists $special{$code})
 	{
-		$special_case = sprintf "&special_case[%d]", $special{$code}{Index};
+		$special_case = $special{$code}{Index};
 	}
-	printf $OT
-	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, %s},\n",
-	  $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
-	  $map->{Simple_Uppercase}, $map->{Simple_Foldcase}, $special_case;
+
+	my $key = get_hash_key(
+		$entry->{Simple_Lowercase}, $entry->{Simple_Titlecase},
+		$entry->{Simple_Uppercase}, $entry->{Simple_Foldcase},
+		$special_case);
+
+	unless (exists $case_forms{$key})
+	{
+		$case_forms{$key} = $index++;
+
+		push @{ $map{lower} }, [ $entry->{Simple_Lowercase}, $code ];
+		push @{ $map{title} }, [ $entry->{Simple_Titlecase}, $code ];
+		push @{ $map{upper} }, [ $entry->{Simple_Uppercase}, $code ];
+		push @{ $map{fold} }, [ $entry->{Simple_Foldcase}, $code ];
+		push @{ $map{special} }, [ $special_case, $code ];
+	}
+
+	$simple{$code}{Index} = $case_forms{$key};
+}
+
+die
+  "mapping tables contains $index entries which cannot be represented in uint16"
+  if ($index > 65536);
+
+foreach my $kind ('lower', 'title', 'upper', 'fold')
+{
+	print $OT <<"EOS";
+
+/*
+ * The entry case_map_${kind}[case_index(codepoint)] is the mapping for the
+ * given codepoint.
+ */
+static const pg_wchar case_map_$kind\[$index\] =
+{
+EOS
+
+	foreach my $entry (@{ $map{$kind} })
+	{
+		my $comment =
+		  @$entry[1] == -1 ? "reserved" : sprintf("U+%06x", @$entry[1]);
+		print $OT
+		  sprintf("\t0x%06x,\t\t\t\t\t/* %s */\n", @$entry[0], $comment);
+	}
+
+	print $OT "\n};\n";
+}
+
+print $OT <<"EOS";
+
+/*
+ * The entry case_map_special[case_index(codepoint)] is the index in
+ * special_case for that codepoint, or 0 if no special case mapping exists.
+ */
+static const uint8 case_map_special\[$index\] =
+{
+EOS
+
+foreach my $entry (@{ $map{special} })
+{
+	my $s = sprintf("%d,", @$entry[0]);
+	$s .= "\t" if length($s) < 4;
+	my $comment =
+	  @$entry[1] == -1 ? "reserved" : sprintf("U+%06x", @$entry[1]);
+	print $OT sprintf("\t%s\t\t\t\t\t\t/* %s */\n", $s, $comment);
+}
+
+print $OT "\n};\n";
+
+print $OT <<"EOS";
+
+/*
+ * Map for each case kind.
+ */
+static const pg_wchar *casekind_map[NCaseKind] =
+{
+	[CaseLower] = case_map_lower,
+	[CaseTitle] = case_map_title,
+	[CaseUpper] = case_map_upper,
+	[CaseFold] = case_map_fold,
+};
+EOS
+
+my @codepoints = keys %simple;
+my $range = make_ranges(\@codepoints, 500);
+my @case_map_lines = range_tables($range);
+my $case_map_length = scalar @case_map_lines;
+my $case_map_table = join "\n", @case_map_lines;
+
+print $OT <<"EOS";
+
+/*
+ * Used by case_index() to map a codepoint to an index that can be used in any
+ * of the following arrays: case_map_lower, case_map_title, case_map_upper,
+ * case_map_fold.
+ */
+static const uint16 case_map[$case_map_length] =
+{
+$case_map_table
+};
+
+
+EOS
+
+# First range is the fast path. It must start at codepoint zero, and
+# the end is the fastpath limit. Track the limit here and then
+# remove it before generating the other branches.
+die "first range must start at 0" unless ${ @$range[0] }{Start} == 0;
+my $fastpath_limit = sprintf("0x%04X", ${ @$range[0] }{End});
+shift @$range;
+
+print $OT <<"EOS";
+/*
+ * case_index()
+ *
+ * Given a code point, compute the index in the case_map at which we can find
+ * the offset into the mapping tables.
+ */
+static inline uint16
+case_index(pg_wchar cp)
+{
+	/* Fast path for codepoints < $fastpath_limit */
+	if (cp < $fastpath_limit)
+	{
+		return case_map[cp];
+	}
+
+EOS
+
+print $OT join("\n", @{ branch($range, 0, $#$range, 1) });
+
+print $OT <<"EOS";
+
+
+	return 0;
+}
+EOS
+
+close $OT;
+
+# The function generates C code with a series of nested if-else conditions
+# to search for the matching interval.
+sub branch
+{
+	my ($range, $from, $to, $indent) = @_;
+	my ($idx, $space, $entry, $table, @result);
+
+	$idx = ($from + int(($to - $from) / 2));
+	return \@result unless exists $range->[$idx];
+
+	$space = "\t" x $indent;
+
+	$entry = $range->[$idx];
+
+	# IF state
+	if ($idx == $from)
+	{
+		if ($idx == 0)
+		{
+			push @result,
+			  sprintf("%sif (cp >= 0x%04X && cp < 0x%04X)\n%s{",
+				$space, $entry->{Start}, $entry->{End}, $space);
+		}
+		else
+		{
+			push @result,
+			  sprintf("%sif (cp < 0x%04X)\n%s{",
+				$space, $entry->{End}, $space);
+		}
+
+		push @result,
+		  sprintf("%s\treturn case_map[cp - 0x%04X + %d];",
+			$space, $entry->{Start}, $entry->{Offset});
+	}
+	else
+	{
+		push @result,
+		  sprintf("%sif (cp < 0x%04X)\n%s{", $space, $entry->{End}, $space);
+		push @result, @{ branch($range, $from, $idx - 1, $indent + 1) };
+	}
+
+	push @result, $space . "}";
+
+	# return now if it's the last range
+	return \@result if $idx == (scalar @$range) - 1;
+
+	# ELSE looks ahead to the next range to avoid adding an
+	# unnecessary level of branching.
+	$entry = @$range[ $idx + 1 ];
+
+	# ELSE state
+	push @result,
+	  sprintf("%selse if (cp >= 0x%04X)\n%s{",
+		$space, $entry->{Start}, $space);
+
+	if ($idx == $to)
+	{
+		push @result,
+		  sprintf("%s\treturn case_map\[cp - 0x%04X + %d];",
+			$space, $entry->{Start}, $entry->{Offset});
+	}
+	else
+	{
+		push @result, @{ branch($range, $idx + 1, $to, $indent + 1) };
+	}
+
+	push @result, $space . "}";
+
+	return \@result;
+}
+
+# Group numbers into ranges where the difference between neighboring
+# elements does not exceed $limit. If the difference is greater, a new
+# range is created. This is used to break the sequence into intervals
+# where the gaps between numbers are greater than limit.
+#
+# For example, if there are numbers 1, 2, 3, 5, 6 and limit = 1, then
+# there is a difference of 2 between 3 and 5, which is greater than 1,
+# so there will be ranges 1-3 and 5-6.
+sub make_ranges
+{
+	my ($nums, $limit) = @_;
+	my ($prev, $start, $curr, $total, @sorted, @range);
+
+	@sorted = sort { $a <=> $b } @$nums;
+
+	die "expecting at least 2 codepoints" if (scalar @sorted < 2);
+
+	$start = shift @sorted;
+
+	die "expecting first codepoint to start at 0" unless $start == 0;
+
+	$prev = $start;
+	$total = 0;
+
+	# append final 'undef' to signal final iteration
+	push @sorted, undef;
+
+	foreach $curr (@sorted)
+	{
+		# if last iteration always append the range
+		if (!defined($curr) || ($curr - $prev > $limit))
+		{
+			push @range,
+			  {
+				Start => $start,
+				End => $prev + 1,
+				Offset => $total
+			  };
+			$total += $prev + 1 - $start;
+			$start = $curr;
+		}
+
+		$prev = $curr;
+	}
+
+	return \@range;
+}
+
+# The function combines all ranges into the case_map table. Ranges may
+# include codepoints without a case mapping at all, in which case the
+# entry in case_map should be zero.
+sub range_tables
+{
+	my ($range) = @_;
+	my (@lines, @result);
+
+	foreach my $entry (@$range)
+	{
+		my $start = $entry->{Start};
+		my $end = $entry->{End} - 1;
+
+		foreach my $cp ($start .. $end)
+		{
+			my $idx = sprintf("%d,", ($simple{$cp}{Index} || 0));
+			$idx .= "\t" if length($idx) < 4;
+			push @lines, sprintf("\t%s\t\t\t\t\t\t/* U+%06X */", $idx, $cp);
+		}
+	}
+
+	return @lines;
 }
-print $OT "};\n";
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@ -27,7 +27,7 @@ enum CaseMapResult
 	CASEMAP_SPECIAL,
 };

-static const pg_case_map *find_case_map(pg_wchar ucs);
+static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 						   CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
 						   void *wbstate);
@ -38,33 +38,33 @@ static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
 pg_wchar
 unicode_lowercase_simple(pg_wchar code)
 {
-	const pg_case_map *map = find_case_map(code);
+	pg_wchar	cp = find_case_map(code, case_map_lower);

-	return map ? map->simplemap[CaseLower] : code;
+	return cp != 0 ? cp : code;
 }

 pg_wchar
 unicode_titlecase_simple(pg_wchar code)
 {
-	const pg_case_map *map = find_case_map(code);
+	pg_wchar	cp = find_case_map(code, case_map_title);

-	return map ? map->simplemap[CaseTitle] : code;
+	return cp != 0 ? cp : code;
 }

 pg_wchar
 unicode_uppercase_simple(pg_wchar code)
 {
-	const pg_case_map *map = find_case_map(code);
+	pg_wchar	cp = find_case_map(code, case_map_upper);

-	return map ? map->simplemap[CaseUpper] : code;
+	return cp != 0 ? cp : code;
 }

 pg_wchar
 unicode_casefold_simple(pg_wchar code)
 {
-	const pg_case_map *map = find_case_map(code);
+	pg_wchar	cp = find_case_map(code, case_map_fold);

-	return map ? map->simplemap[CaseFold] : code;
+	return cp != 0 ? cp : code;
 }

 /*
@ -387,64 +387,48 @@ casemap(pg_wchar u1, CaseKind casekind, bool full,
 		const char *src, size_t srclen, size_t srcoff,
 		pg_wchar *simple, const pg_wchar **special)
 {
-	const pg_case_map *map;
+	uint16		idx;

+	/* Fast path for codepoints < 0x80 */
 	if (u1 < 0x80)
 	{
-		*simple = case_map[u1].simplemap[casekind];
+		/*
+		 * The first elements in all tables are reserved as 0 (as NULL). The
+		 * data starts at index 1, not 0.
+		 */
+		*simple = casekind_map[casekind][u1 + 1];

 		return CASEMAP_SIMPLE;
 	}

-	map = find_case_map(u1);
+	idx = case_index(u1);

-	if (map == NULL)
+	if (idx == 0)
 		return CASEMAP_SELF;

-	if (full && map->special_case != NULL &&
-		check_special_conditions(map->special_case->conditions,
+	if (full && case_map_special[idx] &&
+		check_special_conditions(special_case[case_map_special[idx]].conditions,
 								 src, srclen, srcoff))
 	{
-		*special = map->special_case->map[casekind];
+		*special = special_case[case_map_special[idx]].map[casekind];
 		return CASEMAP_SPECIAL;
 	}

-	*simple = map->simplemap[casekind];
+	*simple = casekind_map[casekind][idx];

 	return CASEMAP_SIMPLE;
 }

-/* find entry in simple case map, if any */
-static const pg_case_map *
-find_case_map(pg_wchar ucs)
+/*
+ * Find entry in simple case map.
+ * If the entry does not exist, 0 will be returned.
+ */
+static pg_wchar
+find_case_map(pg_wchar ucs, const pg_wchar *map)
 {
-	int			min;
-	int			mid;
-	int			max;
-
-	/* all chars <= 0x80 are stored in array for fast lookup */
-	Assert(lengthof(case_map) >= 0x80);
+	/* Fast path for codepoints < 0x80 */
 	if (ucs < 0x80)
-	{
-		const pg_case_map *map = &case_map[ucs];
-
-		Assert(map->codepoint == ucs);
-		return map;
-	}
-
-	/* otherwise, binary search */
-	min = 0x80;
-	max = lengthof(case_map) - 1;
-	while (max >= min)
-	{
-		mid = (min + max) / 2;
-		if (ucs > case_map[mid].codepoint)
-			min = mid + 1;
-		else if (ucs < case_map[mid].codepoint)
-			max = mid - 1;
-		else
-			return &case_map[mid];
-	}
-
-	return NULL;
+		/* The first elements in all tables are reserved as 0 (as NULL). */
+		return map[ucs + 1];
+	return map[case_index(ucs)];
 }
--- a/src/include/common/unicode_case_table.h
+++ b/src/include/common/unicode_case_table.h