@ -1,16 +1,22 @@
#!/usr/bin/perl
#
# Generate a composition table, using Unicode data files as input
# Generate a composition table and its lookup utilities, using Unicode data
# files as input.
#
# Input: UnicodeData.txt and CompositionExclusions.txt
# Output: unicode_norm_table.h
# Output: unicode_norm_table.h and unicode_norm_hashfunc.h
#
# Copyright (c) 2000-2020, PostgreSQL Global Development Group
use strict ;
use warnings ;
my $ output_file = "unicode_norm_table.h" ;
use FindBin ;
use lib "$FindBin::RealBin/../../tools/" ;
use PerfectHash ;
my $ output_table_file = "unicode_norm_table.h" ;
my $ output_func_file = "unicode_norm_hashfunc.h" ;
my $ FH ;
@ -64,11 +70,13 @@ close $FH;
my $ num_characters = scalar @ characters ;
# Start writing out the output file
open my $ OUTPUT , '>' , $ output_file
or die "Could not open output file $output_file: $!\n" ;
# Start writing out the output files
open my $ OT , '>' , $ output_table_file
or die "Could not open output file $output_table_file: $!\n" ;
open my $ OF , '>' , $ output_func_file
or die "Could not open output file $output_func_file: $!\n" ;
print $ OUTPUT << HEADER ;
print $ OT << HEADER ;
/ * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*
* unicode_norm_table . h
@ -111,8 +119,53 @@ static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
{
HEADER
print $ OF << HEADER ;
/ * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*
* unicode_norm_hashfunc . h
* Perfect hash functions used for Unicode normalization
*
* Portions Copyright ( c ) 1996 - 2020 , PostgreSQL Global Development Group
* Portions Copyright ( c ) 1994 , Regents of the University of California
*
* src /include/commo n / unicode_norm_hashfunc . h
*
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
* /
/ *
* File auto - generated by src /common/ unicode / generate - unicode_norm_table . pl ,
* do not edit . There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H
* here .
* /
#include "common/unicode_norm_table.h"
/* Typedef for perfect hash functions */
typedef int ( * cp_hash_func ) ( const void * key ) ;
/* Information for lookups with perfect hash functions */
typedef struct
{
const pg_unicode_decomposition * decomps ;
cp_hash_func hash ;
int num_decomps ;
} pg_unicode_decompinfo ;
typedef struct
{
const uint16 * inverse_lookup ;
cp_hash_func hash ;
int num_recomps ;
} pg_unicode_recompinfo ;
HEADER
my $ decomp_index = 0 ;
my $ decomp_string = "" ;
my @ dec_cp_packed ;
my $ main_index = 0 ;
my @ rec_info ;
my $ last_code = $ characters [ - 1 ] - > { code } ;
foreach my $ char ( @ characters )
@ -121,6 +174,9 @@ foreach my $char (@characters)
my $ class = $ char - > { class } ;
my $ decomp = $ char - > { decomp } ;
# Save the code point bytes as a string in network order.
push @ dec_cp_packed , pack ( 'N' , hex ( $ char - > { code } ) ) ;
# The character decomposition mapping field in UnicodeData.txt is a list
# of unicode codepoints, separated by space. But it can be prefixed with
# so-called compatibility formatting tag, like "<compat>", or "<font>".
@ -163,7 +219,7 @@ foreach my $char (@characters)
{
foreach my $ lcode ( @ composition_exclusion_codes )
{
if ( $ lcode eq $ char - > { c ode } )
if ( $ lcode eq $ code )
{
$ flags . = " | DECOMP_NO_COMPOSE" ;
$ comment = "in exclusion list" ;
@ -171,11 +227,26 @@ foreach my $char (@characters)
}
}
}
# Save info for recomposeable codepoints.
# Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C
# above! See also the inverse lookup in recompose_code() found in
# src/common/unicode_norm.c.
if ( ! ( $ flags =~ /DECOMP_COMPAT/ || $ flags =~ /DECOMP_NO_COMPOSE/ ) )
{
push @ rec_info ,
{
code = > $ code ,
main_index = > $ main_index ,
first = > $ first_decomp ,
second = > $ decomp_elts [ 0 ]
} ;
}
}
if ( $ decomp_size == 0 )
{
print $ OUTPUT "\t{0x$code, $class, 0$flags, 0}" ;
print $ OT "\t{0x$code, $class, 0$flags, 0}" ;
}
elsif ( $ decomp_size == 1 && length ( $ first_decomp ) <= 4 )
{
@ -183,12 +254,11 @@ foreach my $char (@characters)
# The decomposition consists of a single codepoint, and it fits
# in a uint16, so we can store it "inline" in the main table.
$ flags . = " | DECOMP_INLINE" ;
print $ OUTPU T "\t{0x$code, $class, 1$flags, 0x$first_decomp}" ;
print $ OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}" ;
}
else
{
print $ OUTPUT
"\t{0x$code, $class, $decomp_size$flags, $decomp_index}" ;
print $ OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}" ;
# Now save the decompositions into a dedicated area that will
# be written afterwards. First build the entry dedicated to
@ -205,25 +275,17 @@ foreach my $char (@characters)
}
# Print a comma after all items except the last one.
print $ OUTPUT "," unless ( $ code eq $ last_code ) ;
if ( $ comment ne "" )
{
print $ OT "," unless ( $ code eq $ last_code ) ;
# If the line is wide already, indent the comment with one tab,
# otherwise with two. This is to make the output match the way
# pgindent would mangle it. (This is quite hacky. To do this
# properly, we should actually track how long the line is so far,
# but this works for now.)
print $ OUTPUT "\t" if ( $ decomp_index < 10 ) ;
print $ OT "\t/* $comment */" if ( $ comment ne "" ) ;
print $ OT "\n" ;
print $ OUTPUT "\t/* $comment */" if ( $ comment ne "" ) ;
}
print $ OUTPUT "\n" ;
$ main_index + + ;
}
print $ OUTPU T "\n};\n\n" ;
print $ OT "\n};\n\n" ;
# Print the array of decomposed codes.
print $ OUTPU T << HEADER ;
print $ OT << HEADER ;
/* codepoints array */
static const uint32 UnicodeDecomp_codepoints [ $ decomp_index ] =
{
@ -231,4 +293,114 @@ $decomp_string
} ;
HEADER
close $ OUTPUT ;
# Emit the definition of the decomp hash function.
my $ dec_funcname = 'Decomp_hash_func' ;
my $ dec_func = PerfectHash:: generate_hash_function ( \ @ dec_cp_packed ,
$ dec_funcname , fixed_key_length = > 4 ) ;
print $ OF "/* Perfect hash function for decomposition */\n" ;
print $ OF "static $dec_func\n" ;
# Emit the structure that wraps the hash lookup information into
# one variable.
print $ OF << HEADER ;
/* Hash lookup information for decomposition */
static const pg_unicode_decompinfo UnicodeDecompInfo =
{
UnicodeDecompMain ,
$ dec_funcname ,
$ num_characters
} ;
HEADER
# Find the lowest codepoint that decomposes to each recomposeable
# code pair and create a mapping to it.
my $ recomp_string = "" ;
my @ rec_cp_packed ;
my % seenit ;
my $ firstentry = 1 ;
foreach my $ rec ( sort recomp_sort @ rec_info )
{
# The hash key is formed by concatenating the bytes of the two
# codepoints. See also recompose_code() in common/unicode_norm.c.
my $ hashkey = ( hex ( $ rec - > { first } ) << 32 ) | hex ( $ rec - > { second } ) ;
# We are only interested in the lowest code point that decomposes
# to the given code pair.
next if $ seenit { $ hashkey } ;
# Save the hash key bytes in network order
push @ rec_cp_packed , pack ( 'Q>' , $ hashkey ) ;
# Append inverse lookup element
$ recomp_string . = ",\n" if ! $ firstentry ;
$ recomp_string . = sprintf "\t/* U+%s+%s -> U+%s */ %s" ,
$ rec - > { first } ,
$ rec - > { second } ,
$ rec - > { code } ,
$ rec - > { main_index } ;
$ seenit { $ hashkey } = 1 ;
$ firstentry = 0 ;
}
# Emit the inverse lookup array containing indexes into UnicodeDecompMain.
my $ num_recomps = scalar @ rec_cp_packed ;
print $ OF << HEADER ;
/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */
static const uint16 RecompInverseLookup [ $ num_recomps ] =
{
$ recomp_string
} ;
HEADER
# Emit the definition of the recomposition hash function.
my $ rec_funcname = 'Recomp_hash_func' ;
my $ rec_func =
PerfectHash:: generate_hash_function ( \ @ rec_cp_packed , $ rec_funcname ,
fixed_key_length = > 8 ) ;
print $ OF "/* Perfect hash function for recomposition */\n" ;
print $ OF "static $rec_func\n" ;
# Emit the structure that wraps the hash lookup information into
# one variable.
print $ OF << HEADER ;
/* Hash lookup information for recomposition */
static const pg_unicode_recompinfo UnicodeRecompInfo =
{
RecompInverseLookup ,
$ rec_funcname ,
$ num_recomps
} ;
HEADER
close $ OT ;
close $ OF ;
sub recomp_sort
{
my $ a1 = hex ( $ a - > { first } ) ;
my $ b1 = hex ( $ b - > { first } ) ;
my $ a2 = hex ( $ a - > { second } ) ;
my $ b2 = hex ( $ b - > { second } ) ;
# First sort by the first code point
return - 1 if $ a1 < $ b1 ;
return 1 if $ a1 > $ b1 ;
# Then sort by the second code point
return - 1 if $ a2 < $ b2 ;
return 1 if $ a2 > $ b2 ;
# Finally sort by the code point that decomposes into first and
# second ones.
my $ acode = hex ( $ a - > { code } ) ;
my $ bcode = hex ( $ b - > { code } ) ;
return - 1 if $ acode < $ bcode ;
return - 1 if $ acode > $ bcode ;
die "found duplicate entries of recomposeable code pairs" ;
}