mirror of https://github.com/postgres/postgres
Generate EUC_CN mappings from gb-18030-2000.xml, because GB2312.TXT is no longer available. Get UHC from windows-949-2000.xml, it's more up-to-date. Plus tons more small changes. With these changes, the perl scripts faithfully produce the *.map files we have in the repository, from the external source files. In passing, fix the Makefile to also download CP932.TXT and CP950.TXT. Based on patches by Kyotaro Horiguchi, reviewed by Daniel Gustafsson. Discussion: https://postgr.es/m/08e7892a-d55c-eefe-76e6-7910bc8dd1f3@iki.fi pull/18/head
parent
6c303223be
commit
1de9cc0dcc
@ -1,128 +1,76 @@ |
||||
#! /usr/bin/perl |
||||
# |
||||
# Copyright (c) 2001-2016, PostgreSQL Global Development Group |
||||
# Copyright (c) 2007-2016, PostgreSQL Global Development Group |
||||
# |
||||
# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl |
||||
# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl |
||||
# |
||||
# Generate UTF-8 <--> EUC_CN code conversion tables from |
||||
# map files provided by Unicode organization. |
||||
# Unfortunately it is prohibited by the organization |
||||
# to distribute the map files. So if you try to use this script, |
||||
# you have to obtain GB2312.TXT from |
||||
# the organization's ftp site. |
||||
# Generate UTF-8 <--> EUC_CN code conversion tables from |
||||
# "gb-18030-2000.xml", obtained from |
||||
# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ |
||||
# |
||||
# GB2312.TXT format: |
||||
# GB2312 code in hex |
||||
# UCS-2 code in hex |
||||
# # and Unicode name (not used in this script) |
||||
# The lines we care about in the source file look like |
||||
# <a u="009A" b="81 30 83 36"/> |
||||
# where the "u" field is the Unicode code point in hex, |
||||
# and the "b" field is the hex byte sequence for GB18030 |
||||
|
||||
require "ucs2utf.pl"; |
||||
require "convutils.pm"; |
||||
|
||||
# first generate UTF-8 --> EUC_CN table |
||||
# Read the input |
||||
|
||||
$in_file = "GB2312.TXT"; |
||||
$in_file = "gb-18030-2000.xml"; |
||||
|
||||
open(FILE, $in_file) || die("cannot open $in_file"); |
||||
|
||||
my @mapping; |
||||
|
||||
while (<FILE>) |
||||
{ |
||||
chop; |
||||
if (/^#/) |
||||
{ |
||||
next; |
||||
} |
||||
($c, $u, $rest) = split; |
||||
next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/); |
||||
$u = $1; |
||||
$c = $2; |
||||
$c =~ s/ //g; |
||||
$ucs = hex($u); |
||||
$code = hex($c); |
||||
if ($code >= 0x80 && $ucs >= 0x0080) |
||||
{ |
||||
$utf = &ucs2utf($ucs); |
||||
if ($array{$utf} ne "") |
||||
{ |
||||
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; |
||||
next; |
||||
} |
||||
$count++; |
||||
|
||||
$array{$utf} = ($code | 0x8080); |
||||
} |
||||
} |
||||
close(FILE); |
||||
|
||||
$file = "utf8_to_euc_cn.map"; |
||||
open(FILE, "> $file") || die("cannot open $file"); |
||||
|
||||
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; |
||||
print FILE "static const pg_utf_to_local ULmapEUC_CN[ $count ] = {\n"; |
||||
|
||||
for $index (sort { $a <=> $b } keys(%array)) |
||||
{ |
||||
$code = $array{$index}; |
||||
$count--; |
||||
if ($count == 0) |
||||
# The GB-18030 character set, which we use as the source, contains |
||||
# a lot of extra characters on top of the GB2312 character set that |
||||
# EUC_CN encodes. Filter out those extra characters. |
||||
next if (($code & 0xFF) < 0xA1); |
||||
next if (!($code >= 0xA100 && $code <= 0xA9FF || |
||||
$code >= 0xB000 && $code <= 0xF7FF)); |
||||
|
||||
next if ($code >= 0xA2A1 && $code <= 0xA2B0); |
||||
next if ($code >= 0xA2E3 && $code <= 0xA2E4); |
||||
next if ($code >= 0xA2EF && $code <= 0xA2F0); |
||||
next if ($code >= 0xA2FD && $code <= 0xA2FE); |
||||
next if ($code >= 0xA4F4 && $code <= 0xA4FE); |
||||
next if ($code >= 0xA5F7 && $code <= 0xA5FE); |
||||
next if ($code >= 0xA6B9 && $code <= 0xA6C0); |
||||
next if ($code >= 0xA6D9 && $code <= 0xA6FE); |
||||
next if ($code >= 0xA7C2 && $code <= 0xA7D0); |
||||
next if ($code >= 0xA7F2 && $code <= 0xA7FE); |
||||
next if ($code >= 0xA8BB && $code <= 0xA8C4); |
||||
next if ($code >= 0xA8EA && $code <= 0xA8FE); |
||||
next if ($code >= 0xA9A1 && $code <= 0xA9A3); |
||||
next if ($code >= 0xA9F0 && $code <= 0xA9FE); |
||||
next if ($code >= 0xD7FA && $code <= 0xD7FE); |
||||
|
||||
# A couple of characters are mapped differently from GB-2312 or GB-18030 |
||||
if ($code == 0xA1A4) |
||||
{ |
||||
printf FILE " {0x%04x, 0x%04x}\n", $index, $code; |
||||
$ucs = 0x30FB; |
||||
} |
||||
else |
||||
{ |
||||
printf FILE " {0x%04x, 0x%04x},\n", $index, $code; |
||||
} |
||||
} |
||||
|
||||
print FILE "};\n"; |
||||
close(FILE); |
||||
|
||||
# |
||||
# then generate EUC_CN --> UTF8 table |
||||
# |
||||
reset 'array'; |
||||
|
||||
open(FILE, $in_file) || die("cannot open $in_file"); |
||||
|
||||
while (<FILE>) |
||||
{ |
||||
chop; |
||||
if (/^#/) |
||||
if ($code == 0xA1AA) |
||||
{ |
||||
next; |
||||
$ucs = 0x2015; |
||||
} |
||||
($c, $u, $rest) = split; |
||||
$ucs = hex($u); |
||||
$code = hex($c); |
||||
if ($code >= 0x80 && $ucs >= 0x0080) |
||||
{ |
||||
$utf = &ucs2utf($ucs); |
||||
if ($array{$code} ne "") |
||||
{ |
||||
printf STDERR "Warning: duplicate code: %04x\n", $ucs; |
||||
next; |
||||
} |
||||
$count++; |
||||
|
||||
$code |= 0x8080; |
||||
$array{$code} = $utf; |
||||
push @mapping, { |
||||
ucs => $ucs, |
||||
code => $code, |
||||
direction => 'both' |
||||
} |
||||
} |
||||
close(FILE); |
||||
|
||||
$file = "euc_cn_to_utf8.map"; |
||||
open(FILE, "> $file") || die("cannot open $file"); |
||||
|
||||
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; |
||||
print FILE "static const pg_local_to_utf LUmapEUC_CN[ $count ] = {\n"; |
||||
for $index (sort { $a <=> $b } keys(%array)) |
||||
{ |
||||
$utf = $array{$index}; |
||||
$count--; |
||||
if ($count == 0) |
||||
{ |
||||
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; |
||||
} |
||||
else |
||||
{ |
||||
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; |
||||
} |
||||
} |
||||
|
||||
print FILE "};\n"; |
||||
close(FILE); |
||||
print_tables("EUC_CN", \@mapping); |
||||
|
||||
@ -0,0 +1,31 @@ |
||||
#! /usr/bin/perl |
||||
# |
||||
# Copyright (c) 2001-2016, PostgreSQL Global Development Group |
||||
# |
||||
# src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl |
||||
# |
||||
# Generate UTF-8 <--> JOHAB conversion tables from |
||||
# map files provided by Unicode organization. |
||||
# Unfortunately it is prohibited by the organization |
||||
# to distribute the map files. So if you try to use this script, |
||||
# you have to obtain the map files from the organization's ftp site. |
||||
# ftp://www.unicode.org/Public/MAPPINGS/ |
||||
# We assume the file includes three tab-separated columns: |
||||
# JOHAB code in hex |
||||
# UCS-2 code in hex |
||||
# # and Unicode name (not used in this script) |
||||
|
||||
require "convutils.pm"; |
||||
|
||||
# Start from the mappings listed in JOHAB.TXT.

my $mapping = read_source("JOHAB.TXT");

# JOHAB.TXT does not list these characters, so they are appended by hand.
# Each entry uses the same hash layout that read_source() produces.
my @extra_chars = (
	{
		direction => 'both',
		ucs       => 0x20AC,
		code      => 0xd9e6,
		comment   => '# EURO SIGN'
	},
	{
		direction => 'both',
		ucs       => 0x00AE,
		code      => 0xd9e7,
		comment   => '# REGISTERED SIGN'
	},
	{
		direction => 'both',
		ucs       => 0x327E,
		code      => 0xd9e8,
		comment   => '# CIRCLED HANGUL IEUNG U'
	});

push @{$mapping}, @extra_chars;

# Emit the johab_to_utf8 / utf8_to_johab map files.
print_tables("JOHAB", $mapping);
||||
@ -0,0 +1,51 @@ |
||||
#! /usr/bin/perl |
||||
# |
||||
# Copyright (c) 2007-2016, PostgreSQL Global Development Group |
||||
# |
||||
# src/backend/utils/mb/Unicode/UCS_to_UHC.pl |
||||
# |
||||
# Generate UTF-8 <--> UHC code conversion tables from |
||||
# "windows-949-2000.xml", obtained from |
||||
# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ |
||||
# |
||||
# The lines we care about in the source file look like |
||||
# <a u="009A" b="81 30 83 36"/> |
||||
# where the "u" field is the Unicode code point in hex, |
||||
# and the "b" field is the hex byte sequence for UHC |
||||
|
||||
require "convutils.pm"; |
||||
|
||||
# Read the input.  Every variable is lexical and the file is opened with
# three-argument open on a lexical handle, so a file name can never be
# misread as an open mode and no package globals are touched.

my $in_file = "windows-949-2000.xml";

open(my $in, '<', $in_file) || die("cannot open $in_file: $!");

my @mapping;

while (my $line = <$in>)
{
	# Only <a u="..." b="..."/> elements describe a mapping; skip the rest.
	next if ($line !~ m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);

	# Copy the captures before the substitution below resets them.
	my ($u, $c) = ($1, $2);

	# The byte sequence is written as space-separated hex pairs; join them
	# so that hex() sees one number.
	$c =~ s/ //g;
	my $ucs  = hex($u);
	my $code = hex($c);

	# These two single-byte codes are deliberately left out of the table.
	next if ($code == 0x0080 || $code == 0x00FF);

	# Keep only mappings where both sides are outside the 7-bit range.
	if ($code >= 0x80 && $ucs >= 0x0080)
	{
		push @mapping,
		  {
			ucs       => $ucs,
			code      => $code,
			direction => 'both'
		  };
	}
}
close($in);

# One extra character that's not in the source file.
push @mapping,
  {
	direction => 'both',
	code      => 0xa2e8,
	ucs       => 0x327e,
	comment   => 'CIRCLED HANGUL IEUNG U'
  };

print_tables("UHC", \@mapping);
||||
@ -0,0 +1,282 @@ |
||||
# |
||||
# Copyright (c) 2001-2016, PostgreSQL Global Development Group |
||||
# |
||||
# src/backend/utils/mb/Unicode/convutils.pm |
||||
|
||||
use strict; |
||||
|
||||
####################################################################### |
||||
# convert UCS-4 to UTF-8 |
||||
# |
||||
# Pack the UTF-8 encoding of one code point into a single integer.
#
# ucs     ; code point to encode
# returns ; the UTF-8 bytes as one number, first byte in the most
#           significant position (one to four bytes are produced)
sub ucs2utf
{
	my ($ucs) = @_;

	# A 7-bit value is its own encoding.
	return $ucs if ($ucs <= 0x007f);

	# Every multi-byte form ends with a continuation byte that carries
	# the low six bits.
	my $tail = ($ucs & 0x003f) | 0x80;

	# Two-byte form: 110xxxxx 10xxxxxx
	if ($ucs <= 0x07ff)
	{
		my $lead = ($ucs >> 6) | 0xc0;
		return ($lead << 8) | $tail;
	}

	# Three- and four-byte forms share the continuation byte for bits 6..11.
	my $mid = (($ucs & 0x0fc0) >> 6) | 0x80;

	# Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx
	if ($ucs <= 0xffff)
	{
		my $lead = ($ucs >> 12) | 0xe0;
		return ($lead << 16) | ($mid << 8) | $tail;
	}

	# Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	my $lead   = ($ucs >> 18) | 0xf0;
	my $second = (($ucs & 0x3ffff) >> 12) | 0x80;
	return ($lead << 24) | ($second << 16) | ($mid << 8) | $tail;
}
||||
|
||||
####################################################################### |
||||
# read_source - common routine to read source file |
||||
# |
||||
# fname ; input file name |
||||
# Read a "0xCODE 0xUCS # comment" style mapping file.
#
# fname   ; input file name
# returns ; reference to an array of hashes with the keys
#             f         - the file name that was passed in
#             l         - line number of the entry in that file
#             code      - value of the first hex column
#             ucs       - value of the second hex column
#             comment   - trailing "#..." text of the line
#             direction - always "both"
#
# Dies if the file cannot be opened or if a line is not in a recognized
# format.
sub read_source
{
	my ($fname) = @_;
	my @r;

	open(my $in, '<', $fname) || die("cannot open $fname: $!");

	while (my $line = <$in>)
	{
		# Whole-line comments.
		next if ($line =~ /^#/);

		# Remove only the line terminator (LF or CRLF).  Unlike chop, this
		# cannot eat the last data character of a final line that has no
		# trailing newline.
		$line =~ s/\r?\n\z//;

		next if ($line eq "");    # Ignore empty lines

		# A code followed directly by a comment has no Unicode counterpart.
		# Hex digits are accepted in either case, as in the pattern below.
		next if ($line =~ /^0x[0-9A-Fa-f]+\s+#.*$/);

		# Two hex columns, an optional third hex column, then the comment.
		# The third column is accepted but its value is not used.
		# NOTE(review): the previous comment here read "Skip the first
		# column for JIS0208.TXT", but the first column is what becomes
		# 'code' -- confirm the intended handling of three-column files.
		if ($line !~
			/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(?:0x([0-9A-Fa-f]+)\s+)?(#.*)$/
		  )
		{
			# Fail with a non-zero exit status so that a calling make(1)
			# notices the problem; a bare "exit" would report success.
			die("READ ERROR at line $. in $fname: $line\n");
		}
		my $out = {
			f         => $fname,
			l         => $.,
			code      => hex($1),
			ucs       => hex($2),
			comment   => $4,
			direction => "both"
		};

		# Ignore pure ASCII mappings. PostgreSQL character conversion code
		# never even passes these to the conversion code.
		next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);

		push(@r, $out);
	}
	close($in);

	return \@r;
}
||||
|
||||
################################################################## |
||||
# print_tables : output mapping tables |
||||
# |
||||
# Arguments: |
||||
# charset - string name of the character set. |
||||
# table - mapping table (see format below) |
||||
# verbose - if 1, output comment on each line, |
||||
# if 2, also output source file name and line number |
||||
# |
||||
# |
||||
# |
||||
# Mapping table format: |
||||
# |
||||
# Mapping table is a list of hashes. Each hash has the following fields: |
||||
# direction - Direction: 'both', 'from_unicode' or 'to_unicode' |
||||
# ucs - Unicode code point |
||||
# ucs_second - Second Unicode code point, if this is a "combined" character. |
||||
# code - Byte sequence in the "other" character set, as an integer |
||||
# comment - Text representation of the character |
||||
# f - Source filename |
||||
# l - Line number in source file |
||||
# |
||||
# |
||||
# Sort the mapping entries into per-direction lists and write the map files.
#
# charset ; name of the character set, passed on to the writers
# table   ; reference to the list of mapping hashes
# verbose ; passed on unchanged to the writers
sub print_tables
{
	my ($charset, $table, $verbose) = @_;

	# One list per output file: plain and combined, in each direction.
	my (@to_unicode,   @to_unicode_combined);
	my (@from_unicode, @from_unicode_combined);

	foreach my $map (@$table)
	{
		my $dir         = $map->{direction};
		my $is_combined = defined $map->{ucs_second};

		# Output entries carry the UTF-8 form of the code point(s) in
		# place of the raw Unicode values.
		my %entry = (
			utf8    => ucs2utf($map->{ucs}),
			code    => $map->{code},
			comment => $map->{comment},
			f       => $map->{f},
			l       => $map->{l});
		$entry{utf8_second} = ucs2utf($map->{ucs_second}) if ($is_combined);

		# 'both' lands in both directions; the same hash is shared by the
		# two lists.
		if ($dir eq "both" || $dir eq "to_unicode")
		{
			if ($is_combined)
			{
				push @to_unicode_combined, \%entry;
			}
			else
			{
				push @to_unicode, \%entry;
			}
		}
		if ($dir eq "both" || $dir eq "from_unicode")
		{
			if ($is_combined)
			{
				push @from_unicode_combined, \%entry;
			}
			else
			{
				push @from_unicode, \%entry;
			}
		}
	}

	# The plain maps are always written; the combined maps only when at
	# least one combined entry exists.
	print_to_utf8_map($charset, \@to_unicode, $verbose);
	if (scalar @to_unicode_combined > 0)
	{
		print_to_utf8_combined_map($charset, \@to_unicode_combined,
			$verbose);
	}
	print_from_utf8_map($charset, \@from_unicode, $verbose);
	if (scalar @from_unicode_combined > 0)
	{
		print_from_utf8_combined_map($charset, \@from_unicode_combined,
			$verbose);
	}
}
||||
|
||||
# Write the UTF-8 => charset table to "utf8_to_<charset>.map" (lower-cased)
# in the current directory.
#
# charset ; name of the character set, used in the file and array names
# table   ; reference to a list of hashes with 'utf8' and 'code' (and
#           'comment', 'f', 'l' when verbose output is wanted)
# verbose ; if true, append each entry's comment; if 2 or more, prefix
#           the comment with "file:line"
#
# Entries are written in ascending numeric order of 'utf8'.  Dies if the
# output file cannot be opened or cannot be completely written.
sub print_from_utf8_map
{
	my ($charset, $table, $verbose) = @_;

	my $last_comment = "";

	my $fname = lc("utf8_to_${charset}.map");
	print "- Writing UTF8=>${charset} conversion table: $fname\n";
	open(my $out, '>', $fname)
	  || die "cannot open output file : $fname: $!\n";

	# The names are passed as arguments instead of being interpolated into
	# the format, so a '%' in them cannot be taken as a conversion.
	printf {$out} "/* src/backend/utils/mb/Unicode/%s */\n\n"
	  . "static const pg_utf_to_local ULmap%s[ %d ] = {",
	  $fname, $charset, scalar(@$table);

	my $first = 1;
	foreach my $i (sort { $$a{utf8} <=> $$b{utf8} } @$table)
	{
		# The separator and the previous entry's comment are emitted
		# before the next entry, so the last entry gets no trailing comma.
		print {$out} "," if (!$first);
		$first = 0;
		print {$out} "\t/* $last_comment */" if ($verbose);

		printf {$out} "\n {0x%04x, 0x%04x}", $$i{utf8}, $$i{code};
		if ($verbose && $verbose >= 2)
		{
			$last_comment = "$$i{f}:$$i{l} $$i{comment}";
		}
		else
		{
			$last_comment = $$i{comment};
		}
	}
	print {$out} "\t/* $last_comment */" if ($verbose);
	print {$out} "\n};\n";

	# Buffered write errors (e.g. a full disk) only surface at close.
	close($out) || die "cannot close output file : $fname: $!\n";
}
||||
|
||||
# Write the UTF-8 => charset table for combined (two code point) characters
# to "utf8_to_<charset>_combined.map" (lower-cased) in the current directory.
#
# charset ; name of the character set, used in the file and array names
# table   ; reference to a list of hashes with 'utf8', 'utf8_second' and
#           'code' (and 'comment' when verbose output is wanted)
# verbose ; if true, append each entry's comment
#
# Entries are written in ascending numeric order of 'utf8'.  Dies if the
# output file cannot be opened or cannot be completely written.
sub print_from_utf8_combined_map
{
	my ($charset, $table, $verbose) = @_;

	my $last_comment = "";

	my $fname = lc("utf8_to_${charset}_combined.map");
	print "- Writing UTF8=>${charset} conversion table: $fname\n";
	open(my $out, '>', $fname)
	  || die "cannot open output file : $fname: $!\n";

	# The names are passed as arguments instead of being interpolated into
	# the format, so a '%' in them cannot be taken as a conversion.
	printf {$out} "/* src/backend/utils/mb/Unicode/%s */\n\n"
	  . "static const pg_utf_to_local_combined ULmap%s_combined[ %d ] = {",
	  $fname, $charset, scalar(@$table);

	my $first = 1;
	foreach my $i (sort { $$a{utf8} <=> $$b{utf8} } @$table)
	{
		# The separator and the previous entry's comment are emitted
		# before the next entry, so the last entry gets no trailing comma.
		print {$out} "," if (!$first);
		$first = 0;
		print {$out} "\t/* $last_comment */" if ($verbose);

		printf {$out} "\n {0x%08x, 0x%08x, 0x%04x}",
		  $$i{utf8}, $$i{utf8_second}, $$i{code};
		$last_comment = "$$i{comment}";
	}
	print {$out} "\t/* $last_comment */" if ($verbose);
	print {$out} "\n};\n";

	# Buffered write errors (e.g. a full disk) only surface at close.
	close($out) || die "cannot close output file : $fname: $!\n";
}
||||
|
||||
# Write the charset => UTF-8 table to "<charset>_to_utf8.map" (lower-cased)
# in the current directory.
#
# charset ; name of the character set, used in the file and array names
# table   ; reference to a list of hashes with 'code' and 'utf8' (and
#           'comment', 'f', 'l' when verbose output is wanted)
# verbose ; if true, append each entry's comment; if 2 or more, prefix
#           the comment with "file:line"
#
# Entries are written in ascending numeric order of 'code'.  Dies if the
# output file cannot be opened or cannot be completely written.
sub print_to_utf8_map
{
	my ($charset, $table, $verbose) = @_;

	my $last_comment = "";

	my $fname = lc("${charset}_to_utf8.map");

	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
	open(my $out, '>', $fname)
	  || die "cannot open output file : $fname: $!\n";

	# The names are passed as arguments instead of being interpolated into
	# the format, so a '%' in them cannot be taken as a conversion.
	printf {$out} "/* src/backend/utils/mb/Unicode/%s */\n\n"
	  . "static const pg_local_to_utf LUmap%s[ %d ] = {",
	  $fname, $charset, scalar(@$table);

	my $first = 1;
	foreach my $i (sort { $$a{code} <=> $$b{code} } @$table)
	{
		# The separator and the previous entry's comment are emitted
		# before the next entry, so the last entry gets no trailing comma.
		print {$out} "," if (!$first);
		$first = 0;
		print {$out} "\t/* $last_comment */" if ($verbose);

		printf {$out} "\n {0x%04x, 0x%x}", $$i{code}, $$i{utf8};
		if ($verbose && $verbose >= 2)
		{
			$last_comment = "$$i{f}:$$i{l} $$i{comment}";
		}
		else
		{
			$last_comment = $$i{comment};
		}
	}
	print {$out} "\t/* $last_comment */" if ($verbose);
	print {$out} "\n};\n";

	# Buffered write errors (e.g. a full disk) only surface at close.
	close($out) || die "cannot close output file : $fname: $!\n";
}
||||
|
||||
# Write the charset => UTF-8 table for combined (two code point) characters
# to "<charset>_to_utf8_combined.map" (lower-cased) in the current directory.
#
# charset ; name of the character set, used in the file and array names
# table   ; reference to a list of hashes with 'code', 'utf8' and
#           'utf8_second' (and 'comment' when verbose output is wanted)
# verbose ; if true, append each entry's comment
#
# Entries are written in ascending numeric order of 'code'.  Dies if the
# output file cannot be opened or cannot be completely written.
sub print_to_utf8_combined_map
{
	my ($charset, $table, $verbose) = @_;

	my $last_comment = "";

	my $fname = lc("${charset}_to_utf8_combined.map");

	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
	open(my $out, '>', $fname)
	  || die "cannot open output file : $fname: $!\n";

	# The names are passed as arguments instead of being interpolated into
	# the format, so a '%' in them cannot be taken as a conversion.
	printf {$out} "/* src/backend/utils/mb/Unicode/%s */\n\n"
	  . "static const pg_local_to_utf_combined LUmap%s_combined[ %d ] = {",
	  $fname, $charset, scalar(@$table);

	my $first = 1;
	foreach my $i (sort { $$a{code} <=> $$b{code} } @$table)
	{
		# The separator and the previous entry's comment are emitted
		# before the next entry, so the last entry gets no trailing comma.
		print {$out} "," if (!$first);
		$first = 0;
		print {$out} "\t/* $last_comment */" if ($verbose);

		printf {$out} "\n {0x%04x, 0x%08x, 0x%08x}",
		  $$i{code}, $$i{utf8}, $$i{utf8_second};
		$last_comment = "$$i{comment}";
	}
	print {$out} "\t/* $last_comment */" if ($verbose);
	print {$out} "\n};\n";

	# Buffered write errors (e.g. a full disk) only surface at close.
	close($out) || die "cannot close output file : $fname: $!\n";
}
||||
|
||||
1; |
||||
@ -1,35 +0,0 @@ |
||||
# |
||||
# Copyright (c) 2001-2016, PostgreSQL Global Development Group |
||||
# |
||||
# src/backend/utils/mb/Unicode/ucs2utf.pl |
||||
# convert UCS-4 to UTF-8 |
||||
# |
||||
# Pack the UTF-8 encoding of one code point into a single integer.
#
# ucs     ; code point to encode
# returns ; the UTF-8 bytes as one number, first byte in the most
#           significant position (one to four bytes are produced)
#
# The working variables are lexical ("my").  Declaring them with "local"
# would temporarily overwrite the package globals $ucs and $utf, which the
# calling scripts also use under those names.
sub ucs2utf
{
	my ($ucs) = @_;
	my $utf;

	if ($ucs <= 0x007f)
	{
		# A 7-bit value is its own encoding.
		$utf = $ucs;
	}
	elsif ($ucs > 0x007f && $ucs <= 0x07ff)
	{
		# Two-byte form: 110xxxxx 10xxxxxx
		$utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
	}
	elsif ($ucs > 0x07ff && $ucs <= 0xffff)
	{
		# Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx
		$utf =
		  ((($ucs >> 12) | 0xe0) << 16) |
		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
	}
	else
	{
		# Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		$utf =
		  ((($ucs >> 18) | 0xf0) << 24) |
		  (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
	}
	return ($utf);
}
||||
1; |
||||
Loading…
Reference in new issue