mirror of https://github.com/postgres/postgres
Introduce unicode_version(), icu_unicode_version(), and unicode_assigned(). The latter requires introducing a new lookup table for the Unicode General Category, which is generated along with the other Unicode lookup tables. Discussion: https://postgr.es/m/CA+TgmoYzYR-yhU6k1XFCADeyj=Oyz2PkVsa3iKv+keM8wp-F_A@mail.gmail.com Reviewed-by: Peter Eisentraut
pull/143/head
parent
7021d3b176
commit
a02b37fc08
@ -0,0 +1,108 @@ |
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
* category_test.c |
||||||
|
* Program to test Unicode general category functions. |
||||||
|
* |
||||||
|
* Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group |
||||||
|
* |
||||||
|
* IDENTIFICATION |
||||||
|
* src/common/unicode/category_test.c |
||||||
|
* |
||||||
|
*------------------------------------------------------------------------- |
||||||
|
*/ |
||||||
|
#include "postgres_fe.h" |
||||||
|
|
||||||
|
#include <stdio.h> |
||||||
|
#include <stdlib.h> |
||||||
|
#include <string.h> |
||||||
|
|
||||||
|
#ifdef USE_ICU |
||||||
|
#include <unicode/uchar.h> |
||||||
|
#endif |
||||||
|
#include "common/unicode_category.h" |
||||||
|
#include "common/unicode_version.h" |
||||||
|
|
||||||
|
/*
|
||||||
|
* Parse version into integer for easy comparison. |
||||||
|
*/ |
||||||
|
#ifdef USE_ICU |
||||||
|
/*
 * Convert a "MAJOR.MINOR[...]" version string into major * 100 + minor, so
 * that two versions can be ordered with a plain integer comparison.
 *
 * Any text after the minor number (such as a third component) is ignored.
 *
 * The string is validated at run time rather than with Assert(), so that a
 * malformed version is reported even in builds without assertion checking
 * instead of returning a value computed from uninitialized variables.  On
 * failure a message is printed to stderr and the program exits with status 1.
 */
static int
parse_unicode_version(const char *version)
{
	int			major = 0;
	int			minor = 0;

	/* sscanf() returns the number of fields converted; both are required */
	if (sscanf(version, "%d.%d", &major, &minor) != 2 ||
		major < 0 || minor < 0 || minor >= 100)
	{
		/* minor must fit in two decimal digits for the encoding below */
		fprintf(stderr, "could not parse Unicode version \"%s\"\n", version);
		exit(1);
	}

	return major * 100 + minor;
}
||||||
|
#endif |
||||||
|
|
||||||
|
/*
|
||||||
|
* Exhaustively test that the Unicode category for each codepoint matches that |
||||||
|
* returned by ICU. |
||||||
|
*/ |
||||||
|
int
main(int argc, char **argv)
{
#ifdef USE_ICU
	/* Both versions as comparable integers (see parse_unicode_version). */
	int			pg_version = parse_unicode_version(PG_UNICODE_VERSION);
	int			icu_version = parse_unicode_version(U_UNICODE_VERSION);

	/* Mismatches tolerated because one side uses an older Unicode version. */
	int			pg_skipped = 0;
	int			icu_skipped = 0;

	printf("Postgres Unicode Version:\t%s\n", PG_UNICODE_VERSION);
	printf("ICU Unicode Version:\t\t%s\n", U_UNICODE_VERSION);

	/* Walk every code point from 0 through 0x10ffff. */
	for (UChar32 cp = 0; cp <= 0x10ffff; cp++)
	{
		uint8_t		pg_cat = unicode_category(cp);
		uint8_t		icu_cat = u_charType(cp);

		/* Agreement is the common case; nothing more to do. */
		if (pg_cat == icu_cat)
			continue;

		/*
		 * When the two libraries are built against different Unicode
		 * versions, a code point assigned in the newer one may still be
		 * unassigned in the older one.  Such disagreements are tolerated
		 * and merely counted; the price is that the test is no longer
		 * exhaustive for those code points.
		 */
		if (pg_cat == PG_U_UNASSIGNED && pg_version < icu_version)
		{
			pg_skipped++;
			continue;
		}

		if (icu_cat == PG_U_UNASSIGNED && icu_version < pg_version)
		{
			icu_skipped++;
			continue;
		}

		/* A genuine disagreement: report both views and fail. */
		printf("FAILURE for codepoint %06x\n", cp);
		printf("Postgres category: %02d %s %s\n", pg_cat,
			   unicode_category_abbrev(pg_cat),
			   unicode_category_string(pg_cat));
		printf("ICU category: %02d %s %s\n", icu_cat,
			   unicode_category_abbrev(icu_cat),
			   unicode_category_string(icu_cat));
		printf("\n");
		exit(1);
	}

	if (pg_skipped > 0)
		printf("Skipped %d codepoints unassigned in Postgres due to Unicode version mismatch.\n",
			   pg_skipped);
	if (icu_skipped > 0)
		printf("Skipped %d codepoints unassigned in ICU due to Unicode version mismatch.\n",
			   icu_skipped);

	printf("category_test: All tests successful!\n");
	exit(0);
#else
	/* Without ICU there is nothing to compare against. */
	printf("ICU support required for test; skipping.\n");
	exit(0);
#endif
}
@ -0,0 +1,204 @@ |
|||||||
|
#!/usr/bin/perl |
||||||
|
# |
||||||
|
# Generate a code point category table and its lookup utilities, using |
||||||
|
# Unicode data files as input. |
||||||
|
# |
||||||
|
# Input: UnicodeData.txt |
||||||
|
# Output: unicode_category_table.h |
||||||
|
# |
||||||
|
# Copyright (c) 2000-2023, PostgreSQL Global Development Group |
||||||
|
|
||||||
|
use strict; |
||||||
|
use warnings; |
||||||
|
use Getopt::Long; |
||||||
|
|
||||||
|
use FindBin; |
||||||
|
use lib "$FindBin::RealBin/../../tools/"; |
||||||
|
|
||||||
|
# General category abbreviation used for code points that have no entry in
# UnicodeData.txt.
my $CATEGORY_UNASSIGNED = 'Cn';

# Highest valid Unicode code point.
my $MAX_CODEPOINT = 0x10FFFF;

# Directory that holds UnicodeData.txt; the generated header is written to
# the same directory.
my $output_path = '.';

GetOptions('outdir:s' => \$output_path);

my $input_file = "$output_path/UnicodeData.txt";
my $output_table_file = "$output_path/unicode_category_table.h";

# Read entries from UnicodeData.txt into a list of codepoint ranges
# and their general category.  Adjacent code points with the same category
# are merged into a single range.
my @category_ranges = ();
my $range_start = undef;
my $range_end = undef;
my $range_category = undef;

# If between a "<..., First>" entry and a "<..., Last>" entry, the gap in
# codepoints represents a range, and $gap_category is equal to the
# category for both (which must match). Otherwise, the gap represents
# unassigned code points.
my $gap_category = undef;

open(my $FH, '<', $input_file)
  or die "Could not open $input_file: $!.";
while (my $line = <$FH>)
{
	chomp $line;

	# Only the first three fields are used: code point, name, category.
	my @elts = split(';', $line);
	die sprintf("malformed entry at %s line %d: \"%s\"",
		$input_file, $., $line)
	  if scalar(@elts) < 3 || $elts[0] !~ /\A[0-9A-Fa-f]+\z/;

	my $code = hex($elts[0]);
	my $name = $elts[1];
	my $category = $elts[2];

	die "codepoint out of range" if $code > $MAX_CODEPOINT;
	die "unassigned codepoint in UnicodeData.txt"
	  if $category eq $CATEGORY_UNASSIGNED;

	if (!defined($range_start))
	{
		my $code_str = sprintf "0x%06x", $code;
		die "inconsistent parser state before first entry"
		  if defined($range_end)
		  || defined($range_category)
		  || defined($gap_category);
		die "unexpected first entry <..., Last>" if ($name =~ /Last>/);
		die "expected 0x000000 for first entry, got $code_str"
		  if $code != 0x000000;

		# initialize
		$range_start = $code;
		$range_end = $code;
		$range_category = $category;
		if ($name =~ /<.*, First>$/)
		{
			$gap_category = $category;
		}
		else
		{
			$gap_category = $CATEGORY_UNASSIGNED;
		}
		next;
	}

	# At this point $range_end is always the previously read code point.
	# The generated table is searched with a binary search, so entries
	# must be strictly ascending.
	die sprintf(
		"codepoint 0x%06x does not follow preceding codepoint 0x%06x",
		$code, $range_end)
	  if $code <= $range_end;

	# Gap in codepoints detected. If it's a different category than
	# the current range, emit the current range and initialize a new
	# range representing the gap.
	if ($range_end + 1 != $code && $range_category ne $gap_category)
	{
		push(
			@category_ranges,
			{
				start => $range_start,
				end => $range_end,
				category => $range_category
			});
		$range_start = $range_end + 1;
		$range_end = $code - 1;
		$range_category = $gap_category;
	}

	# different category; new range
	if ($range_category ne $category)
	{
		push(
			@category_ranges,
			{
				start => $range_start,
				end => $range_end,
				category => $range_category
			});
		$range_start = $code;
		$range_end = $code;
		$range_category = $category;
	}

	if ($name =~ /<.*, First>$/)
	{
		die
		  "<..., First> entry unexpectedly follows another <..., First> entry"
		  if $gap_category ne $CATEGORY_UNASSIGNED;
		$gap_category = $category;
	}
	elsif ($name =~ /<.*, Last>$/)
	{
		die
		  "<..., First> and <..., Last> entries have mismatching general category"
		  if $gap_category ne $category;
		$gap_category = $CATEGORY_UNASSIGNED;
	}
	else
	{
		die "unexpected entry found between <..., First> and <..., Last>"
		  if $gap_category ne $CATEGORY_UNASSIGNED;
	}

	$range_end = $code;
}
close($FH);

# An empty input file would otherwise fall through to the checks below with
# undefined state.
die "no entries found in $input_file" if !defined($range_start);

die "<..., First> entry with no corresponding <..., Last> entry"
  if $gap_category ne $CATEGORY_UNASSIGNED;

# emit final range
push(
	@category_ranges,
	{
		start => $range_start,
		end => $range_end,
		category => $range_category
	});

# emit range for any unassigned code points after last entry
if ($range_end < $MAX_CODEPOINT)
{
	$range_start = $range_end + 1;
	$range_end = $MAX_CODEPOINT;
	$range_category = $CATEGORY_UNASSIGNED;
	push(
		@category_ranges,
		{
			start => $range_start,
			end => $range_end,
			category => $range_category
		});
}

my $num_ranges = scalar @category_ranges;

# Map from General Category abbreviation to the C enumerator written into
# the generated table.
# See: https://www.unicode.org/reports/tr44/#General_Category_Values
my $categories = {
	Cn => 'PG_U_UNASSIGNED',
	Lu => 'PG_U_UPPERCASE_LETTER',
	Ll => 'PG_U_LOWERCASE_LETTER',
	Lt => 'PG_U_TITLECASE_LETTER',
	Lm => 'PG_U_MODIFIER_LETTER',
	Lo => 'PG_U_OTHER_LETTER',
	Mn => 'PG_U_NONSPACING_MARK',
	Me => 'PG_U_ENCLOSING_MARK',
	Mc => 'PG_U_SPACING_MARK',
	Nd => 'PG_U_DECIMAL_NUMBER',
	Nl => 'PG_U_LETTER_NUMBER',
	No => 'PG_U_OTHER_NUMBER',
	Zs => 'PG_U_SPACE_SEPARATOR',
	Zl => 'PG_U_LINE_SEPARATOR',
	Zp => 'PG_U_PARAGRAPH_SEPARATOR',
	Cc => 'PG_U_CONTROL',
	Cf => 'PG_U_FORMAT',
	Co => 'PG_U_PRIVATE_USE',
	Cs => 'PG_U_SURROGATE',
	Pd => 'PG_U_DASH_PUNCTUATION',
	Ps => 'PG_U_OPEN_PUNCTUATION',
	Pe => 'PG_U_CLOSE_PUNCTUATION',
	Pc => 'PG_U_CONNECTOR_PUNCTUATION',
	Po => 'PG_U_OTHER_PUNCTUATION',
	Sm => 'PG_U_MATH_SYMBOL',
	Sc => 'PG_U_CURRENCY_SYMBOL',
	Sk => 'PG_U_MODIFIER_SYMBOL',
	So => 'PG_U_OTHER_SYMBOL',
	Pi => 'PG_U_INITIAL_PUNCTUATION',
	Pf => 'PG_U_FINAL_PUNCTUATION'
};

# Resolve every range to its table row before touching the output file, so
# that an unknown category cannot leave a half-written header behind.
my @table_rows;
foreach my $range (@category_ranges)
{
	my $category = $categories->{ $range->{category} };
	die "category missing: $range->{category}" unless defined $category;
	push(@table_rows,
		sprintf("\t{0x%06x, 0x%06x, %s}",
			$range->{start}, $range->{end}, $category));
}

# Start writing out the output files
open(my $OT, '>', $output_table_file)
  or die "Could not open output file $output_table_file: $!\n";

print $OT <<"HEADER";
/*-------------------------------------------------------------------------
 *
 * unicode_category_table.h
 *	  Category table for Unicode character classification.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_category_table.h
 *
 *-------------------------------------------------------------------------
 */

#include "common/unicode_category.h"

/*
 * File auto-generated by src/common/unicode/generate-unicode_category_table.pl,
 * do not edit. There is deliberately not an #ifndef PG_UNICODE_CATEGORY_TABLE_H
 * here.
 */
typedef struct
{
	uint32		first;			/* Unicode codepoint */
	uint32		last;			/* Unicode codepoint */
	uint8		category;		/* General Category */
} pg_category_range;

/* table of Unicode codepoint ranges and their categories */
static const pg_category_range unicode_categories[$num_ranges] =
{
HEADER

print $OT join(",\n", @table_rows), "\n};\n";

# Buffered write errors (for example a full disk) only surface at close.
close($OT)
  or die "Could not close output file $output_table_file: $!\n";
@ -0,0 +1,46 @@ |
|||||||
|
#!/usr/bin/perl |
||||||
|
# |
||||||
|
# Generate header file with Unicode version used by Postgres. |
||||||
|
# |
||||||
|
# Output: unicode_version.h |
||||||
|
# |
||||||
|
# Copyright (c) 2000-2023, PostgreSQL Global Development Group |
||||||
|
|
||||||
|
use strict; |
||||||
|
use warnings; |
||||||
|
use Getopt::Long; |
||||||
|
|
||||||
|
use FindBin; |
||||||
|
use lib "$FindBin::RealBin/../../tools/"; |
||||||
|
|
||||||
|
# Directory in which unicode_version.h is written.
my $output_path = '.';

# Full Unicode version given on the command line, e.g. "15.1.0".
my $version_str = undef;

GetOptions('outdir:s' => \$output_path, 'version:s' => \$version_str);

# Without this check a missing or malformed --version would only produce
# "uninitialized value" warnings and silently write version "0.0".
die "usage: $0 [--outdir DIR] --version MAJOR.MINOR[.UPDATE]\n"
  unless defined($version_str)
  && $version_str =~ /\A(\d+)\.(\d+)(?:\.\d+)*\z/;

# Only the major and minor components are recorded in the header.
my $unicode_version_str = sprintf "%d.%d", $1, $2;

my $output_file = "$output_path/unicode_version.h";

# Start writing out the output files
open(my $OT, '>', $output_file)
  or die "Could not open output file $output_file: $!\n";

print $OT <<"HEADER";
/*-------------------------------------------------------------------------
 *
 * unicode_version.h
 *	  Unicode version used by Postgres.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_version.h
 *
 *-------------------------------------------------------------------------
 */

#define PG_UNICODE_VERSION		"$unicode_version_str"
HEADER

# Buffered write errors (for example a full disk) only surface at close.
close($OT)
  or die "Could not close output file $output_file: $!\n";
@ -0,0 +1,195 @@ |
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
* unicode_category.c |
||||||
|
* Determine general category of Unicode characters. |
||||||
|
* |
||||||
|
* Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group |
||||||
|
* |
||||||
|
* IDENTIFICATION |
||||||
|
* src/common/unicode_category.c |
||||||
|
* |
||||||
|
*------------------------------------------------------------------------- |
||||||
|
*/ |
||||||
|
#ifndef FRONTEND |
||||||
|
#include "postgres.h" |
||||||
|
#else |
||||||
|
#include "postgres_fe.h" |
||||||
|
#endif |
||||||
|
|
||||||
|
#include "common/unicode_category.h" |
||||||
|
#include "common/unicode_category_table.h" |
||||||
|
|
||||||
|
/*
|
||||||
|
* Unicode general category for the given codepoint. |
||||||
|
*/ |
||||||
|
pg_unicode_category |
||||||
|
unicode_category(pg_wchar ucs) |
||||||
|
{ |
||||||
|
int min = 0; |
||||||
|
int mid; |
||||||
|
int max = lengthof(unicode_categories) - 1; |
||||||
|
|
||||||
|
Assert(ucs >= unicode_categories[0].first && |
||||||
|
ucs <= unicode_categories[max].last); |
||||||
|
|
||||||
|
while (max >= min) |
||||||
|
{ |
||||||
|
mid = (min + max) / 2; |
||||||
|
if (ucs > unicode_categories[mid].last) |
||||||
|
min = mid + 1; |
||||||
|
else if (ucs < unicode_categories[mid].first) |
||||||
|
max = mid - 1; |
||||||
|
else |
||||||
|
return unicode_categories[mid].category; |
||||||
|
} |
||||||
|
|
||||||
|
Assert(false); |
||||||
|
return (pg_unicode_category) - 1; |
||||||
|
} |
||||||
|
|
||||||
|
/*
|
||||||
|
* Description of Unicode general category. |
||||||
|
*/ |
||||||
|
const char * |
||||||
|
unicode_category_string(pg_unicode_category category) |
||||||
|
{ |
||||||
|
switch (category) |
||||||
|
{ |
||||||
|
case PG_U_UNASSIGNED: |
||||||
|
return "Unassigned"; |
||||||
|
case PG_U_UPPERCASE_LETTER: |
||||||
|
return "Uppercase_Letter"; |
||||||
|
case PG_U_LOWERCASE_LETTER: |
||||||
|
return "Lowercase_Letter"; |
||||||
|
case PG_U_TITLECASE_LETTER: |
||||||
|
return "Titlecase_Letter"; |
||||||
|
case PG_U_MODIFIER_LETTER: |
||||||
|
return "Modifier_Letter"; |
||||||
|
case PG_U_OTHER_LETTER: |
||||||
|
return "Other_Letter"; |
||||||
|
case PG_U_NONSPACING_MARK: |
||||||
|
return "Nonspacing_Mark"; |
||||||
|
case PG_U_ENCLOSING_MARK: |
||||||
|
return "Enclosing_Mark"; |
||||||
|
case PG_U_SPACING_MARK: |
||||||
|
return "Spacing_Mark"; |
||||||
|
case PG_U_DECIMAL_NUMBER: |
||||||
|
return "Decimal_Number"; |
||||||
|
case PG_U_LETTER_NUMBER: |
||||||
|
return "Letter_Number"; |
||||||
|
case PG_U_OTHER_NUMBER: |
||||||
|
return "Other_Number"; |
||||||
|
case PG_U_SPACE_SEPARATOR: |
||||||
|
return "Space_Separator"; |
||||||
|
case PG_U_LINE_SEPARATOR: |
||||||
|
return "Line_Separator"; |
||||||
|
case PG_U_PARAGRAPH_SEPARATOR: |
||||||
|
return "Paragraph_Separator"; |
||||||
|
case PG_U_CONTROL: |
||||||
|
return "Control"; |
||||||
|
case PG_U_FORMAT: |
||||||
|
return "Format"; |
||||||
|
case PG_U_PRIVATE_USE: |
||||||
|
return "Private_Use"; |
||||||
|
case PG_U_SURROGATE: |
||||||
|
return "Surrogate"; |
||||||
|
case PG_U_DASH_PUNCTUATION: |
||||||
|
return "Dash_Punctuation"; |
||||||
|
case PG_U_OPEN_PUNCTUATION: |
||||||
|
return "Open_Punctuation"; |
||||||
|
case PG_U_CLOSE_PUNCTUATION: |
||||||
|
return "Close_Punctuation"; |
||||||
|
case PG_U_CONNECTOR_PUNCTUATION: |
||||||
|
return "Connector_Punctuation"; |
||||||
|
case PG_U_OTHER_PUNCTUATION: |
||||||
|
return "Other_Punctuation"; |
||||||
|
case PG_U_MATH_SYMBOL: |
||||||
|
return "Math_Symbol"; |
||||||
|
case PG_U_CURRENCY_SYMBOL: |
||||||
|
return "Currency_Symbol"; |
||||||
|
case PG_U_MODIFIER_SYMBOL: |
||||||
|
return "Modifier_Symbol"; |
||||||
|
case PG_U_OTHER_SYMBOL: |
||||||
|
return "Other_Symbol"; |
||||||
|
case PG_U_INITIAL_PUNCTUATION: |
||||||
|
return "Initial_Punctuation"; |
||||||
|
case PG_U_FINAL_PUNCTUATION: |
||||||
|
return "Final_Punctuation"; |
||||||
|
} |
||||||
|
|
||||||
|
Assert(false); |
||||||
|
return "Unrecognized"; /* keep compiler quiet */ |
||||||
|
} |
||||||
|
|
||||||
|
/*
|
||||||
|
* Short code for Unicode general category. |
||||||
|
*/ |
||||||
|
const char * |
||||||
|
unicode_category_abbrev(pg_unicode_category category) |
||||||
|
{ |
||||||
|
switch (category) |
||||||
|
{ |
||||||
|
case PG_U_UNASSIGNED: |
||||||
|
return "Cn"; |
||||||
|
case PG_U_UPPERCASE_LETTER: |
||||||
|
return "Lu"; |
||||||
|
case PG_U_LOWERCASE_LETTER: |
||||||
|
return "Ll"; |
||||||
|
case PG_U_TITLECASE_LETTER: |
||||||
|
return "Lt"; |
||||||
|
case PG_U_MODIFIER_LETTER: |
||||||
|
return "Lm"; |
||||||
|
case PG_U_OTHER_LETTER: |
||||||
|
return "Lo"; |
||||||
|
case PG_U_NONSPACING_MARK: |
||||||
|
return "Mn"; |
||||||
|
case PG_U_ENCLOSING_MARK: |
||||||
|
return "Me"; |
||||||
|
case PG_U_SPACING_MARK: |
||||||
|
return "Mc"; |
||||||
|
case PG_U_DECIMAL_NUMBER: |
||||||
|
return "Nd"; |
||||||
|
case PG_U_LETTER_NUMBER: |
||||||
|
return "Nl"; |
||||||
|
case PG_U_OTHER_NUMBER: |
||||||
|
return "No"; |
||||||
|
case PG_U_SPACE_SEPARATOR: |
||||||
|
return "Zs"; |
||||||
|
case PG_U_LINE_SEPARATOR: |
||||||
|
return "Zl"; |
||||||
|
case PG_U_PARAGRAPH_SEPARATOR: |
||||||
|
return "Zp"; |
||||||
|
case PG_U_CONTROL: |
||||||
|
return "Cc"; |
||||||
|
case PG_U_FORMAT: |
||||||
|
return "Cf"; |
||||||
|
case PG_U_PRIVATE_USE: |
||||||
|
return "Co"; |
||||||
|
case PG_U_SURROGATE: |
||||||
|
return "Cs"; |
||||||
|
case PG_U_DASH_PUNCTUATION: |
||||||
|
return "Pd"; |
||||||
|
case PG_U_OPEN_PUNCTUATION: |
||||||
|
return "Ps"; |
||||||
|
case PG_U_CLOSE_PUNCTUATION: |
||||||
|
return "Pe"; |
||||||
|
case PG_U_CONNECTOR_PUNCTUATION: |
||||||
|
return "Pc"; |
||||||
|
case PG_U_OTHER_PUNCTUATION: |
||||||
|
return "Po"; |
||||||
|
case PG_U_MATH_SYMBOL: |
||||||
|
return "Sm"; |
||||||
|
case PG_U_CURRENCY_SYMBOL: |
||||||
|
return "Sc"; |
||||||
|
case PG_U_MODIFIER_SYMBOL: |
||||||
|
return "Sk"; |
||||||
|
case PG_U_OTHER_SYMBOL: |
||||||
|
return "So"; |
||||||
|
case PG_U_INITIAL_PUNCTUATION: |
||||||
|
return "Pi"; |
||||||
|
case PG_U_FINAL_PUNCTUATION: |
||||||
|
return "Pf"; |
||||||
|
} |
||||||
|
|
||||||
|
Assert(false); |
||||||
|
return "??"; /* keep compiler quiet */ |
||||||
|
} |
@ -0,0 +1,68 @@ |
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
* |
||||||
|
* unicode_category.h |
||||||
|
* Routines for determining the category of Unicode characters. |
||||||
|
* |
||||||
|
* These definitions can be used by both frontend and backend code. |
||||||
|
* |
||||||
|
* Copyright (c) 2017-2023, PostgreSQL Global Development Group |
||||||
|
* |
||||||
|
* src/include/common/unicode_category.h |
||||||
|
* |
||||||
|
*------------------------------------------------------------------------- |
||||||
|
*/ |
||||||
|
#ifndef UNICODE_CATEGORY_H |
||||||
|
#define UNICODE_CATEGORY_H |
||||||
|
|
||||||
|
#include "mb/pg_wchar.h" |
||||||
|
|
||||||
|
/*
|
||||||
|
* Unicode General Category Values |
||||||
|
* |
||||||
|
* See: https://www.unicode.org/reports/tr44/#General_Category_Values
|
||||||
|
* |
||||||
|
* The Unicode stability policy guarantees: "The enumeration of |
||||||
|
* General_Category property values is fixed. No new values will be |
||||||
|
* added". See: https://www.unicode.org/policies/stability_policy.html |
||||||
|
* |
||||||
|
* Numeric values chosen to match corresponding ICU UCharCategory. |
||||||
|
*/ |
||||||
|
typedef enum pg_unicode_category
{
	/* Other: unassigned */
	PG_U_UNASSIGNED = 0,		/* Cn: Unassigned */

	/* Letters */
	PG_U_UPPERCASE_LETTER = 1,	/* Lu: Uppercase_Letter */
	PG_U_LOWERCASE_LETTER = 2,	/* Ll: Lowercase_Letter */
	PG_U_TITLECASE_LETTER = 3,	/* Lt: Titlecase_Letter */
	PG_U_MODIFIER_LETTER = 4,	/* Lm: Modifier_Letter */
	PG_U_OTHER_LETTER = 5,		/* Lo: Other_Letter */

	/* Marks */
	PG_U_NONSPACING_MARK = 6,	/* Mn: Nonspacing_Mark */
	PG_U_ENCLOSING_MARK = 7,	/* Me: Enclosing_Mark */
	PG_U_SPACING_MARK = 8,		/* Mc: Spacing_Mark */

	/* Numbers */
	PG_U_DECIMAL_NUMBER = 9,	/* Nd: Decimal_Number */
	PG_U_LETTER_NUMBER = 10,	/* Nl: Letter_Number */
	PG_U_OTHER_NUMBER = 11,		/* No: Other_Number */

	/* Separators */
	PG_U_SPACE_SEPARATOR = 12,	/* Zs: Space_Separator */
	PG_U_LINE_SEPARATOR = 13,	/* Zl: Line_Separator */
	PG_U_PARAGRAPH_SEPARATOR = 14,	/* Zp: Paragraph_Separator */

	/* Other: control, format, private use, surrogate */
	PG_U_CONTROL = 15,			/* Cc: Control */
	PG_U_FORMAT = 16,			/* Cf: Format */
	PG_U_PRIVATE_USE = 17,		/* Co: Private_Use */
	PG_U_SURROGATE = 18,		/* Cs: Surrogate */

	/* Punctuation */
	PG_U_DASH_PUNCTUATION = 19, /* Pd: Dash_Punctuation */
	PG_U_OPEN_PUNCTUATION = 20, /* Ps: Open_Punctuation */
	PG_U_CLOSE_PUNCTUATION = 21,	/* Pe: Close_Punctuation */
	PG_U_CONNECTOR_PUNCTUATION = 22,	/* Pc: Connector_Punctuation */
	PG_U_OTHER_PUNCTUATION = 23,	/* Po: Other_Punctuation */

	/* Symbols */
	PG_U_MATH_SYMBOL = 24,		/* Sm: Math_Symbol */
	PG_U_CURRENCY_SYMBOL = 25,	/* Sc: Currency_Symbol */
	PG_U_MODIFIER_SYMBOL = 26,	/* Sk: Modifier_Symbol */
	PG_U_OTHER_SYMBOL = 27,		/* So: Other_Symbol */

	/* Punctuation (continued; numbered after the symbols) */
	PG_U_INITIAL_PUNCTUATION = 28,	/* Pi: Initial_Punctuation */
	PG_U_FINAL_PUNCTUATION = 29 /* Pf: Final_Punctuation */
} pg_unicode_category;
||||||
|
|
||||||
|
/* General category of the given Unicode code point. */
extern pg_unicode_category unicode_category(pg_wchar ucs);

/* Descriptive name of a general category, e.g. "Uppercase_Letter". */
extern const char *unicode_category_string(pg_unicode_category category);

/* Two-letter short code of a general category, e.g. "Lu". */
extern const char *unicode_category_abbrev(pg_unicode_category category);
||||||
|
|
||||||
|
#endif /* UNICODE_CATEGORY_H */ |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,14 @@ |
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
* |
||||||
|
* unicode_version.h |
||||||
|
* Unicode version used by Postgres. |
||||||
|
* |
||||||
|
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
||||||
|
* Portions Copyright (c) 1994, Regents of the University of California |
||||||
|
* |
||||||
|
* src/include/common/unicode_version.h |
||||||
|
* |
||||||
|
*------------------------------------------------------------------------- |
||||||
|
*/ |
||||||
|
|
||||||
|
#define PG_UNICODE_VERSION "15.1" |
Loading…
Reference in new issue