mirror of https://github.com/postgres/postgres
This modernized version of Soundex works significantly better than the original, particularly for non-English names.
Dag Lem, reviewed by quite a few people along the way
Discussion: https://postgr.es/m/yger1atbgfy.fsf@sid.nimrod.no
pull/137/head
parent
728015a470
commit
a290378a37
@ -0,0 +1,577 @@ |
||||
/*
|
||||
* Daitch-Mokotoff Soundex |
||||
* |
||||
* Copyright (c) 2023, PostgreSQL Global Development Group |
||||
* |
||||
* This module was originally sponsored by Finance Norway / |
||||
* Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no> |
||||
* |
||||
* The implementation of the Daitch-Mokotoff Soundex System aims at correctness |
||||
* and high performance, and can be summarized as follows: |
||||
* |
||||
* - The processing of each phoneme is initiated by an O(1) table lookup. |
||||
* - For phonemes containing more than one character, a coding tree is traversed |
||||
* to process the complete phoneme. |
||||
* - The (alternate) soundex codes are produced digit by digit in-place in |
||||
* another tree structure. |
||||
* |
||||
* References: |
||||
* |
||||
* https://www.avotaynu.com/soundex.htm
|
||||
* https://www.jewishgen.org/InfoFiles/Soundex.html
|
||||
* https://familypedia.fandom.com/wiki/Daitch-Mokotoff_Soundex
|
||||
* https://stevemorse.org/census/soundex.html (dmlat.php, dmsoundex.php)
|
||||
* https://github.com/apache/commons-codec/ (dmrules.txt, DaitchMokotoffSoundex.java)
|
||||
* https://metacpan.org/pod/Text::Phonetic (DaitchMokotoff.pm)
|
||||
* |
||||
* A few notes on other implementations: |
||||
* |
||||
* - All other known implementations have the same unofficial rules for "UE", |
||||
 *   these are also adopted by this implementation (0, 1, NC).
||||
* - The only other known implementation which is capable of generating all |
||||
* correct soundex codes in all cases is the JOS Soundex Calculator at |
||||
* https://www.jewishgen.org/jos/jossound.htm
|
||||
* - "J" is considered (only) a vowel in dmlat.php |
||||
* - The official rules for "RS" are commented out in dmlat.php |
||||
* - Identical code digits for adjacent letters are not collapsed correctly in |
||||
* dmsoundex.php when double digit codes are involved. E.g. "BESST" yields |
||||
* 744300 instead of 743000 as for "BEST". |
||||
* - "J" is considered (only) a consonant in DaitchMokotoffSoundex.java |
||||
* - "Y" is not considered a vowel in DaitchMokotoffSoundex.java |
||||
*/ |
||||
|
||||
#include "postgres.h" |
||||
|
||||
#include "catalog/pg_type.h" |
||||
#include "mb/pg_wchar.h" |
||||
#include "utils/array.h" |
||||
#include "utils/builtins.h" |
||||
#include "utils/memutils.h" |
||||
|
||||
|
||||
/*
|
||||
* The soundex coding chart table is adapted from |
||||
* https://www.jewishgen.org/InfoFiles/Soundex.html
|
||||
* See daitch_mokotoff_header.pl for details. |
||||
*/ |
||||
|
||||
/* Generated coding chart table */ |
||||
#include "daitch_mokotoff.h" |
||||
|
||||
/* Number of digits in a complete Daitch-Mokotoff soundex code. */
#define DM_CODE_DIGITS 6

/*
 * Node in soundex code tree.
 *
 * Each node represents one (partial) soundex code.  A child is reached by
 * appending one code digit to the parent's code.  Note that "soundex" holds
 * exactly DM_CODE_DIGITS characters and is NOT NUL-terminated; it must always
 * be handled together with an explicit length.
 */
typedef struct dm_node
{
	int			soundex_length; /* Length of generated soundex code */
	char		soundex[DM_CODE_DIGITS];	/* Soundex code */
	int			is_leaf;		/* Candidate for complete soundex code */
	int			last_update;	/* Letter number for last update of node */
	char		code_digit;		/* Last code digit, 0 - 9 */

	/*
	 * One or two alternate code digits leading to this node. If there are two
	 * digits, one of them is always an 'X'. Repeated code digits and 'X' lead
	 * back to the same node.
	 */
	char		prev_code_digits[2];
	/* One or two alternate code digits moving forward. */
	char		next_code_digits[2];
	/* ORed together code index(es) used to reach current node. */
	int			prev_code_index;
	int			next_code_index;
	/* Possible nodes branching out from this node - digits 0-9. */
	struct dm_node *children[10];
	/* Next node in linked list. Alternating index for each iteration. */
	struct dm_node *next[2];
} dm_node;
||||
|
||||
/* Template for new node in soundex code tree. */ |
||||
static const dm_node start_node = { |
||||
.soundex_length = 0, |
||||
.soundex = "000000", /* Six digits */ |
||||
.is_leaf = 0, |
||||
.last_update = 0, |
||||
.code_digit = '\0', |
||||
.prev_code_digits = {'\0', '\0'}, |
||||
.next_code_digits = {'\0', '\0'}, |
||||
.prev_code_index = 0, |
||||
.next_code_index = 0, |
||||
.children = {NULL}, |
||||
.next = {NULL} |
||||
}; |
||||
|
||||
/* Dummy soundex codes at end of input. */ |
||||
static const dm_codes end_codes[2] = |
||||
{ |
||||
{ |
||||
"X", "X", "X" |
||||
} |
||||
}; |
||||
|
||||
/*
 * Mapping from ISO8859-1 to upper-case ASCII, covering the range 0x60..0xFF.
 *
 * The table is indexed by (code point - 0x60) and must therefore contain
 * exactly 0x100 - 0x60 = 160 entries (plus the implicit terminating NUL).
 * It is written as adjacent string literals, one per code point range, with
 * the runs of blanks split into short counted pieces, so that the length of
 * each run is easy to verify and hard to corrupt.
 */
static const char iso8859_1_to_ascii_upper[] =
	/* 0x60..0x7E: '`', a-z folded to A-Z, '{', '|', '}', '~' (31 entries) */
	"`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~"
	/* 0x7F..0x8F: DEL and control codes (8 + 8 + 1 = 17 entries) */
	"        " "        " " "
	/* 0x90..0x9F: control codes (8 + 8 = 16 entries) */
	"        " "        "
	/* 0xA0: no-break space, 0xA1: inverted exclamation mark (2 entries) */
	" " "!"
	/* 0xA2..0xAF: symbols (8 + 6 = 14 entries) */
	"        " "      "
	/* 0xB0..0xBE: symbols (8 + 7 = 15 entries) */
	"        " "       "
	/* 0xBF: inverted question mark (1 entry) */
	"?"
	/* 0xC0..0xDF: upper-case accented letters, 0xD7 is the multiplication sign */
	"AAAAAAECEEEEIIIIDNOOOOO*OUUUUYDS"
	/* 0xE0..0xFF: lower-case accented letters, 0xF7 is the division sign */
	"AAAAAAECEEEEIIIIDNOOOOO/OUUUUYDY";
||||
|
||||
/* Internal C implementation */ |
||||
static bool daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex); |
||||
|
||||
|
||||
PG_FUNCTION_INFO_V1(daitch_mokotoff); |
||||
|
||||
Datum |
||||
daitch_mokotoff(PG_FUNCTION_ARGS) |
||||
{ |
||||
text *arg = PG_GETARG_TEXT_PP(0); |
||||
Datum retval; |
||||
char *string; |
||||
ArrayBuildState *soundex; |
||||
MemoryContext old_ctx, |
||||
tmp_ctx; |
||||
|
||||
/* Work in a temporary context to simplify cleanup. */ |
||||
tmp_ctx = AllocSetContextCreate(CurrentMemoryContext, |
||||
"daitch_mokotoff temporary context", |
||||
ALLOCSET_DEFAULT_SIZES); |
||||
old_ctx = MemoryContextSwitchTo(tmp_ctx); |
||||
|
||||
/* We must convert the string to UTF-8 if it isn't already. */ |
||||
string = pg_server_to_any(text_to_cstring(arg), VARSIZE_ANY_EXHDR(arg), |
||||
PG_UTF8); |
||||
|
||||
/* The result is built in this ArrayBuildState. */ |
||||
soundex = initArrayResult(TEXTOID, tmp_ctx, false); |
||||
|
||||
if (!daitch_mokotoff_coding(string, soundex)) |
||||
{ |
||||
/* No encodable characters in input */ |
||||
MemoryContextSwitchTo(old_ctx); |
||||
MemoryContextDelete(tmp_ctx); |
||||
PG_RETURN_NULL(); |
||||
} |
||||
|
||||
retval = makeArrayResult(soundex, old_ctx); |
||||
|
||||
MemoryContextSwitchTo(old_ctx); |
||||
MemoryContextDelete(tmp_ctx); |
||||
|
||||
PG_RETURN_DATUM(retval); |
||||
} |
||||
|
||||
|
||||
/* Initialize soundex code tree node for next code digit. */ |
||||
static void |
||||
initialize_node(dm_node *node, int last_update) |
||||
{ |
||||
if (node->last_update < last_update) |
||||
{ |
||||
node->prev_code_digits[0] = node->next_code_digits[0]; |
||||
node->prev_code_digits[1] = node->next_code_digits[1]; |
||||
node->next_code_digits[0] = '\0'; |
||||
node->next_code_digits[1] = '\0'; |
||||
node->prev_code_index = node->next_code_index; |
||||
node->next_code_index = 0; |
||||
node->is_leaf = 0; |
||||
node->last_update = last_update; |
||||
} |
||||
} |
||||
|
||||
|
||||
/* Update soundex code tree node with next code digit. */ |
||||
static void |
||||
add_next_code_digit(dm_node *node, int code_index, char code_digit) |
||||
{ |
||||
/* OR in index 1 or 2. */ |
||||
node->next_code_index |= code_index; |
||||
|
||||
if (!node->next_code_digits[0]) |
||||
node->next_code_digits[0] = code_digit; |
||||
else if (node->next_code_digits[0] != code_digit) |
||||
node->next_code_digits[1] = code_digit; |
||||
} |
||||
|
||||
|
||||
/* Mark soundex code tree node as leaf. */ |
||||
static void |
||||
set_leaf(dm_node *first_node[2], dm_node *last_node[2], |
||||
dm_node *node, int ix_node) |
||||
{ |
||||
if (!node->is_leaf) |
||||
{ |
||||
node->is_leaf = 1; |
||||
|
||||
if (first_node[ix_node] == NULL) |
||||
first_node[ix_node] = node; |
||||
else |
||||
last_node[ix_node]->next[ix_node] = node; |
||||
|
||||
last_node[ix_node] = node; |
||||
node->next[ix_node] = NULL; |
||||
} |
||||
} |
||||
|
||||
|
||||
/* Find next node corresponding to code digit, or create a new node. */ |
||||
static dm_node * |
||||
find_or_create_child_node(dm_node *parent, char code_digit, |
||||
ArrayBuildState *soundex) |
||||
{ |
||||
int i = code_digit - '0'; |
||||
dm_node **nodes = parent->children; |
||||
dm_node *node = nodes[i]; |
||||
|
||||
if (node) |
||||
{ |
||||
/* Found existing child node. Skip completed nodes. */ |
||||
return node->soundex_length < DM_CODE_DIGITS ? node : NULL; |
||||
} |
||||
|
||||
/* Create new child node. */ |
||||
node = palloc_object(dm_node); |
||||
nodes[i] = node; |
||||
|
||||
*node = start_node; |
||||
memcpy(node->soundex, parent->soundex, sizeof(parent->soundex)); |
||||
node->soundex_length = parent->soundex_length; |
||||
node->soundex[node->soundex_length++] = code_digit; |
||||
node->code_digit = code_digit; |
||||
node->next_code_index = node->prev_code_index; |
||||
|
||||
if (node->soundex_length < DM_CODE_DIGITS) |
||||
{ |
||||
return node; |
||||
} |
||||
else |
||||
{ |
||||
/* Append completed soundex code to output array. */ |
||||
text *out = cstring_to_text_with_len(node->soundex, |
||||
DM_CODE_DIGITS); |
||||
|
||||
accumArrayResult(soundex, |
||||
PointerGetDatum(out), |
||||
false, |
||||
TEXTOID, |
||||
CurrentMemoryContext); |
||||
return NULL; |
||||
} |
||||
} |
||||
|
||||
|
||||
/* Update node for next code digit(s). */ |
||||
static void |
||||
update_node(dm_node *first_node[2], dm_node *last_node[2], |
||||
dm_node *node, int ix_node, |
||||
int letter_no, int prev_code_index, int next_code_index, |
||||
const char *next_code_digits, int digit_no, |
||||
ArrayBuildState *soundex) |
||||
{ |
||||
int i; |
||||
char next_code_digit = next_code_digits[digit_no]; |
||||
int num_dirty_nodes = 0; |
||||
dm_node *dirty_nodes[2]; |
||||
|
||||
initialize_node(node, letter_no); |
||||
|
||||
if (node->prev_code_index && !(node->prev_code_index & prev_code_index)) |
||||
{ |
||||
/*
|
||||
* If the sound (vowel / consonant) of this letter encoding doesn't |
||||
* correspond to the coding index of the previous letter, we skip this |
||||
* letter encoding. Note that currently, only "J" can be either a |
||||
* vowel or a consonant. |
||||
*/ |
||||
return; |
||||
} |
||||
|
||||
if (next_code_digit == 'X' || |
||||
(digit_no == 0 && |
||||
(node->prev_code_digits[0] == next_code_digit || |
||||
node->prev_code_digits[1] == next_code_digit))) |
||||
{ |
||||
/* The code digit is the same as one of the previous (i.e. not added). */ |
||||
dirty_nodes[num_dirty_nodes++] = node; |
||||
} |
||||
|
||||
if (next_code_digit != 'X' && |
||||
(digit_no > 0 || |
||||
node->prev_code_digits[0] != next_code_digit || |
||||
node->prev_code_digits[1])) |
||||
{ |
||||
/* The code digit is different from one of the previous (i.e. added). */ |
||||
node = find_or_create_child_node(node, next_code_digit, soundex); |
||||
if (node) |
||||
{ |
||||
initialize_node(node, letter_no); |
||||
dirty_nodes[num_dirty_nodes++] = node; |
||||
} |
||||
} |
||||
|
||||
for (i = 0; i < num_dirty_nodes; i++) |
||||
{ |
||||
/* Add code digit leading to the current node. */ |
||||
add_next_code_digit(dirty_nodes[i], next_code_index, next_code_digit); |
||||
|
||||
if (next_code_digits[++digit_no]) |
||||
{ |
||||
update_node(first_node, last_node, dirty_nodes[i], ix_node, |
||||
letter_no, prev_code_index, next_code_index, |
||||
next_code_digits, digit_no, |
||||
soundex); |
||||
} |
||||
else |
||||
{ |
||||
/* Add incomplete leaf node to linked list. */ |
||||
set_leaf(first_node, last_node, dirty_nodes[i], ix_node); |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
/* Update soundex tree leaf nodes. */ |
||||
static void |
||||
update_leaves(dm_node *first_node[2], int *ix_node, int letter_no, |
||||
const dm_codes *codes, const dm_codes *next_codes, |
||||
ArrayBuildState *soundex) |
||||
{ |
||||
int i, |
||||
j, |
||||
code_index; |
||||
dm_node *node, |
||||
*last_node[2]; |
||||
const dm_code *code, |
||||
*next_code; |
||||
int ix_node_next = (*ix_node + 1) & 1; /* Alternating index: 0, 1 */ |
||||
|
||||
/* Initialize for new linked list of leaves. */ |
||||
first_node[ix_node_next] = NULL; |
||||
last_node[ix_node_next] = NULL; |
||||
|
||||
/* Process all nodes. */ |
||||
for (node = first_node[*ix_node]; node; node = node->next[*ix_node]) |
||||
{ |
||||
/* One or two alternate code sequences. */ |
||||
for (i = 0; i < 2 && (code = codes[i]) && code[0][0]; i++) |
||||
{ |
||||
/* Coding for previous letter - before vowel: 1, all other: 2 */ |
||||
int prev_code_index = (code[0][0] > '1') + 1; |
||||
|
||||
/* One or two alternate next code sequences. */ |
||||
for (j = 0; j < 2 && (next_code = next_codes[j]) && next_code[0][0]; j++) |
||||
{ |
||||
/* Determine which code to use. */ |
||||
if (letter_no == 0) |
||||
{ |
||||
/* This is the first letter. */ |
||||
code_index = 0; |
||||
} |
||||
else if (next_code[0][0] <= '1') |
||||
{ |
||||
/* The next letter is a vowel. */ |
||||
code_index = 1; |
||||
} |
||||
else |
||||
{ |
||||
/* All other cases. */ |
||||
code_index = 2; |
||||
} |
||||
|
||||
/* One or two sequential code digits. */ |
||||
update_node(first_node, last_node, node, ix_node_next, |
||||
letter_no, prev_code_index, code_index, |
||||
code[code_index], 0, |
||||
soundex); |
||||
} |
||||
} |
||||
} |
||||
|
||||
*ix_node = ix_node_next; |
||||
} |
||||
|
||||
|
||||
/*
|
||||
* Return next character, converted from UTF-8 to uppercase ASCII. |
||||
* *ix is the current string index and is incremented by the character length. |
||||
*/ |
||||
static char |
||||
read_char(const unsigned char *str, int *ix) |
||||
{ |
||||
/* Substitute character for skipped code points. */ |
||||
const char na = '\x1a'; |
||||
pg_wchar c; |
||||
|
||||
/* Decode UTF-8 character to ISO 10646 code point. */ |
||||
str += *ix; |
||||
c = utf8_to_unicode(str); |
||||
|
||||
/* Advance *ix, but (for safety) not if we've reached end of string. */ |
||||
if (c) |
||||
*ix += pg_utf_mblen(str); |
||||
|
||||
/* Convert. */ |
||||
if (c >= (unsigned char) '[' && c <= (unsigned char) ']') |
||||
{ |
||||
/* ASCII characters [, \, and ] are reserved for Ą, Ę, and Ţ/Ț. */ |
||||
return na; |
||||
} |
||||
else if (c < 0x60) |
||||
{ |
||||
/* Other non-lowercase ASCII characters can be used as-is. */ |
||||
return (char) c; |
||||
} |
||||
else if (c < 0x100) |
||||
{ |
||||
/* ISO-8859-1 code point; convert to upper-case ASCII via table. */ |
||||
return iso8859_1_to_ascii_upper[c - 0x60]; |
||||
} |
||||
else |
||||
{ |
||||
/* Conversion of non-ASCII characters in the coding chart. */ |
||||
switch (c) |
||||
{ |
||||
case 0x0104: |
||||
case 0x0105: |
||||
/* Ą/ą */ |
||||
return '['; |
||||
case 0x0118: |
||||
case 0x0119: |
||||
/* Ę/ę */ |
||||
return '\\'; |
||||
case 0x0162: |
||||
case 0x0163: |
||||
case 0x021A: |
||||
case 0x021B: |
||||
/* Ţ/ţ or Ț/ț */ |
||||
return ']'; |
||||
default: |
||||
return na; |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
/*
 * Read next ASCII character, skipping any characters not in [A-\]].
 *
 * The accepted range 'A'..']' covers the upper-case letters plus the three
 * substitute characters '[', '\' and ']'.  Returns '\0' at end of string.
 * *ix is advanced past all characters read, including the skipped ones.
 */
static char
read_valid_char(const char *str, int *ix)
{
	char		c;

	while ((c = read_char((const unsigned char *) str, ix)) != '\0')
	{
		if (c >= 'A' && c <= ']')
			break;
	}

	return c;
}
||||
|
||||
|
||||
/* Return sound coding for "letter" (letter sequence) */ |
||||
static const dm_codes * |
||||
read_letter(const char *str, int *ix) |
||||
{ |
||||
char c, |
||||
cmp; |
||||
int i, |
||||
j; |
||||
const dm_letter *letters; |
||||
const dm_codes *codes; |
||||
|
||||
/* First letter in sequence. */ |
||||
if ((c = read_valid_char(str, ix)) == '\0') |
||||
return NULL; |
||||
|
||||
letters = &letter_[c - 'A']; |
||||
codes = letters->codes; |
||||
i = *ix; |
||||
|
||||
/* Any subsequent letters in sequence. */ |
||||
while ((letters = letters->letters) && (c = read_valid_char(str, &i))) |
||||
{ |
||||
for (j = 0; (cmp = letters[j].letter); j++) |
||||
{ |
||||
if (cmp == c) |
||||
{ |
||||
/* Letter found. */ |
||||
letters = &letters[j]; |
||||
if (letters->codes) |
||||
{ |
||||
/* Coding for letter sequence found. */ |
||||
codes = letters->codes; |
||||
*ix = i; |
||||
} |
||||
break; |
||||
} |
||||
} |
||||
if (!cmp) |
||||
{ |
||||
/* The sequence of letters has no coding. */ |
||||
break; |
||||
} |
||||
} |
||||
|
||||
return codes; |
||||
} |
||||
|
||||
|
||||
/*
|
||||
* Generate all Daitch-Mokotoff soundex codes for word, |
||||
* adding them to the "soundex" ArrayBuildState. |
||||
* Returns false if string has no encodable characters, else true. |
||||
*/ |
||||
static bool |
||||
daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex) |
||||
{ |
||||
int i = 0; |
||||
int letter_no = 0; |
||||
int ix_node = 0; |
||||
const dm_codes *codes, |
||||
*next_codes; |
||||
dm_node *first_node[2], |
||||
*node; |
||||
|
||||
/* First letter. */ |
||||
if (!(codes = read_letter(word, &i))) |
||||
{ |
||||
/* No encodable character in input. */ |
||||
return false; |
||||
} |
||||
|
||||
/* Starting point. */ |
||||
first_node[ix_node] = palloc_object(dm_node); |
||||
*first_node[ix_node] = start_node; |
||||
|
||||
/*
|
||||
* Loop until either the word input is exhausted, or all generated soundex |
||||
* codes are completed to six digits. |
||||
*/ |
||||
while (codes && first_node[ix_node]) |
||||
{ |
||||
next_codes = read_letter(word, &i); |
||||
|
||||
/* Update leaf nodes. */ |
||||
update_leaves(first_node, &ix_node, letter_no, |
||||
codes, next_codes ? next_codes : end_codes, |
||||
soundex); |
||||
|
||||
codes = next_codes; |
||||
letter_no++; |
||||
} |
||||
|
||||
/* Append all remaining (incomplete) soundex codes to output array. */ |
||||
for (node = first_node[ix_node]; node; node = node->next[ix_node]) |
||||
{ |
||||
text *out = cstring_to_text_with_len(node->soundex, |
||||
DM_CODE_DIGITS); |
||||
|
||||
accumArrayResult(soundex, |
||||
PointerGetDatum(out), |
||||
false, |
||||
TEXTOID, |
||||
CurrentMemoryContext); |
||||
} |
||||
|
||||
return true; |
||||
} |
@ -0,0 +1,223 @@ |
||||
#!/usr/bin/perl |
||||
# |
||||
# Generation of types and lookup tables for Daitch-Mokotoff soundex. |
||||
# |
||||
# Copyright (c) 2023, PostgreSQL Global Development Group |
||||
# |
||||
# This module was originally sponsored by Finance Norway / |
||||
# Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no> |
||||
# |
||||
|
||||
use strict; |
||||
use warnings; |
||||
|
||||
use utf8; |
||||
use open IO => ':utf8', ':std'; |
||||
use Data::Dumper; |
||||
|
||||
die "Usage: $0 OUTPUT_FILE\n" if @ARGV != 1;
my $output_file = $ARGV[0];

# Open the output file
open my $OUTPUT, '>', $output_file
  or die "Could not open output file $output_file: $!\n";

# Parse code table and generate tree for letter transitions.
#
# %codes maps the name of each generated C array of codes to its initializer.
# $table is the root of the letter tree.  Each tree node is a two-element
# array: [0] is a hash of following characters (or undef if there are none),
# and [1] is the name of the codes array for the sequence ending at this
# node (or undef if that sequence has no coding of its own).
my %codes;
my $table = [ {}, [ [ "", "", "" ] ] ];
while (<DATA>)
{
	chomp;

	# Ignore blank lines.
	next if /^\s*$/;

	# Each line is a comma separated list of letter sequences, followed by
	# one or two '|' separated alternatives of three comma separated codes.
	my ($letters, $codes) = split(/\s+/);
	my @codes = map { [ split(/,/) ] } split(/\|/, $codes);

	my $key = "codes_" . join("_or_", map { join("_", @$_) } @codes);
	my $val = join(
		",\n",
		map {
			"\t{\n\t\t"
			  . join(", ", map { "\"$_\"" } @$_) . "\n\t}"
		} @codes);
	$codes{$key} = $val;

	for my $letter (split(/,/, $letters))
	{
		my $ref = $table->[0];
		# Link each character to the next in the letter combination.
		my @c = split(//, $letter);
		my $last_c = pop(@c);
		for my $c (@c)
		{
			$ref->{$c} //= [ {}, undef ];
			$ref->{$c}[0] //= {};
			$ref = $ref->{$c}[0];
		}
		# The sound code for the letter combination is stored at the last character.
		$ref->{$last_c}[1] = $key;
	}
}
close(DATA);
||||
|
||||
# Emit the file header and the type definitions used by the lookup tables.
print $OUTPUT <<EOF;
/*
 * Constants and lookup tables for Daitch-Mokotoff Soundex
 *
 * Copyright (c) 2023, PostgreSQL Global Development Group
 *
 * This file is generated by daitch_mokotoff_header.pl
 */

/* Coding chart table: Soundex codes */
typedef char dm_code[2 + 1];	/* One or two sequential code digits + NUL */
typedef dm_code dm_codes[3];	/* Start of name, before a vowel, any other */

/* Coding chart table: Letter in input sequence */
struct dm_letter
{
	char		letter;			/* Present letter in sequence */
	const struct dm_letter *letters;	/* List of possible successive letters */
	const		dm_codes *codes;	/* Code sequence(s) for complete sequence */
};

typedef struct dm_letter dm_letter;

/* Codes for letter sequence at start of name, before a vowel, and any other. */
EOF

# Emit one array per distinct set of codes.  Each array has room for two
# alternatives; when only one is given, the second is left zero-initialized.
for my $key (sort keys %codes)
{
	print $OUTPUT "static const dm_codes $key\[2\] =\n{\n"
	  . $codes{$key}
	  . "\n};\n";
}

print $OUTPUT <<EOF;

/* Coding for alternative following letters in sequence. */
EOF
||||
|
||||
# Recursively emit the C lookup table for one node of the letter tree.
#
# $ref is the tree node, and $letter is the sequence of characters leading to
# it (empty for the root).  The tables of all child nodes are emitted first,
# so that every table is defined before it is referenced.  Each table holds
# one entry per possible following character, in sorted order, and is
# terminated by an entry with letter '\0'.
sub hash2code
{
	my ($ref, $letter) = @_;

	my @letters = ();

	my $h = $ref->[0];
	for my $key (sort keys %$h)
	{
		$ref = $h->{$key};
		my $children = "NULL";
		if (defined $ref->[0])
		{
			$children = "letter_$letter$key";
			hash2code($ref, "$letter$key");
		}
		my $codes = $ref->[1] // "NULL";
		push(@letters, "\t{\n\t\t'$key', $children, $codes\n\t}");
	}

	print $OUTPUT "static const dm_letter letter_$letter\[\] =\n{\n";
	for (@letters)
	{
		print $OUTPUT "$_,\n";
	}
	print $OUTPUT "\t{\n\t\t'\\0'\n\t}\n";
	print $OUTPUT "};\n";
}

hash2code($table, '');

# Buffered write errors are only reported when the file is closed.
close $OUTPUT
  or die "Could not close output file $output_file: $!\n";
||||
|
||||
# Table adapted from https://www.jewishgen.org/InfoFiles/Soundex.html |
||||
# |
||||
# The conversion from the coding chart to the table should be self |
||||
# explanatory, but note the differences stated below. |
||||
# |
||||
# X = NC (not coded) |
||||
# |
||||
# The non-ASCII letters in the coding chart are coded with substitute |
||||
# lowercase ASCII letters, which sort after the uppercase ASCII letters: |
||||
# |
||||
# Ą => a (use '[' for table lookup) |
||||
# Ę => e (use '\\' for table lookup) |
||||
# Ţ => t (use ']' for table lookup) |
||||
# |
||||
# The rule for "UE" does not correspond to the coding chart, however |
||||
# it is used by all other known implementations, including the one at |
||||
# https://www.jewishgen.org/jos/jossound.htm (try e.g. "bouey"). |
||||
# |
||||
# Note that the implementation assumes that vowels are assigned code |
||||
# 0 or 1. "J" can be either a vowel or a consonant. |
||||
# |
||||
|
||||
__DATA__ |
||||
AI,AJ,AY 0,1,X |
||||
AU 0,7,X |
||||
a X,X,6|X,X,X |
||||
A 0,X,X |
||||
B 7,7,7 |
||||
CHS 5,54,54 |
||||
CH 5,5,5|4,4,4 |
||||
CK 5,5,5|45,45,45 |
||||
CZ,CS,CSZ,CZS 4,4,4 |
||||
C 5,5,5|4,4,4 |
||||
DRZ,DRS 4,4,4 |
||||
DS,DSH,DSZ 4,4,4 |
||||
DZ,DZH,DZS 4,4,4 |
||||
D,DT 3,3,3 |
||||
EI,EJ,EY 0,1,X |
||||
EU 1,1,X |
||||
e X,X,6|X,X,X |
||||
E 0,X,X |
||||
FB 7,7,7 |
||||
F 7,7,7 |
||||
G 5,5,5 |
||||
H 5,5,X |
||||
IA,IE,IO,IU 1,X,X |
||||
I 0,X,X |
||||
J 1,X,X|4,4,4 |
||||
KS 5,54,54 |
||||
KH 5,5,5 |
||||
K 5,5,5 |
||||
L 8,8,8 |
||||
MN 66,66,66 |
||||
M 6,6,6 |
||||
NM 66,66,66 |
||||
N 6,6,6 |
||||
OI,OJ,OY 0,1,X |
||||
O 0,X,X |
||||
P,PF,PH 7,7,7 |
||||
Q 5,5,5 |
||||
RZ,RS 94,94,94|4,4,4 |
||||
R 9,9,9 |
||||
SCHTSCH,SCHTSH,SCHTCH 2,4,4 |
||||
SCH 4,4,4 |
||||
SHTCH,SHCH,SHTSH 2,4,4 |
||||
SHT,SCHT,SCHD 2,43,43 |
||||
SH 4,4,4 |
||||
STCH,STSCH,SC 2,4,4 |
||||
STRZ,STRS,STSH 2,4,4 |
||||
ST 2,43,43 |
||||
SZCZ,SZCS 2,4,4 |
||||
SZT,SHD,SZD,SD 2,43,43 |
||||
SZ 4,4,4 |
||||
S 4,4,4 |
||||
TCH,TTCH,TTSCH 4,4,4 |
||||
TH 3,3,3 |
||||
TRZ,TRS 4,4,4 |
||||
TSCH,TSH 4,4,4 |
||||
TS,TTS,TTSZ,TC 4,4,4 |
||||
TZ,TTZ,TZS,TSZ 4,4,4 |
||||
t 3,3,3|4,4,4 |
||||
T 3,3,3 |
||||
UI,UJ,UY,UE 0,1,X |
||||
U 0,X,X |
||||
V 7,7,7 |
||||
W 7,7,7 |
||||
X 5,54,54 |
||||
Y 1,X,X |
||||
ZDZ,ZDZH,ZHDZH 2,4,4 |
||||
ZD,ZHD 2,43,43 |
||||
ZH,ZS,ZSCH,ZSH 4,4,4 |
||||
Z 4,4,4 |
@ -0,0 +1,61 @@ |
||||
/* |
||||
* This test must be run in a database with UTF-8 encoding, |
||||
* because other encodings don't support all the characters used. |
||||
*/ |
||||
SELECT getdatabaseencoding() <> 'UTF8' |
||||
AS skip_test \gset |
||||
\if :skip_test |
||||
\quit |
||||
\endif |
||||
set client_encoding = utf8; |
||||
-- CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; |
||||
-- Accents |
||||
SELECT daitch_mokotoff('Müller'); |
||||
daitch_mokotoff |
||||
----------------- |
||||
{689000} |
||||
(1 row) |
||||
|
||||
SELECT daitch_mokotoff('Schäfer'); |
||||
daitch_mokotoff |
||||
----------------- |
||||
{479000} |
||||
(1 row) |
||||
|
||||
SELECT daitch_mokotoff('Straßburg'); |
||||
daitch_mokotoff |
||||
----------------- |
||||
{294795} |
||||
(1 row) |
||||
|
||||
SELECT daitch_mokotoff('Éregon'); |
||||
daitch_mokotoff |
||||
----------------- |
||||
{095600} |
||||
(1 row) |
||||
|
||||
-- Special characters added at https://www.jewishgen.org/InfoFiles/Soundex.html |
||||
SELECT daitch_mokotoff('gąszczu'); |
||||
daitch_mokotoff |
||||
----------------- |
||||
{564000,540000} |
||||
(1 row) |
||||
|
||||
SELECT daitch_mokotoff('brzęczy'); |
||||
daitch_mokotoff |
||||
------------------------------- |
||||
{794640,794400,746400,744000} |
||||
(1 row) |
||||
|
||||
SELECT daitch_mokotoff('ţamas'); |
||||
daitch_mokotoff |
||||
----------------- |
||||
{364000,464000} |
||||
(1 row) |
||||
|
||||
SELECT daitch_mokotoff('țamas'); |
||||
daitch_mokotoff |
||||
----------------- |
||||
{364000,464000} |
||||
(1 row) |
||||
|
@ -0,0 +1,8 @@ |
||||
/* |
||||
* This test must be run in a database with UTF-8 encoding, |
||||
* because other encodings don't support all the characters used. |
||||
*/ |
||||
SELECT getdatabaseencoding() <> 'UTF8' |
||||
AS skip_test \gset |
||||
\if :skip_test |
||||
\quit |
@ -0,0 +1,8 @@ |
||||
/* contrib/fuzzystrmatch/fuzzystrmatch--1.1--1.2.sql */ |
||||
|
||||
-- complain if script is sourced in psql, rather than via ALTER EXTENSION |
||||
\echo Use "ALTER EXTENSION fuzzystrmatch UPDATE TO '1.2'" to load this file. \quit |
||||
|
||||
-- Daitch-Mokotoff soundex: returns all soundex codes for the argument.
CREATE FUNCTION daitch_mokotoff(text) RETURNS text[]
AS 'MODULE_PATHNAME', 'daitch_mokotoff'
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
@ -1,6 +1,6 @@ |
||||
# fuzzystrmatch extension |
||||
comment = 'determine similarities and distance between strings' |
||||
default_version = '1.1' |
||||
default_version = '1.2' |
||||
module_pathname = '$libdir/fuzzystrmatch' |
||||
relocatable = true |
||||
trusted = true |
||||
|
@ -0,0 +1,26 @@ |
||||
/* |
||||
* This test must be run in a database with UTF-8 encoding, |
||||
* because other encodings don't support all the characters used. |
||||
*/ |
||||
|
||||
SELECT getdatabaseencoding() <> 'UTF8' |
||||
AS skip_test \gset |
||||
\if :skip_test |
||||
\quit |
||||
\endif |
||||
|
||||
set client_encoding = utf8; |
||||
|
||||
-- CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; |
||||
|
||||
-- Accents |
||||
SELECT daitch_mokotoff('Müller'); |
||||
SELECT daitch_mokotoff('Schäfer'); |
||||
SELECT daitch_mokotoff('Straßburg'); |
||||
SELECT daitch_mokotoff('Éregon'); |
||||
|
||||
-- Special characters added at https://www.jewishgen.org/InfoFiles/Soundex.html |
||||
SELECT daitch_mokotoff('gąszczu'); |
||||
SELECT daitch_mokotoff('brzęczy'); |
||||
SELECT daitch_mokotoff('ţamas'); |
||||
SELECT daitch_mokotoff('țamas'); |
Loading…
Reference in new issue