|
|
|
@ -23,30 +23,29 @@ |
|
|
|
|
PG_MODULE_MAGIC; |
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Unaccent dictionary uses uncompressed suffix tree to find a |
|
|
|
|
* character to replace. Each node of tree is an array of |
|
|
|
|
* SuffixChar struct with length = 256 (n-th element of array |
|
|
|
|
* Unaccent dictionary uses a trie to find a character to replace. Each node of |
|
|
|
|
* the trie is an array of 256 TrieChar structs (n-th element of array |
|
|
|
|
* corresponds to the byte with value n) |
|
|
|
|
*/ |
|
|
|
|
/*
 * One slot of a trie node.  This view interleaves the old name (SuffixChar)
 * and the new name (TrieChar) of the same struct; the fields are identical.
 */
typedef struct SuffixChar |
|
|
|
|
typedef struct TrieChar |
|
|
|
|
{ |
|
|
|
|
/*
 * Link to another node of the same struct type.  Presumably the 256-slot
 * array used for the following byte of a multibyte character; the code that
 * assigns it is not visible in this view -- confirm in placeChar().
 */
struct SuffixChar *nextChar; |
|
|
|
|
struct TrieChar *nextChar; |
|
|
|
|
/*
 * Replacement text for the character ending at this slot.
 * unaccent_lexize() tests this pointer for non-NULL after a lookup before
 * acting on the slot.
 */
char *replaceTo; |
|
|
|
|
/* Presumably the length of replaceTo in bytes -- TODO confirm in placeChar(). */
int replacelen; |
|
|
|
|
} SuffixChar; |
|
|
|
|
} TrieChar; |
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* placeChar - put str into tree's structure, byte by byte. |
|
|
|
|
* placeChar - put str into trie's structure, byte by byte. |
|
|
|
|
*/ |
|
|
|
|
static SuffixChar * |
|
|
|
|
placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen) |
|
|
|
|
static TrieChar * |
|
|
|
|
placeChar(TrieChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen) |
|
|
|
|
{ |
|
|
|
|
SuffixChar *curnode; |
|
|
|
|
TrieChar *curnode; |
|
|
|
|
|
|
|
|
|
if (!node) |
|
|
|
|
{ |
|
|
|
|
node = palloc(sizeof(SuffixChar) * 256); |
|
|
|
|
memset(node, 0, sizeof(SuffixChar) * 256); |
|
|
|
|
node = palloc(sizeof(TrieChar) * 256); |
|
|
|
|
memset(node, 0, sizeof(TrieChar) * 256); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
curnode = node + *str; |
|
|
|
@ -71,13 +70,14 @@ placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* initSuffixTree - create suffix tree from file. Function converts |
|
|
|
|
* UTF8-encoded file into current encoding. |
|
|
|
|
* initTrie - create trie from file. |
|
|
|
|
* |
|
|
|
|
* Function converts UTF8-encoded file into current encoding. |
|
|
|
|
*/ |
|
|
|
|
static SuffixChar * |
|
|
|
|
initSuffixTree(char *filename) |
|
|
|
|
static TrieChar * |
|
|
|
|
initTrie(char *filename) |
|
|
|
|
{ |
|
|
|
|
SuffixChar *volatile rootSuffixTree = NULL; |
|
|
|
|
TrieChar *volatile rootTrie = NULL; |
|
|
|
|
MemoryContext ccxt = CurrentMemoryContext; |
|
|
|
|
tsearch_readline_state trst; |
|
|
|
|
volatile bool skip; |
|
|
|
@ -161,7 +161,7 @@ initSuffixTree(char *filename) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (state >= 3) |
|
|
|
|
rootSuffixTree = placeChar(rootSuffixTree, |
|
|
|
|
rootTrie = placeChar(rootTrie, |
|
|
|
|
(unsigned char *) src, srclen, |
|
|
|
|
trg, trglen); |
|
|
|
|
|
|
|
|
@ -192,14 +192,14 @@ initSuffixTree(char *filename) |
|
|
|
|
|
|
|
|
|
tsearch_readline_end(&trst); |
|
|
|
|
|
|
|
|
|
return rootSuffixTree; |
|
|
|
|
return rootTrie; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* findReplaceTo - find multibyte character in tree |
|
|
|
|
* findReplaceTo - find multibyte character in trie |
|
|
|
|
*/ |
|
|
|
|
static SuffixChar * |
|
|
|
|
findReplaceTo(SuffixChar *node, unsigned char *src, int srclen) |
|
|
|
|
static TrieChar * |
|
|
|
|
findReplaceTo(TrieChar *node, unsigned char *src, int srclen) |
|
|
|
|
{ |
|
|
|
|
while (node) |
|
|
|
|
{ |
|
|
|
@ -221,7 +221,7 @@ Datum |
|
|
|
|
unaccent_init(PG_FUNCTION_ARGS) |
|
|
|
|
{ |
|
|
|
|
List *dictoptions = (List *) PG_GETARG_POINTER(0); |
|
|
|
|
SuffixChar *rootSuffixTree = NULL; |
|
|
|
|
TrieChar *rootTrie = NULL; |
|
|
|
|
bool fileloaded = false; |
|
|
|
|
ListCell *l; |
|
|
|
|
|
|
|
|
@ -235,7 +235,7 @@ unaccent_init(PG_FUNCTION_ARGS) |
|
|
|
|
ereport(ERROR, |
|
|
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
|
|
|
|
errmsg("multiple Rules parameters"))); |
|
|
|
|
rootSuffixTree = initSuffixTree(defGetString(defel)); |
|
|
|
|
rootTrie = initTrie(defGetString(defel)); |
|
|
|
|
fileloaded = true; |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
@ -254,7 +254,7 @@ unaccent_init(PG_FUNCTION_ARGS) |
|
|
|
|
errmsg("missing Rules parameter"))); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
PG_RETURN_POINTER(rootSuffixTree); |
|
|
|
|
PG_RETURN_POINTER(rootTrie); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
PG_FUNCTION_INFO_V1(unaccent_lexize); |
|
|
|
@ -262,21 +262,21 @@ Datum unaccent_lexize(PG_FUNCTION_ARGS); |
|
|
|
|
Datum |
|
|
|
|
unaccent_lexize(PG_FUNCTION_ARGS) |
|
|
|
|
{ |
|
|
|
|
SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0); |
|
|
|
|
TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0); |
|
|
|
|
char *srcchar = (char *) PG_GETARG_POINTER(1); |
|
|
|
|
int32 len = PG_GETARG_INT32(2); |
|
|
|
|
char *srcstart, |
|
|
|
|
*trgchar = NULL; |
|
|
|
|
int charlen; |
|
|
|
|
TSLexeme *res = NULL; |
|
|
|
|
SuffixChar *node; |
|
|
|
|
TrieChar *node; |
|
|
|
|
|
|
|
|
|
srcstart = srcchar; |
|
|
|
|
while (srcchar - srcstart < len) |
|
|
|
|
{ |
|
|
|
|
charlen = pg_mblen(srcchar); |
|
|
|
|
|
|
|
|
|
node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen); |
|
|
|
|
node = findReplaceTo(rootTrie, (unsigned char *) srcchar, charlen); |
|
|
|
|
if (node && node->replaceTo) |
|
|
|
|
{ |
|
|
|
|
if (!res) |
|
|
|
|