mirror of https://github.com/postgres/postgres
It required some changes in the lexize algorithm, but the interface with dictionaries stays compatible with old dictionaries. Funded by Georgia Public Library Service and LibLime, Inc.
Branch: REL8_2_STABLE
parent 3b7ed9ba9c
commit 22505f4703
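
For context: the new thesaurus dictionary is initialized with two options, DictFile (the thesaurus file to load) and Dictionary (the subdictionary used to normalize lexemes), as parsed in thesaurus_init() below. A hypothetical option string might look like this (the file name and subdictionary name are placeholders, and the exact quoting depends on parse_cfgdict()):

    DictFile="thesaurus_sample", Dictionary="en_stem"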

contrib/tsearch2/dict_thesaurus.c
@@ -0,0 +1,743 @@
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.1 2006/05/31 14:05:31 teodor Exp $ */

/*
 * thesaurus
 * Teodor Sigaev <teodor@sigaev.ru>
 */
#include "postgres.h"
#include "executor/spi.h"

#include <ctype.h>

#include "dict.h"
#include "common.h"
#include "ts_locale.h"

typedef struct LexemeInfo {
    uint16      idsubst;        /* entry's number in DictThesaurus->subst */
    uint16      posinsubst;     /* pos info in entry */
    uint16      tnvariant;      /* total num lexemes in one variant */
    struct LexemeInfo  *nextentry;
    struct LexemeInfo  *nextvariant;
} LexemeInfo;

typedef struct {
    char       *lexeme;
    LexemeInfo *entries;
} TheLexeme;

typedef struct {
    uint16      lastlexeme;     /* number lexemes to substitute */
    uint16      reslen;
    TSLexeme   *res;            /* prepared substituted result */
} TheSubstitute;

typedef struct
{
    /* subdictionary to normalize lexemes */
    DictInfo    subdict;

    /* Array to search lexeme by exact match */
    TheLexeme  *wrds;
    int         nwrds;
    int         ntwrds;

    /* Storage of substituted result, n-th element is for
       n-th expression */
    TheSubstitute  *subst;
    int         nsubst;
} DictThesaurus;
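
/*
 * How these pieces fit together (summary of the code below): thesaurusRead()
 * fills wrds[] with the raw left-hand-side words and subst[] with the raw
 * right-hand-side words of each thesaurus line; compileTheLexeme() and
 * compileTheSubstitute() then run both sides through the subdictionary, after
 * which wrds[] is a sorted, de-duplicated array searched by binary search and
 * each LexemeInfo chain records in which substitution (idsubst) and at which
 * position (posinsubst) a lexeme occurs.
 */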

PG_FUNCTION_INFO_V1(thesaurus_init);
Datum       thesaurus_init(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(thesaurus_lexize);
Datum       thesaurus_lexize(PG_FUNCTION_ARGS);

static void
freeDictThesaurus(DictThesaurus * d)
{
    free(d);
}

static void
newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst ) {
    TheLexeme  *ptr;

    if ( d->nwrds >= d->ntwrds ) {
        if ( d->ntwrds == 0 ) {
            d->ntwrds = 16;
            d->wrds = (TheLexeme*)malloc(sizeof(TheLexeme) * d->ntwrds);
        } else {
            d->ntwrds *= 2;
            d->wrds = (TheLexeme*)realloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
        }
        if (!d->wrds)
            elog(ERROR,"Out of memory");
    }

    ptr = d->wrds + d->nwrds;
    d->nwrds++;

    if ( (ptr->lexeme = malloc(e-b+1)) == NULL )
        elog(ERROR,"Out of memory");

    memcpy(ptr->lexeme, b, e-b);
    ptr->lexeme[e-b] = '\0';

    if ( (ptr->entries = (LexemeInfo*)malloc( sizeof(LexemeInfo) ))==NULL )
        elog(ERROR,"Out of memory");

    ptr->entries->nextentry=NULL;
    ptr->entries->idsubst = idsubst;
    ptr->entries->posinsubst = posinsubst;
}

static void
addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst ) {
    static int  nres=0;
    static int  ntres = 0;
    TheSubstitute  *ptr;

    if ( nwrd == 0 ) {
        nres = ntres = 0;

        if ( idsubst >= d->nsubst ) {
            if ( d->nsubst == 0 ) {
                d->nsubst = 16;
                d->subst = (TheSubstitute*)malloc(sizeof(TheSubstitute) * d->nsubst);
            } else {
                d->nsubst *= 2;
                d->subst = (TheSubstitute*)realloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
            }
            if (!d->subst)
                elog(ERROR,"Out of memory");
        }
    }

    ptr = d->subst + idsubst;

    ptr->lastlexeme = posinsubst-1;

    if ( nres+1 >= ntres ) {
        if ( ntres == 0 ) {
            ntres = 2;
            ptr->res = (TSLexeme*)malloc( sizeof(TSLexeme) * ntres );
        } else {
            ntres *= 2;
            ptr->res = (TSLexeme*)realloc( ptr->res, sizeof(TSLexeme) * ntres );
        }

        if ( !ptr->res )
            elog(ERROR,"Out of memory");
    }

    if ( (ptr->res[ nres ].lexeme = malloc(e-b+1))==0 )
        elog(ERROR,"Out of memory");
    memcpy(ptr->res[ nres ].lexeme, b, e-b);
    ptr->res[ nres ].lexeme[e-b] = '\0';

    ptr->res[ nres ].nvariant = nwrd;
    ptr->res[ nres ].flags = TSL_ADDPOS;

    ptr->res[ ++nres ].lexeme = NULL;
}

#define TR_WAITLEX  1
#define TR_INLEX    2
#define TR_WAITSUBS 3
#define TR_INSUBS   4
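
/*
 * States of the line parser in thesaurusRead() below: TR_WAITLEX - skipping
 * whitespace before a lexeme on the left-hand side of ':', TR_INLEX - inside
 * such a lexeme, TR_WAITSUBS - skipping whitespace before a word of the
 * substitution on the right-hand side, TR_INSUBS - inside a substitution word.
 */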

static void
thesaurusRead( char *filename, DictThesaurus *d ) {
    FILE   *fh;
    char    str[BUFSIZ];
    int     lineno=0;
    uint16  idsubst = 0;

    fh = fopen(to_absfilename(filename), "r");
    if (!fh)
        elog(ERROR,"Thesaurus: can't open '%s' file", filename);

    while( fgets(str, sizeof(str), fh)) {
        char   *ptr = str;
        int     state = TR_WAITLEX;
        char   *beginwrd = NULL;
        uint16  posinsubst=0;
        uint16  nwrd=0;

        lineno++;

        /* is it a comment? */
        while( t_isspace(ptr) )
            ptr += pg_mblen(ptr);
        if ( t_iseq(str, '#') || *str=='\0' || t_iseq(str, '\n') || t_iseq(str, '\r') )
            continue;

        pg_verifymbstr(ptr, strlen(ptr), false);
        while(*ptr) {
            if ( state == TR_WAITLEX ) {
                if ( t_iseq(ptr, ':' ) ) {
                    if ( posinsubst == 0 ) {
                        fclose(fh);
                        elog(ERROR, "Thesaurus: Unexpected delimiter at %d line", lineno);
                    }
                    state = TR_WAITSUBS;
                } else if ( !t_isspace(ptr) ) {
                    beginwrd = ptr;
                    state = TR_INLEX;
                }
            } else if ( state == TR_INLEX ) {
                if ( t_iseq(ptr, ':') ) {
                    newLexeme( d, beginwrd, ptr, idsubst, posinsubst++ );
                    state = TR_WAITSUBS;
                } else if ( t_isspace(ptr) ) {
                    newLexeme( d, beginwrd, ptr, idsubst, posinsubst++ );
                    state = TR_WAITLEX;
                }
            } else if ( state == TR_WAITSUBS ) {
                if ( !t_isspace(ptr) ) {
                    beginwrd = ptr;
                    state = TR_INSUBS;
                }
            } else if ( state == TR_INSUBS ) {
                if ( t_isspace(ptr) ) {
                    addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );
                    state = TR_WAITSUBS;
                }
            } else
                elog(ERROR,"Thesaurus: Unknown state: %d", state);

            ptr += pg_mblen(ptr);
        }

        if ( state == TR_INSUBS )
            addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );

        idsubst++;

        if ( !(nwrd && posinsubst) ) {
            fclose(fh);
            elog(ERROR, "Thesaurus: Unexpected end of line at %d line", lineno);
        }

    }

    d->nsubst = idsubst;

    fclose(fh);
}

static TheLexeme*
addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo* src, uint16 tnvariant) {

    if ( *nnw >= *tnm ) {
        *tnm *= 2;
        newwrds = (TheLexeme*)realloc( newwrds, sizeof(TheLexeme) * *tnm);
        if (!newwrds)
            elog(ERROR,"Out of memory");
    }

    newwrds[ *nnw ].entries = (LexemeInfo*)malloc( sizeof(LexemeInfo) );
    if (!newwrds[ *nnw ].entries)
        elog(ERROR,"Out of memory");

    if ( lexeme && lexeme->lexeme ) {
        newwrds[ *nnw ].lexeme = strdup( lexeme->lexeme );
        if ( !newwrds[ *nnw ].lexeme )
            elog(ERROR,"Out of memory");

        newwrds[ *nnw ].entries->tnvariant = tnvariant;
    } else {
        newwrds[ *nnw ].lexeme = NULL;
        newwrds[ *nnw ].entries->tnvariant = 1;
    }

    newwrds[ *nnw ].entries->idsubst = src->idsubst;
    newwrds[ *nnw ].entries->posinsubst = src->posinsubst;

    newwrds[ *nnw ].entries->nextentry = NULL;

    (*nnw)++;
    return newwrds;
}

static int
cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b) {
    if ( a==NULL || b==NULL )
        return 0;

    if ( a->idsubst == b->idsubst ) {
        if ( a->posinsubst == b->posinsubst ) {
            if ( a->tnvariant == b->tnvariant )
                return 0;

            return ( a->tnvariant > b->tnvariant ) ? 1 : -1;
        }

        return ( a->posinsubst > b->posinsubst ) ? 1 : -1;
    }

    return ( a->idsubst > b->idsubst ) ? 1 : -1;
}

static int
cmpLexeme(TheLexeme *a, TheLexeme* b) {
    if ( a->lexeme == NULL ) {
        if ( b->lexeme == NULL )
            return 0;
        else
            return 1;
    } else if ( b->lexeme == NULL )
        return -1;

    return strcmp( a->lexeme, b->lexeme );
}

static int
cmpLexemeQ(const void *a, const void *b) {
    return cmpLexeme( (TheLexeme*)a, (TheLexeme*)b );
}

static int cmpTheLexeme(const void *a, const void *b) {
    TheLexeme  *la = (TheLexeme*)a;
    TheLexeme  *lb = (TheLexeme*)b;
    int         res;

    if ( (res=cmpLexeme(la, lb)) != 0 )
        return res;

    return -cmpLexemeInfo(la->entries, lb->entries);
}

static void
compileTheLexeme(DictThesaurus *d) {
    int         i,nnw=0, tnm=16;
    TheLexeme  *newwrds = (TheLexeme*)malloc(sizeof(TheLexeme)*tnm), *ptrwrds;

    if (!newwrds)
        elog(ERROR,"Out of memory");

    for(i=0;i<d->nwrds;i++) {
        TSLexeme   *ptr = (TSLexeme*) DatumGetPointer(
                        FunctionCall4(
                            &(d->subdict.lexize_info),
                            PointerGetDatum(d->subdict.dictionary),
                            PointerGetDatum(d->wrds[i].lexeme),
                            Int32GetDatum(strlen(d->wrds[i].lexeme)),
                            PointerGetDatum(NULL)
                        )
                    );

        if ( !(ptr && ptr->lexeme) ) {
            newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
            elog(NOTICE,"Thesaurus: word '%s' isn't recognized by subdictionary or it's a stop-word, assign any non-recognized word", d->wrds[i].lexeme);
        } else {
            while( ptr->lexeme ) {
                TSLexeme   *remptr = ptr+1;
                int         tnvar = 1;
                int         curvar = ptr->nvariant;

                /* compute n words in one variant */
                while( remptr->lexeme ) {
                    if ( remptr->nvariant != (remptr-1)->nvariant )
                        break;
                    tnvar++;
                    remptr++;
                }

                remptr = ptr;
                while( remptr->lexeme && remptr->nvariant == curvar ) {
                    newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
                    remptr++;
                }

                ptr = remptr;
            }
        }

        free( d->wrds[i].lexeme );
        free( d->wrds[i].entries );
    }

    free( d->wrds );
    d->wrds = newwrds;
    d->nwrds = nnw;
    d->ntwrds = tnm;

    if ( d->nwrds > 1 ) {
        qsort( d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme );

        /* uniq */
        newwrds = d->wrds;
        ptrwrds = d->wrds + 1;
        while( ptrwrds - d->wrds < d->nwrds ) {
            if ( cmpLexeme( ptrwrds, newwrds ) == 0 ) {
                if ( cmpLexemeInfo(ptrwrds->entries, newwrds->entries) ) {
                    ptrwrds->entries->nextentry = newwrds->entries;
                    newwrds->entries = ptrwrds->entries;
                } else
                    free( ptrwrds->entries );

                if ( ptrwrds->lexeme )
                    free( ptrwrds->lexeme );
            } else {
                newwrds++;
                *newwrds = *ptrwrds;
            }

            ptrwrds++;
        }

        d->nwrds = newwrds - d->wrds + 1;
        d->wrds = (TheLexeme*)realloc( d->wrds, sizeof(TheLexeme) * d->nwrds );
    }
}

static void
compileTheSubstitute(DictThesaurus *d) {
    int     i;

    for(i=0;i<d->nsubst;i++) {
        TSLexeme   *rem = d->subst[i].res, *outptr, *inptr;
        int         n=2;

        outptr = d->subst[i].res = (TSLexeme*)malloc( sizeof(TSLexeme) * n );
        if ( d->subst[i].res == NULL )
            elog(ERROR,"Out of Memory");
        outptr->lexeme = NULL;
        inptr = rem;

        while( inptr && inptr->lexeme ) {
            TSLexeme   *reml, *lexized = (TSLexeme*) DatumGetPointer(
                            FunctionCall4(
                                &(d->subdict.lexize_info),
                                PointerGetDatum(d->subdict.dictionary),
                                PointerGetDatum(inptr->lexeme),
                                Int32GetDatum(strlen(inptr->lexeme)),
                                PointerGetDatum(NULL)
                            )
                        );

            reml = lexized;
            if ( lexized ) {
                int     toset = (lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res) : -1;

                while( lexized->lexeme ) {
                    if ( outptr - d->subst[i].res + 1 >= n ) {
                        int     diff = outptr - d->subst[i].res;
                        n *= 2;
                        d->subst[i].res = (TSLexeme*)realloc( d->subst[i].res, sizeof(TSLexeme) * n );
                        if ( d->subst[i].res == NULL )
                            elog(ERROR,"Out of Memory");
                        outptr = d->subst[i].res + diff;
                    }

                    *outptr = *lexized;
                    if ( (outptr->lexeme = strdup(lexized->lexeme)) == NULL )
                        elog(ERROR,"Out of Memory");

                    outptr++;
                    lexized++;
                }

                if ( toset > 0)
                    d->subst[i].res[toset].flags |= TSL_ADDPOS;
            }

            if ( inptr->lexeme )
                free( inptr->lexeme );
            inptr++;
        }

        d->subst[i].reslen = outptr - d->subst[i].res;

        free(rem);
    }
}

Datum
thesaurus_init(PG_FUNCTION_ARGS)
{
    DictThesaurus  *d;
    Map            *cfg,
                   *pcfg;
    text           *in, *subdictname=NULL;
    bool            fileloaded = false;

    if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("Thesaurus configuration error")));

    d = (DictThesaurus *) malloc(sizeof(DictThesaurus));
    if (!d)
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of memory")));
    memset(d, 0, sizeof(DictThesaurus));

    in = PG_GETARG_TEXT_P(0);
    parse_cfgdict(in, &cfg);
    PG_FREE_IF_COPY(in, 0);
    pcfg = cfg;
    while (pcfg->key)
    {
        if (pg_strcasecmp("DictFile", pcfg->key) == 0)
        {
            if (fileloaded)
            {
                freeDictThesaurus(d);
                ereport(ERROR,
                        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                         errmsg("Thesaurus file is already loaded")));
            }
            fileloaded = true;
            thesaurusRead( pcfg->value, d );
        }
        else if (pg_strcasecmp("Dictionary", pcfg->key) == 0)
        {
            if (subdictname)
            {
                freeDictThesaurus(d);
                ereport(ERROR,
                        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                         errmsg("Thesaurus: SubDictionary is already defined")));
            }
            subdictname = char2text( pcfg->value );
        }
        else
        {
            freeDictThesaurus(d);
            ereport(ERROR,
                    (errcode(ERRCODE_SYNTAX_ERROR),
                     errmsg("unrecognized option: %s => %s",
                            pcfg->key, pcfg->value)));
        }
        pfree(pcfg->key);
        pfree(pcfg->value);
        pcfg++;
    }
    pfree(cfg);

    if (!fileloaded)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("Thesaurus file isn't defined")));

    if ( subdictname ) {
        DictInfo   *subdictptr;

        /*
         * we are already in SPI, but name2id_dict()/finddict()
         * invoke SPI_connect()
         */
        SPI_push();

        subdictptr = finddict( name2id_dict( subdictname ) );

        SPI_pop();

        d->subdict = *subdictptr;
    } else
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("Thesaurus: SubDictionary isn't defined")));

    compileTheLexeme( d );
    compileTheSubstitute(d);

    PG_RETURN_POINTER(d);
}

static LexemeInfo*
findTheLexeme(DictThesaurus *d, char * lexeme) {
    TheLexeme   key = { lexeme, NULL }, *res;

    if ( d->nwrds == 0 )
        return NULL;

    res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);

    if ( res == NULL )
        return NULL;
    return res->entries;
}

static bool
matchIdSubst(LexemeInfo *stored, uint16 idsubst) {
    bool    res = true;

    if (stored) {
        res = false;

        for(; stored; stored=stored->nextvariant)
            if ( stored->idsubst == idsubst ) {
                res = true;
                break;
            }
    }

    return res;
}

static LexemeInfo*
findVariant( LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn) {
    for(;;) {
        int         i;
        LexemeInfo *ptr = newin[0];

        for(i=0; i<newn; i++) {
            while(newin[i] && newin[i]->idsubst < ptr->idsubst)
                newin[i] = newin[i]->nextentry;

            if ( newin[i] == NULL )
                return in;

            if ( newin[i]->idsubst > ptr->idsubst ) {
                ptr = newin[i];
                i=-1;
                continue;
            }

            while(newin[i]->idsubst == ptr->idsubst) {
                if ( newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn ) {
                    ptr = newin[i];
                    break;
                }

                newin[i] = newin[i]->nextentry;
                if ( newin[i] == NULL )
                    return in;
            }

            if ( newin[i]->idsubst != ptr->idsubst ) {
                ptr = newin[i];
                i=-1;
                continue;
            }
        }

        if ( i==newn && matchIdSubst(stored, ptr->idsubst) && (in==NULL || !matchIdSubst(in, ptr->idsubst)) ) { /* found */
            ptr->nextvariant = in;
            in = ptr;
        }

        /* step forward */
        for(i=0; i<newn; i++)
            newin[i] = newin[i]->nextentry;
    }

    return NULL;
}

static TSLexeme*
copyTSLexeme( TheSubstitute *ts ) {
    TSLexeme   *res;
    uint16      i;

    res = (TSLexeme*)palloc( sizeof(TSLexeme) * (ts->reslen+1) );
    for(i=0;i<ts->reslen;i++) {
        res[i] = ts->res[i];
        res[i].lexeme = pstrdup( ts->res[i].lexeme );
    }

    res[ts->reslen].lexeme = NULL;

    return res;
}

static TSLexeme*
checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres) {
    *moreres = false;
    while(info) {
        Assert( info->idsubst < d->nsubst );
        if ( info->nextvariant )
            *moreres = true;
        if ( d->subst[ info->idsubst ].lastlexeme == curpos )
            return copyTSLexeme( d->subst + info->idsubst );
        info = info->nextvariant;
    }

    return NULL;
}

Datum
thesaurus_lexize(PG_FUNCTION_ARGS)
{
    DictThesaurus  *d = (DictThesaurus *) PG_GETARG_POINTER(0);
    DictSubState   *dstate = (DictSubState*)PG_GETARG_POINTER(3);
    TSLexeme       *res=NULL;
    LexemeInfo     *stored, *info = NULL;
    uint16          curpos = 0;
    bool            moreres = false;

    if ( dstate == NULL || PG_NARGS() < 4 )
        elog(ERROR,"Forbidden call of thesaurus or nested call");

    if ( dstate->isend )
        PG_RETURN_POINTER(NULL);
    stored = (LexemeInfo*) dstate->private;

    if (stored)
        curpos = stored->posinsubst+1;

    res =(TSLexeme*) DatumGetPointer (
            FunctionCall4(
                &(d->subdict.lexize_info),
                PointerGetDatum(d->subdict.dictionary),
                PG_GETARG_DATUM(1),
                PG_GETARG_INT32(2),
                PointerGetDatum(NULL)
            )
        );

    if ( res && res->lexeme ) {
        TSLexeme   *ptr = res , *basevar;

        while( ptr->lexeme ) {
            uint16      nv = ptr->nvariant;
            uint16      i,nlex = 0;
            LexemeInfo **infos;

            basevar = ptr;
            while( ptr->lexeme && nv == ptr->nvariant ) {
                nlex++;
                ptr++;
            }

            infos = (LexemeInfo**)palloc(sizeof(LexemeInfo*)*nlex);
            for(i=0;i<nlex;i++)
                if ( (infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL )
                    break;

            if ( i<nlex ) {
                /* no chance to find */
                pfree( infos );
                continue;
            }

            info = findVariant( info, stored, curpos, infos, nlex);
        }

    } else {
        LexemeInfo *infos = findTheLexeme(d, NULL);
        info = findVariant( NULL, stored, curpos, &infos, 1);
    }

    dstate->private = (void*)info;

    if ( !info ) {
        dstate->getnext = false;
        PG_RETURN_POINTER(NULL);
    }

    if ( (res=checkMatch(d, info, curpos,&moreres)) != NULL ) {
        dstate->getnext = moreres;
        PG_RETURN_POINTER(res);
    }

    dstate->getnext = true;

    PG_RETURN_POINTER(NULL);
}

@@ -0,0 +1,19 @@
#
# Thesaurus config file. The character ':' splits each line into two parts:
#   the phrase to be substituted
#   the substituting phrase
#

#one two three : 123
#one two : 12
#one : 1
#two : 2

#foo bar : blah blah
#f bar : fbar
#e bar : ebar
#g bar bar : gbarbar
#asd:sdffff
#qwerty:qwer wert erty
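
As a worked example of the format above (assuming the configured subdictionary normalizes each word to itself; the entry is hypothetical), an uncommented line such as

    supernovae stars : sn

should make the two-word phrase on the left come out of lexizing as the single lexeme on the right, flagged with TSL_ADDPOS, while a phrase that matches only a prefix of an entry is expected to fall back to shorter entries or to the other dictionaries in the configuration.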

@@ -0,0 +1,261 @@
/*
 * lexize stream of lexemes
 * Teodor Sigaev <teodor@sigaev.ru>
 */
#include "postgres.h"

#include <ctype.h>
#include <locale.h>

#include "ts_cfg.h"
#include "dict.h"

void
LexizeInit(LexizeData *ld, TSCfgInfo *cfg) {
    ld->cfg = cfg;
    ld->curDictId = InvalidOid;
    ld->posDict = 0;
    ld->towork.head = ld->towork.tail = ld->curSub = NULL;
    ld->waste.head = ld->waste.tail = NULL;
    ld->lastRes=NULL;
    ld->tmpRes=NULL;
}

static void
LPLAddTail(ListParsedLex *list, ParsedLex *newpl) {
    if ( list->tail ) {
        list->tail->next = newpl;
        list->tail = newpl;
    } else
        list->head = list->tail = newpl;
    newpl->next = NULL;
}

static ParsedLex*
LPLRemoveHead(ListParsedLex *list) {
    ParsedLex  *res = list->head;

    if ( list->head )
        list->head = list->head->next;

    if ( list->head == NULL )
        list->tail = NULL;

    return res;
}


void
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) {
    ParsedLex  *newpl = (ParsedLex*)palloc( sizeof(ParsedLex) );

    newpl->type = type;
    newpl->lemm = lemm;
    newpl->lenlemm = lenlemm;
    LPLAddTail(&ld->towork, newpl);
    ld->curSub = ld->towork.tail;
}

static void
RemoveHead(LexizeData *ld) {
    LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));

    ld->posDict = 0;
}

static void
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem) {
    if ( correspondLexem ) {
        *correspondLexem = ld->waste.head;
    } else {
        ParsedLex  *tmp, *ptr = ld->waste.head;

        while(ptr) {
            tmp = ptr->next;
            pfree(ptr);
            ptr = tmp;
        }
    }
    ld->waste.head = ld->waste.tail = NULL;
}

static void
moveToWaste(LexizeData *ld, ParsedLex *stop) {
    bool    go = true;

    while( ld->towork.head && go) {
        if (ld->towork.head == stop) {
            ld->curSub = stop->next;
            go = false;
        }
        RemoveHead(ld);
    }
}

static void
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res) {
    if ( ld->tmpRes ) {
        TSLexeme   *ptr;
        for( ptr=ld->tmpRes; ptr->lexeme; ptr++ )
            pfree( ptr->lexeme );
        pfree( ld->tmpRes );
    }
    ld->tmpRes = res;
    ld->lastRes = lex;
}

TSLexeme*
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) {
    int              i;
    ListDictionary  *map;
    DictInfo        *dict;
    TSLexeme        *res;

    if ( ld->curDictId == InvalidOid ) {
        /*
         * usual mode: the dictionary wants only one word,
         * but we should keep in mind that we should go through
         * the whole stack
         */

        while( ld->towork.head ) {
            ParsedLex  *curVal = ld->towork.head;

            map = ld->cfg->map + curVal->type;

            if (curVal->type == 0 || curVal->type >= ld->cfg->len || map->len == 0 ) {
                /* skip this type of lexeme */
                RemoveHead(ld);
                continue;
            }

            for (i = ld->posDict; i < map->len; i++) {
                dict = finddict(DatumGetObjectId(map->dict_id[i]));

                ld->dictState.isend = ld->dictState.getnext = false;
                ld->dictState.private = NULL;
                res = (TSLexeme *) DatumGetPointer( FunctionCall4(
                        &(dict->lexize_info),
                        PointerGetDatum(dict->dictionary),
                        PointerGetDatum(curVal->lemm),
                        Int32GetDatum(curVal->lenlemm),
                        PointerGetDatum(&ld->dictState)
                    ));

                if ( ld->dictState.getnext ) {
                    /*
                     * the dictionary wants the next word, so set up and store
                     * the current position and go to multiword mode
                     */

                    ld->curDictId = DatumGetObjectId(map->dict_id[i]);
                    ld->posDict = i+1;
                    ld->curSub = curVal->next;
                    if ( res )
                        setNewTmpRes(ld, curVal, res);
                    return LexizeExec(ld, correspondLexem);
                }

                if (!res)   /* dictionary doesn't know this lexeme */
                    continue;

                RemoveHead(ld);
                setCorrLex(ld, correspondLexem);
                return res;
            }

            RemoveHead(ld);
        }

    } else { /* curDictId is valid */
        dict = finddict(ld->curDictId);

        /*
         * Dictionary ld->curDictId asks us about the following words
         */

        while( ld->curSub ) {
            ParsedLex  *curVal = ld->curSub;

            map = ld->cfg->map + curVal->type;

            if (curVal->type != 0) {
                bool    dictExists = false;

                if (curVal->type >= ld->cfg->len || map->len == 0 ) {
                    /* skip this type of lexeme */
                    ld->curSub = curVal->next;
                    continue;
                }

                /*
                 * We should be sure that the current type of lexeme is
                 * recognized by our dictionary: we just check whether it
                 * exists in the list of dictionaries
                 */
                for(i=0;i < map->len && !dictExists; i++)
                    if ( ld->curDictId == DatumGetObjectId(map->dict_id[i]) )
                        dictExists = true;

                if ( !dictExists ) {
                    /*
                     * The dictionary can't work with the current type of
                     * lexeme; return to basic mode and redo all stored lexemes
                     */
                    ld->curDictId = InvalidOid;
                    return LexizeExec(ld, correspondLexem);
                }
            }

            ld->dictState.isend = (curVal->type==0) ? true : false;
            ld->dictState.getnext = false;

            res = (TSLexeme *) DatumGetPointer( FunctionCall4(
                    &(dict->lexize_info),
                    PointerGetDatum(dict->dictionary),
                    PointerGetDatum(curVal->lemm),
                    Int32GetDatum(curVal->lenlemm),
                    PointerGetDatum(&ld->dictState)
                ));

            if ( ld->dictState.getnext ) {
                /* Dictionary wants one more */
                ld->curSub = curVal->next;
                if ( res )
                    setNewTmpRes(ld, curVal, res);
                continue;
            }

            if ( res || ld->tmpRes ) {
                /*
                 * The dictionary normalized the lexemes, so we remove all used
                 * lexemes from the stack, return to basic mode and redo the
                 * end of the stack (if it exists)
                 */
                if ( res ) {
                    moveToWaste( ld, ld->curSub );
                } else {
                    res = ld->tmpRes;
                    moveToWaste( ld, ld->lastRes );
                }

                /* reset to initial state */
                ld->curDictId = InvalidOid;
                ld->posDict = 0;
                ld->lastRes = NULL;
                ld->tmpRes = NULL;
                setCorrLex(ld, correspondLexem);
                return res;
            }

            /* The dictionary doesn't want the next lexeme and didn't recognize
               anything, redo from ld->towork.head */
            ld->curDictId = InvalidOid;
            return LexizeExec(ld, correspondLexem);
        }

    }

    setCorrLex(ld, correspondLexem);
    return NULL;
}
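
/*
 * A minimal caller sketch (not part of this commit) of how a consumer such as
 * a to_tsvector-style parser loop might drive LexizeInit/LexizeAddLemm/
 * LexizeExec. The parser and storage helpers declared below are hypothetical
 * placeholders; only the Lexize* calls come from the code above.
 */
extern int  hypothetical_getlexeme(char **txt, int *len, char **lemm, int *lenlemm);
extern void hypothetical_store(TSLexeme *norms);

static void
lexize_stream_sketch(TSCfgInfo *cfg, char *txt, int len)
{
    LexizeData  ldata;
    int         type;
    char       *lemm;
    int         lenlemm;
    TSLexeme   *norms;

    LexizeInit(&ldata, cfg);

    while ((type = hypothetical_getlexeme(&txt, &len, &lemm, &lenlemm)) > 0)
    {
        /* push the parsed lexeme onto the work queue ... */
        LexizeAddLemm(&ldata, type, lemm, lenlemm);

        /* ... and drain whatever the dictionaries produce for it */
        while ((norms = LexizeExec(&ldata, NULL)) != NULL)
            hypothetical_store(norms);
    }

    /* a type-0 lexeme acts as an end marker for a dictionary in multiword mode */
    LexizeAddLemm(&ldata, 0, NULL, 0);
    while ((norms = LexizeExec(&ldata, NULL)) != NULL)
        hypothetical_store(norms);
}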