@ -15,7 +15,8 @@
PG_MODULE_MAGIC ;
/* GUC variables */
double similarity_threshold = 0.3f ;
double similarity_threshold = 0.3f ;
double word_similarity_threshold = 0.6f ;
void _PG_init ( void ) ;
@ -23,8 +24,20 @@ PG_FUNCTION_INFO_V1(set_limit);
PG_FUNCTION_INFO_V1 ( show_limit ) ;
PG_FUNCTION_INFO_V1 ( show_trgm ) ;
PG_FUNCTION_INFO_V1 ( similarity ) ;
PG_FUNCTION_INFO_V1 ( word_similarity ) ;
PG_FUNCTION_INFO_V1 ( similarity_dist ) ;
PG_FUNCTION_INFO_V1 ( similarity_op ) ;
PG_FUNCTION_INFO_V1 ( word_similarity_op ) ;
PG_FUNCTION_INFO_V1 ( word_similarity_commutator_op ) ;
PG_FUNCTION_INFO_V1 ( word_similarity_dist_op ) ;
PG_FUNCTION_INFO_V1 ( word_similarity_dist_commutator_op ) ;
/*
 * Trigram with position.
 *
 * Pairs a trigram with an integer position so that trigrams coming from two
 * different arrays can be sorted together without losing track of where the
 * text trigrams came from.  make_positional_trgm() fills "index" with -1 for
 * trigrams of the search pattern and with the array offset for trigrams of
 * the text being searched.
 */
typedef struct
{
/* the trigram itself */
trgm trg ;
/* offset in the text trigram array, or -1 for a pattern trigram */
int index ;
} pos_trgm ;
/*
* Module load callback
@ -45,11 +58,23 @@ _PG_init(void)
NULL ,
NULL ,
NULL ) ;
DefineCustomRealVariable ( " pg_trgm.word_similarity_threshold " ,
" Sets the threshold used by the <%% operator. " ,
" Valid range is 0.0 .. 1.0. " ,
& word_similarity_threshold ,
0.6 ,
0.0 ,
1.0 ,
PGC_USERSET ,
0 ,
NULL ,
NULL ,
NULL ) ;
}
/*
* Deprecated function .
* Use " pg_trgm.similarity_threshold " GUC variable instead of this function
* Use " pg_trgm.similarity_threshold " GUC variable instead of this function .
*/
Datum
set_limit ( PG_FUNCTION_ARGS )
@ -59,14 +84,14 @@ set_limit(PG_FUNCTION_ARGS)
if ( nlimit < 0 | | nlimit > 1.0 )
ereport ( ERROR ,
( errcode ( ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE ) ,
errmsg ( " wrong limi t, should be between 0 and 1 " ) ) ) ;
errmsg ( " wrong threshold , should be between 0 and 1 " ) ) ) ;
similarity_threshold = nlimit ;
PG_RETURN_FLOAT4 ( similarity_threshold ) ;
}
/*
* Deprecated function .
* Use " pg_trgm.similarity_threshold " GUC variable instead of this function
* Use " pg_trgm.similarity_threshold " GUC variable instead of this function .
*/
Datum
show_limit ( PG_FUNCTION_ARGS )
@ -199,38 +224,28 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
return tptr ;
}
TRGM *
generate_trgm ( char * str , int slen )
/*
* Make array of trigrams without sorting and removing duplicate items .
*
* trg : where to return the array of trigrams .
* str : source string , of length slen bytes .
*
* Returns length of the generated array .
*/
static int
generate_trgm_only ( trgm * trg , char * str , int slen )
{
TRGM * trg ;
char * buf ;
trgm * tptr ;
int len ,
charlen ,
char * buf ;
int charlen ,
bytelen ;
char * bword ,
* eword ;
/*
* Guard against possible overflow in the palloc requests below . ( We
* don ' t worry about the additive constants , since palloc can detect
* requests that are a little above MaxAllocSize - - - we just need to
* prevent integer overflow in the multiplications . )
*/
if ( ( Size ) ( slen / 2 ) > = ( MaxAllocSize / ( sizeof ( trgm ) * 3 ) ) | |
( Size ) slen > = ( MaxAllocSize / pg_database_encoding_max_length ( ) ) )
ereport ( ERROR ,
( errcode ( ERRCODE_PROGRAM_LIMIT_EXCEEDED ) ,
errmsg ( " out of memory " ) ) ) ;
trg = ( TRGM * ) palloc ( TRGMHDRSIZE + sizeof ( trgm ) * ( slen / 2 + 1 ) * 3 ) ;
trg - > flag = ARRKEY ;
SET_VARSIZE ( trg , TRGMHDRSIZE ) ;
if ( slen + LPADDING + RPADDING < 3 | | slen = = 0 )
return trg ;
return 0 ;
tptr = GETARR ( trg ) ;
tptr = trg ;
/* Allocate a buffer for case-folded, blank-padded words */
buf = ( char * ) palloc ( slen * pg_database_encoding_max_length ( ) + 4 ) ;
@ -270,7 +285,47 @@ generate_trgm(char *str, int slen)
pfree ( buf ) ;
if ( ( len = tptr - GETARR ( trg ) ) = = 0 )
return tptr - trg ;
}
/*
* Guard against possible overflow in the palloc requests below . ( We
* don ' t worry about the additive constants , since palloc can detect
* requests that are a little above MaxAllocSize - - - we just need to
* prevent integer overflow in the multiplications . )
*/
static void
protect_out_of_mem ( int slen )
{
if ( ( Size ) ( slen / 2 ) > = ( MaxAllocSize / ( sizeof ( trgm ) * 3 ) ) | |
( Size ) slen > = ( MaxAllocSize / pg_database_encoding_max_length ( ) ) )
ereport ( ERROR ,
( errcode ( ERRCODE_PROGRAM_LIMIT_EXCEEDED ) ,
errmsg ( " out of memory " ) ) ) ;
}
/*
* Make array of trigrams with sorting and removing duplicate items .
*
* str : source string , of length slen bytes .
*
* Returns the sorted array of unique trigrams .
*/
TRGM *
generate_trgm ( char * str , int slen )
{
TRGM * trg ;
int len ;
protect_out_of_mem ( slen ) ;
trg = ( TRGM * ) palloc ( TRGMHDRSIZE + sizeof ( trgm ) * ( slen / 2 + 1 ) * 3 ) ;
trg - > flag = ARRKEY ;
len = generate_trgm_only ( GETARR ( trg ) , str , slen ) ;
SET_VARSIZE ( trg , CALCGTSIZE ( ARRKEY , len ) ) ;
if ( len = = 0 )
return trg ;
/*
@ -287,6 +342,285 @@ generate_trgm(char *str, int slen)
return trg ;
}
/*
 * Build one array of positional trigrams out of two plain trigram arrays.
 *
 * trg1: trigrams of the search pattern, len1 entries.  Their positions are
 *		 irrelevant, so each one is stored with index -1.
 * trg2: trigrams of the text (the haystack), len2 entries.  Each one is
 *		 stored with its offset within trg2 so the position survives a later
 *		 sort.
 *
 * Returns a palloc'd array of len1 + len2 entries: the pattern trigrams
 * first, followed by the text trigrams.
 */
static pos_trgm *
make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
{
	int			total = len1 + len2;
	pos_trgm   *out = (pos_trgm *) palloc(sizeof(pos_trgm) * total);
	pos_trgm   *dst = out;
	int			k;

	/* Pattern trigrams: position is not tracked */
	for (k = 0; k < len1; k++, dst++)
	{
		memcpy(&dst->trg, &trg1[k], sizeof(trgm));
		dst->index = -1;
	}

	/* Text trigrams: remember where each one sits in trg2 */
	for (k = 0; k < len2; k++, dst++)
	{
		memcpy(&dst->trg, &trg2[k], sizeof(trgm));
		dst->index = k;
	}

	return out;
}
/*
* Compare position trigrams : compare trigrams first and position second .
*/
static int
comp_ptrgm ( const void * v1 , const void * v2 )
{
const pos_trgm * p1 = ( const pos_trgm * ) v1 ;
const pos_trgm * p2 = ( const pos_trgm * ) v2 ;
int cmp ;
cmp = CMPTRGM ( p1 - > trg , p2 - > trg ) ;
if ( cmp ! = 0 )
return cmp ;
if ( p1 - > index < p2 - > index )
return - 1 ;
else if ( p1 - > index = = p2 - > index )
return 0 ;
else
return 1 ;
}
/*
 * Iterative search function which calculates the maximum similarity between
 * the search pattern and any continuous run of trigrams of the text.  The
 * true maximum is calculated only if check_only == false; otherwise the
 * search stops as soon as word_similarity_threshold is reached.
 *
 * The function scans the text trigrams left to right while maintaining a
 * window [lower, upper] of positions.  "ulen2" is the number of distinct
 * trigrams inside the window and "count" is how many of those also occur in
 * the search pattern.  The window's similarity is CALCSML(count, ulen1, ulen2).
 *
 * trg2indexes: for each position of the text trigram array, the index of
 *				that trigram in the array "found".
 * found: array which stores true or false values; found[k] is true when
 *		  trigram k occurs in the search pattern.
 * ulen1: count of unique trigrams of array "trg1" (the search pattern).
 * len2: length of array "trg2" and array "trg2indexes".
 * len: length of the array "found".
 * check_only: if true then only check existence of a similar search pattern
 *			   in the text.
 *
 * Returns word similarity (0 if no text trigram occurs in the pattern).
 */
static float4
iterate_word_similarity ( int * trg2indexes ,
bool * found ,
int ulen1 ,
int len2 ,
int len ,
bool check_only )
{
int * lastpos ,
i ,
ulen2 = 0 ,
count = 0 ,
upper = - 1 ,
lower = - 1 ;
float4 smlr_cur ,
smlr_max = 0.0f ;
/*
 * Memorise last position of each trigram inside the current window.
 * Filling every byte with 0xFF makes each int entry -1, i.e. "not seen".
 */
lastpos = ( int * ) palloc ( sizeof ( int ) * len ) ;
memset ( lastpos , - 1 , sizeof ( int ) * len ) ;
for ( i = 0 ; i < len2 ; i + + )
{
/* Get index of next trigram */
int trgindex = trg2indexes [ i ] ;
/*
 * Update last position of this trigram.  Trigrams are only tracked once a
 * window is open (lower >= 0) or when this trigram itself belongs to the
 * pattern and is therefore about to open one.
 */
if ( lower > = 0 | | found [ trgindex ] )
{
if ( lastpos [ trgindex ] < 0 )
{
/* First occurrence inside the window: one more distinct trigram */
ulen2 + + ;
if ( found [ trgindex ] )
count + + ;
}
lastpos [ trgindex ] = i ;
}
/* Adjust upper bound if this trigram is present in required substring */
if ( found [ trgindex ] )
{
int prev_lower ,
tmp_ulen2 ,
tmp_lower ,
tmp_count ;
upper = i ;
if ( lower = = - 1 )
{
/* First pattern trigram seen: open the window at this position */
lower = i ;
ulen2 = 1 ;
}
smlr_cur = CALCSML ( count , ulen1 , ulen2 ) ;
/*
 * Also try to adjust lower bound for greater similarity: walk tmp_lower
 * from the current lower bound up to upper, evaluating the similarity of
 * the window [tmp_lower, upper] and keeping the best one found.
 */
tmp_count = count ;
tmp_ulen2 = ulen2 ;
prev_lower = lower ;
for ( tmp_lower = lower ; tmp_lower < = upper ; tmp_lower + + )
{
float smlr_tmp = CALCSML ( tmp_count , ulen1 , tmp_ulen2 ) ;
int tmp_trgindex ;
if ( smlr_tmp > smlr_cur )
{
/* Better window found: commit its bounds and counters */
smlr_cur = smlr_tmp ;
ulen2 = tmp_ulen2 ;
lower = tmp_lower ;
count = tmp_count ;
}
/*
 * If we only check that word similarity is greater than
 * pg_trgm.word_similarity_threshold we do not need to calculate
 * a maximum similarity.
 */
if ( check_only & & smlr_cur > = word_similarity_threshold )
break ;
/*
 * Drop the trigram at tmp_lower from the trial counters, but only if it
 * does not occur again later in the window (its last position is here).
 */
tmp_trgindex = trg2indexes [ tmp_lower ] ;
if ( lastpos [ tmp_trgindex ] = = tmp_lower )
{
tmp_ulen2 - - ;
if ( found [ tmp_trgindex ] )
tmp_count - - ;
}
}
smlr_max = Max ( smlr_max , smlr_cur ) ;
/*
 * If we only check that word similarity is greater than
 * pg_trgm.word_similarity_threshold we do not need to calculate a
 * maximum similarity.
 */
if ( check_only & & smlr_max > = word_similarity_threshold )
break ;
/*
 * The lower bound may have moved forward.  Forget the trigrams whose last
 * occurrence lies in the part of the window that was just dropped, so they
 * count as new if they show up again.
 */
for ( tmp_lower = prev_lower ; tmp_lower < lower ; tmp_lower + + )
{
int tmp_trgindex ;
tmp_trgindex = trg2indexes [ tmp_lower ] ;
if ( lastpos [ tmp_trgindex ] = = tmp_lower )
lastpos [ tmp_trgindex ] = - 1 ;
}
}
}
pfree ( lastpos ) ;
return smlr_max ;
}
/*
 * Calculate word similarity.
 *
 * This function prepares two arrays, "trg2indexes" and "found", which are
 * then used to calculate word similarity using iterate_word_similarity().
 *
 * "trg2indexes" is an array which stores indexes into the array "found".
 * In other words:
 *
 *		trg2indexes[j] = i;
 *		found[i] = true (or false);
 *
 * If found[i] == true then trigram trg2[j] is present in array "trg1".
 * If found[i] == false then trigram trg2[j] is not present in array "trg1".
 *
 * str1: search pattern string, of length slen1 bytes.
 * str2: text in which we are looking for a word, of length slen2 bytes.
 * check_only: if true then only check existence of a similar search pattern
 *			   in the text.
 *
 * Returns word similarity.  All working memory allocated here is released
 * before returning.
 */
static float4
calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
					 bool check_only)
{
	bool	   *found;
	pos_trgm   *ptrg;
	trgm	   *trg1;
	trgm	   *trg2;
	int			len1,
				len2,
				len,
				i,
				j,
				ulen1;
	int		   *trg2indexes;
	float4		result;

	protect_out_of_mem(slen1 + slen2);

	/* Make positional trigrams */
	trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
	trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);

	len1 = generate_trgm_only(trg1, str1, slen1);
	len2 = generate_trgm_only(trg2, str2, slen2);

	ptrg = make_positional_trgm(trg1, len1, trg2, len2);
	len = len1 + len2;

	/*
	 * Sort by trigram, then by position.  Equal trigrams become adjacent,
	 * with the pattern's copy (index -1) ahead of the text's copies.
	 */
	qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);

	pfree(trg1);
	pfree(trg2);

	/*
	 * Merge positional trigrams array: enumerate each trigram and find its
	 * presence in required word.
	 */
	trg2indexes = (int *) palloc(sizeof(int) * len2);
	found = (bool *) palloc0(sizeof(bool) * len);

	ulen1 = 0;
	j = 0;
	for (i = 0; i < len; i++)
	{
		if (i > 0)
		{
			int			cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);

			if (cmp != 0)
			{
				/* A new distinct trigram starts; close out the previous one */
				if (found[j])
					ulen1++;
				j++;
			}
		}

		if (ptrg[i].index >= 0)
		{
			/* Text trigram: record which distinct trigram sits there */
			trg2indexes[ptrg[i].index] = j;
		}
		else
		{
			/* Pattern trigram: mark this distinct trigram as required */
			found[j] = true;
		}
	}

	/*
	 * Account for the last distinct trigram.  When neither string produced
	 * any trigram, "found" is a zero-length allocation and must not be read.
	 */
	if (len > 0 && found[j])
		ulen1++;

	/* Run iterative procedure to find maximum similarity with word */
	result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
									 check_only);

	pfree(trg2indexes);
	pfree(found);
	pfree(ptrg);

	return result;
}
/*
* Extract the next non - wildcard part of a search string , ie , a word bounded
* by ' _ ' or ' % ' meta - characters , non - word characters or string end .
@ -459,17 +793,7 @@ generate_wildcard_trgm(const char *str, int slen)
bytelen ;
const char * eword ;
/*
* Guard against possible overflow in the palloc requests below . ( We
* don ' t worry about the additive constants , since palloc can detect
* requests that are a little above MaxAllocSize - - - we just need to
* prevent integer overflow in the multiplications . )
*/
if ( ( Size ) ( slen / 2 ) > = ( MaxAllocSize / ( sizeof ( trgm ) * 3 ) ) | |
( Size ) slen > = ( MaxAllocSize / pg_database_encoding_max_length ( ) ) )
ereport ( ERROR ,
( errcode ( ERRCODE_PROGRAM_LIMIT_EXCEEDED ) ,
errmsg ( " out of memory " ) ) ) ;
protect_out_of_mem ( slen ) ;
trg = ( TRGM * ) palloc ( TRGMHDRSIZE + sizeof ( trgm ) * ( slen / 2 + 1 ) * 3 ) ;
trg - > flag = ARRKEY ;
@ -590,7 +914,7 @@ show_trgm(PG_FUNCTION_ARGS)
}
float4
cnt_sml ( TRGM * trg1 , TRGM * trg2 )
cnt_sml ( TRGM * trg1 , TRGM * trg2 , bool inexact )
{
trgm * ptr1 ,
* ptr2 ;
@ -624,14 +948,15 @@ cnt_sml(TRGM *trg1, TRGM *trg2)
}
}
# ifdef DIVUNION
return ( ( float4 ) count ) / ( ( float4 ) ( len1 + len2 - count ) ) ;
# else
return ( ( float4 ) count ) / ( ( float4 ) ( ( len1 > len2 ) ? len1 : len2 ) ) ;
# endif
/*
* If inexact then len2 is equal to count , because we don ' t know actual
* length of second string in inexact search and we can assume that count
* is a lower bound of len2 .
*/
return CALCSML ( count , len1 , inexact ? count : len2 ) ;
}
/*
* Returns whether trg2 contains all trigrams in trg1 .
* This relies on the trigram arrays being sorted .
@ -726,7 +1051,7 @@ similarity(PG_FUNCTION_ARGS)
trg1 = generate_trgm ( VARDATA ( in1 ) , VARSIZE ( in1 ) - VARHDRSZ ) ;
trg2 = generate_trgm ( VARDATA ( in2 ) , VARSIZE ( in2 ) - VARHDRSZ ) ;
res = cnt_sml ( trg1 , trg2 ) ;
res = cnt_sml ( trg1 , trg2 , false ) ;
pfree ( trg1 ) ;
pfree ( trg2 ) ;
@ -736,6 +1061,22 @@ similarity(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4 ( res ) ;
}
/*
 * word_similarity(text, text) returns float4
 *
 * Argument 0 is used as the search pattern and argument 1 as the text to
 * search in.  Returns the maximum word similarity computed by
 * calc_word_similarity() (check_only = false).
 */
Datum
word_similarity(PG_FUNCTION_ARGS)
{
	text	   *pattern = PG_GETARG_TEXT_PP(0);
	text	   *haystack = PG_GETARG_TEXT_PP(1);
	char	   *pattern_data = VARDATA_ANY(pattern);
	int			pattern_len = VARSIZE_ANY_EXHDR(pattern);
	char	   *haystack_data = VARDATA_ANY(haystack);
	int			haystack_len = VARSIZE_ANY_EXHDR(haystack);
	float4		sml;

	sml = calc_word_similarity(pattern_data, pattern_len,
							   haystack_data, haystack_len,
							   false);

	/* Release detoasted copies, if any were made */
	PG_FREE_IF_COPY(pattern, 0);
	PG_FREE_IF_COPY(haystack, 1);

	PG_RETURN_FLOAT4(sml);
}
Datum
similarity_dist ( PG_FUNCTION_ARGS )
{
@ -755,3 +1096,67 @@ similarity_op(PG_FUNCTION_ARGS)
PG_RETURN_BOOL ( res > = similarity_threshold ) ;
}
/*
 * word_similarity_op(text, text) returns bool
 *
 * Argument 0 is used as the search pattern and argument 1 as the text to
 * search in.  Returns true when their word similarity is at least
 * word_similarity_threshold.  Only the threshold test matters, so the
 * calculation runs with check_only = true and may stop early.
 */
Datum
word_similarity_op(PG_FUNCTION_ARGS)
{
	text	   *pattern = PG_GETARG_TEXT_PP(0);
	text	   *haystack = PG_GETARG_TEXT_PP(1);
	char	   *pattern_data = VARDATA_ANY(pattern);
	int			pattern_len = VARSIZE_ANY_EXHDR(pattern);
	char	   *haystack_data = VARDATA_ANY(haystack);
	int			haystack_len = VARSIZE_ANY_EXHDR(haystack);
	float4		sml;

	sml = calc_word_similarity(pattern_data, pattern_len,
							   haystack_data, haystack_len,
							   true);

	/* Release detoasted copies, if any were made */
	PG_FREE_IF_COPY(pattern, 0);
	PG_FREE_IF_COPY(haystack, 1);

	PG_RETURN_BOOL(sml >= word_similarity_threshold);
}
/*
 * word_similarity_commutator_op(text, text) returns bool
 *
 * Same test as word_similarity_op() with the argument roles swapped:
 * argument 1 is the search pattern and argument 0 is the text to search in.
 * Returns true when their word similarity is at least
 * word_similarity_threshold.
 */
Datum
word_similarity_commutator_op(PG_FUNCTION_ARGS)
{
	text	   *haystack = PG_GETARG_TEXT_PP(0);
	text	   *pattern = PG_GETARG_TEXT_PP(1);
	char	   *pattern_data = VARDATA_ANY(pattern);
	int			pattern_len = VARSIZE_ANY_EXHDR(pattern);
	char	   *haystack_data = VARDATA_ANY(haystack);
	int			haystack_len = VARSIZE_ANY_EXHDR(haystack);
	float4		sml;

	sml = calc_word_similarity(pattern_data, pattern_len,
							   haystack_data, haystack_len,
							   true);

	/* Release detoasted copies, if any were made */
	PG_FREE_IF_COPY(haystack, 0);
	PG_FREE_IF_COPY(pattern, 1);

	PG_RETURN_BOOL(sml >= word_similarity_threshold);
}
/*
 * word_similarity_dist_op(text, text) returns float4
 *
 * Argument 0 is used as the search pattern and argument 1 as the text to
 * search in.  Returns the distance, i.e. one minus the maximum word
 * similarity computed by calc_word_similarity() (check_only = false).
 */
Datum
word_similarity_dist_op(PG_FUNCTION_ARGS)
{
	text	   *pattern = PG_GETARG_TEXT_PP(0);
	text	   *haystack = PG_GETARG_TEXT_PP(1);
	char	   *pattern_data = VARDATA_ANY(pattern);
	int			pattern_len = VARSIZE_ANY_EXHDR(pattern);
	char	   *haystack_data = VARDATA_ANY(haystack);
	int			haystack_len = VARSIZE_ANY_EXHDR(haystack);
	float4		sml;

	sml = calc_word_similarity(pattern_data, pattern_len,
							   haystack_data, haystack_len,
							   false);

	/* Release detoasted copies, if any were made */
	PG_FREE_IF_COPY(pattern, 0);
	PG_FREE_IF_COPY(haystack, 1);

	PG_RETURN_FLOAT4(1.0 - sml);
}
/*
 * word_similarity_dist_commutator_op(text, text) returns float4
 *
 * Same distance as word_similarity_dist_op() with the argument roles
 * swapped: argument 1 is the search pattern and argument 0 is the text to
 * search in.  Returns one minus the maximum word similarity.
 */
Datum
word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
{
	text	   *haystack = PG_GETARG_TEXT_PP(0);
	text	   *pattern = PG_GETARG_TEXT_PP(1);
	char	   *pattern_data = VARDATA_ANY(pattern);
	int			pattern_len = VARSIZE_ANY_EXHDR(pattern);
	char	   *haystack_data = VARDATA_ANY(haystack);
	int			haystack_len = VARSIZE_ANY_EXHDR(haystack);
	float4		sml;

	sml = calc_word_similarity(pattern_data, pattern_len,
							   haystack_data, haystack_len,
							   false);

	/* Release detoasted copies, if any were made */
	PG_FREE_IF_COPY(haystack, 0);
	PG_FREE_IF_COPY(pattern, 1);

	PG_RETURN_FLOAT4(1.0 - sml);
}