@ -14,6 +14,7 @@
# include "postgres.h"
# include "access/htup_details.h"
# include "catalog/namespace.h"
# include "catalog/pg_type.h"
# include "commands/trigger.h"
@ -65,6 +66,7 @@ typedef struct
# define STATHDRSIZE (offsetof(TSVectorStat, data))
static Datum tsvector_update_trigger ( PG_FUNCTION_ARGS , bool config_column ) ;
static int tsvector_bsearch ( TSVector tsin , char * lexin , int lexin_len ) ;
/*
* Order : haspos , len , word , for all positions ( pos , weight )
@ -251,6 +253,90 @@ tsvector_setweight(PG_FUNCTION_ARGS)
PG_RETURN_POINTER ( out ) ;
}
/*
* setweight ( tsin tsvector , char_weight " char " , lexemes " text " [ ] )
*
* Assign weight w to elements of tsin that are listed in lexemes .
*/
Datum
tsvector_setweight_by_filter ( PG_FUNCTION_ARGS )
{
TSVector tsin = PG_GETARG_TSVECTOR ( 0 ) ;
char char_weight = PG_GETARG_CHAR ( 1 ) ;
ArrayType * lexemes = PG_GETARG_ARRAYTYPE_P ( 2 ) ;
TSVector tsout ;
int i ,
j ,
nlexemes ,
weight ;
WordEntry * entry ;
Datum * dlexemes ;
bool * nulls ;
switch ( char_weight )
{
case ' A ' : case ' a ' :
weight = 3 ;
break ;
case ' B ' : case ' b ' :
weight = 2 ;
break ;
case ' C ' : case ' c ' :
weight = 1 ;
break ;
case ' D ' : case ' d ' :
weight = 0 ;
break ;
default :
/* internal error */
elog ( ERROR , " unrecognized weight: %c " , char_weight ) ;
}
tsout = ( TSVector ) palloc ( VARSIZE ( tsin ) ) ;
memcpy ( tsout , tsin , VARSIZE ( tsin ) ) ;
entry = ARRPTR ( tsout ) ;
deconstruct_array ( lexemes , TEXTOID , - 1 , false , ' i ' ,
& dlexemes , & nulls , & nlexemes ) ;
/*
* Assuming that lexemes array is significantly shorter than tsvector
* we can iterate through lexemes performing binary search
* of each lexeme from lexemes in tsvector .
*/
for ( i = 0 ; i < nlexemes ; i + + )
{
char * lex ;
int lex_len ,
lex_pos ;
if ( nulls [ i ] )
ereport ( ERROR ,
( errcode ( ERRCODE_INVALID_PARAMETER_VALUE ) ,
errmsg ( " lexeme array may not contain nulls " ) ) ) ;
lex = VARDATA ( dlexemes [ i ] ) ;
lex_len = VARSIZE_ANY_EXHDR ( dlexemes [ i ] ) ;
lex_pos = tsvector_bsearch ( tsout , lex , lex_len ) ;
if ( lex_pos > = 0 & & ( j = POSDATALEN ( tsout , entry + lex_pos ) ) ! = 0 )
{
WordEntryPos * p = POSDATAPTR ( tsout , entry + lex_pos ) ;
while ( j - - )
{
WEP_SETWEIGHT ( * p , weight ) ;
p + + ;
}
}
}
PG_FREE_IF_COPY ( tsin , 0 ) ;
PG_FREE_IF_COPY ( lexemes , 2 ) ;
PG_RETURN_POINTER ( tsout ) ;
}
# define compareEntry(pa, a, pb, b) \
tsCompareString ( ( pa ) + ( a ) - > pos , ( a ) - > len , \
( pb ) + ( b ) - > pos , ( b ) - > len , \
@ -291,6 +377,483 @@ add_pos(TSVector src, WordEntry *srcptr,
return * clen - startlen ;
}
/*
* Perform binary search of given lexeme in TSVector .
* Returns lexeme position in TSVector ' s entry array or - 1 if lexeme wasn ' t
* found .
*/
static int
tsvector_bsearch ( const TSVector tsv , char * lexeme , int lexeme_len )
{
WordEntry * arrin = ARRPTR ( tsv ) ;
int StopLow = 0 ,
StopHigh = tsv - > size ,
StopMiddle ,
cmp ;
while ( StopLow < StopHigh )
{
StopMiddle = ( StopLow + StopHigh ) / 2 ;
cmp = tsCompareString ( lexeme , lexeme_len ,
STRPTR ( tsv ) + arrin [ StopMiddle ] . pos ,
arrin [ StopMiddle ] . len ,
false ) ;
if ( cmp < 0 )
StopHigh = StopMiddle ;
else if ( cmp > 0 )
StopLow = StopMiddle + 1 ;
else /* found it */
return StopMiddle ;
}
return - 1 ;
}
static int
compareint ( const void * va , const void * vb )
{
int32 a = * ( ( const int32 * ) va ) ;
int32 b = * ( ( const int32 * ) vb ) ;
if ( a = = b )
return 0 ;
return ( a > b ) ? 1 : - 1 ;
}
/*
* Internal routine to delete lexemes from TSVector by array of offsets .
*
* int * indices_to_delete - - array of lexeme offsets to delete
* int indices_count - - size of that array
*
* Returns new TSVector without given lexemes along with their positions
* and weights .
*/
static TSVector
tsvector_delete_by_indices ( TSVector tsv , int * indices_to_delete ,
int indices_count )
{
TSVector tsout ;
WordEntry * arrin = ARRPTR ( tsv ) ,
* arrout ;
char * data = STRPTR ( tsv ) ,
* dataout ;
int i , j , k ,
curoff ;
/*
* Here we overestimates tsout size , since we don ' t know exact size
* occupied by positions and weights . We will set exact size later
* after a pass through TSVector .
*/
tsout = ( TSVector ) palloc0 ( VARSIZE ( tsv ) ) ;
arrout = ARRPTR ( tsout ) ;
tsout - > size = tsv - > size - indices_count ;
/* Sort our filter array to simplify membership check later. */
if ( indices_count > 1 )
qsort ( indices_to_delete , indices_count , sizeof ( int ) , compareint ) ;
/*
* Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete .
*/
curoff = 0 ;
dataout = STRPTR ( tsout ) ;
for ( i = j = k = 0 ; i < tsv - > size ; i + + )
{
/*
* Here we should check whether current i is present in
* indices_to_delete or not . Since indices_to_delete is already
* sorted we can advance it index only when we have match .
*/
if ( k < indices_count & & i = = indices_to_delete [ k ] ) {
k + + ;
continue ;
}
/* Copy lexeme, it's positions and weights */
memcpy ( dataout + curoff , data + arrin [ i ] . pos , arrin [ i ] . len ) ;
arrout [ j ] . haspos = arrin [ i ] . haspos ;
arrout [ j ] . len = arrin [ i ] . len ;
arrout [ j ] . pos = curoff ;
curoff + = arrin [ i ] . len ;
if ( arrin [ i ] . haspos )
{
int len = POSDATALEN ( tsv , arrin + i ) * sizeof ( WordEntryPos ) +
sizeof ( uint16 ) ;
curoff = SHORTALIGN ( curoff ) ;
memcpy ( dataout + curoff ,
STRPTR ( tsv ) + SHORTALIGN ( arrin [ i ] . pos + arrin [ i ] . len ) ,
len ) ;
curoff + = len ;
}
j + + ;
}
/*
* After the pass through TSVector k should equals exactly to indices_count .
* If it isn ' t then the caller provided us with indices outside of
* [ 0 , tsv - > size ) range and estimation of tsout ' s size is wrong .
*/
Assert ( k = = indices_count ) ;
SET_VARSIZE ( tsout , CALCDATASIZE ( tsout - > size , curoff ) ) ;
return tsout ;
}
/*
* Delete given lexeme from tsvector .
* Implementation of user - level delete ( tsvector , text ) .
*/
Datum
tsvector_delete_str ( PG_FUNCTION_ARGS )
{
TSVector tsin = PG_GETARG_TSVECTOR ( 0 ) ,
tsout ;
text * tlexeme = PG_GETARG_TEXT_P ( 1 ) ;
char * lexeme = VARDATA ( tlexeme ) ;
int lexeme_len = VARSIZE_ANY_EXHDR ( tlexeme ) ,
skip_index ;
if ( ( skip_index = tsvector_bsearch ( tsin , lexeme , lexeme_len ) ) = = - 1 )
PG_RETURN_POINTER ( tsin ) ;
tsout = tsvector_delete_by_indices ( tsin , & skip_index , 1 ) ;
PG_FREE_IF_COPY ( tsin , 0 ) ;
PG_FREE_IF_COPY ( tlexeme , 1 ) ;
PG_RETURN_POINTER ( tsout ) ;
}
/*
* Delete given array of lexemes from tsvector .
* Implementation of user - level delete ( tsvector , text [ ] ) .
*/
Datum
tsvector_delete_arr ( PG_FUNCTION_ARGS )
{
TSVector tsin = PG_GETARG_TSVECTOR ( 0 ) ,
tsout ;
ArrayType * lexemes = PG_GETARG_ARRAYTYPE_P ( 1 ) ;
int i , nlex ,
skip_count ,
* skip_indices ;
Datum * dlexemes ;
bool * nulls ;
deconstruct_array ( lexemes , TEXTOID , - 1 , false , ' i ' ,
& dlexemes , & nulls , & nlex ) ;
/*
* In typical use case array of lexemes to delete is relatively small .
* So here we optimizing things for that scenario : iterate through lexarr
* performing binary search of each lexeme from lexarr in tsvector .
*/
skip_indices = palloc0 ( nlex * sizeof ( int ) ) ;
for ( i = skip_count = 0 ; i < nlex ; i + + )
{
char * lex ;
int lex_len ,
lex_pos ;
if ( nulls [ i ] )
ereport ( ERROR ,
( errcode ( ERRCODE_INVALID_PARAMETER_VALUE ) ,
errmsg ( " lexeme array may not contain nulls " ) ) ) ;
lex = VARDATA ( dlexemes [ i ] ) ;
lex_len = VARSIZE_ANY_EXHDR ( dlexemes [ i ] ) ;
lex_pos = tsvector_bsearch ( tsin , lex , lex_len ) ;
if ( lex_pos > = 0 )
skip_indices [ skip_count + + ] = lex_pos ;
}
tsout = tsvector_delete_by_indices ( tsin , skip_indices , skip_count ) ;
pfree ( skip_indices ) ;
PG_FREE_IF_COPY ( tsin , 0 ) ;
PG_FREE_IF_COPY ( lexemes , 1 ) ;
PG_RETURN_POINTER ( tsout ) ;
}
/*
* Expand tsvector as table with following columns :
* lexeme : lexeme text
* positions : integer array of lexeme positions
* weights : char array of weights corresponding to positions
*/
Datum
tsvector_unnest ( PG_FUNCTION_ARGS )
{
FuncCallContext * funcctx ;
TSVector tsin ;
if ( SRF_IS_FIRSTCALL ( ) )
{
MemoryContext oldcontext ;
TupleDesc tupdesc ;
funcctx = SRF_FIRSTCALL_INIT ( ) ;
oldcontext = MemoryContextSwitchTo ( funcctx - > multi_call_memory_ctx ) ;
tupdesc = CreateTemplateTupleDesc ( 3 , false ) ;
TupleDescInitEntry ( tupdesc , ( AttrNumber ) 1 , " lexeme " ,
TEXTOID , - 1 , 0 ) ;
TupleDescInitEntry ( tupdesc , ( AttrNumber ) 2 , " positions " ,
INT2ARRAYOID , - 1 , 0 ) ;
TupleDescInitEntry ( tupdesc , ( AttrNumber ) 3 , " weights " ,
TEXTARRAYOID , - 1 , 0 ) ;
funcctx - > tuple_desc = BlessTupleDesc ( tupdesc ) ;
funcctx - > user_fctx = PG_GETARG_TSVECTOR_COPY ( 0 ) ;
MemoryContextSwitchTo ( oldcontext ) ;
}
funcctx = SRF_PERCALL_SETUP ( ) ;
tsin = ( TSVector ) funcctx - > user_fctx ;
if ( funcctx - > call_cntr < tsin - > size )
{
WordEntry * arrin = ARRPTR ( tsin ) ;
char * data = STRPTR ( tsin ) ;
HeapTuple tuple ;
int j ,
i = funcctx - > call_cntr ;
bool nulls [ ] = { false , false , false } ;
Datum values [ 3 ] ;
values [ 0 ] = PointerGetDatum (
cstring_to_text_with_len ( data + arrin [ i ] . pos , arrin [ i ] . len )
) ;
if ( arrin [ i ] . haspos )
{
WordEntryPosVector * posv ;
Datum * positions ;
Datum * weights ;
char weight ;
/*
* Internally tsvector stores position and weight in the same
* uint16 ( 2 bits for weight , 14 for position ) . Here we extract that
* in two separate arrays .
*/
posv = _POSVECPTR ( tsin , arrin + i ) ;
positions = palloc ( posv - > npos * sizeof ( Datum ) ) ;
weights = palloc ( posv - > npos * sizeof ( Datum ) ) ;
for ( j = 0 ; j < posv - > npos ; j + + )
{
positions [ j ] = Int16GetDatum ( WEP_GETPOS ( posv - > pos [ j ] ) ) ;
weight = ' D ' - WEP_GETWEIGHT ( posv - > pos [ j ] ) ;
weights [ j ] = PointerGetDatum (
cstring_to_text_with_len ( & weight , 1 )
) ;
}
values [ 1 ] = PointerGetDatum (
construct_array ( positions , posv - > npos , INT2OID , 2 , true , ' s ' ) ) ;
values [ 2 ] = PointerGetDatum (
construct_array ( weights , posv - > npos , TEXTOID , - 1 , false , ' i ' ) ) ;
}
else
{
nulls [ 1 ] = nulls [ 2 ] = true ;
}
tuple = heap_form_tuple ( funcctx - > tuple_desc , values , nulls ) ;
SRF_RETURN_NEXT ( funcctx , HeapTupleGetDatum ( tuple ) ) ;
}
else
{
pfree ( tsin ) ;
SRF_RETURN_DONE ( funcctx ) ;
}
}
/*
* Convert tsvector to array of lexemes .
*/
Datum
tsvector_to_array ( PG_FUNCTION_ARGS )
{
TSVector tsin = PG_GETARG_TSVECTOR ( 0 ) ;
WordEntry * arrin = ARRPTR ( tsin ) ;
Datum elements [ tsin - > size ] ;
int i ;
ArrayType * array ;
for ( i = 0 ; i < tsin - > size ; i + + )
{
elements [ i ] = PointerGetDatum (
cstring_to_text_with_len ( STRPTR ( tsin ) + arrin [ i ] . pos , arrin [ i ] . len )
) ;
}
array = construct_array ( elements , tsin - > size , TEXTOID , - 1 , false , ' i ' ) ;
PG_FREE_IF_COPY ( tsin , 0 ) ;
PG_RETURN_POINTER ( array ) ;
}
/*
* Build tsvector from array of lexemes .
*/
Datum
array_to_tsvector ( PG_FUNCTION_ARGS )
{
ArrayType * v = PG_GETARG_ARRAYTYPE_P ( 0 ) ;
TSVector tsout ;
Datum * dlexemes ;
WordEntry * arrout ;
bool * nulls ;
int nitems ,
i ,
tslen ,
datalen = 0 ;
char * cur ;
deconstruct_array ( v , TEXTOID , - 1 , false , ' i ' , & dlexemes , & nulls , & nitems ) ;
for ( i = 0 ; i < nitems ; i + + )
{
if ( nulls [ i ] )
ereport ( ERROR ,
( errcode ( ERRCODE_INVALID_PARAMETER_VALUE ) ,
errmsg ( " lexeme array may not contain nulls " ) ) ) ;
datalen + = VARSIZE_ANY_EXHDR ( dlexemes [ i ] ) ;
}
tslen = CALCDATASIZE ( nitems , datalen ) ;
tsout = ( TSVector ) palloc0 ( tslen ) ;
SET_VARSIZE ( tsout , tslen ) ;
tsout - > size = nitems ;
arrout = ARRPTR ( tsout ) ;
cur = STRPTR ( tsout ) ;
for ( i = 0 ; i < nitems ; i + + )
{
char * lex = VARDATA ( dlexemes [ i ] ) ;
int lex_len = VARSIZE_ANY_EXHDR ( dlexemes [ i ] ) ;
memcpy ( cur , lex , lex_len ) ;
arrout [ i ] . haspos = 0 ;
arrout [ i ] . len = lex_len ;
arrout [ i ] . pos = cur - STRPTR ( tsout ) ;
cur + = lex_len ;
}
PG_FREE_IF_COPY ( v , 0 ) ;
PG_RETURN_POINTER ( tsout ) ;
}
/*
* Leave only elements with given weights from tsvector .
*/
Datum
tsvector_filter ( PG_FUNCTION_ARGS )
{
TSVector tsin = PG_GETARG_TSVECTOR ( 0 ) ,
tsout ;
ArrayType * weights = PG_GETARG_ARRAYTYPE_P ( 1 ) ;
WordEntry * arrin = ARRPTR ( tsin ) ,
* arrout ;
char * datain = STRPTR ( tsin ) ,
* dataout ;
Datum * dweights ;
bool * nulls ;
int nweigths ;
int i , j ;
char mask = 0 ,
cur_pos = 0 ;
deconstruct_array ( weights , CHAROID , 1 , true , ' c ' ,
& dweights , & nulls , & nweigths ) ;
for ( i = 0 ; i < nweigths ; i + + )
{
char char_weight ;
if ( nulls [ i ] )
ereport ( ERROR ,
( errcode ( ERRCODE_INVALID_PARAMETER_VALUE ) ,
errmsg ( " weight array may not contain nulls " ) ) ) ;
char_weight = DatumGetChar ( dweights [ i ] ) ;
switch ( char_weight )
{
case ' A ' : case ' a ' :
mask = mask | 8 ;
break ;
case ' B ' : case ' b ' :
mask = mask | 4 ;
break ;
case ' C ' : case ' c ' :
mask = mask | 2 ;
break ;
case ' D ' : case ' d ' :
mask = mask | 1 ;
break ;
default :
/* internal error */
elog ( ERROR , " unrecognized weight: %c " , char_weight ) ;
}
}
tsout = ( TSVector ) palloc0 ( VARSIZE ( tsin ) ) ;
tsout - > size = tsin - > size ;
arrout = ARRPTR ( tsout ) ;
dataout = STRPTR ( tsout ) ;
for ( i = j = 0 ; i < tsin - > size ; i + + )
{
WordEntryPosVector * posvin ,
* posvout ;
int npos = 0 ;
int k ;
if ( ! arrin [ i ] . haspos )
continue ;
posvin = _POSVECPTR ( tsin , arrin + i ) ;
posvout = ( WordEntryPosVector * )
( dataout + SHORTALIGN ( cur_pos + arrin [ i ] . len ) ) ;
for ( k = 0 ; k < posvin - > npos ; k + + )
{
if ( mask & ( 1 < < WEP_GETWEIGHT ( posvin - > pos [ k ] ) ) )
posvout - > pos [ npos + + ] = posvin - > pos [ k ] ;
}
if ( ! npos ) /* no satisfactory positions found, so skip that lexeme */
continue ;
arrout [ j ] . haspos = true ;
arrout [ j ] . len = arrin [ i ] . len ;
arrout [ j ] . pos = cur_pos ;
memcpy ( dataout + cur_pos , datain + arrin [ i ] . pos , arrin [ i ] . len ) ;
posvout - > npos = npos ;
cur_pos + = SHORTALIGN ( arrin [ i ] . len ) ;
cur_pos + = POSDATALEN ( tsout , arrout + j ) * sizeof ( WordEntryPos ) +
sizeof ( uint16 ) ;
j + + ;
}
tsout - > size = j ;
if ( dataout ! = STRPTR ( tsout ) )
memmove ( STRPTR ( tsout ) , dataout , cur_pos ) ;
SET_VARSIZE ( tsout , CALCDATASIZE ( tsout - > size , cur_pos ) ) ;
PG_FREE_IF_COPY ( tsin , 0 ) ;
PG_RETURN_POINTER ( tsout ) ;
}
Datum
tsvector_concat ( PG_FUNCTION_ARGS )