@ -1915,6 +1915,12 @@ prsd_end(PG_FUNCTION_ARGS)
PG_RETURN_VOID ( ) ;
}
/*
* ts_headline support begins here
*/
/* token type classification macros */
# define LEAVETOKEN(x) ( (x)==SPACE )
# define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
# define ENDPUNCTOKEN(x) ( (x)==SPACE )
@ -1926,23 +1932,54 @@ prsd_end(PG_FUNCTION_ARGS)
# define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
# define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
/*
* Macros useful in headline selection . These rely on availability of
* " HeadlineParsedText *prs " describing some text , and " int shortword "
* describing the " short word " length parameter .
*/
/* Interesting words are non-repeated search terms */
# define INTERESTINGWORD(j) \
( prs - > words [ j ] . item & & ! prs - > words [ j ] . repeated )
/* Don't want to end at a non-word or a short word */
# define BADENDPOINT(j) \
( NOENDTOKEN ( prs - > words [ j ] . type ) | | prs - > words [ j ] . len < = shortword )
typedef struct
{
/* one cover (well, really one fragment) for mark_hl_fragments */
int32 startpos ; /* fragment's starting word index */
int32 endpos ; /* ending word index (inclusive) */
int32 poslen ; /* number of interesting words */
int32 curlen ; /* total number of words */
bool chosen ; /* chosen? */
bool excluded ; /* excluded? */
} CoverPos ;
typedef struct
{
/* callback data for checkcondition_HL */
HeadlineWordEntry * words ;
int len ;
} hlCheck ;
/*
* TS_execute callback for matching a tsquery operand to headline words
*/
static bool
checkcondition_HL ( void * opaque , QueryOperand * val , ExecPhraseData * data )
{
int i ;
hlCheck * checkval = ( hlCheck * ) opaque ;
int i ;
/* scan words array for marching items */
for ( i = 0 ; i < checkval - > len ; i + + )
{
if ( checkval - > words [ i ] . item = = val )
{
/* don't need to find all positions */
/* if data == NULL, don't need to report positions */
if ( ! data )
return true ;
@ -2038,8 +2075,14 @@ hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
return false ;
}
/*
* Apply suitable highlight marking to words selected by headline selector
*
* The words from startpos to endpos inclusive are marked per highlightall
*/
static void
mark_fragment ( HeadlineParsedText * prs , int highlight , int startpos , int endpos )
mark_fragment ( HeadlineParsedText * prs , bool highlightall ,
int startpos , int endpos )
{
int i ;
@ -2047,7 +2090,7 @@ mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
{
if ( prs - > words [ i ] . item )
prs - > words [ i ] . selected = 1 ;
if ( highlight = = 0 )
if ( ! highlightall )
{
if ( HLIDREPLACE ( prs - > words [ i ] . type ) )
prs - > words [ i ] . replace = 1 ;
@ -2064,16 +2107,15 @@ mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
}
}
typedef struct
{
int32 startpos ;
int32 endpos ;
int32 poslen ;
int32 curlen ;
int16 in ;
int16 excluded ;
} CoverPos ;
/*
* split a cover substring into fragments not longer than max_words
*
* At entry , * startpos and * endpos are the ( remaining ) bounds of the cover
* substring . They are updated to hold the bounds of the next fragment .
*
* * curlen and * poslen are set to the fragment ' s length , in words and
* interesting words respectively .
*/
static void
get_next_fragment ( HeadlineParsedText * prs , int * startpos , int * endpos ,
int * curlen , int * poslen , int max_words )
@ -2081,17 +2123,17 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
int i ;
/*
* Objective : Generate a fragment of words between startpos and endpos
* such that it has at most max_words and both ends has query words . If
* the startpos and endpos are the endpoints of the cover and the cover
* has fewer words than max_words , then this function should just return
* the cover
* Objective : select a fragment of words between startpos and endpos such
* that it has at most max_words and both ends have query words . If the
* startpos and endpos are the endpoints of the cover and the cover has
* fewer words than max_words , then this function should just return the
* cover
*/
/* first move startpos to an item */
for ( i = * startpos ; i < = * endpos ; i + + )
{
* startpos = i ;
if ( prs - > words [ i ] . item & & ! prs - > words [ i ] . repeated )
if ( INTERESTINGWORD ( i ) )
break ;
}
/* cut endpos to have only max_words */
@ -2101,7 +2143,7 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
{
if ( ! NONWORDTOKEN ( prs - > words [ i ] . type ) )
* curlen + = 1 ;
if ( prs - > words [ i ] . item & & ! prs - > words [ i ] . repeated )
if ( INTERESTINGWORD ( i ) )
* poslen + = 1 ;
}
/* if the cover was cut then move back endpos to a query item */
@ -2111,7 +2153,7 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
for ( i = * endpos ; i > = * startpos ; i - - )
{
* endpos = i ;
if ( prs - > words [ i ] . item & & ! prs - > words [ i ] . repeated )
if ( INTERESTINGWORD ( i ) )
break ;
if ( ! NONWORDTOKEN ( prs - > words [ i ] . type ) )
* curlen - = 1 ;
@ -2119,8 +2161,14 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
}
}
/*
* Headline selector used when MaxFragments > 0
*
* Note : in this mode , highlightall is disregarded for phrase selection ;
* it only controls presentation details .
*/
static void
mark_hl_fragments ( HeadlineParsedText * prs , TSQuery query , int highlight ,
mark_hl_fragments ( HeadlineParsedText * prs , TSQuery query , bool highlightall ,
int shortword , int min_words ,
int max_words , int max_fragments )
{
@ -2156,7 +2204,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
/*
* Break the cover into smaller fragments such that each fragment has
* at most max_words . Also ensure that each end of the fragment is a
* at most max_words . Also ensure that each end of each fragment is a
* query word . This will allow us to stretch the fragment in either
* direction
*/
@ -2173,12 +2221,13 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
covers [ numcovers ] . endpos = endpos ;
covers [ numcovers ] . curlen = curlen ;
covers [ numcovers ] . poslen = poslen ;
covers [ numcovers ] . in = 0 ;
covers [ numcovers ] . excluded = 0 ;
covers [ numcovers ] . chosen = false ;
covers [ numcovers ] . excluded = false ;
numcovers + + ;
startpos = endpos + 1 ;
endpos = q ;
}
/* move p to generate the next cover */
p + + ;
}
@ -2196,9 +2245,10 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
*/
for ( i = 0 ; i < numcovers ; i + + )
{
if ( ! covers [ i ] . in & & ! covers [ i ] . excluded & &
( maxitems < covers [ i ] . poslen | | ( maxitems = = covers [ i ] . poslen
& & minwords > covers [ i ] . curlen ) ) )
if ( ! covers [ i ] . chosen & & ! covers [ i ] . excluded & &
( maxitems < covers [ i ] . poslen | |
( maxitems = = covers [ i ] . poslen & &
minwords > covers [ i ] . curlen ) ) )
{
maxitems = covers [ i ] . poslen ;
minwords = covers [ i ] . curlen ;
@ -2208,7 +2258,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
/* if a cover was found mark it */
if ( minI > = 0 )
{
covers [ minI ] . in = 1 ;
covers [ minI ] . chosen = true ;
/* adjust the size of cover */
startpos = covers [ minI ] . startpos ;
endpos = covers [ minI ] . endpos ;
@ -2235,8 +2285,8 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
}
posmarker = i ;
}
/* cut back startpos till we find a non short token */
for ( i = posmarker ; i < startpos & & ( NOENDTOKEN ( prs - > words [ i ] . type ) | | prs - > words [ i ] . len < = shortword ) ; i + + )
/* cut back startpos till we find a good endpoint */
for ( i = posmarker ; i < startpos & & BADENDPOINT ( i ) ; i + + )
{
if ( ! NONWORDTOKEN ( prs - > words [ i ] . type ) )
curlen - - ;
@ -2250,8 +2300,8 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
curlen + + ;
posmarker = i ;
}
/* cut back endpos till we find a non-short token */
for ( i = posmarker ; i > endpos & & ( NOENDTOKEN ( prs - > words [ i ] . type ) | | prs - > words [ i ] . len < = shortword ) ; i - - )
/* cut back endpos till we find a good endpoint */
for ( i = posmarker ; i > endpos & & BADENDPOINT ( i ) ; i - - )
{
if ( ! NONWORDTOKEN ( prs - > words [ i ] . type ) )
curlen - - ;
@ -2262,20 +2312,24 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
covers [ minI ] . endpos = endpos ;
covers [ minI ] . curlen = curlen ;
/* Mark the chosen fragments (covers) */
mark_fragment ( prs , highlight , startpos , endpos ) ;
mark_fragment ( prs , highlightall , startpos , endpos ) ;
num_f + + ;
/* exclude overlapping covers */
for ( i = 0 ; i < numcovers ; i + + )
{
if ( i ! = minI & & ( ( covers [ i ] . startpos > = covers [ minI ] . startpos & & covers [ i ] . startpos < = covers [ minI ] . endpos ) | | ( covers [ i ] . endpos > = covers [ minI ] . startpos & & covers [ i ] . endpos < = covers [ minI ] . endpos ) ) )
covers [ i ] . excluded = 1 ;
if ( i ! = minI & &
( ( covers [ i ] . startpos > = covers [ minI ] . startpos & &
covers [ i ] . startpos < = covers [ minI ] . endpos ) | |
( covers [ i ] . endpos > = covers [ minI ] . startpos & &
covers [ i ] . endpos < = covers [ minI ] . endpos ) ) )
covers [ i ] . excluded = true ;
}
}
else
break ;
}
/* show at least min_words we have not marked anything */
/* show at least min_words if we have not marked anything */
if ( num_f < = 0 )
{
startpos = endpos = curlen = 0 ;
@ -2285,13 +2339,17 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
curlen + + ;
endpos = i ;
}
mark_fragment ( prs , highlight , startpos , endpos ) ;
mark_fragment ( prs , highlightall , startpos , endpos ) ;
}
pfree ( covers ) ;
}
/*
* Headline selector used when MaxFragments = = 0
*/
static void
mark_hl_words ( HeadlineParsedText * prs , TSQuery query , int highlight ,
mark_hl_words ( HeadlineParsedText * prs , TSQuery query , bool highlightall ,
int shortword , int min_words , int max_words )
{
int p = 0 ,
@ -2299,66 +2357,81 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
int bestb = - 1 ,
beste = - 1 ;
int bestlen = - 1 ;
int pose = 0 ,
int pose ,
posb ,
poslen ,
curlen ;
int i ;
if ( highlight = = 0 )
if ( ! highlightall )
{
/* examine all covers, select a headline using the best one */
while ( hlCover ( prs , query , & p , & q ) )
{
/* find cover len in words */
/*
* Count words ( curlen ) and interesting words ( poslen ) within
* cover , but stop once we reach max_words . This step doesn ' t
* consider whether that ' s a good stopping point . posb and pose
* are set to the start and end indexes of the possible headline .
*/
curlen = 0 ;
poslen = 0 ;
posb = pose = p ;
for ( i = p ; i < = q & & curlen < max_words ; i + + )
{
if ( ! NONWORDTOKEN ( prs - > words [ i ] . type ) )
curlen + + ;
if ( prs - > words [ i ] . item & & ! prs - > words [ i ] . repeated )
if ( INTERESTINGWORD ( i ) )
poslen + + ;
pose = i ;
}
if ( poslen < bestlen & & ! ( NOENDTOKEN ( prs - > words [ beste ] . type ) | | prs - > words [ beste ] . len < = shortword ) )
/* XXX this optimization seems unnecessary and wrong */
if ( poslen < bestlen & & ! BADENDPOINT ( beste ) )
{
/* best already found, so try one more cover */
/* better cover already found, so try next cover */
p + + ;
continue ;
}
posb = p ;
if ( curlen < max_words )
{ /* find good end */
{
/*
* We have room to lengthen the headline , so search forward
* until it ' s full or we find a good stopping point . We ' ll
* reconsider the word at " q " , then move forward .
*/
for ( i = i - 1 ; i < prs - > curwords & & curlen < max_words ; i + + )
{
if ( i ! = q )
if ( i > q )
{
if ( ! NONWORDTOKEN ( prs - > words [ i ] . type ) )
curlen + + ;
if ( prs - > words [ i ] . item & & ! prs - > words [ i ] . repeated )
if ( INTERESTINGWORD ( i ) )
poslen + + ;
}
pose = i ;
if ( NOENDTOKEN ( prs - > words [ i ] . type ) | | prs - > words [ i ] . len < = shortword )
if ( BADENDPOINT ( i ) )
continue ;
if ( curlen > = min_words )
break ;
}
if ( curlen < min_words & & i > = prs - > curwords )
{ /* got end of text and our cover is shorter
* than min_words */
if ( curlen < min_words )
{
/*
* Reached end of text and our headline is still shorter
* than min_words , so try to extend it to the left .
*/
for ( i = p - 1 ; i > = 0 ; i - - )
{
if ( ! NONWORDTOKEN ( prs - > words [ i ] . type ) )
curlen + + ;
if ( prs - > words [ i ] . item & & ! prs - > words [ i ] . repeated )
if ( INTERESTINGWORD ( i ) )
poslen + + ;
if ( curlen > = max_words )
break ;
if ( NOENDTOKEN ( prs - > words [ i ] . type ) | | prs - > words [ i ] . len < = shortword )
if ( BADENDPOINT ( i ) )
continue ;
if ( curlen > = min_words )
break ;
@ -2367,34 +2440,48 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
}
}
else
{ /* shorter cover :((( */
{
/*
* Can ' t make headline longer , so consider making it shorter
* if needed to avoid a bad endpoint .
*/
if ( i > q )
i = q ;
for ( ; curlen > min_words ; i - - )
{
if ( ! NONWORDTOKEN ( prs - > words [ i ] . type ) )
curlen - - ;
if ( prs - > words [ i ] . item & & ! prs - > words [ i ] . repeated )
if ( INTERESTINGWORD ( i ) )
poslen - - ;
pose = i ;
if ( NOENDTOKEN ( prs - > words [ i ] . type ) | | prs - > words [ i ] . len < = shortword )
continue ;
break ;
if ( ! BADENDPOINT ( i ) )
break ;
}
}
if ( bestlen < 0 | | ( poslen > bestlen & & ! ( NOENDTOKEN ( prs - > words [ pose ] . type ) | | prs - > words [ pose ] . len < = shortword ) ) | |
( bestlen > = 0 & & ! ( NOENDTOKEN ( prs - > words [ pose ] . type ) | | prs - > words [ pose ] . len < = shortword ) & &
( NOENDTOKEN ( prs - > words [ beste ] . type ) | | prs - > words [ beste ] . len < = shortword ) ) )
/*
* Adopt this headline if it ' s the first , or if it has more
* interesting words and isn ' t ending at a bad endpoint , or if it
* replaces a bad endpoint with a good one ( XXX even if it has
* fewer interesting words ? Really ? )
*/
if ( bestlen < 0 | |
( poslen > bestlen & & ! BADENDPOINT ( pose ) ) | |
( ! BADENDPOINT ( pose ) & & BADENDPOINT ( beste ) ) )
{
bestb = posb ;
beste = pose ;
bestlen = poslen ;
}
/* move p to generate the next cover */
p + + ;
}
/*
* If we found nothing acceptable , select min_words words starting at
* the beginning .
*/
if ( bestlen < 0 )
{
curlen = 0 ;
@ -2410,32 +2497,17 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
}
else
{
/* highlightall mode: headline is whole document */
bestb = 0 ;
beste = prs - > curwords - 1 ;
}
for ( i = bestb ; i < = beste ; i + + )
{
if ( prs - > words [ i ] . item )
prs - > words [ i ] . selected = 1 ;
if ( highlight = = 0 )
{
if ( HLIDREPLACE ( prs - > words [ i ] . type ) )
prs - > words [ i ] . replace = 1 ;
else if ( HLIDSKIP ( prs - > words [ i ] . type ) )
prs - > words [ i ] . skip = 1 ;
}
else
{
if ( XMLHLIDSKIP ( prs - > words [ i ] . type ) )
prs - > words [ i ] . skip = 1 ;
}
prs - > words [ i ] . in = ( prs - > words [ i ] . repeated ) ? 0 : 1 ;
}
mark_fragment ( prs , highlightall , bestb , beste ) ;
}
/*
* Default parser ' s prsheadline function
*/
Datum
prsd_headline ( PG_FUNCTION_ARGS )
{
@ -2443,17 +2515,18 @@ prsd_headline(PG_FUNCTION_ARGS)
List * prsoptions = ( List * ) PG_GETARG_POINTER ( 1 ) ;
TSQuery query = PG_GETARG_TSQUERY ( 2 ) ;
/* from opt + start and end tag */
/* default option values: */
int min_words = 15 ;
int max_words = 35 ;
int shortword = 3 ;
int max_fragments = 0 ;
int highlight = 0 ;
bool highlightall = false ;
ListCell * l ;
/* config */
/* Extract configuration option values */
prs - > startsel = NULL ;
prs - > stopsel = NULL ;
prs - > fragdelim = NULL ;
foreach ( l , prsoptions )
{
DefElem * defel = ( DefElem * ) lfirst ( l ) ;
@ -2474,12 +2547,12 @@ prsd_headline(PG_FUNCTION_ARGS)
else if ( pg_strcasecmp ( defel - > defname , " FragmentDelimiter " ) = = 0 )
prs - > fragdelim = pstrdup ( val ) ;
else if ( pg_strcasecmp ( defel - > defname , " HighlightAll " ) = = 0 )
highlight = ( pg_strcasecmp ( val , " 1 " ) = = 0 | |
pg_strcasecmp ( val , " on " ) = = 0 | |
pg_strcasecmp ( val , " true " ) = = 0 | |
pg_strcasecmp ( val , " t " ) = = 0 | |
pg_strcasecmp ( val , " y " ) = = 0 | |
pg_strcasecmp ( val , " yes " ) = = 0 ) ;
highlightall = ( pg_strcasecmp ( val , " 1 " ) = = 0 | |
pg_strcasecmp ( val , " on " ) = = 0 | |
pg_strcasecmp ( val , " true " ) = = 0 | |
pg_strcasecmp ( val , " t " ) = = 0 | |
pg_strcasecmp ( val , " y " ) = = 0 | |
pg_strcasecmp ( val , " yes " ) = = 0 ) ;
else
ereport ( ERROR ,
( errcode ( ERRCODE_INVALID_PARAMETER_VALUE ) ,
@ -2487,7 +2560,8 @@ prsd_headline(PG_FUNCTION_ARGS)
defel - > defname ) ) ) ;
}
if ( highlight = = 0 )
/* in HighlightAll mode these parameters are ignored */
if ( ! highlightall )
{
if ( min_words > = max_words )
ereport ( ERROR ,
@ -2507,18 +2581,23 @@ prsd_headline(PG_FUNCTION_ARGS)
errmsg ( " MaxFragments should be >= 0 " ) ) ) ;
}
/* Apply appropriate headline selector */
if ( max_fragments = = 0 )
/* call the default headline generator */
mark_hl_words ( prs , query , highlight , shortword , min_words , max_words ) ;
mark_hl_words ( prs , query , highlightall , shortword ,
min_words , max_words ) ;
else
mark_hl_fragments ( prs , query , highlight , shortword , min_words , max_words , max_fragments ) ;
mark_hl_fragments ( prs , query , highlightall , shortword ,
min_words , max_words , max_fragments ) ;
/* Fill in default values for string options */
if ( ! prs - > startsel )
prs - > startsel = pstrdup ( " <b> " ) ;
if ( ! prs - > stopsel )
prs - > stopsel = pstrdup ( " </b> " ) ;
if ( ! prs - > fragdelim )
prs - > fragdelim = pstrdup ( " ... " ) ;
/* Caller will need these lengths, too */
prs - > startsellen = strlen ( prs - > startsel ) ;
prs - > stopsellen = strlen ( prs - > stopsel ) ;
prs - > fragdelimlen = strlen ( prs - > fragdelim ) ;