@ -122,9 +122,23 @@
* thousands of trigrams would be slow , and would likely produce so many
* false positives that we would have to traverse a large fraction of the
* index , the graph is simplified further in a lossy fashion by removing
* color trigrams until the number of trigrams after expansion is below
* the MAX_TRGM_COUNT threshold . When a color trigram is removed , the states
* connected by any arcs labelled with that trigram are merged .
* color trigrams . When a color trigram is removed , the states connected by
* any arcs labelled with that trigram are merged .
*
* Trigrams do not all have equivalent value for searching: some of them are
* more frequent and some of them are less frequent.  Ideally, we would like
* to know the distribution of trigrams, but we don't.  But because of padding
* we know for sure that the blank (padding) character is more frequent than
* others, so we can penalize trigrams according to presence of whitespace.
* The penalty assigned to each color trigram is the number of simple trigrams
* it would produce, times the penalties[] multiplier associated with its
* whitespace content.  (The penalties[] constants were calculated by analysis
* of some real-life text.)  We eliminate color trigrams starting with the
* highest-penalty one, until we get to a total penalty of no more than
* WISH_TRGM_PENALTY.  However, we cannot remove a color trigram if that would
* lead to merging the initial and final states, so we may not be able to
* reach WISH_TRGM_PENALTY.  It's still okay so long as we have no more than
* MAX_TRGM_COUNT simple trigrams in total, otherwise we fail.
*
* 4 ) Pack the graph into a compact representation
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@ -199,13 +213,30 @@
* MAX_EXPANDED_STATES - How many states we allow in expanded graph
* MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
* MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
* WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
* COLOR_COUNT_LIMIT - Maximum number of characters per color
*/
/* Upper bound on the number of states allowed in the expanded graph. */
# define MAX_EXPANDED_STATES 128
/* Upper bound on the number of arcs allowed in the expanded graph. */
# define MAX_EXPANDED_ARCS 1024
/*
 * Hard limit on the total number of simple trigrams: processing fails if
 * the graph cannot be reduced to this many or fewer.
 */
# define MAX_TRGM_COUNT 256
/*
 * Soft target for the sum of color trigram penalties.  Color trigrams are
 * removed while the total exceeds this value, but failing to reach it is
 * acceptable as long as MAX_TRGM_COUNT is respected.
 */
# define WISH_TRGM_PENALTY 16
/*
 * Maximum number of characters per color.  Must stay below the cube root of
 * INT_MAX (about 1290) so that per-color-trigram counts fit in an int.
 */
# define COLOR_COUNT_LIMIT 256
/*
 * Penalty multipliers for trigram counts depending on whitespace contents.
 * Numbers based on analysis of real-life texts.
 *
 * The array index is a 3-bit mask with one bit per trigram position, set
 * when that position holds a blank: 4 = first character, 2 = second
 * character, 1 = third character.  In the labels below "a" stands for any
 * non-blank character.  Combinations marked "impossible" carry a zero
 * multiplier.
 *
 * The table is private to this file, hence "static".
 */
static const float4 penalties[8] = {
	1.0,						/* "aaa" */
	3.5,						/* "aa " */
	0.0,						/* "a a" (impossible) */
	0.0,						/* "a  " (impossible) */
	4.2,						/* " aa" */
	2.1,						/* " a " */
	25.0,						/* "  a" */
	0.0							/* "   " (impossible) */
};
/* Struct representing a single pg_wchar, converted back to multibyte form */
typedef struct
{
@ -339,6 +370,7 @@ typedef struct
ColorTrgm ctrgm ;
int number ;
int count ;
float4 penalty ;
bool expanded ;
List * arcs ;
} ColorTrgmInfo ;
@ -459,7 +491,7 @@ static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext);
static void fillTrgm ( trgm * ptrgm , trgm_mb_char s [ 3 ] ) ;
static void mergeStates ( TrgmState * state1 , TrgmState * state2 ) ;
static int colorTrgmInfoCmp ( const void * p1 , const void * p2 ) ;
static int colorTrgmInfoCount Cmp ( const void * p1 , const void * p2 ) ;
static int colorTrgmInfoPenalty Cmp ( const void * p1 , const void * p2 ) ;
static TrgmPackedGraph * packGraph ( TrgmNFA * trgmNFA , MemoryContext rcontext ) ;
static int packArcInfoCmp ( const void * a1 , const void * a2 ) ;
@ -1424,6 +1456,7 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
TrgmState * state ;
ColorTrgmInfo * colorTrgms ;
int64 totalTrgmCount ;
float4 totalTrgmPenalty ;
int number ;
/* Collect color trigrams from all arcs */
@ -1482,53 +1515,67 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
}
/*
* Count number of simple trigrams generated by each color trigram .
* Count number of simple trigrams generated by each color trigram , and
* also compute a penalty value , which is the number of simple trigrams
* times a multiplier that depends on its whitespace content .
*
* Note : per - color - trigram counts cannot overflow an int so long as
* COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX , ie about
* 1290. However , the grand total totalTrgmCount might conceivably
* overflow an int , so we use int64 for that within this routine .
* overflow an int , so we use int64 for that within this routine . Also ,
* penalties are calculated in float4 arithmetic to avoid any overflow
* worries .
*/
totalTrgmCount = 0 ;
totalTrgmPenalty = 0.0f ;
for ( i = 0 ; i < trgmNFA - > colorTrgmsCount ; i + + )
{
ColorTrgmInfo * trgmInfo = & colorTrgms [ i ] ;
int j ,
count = 1 ;
count = 1 ,
typeIndex = 0 ;
for ( j = 0 ; j < 3 ; j + + )
{
TrgmColor c = trgmInfo - > ctrgm . colors [ j ] ;
if ( c ! = COLOR_BLANK )
typeIndex * = 2 ;
if ( c = = COLOR_BLANK )
typeIndex + + ;
else
count * = trgmNFA - > colorInfo [ c ] . wordCharsCount ;
}
trgmInfo - > count = count ;
totalTrgmCount + = count ;
trgmInfo - > penalty = penalties [ typeIndex ] * ( float4 ) count ;
totalTrgmPenalty + = trgmInfo - > penalty ;
}
/* Sort color trigrams in descending order of simple trigram counts */
/* Sort color trigrams in descending order of their penalties */
qsort ( colorTrgms , trgmNFA - > colorTrgmsCount , sizeof ( ColorTrgmInfo ) ,
colorTrgmInfoCount Cmp ) ;
colorTrgmInfoPenalty Cmp ) ;
/*
* Remove color trigrams from the graph so long as total number of simple
* trigrams exceeds MAX_TRGM_COUNT . We prefer to remove color trigrams
* with the most associated simple trigrams , since those are the most
* promising for reducing the total number of simple trigrams . When
* removing a color trigram we have to merge states connected by arcs
* labeled with that trigram . It ' s necessary to not merge initial and
* final states , because our graph becomes useless if that happens ; so we
* cannot always remove the trigram we ' d prefer to .
* Remove color trigrams from the graph so long as total penalty of color
* trigrams exceeds WISH_TRGM_PENALTY . ( If we fail to get down to
* WISH_TRGM_PENALTY , it ' s OK so long as total count is no more than
* MAX_TRGM_COUNT . ) We prefer to remove color trigrams with higher
* penalty , since those are the most promising for reducing the total
* penalty . When removing a color trigram we have to merge states
* connected by arcs labeled with that trigram . It ' s necessary to not
* merge initial and final states , because our graph becomes useless if
* that happens ; so we cannot always remove the trigram we ' d prefer to .
*/
for ( i = 0 ;
( i < trgmNFA - > colorTrgmsCount ) & & ( totalTrgmCount > MAX_TRGM_COUNT ) ;
i + + )
for ( i = 0 ; i < trgmNFA - > colorTrgmsCount ; i + + )
{
ColorTrgmInfo * trgmInfo = & colorTrgms [ i ] ;
bool canRemove = true ;
ListCell * cell ;
/* Done if we've reached the target */
if ( totalTrgmPenalty < = WISH_TRGM_PENALTY )
break ;
/*
* Does any arc of this color trigram connect initial and final
* states ? If so we can ' t remove it .
@ -1570,9 +1617,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
mergeStates ( source , target ) ;
}
/* Mark trigram unexpanded, and update totalTrgmCount */
/* Mark trigram unexpanded, and update totals */
trgmInfo - > expanded = false ;
totalTrgmCount - = trgmInfo - > count ;
totalTrgmPenalty - = trgmInfo - > penalty ;
}
/* Did we succeed in fitting into MAX_TRGM_COUNT? */
@ -1746,17 +1794,17 @@ colorTrgmInfoCmp(const void *p1, const void *p2)
/*
* Compare function for sorting color trigrams in descending order of
* their simple trigrams counts.
* their penalty fields.
*/
static int
colorTrgmInfoCount Cmp ( const void * p1 , const void * p2 )
colorTrgmInfoPenalty Cmp ( const void * p1 , const void * p2 )
{
const ColorTrgmInfo * c1 = ( const ColorTrgmInfo * ) p1 ;
const ColorTrgmInfo * c2 = ( const ColorTrgmInfo * ) p2 ;
float4 penalty1 = ( ( const ColorTrgmInfo * ) p1 ) - > penalty ;
float4 penalty2 = ( ( const ColorTrgmInfo * ) p2 ) - > penalty ;
if ( c1 - > count < c2 - > count )
if ( penalty1 < penalty2 )
return 1 ;
else if ( c1 - > count = = c2 - > count )
else if ( penalty1 = = penalty2 )
return 0 ;
else
return - 1 ;