@ -133,11 +133,10 @@ static double ineq_histogram_selectivity(VariableStatData *vardata,
FmgrInfo * opproc , bool isgt ,
Datum constval , Oid consttype ) ;
static double eqjoinsel_inner ( Oid operator ,
VariableStatData * vardata1 , VariableStatData * vardata2 ,
RelOptInfo * rel1 , RelOptInfo * rel2 ) ;
VariableStatData * vardata1 , VariableStatData * vardata2 ) ;
static double eqjoinsel_semi ( Oid operator ,
VariableStatData * vardata1 , VariableStatData * vardata2 ,
RelOptInfo * rel1 , RelOptInfo * rel2 ) ;
RelOptInfo * inner_rel ) ;
static bool convert_to_scalar ( Datum value , Oid valuetypid , double * scaledvalue ,
Datum lobound , Datum hibound , Oid boundstypid ,
double * scaledlobound , double * scaledhibound ) ;
@ -1929,47 +1928,35 @@ eqjoinsel(PG_FUNCTION_ARGS)
VariableStatData vardata1 ;
VariableStatData vardata2 ;
bool join_is_reversed ;
RelOptInfo * rel1 ;
RelOptInfo * rel2 ;
RelOptInfo * inner_rel ;
get_join_variables ( root , args , sjinfo ,
& vardata1 , & vardata2 , & join_is_reversed ) ;
/*
* Identify the join ' s direct input relations . We use the min lefthand
* and min righthand as the inputs , even though the join might actually
* get done with larger input relations . The min inputs are guaranteed to
* have been formed by now , though , and always using them ensures
* consistency of estimates .
*/
if ( ! join_is_reversed )
{
rel1 = find_join_input_rel ( root , sjinfo - > min_lefthand ) ;
rel2 = find_join_input_rel ( root , sjinfo - > min_righthand ) ;
}
else
{
rel1 = find_join_input_rel ( root , sjinfo - > min_righthand ) ;
rel2 = find_join_input_rel ( root , sjinfo - > min_lefthand ) ;
}
switch ( sjinfo - > jointype )
{
case JOIN_INNER :
case JOIN_LEFT :
case JOIN_FULL :
selec = eqjoinsel_inner ( operator , & vardata1 , & vardata2 ,
rel1 , rel2 ) ;
selec = eqjoinsel_inner ( operator , & vardata1 , & vardata2 ) ;
break ;
case JOIN_SEMI :
case JOIN_ANTI :
/*
* Look up the join ' s inner relation . min_righthand is sufficient
* information because neither SEMI nor ANTI joins permit any
* reassociation into or out of their RHS , so the righthand will
* always be exactly that set of rels .
*/
inner_rel = find_join_input_rel ( root , sjinfo - > min_righthand ) ;
if ( ! join_is_reversed )
selec = eqjoinsel_semi ( operator , & vardata1 , & vardata2 ,
rel1 , rel2 ) ;
inner_rel ) ;
else
selec = eqjoinsel_semi ( get_commutator ( operator ) ,
& vardata2 , & vardata1 ,
rel2 , rel1 ) ;
inner_rel ) ;
break ;
default :
/* other values not expected here */
@ -1995,8 +1982,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
*/
static double
eqjoinsel_inner ( Oid operator ,
VariableStatData * vardata1 , VariableStatData * vardata2 ,
RelOptInfo * rel1 , RelOptInfo * rel2 )
VariableStatData * vardata1 , VariableStatData * vardata2 )
{
double selec ;
double nd1 ;
@ -2188,26 +2174,10 @@ eqjoinsel_inner(Oid operator,
* XXX Can we be smarter if we have an MCV list for just one side ? It
* seems that if we assume equal distribution for the other side , we
* end up with the same answer anyway .
*
* An additional hack we use here is to clamp the nd1 and nd2 values
* to not more than what we are estimating the input relation sizes to
* be , providing a crude correction for the selectivity of restriction
* clauses on those relations . ( We don ' t do that in the other path
* since there we are comparing the nd values to stats for the whole
* relations . ) We can apply this clamp both with respect to the base
* relations from which the join variables come , and to the immediate
* input relations of the current join .
*/
double nullfrac1 = stats1 ? stats1 - > stanullfrac : 0.0 ;
double nullfrac2 = stats2 ? stats2 - > stanullfrac : 0.0 ;
if ( vardata1 - > rel )
nd1 = Min ( nd1 , vardata1 - > rel - > rows ) ;
nd1 = Min ( nd1 , rel1 - > rows ) ;
if ( vardata2 - > rel )
nd2 = Min ( nd2 , vardata2 - > rel - > rows ) ;
nd2 = Min ( nd2 , rel2 - > rows ) ;
selec = ( 1.0 - nullfrac1 ) * ( 1.0 - nullfrac2 ) ;
if ( nd1 > nd2 )
selec / = nd1 ;
@ -2234,7 +2204,7 @@ eqjoinsel_inner(Oid operator,
static double
eqjoinsel_semi ( Oid operator ,
VariableStatData * vardata1 , VariableStatData * vardata2 ,
RelOptInfo * rel1 , RelOptInfo * rel2 )
RelOptInfo * inner_rel )
{
double selec ;
double nd1 ;
@ -2255,6 +2225,25 @@ eqjoinsel_semi(Oid operator,
nd1 = get_variable_numdistinct ( vardata1 ) ;
nd2 = get_variable_numdistinct ( vardata2 ) ;
/*
* We clamp nd2 to be not more than what we estimate the inner relation ' s
* size to be . This is intuitively somewhat reasonable since obviously
* there can ' t be more than that many distinct values coming from the
* inner rel . The reason for the asymmetry ( ie , that we don ' t clamp nd1
* likewise ) is that this is the only pathway by which restriction clauses
* applied to the inner rel will affect the join result size estimate ,
* since set_joinrel_size_estimates will multiply SEMI / ANTI selectivity by
* only the outer rel ' s size . If we clamped nd1 we ' d be double - counting
* the selectivity of outer - rel restrictions .
*
* We can apply this clamping both with respect to the base relation from
* which the join variable comes ( if there is just one ) , and to the
* immediate inner input relation of the current join .
*/
if ( vardata2 - > rel )
nd2 = Min ( nd2 , vardata2 - > rel - > rows ) ;
nd2 = Min ( nd2 , inner_rel - > rows ) ;
if ( HeapTupleIsValid ( vardata1 - > statsTuple ) )
{
stats1 = ( Form_pg_statistic ) GETSTRUCT ( vardata1 - > statsTuple ) ;
@ -2297,11 +2286,21 @@ eqjoinsel_semi(Oid operator,
uncertainfrac ,
uncertain ;
int i ,
nmatches ;
nmatches ,
clamped_nvalues2 ;
/*
* The clamping above could have resulted in nd2 being less than
* nvalues2 ; in which case , we assume that precisely the nd2 most
* common values in the relation will appear in the join input , and so
* compare to only the first nd2 members of the MCV list . Of course
* this is frequently wrong , but it ' s the best bet we can make .
*/
clamped_nvalues2 = Min ( nvalues2 , nd2 ) ;
fmgr_info ( get_opcode ( operator ) , & eqproc ) ;
hasmatch1 = ( bool * ) palloc0 ( nvalues1 * sizeof ( bool ) ) ;
hasmatch2 = ( bool * ) palloc0 ( nvalues2 * sizeof ( bool ) ) ;
hasmatch2 = ( bool * ) palloc0 ( clamped_ nvalues2 * sizeof ( bool ) ) ;
/*
* Note we assume that each MCV will match at most one member of the
@ -2314,7 +2313,7 @@ eqjoinsel_semi(Oid operator,
{
int j ;
for ( j = 0 ; j < nvalues2 ; j + + )
for ( j = 0 ; j < clamped_ nvalues2; j + + )
{
if ( hasmatch2 [ j ] )
continue ;
@ -2358,7 +2357,7 @@ eqjoinsel_semi(Oid operator,
{
nd1 - = nmatches ;
nd2 - = nmatches ;
if ( nd1 < = nd2 | | nd2 < = 0 )
if ( nd1 < = nd2 | | nd2 < 0 )
uncertainfrac = 1.0 ;
else
uncertainfrac = nd2 / nd1 ;
@ -2379,14 +2378,7 @@ eqjoinsel_semi(Oid operator,
if ( nd1 ! = DEFAULT_NUM_DISTINCT & & nd2 ! = DEFAULT_NUM_DISTINCT )
{
if ( vardata1 - > rel )
nd1 = Min ( nd1 , vardata1 - > rel - > rows ) ;
nd1 = Min ( nd1 , rel1 - > rows ) ;
if ( vardata2 - > rel )
nd2 = Min ( nd2 , vardata2 - > rel - > rows ) ;
nd2 = Min ( nd2 , rel2 - > rows ) ;
if ( nd1 < = nd2 | | nd2 < = 0 )
if ( nd1 < = nd2 | | nd2 < 0 )
selec = 1.0 - nullfrac1 ;
else
selec = ( nd2 / nd1 ) * ( 1.0 - nullfrac1 ) ;