@@ -49,7 +49,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.124 2004/02/03 17:34:03 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.125 2004/02/17 00:52:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -102,8 +102,6 @@ bool enable_mergejoin = true;
 bool		enable_hashjoin = true;

-static Selectivity estimate_hash_bucketsize(Query *root, Var *var,
-						 int nbuckets);
 static bool cost_qual_eval_walker(Node *node, QualCost *total);
 static Selectivity approx_selectivity(Query *root, List *quals,
				   JoinType jointype);
@@ -1152,7 +1150,7 @@ cost_hashjoin(HashPath *path, Query *root)
				/* not cached yet */
				thisbucketsize =
					estimate_hash_bucketsize(root,
-							   (Var *) get_rightop(restrictinfo->clause),
+							   get_rightop(restrictinfo->clause),
											 virtualbuckets);
				restrictinfo->right_bucketsize = thisbucketsize;
			}
@@ -1168,7 +1166,7 @@ cost_hashjoin(HashPath *path, Query *root)
				/* not cached yet */
				thisbucketsize =
					estimate_hash_bucketsize(root,
-							   (Var *) get_leftop(restrictinfo->clause),
+							   get_leftop(restrictinfo->clause),
											 virtualbuckets);
				restrictinfo->left_bucketsize = thisbucketsize;
			}
@@ -1249,179 +1247,6 @@ cost_hashjoin(HashPath *path, Query *root)
 	path->jpath.path.total_cost = startup_cost + run_cost;
 }
/*
 * Estimate hash bucketsize fraction (ie, number of entries in a bucket
 * divided by total tuples in relation) if the specified Var is used
 * as a hash key.
 *
 * XXX This is really pretty bogus since we're effectively assuming that the
 * distribution of hash keys will be the same after applying restriction
 * clauses as it was in the underlying relation.  However, we are not nearly
 * smart enough to figure out how the restrict clauses might change the
 * distribution, so this will have to do for now.
 *
 * We are passed the number of buckets the executor will use for the given
 * input relation.  If the data were perfectly distributed, with the same
 * number of tuples going into each available bucket, then the bucketsize
 * fraction would be 1/nbuckets.  But this happy state of affairs will occur
 * only if (a) there are at least nbuckets distinct data values, and (b)
 * we have a not-too-skewed data distribution.  Otherwise the buckets will
 * be nonuniformly occupied.  If the other relation in the join has a key
 * distribution similar to this one's, then the most-loaded buckets are
 * exactly those that will be probed most often.  Therefore, the "average"
 * bucket size for costing purposes should really be taken as something close
 * to the "worst case" bucket size.  We try to estimate this by adjusting the
 * fraction if there are too few distinct data values, and then scaling up
 * by the ratio of the most common value's frequency to the average frequency.
 *
 * If no statistics are available, use a default estimate of 0.1.  This will
 * discourage use of a hash rather strongly if the inner relation is large,
 * which is what we want.  We do not want to hash unless we know that the
 * inner rel is well-dispersed (or the alternatives seem much worse).
 */
static Selectivity
estimate_hash_bucketsize(Query *root, Var *var, int nbuckets)
{
	Oid			relid;
	RelOptInfo *rel;
	HeapTuple	tuple;
	Form_pg_statistic stats;
	double		estfract,
				ndistinct,
				mcvfreq,
				avgfreq;
	float4	   *numbers;
	int			nnumbers;

	/* Ignore any binary-compatible relabeling */
	if (var && IsA(var, RelabelType))
		var = (Var *) ((RelabelType *) var)->arg;

	/*
	 * Lookup info about var's relation and attribute; if none available,
	 * return default estimate.
	 */
	if (var == NULL || !IsA(var, Var))
		return 0.1;

	relid = getrelid(var->varno, root->rtable);
	if (relid == InvalidOid)
		return 0.1;

	rel = find_base_rel(root, var->varno);

	if (rel->tuples <= 0.0 || rel->rows <= 0.0)
		return 0.1;				/* ensure we can divide below */

	tuple = SearchSysCache(STATRELATT,
						   ObjectIdGetDatum(relid),
						   Int16GetDatum(var->varattno),
						   0, 0);
	if (!HeapTupleIsValid(tuple))
	{
		/*
		 * If the attribute is known unique because of an index,
		 * we can treat it as well-distributed.
		 */
		if (has_unique_index(rel, var->varattno))
			return 1.0 / (double) nbuckets;

		/*
		 * Perhaps the Var is a system attribute; if so, it will have no
		 * entry in pg_statistic, but we may be able to guess something
		 * about its distribution anyway.
		 */
		switch (var->varattno)
		{
			case ObjectIdAttributeNumber:
			case SelfItemPointerAttributeNumber:
				/* these are unique, so buckets should be well-distributed */
				return 1.0 / (double) nbuckets;
			case TableOidAttributeNumber:
				/* hashing this is a terrible idea... */
				return 1.0;
		}
		return 0.1;
	}
	stats = (Form_pg_statistic) GETSTRUCT(tuple);

	/*
	 * Obtain number of distinct data values in raw relation.
	 */
	ndistinct = stats->stadistinct;
	if (ndistinct < 0.0)
		ndistinct = -ndistinct * rel->tuples;

	if (ndistinct <= 0.0)		/* ensure we can divide */
	{
		ReleaseSysCache(tuple);
		return 0.1;
	}

	/* Also compute avg freq of all distinct data values in raw relation */
	avgfreq = (1.0 - stats->stanullfrac) / ndistinct;

	/*
	 * Adjust ndistinct to account for restriction clauses.  Observe we
	 * are assuming that the data distribution is affected uniformly by
	 * the restriction clauses!
	 *
	 * XXX Possibly better way, but much more expensive: multiply by
	 * selectivity of rel's restriction clauses that mention the target
	 * Var.
	 */
	ndistinct *= rel->rows / rel->tuples;

	/*
	 * Initial estimate of bucketsize fraction is 1/nbuckets as long as
	 * the number of buckets is less than the expected number of distinct
	 * values; otherwise it is 1/ndistinct.
	 */
	if (ndistinct > (double) nbuckets)
		estfract = 1.0 / (double) nbuckets;
	else
		estfract = 1.0 / ndistinct;

	/*
	 * Look up the frequency of the most common value, if available.
	 */
	mcvfreq = 0.0;

	if (get_attstatsslot(tuple, var->vartype, var->vartypmod,
						 STATISTIC_KIND_MCV, InvalidOid,
						 NULL, NULL, &numbers, &nnumbers))
	{
		/*
		 * The first MCV stat is for the most common value.
		 */
		if (nnumbers > 0)
			mcvfreq = numbers[0];
		free_attstatsslot(var->vartype, NULL, 0,
						  numbers, nnumbers);
	}

	/*
	 * Adjust estimated bucketsize upward to account for skewed
	 * distribution.
	 */
	if (avgfreq > 0.0 && mcvfreq > avgfreq)
		estfract *= mcvfreq / avgfreq;

	/*
	 * Clamp bucketsize to sane range (the above adjustment could easily
	 * produce an out-of-range result).  We set the lower bound a little
	 * above zero, since zero isn't a very sane result.
	 */
	if (estfract < 1.0e-6)
		estfract = 1.0e-6;
	else if (estfract > 1.0)
		estfract = 1.0;

	ReleaseSysCache(tuple);

	return (Selectivity) estfract;
}
/*
* cost_qual_eval