Improve cost estimation for aggregates and window functions.

The previous coding failed to account properly for the costs of evaluating the input expressions of aggregates and window functions, as seen in a recent gripe from Claudio Freire. (I said at the time that it wasn't counting these costs at all; but on closer inspection, it was effectively charging these costs once per output tuple. That is completely wrong for aggregates, and not exactly right for window functions either.) There was also a hard-wired assumption that aggregates and window functions had procost 1.0, which is now fixed to respect the actual cataloged costs. The costing of WindowAgg is still pretty bogus, since it doesn't try to estimate the effects of spilling data to disk, but that seems like a separate issue.
15 years ago · e6a30a8c3c
parent f6322b3191
commit e6a30a8c3c
11 changed files with 199 additions and 107 deletions
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@ -1335,25 +1335,40 @@ cost_material(Path *path,
 *		Determines and returns the cost of performing an Agg plan node,
 *		including the cost of its input.
 *
+ * aggcosts can be NULL when there are no actual aggregate functions (i.e.,
+ * we are using a hashed Agg node just to do grouping).
+ *
 * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
 * are for appropriately-sorted input.
 */
 void
 cost_agg(Path *path, PlannerInfo *root,
-		 AggStrategy aggstrategy, int numAggs,
+		 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
 		 int numGroupCols, double numGroups,
 		 Cost input_startup_cost, Cost input_total_cost,
 		 double input_tuples)
 {
 	Cost		startup_cost;
 	Cost		total_cost;
+	AggClauseCosts dummy_aggcosts;
+
+	/* Use all-zero per-aggregate costs if NULL is passed */
+	if (aggcosts == NULL)
+	{
+		Assert(aggstrategy == AGG_HASHED);
+		MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
+		aggcosts = &dummy_aggcosts;
+	}

 	/*
-	 * We charge one cpu_operator_cost per aggregate function per input tuple,
-	 * and another one per output tuple (corresponding to transfn and finalfn
-	 * calls respectively).  If we are grouping, we charge an additional
-	 * cpu_operator_cost per grouping column per input tuple for grouping
-	 * comparisons.
+	 * The transCost.per_tuple component of aggcosts should be charged once
+	 * per input tuple, corresponding to the costs of evaluating the aggregate
+	 * transfns and their input expressions (with any startup cost of course
+	 * charged but once).  The finalCost component is charged once per output
+	 * tuple, corresponding to the costs of evaluating the finalfns.
+	 *
+	 * If we are grouping, we charge an additional cpu_operator_cost per
+	 * grouping column per input tuple for grouping comparisons.
 	 *
 	 * We will produce a single output tuple if not grouping, and a tuple per
 	 * group otherwise.  We charge cpu_tuple_cost for each output tuple.
@ -1366,15 +1381,13 @@ cost_agg(Path *path, PlannerInfo *root,
 	 * there's roundoff error we might do the wrong thing.  So be sure that
 	 * the computations below form the same intermediate values in the same
 	 * order.
-	 *
-	 * Note: ideally we should use the pg_proc.procost costs of each
-	 * aggregate's component functions, but for now that seems like an
-	 * excessive amount of work.
 	 */
 	if (aggstrategy == AGG_PLAIN)
 	{
 		startup_cost = input_total_cost;
-		startup_cost += cpu_operator_cost * (input_tuples + 1) * numAggs;
+		startup_cost += aggcosts->transCost.startup;
+		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
+		startup_cost += aggcosts->finalCost;
 		/* we aren't grouping */
 		total_cost = startup_cost + cpu_tuple_cost;
 	}
@ -1384,19 +1397,21 @@ cost_agg(Path *path, PlannerInfo *root,
 		startup_cost = input_startup_cost;
 		total_cost = input_total_cost;
 		/* calcs phrased this way to match HASHED case, see note above */
-		total_cost += cpu_operator_cost * input_tuples * numGroupCols;
-		total_cost += cpu_operator_cost * input_tuples * numAggs;
-		total_cost += cpu_operator_cost * numGroups * numAggs;
+		total_cost += aggcosts->transCost.startup;
+		total_cost += aggcosts->transCost.per_tuple * input_tuples;
+		total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
+		total_cost += aggcosts->finalCost * numGroups;
 		total_cost += cpu_tuple_cost * numGroups;
 	}
 	else
 	{
 		/* must be AGG_HASHED */
 		startup_cost = input_total_cost;
-		startup_cost += cpu_operator_cost * input_tuples * numGroupCols;
-		startup_cost += cpu_operator_cost * input_tuples * numAggs;
+		startup_cost += aggcosts->transCost.startup;
+		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
+		startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
 		total_cost = startup_cost;
-		total_cost += cpu_operator_cost * numGroups * numAggs;
+		total_cost += aggcosts->finalCost * numGroups;
 		total_cost += cpu_tuple_cost * numGroups;
 	}

@ -1413,25 +1428,53 @@ cost_agg(Path *path, PlannerInfo *root,
 */
 void
 cost_windowagg(Path *path, PlannerInfo *root,
-			   int numWindowFuncs, int numPartCols, int numOrderCols,
+			   List *windowFuncs, int numPartCols, int numOrderCols,
 			   Cost input_startup_cost, Cost input_total_cost,
 			   double input_tuples)
 {
 	Cost		startup_cost;
 	Cost		total_cost;
+	ListCell   *lc;

 	startup_cost = input_startup_cost;
 	total_cost = input_total_cost;

 	/*
-	 * We charge one cpu_operator_cost per window function per tuple (often a
-	 * drastic underestimate, but without a way to gauge how many tuples the
-	 * window function will fetch, it's hard to do better).  We also charge
-	 * cpu_operator_cost per grouping column per tuple for grouping
-	 * comparisons, plus cpu_tuple_cost per tuple for general overhead.
-	 */
-	total_cost += cpu_operator_cost * input_tuples * numWindowFuncs;
-	total_cost += cpu_operator_cost * input_tuples * (numPartCols + numOrderCols);
+	 * Window functions are assumed to cost their stated execution cost, plus
+	 * the cost of evaluating their input expressions, per tuple.  Since they
+	 * may in fact evaluate their inputs at multiple rows during each cycle,
+	 * this could be a drastic underestimate; but without a way to know how
+	 * many rows the window function will fetch, it's hard to do better.  In
+	 * any case, it's a good estimate for all the built-in window functions,
+	 * so we'll just do this for now.
+	 */
+	foreach(lc, windowFuncs)
+	{
+		WindowFunc *wfunc = (WindowFunc *) lfirst(lc);
+		Cost		wfunccost;
+		QualCost	argcosts;
+
+		Assert(IsA(wfunc, WindowFunc));
+
+		wfunccost = get_func_cost(wfunc->winfnoid) * cpu_operator_cost;
+
+		/* also add the input expressions' cost to per-input-row costs */
+		cost_qual_eval_node(&argcosts, (Node *) wfunc->args, root);
+		startup_cost += argcosts.startup;
+		wfunccost += argcosts.per_tuple;
+
+		total_cost += wfunccost * input_tuples;
+	}
+
+	/*
+	 * We also charge cpu_operator_cost per grouping column per tuple for
+	 * grouping comparisons, plus cpu_tuple_cost per tuple for general
+	 * overhead.
+	 *
+	 * XXX this neglects costs of spooling the data to disk when it overflows
+	 * work_mem.  Sooner or later that should get accounted for.
+	 */
+	total_cost += cpu_operator_cost * (numPartCols + numOrderCols) * input_tuples;
 	total_cost += cpu_tuple_cost * input_tuples;

 	path->startup_cost = startup_cost;
@ -2640,17 +2683,12 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
 	 * Vars and Consts are charged zero, and so are boolean operators (AND,
 	 * OR, NOT). Simplistic, but a lot better than no model at all.
 	 *
-	 * Note that Aggref and WindowFunc nodes are (and should be) treated like
-	 * Vars --- whatever execution cost they have is absorbed into
-	 * plan-node-specific costing.	As far as expression evaluation is
-	 * concerned they're just like Vars.
-	 *
 	 * Should we try to account for the possibility of short-circuit
 	 * evaluation of AND/OR?  Probably *not*, because that would make the
 	 * results depend on the clause ordering, and we are not in any position
 	 * to expect that the current ordering of the clauses is the one that's
-	 * going to end up being used.	(Is it worth applying order_qual_clauses
-	 * much earlier in the planning process to fix this?)
+	 * going to end up being used.  The above per-RestrictInfo caching would
+	 * not mix well with trying to re-order clauses anyway.
 	 */
 	if (IsA(node, FuncExpr))
 	{
@ -2679,6 +2717,20 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
 		context->total.per_tuple += get_func_cost(saop->opfuncid) *
 			cpu_operator_cost * estimate_array_length(arraynode) * 0.5;
 	}
+	else if (IsA(node, Aggref) ||
+			 IsA(node, WindowFunc))
+	{
+		/*
+		 * Aggref and WindowFunc nodes are (and should be) treated like Vars,
+		 * ie, zero execution cost in the current model, because they behave
+		 * essentially like Vars in execQual.c.  We disregard the costs of
+		 * their input expressions for the same reason.  The actual execution
+		 * costs of the aggregate/window functions and their arguments have to
+		 * be factored into plan-node-specific costing of the Agg or WindowAgg
+		 * plan node.
+		 */
+		return false;			/* don't recurse into children */
+	}
 	else if (IsA(node, CoerceViaIO))
 	{
 		CoerceViaIO *iocoerce = (CoerceViaIO *) node;
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@ -939,11 +939,11 @@ create_unique_plan(PlannerInfo *root, UniquePath *best_path)
 								 build_relation_tlist(best_path->path.parent),
 								 NIL,
 								 AGG_HASHED,
+								 NULL,
 								 numGroupCols,
 								 groupColIdx,
 								 groupOperators,
 								 numGroups,
-								 0,
 								 subplan);
 	}
 	else
@ -3841,9 +3841,9 @@ materialize_finished_plan(Plan *subplan)

 Agg *
 make_agg(PlannerInfo *root, List *tlist, List *qual,
-		 AggStrategy aggstrategy,
+		 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
 		 int numGroupCols, AttrNumber *grpColIdx, Oid *grpOperators,
-		 long numGroups, int numAggs,
+		 long numGroups,
 		 Plan *lefttree)
 {
 	Agg		   *node = makeNode(Agg);
@ -3859,7 +3859,7 @@ make_agg(PlannerInfo *root, List *tlist, List *qual,

 	copy_plan_costsize(plan, lefttree); /* only care about copying size */
 	cost_agg(&agg_path, root,
-			 aggstrategy, numAggs,
+			 aggstrategy, aggcosts,
 			 numGroupCols, numGroups,
 			 lefttree->startup_cost,
 			 lefttree->total_cost,
@ -3907,7 +3907,7 @@ make_agg(PlannerInfo *root, List *tlist, List *qual,

 WindowAgg *
 make_windowagg(PlannerInfo *root, List *tlist,
-			   int numWindowFuncs, Index winref,
+			   List *windowFuncs, Index winref,
 			   int partNumCols, AttrNumber *partColIdx, Oid *partOperators,
 			   int ordNumCols, AttrNumber *ordColIdx, Oid *ordOperators,
 			   int frameOptions, Node *startOffset, Node *endOffset,
@ -3931,7 +3931,7 @@ make_windowagg(PlannerInfo *root, List *tlist,

 	copy_plan_costsize(plan, lefttree); /* only care about copying size */
 	cost_windowagg(&windowagg_path, root,
-				   numWindowFuncs, partNumCols, ordNumCols,
+				   windowFuncs, partNumCols, ordNumCols,
 				   lefttree->startup_cost,
 				   lefttree->total_cost,
 				   lefttree->plan_rows);
--- a/src/backend/optimizer/plan/planagg.c
+++ b/src/backend/optimizer/plan/planagg.c
@ -187,11 +187,13 @@ preprocess_minmax_aggregates(PlannerInfo *root, List *tlist)
 * Should we skip even trying to build the standard plan, if
 * preprocess_minmax_aggregates succeeds?
 *
- * We are passed the preprocessed tlist, as well as the best path devised for
+ * We are passed the preprocessed tlist, as well as the estimated costs for
+ * doing the aggregates the regular way, and the best path devised for
 * computing the input of a standard Agg node.
 */
 Plan *
-optimize_minmax_aggregates(PlannerInfo *root, List *tlist, Path *best_path)
+optimize_minmax_aggregates(PlannerInfo *root, List *tlist,
+						   const AggClauseCosts *aggcosts, Path *best_path)
 {
 	Query	   *parse = root->parse;
 	Cost		total_cost;
@ -221,7 +223,7 @@ optimize_minmax_aggregates(PlannerInfo *root, List *tlist, Path *best_path)
 		total_cost += mminfo->pathcost;
 	}

-	cost_agg(&agg_p, root, AGG_PLAIN, list_length(root->minmax_aggs),
+	cost_agg(&agg_p, root, AGG_PLAIN, aggcosts,
 			 0, 0,
 			 best_path->startup_cost, best_path->total_cost,
 			 best_path->parent->rows);
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@ -74,7 +74,7 @@ static bool choose_hashed_grouping(PlannerInfo *root,
 					   double tuple_fraction, double limit_tuples,
 					   double path_rows, int path_width,
 					   Path *cheapest_path, Path *sorted_path,
-					   double dNumGroups, AggClauseCounts *agg_counts);
+					   double dNumGroups, AggClauseCosts *agg_costs);
 static bool choose_hashed_distinct(PlannerInfo *root,
 					   double tuple_fraction, double limit_tuples,
 					   double path_rows, int path_width,
@ -979,7 +979,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		Path	   *sorted_path;
 		Path	   *best_path;
 		long		numGroups = 0;
-		AggClauseCounts agg_counts;
+		AggClauseCosts agg_costs;
 		int			numGroupCols;
 		double		path_rows;
 		int			path_width;
@ -987,7 +987,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		WindowFuncLists *wflists = NULL;
 		List	   *activeWindows = NIL;

-		MemSet(&agg_counts, 0, sizeof(AggClauseCounts));
+		MemSet(&agg_costs, 0, sizeof(AggClauseCosts));

 		/* A recursive query should always have setOperations */
 		Assert(!root->hasRecursion);
@ -1034,12 +1034,12 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		if (parse->hasAggs)
 		{
 			/*
-			 * Will need actual number of aggregates for estimating costs.
+			 * Collect statistics about aggregates for estimating costs.
 			 * Note: we do not attempt to detect duplicate aggregates here; a
-			 * somewhat-overestimated count is okay for our present purposes.
+			 * somewhat-overestimated cost is okay for our present purposes.
 			 */
-			count_agg_clauses((Node *) tlist, &agg_counts);
-			count_agg_clauses(parse->havingQual, &agg_counts);
+			count_agg_clauses(root, (Node *) tlist, &agg_costs);
+			count_agg_clauses(root, parse->havingQual, &agg_costs);

 			/*
 			 * Preprocess MIN/MAX aggregates, if any.  Note: be careful about
@ -1176,7 +1176,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 									   tuple_fraction, limit_tuples,
 									   path_rows, path_width,
 									   cheapest_path, sorted_path,
-									   dNumGroups, &agg_counts);
+									   dNumGroups, &agg_costs);
 			/* Also convert # groups to long int --- but 'ware overflow! */
 			numGroups = (long) Min(dNumGroups, (double) LONG_MAX);
 		}
@ -1219,6 +1219,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		 */
 		result_plan = optimize_minmax_aggregates(root,
 												 tlist,
+												 &agg_costs,
 												 best_path);
 		if (result_plan != NULL)
 		{
@ -1330,11 +1331,11 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 												tlist,
 												(List *) parse->havingQual,
 												AGG_HASHED,
+												&agg_costs,
 												numGroupCols,
 												groupColIdx,
 									extract_grouping_ops(parse->groupClause),
 												numGroups,
-												agg_counts.numAggs,
 												result_plan);
 				/* Hashed aggregation produces randomly-ordered results */
 				current_pathkeys = NIL;
@ -1373,11 +1374,11 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 												tlist,
 												(List *) parse->havingQual,
 												aggstrategy,
+												&agg_costs,
 												numGroupCols,
 												groupColIdx,
 									extract_grouping_ops(parse->groupClause),
 												numGroups,
-												agg_counts.numAggs,
 												result_plan);
 			}
 			else if (parse->groupClause)
@ -1559,7 +1560,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 				result_plan = (Plan *)
 					make_windowagg(root,
 								   (List *) copyObject(window_tlist),
-							   list_length(wflists->windowFuncs[wc->winref]),
+								   wflists->windowFuncs[wc->winref],
 								   wc->winref,
 								   partNumCols,
 								   partColIdx,
@ -1625,12 +1626,12 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 											result_plan->targetlist,
 											NIL,
 											AGG_HASHED,
+											NULL,
 										  list_length(parse->distinctClause),
 								 extract_grouping_cols(parse->distinctClause,
 													result_plan->targetlist),
 								 extract_grouping_ops(parse->distinctClause),
 											numDistinctRows,
-											0,
 											result_plan);
 			/* Hashed aggregation produces randomly-ordered results */
 			current_pathkeys = NIL;
@ -2213,7 +2214,7 @@ choose_hashed_grouping(PlannerInfo *root,
 					   double tuple_fraction, double limit_tuples,
 					   double path_rows, int path_width,
 					   Path *cheapest_path, Path *sorted_path,
-					   double dNumGroups, AggClauseCounts *agg_counts)
+					   double dNumGroups, AggClauseCosts *agg_costs)
 {
 	Query	   *parse = root->parse;
 	int			numGroupCols = list_length(parse->groupClause);
@ -2231,7 +2232,7 @@ choose_hashed_grouping(PlannerInfo *root,
 	 * the hash table, and/or running many sorts in parallel, either of which
 	 * seems like a certain loser.)
 	 */
-	can_hash = (agg_counts->numOrderedAggs == 0 &&
+	can_hash = (agg_costs->numOrderedAggs == 0 &&
 				grouping_is_hashable(parse->groupClause));
 	can_sort = grouping_is_sortable(parse->groupClause);

@ -2261,9 +2262,9 @@ choose_hashed_grouping(PlannerInfo *root,
 	/* Estimate per-hash-entry space at tuple width... */
 	hashentrysize = MAXALIGN(path_width) + MAXALIGN(sizeof(MinimalTupleData));
 	/* plus space for pass-by-ref transition values... */
-	hashentrysize += agg_counts->transitionSpace;
+	hashentrysize += agg_costs->transitionSpace;
 	/* plus the per-hash-entry overhead */
-	hashentrysize += hash_agg_entry_size(agg_counts->numAggs);
+	hashentrysize += hash_agg_entry_size(agg_costs->numAggs);

 	if (hashentrysize * dNumGroups > work_mem * 1024L)
 		return false;
@ -2297,7 +2298,7 @@ choose_hashed_grouping(PlannerInfo *root,
 	 * These path variables are dummies that just hold cost fields; we don't
 	 * make actual Paths for these steps.
 	 */
-	cost_agg(&hashed_p, root, AGG_HASHED, agg_counts->numAggs,
+	cost_agg(&hashed_p, root, AGG_HASHED, agg_costs,
 			 numGroupCols, dNumGroups,
 			 cheapest_path->startup_cost, cheapest_path->total_cost,
 			 path_rows);
@ -2328,7 +2329,7 @@ choose_hashed_grouping(PlannerInfo *root,
 	}

 	if (parse->hasAggs)
-		cost_agg(&sorted_p, root, AGG_SORTED, agg_counts->numAggs,
+		cost_agg(&sorted_p, root, AGG_SORTED, agg_costs,
 				 numGroupCols, dNumGroups,
 				 sorted_p.startup_cost, sorted_p.total_cost,
 				 path_rows);
@ -2447,7 +2448,7 @@ choose_hashed_distinct(PlannerInfo *root,
 	 * These path variables are dummies that just hold cost fields; we don't
 	 * make actual Paths for these steps.
 	 */
-	cost_agg(&hashed_p, root, AGG_HASHED, 0,
+	cost_agg(&hashed_p, root, AGG_HASHED, NULL,
 			 numDistinctCols, dNumDistinctRows,
 			 cheapest_startup_cost, cheapest_total_cost,
 			 path_rows);
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@ -732,12 +732,12 @@ make_union_unique(SetOperationStmt *op, Plan *plan,
 								 plan->targetlist,
 								 NIL,
 								 AGG_HASHED,
+								 NULL,
 								 list_length(groupList),
 								 extract_grouping_cols(groupList,
 													   plan->targetlist),
 								 extract_grouping_ops(groupList),
 								 numGroups,
-								 0,
 								 plan);
 		/* Hashed aggregation produces randomly-ordered results */
 		*sortClauses = NIL;
@ -814,7 +814,7 @@ choose_hashed_setop(PlannerInfo *root, List *groupClauses,
 	 * These path variables are dummies that just hold cost fields; we don't
 	 * make actual Paths for these steps.
 	 */
-	cost_agg(&hashed_p, root, AGG_HASHED, 0,
+	cost_agg(&hashed_p, root, AGG_HASHED, NULL,
 			 numGroupCols, dNumGroups,
 			 input_plan->startup_cost, input_plan->total_cost,
 			 input_plan->plan_rows);
--- a/src/backend/optimizer/util/clauses.c
+++ b/src/backend/optimizer/util/clauses.c
@ -48,6 +48,12 @@
 #include "utils/typcache.h"


+typedef struct
+{
+	PlannerInfo *root;
+	AggClauseCosts *costs;
+} count_agg_clauses_context;
+
 typedef struct
 {
 	ParamListInfo boundParams;
@ -79,7 +85,8 @@ typedef struct

 static bool contain_agg_clause_walker(Node *node, void *context);
 static bool pull_agg_clause_walker(Node *node, List **context);
-static bool count_agg_clauses_walker(Node *node, AggClauseCounts *counts);
+static bool count_agg_clauses_walker(Node *node,
+									 count_agg_clauses_context *context);
 static bool find_window_functions_walker(Node *node, WindowFuncLists *lists);
 static bool expression_returns_set_rows_walker(Node *node, double *count);
 static bool contain_subplans_walker(Node *node, void *context);
@ -448,48 +455,80 @@ pull_agg_clause_walker(Node *node, List **context)

 /*
 * count_agg_clauses
- *	  Recursively count the Aggref nodes in an expression tree.
+ *	  Recursively count the Aggref nodes in an expression tree, and
+ *	  accumulate other cost information about them too.
 *
 *	  Note: this also checks for nested aggregates, which are an error.
 *
- * We not only count the nodes, but attempt to estimate the total space
- * needed for their transition state values if all are evaluated in parallel
- * (as would be done in a HashAgg plan).  See AggClauseCounts for the exact
- * set of statistics returned.
+ * We not only count the nodes, but estimate their execution costs, and
+ * attempt to estimate the total space needed for their transition state
+ * values if all are evaluated in parallel (as would be done in a HashAgg
+ * plan).  See AggClauseCosts for the exact set of statistics collected.
 *
- * NOTE that the counts are ADDED to those already in *counts ... so the
- * caller is responsible for zeroing the struct initially.
+ * NOTE that the counts/costs are ADDED to those already in *costs ... so
+ * the caller is responsible for zeroing the struct initially.
 *
 * This does not descend into subqueries, and so should be used only after
 * reduction of sublinks to subplans, or in contexts where it's known there
 * are no subqueries.  There mustn't be outer-aggregate references either.
 */
 void
-count_agg_clauses(Node *clause, AggClauseCounts *counts)
+count_agg_clauses(PlannerInfo *root, Node *clause, AggClauseCosts *costs)
 {
-	/* no setup needed */
-	count_agg_clauses_walker(clause, counts);
+	count_agg_clauses_context context;
+
+	context.root = root;
+	context.costs = costs;
+	(void) count_agg_clauses_walker(clause, &context);
 }

 static bool
-count_agg_clauses_walker(Node *node, AggClauseCounts *counts)
+count_agg_clauses_walker(Node *node, count_agg_clauses_context *context)
 {
 	if (node == NULL)
 		return false;
 	if (IsA(node, Aggref))
 	{
 		Aggref	   *aggref = (Aggref *) node;
-		Oid		   *inputTypes;
-		int			numArguments;
+		AggClauseCosts *costs = context->costs;
 		HeapTuple	aggTuple;
 		Form_pg_aggregate aggform;
+		Oid			aggtransfn;
+		Oid			aggfinalfn;
 		Oid			aggtranstype;
+		QualCost	argcosts;
+		Oid		   *inputTypes;
+		int			numArguments;
 		ListCell   *l;

 		Assert(aggref->agglevelsup == 0);
-		counts->numAggs++;
+
+		/* fetch info about aggregate from pg_aggregate */
+		aggTuple = SearchSysCache1(AGGFNOID,
+								   ObjectIdGetDatum(aggref->aggfnoid));
+		if (!HeapTupleIsValid(aggTuple))
+			elog(ERROR, "cache lookup failed for aggregate %u",
+				 aggref->aggfnoid);
+		aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
+		aggtransfn = aggform->aggtransfn;
+		aggfinalfn = aggform->aggfinalfn;
+		aggtranstype = aggform->aggtranstype;
+		ReleaseSysCache(aggTuple);
+
+		/* count it */
+		costs->numAggs++;
 		if (aggref->aggorder != NIL || aggref->aggdistinct != NIL)
-			counts->numOrderedAggs++;
+			costs->numOrderedAggs++;
+
+		/* add component function execution costs to appropriate totals */
+		costs->transCost.per_tuple += get_func_cost(aggtransfn) * cpu_operator_cost;
+		if (OidIsValid(aggfinalfn))
+			costs->finalCost += get_func_cost(aggfinalfn) * cpu_operator_cost;
+
+		/* also add the input expressions' cost to per-input-row costs */
+		cost_qual_eval_node(&argcosts, (Node *) aggref->args, context->root);
+		costs->transCost.startup += argcosts.startup;
+		costs->transCost.per_tuple += argcosts.per_tuple;

 		/* extract argument types (ignoring any ORDER BY expressions) */
 		inputTypes = (Oid *) palloc(sizeof(Oid) * list_length(aggref->args));
@ -502,16 +541,6 @@ count_agg_clauses_walker(Node *node, AggClauseCounts *counts)
 				inputTypes[numArguments++] = exprType((Node *) tle->expr);
 		}

-		/* fetch aggregate transition datatype from pg_aggregate */
-		aggTuple = SearchSysCache1(AGGFNOID,
-								   ObjectIdGetDatum(aggref->aggfnoid));
-		if (!HeapTupleIsValid(aggTuple))
-			elog(ERROR, "cache lookup failed for aggregate %u",
-				 aggref->aggfnoid);
-		aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
-		aggtranstype = aggform->aggtranstype;
-		ReleaseSysCache(aggTuple);
-
 		/* resolve actual type of transition state, if polymorphic */
 		if (IsPolymorphicType(aggtranstype))
 		{
@ -554,7 +583,7 @@ count_agg_clauses_walker(Node *node, AggClauseCounts *counts)
 			avgwidth = get_typavgwidth(aggtranstype, aggtranstypmod);
 			avgwidth = MAXALIGN(avgwidth);

-			counts->transitionSpace += avgwidth + 2 * sizeof(void *);
+			costs->transitionSpace += avgwidth + 2 * sizeof(void *);
 		}
 		else if (aggtranstype == INTERNALOID)
 		{
@ -566,7 +595,7 @@ count_agg_clauses_walker(Node *node, AggClauseCounts *counts)
 			 * being kept in a private memory context, as is done by
 			 * array_agg() for instance.
 			 */
-			counts->transitionSpace += ALLOCSET_DEFAULT_INITSIZE;
+			costs->transitionSpace += ALLOCSET_DEFAULT_INITSIZE;
 		}

 		/*
@ -585,7 +614,7 @@ count_agg_clauses_walker(Node *node, AggClauseCounts *counts)
 	}
 	Assert(!IsA(node, SubLink));
 	return expression_tree_walker(node, count_agg_clauses_walker,
-								  (void *) counts);
+								  (void *) context);
 }


--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@ -1103,7 +1103,7 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 			all_hash = false;	/* don't try to hash */
 		else
 			cost_agg(&agg_path, root,
-					 AGG_HASHED, 0,
+					 AGG_HASHED, NULL,
 					 numCols, pathnode->rows,
 					 subpath->startup_cost,
 					 subpath->total_cost,
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@ -46,6 +46,20 @@ typedef struct QualCost
 	Cost		per_tuple;		/* per-evaluation cost */
 } QualCost;

+/*
+ * Costing aggregate function execution requires these statistics about
+ * the aggregates to be executed by a given Agg node.  Note that transCost
+ * includes the execution costs of the aggregates' input expressions.
+ */
+typedef struct AggClauseCosts
+{
+	int			numAggs;		/* total number of aggregate functions */
+	int			numOrderedAggs; /* number that use DISTINCT or ORDER BY */
+	QualCost	transCost;		/* total per-input-row execution costs */
+	Cost		finalCost;		/* total costs of agg final functions */
+	Size		transitionSpace;	/* space for pass-by-ref transition data */
+} AggClauseCosts;
+

 /*----------
 * PlannerGlobal
--- a/src/include/optimizer/clauses.h
+++ b/src/include/optimizer/clauses.h
@ -20,13 +20,6 @@
 #define is_opclause(clause)		((clause) != NULL && IsA(clause, OpExpr))
 #define is_funcclause(clause)	((clause) != NULL && IsA(clause, FuncExpr))

-typedef struct
-{
-	int			numAggs;		/* total number of aggregate calls */
-	int			numOrderedAggs; /* number that use DISTINCT or ORDER BY */
-	Size		transitionSpace;	/* for pass-by-ref transition data */
-} AggClauseCounts;
-
 typedef struct
 {
 	int			numWindowFuncs; /* total number of WindowFuncs found */
@ -56,7 +49,8 @@ extern List *make_ands_implicit(Expr *clause);

 extern bool contain_agg_clause(Node *clause);
 extern List *pull_agg_clause(Node *clause);
-extern void count_agg_clauses(Node *clause, AggClauseCounts *counts);
+extern void count_agg_clauses(PlannerInfo *root, Node *clause,
+							  AggClauseCosts *costs);

 extern bool contain_window_function(Node *clause);
 extern WindowFuncLists *find_window_functions(Node *clause, Index maxWinRef);
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@ -94,12 +94,12 @@ extern void cost_material(Path *path,
 			  Cost input_startup_cost, Cost input_total_cost,
 			  double tuples, int width);
 extern void cost_agg(Path *path, PlannerInfo *root,
-		 AggStrategy aggstrategy, int numAggs,
+		 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
 		 int numGroupCols, double numGroups,
 		 Cost input_startup_cost, Cost input_total_cost,
 		 double input_tuples);
 extern void cost_windowagg(Path *path, PlannerInfo *root,
-			   int numWindowFuncs, int numPartCols, int numOrderCols,
+			   List *windowFuncs, int numPartCols, int numOrderCols,
 			   Cost input_startup_cost, Cost input_total_cost,
 			   double input_tuples);
 extern void cost_group(Path *path, PlannerInfo *root,
--- a/src/include/optimizer/planmain.h
+++ b/src/include/optimizer/planmain.h
@ -34,7 +34,7 @@ extern void query_planner(PlannerInfo *root, List *tlist,
 */
 extern void preprocess_minmax_aggregates(PlannerInfo *root, List *tlist);
 extern Plan *optimize_minmax_aggregates(PlannerInfo *root, List *tlist,
-						   Path *best_path);
+						   const AggClauseCosts *aggcosts, Path *best_path);

 /*
 * prototypes for plan/createplan.c
@ -54,12 +54,12 @@ extern Sort *make_sort_from_sortclauses(PlannerInfo *root, List *sortcls,
 extern Sort *make_sort_from_groupcols(PlannerInfo *root, List *groupcls,
 						 AttrNumber *grpColIdx, Plan *lefttree);
 extern Agg *make_agg(PlannerInfo *root, List *tlist, List *qual,
-		 AggStrategy aggstrategy,
+		 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
 		 int numGroupCols, AttrNumber *grpColIdx, Oid *grpOperators,
-		 long numGroups, int numAggs,
+		 long numGroups,
 		 Plan *lefttree);
 extern WindowAgg *make_windowagg(PlannerInfo *root, List *tlist,
-			   int numWindowFuncs, Index winref,
+			   List *windowFuncs, Index winref,
 			   int partNumCols, AttrNumber *partColIdx, Oid *partOperators,
 			   int ordNumCols, AttrNumber *ordColIdx, Oid *ordOperators,
 			   int frameOptions, Node *startOffset, Node *endOffset,