postgres/src/backend/optimizer/plan/analyzejoins.c

/*-------------------------------------------------------------------------
*
* analyzejoins.c
* Routines for simplifying joins after initial query analysis
*
* While we do a great deal of join simplification in prep/prepjointree.c,
* certain optimizations cannot be performed at that stage for lack of
* detailed information about the query. The routines here are invoked
* after initsplan.c has done its work, and can do additional join removal
* and simplification steps based on the information extracted. The penalty
* is that we have to work harder to clean up after ourselves when we modify
* the query, since the derived data structures have to be updated too.
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/optimizer/plan/analyzejoins.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/joininfo.h"
#include "optimizer/optimizer.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "optimizer/planmain.h"
#include "optimizer/tlist.h"
#include "utils/lsyscache.h"
/* local functions */
static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo);
static void remove_rel_from_query(PlannerInfo *root, int relid, int ojrelid,
Relids joinrelids);
static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved);
static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel);
static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel,
List *clause_list);
static Oid distinct_col_search(int colno, List *colnos, List *opids);
static bool is_innerrel_unique_for(PlannerInfo *root,
Relids joinrelids,
Relids outerrelids,
RelOptInfo *innerrel,
JoinType jointype,
List *restrictlist);
/*
* remove_useless_joins
* Check for relations that don't actually need to be joined at all,
* and remove them from the query.
*
* We are passed the current joinlist and return the updated list. Other
* data structures that have to be updated are accessible via "root".
*/
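/*
 * Illustrative example (hypothetical tables, not drawn from this file): in
 *		SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id;
 * if b.id is unique (say, b's primary key) and no column of b is referenced
 * above the join, each "a" row matches at most one "b" row and nothing of
 * "b" reaches the output, so the left join can be removed entirely.
 */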
List *
remove_useless_joins(PlannerInfo *root, List *joinlist)
{
ListCell *lc;
/*
* We are only interested in relations that are left-joined to, so we can
* scan the join_info_list to find them easily.
*/
restart:
foreach(lc, root->join_info_list)
{
SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc);
Relids joinrelids;
int innerrelid;
int nremoved;
/* Skip if not removable */
if (!join_is_removable(root, sjinfo))
continue;
/*
* Currently, join_is_removable can only succeed when the sjinfo's
* righthand is a single baserel. Remove that rel from the query and
* joinlist.
*/
innerrelid = bms_singleton_member(sjinfo->min_righthand);
/* Compute the relid set for the join we are considering */
joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand);
if (sjinfo->ojrelid != 0)
joinrelids = bms_add_member(joinrelids, sjinfo->ojrelid);
remove_rel_from_query(root, innerrelid, sjinfo->ojrelid, joinrelids);
/* We verify that exactly one reference gets removed from joinlist */
nremoved = 0;
joinlist = remove_rel_from_joinlist(joinlist, innerrelid, &nremoved);
if (nremoved != 1)
elog(ERROR, "failed to find relation %d in joinlist", innerrelid);
/*
* We can delete this SpecialJoinInfo from the list too, since it's no
* longer of interest. (Since we'll restart the foreach loop
* immediately, we don't bother with foreach_delete_current.)
*/
root->join_info_list = list_delete_cell(root->join_info_list, lc);
/*
* Restart the scan. This is necessary to ensure we find all
* removable joins independently of ordering of the join_info_list
* (note that removal of attr_needed bits may make a join appear
* removable that did not before).
*/
goto restart;
}
return joinlist;
}
/*
* clause_sides_match_join
* Determine whether a join clause is of the right form to use in this join.
*
* We already know that the clause is a binary opclause referencing only the
* rels in the current join. The point here is to check whether it has the
* form "outerrel_expr op innerrel_expr" or "innerrel_expr op outerrel_expr",
* rather than mixing outer and inner vars on either side. If it matches,
* we set the transient flag outer_is_left to identify which side is which.
*/
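/*
 * For instance (hypothetical rels, "a" outer and "b" inner): a clause such
 * as "a.x = b.y" has the required form, with all outer vars on one side and
 * all inner vars on the other, whereas "a.x + b.y = 0" does not, since it
 * mixes vars of both rels on the same side.
 */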
static inline bool
clause_sides_match_join(RestrictInfo *rinfo, Relids outerrelids,
Relids innerrelids)
{
if (bms_is_subset(rinfo->left_relids, outerrelids) &&
bms_is_subset(rinfo->right_relids, innerrelids))
{
/* lefthand side is outer */
rinfo->outer_is_left = true;
return true;
}
else if (bms_is_subset(rinfo->left_relids, innerrelids) &&
bms_is_subset(rinfo->right_relids, outerrelids))
{
/* righthand side is outer */
rinfo->outer_is_left = false;
return true;
}
return false; /* no good for these input relations */
}
/*
* join_is_removable
* Check whether we need not perform this special join at all, because
* it will just duplicate its left input.
*
* This is true for a left join for which the join condition cannot match
* more than one inner-side row. (There are other possibly interesting
* cases, but we don't have the infrastructure to prove them.) We also
* have to check that the inner side doesn't generate any variables needed
* above the join.
*/
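/*
 * A hypothetical example of the non-removable case:
 *		SELECT a.*, b.val FROM a LEFT JOIN b ON a.b_id = b.id;
 * Here b.val is needed above the join, so the join must be kept even if
 * b.id is provably unique.
 */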
static bool
join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
{
int innerrelid;
RelOptInfo *innerrel;
Relids joinrelids;
List *clause_list = NIL;
ListCell *l;
int attroff;
/*
* Must be a non-delaying left join to a single baserel, else we aren't
* going to be able to do anything with it.
*/
if (sjinfo->jointype != JOIN_LEFT ||
sjinfo->delay_upper_joins)
return false;
if (!bms_get_singleton_member(sjinfo->min_righthand, &innerrelid))
return false;
innerrel = find_base_rel(root, innerrelid);
/*
* Before we go to the effort of checking whether any innerrel variables
* are needed above the join, make a quick check to eliminate cases in
* which we will surely be unable to prove uniqueness of the innerrel.
*/
if (!rel_supports_distinctness(root, innerrel))
return false;
/* Compute the relid set for the join we are considering */
joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand);
if (sjinfo->ojrelid != 0)
joinrelids = bms_add_member(joinrelids, sjinfo->ojrelid);
/*
* We can't remove the join if any inner-rel attributes are used above the
* join.
*
* Note that this test only detects use of inner-rel attributes in higher
* join conditions and the target list. There might be such attributes in
* pushed-down conditions at this join, too. We check that case below.
*
* As a micro-optimization, it seems better to start with max_attr and
* count down rather than starting with min_attr and counting up, on the
* theory that the system attributes are somewhat less likely to be wanted
* and should be tested last.
*/
for (attroff = innerrel->max_attr - innerrel->min_attr;
attroff >= 0;
attroff--)
{
if (!bms_is_subset(innerrel->attr_needed[attroff], joinrelids))
return false;
}
/*
* Similarly check that the inner rel isn't needed by any PlaceHolderVars
* that will be used above the join. We only need to fail if such a PHV
* actually references some inner-rel attributes; but the correct check
* for that is relatively expensive, so we first check against ph_eval_at,
* which must mention the inner rel if the PHV uses any inner-rel attrs as
* non-lateral references. Note that if the PHV's syntactic scope is just
* the inner rel, we can't drop the rel even if the PHV is variable-free.
*/
foreach(l, root->placeholder_list)
{
PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(l);
if (bms_overlap(phinfo->ph_lateral, innerrel->relids))
return false; /* it references innerrel laterally */
if (bms_is_subset(phinfo->ph_needed, joinrelids))
continue; /* PHV is not used above the join */
if (!bms_overlap(phinfo->ph_eval_at, innerrel->relids))
continue; /* it definitely doesn't reference innerrel */
if (bms_is_subset(phinfo->ph_eval_at, innerrel->relids))
return false; /* there isn't any other place to eval PHV */
if (bms_overlap(pull_varnos(root, (Node *) phinfo->ph_var->phexpr),
innerrel->relids))
return false; /* it does reference innerrel */
}
/*
* Search for mergejoinable clauses that constrain the inner rel against
* either the outer rel or a pseudoconstant. If an operator is
* mergejoinable then it behaves like equality for some btree opclass, so
* it's what we want. The mergejoinability test also eliminates clauses
* containing volatile functions, which we couldn't depend on.
*/
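/*
 * Hypothetical examples: "a.b_id = b.id" constrains the inner rel against
 * the outer rel, while "b.id = 42" written in the join's ON clause (hence
 * not a pushed-down qual) constrains it against a pseudoconstant; either
 * kind can help prove that at most one inner row matches.
 */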
foreach(l, innerrel->joininfo)
{
RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l);
/*
* If the current join commutes with some other outer join(s) via
* outer join identity 3, there will be multiple clones of its join
* clauses in the joininfo list. We want to consider only the
* has_clone form of such clauses. Processing more than one form
* would be wasteful, and also some of the others would confuse the
* RINFO_IS_PUSHED_DOWN test below.
*/
if (restrictinfo->is_clone)
continue; /* ignore it */
/*
* If it's not a join clause for this outer join, we can't use it.
* Note that if the clause is pushed-down, then it is logically from
* above the outer join, even if it references no other rels (it might
* be from WHERE, for example).
*/
if (RINFO_IS_PUSHED_DOWN(restrictinfo, joinrelids))
{
/*
* If such a clause actually references the inner rel then join
* removal has to be disallowed. We have to check this despite
* the previous attr_needed checks because of the possibility of
* pushed-down clauses referencing the rel.
*/
if (bms_is_member(innerrelid, restrictinfo->clause_relids))
return false;
continue; /* else, ignore; not useful here */
}
/* Ignore if it's not a mergejoinable clause */
if (!restrictinfo->can_join ||
restrictinfo->mergeopfamilies == NIL)
continue; /* not mergejoinable */
/*
* Check if clause has the form "outer op inner" or "inner op outer",
* and if so mark which side is inner.
*/
if (!clause_sides_match_join(restrictinfo, sjinfo->min_lefthand,
innerrel->relids))
continue; /* no good for these input relations */
/* OK, add to list */
clause_list = lappend(clause_list, restrictinfo);
}
/*
* Now that we have the relevant equality join clauses, try to prove the
* innerrel distinct.
*/
if (rel_is_distinct_for(root, innerrel, clause_list))
return true;
/*
* Some day it would be nice to check for other methods of establishing
* distinctness.
*/
return false;
}
/*
* Remove the target relid from the planner's data structures, having
* determined that there is no need to include it in the query.
*
* We are not terribly thorough here. We must make sure that the rel is
* no longer treated as a baserel, and that attributes of other baserels
* are no longer marked as being needed at joins involving this rel.
* Also, join quals involving the rel have to be removed from the joininfo
* lists, but only if they belong to the outer join identified by ojrelid
* and joinrelids.
*/
static void
remove_rel_from_query(PlannerInfo *root, int relid, int ojrelid,
Relids joinrelids)
{
RelOptInfo *rel = find_base_rel(root, relid);
List *joininfos;
Index rti;
ListCell *l;
/*
* Mark the rel as "dead" to show it is no longer part of the join tree.
* (Removing it from the baserel array altogether seems too risky.)
*/
rel->reloptkind = RELOPT_DEADREL;
/*
* Remove references to the rel from other baserels' attr_needed arrays.
*/
for (rti = 1; rti < root->simple_rel_array_size; rti++)
{
RelOptInfo *otherrel = root->simple_rel_array[rti];
int attroff;
/* there may be empty slots corresponding to non-baserel RTEs */
if (otherrel == NULL)
continue;
Assert(otherrel->relid == rti); /* sanity check on array */
/* no point in processing target rel itself */
if (otherrel == rel)
continue;
for (attroff = otherrel->max_attr - otherrel->min_attr;
attroff >= 0;
attroff--)
{
otherrel->attr_needed[attroff] =
bms_del_member(otherrel->attr_needed[attroff], relid);
otherrel->attr_needed[attroff] =
bms_del_member(otherrel->attr_needed[attroff], ojrelid);
}
}
/*
* Update all_baserels and related relid sets.
*/
root->all_baserels = bms_del_member(root->all_baserels, relid);
root->outer_join_rels = bms_del_member(root->outer_join_rels, ojrelid);
root->all_query_rels = bms_del_member(root->all_query_rels, relid);
root->all_query_rels = bms_del_member(root->all_query_rels, ojrelid);
/*
* Likewise remove references from SpecialJoinInfo data structures.
*
* This is relevant in case the outer join we're deleting is nested inside
* other outer joins: the upper joins' relid sets have to be adjusted. The
* RHS of the target outer join will be made empty here, but that's OK
* since caller will delete that SpecialJoinInfo entirely.
*/
foreach(l, root->join_info_list)
{
SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(l);
sjinfo->min_lefthand = bms_del_member(sjinfo->min_lefthand, relid);
sjinfo->min_righthand = bms_del_member(sjinfo->min_righthand, relid);
sjinfo->syn_lefthand = bms_del_member(sjinfo->syn_lefthand, relid);
sjinfo->syn_righthand = bms_del_member(sjinfo->syn_righthand, relid);
sjinfo->min_lefthand = bms_del_member(sjinfo->min_lefthand, ojrelid);
sjinfo->min_righthand = bms_del_member(sjinfo->min_righthand, ojrelid);
sjinfo->syn_lefthand = bms_del_member(sjinfo->syn_lefthand, ojrelid);
sjinfo->syn_righthand = bms_del_member(sjinfo->syn_righthand, ojrelid);
/* relid cannot appear in these fields, but ojrelid can: */
sjinfo->commute_above_l = bms_del_member(sjinfo->commute_above_l, ojrelid);
sjinfo->commute_above_r = bms_del_member(sjinfo->commute_above_r, ojrelid);
sjinfo->commute_below = bms_del_member(sjinfo->commute_below, ojrelid);
}
/*
* Likewise remove references from PlaceHolderVar data structures,
* removing any no-longer-needed placeholders entirely.
*
* Removal is a bit trickier than it might seem: we can remove PHVs that
* are used at the target rel and/or in the join qual, but not those that
* are used at join partner rels or above the join. It's not that easy to
* distinguish PHVs used at partner rels from those used in the join qual,
* since they will both have ph_needed sets that are subsets of
* joinrelids. However, a PHV used at a partner rel could not have the
* target rel in ph_eval_at, so we check that while deciding whether to
* remove or just update the PHV. There is no corresponding test in
* join_is_removable because it doesn't need to distinguish those cases.
*/
foreach(l, root->placeholder_list)
{
PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(l);
Assert(!bms_is_member(relid, phinfo->ph_lateral));
if (bms_is_subset(phinfo->ph_needed, joinrelids) &&
bms_is_member(relid, phinfo->ph_eval_at))
{
root->placeholder_list = foreach_delete_current(root->placeholder_list,
l);
root->placeholder_array[phinfo->phid] = NULL;
}
else
{
phinfo->ph_eval_at = bms_del_member(phinfo->ph_eval_at, relid);
phinfo->ph_eval_at = bms_del_member(phinfo->ph_eval_at, ojrelid);
Assert(!bms_is_empty(phinfo->ph_eval_at));
phinfo->ph_needed = bms_del_member(phinfo->ph_needed, relid);
phinfo->ph_needed = bms_del_member(phinfo->ph_needed, ojrelid);
}
}
/*
* Remove any joinquals referencing the rel from the joininfo lists.
*
* In some cases, a joinqual has to be put back after deleting its
* reference to the target rel. This can occur for pseudoconstant and
* outerjoin-delayed quals, which can get marked as requiring the rel in
* order to force them to be evaluated at or above the join. We can't
* just discard them, though. Only quals that logically belonged to the
* outer join being discarded should be removed from the query.
*
* We must make a copy of the rel's old joininfo list before starting the
* loop, because otherwise remove_join_clause_from_rels would destroy the
* list while we're scanning it.
*/
joininfos = list_copy(rel->joininfo);
foreach(l, joininfos)
{
RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
remove_join_clause_from_rels(root, rinfo, rinfo->required_relids);
/*
* If the qual lists ojrelid in its required_relids, it must have come
* from above the outer join we're removing (so we need to keep it);
* if it does not, then it didn't and we can discard it.
*/
if (bms_is_member(ojrelid, rinfo->required_relids))
{
/* Recheck that qual doesn't actually reference the target rel */
Assert(!bms_is_member(relid, rinfo->clause_relids));
/*
* The required_relids probably aren't shared with anything else,
* but let's copy them just to be sure.
*/
rinfo->required_relids = bms_copy(rinfo->required_relids);
rinfo->required_relids = bms_del_member(rinfo->required_relids,
relid);
rinfo->required_relids = bms_del_member(rinfo->required_relids,
ojrelid);
distribute_restrictinfo_to_rels(root, rinfo);
}
}
/*
* There may be references to the rel in root->fkey_list, but if so,
* match_foreign_keys_to_quals() will get rid of them.
*/
}
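/*
 * Illustrative example of the sort of query this cleanup serves (table and
 * column names are hypothetical): in
 *
 *		SELECT a.* FROM a LEFT JOIN b ON (a.id = b.id);
 *
 * if b.id is covered by a suitable unique index and no b columns are needed
 * above the join, the left join cannot affect the result, so it can be
 * removed outright; the loops above then purge every reference to b's relid
 * (and to the join's own ojrelid) from the derived planner data structures.
 */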
/*
* Remove any occurrences of the target relid from a joinlist structure.
*
* It's easiest to build a whole new list structure, so we handle it that
* way. Efficiency is not a big deal here.
*
* *nremoved is incremented by the number of occurrences removed (there
* should be exactly one, but the caller checks that).
*/
static List *
remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved)
{
List *result = NIL;
ListCell *jl;
foreach(jl, joinlist)
{
Node *jlnode = (Node *) lfirst(jl);
if (IsA(jlnode, RangeTblRef))
{
int varno = ((RangeTblRef *) jlnode)->rtindex;
if (varno == relid)
(*nremoved)++;
else
result = lappend(result, jlnode);
}
else if (IsA(jlnode, List))
{
/* Recurse to handle subproblem */
List *sublist;
sublist = remove_rel_from_joinlist((List *) jlnode,
relid, nremoved);
/* Avoid including empty sub-lists in the result */
if (sublist)
result = lappend(result, sublist);
}
else
{
elog(ERROR, "unrecognized joinlist node type: %d",
(int) nodeTag(jlnode));
}
}
return result;
}
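/*
 * Usage sketch (hypothetical caller, following the contract noted above):
 * the caller zeroes the counter and then verifies that exactly one
 * occurrence was removed, for instance
 *
 *		int		nremoved = 0;
 *
 *		joinlist = remove_rel_from_joinlist(joinlist, relid, &nremoved);
 *		if (nremoved != 1)
 *			elog(ERROR, "failed to find relation %d in joinlist", relid);
 */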
/*
* reduce_unique_semijoins
* Check for semijoins that can be simplified to plain inner joins
* because the inner relation is provably unique for the join clauses.
*
* Ideally this would happen during reduce_outer_joins, but we don't have
* enough information at that point.
*
* To perform the strength reduction when applicable, we need only delete
* the semijoin's SpecialJoinInfo from root->join_info_list. (We don't
* bother fixing the join type attributed to it in the query jointree,
* since that won't be consulted again.)
*/
void
reduce_unique_semijoins(PlannerInfo *root)
{
ListCell *lc;
/*
* Scan the join_info_list to find semijoins.
*/
foreach(lc, root->join_info_list)
{
SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc);
int innerrelid;
RelOptInfo *innerrel;
Relids joinrelids;
List *restrictlist;
/*
* Must be a non-delaying semijoin to a single baserel, else we aren't
* going to be able to do anything with it. (It's probably not
* possible for delay_upper_joins to be set on a semijoin, but we
* might as well check.)
*/
if (sjinfo->jointype != JOIN_SEMI ||
sjinfo->delay_upper_joins)
continue;
if (!bms_get_singleton_member(sjinfo->min_righthand, &innerrelid))
continue;
innerrel = find_base_rel(root, innerrelid);
/*
* Before we trouble to run generate_join_implied_equalities, make a
* quick check to eliminate cases in which we will surely be unable to
* prove uniqueness of the innerrel.
*/
if (!rel_supports_distinctness(root, innerrel))
continue;
/* Compute the relid set for the join we are considering */
joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand);
Assert(sjinfo->ojrelid == 0); /* SEMI joins don't have RT indexes */
/*
* Since we're only considering a single-rel RHS, any join clauses it
* has must be clauses linking it to the semijoin's min_lefthand. We
* can also consider EC-derived join clauses.
*/
restrictlist =
list_concat(generate_join_implied_equalities(root,
joinrelids,
sjinfo->min_lefthand,
innerrel),
innerrel->joininfo);
/* Test whether the innerrel is unique for those clauses. */
if (!innerrel_is_unique(root,
joinrelids, sjinfo->min_lefthand, innerrel,
JOIN_SEMI, restrictlist, true))
continue;
/* OK, remove the SpecialJoinInfo from the list. */
root->join_info_list = foreach_delete_current(root->join_info_list, lc);
}
}
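/*
 * Illustrative example (hypothetical tables): in
 *
 *		SELECT * FROM a WHERE a.x IN (SELECT b.x FROM b);
 *
 * the IN clause is planned as a semijoin whose RHS is the single baserel b.
 * If b.x can be proven unique (say, via a unique index on b.x), each a row
 * can match at most one b row, so the semijoin behaves identically to a
 * plain inner join and its SpecialJoinInfo can simply be dropped, as above.
 */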
/*
* rel_supports_distinctness
* Could the relation possibly be proven distinct on some set of columns?
*
* This is effectively a pre-checking function for rel_is_distinct_for().
* It must return true if rel_is_distinct_for() could possibly return true
* with this rel, but it should not expend a lot of cycles. The idea is
* that callers can avoid doing possibly-expensive processing to compute
* rel_is_distinct_for()'s argument lists if the call could not possibly
* succeed.
*/
static bool
rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel)
{
/* We only know about baserels ... */
if (rel->reloptkind != RELOPT_BASEREL)
return false;
if (rel->rtekind == RTE_RELATION)
{
/*
* For a plain relation, we only know how to prove uniqueness by
* reference to unique indexes. Make sure there's at least one
* suitable unique index. It must be immediately enforced, and if
* it's a partial index, it must match the query. (Keep these
* conditions in sync with relation_has_unique_index_for!)
*/
ListCell *lc;
foreach(lc, rel->indexlist)
{
IndexOptInfo *ind = (IndexOptInfo *) lfirst(lc);
if (ind->unique && ind->immediate &&
(ind->indpred == NIL || ind->predOK))
return true;
}
}
else if (rel->rtekind == RTE_SUBQUERY)
{
Query *subquery = root->simple_rte_array[rel->relid]->subquery;
/* Check if the subquery has any qualities that support distinctness */
if (query_supports_distinctness(subquery))
return true;
}
/* We have no proof rules for any other rtekinds. */
return false;
}
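/*
 * Sketch of the intended calling pattern (hypothetical caller): use this as
 * a cheap filter before doing the potentially expensive work of assembling
 * rel_is_distinct_for's clause list, along the lines of
 *
 *		if (!rel_supports_distinctness(root, rel))
 *			return false;
 *		clause_list = ...collect suitable mergejoinable equality quals...;
 *		return rel_is_distinct_for(root, rel, clause_list);
 */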
/*
* rel_is_distinct_for
* Does the relation return only distinct rows according to clause_list?
*
* clause_list is a list of join restriction clauses involving this rel and
* some other one. Return true if no two rows emitted by this rel could
* possibly join to the same row of the other rel.
*
* The caller must have already determined that each condition is a
* mergejoinable equality with an expression in this relation on one side, and
* an expression not involving this relation on the other. The transient
* outer_is_left flag is used to identify which side references this relation:
* left side if outer_is_left is false, right side if it is true.
*
* Note that the passed-in clause_list may be destructively modified! This
* is OK for current uses, because the clause_list is built by the caller for
* the sole purpose of passing to this function.
*/
static bool
rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list)
{
/*
* We could skip a couple of tests here if we assume all callers checked
* rel_supports_distinctness first, but it doesn't seem worth taking any
* risk for.
*/
if (rel->reloptkind != RELOPT_BASEREL)
return false;
if (rel->rtekind == RTE_RELATION)
{
/*
* Examine the indexes to see if we have a matching unique index.
* relation_has_unique_index_for automatically adds any usable
* restriction clauses for the rel, so we needn't do that here.
*/
if (relation_has_unique_index_for(root, rel, clause_list, NIL, NIL))
return true;
}
else if (rel->rtekind == RTE_SUBQUERY)
{
Index relid = rel->relid;
Query *subquery = root->simple_rte_array[relid]->subquery;
List *colnos = NIL;
List *opids = NIL;
ListCell *l;
/*
* Build the argument lists for query_is_distinct_for: a list of
* output column numbers that the query needs to be distinct over, and
* a list of equality operators that the output columns need to be
* distinct according to.
*
* (XXX we are not considering restriction clauses attached to the
* subquery; is that worth doing?)
*/
foreach(l, clause_list)
{
RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
Oid op;
Var *var;
/*
* Get the equality operator we need uniqueness according to.
* (This might be a cross-type operator and thus not exactly the
* same operator the subquery would consider; that's all right
* since query_is_distinct_for can resolve such cases.) The
* caller's mergejoinability test should have selected only
* OpExprs.
*/
op = castNode(OpExpr, rinfo->clause)->opno;
/* caller identified the inner side for us */
if (rinfo->outer_is_left)
var = (Var *) get_rightop(rinfo->clause);
else
var = (Var *) get_leftop(rinfo->clause);
/*
* We may ignore any RelabelType node above the operand. (There
* won't be more than one, since eval_const_expressions() has been
* applied already.)
*/
if (var && IsA(var, RelabelType))
var = (Var *) ((RelabelType *) var)->arg;
/*
* If inner side isn't a Var referencing a subquery output column,
* this clause doesn't help us.
*/
if (!var || !IsA(var, Var) ||
var->varno != relid || var->varlevelsup != 0)
continue;
colnos = lappend_int(colnos, var->varattno);
opids = lappend_oid(opids, op);
}
if (query_is_distinct_for(subquery, colnos, opids))
return true;
}
return false;
}
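/*
 * Worked example for the subquery branch above (names are hypothetical):
 * given a join clause a.x = sub.y, where "sub" is this RTE_SUBQUERY rel, the
 * loop records sub.y's output column number (var->varattno) in colnos and
 * the clause's equality operator OID in opids; query_is_distinct_for() then
 * decides whether the subquery could ever emit two rows with equal values of
 * y under that operator's notion of equality.
 */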
/*
* query_supports_distinctness - could the query possibly be proven distinct
* on some set of output columns?
*
* This is effectively a pre-checking function for query_is_distinct_for().
* It must return true if query_is_distinct_for() could possibly return true
* with this query, but it should not expend a lot of cycles. The idea is
* that callers can avoid doing possibly-expensive processing to compute
* query_is_distinct_for()'s argument lists if the call could not possibly
* succeed.
*/
bool
query_supports_distinctness(Query *query)
{
/* SRFs break distinctness except with DISTINCT, see below */
if (query->hasTargetSRFs && query->distinctClause == NIL)
return false;
/* check for features we can prove distinctness with */
if (query->distinctClause != NIL ||
query->groupClause != NIL ||
query->groupingSets != NIL ||
query->hasAggs ||
query->havingQual ||
query->setOperations)
return true;
return false;
}
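/*
 * Illustrative examples (hypothetical subqueries): queries such as
 *
 *		SELECT DISTINCT x FROM t
 *		SELECT x, sum(y) FROM t GROUP BY x
 *		SELECT x FROM t1 UNION SELECT x FROM t2
 *
 * pass this pre-check, whereas a bare "SELECT unnest(arr) FROM t" does not,
 * since its set-returning function could emit duplicate rows and there is no
 * DISTINCT to remove them afterwards.
 */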
/*
* query_is_distinct_for - does query never return duplicates of the
* specified columns?
*
* query is a not-yet-planned subquery (in current usage, it's always from
* a subquery RTE, which the planner avoids scribbling on).
*
* colnos is an integer list of output column numbers (resno's). We are
* interested in whether rows consisting of just these columns are certain
* to be distinct. "Distinctness" is defined according to whether the
* corresponding upper-level equality operators listed in opids would think
* the values are distinct. (Note: the opids entries could be cross-type
* operators, and thus not exactly the equality operators that the subquery
* would use itself. We use equality_ops_are_compatible() to check
* compatibility. That looks at btree or hash opfamily membership, and so
* should give trustworthy answers for all operators that we might need
* to deal with here.)
*/
bool
query_is_distinct_for(Query *query, List *colnos, List *opids)
{
ListCell *l;
Oid opid;
Assert(list_length(colnos) == list_length(opids));
/*
* DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
* columns in the DISTINCT clause appear in colnos and operator semantics
* match. This is true even if there are SRFs in the DISTINCT columns or
* elsewhere in the tlist.
*/
if (query->distinctClause)
{
foreach(l, query->distinctClause)
{
SortGroupClause *sgc = (SortGroupClause *) lfirst(l);
TargetEntry *tle = get_sortgroupclause_tle(sgc,
query->targetList);
opid = distinct_col_search(tle->resno, colnos, opids);
if (!OidIsValid(opid) ||
!equality_ops_are_compatible(opid, sgc->eqop))
break; /* exit early if no match */
}
if (l == NULL) /* had matches for all? */
return true;
}
/*
* Otherwise, a set-returning function in the query's targetlist can
* result in returning duplicate rows, despite any grouping that might
* occur before tlist evaluation. (If all tlist SRFs are within GROUP BY
* columns, it would be safe because they'd be expanded before grouping.
* But it doesn't currently seem worth the effort to check for that.)
*/
if (query->hasTargetSRFs)
return false;
/*
* Similarly, GROUP BY without GROUPING SETS guarantees uniqueness if all
* the grouped columns appear in colnos and operator semantics match.
*/
if (query->groupClause && !query->groupingSets)
{
foreach(l, query->groupClause)
{
SortGroupClause *sgc = (SortGroupClause *) lfirst(l);
TargetEntry *tle = get_sortgroupclause_tle(sgc,
query->targetList);
opid = distinct_col_search(tle->resno, colnos, opids);
if (!OidIsValid(opid) ||
!equality_ops_are_compatible(opid, sgc->eqop))
break; /* exit early if no match */
}
if (l == NULL) /* had matches for all? */
return true;
}
else if (query->groupingSets)
{
/*
* If we have grouping sets with expressions, we probably don't have
* uniqueness and analysis would be hard. Punt.
*/
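/*
 * Illustration: with GROUP BY GROUPING SETS ((x), (y)), the rows produced
 * for the (y) set all have x nulled, so the output is generally not
 * distinct on x.  Proving anything stronger would require analyzing the
 * individual sets, which we don't attempt.
 */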
if (query->groupClause)
return false;
/*
* If we have no groupClause (therefore no grouping expressions), we
* might have one or many empty grouping sets. If there's just one,
* then we're returning only one row and are certainly unique. But
* otherwise, we know we're certainly not unique.
*/
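/*
 * For instance, GROUP BY GROUPING SETS ((), ()) emits two all-aggregated
 * rows, so the result is not unique; a single empty set behaves like
 * plain aggregation and yields exactly one row.
 */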
if (list_length(query->groupingSets) == 1 &&
((GroupingSet *) linitial(query->groupingSets))->kind == GROUPING_SET_EMPTY)
return true;
else
return false;
}
else
{
/*
* If we have no GROUP BY, but do have aggregates or HAVING, then the
* result is at most one row so it's surely unique, for any operators.
*/
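/*
 * For example, SELECT count(*) FROM tab with no GROUP BY always produces
 * exactly one row.
 */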
if (query->hasAggs || query->havingQual)
return true;
}
/*
* UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row,
* except with ALL.
*/
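/*
 * E.g., SELECT a, b FROM t1 UNION SELECT a, b FROM t2 de-duplicates the
 * whole output row, so it is distinct for any column set that covers all
 * the non-junk output columns; UNION ALL provides no such guarantee.
 */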
if (query->setOperations)
{
SetOperationStmt *topop = castNode(SetOperationStmt, query->setOperations);
Assert(topop->op != SETOP_NONE);
if (!topop->all)
{
ListCell *lg;
/* We're good if all the nonjunk output columns are in colnos */
lg = list_head(topop->groupClauses);
foreach(l, query->targetList)
{
TargetEntry *tle = (TargetEntry *) lfirst(l);
SortGroupClause *sgc;
if (tle->resjunk)
continue; /* ignore resjunk columns */
/* non-resjunk columns should have grouping clauses */
Assert(lg != NULL);
sgc = (SortGroupClause *) lfirst(lg);
lg = lnext(topop->groupClauses, lg);
opid = distinct_col_search(tle->resno, colnos, opids);
if (!OidIsValid(opid) ||
!equality_ops_are_compatible(opid, sgc->eqop))
break; /* exit early if no match */
}
if (l == NULL) /* had matches for all? */
return true;
}
}
/*
* XXX Are there any other cases in which we can easily see the result
* must be distinct?
*
* If you do add more smarts to this function, be sure to update
* query_supports_distinctness() to match.
*/
return false;
}
/*
* distinct_col_search - subroutine for query_is_distinct_for
*
* If colno is in colnos, return the corresponding element of opids,
* else return InvalidOid. (Ordinarily colnos would not contain duplicates,
* but if it does, we arbitrarily select the first match.)
*/
static Oid
distinct_col_search(int colno, List *colnos, List *opids)
{
ListCell *lc1,
*lc2;
forboth(lc1, colnos, lc2, opids)
{
if (colno == lfirst_int(lc1))
return lfirst_oid(lc2);
}
return InvalidOid;
}
/*
* innerrel_is_unique
* Check if the innerrel provably contains at most one tuple matching any
* tuple from the outerrel, based on join clauses in the 'restrictlist'.
*
* We need an actual RelOptInfo for the innerrel, but it's sufficient to
* identify the outerrel by its Relids. This asymmetry supports use of this
* function before joinrels have been built. (The caller is expected to
* also supply the joinrelids, just to save recalculating that.)
*
* The proof must be made based only on clauses that will be "joinquals"
* rather than "otherquals" at execution. For an inner join there's no
* difference; but if the join is outer, we must ignore pushed-down quals,
* as those will become "otherquals". Note that this means the answer might
* vary depending on whether IS_OUTER_JOIN(jointype); since we cache the
* answer without regard to that, callers must take care not to call this
* with jointypes that would be classified differently by IS_OUTER_JOIN().
*
* The actual proof is undertaken by is_innerrel_unique_for(); this function
* is a frontend that is mainly concerned with caching the answers.
* In particular, the force_cache argument allows overriding the internal
* heuristic about whether to cache negative answers; it should be "true"
* if making an inquiry that is not part of the normal bottom-up join search
* sequence.
*/
bool
innerrel_is_unique(PlannerInfo *root,
Relids joinrelids,
Relids outerrelids,
RelOptInfo *innerrel,
JoinType jointype,
List *restrictlist,
bool force_cache)
{
MemoryContext old_context;
ListCell *lc;
/* Certainly can't prove uniqueness when there are no joinclauses */
if (restrictlist == NIL)
return false;
/*
* Make a quick check to eliminate cases in which we will surely be unable
* to prove uniqueness of the innerrel.
*/
if (!rel_supports_distinctness(root, innerrel))
return false;
/*
* Query the cache to see if we've managed to prove that innerrel is
* unique for any subset of this outerrel. We don't need an exact match,
* as extra outerrels can't make the innerrel any less unique (or more
* formally, the restrictlist for a join to a superset outerrel must be a
* superset of the conditions we successfully used before).
*/
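/*
 * For example, if a previous join step proved the inner rel unique when
 * joined only to rel A, that cached entry also satisfies a probe for the
 * outerrel {A, B}.
 */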
foreach(lc, innerrel->unique_for_rels)
{
Relids unique_for_rels = (Relids) lfirst(lc);
if (bms_is_subset(unique_for_rels, outerrelids))
return true; /* Success! */
}
/*
* Conversely, we may have already determined that this outerrel, or some
* superset thereof, cannot prove this innerrel to be unique.
*/
foreach(lc, innerrel->non_unique_for_rels)
{
Relids unique_for_rels = (Relids) lfirst(lc);
if (bms_is_subset(outerrelids, unique_for_rels))
return false;
}
/* No cached information, so try to make the proof. */
if (is_innerrel_unique_for(root, joinrelids, outerrelids, innerrel,
jointype, restrictlist))
{
/*
* Cache the positive result for future probes, being sure to keep it
* in the planner_cxt even if we are working in GEQO.
*
* Note: one might consider trying to isolate the minimal subset of
* the outerrels that proved the innerrel unique. But it's not worth
* the trouble, because the planner builds up joinrels incrementally
* and so we'll see the minimally sufficient outerrels before any
* supersets of them anyway.
*/
old_context = MemoryContextSwitchTo(root->planner_cxt);
innerrel->unique_for_rels = lappend(innerrel->unique_for_rels,
bms_copy(outerrelids));
MemoryContextSwitchTo(old_context);
return true; /* Success! */
}
else
{
/*
* None of the join conditions for outerrel proved innerrel unique, so
* we can safely reject this outerrel or any subset of it in future
* checks.
*
* However, in normal planning mode, caching this knowledge is totally
* pointless; it won't be queried again, because we build up joinrels
* from smaller to larger. It is useful in GEQO mode, where the
* knowledge can be carried across successive planning attempts; and
* it's likely to be useful when using join-search plugins, too. Hence
* cache when join_search_private is non-NULL. (Yeah, that's a hack,
* but it seems reasonable.)
*
* Also, allow callers to override that heuristic and force caching;
* that's useful for reduce_unique_semijoins, which calls here before
* the normal join search starts.
*/
if (force_cache || root->join_search_private)
{
old_context = MemoryContextSwitchTo(root->planner_cxt);
innerrel->non_unique_for_rels =
lappend(innerrel->non_unique_for_rels,
bms_copy(outerrelids));
MemoryContextSwitchTo(old_context);
}
return false;
}
}
/*
* is_innerrel_unique_for
* Check if the innerrel provably contains at most one tuple matching any
* tuple from the outerrel, based on join clauses in the 'restrictlist'.
*/
static bool
is_innerrel_unique_for(PlannerInfo *root,
Relids joinrelids,
Relids outerrelids,
RelOptInfo *innerrel,
JoinType jointype,
List *restrictlist)
{
List *clause_list = NIL;
ListCell *lc;
/*
* Search for mergejoinable clauses that constrain the inner rel against
* the outer rel. If an operator is mergejoinable then it behaves like
* equality for some btree opclass, so it's what we want. The
* mergejoinability test also eliminates clauses containing volatile
* functions, which we couldn't depend on.
*/
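/*
 * For instance, a clause like outer.x = inner.y whose operator belongs to
 * a btree opfamily qualifies; an inequality such as inner.y > outer.x, or
 * a clause invoking a volatile function, does not.
 */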
foreach(lc, restrictlist)
{
RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(lc);
/*
* As noted above, if it's a pushed-down clause and we're at an outer
* join, we can't use it.
*/
if (IS_OUTER_JOIN(jointype) &&
RINFO_IS_PUSHED_DOWN(restrictinfo, joinrelids))
continue;
/* Ignore if it's not a mergejoinable clause */
if (!restrictinfo->can_join ||
restrictinfo->mergeopfamilies == NIL)
continue; /* not mergejoinable */
/*
* Check if clause has the form "outer op inner" or "inner op outer",
* and if so mark which side is inner.
*/
if (!clause_sides_match_join(restrictinfo, outerrelids,
innerrel->relids))
continue; /* no good for these input relations */
/* OK, add to list */
clause_list = lappend(clause_list, restrictinfo);
}
/* Let rel_is_distinct_for() do the hard work */
return rel_is_distinct_for(root, innerrel, clause_list);
}