mirror of https://github.com/postgres/postgres
Tag:
Branch:
Tree:
e6ccd1ce16
REL2_0B
REL6_4
REL6_5_PATCHES
REL7_0_PATCHES
REL7_1_STABLE
REL7_2_STABLE
REL7_3_STABLE
REL7_4_STABLE
REL8_0_STABLE
REL8_1_STABLE
REL8_2_STABLE
REL8_3_STABLE
REL8_4_STABLE
REL8_5_ALPHA1_BRANCH
REL8_5_ALPHA2_BRANCH
REL8_5_ALPHA3_BRANCH
REL9_0_ALPHA4_BRANCH
REL9_0_ALPHA5_BRANCH
REL9_0_STABLE
REL9_1_STABLE
REL9_2_STABLE
REL9_3_STABLE
REL9_4_STABLE
REL9_5_STABLE
REL9_6_STABLE
REL_10_STABLE
REL_11_STABLE
REL_12_STABLE
REL_13_STABLE
REL_14_STABLE
REL_15_STABLE
REL_16_STABLE
REL_17_STABLE
REL_18_STABLE
Release_1_0_3
WIN32_DEV
ecpg_big_bison
master
PG95-1_01
PG95-1_08
PG95-1_09
REL2_0
REL6_1
REL6_1_1
REL6_2
REL6_2_1
REL6_3
REL6_3_2
REL6_4_2
REL6_5
REL6_5_1
REL6_5_2
REL6_5_3
REL7_0
REL7_0_2
REL7_0_3
REL7_1
REL7_1_1
REL7_1_2
REL7_1_3
REL7_1_BETA
REL7_1_BETA2
REL7_1_BETA3
REL7_2
REL7_2_1
REL7_2_2
REL7_2_3
REL7_2_4
REL7_2_5
REL7_2_6
REL7_2_7
REL7_2_8
REL7_2_BETA1
REL7_2_BETA2
REL7_2_BETA3
REL7_2_BETA4
REL7_2_BETA5
REL7_2_RC1
REL7_2_RC2
REL7_3
REL7_3_1
REL7_3_10
REL7_3_11
REL7_3_12
REL7_3_13
REL7_3_14
REL7_3_15
REL7_3_16
REL7_3_17
REL7_3_18
REL7_3_19
REL7_3_2
REL7_3_20
REL7_3_21
REL7_3_3
REL7_3_4
REL7_3_5
REL7_3_6
REL7_3_7
REL7_3_8
REL7_3_9
REL7_4
REL7_4_1
REL7_4_10
REL7_4_11
REL7_4_12
REL7_4_13
REL7_4_14
REL7_4_15
REL7_4_16
REL7_4_17
REL7_4_18
REL7_4_19
REL7_4_2
REL7_4_20
REL7_4_21
REL7_4_22
REL7_4_23
REL7_4_24
REL7_4_25
REL7_4_26
REL7_4_27
REL7_4_28
REL7_4_29
REL7_4_3
REL7_4_30
REL7_4_4
REL7_4_5
REL7_4_6
REL7_4_7
REL7_4_8
REL7_4_9
REL7_4_BETA1
REL7_4_BETA2
REL7_4_BETA3
REL7_4_BETA4
REL7_4_BETA5
REL7_4_RC1
REL7_4_RC2
REL8_0_0
REL8_0_0BETA1
REL8_0_0BETA2
REL8_0_0BETA3
REL8_0_0BETA4
REL8_0_0BETA5
REL8_0_0RC1
REL8_0_0RC2
REL8_0_0RC3
REL8_0_0RC4
REL8_0_0RC5
REL8_0_1
REL8_0_10
REL8_0_11
REL8_0_12
REL8_0_13
REL8_0_14
REL8_0_15
REL8_0_16
REL8_0_17
REL8_0_18
REL8_0_19
REL8_0_2
REL8_0_20
REL8_0_21
REL8_0_22
REL8_0_23
REL8_0_24
REL8_0_25
REL8_0_26
REL8_0_3
REL8_0_4
REL8_0_5
REL8_0_6
REL8_0_7
REL8_0_8
REL8_0_9
REL8_1_0
REL8_1_0BETA1
REL8_1_0BETA2
REL8_1_0BETA3
REL8_1_0BETA4
REL8_1_0RC1
REL8_1_1
REL8_1_10
REL8_1_11
REL8_1_12
REL8_1_13
REL8_1_14
REL8_1_15
REL8_1_16
REL8_1_17
REL8_1_18
REL8_1_19
REL8_1_2
REL8_1_20
REL8_1_21
REL8_1_22
REL8_1_23
REL8_1_3
REL8_1_4
REL8_1_5
REL8_1_6
REL8_1_7
REL8_1_8
REL8_1_9
REL8_2_0
REL8_2_1
REL8_2_10
REL8_2_11
REL8_2_12
REL8_2_13
REL8_2_14
REL8_2_15
REL8_2_16
REL8_2_17
REL8_2_18
REL8_2_19
REL8_2_2
REL8_2_20
REL8_2_21
REL8_2_22
REL8_2_23
REL8_2_3
REL8_2_4
REL8_2_5
REL8_2_6
REL8_2_7
REL8_2_8
REL8_2_9
REL8_2_BETA1
REL8_2_BETA2
REL8_2_BETA3
REL8_2_RC1
REL8_3_0
REL8_3_1
REL8_3_10
REL8_3_11
REL8_3_12
REL8_3_13
REL8_3_14
REL8_3_15
REL8_3_16
REL8_3_17
REL8_3_18
REL8_3_19
REL8_3_2
REL8_3_20
REL8_3_21
REL8_3_22
REL8_3_23
REL8_3_3
REL8_3_4
REL8_3_5
REL8_3_6
REL8_3_7
REL8_3_8
REL8_3_9
REL8_3_BETA1
REL8_3_BETA2
REL8_3_BETA3
REL8_3_BETA4
REL8_3_RC1
REL8_3_RC2
REL8_4_0
REL8_4_1
REL8_4_10
REL8_4_11
REL8_4_12
REL8_4_13
REL8_4_14
REL8_4_15
REL8_4_16
REL8_4_17
REL8_4_18
REL8_4_19
REL8_4_2
REL8_4_20
REL8_4_21
REL8_4_22
REL8_4_3
REL8_4_4
REL8_4_5
REL8_4_6
REL8_4_7
REL8_4_8
REL8_4_9
REL8_4_BETA1
REL8_4_BETA2
REL8_4_RC1
REL8_4_RC2
REL8_5_ALPHA1
REL8_5_ALPHA2
REL8_5_ALPHA3
REL9_0_0
REL9_0_1
REL9_0_10
REL9_0_11
REL9_0_12
REL9_0_13
REL9_0_14
REL9_0_15
REL9_0_16
REL9_0_17
REL9_0_18
REL9_0_19
REL9_0_2
REL9_0_20
REL9_0_21
REL9_0_22
REL9_0_23
REL9_0_3
REL9_0_4
REL9_0_5
REL9_0_6
REL9_0_7
REL9_0_8
REL9_0_9
REL9_0_ALPHA4
REL9_0_ALPHA5
REL9_0_BETA1
REL9_0_BETA2
REL9_0_BETA3
REL9_0_BETA4
REL9_0_RC1
REL9_1_0
REL9_1_1
REL9_1_10
REL9_1_11
REL9_1_12
REL9_1_13
REL9_1_14
REL9_1_15
REL9_1_16
REL9_1_17
REL9_1_18
REL9_1_19
REL9_1_2
REL9_1_20
REL9_1_21
REL9_1_22
REL9_1_23
REL9_1_24
REL9_1_3
REL9_1_4
REL9_1_5
REL9_1_6
REL9_1_7
REL9_1_8
REL9_1_9
REL9_1_ALPHA1
REL9_1_ALPHA2
REL9_1_ALPHA3
REL9_1_ALPHA4
REL9_1_ALPHA5
REL9_1_BETA1
REL9_1_BETA2
REL9_1_BETA3
REL9_1_RC1
REL9_2_0
REL9_2_1
REL9_2_10
REL9_2_11
REL9_2_12
REL9_2_13
REL9_2_14
REL9_2_15
REL9_2_16
REL9_2_17
REL9_2_18
REL9_2_19
REL9_2_2
REL9_2_20
REL9_2_21
REL9_2_22
REL9_2_23
REL9_2_24
REL9_2_3
REL9_2_4
REL9_2_5
REL9_2_6
REL9_2_7
REL9_2_8
REL9_2_9
REL9_2_BETA1
REL9_2_BETA2
REL9_2_BETA3
REL9_2_BETA4
REL9_2_RC1
REL9_3_0
REL9_3_1
REL9_3_10
REL9_3_11
REL9_3_12
REL9_3_13
REL9_3_14
REL9_3_15
REL9_3_16
REL9_3_17
REL9_3_18
REL9_3_19
REL9_3_2
REL9_3_20
REL9_3_21
REL9_3_22
REL9_3_23
REL9_3_24
REL9_3_25
REL9_3_3
REL9_3_4
REL9_3_5
REL9_3_6
REL9_3_7
REL9_3_8
REL9_3_9
REL9_3_BETA1
REL9_3_BETA2
REL9_3_RC1
REL9_4_0
REL9_4_1
REL9_4_10
REL9_4_11
REL9_4_12
REL9_4_13
REL9_4_14
REL9_4_15
REL9_4_16
REL9_4_17
REL9_4_18
REL9_4_19
REL9_4_2
REL9_4_20
REL9_4_21
REL9_4_22
REL9_4_23
REL9_4_24
REL9_4_25
REL9_4_26
REL9_4_3
REL9_4_4
REL9_4_5
REL9_4_6
REL9_4_7
REL9_4_8
REL9_4_9
REL9_4_BETA1
REL9_4_BETA2
REL9_4_BETA3
REL9_4_RC1
REL9_5_0
REL9_5_1
REL9_5_10
REL9_5_11
REL9_5_12
REL9_5_13
REL9_5_14
REL9_5_15
REL9_5_16
REL9_5_17
REL9_5_18
REL9_5_19
REL9_5_2
REL9_5_20
REL9_5_21
REL9_5_22
REL9_5_23
REL9_5_24
REL9_5_25
REL9_5_3
REL9_5_4
REL9_5_5
REL9_5_6
REL9_5_7
REL9_5_8
REL9_5_9
REL9_5_ALPHA1
REL9_5_ALPHA2
REL9_5_BETA1
REL9_5_BETA2
REL9_5_RC1
REL9_6_0
REL9_6_1
REL9_6_10
REL9_6_11
REL9_6_12
REL9_6_13
REL9_6_14
REL9_6_15
REL9_6_16
REL9_6_17
REL9_6_18
REL9_6_19
REL9_6_2
REL9_6_20
REL9_6_21
REL9_6_22
REL9_6_23
REL9_6_24
REL9_6_3
REL9_6_4
REL9_6_5
REL9_6_6
REL9_6_7
REL9_6_8
REL9_6_9
REL9_6_BETA1
REL9_6_BETA2
REL9_6_BETA3
REL9_6_BETA4
REL9_6_RC1
REL_10_0
REL_10_1
REL_10_10
REL_10_11
REL_10_12
REL_10_13
REL_10_14
REL_10_15
REL_10_16
REL_10_17
REL_10_18
REL_10_19
REL_10_2
REL_10_20
REL_10_21
REL_10_22
REL_10_23
REL_10_3
REL_10_4
REL_10_5
REL_10_6
REL_10_7
REL_10_8
REL_10_9
REL_10_BETA1
REL_10_BETA2
REL_10_BETA3
REL_10_BETA4
REL_10_RC1
REL_11_0
REL_11_1
REL_11_10
REL_11_11
REL_11_12
REL_11_13
REL_11_14
REL_11_15
REL_11_16
REL_11_17
REL_11_18
REL_11_19
REL_11_2
REL_11_20
REL_11_21
REL_11_22
REL_11_3
REL_11_4
REL_11_5
REL_11_6
REL_11_7
REL_11_8
REL_11_9
REL_11_BETA1
REL_11_BETA2
REL_11_BETA3
REL_11_BETA4
REL_11_RC1
REL_12_0
REL_12_1
REL_12_10
REL_12_11
REL_12_12
REL_12_13
REL_12_14
REL_12_15
REL_12_16
REL_12_17
REL_12_18
REL_12_19
REL_12_2
REL_12_20
REL_12_21
REL_12_22
REL_12_3
REL_12_4
REL_12_5
REL_12_6
REL_12_7
REL_12_8
REL_12_9
REL_12_BETA1
REL_12_BETA2
REL_12_BETA3
REL_12_BETA4
REL_12_RC1
REL_13_0
REL_13_1
REL_13_10
REL_13_11
REL_13_12
REL_13_13
REL_13_14
REL_13_15
REL_13_16
REL_13_17
REL_13_18
REL_13_19
REL_13_2
REL_13_20
REL_13_21
REL_13_22
REL_13_23
REL_13_3
REL_13_4
REL_13_5
REL_13_6
REL_13_7
REL_13_8
REL_13_9
REL_13_BETA1
REL_13_BETA2
REL_13_BETA3
REL_13_RC1
REL_14_0
REL_14_1
REL_14_10
REL_14_11
REL_14_12
REL_14_13
REL_14_14
REL_14_15
REL_14_16
REL_14_17
REL_14_18
REL_14_19
REL_14_2
REL_14_20
REL_14_3
REL_14_4
REL_14_5
REL_14_6
REL_14_7
REL_14_8
REL_14_9
REL_14_BETA1
REL_14_BETA2
REL_14_BETA3
REL_14_RC1
REL_15_0
REL_15_1
REL_15_10
REL_15_11
REL_15_12
REL_15_13
REL_15_14
REL_15_15
REL_15_2
REL_15_3
REL_15_4
REL_15_5
REL_15_6
REL_15_7
REL_15_8
REL_15_9
REL_15_BETA1
REL_15_BETA2
REL_15_BETA3
REL_15_BETA4
REL_15_RC1
REL_15_RC2
REL_16_0
REL_16_1
REL_16_10
REL_16_11
REL_16_2
REL_16_3
REL_16_4
REL_16_5
REL_16_6
REL_16_7
REL_16_8
REL_16_9
REL_16_BETA1
REL_16_BETA2
REL_16_BETA3
REL_16_RC1
REL_17_0
REL_17_1
REL_17_2
REL_17_3
REL_17_4
REL_17_5
REL_17_6
REL_17_7
REL_17_BETA1
REL_17_BETA2
REL_17_BETA3
REL_17_RC1
REL_18_0
REL_18_1
REL_18_BETA1
REL_18_BETA2
REL_18_BETA3
REL_18_RC1
Release_1_0_2
Release_2_0
Release_2_0_0
release-6-3
${ noResults }
657 Commits (e6ccd1ce1644d1b40b7981f8bc172394de524f99)
| Author | SHA1 | Message | Date |
|---|---|---|---|
|
|
a363bc6da9 |
Fix EXPLAIN ANALYZE for async-capable nodes.
EXPLAIN ANALYZE for an async-capable ForeignScan node associated with
postgres_fdw is done just by using instrumentation for ExecProcNode()
called from the node's callbacks, causing the following problems:
1) If the remote table to scan is empty, the node is incorrectly
considered as "never executed" by the command even if the node is
executed, as ExecProcNode() isn't called from the node's callbacks at
all in that case.
2) The command fails to collect timings for things other than
ExecProcNode() done in the node, such as creating a cursor for the
node's remote query.
To fix these problems, add instrumentation for async-capable nodes, and
modify postgres_fdw accordingly.
My oversight in commit
|
5 years ago |
|
|
d780d7c088 |
Change data type of counters in BufferUsage and WalUsage from long to int64.
Previously long was used as the data type for some counters in BufferUsage and WalUsage. But long is only four byte, e.g., on Windows, and it's entirely possible to wrap a four byte counter. For example, emitting more than four billion WAL records in one transaction isn't actually particularly rare. To avoid the overflows of those counters, this commit changes the data type of them from long to int64. Suggested-by: Andres Freund Author: Masahiro Ikeda Reviewed-by: Fujii Masao Discussion: https://postgr.es/m/20201221211650.k7b53tcnadrciqjo@alap3.anarazel.de Discussion: https://postgr.es/m/af0964ac-7080-1984-dc23-513754987716@oss.nttdata.com |
5 years ago |
|
|
049e1e2edb |
Fix mishandling of resjunk columns in ON CONFLICT ... UPDATE tlists.
It's unusual to have any resjunk columns in an ON CONFLICT ... UPDATE list, but it can happen when MULTIEXPR_SUBLINK SubPlans are present. If it happens, the ON CONFLICT UPDATE code path would end up storing tuples that include the values of the extra resjunk columns. That's fairly harmless in the short run, but if new columns are added to the table then the values would become accessible, possibly leading to malfunctions if they don't match the datatypes of the new columns. This had escaped notice through a confluence of missing sanity checks, including * There's no cross-check that a tuple presented to heap_insert or heap_update matches the table rowtype. While it's difficult to check that fully at reasonable cost, we can easily add assertions that there aren't too many columns. * The output-column-assignment cases in execExprInterp.c lacked any sanity checks on the output column numbers, which seems like an oversight considering there are plenty of assertion checks on input column numbers. Add assertions there too. * We failed to apply nodeModifyTable's ExecCheckPlanOutput() to the ON CONFLICT UPDATE tlist. That wouldn't have caught this specific error, since that function is chartered to ignore resjunk columns; but it sure seems like a bad omission now that we've seen this bug. In HEAD, the right way to fix this is to make the processing of ON CONFLICT UPDATE tlists work the same as regular UPDATE tlists now do, that is don't add "SET x = x" entries, and use ExecBuildUpdateProjection to evaluate the tlist and combine it with old values of the not-set columns. This adds a little complication to ExecBuildUpdateProjection, but allows removal of a comparable amount of now-dead code from the planner. In the back branches, the most expedient solution seems to be to (a) use an output slot for the ON CONFLICT UPDATE projection that actually matches the target table, and then (b) invent a variant of ExecBuildProjectionInfo that can be told to not store values resulting from resjunk columns, so it doesn't try to store into nonexistent columns of the output slot. (We can't simply ignore the resjunk columns altogether; they have to be evaluated for MULTIEXPR_SUBLINK to work.) This works back to v10. In 9.6, projections work much differently and we can't cheaply give them such an option. The 9.6 version of this patch works by inserting a JunkFilter when it's necessary to get rid of resjunk columns. In addition, v11 and up have the reverse problem when trying to perform ON CONFLICT UPDATE on a partitioned table. Through a further oversight, adjust_partition_tlist() discarded resjunk columns when re-ordering the ON CONFLICT UPDATE tlist to match a partition. This accidentally prevented the storing-bogus-tuples problem, but at the cost that MULTIEXPR_SUBLINK cases didn't work, typically crashing if more than one row has to be updated. Fix by preserving resjunk columns in that routine. (I failed to resist the temptation to add more assertions there too, and to do some minor code beautification.) Per report from Andres Freund. Back-patch to all supported branches. Security: CVE-2021-32028 |
5 years ago |
|
|
1111b2668d |
Undo decision to allow pg_proc.prosrc to be NULL.
Commit
|
5 years ago |
|
|
c2db458c10 |
Redesign the caching done by get_cached_rowtype().
Previously, get_cached_rowtype() cached a pointer to a reference-counted
tuple descriptor from the typcache, relying on the ExprContextCallback
mechanism to release the tupdesc refcount when the expression tree
using the tupdesc was destroyed. This worked fine when it was designed,
but the introduction of within-DO-block COMMITs broke it. The refcount
is logged in a transaction-lifespan resource owner, but plpgsql won't
destroy simple expressions made within the DO block (before its first
commit) until the DO block is exited. That results in a warning about
a leaked tupdesc refcount when the COMMIT destroys the original resource
owner, and then an error about the active resource owner not holding a
matching refcount when the expression is destroyed.
To fix, get rid of the need to have a shutdown callback at all, by
instead caching a pointer to the relevant typcache entry. Those
survive for the life of the backend, so we needn't worry about the
pointer becoming stale. (For registered RECORD types, we can still
cache a pointer to the tupdesc, knowing that it won't change for the
life of the backend.) This mechanism has been in use in plpgsql
and expandedrecord.c since commit
|
5 years ago |
|
|
50e17ad281 |
Speedup ScalarArrayOpExpr evaluation
ScalarArrayOpExprs with "useOr=true" and a set of Consts on the righthand side have traditionally been evaluated by using a linear search over the array. When these arrays contain large numbers of elements then this linear search could become a significant part of execution time. Here we add a new method of evaluating ScalarArrayOpExpr expressions to allow them to be evaluated by first building a hash table containing each element, then on subsequent evaluations, we just probe that hash table to determine if there is a match. The planner is in charge of determining when this optimization is possible and it enables it by setting hashfuncid in the ScalarArrayOpExpr. The executor will only perform the hash table evaluation when the hashfuncid is set. This means that not all cases are optimized. For example CHECK constraints containing an IN clause won't go through the planner, so won't get the hashfuncid set. We could maybe do something about that at some later date. The reason we're not doing it now is from fear that we may slow down cases where the expression is evaluated only once. Those cases can be common, for example, a single row INSERT to a table with a CHECK constraint containing an IN clause. In the planner, we enable this when there are suitable hash functions for the ScalarArrayOpExpr's operator and only when there is at least MIN_ARRAY_SIZE_FOR_HASHED_SAOP elements in the array. The threshold is currently set to 9. Author: James Coleman, David Rowley Reviewed-by: David Rowley, Tomas Vondra, Heikki Linnakangas Discussion: https://postgr.es/m/CAAaqYe8x62+=wn0zvNKCj55tPpg-JBHzhZFFc6ANovdqFw7-dA@mail.gmail.com |
5 years ago |
|
|
e717a9a18b |
SQL-standard function body
This adds support for writing CREATE FUNCTION and CREATE PROCEDURE
statements for language SQL with a function body that conforms to the
SQL standard and is portable to other implementations.
Instead of the PostgreSQL-specific AS $$ string literal $$ syntax,
this allows writing out the SQL statements making up the body
unquoted, either as a single statement:
CREATE FUNCTION add(a integer, b integer) RETURNS integer
LANGUAGE SQL
RETURN a + b;
or as a block
CREATE PROCEDURE insert_data(a integer, b integer)
LANGUAGE SQL
BEGIN ATOMIC
INSERT INTO tbl VALUES (a);
INSERT INTO tbl VALUES (b);
END;
The function body is parsed at function definition time and stored as
expression nodes in a new pg_proc column prosqlbody. So at run time,
no further parsing is required.
However, this form does not support polymorphic arguments, because
there is no more parse analysis done at call time.
Dependencies between the function and the objects it uses are fully
tracked.
A new RETURN statement is introduced. This can only be used inside
function bodies. Internally, it is treated much like a SELECT
statement.
psql needs some new intelligence to keep track of function body
boundaries so that it doesn't send off statements when it sees
semicolons that are inside a function body.
Tested-by: Jaime Casanova <jcasanov@systemguards.com.ec>
Reviewed-by: Julien Rouhaud <rjuju123@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/1c11f1eb-f00c-43b7-799d-2d44132c02d7@2ndquadrant.com
|
5 years ago |
|
|
c5b7ba4e67 |
Postpone some stuff out of ExecInitModifyTable.
Arrange to do some things on-demand, rather than immediately during
executor startup, because there's a fair chance of never having to do
them at all:
* Don't open result relations' indexes until needed.
* Don't initialize partition tuple routing, nor the child-to-root
tuple conversion map, until needed.
This wins in UPDATEs on partitioned tables when only some of the
partitions will actually receive updates; with larger partition
counts the savings is quite noticeable. Also, we can remove some
sketchy heuristics in ExecInitModifyTable about whether to set up
tuple routing.
Also, remove execPartition.c's private hash table tracking which
partitions were already opened by the ModifyTable node. Instead
use the hash added to ModifyTable itself by commit
|
5 years ago |
|
|
789d81de8a |
Fix missing #include in nodeResultCache.h.
Per cpluspluscheck. |
5 years ago |
|
|
9eacee2e62 |
Add Result Cache executor node (take 2)
Here we add a new executor node type named "Result Cache". The planner can include this node type in the plan to have the executor cache the results from the inner side of parameterized nested loop joins. This allows caching of tuples for sets of parameters so that in the event that the node sees the same parameter values again, it can just return the cached tuples instead of rescanning the inner side of the join all over again. Internally, result cache uses a hash table in order to quickly find tuples that have been previously cached. For certain data sets, this can significantly improve the performance of joins. The best cases for using this new node type are for join problems where a large portion of the tuples from the inner side of the join have no join partner on the outer side of the join. In such cases, hash join would have to hash values that are never looked up, thus bloating the hash table and possibly causing it to multi-batch. Merge joins would have to skip over all of the unmatched rows. If we use a nested loop join with a result cache, then we only cache tuples that have at least one join partner on the outer side of the join. The benefits of using a parameterized nested loop with a result cache increase when there are fewer distinct values being looked up and the number of lookups of each value is large. Also, hash probes to lookup the cache can be much faster than the hash probe in a hash join as it's common that the result cache's hash table is much smaller than the hash join's due to result cache only caching useful tuples rather than all tuples from the inner side of the join. This variation in hash probe performance is more significant when the hash join's hash table no longer fits into the CPU's L3 cache, but the result cache's hash table does. The apparent "random" access of hash buckets with each hash probe can cause a poor L3 cache hit ratio for large hash tables. Smaller hash tables generally perform better. The hash table used for the cache limits itself to not exceeding work_mem * hash_mem_multiplier in size. We maintain a dlist of keys for this cache and when we're adding new tuples and realize we've exceeded the memory budget, we evict cache entries starting with the least recently used ones until we have enough memory to add the new tuples to the cache. For parameterized nested loop joins, we now consider using one of these result cache nodes in between the nested loop node and its inner node. We determine when this might be useful based on cost, which is primarily driven off of what the expected cache hit ratio will be. Estimating the cache hit ratio relies on having good distinct estimates on the nested loop's parameters. For now, the planner will only consider using a result cache for parameterized nested loop joins. This works for both normal joins and also for LATERAL type joins to subqueries. It is possible to use this new node for other uses in the future. For example, to cache results from correlated subqueries. However, that's not done here due to some difficulties obtaining a distinct estimation on the outer plan to calculate the estimated cache hit ratio. Currently we plan the inner plan before planning the outer plan so there is no good way to know if a result cache would be useful or not since we can't estimate the number of times the subplan will be called until the outer plan is generated. The functionality being added here is newly introducing a dependency on the return value of estimate_num_groups() during the join search. Previously, during the join search, we only ever needed to perform selectivity estimations. With this commit, we need to use estimate_num_groups() in order to estimate what the hit ratio on the result cache will be. In simple terms, if we expect 10 distinct values and we expect 1000 outer rows, then we'll estimate the hit ratio to be 99%. Since cache hits are very cheap compared to scanning the underlying nodes on the inner side of the nested loop join, then this will significantly reduce the planner's cost for the join. However, it's fairly easy to see here that things will go bad when estimate_num_groups() incorrectly returns a value that's significantly lower than the actual number of distinct values. If this happens then that may cause us to make use of a nested loop join with a result cache instead of some other join type, such as a merge or hash join. Our distinct estimations have been known to be a source of trouble in the past, so the extra reliance on them here could cause the planner to choose slower plans than it did previous to having this feature. Distinct estimations are also fairly hard to estimate accurately when several tables have been joined already or when a WHERE clause filters out a set of values that are correlated to the expressions we're estimating the number of distinct value for. For now, the costing we perform during query planning for result caches does put quite a bit of faith in the distinct estimations being accurate. When these are accurate then we should generally see faster execution times for plans containing a result cache. However, in the real world, we may find that we need to either change the costings to put less trust in the distinct estimations being accurate or perhaps even disable this feature by default. There's always an element of risk when we teach the query planner to do new tricks that it decides to use that new trick at the wrong time and causes a regression. Users may opt to get the old behavior by turning the feature off using the enable_resultcache GUC. Currently, this is enabled by default. It remains to be seen if we'll maintain that setting for the release. Additionally, the name "Result Cache" is the best name I could think of for this new node at the time I started writing the patch. Nobody seems to strongly dislike the name. A few people did suggest other names but no other name seemed to dominate in the brief discussion that there was about names. Let's allow the beta period to see if the current name pleases enough people. If there's some consensus on a better name, then we can change it before the release. Please see the 2nd discussion link below for the discussion on the "Result Cache" name. Author: David Rowley Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie Tested-By: Konstantin Knizhnik Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com |
5 years ago |
|
|
28b3e3905c |
Revert b6002a796
This removes "Add Result Cache executor node". It seems that something weird is going on with the tracking of cache hits and misses as highlighted by many buildfarm animals. It's not yet clear what the problem is as other parts of the plan indicate that the cache did work correctly, it's just the hits and misses that were being reported as 0. This is especially a bad time to have the buildfarm so broken, so reverting before too many more animals go red. Discussion: https://postgr.es/m/CAApHDvq_hydhfovm4=izgWs+C5HqEeRScjMbOgbpC-jRAeK3Yw@mail.gmail.com |
5 years ago |
|
|
b6002a796d |
Add Result Cache executor node
Here we add a new executor node type named "Result Cache". The planner can include this node type in the plan to have the executor cache the results from the inner side of parameterized nested loop joins. This allows caching of tuples for sets of parameters so that in the event that the node sees the same parameter values again, it can just return the cached tuples instead of rescanning the inner side of the join all over again. Internally, result cache uses a hash table in order to quickly find tuples that have been previously cached. For certain data sets, this can significantly improve the performance of joins. The best cases for using this new node type are for join problems where a large portion of the tuples from the inner side of the join have no join partner on the outer side of the join. In such cases, hash join would have to hash values that are never looked up, thus bloating the hash table and possibly causing it to multi-batch. Merge joins would have to skip over all of the unmatched rows. If we use a nested loop join with a result cache, then we only cache tuples that have at least one join partner on the outer side of the join. The benefits of using a parameterized nested loop with a result cache increase when there are fewer distinct values being looked up and the number of lookups of each value is large. Also, hash probes to lookup the cache can be much faster than the hash probe in a hash join as it's common that the result cache's hash table is much smaller than the hash join's due to result cache only caching useful tuples rather than all tuples from the inner side of the join. This variation in hash probe performance is more significant when the hash join's hash table no longer fits into the CPU's L3 cache, but the result cache's hash table does. The apparent "random" access of hash buckets with each hash probe can cause a poor L3 cache hit ratio for large hash tables. Smaller hash tables generally perform better. The hash table used for the cache limits itself to not exceeding work_mem * hash_mem_multiplier in size. We maintain a dlist of keys for this cache and when we're adding new tuples and realize we've exceeded the memory budget, we evict cache entries starting with the least recently used ones until we have enough memory to add the new tuples to the cache. For parameterized nested loop joins, we now consider using one of these result cache nodes in between the nested loop node and its inner node. We determine when this might be useful based on cost, which is primarily driven off of what the expected cache hit ratio will be. Estimating the cache hit ratio relies on having good distinct estimates on the nested loop's parameters. For now, the planner will only consider using a result cache for parameterized nested loop joins. This works for both normal joins and also for LATERAL type joins to subqueries. It is possible to use this new node for other uses in the future. For example, to cache results from correlated subqueries. However, that's not done here due to some difficulties obtaining a distinct estimation on the outer plan to calculate the estimated cache hit ratio. Currently we plan the inner plan before planning the outer plan so there is no good way to know if a result cache would be useful or not since we can't estimate the number of times the subplan will be called until the outer plan is generated. The functionality being added here is newly introducing a dependency on the return value of estimate_num_groups() during the join search. Previously, during the join search, we only ever needed to perform selectivity estimations. With this commit, we need to use estimate_num_groups() in order to estimate what the hit ratio on the result cache will be. In simple terms, if we expect 10 distinct values and we expect 1000 outer rows, then we'll estimate the hit ratio to be 99%. Since cache hits are very cheap compared to scanning the underlying nodes on the inner side of the nested loop join, then this will significantly reduce the planner's cost for the join. However, it's fairly easy to see here that things will go bad when estimate_num_groups() incorrectly returns a value that's significantly lower than the actual number of distinct values. If this happens then that may cause us to make use of a nested loop join with a result cache instead of some other join type, such as a merge or hash join. Our distinct estimations have been known to be a source of trouble in the past, so the extra reliance on them here could cause the planner to choose slower plans than it did previous to having this feature. Distinct estimations are also fairly hard to estimate accurately when several tables have been joined already or when a WHERE clause filters out a set of values that are correlated to the expressions we're estimating the number of distinct value for. For now, the costing we perform during query planning for result caches does put quite a bit of faith in the distinct estimations being accurate. When these are accurate then we should generally see faster execution times for plans containing a result cache. However, in the real world, we may find that we need to either change the costings to put less trust in the distinct estimations being accurate or perhaps even disable this feature by default. There's always an element of risk when we teach the query planner to do new tricks that it decides to use that new trick at the wrong time and causes a regression. Users may opt to get the old behavior by turning the feature off using the enable_resultcache GUC. Currently, this is enabled by default. It remains to be seen if we'll maintain that setting for the release. Additionally, the name "Result Cache" is the best name I could think of for this new node at the time I started writing the patch. Nobody seems to strongly dislike the name. A few people did suggest other names but no other name seemed to dominate in the brief discussion that there was about names. Let's allow the beta period to see if the current name pleases enough people. If there's some consensus on a better name, then we can change it before the release. Please see the 2nd discussion link below for the discussion on the "Result Cache" name. Author: David Rowley Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu Tested-By: Konstantin Knizhnik Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com |
5 years ago |
|
|
86dc90056d |
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes: 1. For UPDATE, the subplan of the ModifyTable node now only delivers the new values of the changed columns (i.e., the expressions computed in the query's SET clause) plus row identity information such as CTID. ModifyTable must re-fetch the original tuple to merge in the old values of any unchanged columns. The core advantage of this is that the changed columns are uniform across all tables of an inherited or partitioned target relation, whereas the other columns might not be. A secondary advantage, when the UPDATE involves joins, is that less data needs to pass through the plan tree. The disadvantage of course is an extra fetch of each tuple to be updated. However, that seems to be very nearly free in context; even worst-case tests don't show it to add more than a couple percent to the total query cost. At some point it might be interesting to combine the re-fetch with the tuple access that ModifyTable must do anyway to mark the old tuple dead; but that would require a good deal of refactoring and it seems it wouldn't buy all that much, so this patch doesn't attempt it. 2. For inherited UPDATE/DELETE, instead of generating a separate subplan for each target relation, we now generate a single subplan that is just exactly like a SELECT's plan, then stick ModifyTable on top of that. To let ModifyTable know which target relation a given incoming row refers to, a tableoid junk column is added to the row identity information. This gets rid of the horrid hack that was inheritance_planner(), eliminating O(N^2) planning cost and memory consumption in cases where there were many unprunable target relations. Point 2 of course requires point 1, so that there is a uniform definition of the non-junk columns to be returned by the subplan. We can't insist on uniform definition of the row identity junk columns however, if we want to keep the ability to have both plain and foreign tables in a partitioning hierarchy. Since it wouldn't scale very far to have every child table have its own row identity column, this patch includes provisions to merge similar row identity columns into one column of the subplan result. In particular, we can merge the whole-row Vars typically used as row identity by FDWs into one column by pretending they are type RECORD. (It's still okay for the actual composite Datums to be labeled with the table's rowtype OID, though.) There is more that can be done to file down residual inefficiencies in this patch, but it seems to be committable now. FDW authors should note several API changes: * The argument list for AddForeignUpdateTargets() has changed, and so has the method it must use for adding junk columns to the query. Call add_row_identity_var() instead of manipulating the parse tree directly. You might want to reconsider exactly what you're adding, too. * PlanDirectModify() must now work a little harder to find the ForeignScan plan node; if the foreign table is part of a partitioning hierarchy then the ForeignScan might not be the direct child of ModifyTable. See postgres_fdw for sample code. * To check whether a relation is a target relation, it's no longer sufficient to compare its relid to root->parse->resultRelation. Instead, check it against all_result_relids or leaf_result_relids, as appropriate. Amit Langote and Tom Lane Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com |
5 years ago |
|
|
27e1f14563 |
Add support for asynchronous execution.
This implements asynchronous execution, which runs multiple parts of a non-parallel-aware Append concurrently rather than serially to improve performance when possible. Currently, the only node type that can be run concurrently is a ForeignScan that is an immediate child of such an Append. In the case where such ForeignScans access data on different remote servers, this would run those ForeignScans concurrently, and overlap the remote operations to be performed simultaneously, so it'll improve the performance especially when the operations involve time-consuming ones such as remote join and remote aggregation. We may extend this to other node types such as joins or aggregates over ForeignScans in the future. This also adds the support for postgres_fdw, which is enabled by the table-level/server-level option "async_capable". The default is false. Robert Haas, Kyotaro Horiguchi, Thomas Munro, and myself. This commit is mostly based on the patch proposed by Robert Haas, but also uses stuff from the patch proposed by Kyotaro Horiguchi and from the patch proposed by Thomas Munro. Reviewed by Kyotaro Horiguchi, Konstantin Knizhnik, Andrey Lepikhov, Movead Li, Thomas Munro, Justin Pryzby, and others. Discussion: https://postgr.es/m/CA%2BTgmoaXQEt4tZ03FtQhnzeDEMzBck%2BLrni0UWHVVgOTnA6C1w%40mail.gmail.com Discussion: https://postgr.es/m/CA%2BhUKGLBRyu0rHrDCMC4%3DRn3252gogyp1SjOgG8SEKKZv%3DFwfQ%40mail.gmail.com Discussion: https://postgr.es/m/20200228.170650.667613673625155850.horikyota.ntt%40gmail.com |
5 years ago |
|
|
7f7f25f15e |
Revert "Fix race in Parallel Hash Join batch cleanup."
This reverts commit |
5 years ago |
|
|
378802e371 |
Update the names of Parallel Hash Join phases.
Commit |
5 years ago |
|
|
3b8981b6e1 |
Fix race in Parallel Hash Join batch cleanup.
With very unlucky timing and parallel_leader_participation off, PHJ could attempt to access per-batch state just as it was being freed. There was code intended to prevent that by checking for a cleared pointer, but it was buggy. Fix, by introducing an extra barrier phase. The new phase PHJ_BUILD_RUNNING means that it's safe to access the per-batch state to find a batch to help with, and PHJ_BUILD_DONE means that it is too late. The last to detach will free the array of per-batch state as before, but now it will also atomically advance the phase at the same time, so that late attachers can avoid the hazard, without the data race. This mirrors the way per-batch hash tables are freed (see phases PHJ_BATCH_PROBING and PHJ_BATCH_DONE). Revealed by a one-off build farm failure, where BarrierAttach() failed a sanity check assertion, because the memory had been clobbered by dsa_free(). Back-patch to 11, where the code arrived. Reported-by: Michael Paquier <michael@paquier.xyz> Discussion: https://postgr.es/m/20200929061142.GA29096%40paquier.xyz |
5 years ago |
|
|
bb437f995d |
Add TID Range Scans to support efficient scanning ranges of TIDs
This adds a new executor node named TID Range Scan. The query planner will generate paths for TID Range scans when quals are discovered on base relations which search for ranges on the table's ctid column. These ranges may be open at either end. For example, WHERE ctid >= '(10,0)'; will return all tuples on page 10 and over. To support this, two new optional callback functions have been added to table AM. scan_set_tidrange is used to set the scan range to just the given range of TIDs. scan_getnextslot_tidrange fetches the next tuple in the given range. For AMs were scanning ranges of TIDs would not make sense, these functions can be set to NULL in the TableAmRoutine. The query planner won't generate TID Range Scan Paths in that case. Author: Edmund Horner, David Rowley Reviewed-by: David Rowley, Tomas Vondra, Tom Lane, Andres Freund, Zhihong Yu Discussion: https://postgr.es/m/CAMyN-kB-nFTkF=VA_JPwFNo08S0d-Yk0F741S2B7LDmYAi8eyA@mail.gmail.com |
5 years ago |
|
|
6214e2b228 |
Fix permission checks on constraint violation errors on partitions.
If a cross-partition UPDATE violates a constraint on the target partition,
and the columns in the new partition are in different physical order than
in the parent, the error message can reveal columns that the user does not
have SELECT permission on. A similar bug was fixed earlier in commit
|
5 years ago |
|
|
d5a83d79c9 |
Rethink recently-added SPI interfaces.
SPI_execute_with_receiver and SPI_cursor_parse_open_with_paramlist are
new in v14 (cf. commit
|
5 years ago |
|
|
ee895a655c |
Improve performance of repeated CALLs within plpgsql procedures.
This patch essentially is cleaning up technical debt left behind
by the original implementation of plpgsql procedures, particularly
commit
|
5 years ago |
|
|
9dc718bdf2 |
Pass down "logically unchanged index" hint.
Add an executor aminsert() hint mechanism that informs index AMs that the incoming index tuple (the tuple that accompanies the hint) is not being inserted by execution of an SQL statement that logically modifies any of the index's key columns. The hint is received by indexes when an UPDATE takes place that does not apply an optimization like heapam's HOT (though only for indexes where all key columns are logically unchanged). Any index tuple that receives the hint on insert is expected to be a duplicate of at least one existing older version that is needed for the same logical row. Related versions will typically be stored on the same index page, at least within index AMs that apply the hint. Recognizing the difference between MVCC version churn duplicates and true logical row duplicates at the index AM level can help with cleanup of garbage index tuples. Cleanup can intelligently target tuples that are likely to be garbage, without wasting too many cycles on less promising tuples/pages (index pages with little or no version churn). This is infrastructure for an upcoming commit that will teach nbtree to perform bottom-up index deletion. No index AM actually applies the hint just yet. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wz=CEKFa74EScx_hFVshCOn6AA5T-ajFASTdzipdkLTNQQ@mail.gmail.com |
5 years ago |
|
|
844fe9f159 |
Add the ability for the core grammar to have more than one parse target.
This patch essentially allows gram.y to implement a family of related syntax trees, rather than necessarily always parsing a list of SQL statements. raw_parser() gains a new argument, enum RawParseMode, to say what to do. As proof of concept, add a mode that just parses a TypeName without any other decoration, and use that to greatly simplify typeStringToTypeName(). In addition, invent a new SPI entry point SPI_prepare_extended() to allow SPI users (particularly plpgsql) to get at this new functionality. In hopes of making this the last variant of SPI_prepare(), set up its additional arguments as a struct rather than direct arguments, and promise that future additions to the struct can default to zero. SPI_prepare_cursor() and SPI_prepare_params() can perhaps go away at some point. Discussion: https://postgr.es/m/4165684.1607707277@sss.pgh.pa.us |
5 years ago |
|
|
ca3b37487b |
Update copyright for 2021
Backpatch-through: 9.5 |
5 years ago |
|
|
c7aba7c14e |
Support subscripting of arbitrary types, not only arrays.
This patch generalizes the subscripting infrastructure so that any data type can be subscripted, if it provides a handler function to define what that means. Traditional variable-length (varlena) arrays all use array_subscript_handler(), while the existing fixed-length types that support subscripting use raw_array_subscript_handler(). It's expected that other types that want to use subscripting notation will define their own handlers. (This patch provides no such new features, though; it only lays the foundation for them.) To do this, move the parser's semantic processing of subscripts (including coercion to whatever data type is required) into a method callback supplied by the handler. On the execution side, replace the ExecEvalSubscriptingRef* layer of functions with direct calls to callback-supplied execution routines. (Thus, essentially no new run-time overhead should be caused by this patch. Indeed, there is room to remove some overhead by supplying specialized execution routines. This patch does a little bit in that line, but more could be done.) Additional work is required here and there to remove formerly hard-wired assumptions about the result type, collation, etc of a SubscriptingRef expression node; and to remove assumptions that the subscript values must be integers. One useful side-effect of this is that we now have a less squishy mechanism for identifying whether a data type is a "true" array: instead of wiring in weird rules about typlen, we can look to see if pg_type.typsubscript == F_ARRAY_SUBSCRIPT_HANDLER. For this to be bulletproof, we have to forbid user-defined types from using that handler directly; but there seems no good reason for them to do so. This patch also removes assumptions that the number of subscripts is limited to MAXDIM (6), or indeed has any hard-wired limit. That limit still applies to types handled by array_subscript_handler or raw_array_subscript_handler, but to discourage other dependencies on this constant, I've moved it from c.h to utils/array.h. Dmitry Dolgov, reviewed at various times by Tom Lane, Arthur Zakirov, Peter Eisentraut, Pavel Stehule Discussion: https://postgr.es/m/CA+q6zcVDuGBv=M0FqBYX8DPebS3F_0KQ6OVFobGJPM507_SZ_w@mail.gmail.com Discussion: https://postgr.es/m/CA+q6zcVovR+XY4mfk-7oNk-rF91gH0PebnNfuUjuuDsyHjOcVA@mail.gmail.com |
5 years ago |
|
|
0a2bc5d61e |
Move per-agg and per-trans duplicate finding to the planner.
This has the advantage that the cost estimates for aggregates can count the number of calls to transition and final functions correctly. Bump catalog version, because views can contain Aggrefs. Reviewed-by: Andres Freund Discussion: https://www.postgresql.org/message-id/b2e3536b-1dbc-8303-c97e-89cb0b4a9a48%40iki.fi |
5 years ago |
|
|
20d3fe9009 |
In INSERT/UPDATE, use the table's real tuple descriptor as target.
Previously, ExecInitModifyTable relied on ExecInitJunkFilter,
and thence ExecCleanTypeFromTL, to build the target descriptor from
the query tlist. While we just checked (in ExecCheckPlanOutput)
that the tlist produces compatible output, this is not a great
substitute for the relation's actual tuple descriptor that's
available from the relcache. For one thing, dropped columns will
not be correctly marked attisdropped; it's a bit surprising that
we've gotten away with that this long. But the real reason for
being concerned with this is that using the table's descriptor means
that the slot will have correct attrmissing data, allowing us to
revert the klugy fix of commit
|
5 years ago |
|
|
c8ab970179 |
Fix list-munging bug that broke SQL function result coercions.
Since commit
|
5 years ago |
|
|
fb5883da86 |
Remove PartitionRoutingInfo struct.
The extra indirection neeeded to access its members via its enclosing ResultRelInfo seems pointless. Move all the fields from PartitionRoutingInfo to ResultRelInfo. Author: Amit Langote Reviewed-by: Alvaro Herrera Discussion: https://www.postgresql.org/message-id/CA%2BHiwqFViT47Zbr_ASBejiK7iDG8%3DQ1swQ-tjM6caRPQ67pT%3Dw%40mail.gmail.com |
5 years ago |
|
|
6973533650 |
Revise child-to-root tuple conversion map management.
Store the tuple conversion map to convert a tuple from a child table's format to the root format in a new ri_ChildToRootMap field in ResultRelInfo. It is initialized if transition tuple capture for FOR STATEMENT triggers or INSERT tuple routing on a partitioned table is needed. Previously, ModifyTable kept the maps in the per-subplan ModifyTableState->mt_per_subplan_tupconv_maps array, or when tuple routing was used, in ResultRelInfo->ri_Partitioninfo->pi_PartitionToRootMap. The new field replaces both of those. Now that the child-to-root tuple conversion map is always available in ResultRelInfo (when needed), remove the TransitionCaptureState.tcs_map field. The callers of Exec*Trigger() functions no longer need to set or save it, which is much less confusing and bug-prone. Also, as a future optimization, this will allow us to delay creating the map for a given result relation until the relation is actually processed during execution. Author: Amit Langote Discussion: https://www.postgresql.org/message-id/CA%2BHiwqHtCWLdK-LO%3DNEsvOdHx%2B7yv4mE_zYK0i3BH7dXb-wxog%40mail.gmail.com |
5 years ago |
|
|
a04daa97a4 |
Remove es_result_relation_info from EState.
Maintaining 'es_result_relation_info' correctly at all times has become cumbersome, especially with partitioning where each partition gets its own result relation info. Having to set and reset it across arbitrary operations has caused bugs in the past. This changes all the places that used 'es_result_relation_info', to receive the currently active ResultRelInfo via function parameters instead. Author: Amit Langote Discussion: https://www.postgresql.org/message-id/CA%2BHiwqGEmiib8FLiHMhKB%2BCH5dRgHSLc5N5wnvc4kym%2BZYpQEQ%40mail.gmail.com |
5 years ago |
|
|
1375422c78 |
Create ResultRelInfos later in InitPlan, index them by RT index.
Instead of allocating all the ResultRelInfos upfront in one big array, allocate them in ExecInitModifyTable(). es_result_relations is now an array of ResultRelInfo pointers, rather than an array of structs, and it is indexed by the RT index. This simplifies things: we get rid of the separate concept of a "result rel index", and don't need to set it in setrefs.c anymore. This also allows follow-up optimizations (not included in this commit yet) to skip initializing ResultRelInfos for target relations that were not needed at runtime, and removal of the es_result_relation_info pointer. The EState arrays of regular result rels and root result rels are merged into one array. Similarly, the resultRelations and rootResultRelations lists in PlannedStmt are merged into one. It's not actually clear to me why they were kept separate in the first place, but now that the es_result_relations array is indexed by RT index, it certainly seems pointless. The PlannedStmt->resultRelations list is now only needed for ExecRelationIsTargetRelation(). One visible effect of this change is that ExecRelationIsTargetRelation() will now return 'true' also for the partition root, if a partitioned table is updated. That seems like a good thing, although the function isn't used in core code, and I don't see any reason for an FDW to call it on a partition root. Author: Amit Langote Discussion: https://www.postgresql.org/message-id/CA%2BHiwqGEmiib8FLiHMhKB%2BCH5dRgHSLc5N5wnvc4kym%2BZYpQEQ%40mail.gmail.com |
5 years ago |
|
|
41efb83408 |
Move resolution of AlternativeSubPlan choices to the planner.
When commit
|
5 years ago |
|
|
7b1110d2fd |
Fix comment in instrument.h
local_blks_dirtied tracks the number of local blocks dirtied, not shared ones. Author: Kirk Jamison Discussion: https://postgr.es/m/OSBPR01MB2341760686DC056DE89D2AB9EF710@OSBPR01MB2341.jpnprd01.prod.outlook.com |
5 years ago |
|
|
d6c08e29e7 |
Add hash_mem_multiplier GUC.
Add a GUC that acts as a multiplier on work_mem. It gets applied when
sizing executor node hash tables that were previously size constrained
using work_mem alone.
The new GUC can be used to preferentially give hash-based nodes more
memory than the generic work_mem limit. It is intended to enable admin
tuning of the executor's memory usage. Overall system throughput and
system responsiveness can be improved by giving hash-based executor
nodes more memory (especially over sort-based alternatives, which are
often much less sensitive to being memory constrained).
The default value for hash_mem_multiplier is 1.0, which is also the
minimum valid value. This means that hash-based nodes continue to apply
work_mem in the traditional way by default.
hash_mem_multiplier is generally useful. However, it is being added now
due to concerns about hash aggregate performance stability for users
that upgrade to Postgres 13 (which added disk-based hash aggregation in
commit
|
6 years ago |
|
|
9878b643f3 |
HashAgg: use better cardinality estimate for recursive spilling.
Use HyperLogLog to estimate the group cardinality in a spilled partition. This estimate is used to choose the number of partitions if we recurse. The previous behavior was to use the number of tuples in a spilled partition as the estimate for the number of groups, which lead to overpartitioning. That could cause the number of batches to be much higher than expected (with each batch being very small), which made it harder to interpret EXPLAIN ANALYZE results. Reviewed-by: Peter Geoghegan Discussion: https://postgr.es/m/a856635f9284bc36f7a77d02f47bbb6aaf7b59b3.camel@j-davis.com Backpatch-through: 13 |
6 years ago |
|
|
200f6100a9 |
Fix LookupTupleHashEntryHash() pipeline-stall issue.
Refactor hash lookups in nodeAgg.c to improve performance. Author: Andres Freund and Jeff Davis Discussion: https://postgr.es/m/20200612213715.op4ye4q7gktqvpuo%40alap3.anarazel.de Backpatch-through: 13 |
6 years ago |
|
|
cdc7169509 |
Use MinimalTuple for tuple queues.
This representation saves 8 bytes per tuple compared to HeapTuple, and avoids the need to allocate, copy and free on the receiving side. Gather can emit the returned MinimalTuple directly, but GatherMerge now needs to make an explicit copy because it buffers multiple tuples at a time. That should be no worse than before. Reviewed-by: Soumyadeep Chakraborty <soumyadeep2007@gmail.com> Discussion: https://postgr.es/m/CA%2BhUKG%2B8T_ggoUTAE-U%3DA%2BOcPc4%3DB0nPPHcSfffuQhvXXjML6w%40mail.gmail.com |
6 years ago |
|
|
9bdb300ded |
Fix EXPLAIN ANALYZE for parallel HashAgg plans
Since
|
6 years ago |
|
|
2f48ede080 |
Avoid using a cursor in plpgsql's RETURN QUERY statement.
plpgsql has always executed the query given in a RETURN QUERY command by opening it as a cursor and then fetching a few rows at a time, which it turns around and dumps into the function's result tuplestore. The point of this was to keep from blowing out memory with an oversized SPITupleTable result (note that while a tuplestore can spill tuples to disk, SPITupleTable cannot). However, it's rather inefficient, both because of extra data copying and because of executor entry/exit overhead. In recent versions, a new performance problem has emerged: use of a cursor prevents use of a parallel plan for the executed query. We can improve matters by skipping use of a cursor and having the executor push result tuples directly into the function's result tuplestore. However, a moderate amount of new infrastructure is needed to make that idea work: * We can use the existing tstoreReceiver.c DestReceiver code to funnel executor output to the tuplestore, but it has to be extended to support plpgsql's requirement for possibly applying a tuple conversion map. * SPI needs to be extended to allow use of a caller-supplied DestReceiver instead of its usual receiver that puts tuples into a SPITupleTable. Two new API calls are needed to handle both the RETURN QUERY and RETURN QUERY EXECUTE cases. I also felt that I didn't want these new API calls to use the legacy method of specifying query parameter values with "char" null flags (the old ' '/'n' convention); rather they should accept ParamListInfo objects containing the parameter type and value info. This required a bit of additional new infrastructure since we didn't yet have any parse analysis callback that would interpret $N parameter symbols according to type data supplied in a ParamListInfo. There seems to be no harm in letting makeParamList install that callback by default, rather than leaving a new ParamListInfo's parserSetup hook as NULL. (Indeed, as of HEAD, I couldn't find anyplace that was using the parserSetup field at all; plpgsql was using parserSetupArg for its own purposes, but parserSetup seemed to be write-only.) We can actually get plpgsql out of the business of using legacy null flags altogether, and using ParamListInfo instead of its ad-hoc PreparedParamsData structure; but this requires inventing one more SPI API call that can replace SPI_cursor_open_with_args. That seems worth doing, though. SPI_execute_with_args and SPI_cursor_open_with_args are now unused anywhere in the core PG distribution. Perhaps someday we could deprecate/remove them. But cleaning up the crufty bits of the SPI API is a task for a different patch. Per bug #16040 from Jeremy Smith. This is unfortunately too invasive to consider back-patching. Patch by me; thanks to Hamid Akhtar for review. Discussion: https://postgr.es/m/16040-eaacad11fecfb198@postgresql.org |
6 years ago |
|
|
5cbfce562f |
Initial pgindent and pgperltidy run for v13.
Includes some manual cleanup of places that pgindent messed up,
most of which weren't per project style anyway.
Notably, it seems some people didn't absorb the style rules of
commit
|
6 years ago |
|
|
69bfaf2e1d |
Change the display of WAL usage statistics in Explain.
In commit
|
6 years ago |
|
|
ef08ca113f |
Cosmetic fixups for WAL usage work.
Reported-by: Justin Pryzby and Euler Taveira Author: Justin Pryzby and Julien Rouhaud Reviewed-by: Amit Kapila Discussion: https://postgr.es/m/CAB-hujrP8ZfUkvL5OYETipQwA=e3n7oqHFU=4ZLxWS_Cza3kQQ@mail.gmail.com |
6 years ago |
|
|
969f9d0b4b |
Make EXPLAIN report maximum hashtable usage across multiple rescans.
Before discarding the old hash table in ExecReScanHashJoin, capture its statistics, ensuring that we report the maximum hashtable size across repeated rescans of the hash input relation. We can repurpose the existing code for reporting hashtable size in parallel workers to help with this, making the patch pretty small. This also ensures that if rescans happen within parallel workers, we get the correct maximums across all instances. Konstantin Knizhnik and Tom Lane, per diagnosis by Thomas Munro of a trouble report from Alvaro Herrera. Discussion: https://postgr.es/m/20200323165059.GA24950@alvherre.pgsql |
6 years ago |
|
|
50a38f6517 |
Create memory context for HashAgg with a reasonable maxBlockSize.
If the memory context's maxBlockSize is too big, a single block allocation can suddenly exceed work_mem. For Hash Aggregation, this can mean spilling to disk too early or reporting a confusing memory usage number for EXPLAN ANALYZE. Introduce CreateWorkExprContext(), which is like CreateExprContext(), except that it creates the AllocSet with a maxBlockSize that is reasonable in proportion to work_mem. Right now, CreateWorkExprContext() is only used by Hash Aggregation, but it may be generally useful in the future. Discussion: https://postgr.es/m/412a3fbf306f84d8d78c4009e11791867e62b87c.camel@j-davis.com |
6 years ago |
|
|
d2d8a229bc |
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when the input is already sorted by a prefix of the requested sort keys. For example when the relation is already sorted by (key1, key2) and we need to sort it by (key1, key2, key3) we can simply split the input rows into groups having equal values in (key1, key2), and only sort/compare the remaining column key3. This has a number of benefits: - Reduced memory consumption, because only a single group (determined by values in the sorted prefix) needs to be kept in memory. This may also eliminate the need to spill to disk. - Lower startup cost, because Incremental Sort produce results after each prefix group, which is beneficial for plans where startup cost matters (like for example queries with LIMIT clause). We consider both Sort and Incremental Sort, and decide based on costing. The implemented algorithm operates in two different modes: - Fetching a minimum number of tuples without check of equality on the prefix keys, and sorting on all columns when safe. - Fetching all tuples for a single prefix group and then sorting by comparing only the remaining (non-prefix) keys. We always start in the first mode, and employ a heuristic to switch into the second mode if we believe it's beneficial - the goal is to minimize the number of unnecessary comparions while keeping memory consumption below work_mem. This is a very old patch series. The idea was originally proposed by Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the patch was taken over by James Coleman, who wrote and rewrote most of the current code. There were many reviewers/contributors since 2013 - I've done my best to pick the most active ones, and listed them in this commit message. Author: James Coleman, Alexander Korotkov Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com |
6 years ago |
|
|
df3b181499 |
Add infrastructure to track WAL usage.
This allows gathering the WAL generation statistics for each statement execution. The three statistics that we collect are the number of WAL records, the number of full page writes and the amount of WAL bytes generated. This helps the users who have write-intensive workload to see the impact of I/O due to WAL. This further enables us to see approximately what percentage of overall WAL is due to full page writes. In the future, we can extend this functionality to allow us to compute the the exact amount of WAL data due to full page writes. This patch in itself is just an infrastructure to compute WAL usage data. The upcoming patches will expose this data via explain, auto_explain, pg_stat_statements and verbose (auto)vacuum output. Author: Kirill Bychik, Julien Rouhaud Reviewed-by: Dilip Kumar, Fujii Masao and Amit Kapila Discussion: https://postgr.es/m/CAB-hujrP8ZfUkvL5OYETipQwA=e3n7oqHFU=4ZLxWS_Cza3kQQ@mail.gmail.com |
6 years ago |
|
|
0588ee63aa |
Include chunk overhead in hash table entry size estimate.
Don't try to be precise about it, just use a constant 16 bytes of chunk overhead. Being smarter would require knowing the memory context where the chunk will be allocated, which is not known by all callers. Discussion: https://postgr.es/m/20200325220936.il3ni2fj2j2b45y5@alap3.anarazel.de |
6 years ago |
|
|
4a539a25eb |
Expose BufferUsageAccumDiff().
Previously pg_stat_statements calculated the difference of buffer counters by its own code even while BufferUsageAccumDiff() had the same code. This commit expose BufferUsageAccumDiff() and makes pg_stat_statements use it for the calculation, in order to simply the code. This change also would be useful for the upcoming patch for the planning counters in pg_stat_statements because the patch will add one more code for the calculation of difference of buffer counters and that can easily be done by using BufferUsageAccumDiff(). Author: Julien Rouhaud Reviewed-by: Fujii Masao Discussion: https://postgr.es/m/bdfee4e0-a304-2498-8da5-3cb52c0a193e@oss.nttdata.com |
6 years ago |
|
|
bda6dedbea |
Go back to returning int from ereport auxiliary functions.
This reverts the parts of commit
|
6 years ago |