mirror of https://github.com/postgres/postgres
Incremental Sort is an optimized variant of multikey sort for cases when the input is already sorted by a prefix of the requested sort keys. For example when the relation is already sorted by (key1, key2) and we need to sort it by (key1, key2, key3) we can simply split the input rows into groups having equal values in (key1, key2), and only sort/compare the remaining column key3. This has a number of benefits: - Reduced memory consumption, because only a single group (determined by values in the sorted prefix) needs to be kept in memory. This may also eliminate the need to spill to disk. - Lower startup cost, because Incremental Sort produces results after each prefix group, which is beneficial for plans where startup cost matters (like for example queries with LIMIT clause). We consider both Sort and Incremental Sort, and decide based on costing. The implemented algorithm operates in two different modes: - Fetching a minimum number of tuples without check of equality on the prefix keys, and sorting on all columns when safe. - Fetching all tuples for a single prefix group and then sorting by comparing only the remaining (non-prefix) keys. We always start in the first mode, and employ a heuristic to switch into the second mode if we believe it's beneficial - the goal is to minimize the number of unnecessary comparisons while keeping memory consumption below work_mem. This is a very old patch series. The idea was originally proposed by Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the patch was taken over by James Coleman, who wrote and rewrote most of the current code. There were many reviewers/contributors since 2013 - I've done my best to pick the most active ones, and listed them in this commit message. 
Author: James Coleman, Alexander Korotkov Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
parent
3c8553547b
commit
d2d8a229bc
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,28 @@ |
||||
/*-------------------------------------------------------------------------
 *
 * nodeIncrementalSort.h
 *	  prototypes for nodeIncrementalSort.c
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/executor/nodeIncrementalSort.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef NODEINCREMENTALSORT_H
#define NODEINCREMENTALSORT_H

#include "access/parallel.h"
#include "nodes/execnodes.h"

/* Executor entry points for the Incremental Sort plan node. */
extern IncrementalSortState *ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags);
extern void ExecEndIncrementalSort(IncrementalSortState *node);
extern void ExecReScanIncrementalSort(IncrementalSortState *node);

/* parallel instrumentation support */
extern void ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt);
extern void ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt);
extern void ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pcxt);
extern void ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node);

#endif							/* NODEINCREMENTALSORT_H */
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,213 @@ |
||||
-- When we have to sort the entire table, incremental sort will
-- be slower than plain sort, so it should not be used.
explain (costs off)
select * from (select * from tenk1 order by four) t order by four, ten;

-- When there is a LIMIT clause, incremental sort is beneficial because
-- it only has to sort some of the groups, and not the entire table.
explain (costs off)
select * from (select * from tenk1 order by four) t order by four, ten
limit 1;

-- When work_mem is not enough to sort the entire table, incremental sort
-- may be faster if individual groups still fit into work_mem.
set work_mem to '2MB';
explain (costs off)
select * from (select * from tenk1 order by four) t order by four, ten;
reset work_mem;

-- Scratch table used by the mode-transition tests below.
create table t(a integer, b integer);
||||
|
||||
-- Run EXPLAIN ANALYZE on the given query and return its text output with
-- every memory-usage figure (e.g. "25kB") masked as "NNkB", so the result
-- is stable across runs and platforms.
create or replace function explain_analyze_without_memory(query text)
returns table (out_line text) language plpgsql
as
$$
declare
    raw_line text;
begin
    for raw_line in
        execute 'explain (analyze, costs off, summary off, timing off) ' || query
    loop
        -- Mask the concrete kilobyte counts before emitting the line.
        out_line := regexp_replace(raw_line, '\d+kB', 'NNkB', 'g');
        return next;
    end loop;
end;
$$;
||||
|
||||
-- Walk the JSON-format EXPLAIN ANALYZE output of the given query and
-- collect every "Incremental Sort" plan node into a jsonb array.
create or replace function explain_analyze_inc_sort_nodes(query text)
returns jsonb language plpgsql
as
$$
declare
  elements jsonb;
  element jsonb;
  matching_nodes jsonb := '[]'::jsonb;
begin
  execute 'explain (analyze, costs off, summary off, timing off, format ''json'') ' || query into strict elements;
  -- Breadth-first traversal: "elements" serves as a work queue of jsonb
  -- values still to be examined; pop from the front, push children at the back.
  while jsonb_array_length(elements) > 0 loop
    element := elements->0;
    elements := elements - 0;
    case jsonb_typeof(element)
    when 'array' then
      -- An array: splice its contents onto the queue for later inspection.
      if jsonb_array_length(element) > 0 then
        elements := elements || element;
      end if;
    when 'object' then
      if element ? 'Plan' then
        -- Top-level EXPLAIN object: queue its plan tree.
        elements := elements || jsonb_build_array(element->'Plan');
        element := element - 'Plan';
      else
        if element ? 'Plans' then
          -- Queue this node's child plans, then inspect the node itself.
          elements := elements || jsonb_build_array(element->'Plans');
          element := element - 'Plans';
        end if;
        if (element->>'Node Type')::text = 'Incremental Sort' then
          matching_nodes := matching_nodes || element;
        end if;
      end if;
    end case;
  end loop;
  return matching_nodes;
end;
$$;
||||
|
||||
-- Return the Incremental Sort nodes of the query's EXPLAIN ANALYZE output
-- with all sort-space usage numbers masked as "NN", so the output is
-- stable across runs and platforms.
create or replace function explain_analyze_inc_sort_nodes_without_memory(query text)
returns jsonb language plpgsql
as
$$
declare
    result jsonb := '[]'::jsonb;
    sort_node jsonb;
    group_label text;
    space_label text;
begin
    for sort_node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
        foreach group_label in array array['Full-sort Groups', 'Presorted Groups'] loop
            foreach space_label in array array['Sort Space Memory', 'Sort Space Disk'] loop
                -- Mask both statistics; create_if_missing = false leaves
                -- absent groups untouched.
                sort_node := jsonb_set(sort_node, array[group_label, space_label, 'Average Sort Space Used'], '"NN"', false);
                sort_node := jsonb_set(sort_node, array[group_label, space_label, 'Maximum Sort Space Used'], '"NN"', false);
            end loop;
        end loop;
        result := result || sort_node;
    end loop;
    return result;
end;
$$;
||||
|
||||
-- Verify the statistics invariants of every Incremental Sort node in the
-- query's EXPLAIN ANALYZE output: for each group kind and space kind, the
-- maximum sort space used must be at least the average sort space used.
-- Raises an exception on violation; returns true otherwise.
create or replace function explain_analyze_inc_sort_nodes_verify_invariants(query text)
returns bool language plpgsql
as
$$
declare
  node jsonb;
  group_stats jsonb;
  group_key text;
  space_key text;
begin
  for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
    for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop
      group_stats := node->group_key;
      for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
        -- BUG FIX: the original compared 'Maximum Sort Space Used' against
        -- itself, so the invariant check could never fire; compare the
        -- maximum against the average instead, matching the error message.
        if (group_stats->space_key->'Maximum Sort Space Used')::bigint < (group_stats->space_key->'Average Sort Space Used')::bigint then
          raise exception '% has invalid max space < average space', group_key;
        end if;
      end loop;
    end loop;
  end loop;
  return true;
end;
$$;
||||
|
||||
-- A single large group tested around each mode transition point.
insert into t(a, b) select 1, i from generate_series(1, 100) n(i);
explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
select * from (select * from t order by a) s order by a, b limit 31;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
select * from (select * from t order by a) s order by a, b limit 32;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
select * from (select * from t order by a) s order by a, b limit 33;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
select * from (select * from t order by a) s order by a, b limit 65;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
select * from (select * from t order by a) s order by a, b limit 66;
-- Reset the scratch table for the next scenario.
delete from t;
||||
|
||||
-- An initial large group followed by a small group.
insert into t(a, b) select (case when i < 50 then 1 else 2 end), i from generate_series(1, 100) n(i);
explain (costs off) select * from (select * from t order by a) s order by a, b limit 55;
select * from (select * from t order by a) s order by a, b limit 55;
-- Test EXPLAIN ANALYZE with only a fullsort group.
select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55');
select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55'));
select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 55');
-- Reset the scratch table for the next scenario.
delete from t;
||||
|
||||
-- An initial small group followed by a large group.
insert into t(a, b) select (case when i < 5 then i else 9 end), i from generate_series(1, 100) n(i);
explain (costs off) select * from (select * from t order by a) s order by a, b limit 70;
select * from (select * from t order by a) s order by a, b limit 70;
-- Test rescan.
begin;
-- We force the planner to choose a plan with incremental sort on the right side
-- of a nested loop join node. That way we trigger the rescan code path.
set local enable_hashjoin = off;
set local enable_mergejoin = off;
set local enable_material = off;
set local enable_sort = off;
explain (costs off) select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2);
select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2);
-- Discard the local planner settings.
rollback;
-- Test EXPLAIN ANALYZE with both fullsort and presorted groups.
select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70');
select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70'));
select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 70');
-- Reset the scratch table for the next scenario.
delete from t;
||||
|
||||
-- Small groups of 10 tuples each tested around each mode transition point.
insert into t(a, b) select i / 10, i from generate_series(1, 70) n(i);
explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
select * from (select * from t order by a) s order by a, b limit 31;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
select * from (select * from t order by a) s order by a, b limit 32;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
select * from (select * from t order by a) s order by a, b limit 33;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
select * from (select * from t order by a) s order by a, b limit 65;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
select * from (select * from t order by a) s order by a, b limit 66;
-- Reset the scratch table for the next scenario.
delete from t;
||||
|
||||
-- Small groups of only 1 tuple each tested around each mode transition point.
insert into t(a, b) select i, i from generate_series(1, 70) n(i);
explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
select * from (select * from t order by a) s order by a, b limit 31;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
select * from (select * from t order by a) s order by a, b limit 32;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
select * from (select * from t order by a) s order by a, b limit 33;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
select * from (select * from t order by a) s order by a, b limit 65;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
select * from (select * from t order by a) s order by a, b limit 66;
delete from t;

-- Done with the mode-transition scenarios; drop the scratch table.
drop table t;
||||
|
||||
-- Incremental sort vs. parallel queries
-- Make parallel plans cheap enough that even this small table qualifies.
set min_parallel_table_scan_size = '1kB';
set min_parallel_index_scan_size = '1kB';
set parallel_setup_cost = 0;
set parallel_tuple_cost = 0;

create table t (a int, b int, c int);
insert into t select mod(i,10),mod(i,10),i from generate_series(1,10000) s(i);
create index on t (a);
analyze t;

-- Compare plans with incremental sort disabled vs. enabled.
set enable_incrementalsort = off;
explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1;

set enable_incrementalsort = on;
explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1;

drop table t;
Loading…
Reference in new issue