Allow ordered partition scans in more cases

959d00e9d added the ability to make use of an Append node instead of a
MergeAppend when we wanted to perform a scan of a partitioned table and
the required sort order was the same as the partitioned keys and the
partitioned table was defined in such a way that earlier partitions were
guaranteed to only contain lower-order values than later partitions.
However, previously we didn't allow these ordered partition scans for
LIST partitioned tables when there were any partitions that allowed
multiple Datums.  This was a very cheap check to make and we could likely
have done a little better by checking if there were interleaved
partitions, but at the time we didn't have visibility of which
partitions were pruned, so we still may have disallowed cases where all
interleaved partitions were pruned.

Since 475dbd0b7, we now have knowledge of pruned partitions, so we can do
a much better job inside partitions_are_ordered().

Here we pass which partitions survived partition pruning into
partitions_are_ordered() and, for LIST partitioning, have it check to see
if any live partitions exist that are also in the new "interleaved_parts"
field defined in PartitionBoundInfo.

For RANGE partitioning we can relax the code which caused the partitions
to be unordered if a DEFAULT partition existed.  Since we now know which
partitions were pruned, partitions_are_ordered() now returns true when the
DEFAULT partition was pruned.

Reviewed-by: Amit Langote, Zhihong Yu
Discussion: https://postgr.es/m/CAApHDvrdoN_sXU52i=QDXe2k3WAo=EVry29r2+Tq2WYcn2xhEA@mail.gmail.com
This commit is contained in:
David Rowley 2021-08-03 12:25:52 +12:00
parent 475dbd0b71
commit db632fbca3
6 changed files with 235 additions and 27 deletions

View File

@ -1689,7 +1689,7 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel,
* for both forward and reverse scans.
*/
if (rel->part_scheme != NULL && IS_SIMPLE_REL(rel) &&
partitions_are_ordered(rel->boundinfo, rel->nparts))
partitions_are_ordered(rel->boundinfo, rel->live_parts))
{
partition_pathkeys = build_partition_pathkeys(root, rel,
ForwardScanDirection,

View File

@ -704,7 +704,7 @@ build_partition_pathkeys(PlannerInfo *root, RelOptInfo *partrel,
int i;
Assert(partscheme != NULL);
Assert(partitions_are_ordered(partrel->boundinfo, partrel->nparts));
Assert(partitions_are_ordered(partrel->boundinfo, partrel->live_parts));
/* For now, we can only cope with baserels */
Assert(IS_SIMPLE_REL(partrel));

View File

@ -395,6 +395,7 @@ create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts,
boundinfo->ndatums = nparts;
boundinfo->datums = (Datum **) palloc0(nparts * sizeof(Datum *));
boundinfo->kind = NULL;
boundinfo->interleaved_parts = NULL;
boundinfo->nindexes = greatest_modulus;
boundinfo->indexes = (int *) palloc(greatest_modulus * sizeof(int));
for (i = 0; i < greatest_modulus; i++)
@ -543,6 +544,7 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts,
boundinfo->ndatums = ndatums;
boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
boundinfo->kind = NULL;
boundinfo->interleaved_parts = NULL;
boundinfo->nindexes = ndatums;
boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));
@ -607,6 +609,69 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts,
boundinfo->default_index = (*mapping)[default_index];
}
/*
* Calculate interleaved partitions. Here we look for partitions which
* might be interleaved with other partitions and set a bit in
* interleaved_parts for any partitions which may be interleaved with
* another partition.
*/
/*
* There must be multiple partitions to have any interleaved partitions,
* otherwise there's nothing to interleave with.
*/
if (nparts > 1)
{
/*
* Short-circuit check to see if only 1 Datum is allowed per
* partition. When this is true there's no need to do the more
* expensive checks to look for interleaved values.
*/
if (boundinfo->ndatums +
partition_bound_accepts_nulls(boundinfo) +
partition_bound_has_default(boundinfo) != nparts)
{
int last_index = -1;
/*
* Since the indexes array is sorted in Datum order, if any
* partitions are interleaved then it will show up by the
* partition indexes not being in ascending order. Here we check
* for that and record all partitions that are out of order.
*/
for (i = 0; i < boundinfo->nindexes; i++)
{
int index = boundinfo->indexes[i];
if (index < last_index)
boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts,
index);
/*
* Mark the NULL partition as interleaved if we find that it
* allows some other non-NULL Datum.
*/
if (partition_bound_accepts_nulls(boundinfo) &&
index == boundinfo->null_index)
boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts,
boundinfo->null_index);
last_index = index;
}
}
/*
* The DEFAULT partition is the "catch-all" partition that can contain
* anything that does not belong to any other partition. If there are
* any other partitions then the DEFAULT partition must be marked as
* interleaved.
*/
if (partition_bound_has_default(boundinfo))
boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts,
boundinfo->default_index);
}
/* All partitions must now have been assigned canonical indexes. */
Assert(next_index == nparts);
return boundinfo;
@ -750,6 +815,7 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts,
boundinfo->kind = (PartitionRangeDatumKind **)
palloc(ndatums *
sizeof(PartitionRangeDatumKind *));
boundinfo->interleaved_parts = NULL;
/*
* For range partitioning, an additional value of -1 is stored as the last
@ -993,6 +1059,9 @@ partition_bounds_copy(PartitionBoundInfo src,
else
dest->kind = NULL;
/* copy interleaved partitions for LIST partitioned tables */
dest->interleaved_parts = bms_copy(src->interleaved_parts);
/*
* For hash partitioning, datums array will have two elements - modulus
* and remainder.
@ -2780,13 +2849,15 @@ add_merged_range_bounds(int partnatts, FmgrInfo *partsupfuncs,
* that is partitions appearing earlier in the PartitionDesc sequence
* contain partition keys strictly less than those appearing later.
* Also, if NULL values are possible, they must come in the last
* partition defined in the PartitionDesc.
* partition defined in the PartitionDesc. 'live_parts' marks which
* partitions we should include when checking the ordering. Partitions
* that do not appear in 'live_parts' are ignored.
*
* If out of order, or there is insufficient info to know the order,
* then we return false.
*/
bool
partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts)
partitions_are_ordered(PartitionBoundInfo boundinfo, Bitmapset *live_parts)
{
Assert(boundinfo != NULL);
@ -2798,38 +2869,24 @@ partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts)
* RANGE-type partitioning guarantees that the partitions can be
* scanned in the order that they're defined in the PartitionDesc
* to provide sequential, non-overlapping ranges of tuples.
* However, if a DEFAULT partition exists then it doesn't work, as
* that could contain tuples from either below or above the
* defined range, or tuples belonging to gaps between partitions.
* However, if a DEFAULT partition exists and it's contained
* within live_parts, then the partitions are not ordered.
*/
if (!partition_bound_has_default(boundinfo))
if (!partition_bound_has_default(boundinfo) ||
!bms_is_member(boundinfo->default_index, live_parts))
return true;
break;
case PARTITION_STRATEGY_LIST:
/*
* LIST partitioning can also guarantee ordering, but only if the
* partitions don't accept interleaved values. We could likely
* check for this by looping over the PartitionBound's indexes
* array to check that the indexes are in order. For now, let's
* just keep it simple and just accept LIST partitioning when
* there's no DEFAULT partition, exactly one value per partition,
* and optionally a NULL partition that does not accept any other
* values. Such a NULL partition will come last in the
* PartitionDesc, and the other partitions will be properly
* ordered. This is a cheap test to make as it does not require
* any per-partition processing. Maybe we'd like to handle more
* complex cases in the future.
* LIST partitioned tables are ordered providing none of live_parts
* overlap with the partitioned table's interleaved partitions.
*/
if (partition_bound_has_default(boundinfo))
return false;
if (boundinfo->ndatums + partition_bound_accepts_nulls(boundinfo)
== nparts)
if (!bms_overlap(live_parts, boundinfo->interleaved_parts))
return true;
break;
break;
default:
/* HASH, or some other strategy */
break;

View File

@ -61,6 +61,18 @@ struct RelOptInfo; /* avoid including pathnodes.h here */
* The indexes array is indexed according to the hash key's remainder modulo
* the greatest modulus, and it contains either the partition index accepting
* that remainder, or -1 if there is no partition for that remainder.
*
* For LIST partitioned tables, we track the partition indexes of partitions
* which are possibly "interleaved" partitions. A partition is considered
* interleaved if it allows multiple values and there exists at least one
* other partition which could contain a value that lies between those values.
* For example, if a partition exists FOR VALUES IN(3,5) and another partition
* exists FOR VALUES IN (4), then the IN(3,5) partition is an interleaved
* partition. The same is possible with DEFAULT partitions since they can
* contain any value that does not belong in another partition. This field
* only serves as proof that a particular partition is not interleaved, not
* proof that it is interleaved. When we're uncertain, we mark the
* partition as interleaved.
*/
typedef struct PartitionBoundInfoData
{
@ -70,6 +82,9 @@ typedef struct PartitionBoundInfoData
PartitionRangeDatumKind **kind; /* The kind of each range bound datum;
* NULL for hash and list partitioned
* tables */
Bitmapset *interleaved_parts; /* Partition indexes of partitions which
* may be interleaved. See above. This is
* only set for LIST partitioned tables */
int nindexes; /* Length of the indexes[] array */
int *indexes; /* Partition indexes */
int null_index; /* Index of the null-accepting partition; -1
@ -102,7 +117,8 @@ extern PartitionBoundInfo partition_bounds_merge(int partnatts,
JoinType jointype,
List **outer_parts,
List **inner_parts);
extern bool partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts);
extern bool partitions_are_ordered(PartitionBoundInfo boundinfo,
Bitmapset *live_parts);
extern void check_new_partition_bound(char *relname, Relation parent,
PartitionBoundSpec *spec,
ParseState *pstate);

View File

@ -2180,6 +2180,8 @@ explain (costs off) select * from mcrparted where a < 20 order by a, abs(b), c;
Index Cond: (a < 20)
(9 rows)
set enable_bitmapscan to off;
set enable_sort to off;
create table mclparted (a int) partition by list(a);
create table mclparted1 partition of mclparted for values in(1);
create table mclparted2 partition of mclparted for values in(2);
@ -2208,7 +2210,113 @@ explain (costs off) select * from mclparted order by a;
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_4
(6 rows)
explain (costs off) select * from mclparted where a in(3,4,5) order by a;
QUERY PLAN
----------------------------------------------------------------------------
Merge Append
Sort Key: mclparted.a
-> Index Only Scan using mclparted3_5_a_idx on mclparted3_5 mclparted_1
Index Cond: (a = ANY ('{3,4,5}'::integer[]))
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_2
Index Cond: (a = ANY ('{3,4,5}'::integer[]))
(6 rows)
-- Introduce a NULL and DEFAULT partition so we can test more complex cases
create table mclparted_null partition of mclparted for values in(null);
create table mclparted_def partition of mclparted default;
-- Append can be used providing we don't scan the interleaved partition
explain (costs off) select * from mclparted where a in(1,2,4) order by a;
QUERY PLAN
------------------------------------------------------------------------
Append
-> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1
Index Cond: (a = ANY ('{1,2,4}'::integer[]))
-> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2
Index Cond: (a = ANY ('{1,2,4}'::integer[]))
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3
Index Cond: (a = ANY ('{1,2,4}'::integer[]))
(7 rows)
explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a;
QUERY PLAN
--------------------------------------------------------------------------------
Append
-> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
-> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
-> Index Only Scan using mclparted_null_a_idx on mclparted_null mclparted_4
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
(9 rows)
-- Test a more complex case where the NULL partition allows some other value
drop table mclparted_null;
create table mclparted_0_null partition of mclparted for values in(0,null);
-- Ensure MergeAppend is used since 0 and NULLs are in the same partition.
explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a;
QUERY PLAN
------------------------------------------------------------------------------------
Merge Append
Sort Key: mclparted.a
-> Index Only Scan using mclparted_0_null_a_idx on mclparted_0_null mclparted_1
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
-> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_2
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
-> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_3
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_4
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
(10 rows)
explain (costs off) select * from mclparted where a in(0,1,2,4) order by a;
QUERY PLAN
------------------------------------------------------------------------------------
Merge Append
Sort Key: mclparted.a
-> Index Only Scan using mclparted_0_null_a_idx on mclparted_0_null mclparted_1
Index Cond: (a = ANY ('{0,1,2,4}'::integer[]))
-> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_2
Index Cond: (a = ANY ('{0,1,2,4}'::integer[]))
-> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_3
Index Cond: (a = ANY ('{0,1,2,4}'::integer[]))
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_4
Index Cond: (a = ANY ('{0,1,2,4}'::integer[]))
(10 rows)
-- Ensure Append is used when the null partition is pruned
explain (costs off) select * from mclparted where a in(1,2,4) order by a;
QUERY PLAN
------------------------------------------------------------------------
Append
-> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1
Index Cond: (a = ANY ('{1,2,4}'::integer[]))
-> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2
Index Cond: (a = ANY ('{1,2,4}'::integer[]))
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3
Index Cond: (a = ANY ('{1,2,4}'::integer[]))
(7 rows)
-- Ensure MergeAppend is used when the default partition is not pruned
explain (costs off) select * from mclparted where a in(1,2,4,100) order by a;
QUERY PLAN
------------------------------------------------------------------------------
Merge Append
Sort Key: mclparted.a
-> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1
Index Cond: (a = ANY ('{1,2,4,100}'::integer[]))
-> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2
Index Cond: (a = ANY ('{1,2,4,100}'::integer[]))
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3
Index Cond: (a = ANY ('{1,2,4,100}'::integer[]))
-> Index Only Scan using mclparted_def_a_idx on mclparted_def mclparted_4
Index Cond: (a = ANY ('{1,2,4,100}'::integer[]))
(10 rows)
drop table mclparted;
reset enable_sort;
reset enable_bitmapscan;
-- Ensure subplans which don't have a path with the correct pathkeys get
-- sorted correctly.
drop index mcrparted_a_abs_c_idx;

View File

@ -775,6 +775,8 @@ explain (costs off) select a, abs(b) from mcrparted order by a, abs(b), c;
-- during planning.
explain (costs off) select * from mcrparted where a < 20 order by a, abs(b), c;
set enable_bitmapscan to off;
set enable_sort to off;
create table mclparted (a int) partition by list(a);
create table mclparted1 partition of mclparted for values in(1);
create table mclparted2 partition of mclparted for values in(2);
@ -789,8 +791,33 @@ create table mclparted3_5 partition of mclparted for values in(3,5);
create table mclparted4 partition of mclparted for values in(4);
explain (costs off) select * from mclparted order by a;
explain (costs off) select * from mclparted where a in(3,4,5) order by a;
-- Introduce a NULL and DEFAULT partition so we can test more complex cases
create table mclparted_null partition of mclparted for values in(null);
create table mclparted_def partition of mclparted default;
-- Append can be used providing we don't scan the interleaved partition
explain (costs off) select * from mclparted where a in(1,2,4) order by a;
explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a;
-- Test a more complex case where the NULL partition allows some other value
drop table mclparted_null;
create table mclparted_0_null partition of mclparted for values in(0,null);
-- Ensure MergeAppend is used since 0 and NULLs are in the same partition.
explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a;
explain (costs off) select * from mclparted where a in(0,1,2,4) order by a;
-- Ensure Append is used when the null partition is pruned
explain (costs off) select * from mclparted where a in(1,2,4) order by a;
-- Ensure MergeAppend is used when the default partition is not pruned
explain (costs off) select * from mclparted where a in(1,2,4,100) order by a;
drop table mclparted;
reset enable_sort;
reset enable_bitmapscan;
-- Ensure subplans which don't have a path with the correct pathkeys get
-- sorted correctly.