From db632fbca392389807ffb9d9b2207157e8e9b3e8 Mon Sep 17 00:00:00 2001 From: David Rowley Date: Tue, 3 Aug 2021 12:25:52 +1200 Subject: [PATCH] Allow ordered partition scans in more cases 959d00e9d added the ability to make use of an Append node instead of a MergeAppend when we wanted to perform a scan of a partitioned table and the required sort order was the same as the partition keys and the partitioned table was defined in such a way that earlier partitions were guaranteed to only contain lower-order values than later partitions. However, previously we didn't allow these ordered partition scans for LIST partitioned tables when there were any partitions that allowed multiple Datums. This was a very cheap check to make and we could likely have done a little better by checking if there were interleaved partitions, but at the time we didn't have visibility into which partitions were pruned, so we still may have disallowed cases where all interleaved partitions were pruned. Since 475dbd0b7 we have knowledge of which partitions were pruned, so we can do a much better job inside partitions_are_ordered(). Here we pass which partitions survived partition pruning into partitions_are_ordered() and, for LIST partitioning, have it check to see if any live partitions exist that are also in the new "interleaved_parts" field defined in PartitionBoundInfo. For RANGE partitioning we can relax the code which caused the partitions to be unordered if a DEFAULT partition existed. Since we now know which partitions were pruned, partitions_are_ordered() now returns true when the DEFAULT partition was pruned. 
Reviewed-by: Amit Langote, Zhihong Yu Discussion: https://postgr.es/m/CAApHDvrdoN_sXU52i=QDXe2k3WAo=EVry29r2+Tq2WYcn2xhEA@mail.gmail.com --- src/backend/optimizer/path/allpaths.c | 2 +- src/backend/optimizer/path/pathkeys.c | 2 +- src/backend/partitioning/partbounds.c | 105 +++++++++++++++++++------ src/include/partitioning/partbounds.h | 18 ++++- src/test/regress/expected/inherit.out | 108 ++++++++++++++++++++++++++ src/test/regress/sql/inherit.sql | 27 +++++++ 6 files changed, 235 insertions(+), 27 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 671117314a..296dd75c1b 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1689,7 +1689,7 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, * for both forward and reverse scans. */ if (rel->part_scheme != NULL && IS_SIMPLE_REL(rel) && - partitions_are_ordered(rel->boundinfo, rel->nparts)) + partitions_are_ordered(rel->boundinfo, rel->live_parts)) { partition_pathkeys = build_partition_pathkeys(root, rel, ForwardScanDirection, diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index bd9a176d7d..216dd26385 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -704,7 +704,7 @@ build_partition_pathkeys(PlannerInfo *root, RelOptInfo *partrel, int i; Assert(partscheme != NULL); - Assert(partitions_are_ordered(partrel->boundinfo, partrel->nparts)); + Assert(partitions_are_ordered(partrel->boundinfo, partrel->live_parts)); /* For now, we can only cope with baserels */ Assert(IS_SIMPLE_REL(partrel)); diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c index 25018b1a8b..fdfe712f91 100644 --- a/src/backend/partitioning/partbounds.c +++ b/src/backend/partitioning/partbounds.c @@ -395,6 +395,7 @@ create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts, boundinfo->ndatums = nparts; 
boundinfo->datums = (Datum **) palloc0(nparts * sizeof(Datum *)); boundinfo->kind = NULL; + boundinfo->interleaved_parts = NULL; boundinfo->nindexes = greatest_modulus; boundinfo->indexes = (int *) palloc(greatest_modulus * sizeof(int)); for (i = 0; i < greatest_modulus; i++) @@ -543,6 +544,7 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts, boundinfo->ndatums = ndatums; boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *)); boundinfo->kind = NULL; + boundinfo->interleaved_parts = NULL; boundinfo->nindexes = ndatums; boundinfo->indexes = (int *) palloc(ndatums * sizeof(int)); @@ -607,6 +609,69 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts, boundinfo->default_index = (*mapping)[default_index]; } + /* + * Calculate interleaved partitions. Here we look for partitions which + * might be interleaved with other partitions and set a bit in + * interleaved_parts for any partitions which may be interleaved with + * another partition. + */ + + /* + * There must be multiple partitions to have any interleaved partitions, + * otherwise there's nothing to interleave with. + */ + if (nparts > 1) + { + /* + * Short-circuit check to see if only 1 Datum is allowed per + * partition. When this is true there's no need to do the more + * expensive checks to look for interleaved values. + */ + if (boundinfo->ndatums + + partition_bound_accepts_nulls(boundinfo) + + partition_bound_has_default(boundinfo) != nparts) + { + int last_index = -1; + + /* + * Since the indexes array is sorted in Datum order, if any + * partitions are interleaved then it will show up by the + * partition indexes not being in ascending order. Here we check + * for that and record all partitions that are out of order. 
+ */ + for (i = 0; i < boundinfo->nindexes; i++) + { + int index = boundinfo->indexes[i]; + + if (index < last_index) + boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts, + index); + + /* + * Mark the NULL partition as interleaved if we find that it + * allows some other non-NULL Datum. + */ + if (partition_bound_accepts_nulls(boundinfo) && + index == boundinfo->null_index) + boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts, + boundinfo->null_index); + + last_index = index; + } + } + + /* + * The DEFAULT partition is the "catch-all" partition that can contain + * anything that does not belong to any other partition. If there are + * any other partitions then the DEFAULT partition must be marked as + * interleaved. + */ + if (partition_bound_has_default(boundinfo)) + boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts, + boundinfo->default_index); + } + + /* All partitions must now have been assigned canonical indexes. */ Assert(next_index == nparts); return boundinfo; @@ -750,6 +815,7 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts, boundinfo->kind = (PartitionRangeDatumKind **) palloc(ndatums * sizeof(PartitionRangeDatumKind *)); + boundinfo->interleaved_parts = NULL; /* * For range partitioning, an additional value of -1 is stored as the last @@ -993,6 +1059,9 @@ partition_bounds_copy(PartitionBoundInfo src, else dest->kind = NULL; + /* copy interleaved partitions for LIST partitioned tables */ + dest->interleaved_parts = bms_copy(src->interleaved_parts); + /* * For hash partitioning, datums array will have two elements - modulus * and remainder. @@ -2780,13 +2849,15 @@ add_merged_range_bounds(int partnatts, FmgrInfo *partsupfuncs, * that is partitions appearing earlier in the PartitionDesc sequence * contain partition keys strictly less than those appearing later. * Also, if NULL values are possible, they must come in the last - * partition defined in the PartitionDesc. 
+ * partition defined in the PartitionDesc. 'live_parts' marks which + * partitions we should include when checking the ordering. Partitions + * that do not appear in 'live_parts' are ignored. * * If out of order, or there is insufficient info to know the order, * then we return false. */ bool -partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts) +partitions_are_ordered(PartitionBoundInfo boundinfo, Bitmapset *live_parts) { Assert(boundinfo != NULL); @@ -2798,38 +2869,24 @@ partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts) * RANGE-type partitioning guarantees that the partitions can be * scanned in the order that they're defined in the PartitionDesc * to provide sequential, non-overlapping ranges of tuples. - * However, if a DEFAULT partition exists then it doesn't work, as - * that could contain tuples from either below or above the - * defined range, or tuples belonging to gaps between partitions. + * However, if a DEFAULT partition exists and it's contained + * within live_parts, then the partitions are not ordered. */ - if (!partition_bound_has_default(boundinfo)) + if (!partition_bound_has_default(boundinfo) || + !bms_is_member(boundinfo->default_index, live_parts)) return true; break; case PARTITION_STRATEGY_LIST: /* - * LIST partitioning can also guarantee ordering, but only if the - * partitions don't accept interleaved values. We could likely - * check for this by looping over the PartitionBound's indexes - * array to check that the indexes are in order. For now, let's - * just keep it simple and just accept LIST partitioning when - * there's no DEFAULT partition, exactly one value per partition, - * and optionally a NULL partition that does not accept any other - * values. Such a NULL partition will come last in the - * PartitionDesc, and the other partitions will be properly - * ordered. This is a cheap test to make as it does not require - * any per-partition processing. 
Maybe we'd like to handle more - * complex cases in the future. + * LIST partitions are ordered provided that none of live_parts + * overlap with the partitioned table's interleaved partitions. */ - if (partition_bound_has_default(boundinfo)) - return false; - - if (boundinfo->ndatums + partition_bound_accepts_nulls(boundinfo) - == nparts) + if (!bms_overlap(live_parts, boundinfo->interleaved_parts)) return true; - break; + break; default: /* HASH, or some other strategy */ break; diff --git a/src/include/partitioning/partbounds.h b/src/include/partitioning/partbounds.h index 2f00f9aa3d..9db546def6 100644 --- a/src/include/partitioning/partbounds.h +++ b/src/include/partitioning/partbounds.h @@ -61,6 +61,18 @@ struct RelOptInfo; /* avoid including pathnodes.h here */ * The indexes array is indexed according to the hash key's remainder modulo * the greatest modulus, and it contains either the partition index accepting * that remainder, or -1 if there is no partition for that remainder. + * + * For LIST partitioned tables, we track the partition indexes of partitions + * which are possibly "interleaved" partitions. A partition is considered + * interleaved if it allows multiple values and there exists at least one + * other partition which could contain a value that lies between those values. + * For example, if a partition exists FOR VALUES IN(3,5) and another partition + * exists FOR VALUES IN (4), then the IN(3,5) partition is an interleaved + * partition. The same is possible with DEFAULT partitions since they can + * contain any value that does not belong in another partition. This field + * only serves as proof that a particular partition is not interleaved, not + * proof that it is interleaved. When we're uncertain, we mark the + * partition as interleaved. 
*/ typedef struct PartitionBoundInfoData { @@ -70,6 +82,9 @@ typedef struct PartitionBoundInfoData PartitionRangeDatumKind **kind; /* The kind of each range bound datum; * NULL for hash and list partitioned * tables */ + Bitmapset *interleaved_parts; /* Partition indexes of partitions which + * may be interleaved. See above. This is + * only set for LIST partitioned tables */ int nindexes; /* Length of the indexes[] array */ int *indexes; /* Partition indexes */ int null_index; /* Index of the null-accepting partition; -1 @@ -102,7 +117,8 @@ extern PartitionBoundInfo partition_bounds_merge(int partnatts, JoinType jointype, List **outer_parts, List **inner_parts); -extern bool partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts); +extern bool partitions_are_ordered(PartitionBoundInfo boundinfo, + Bitmapset *live_parts); extern void check_new_partition_bound(char *relname, Relation parent, PartitionBoundSpec *spec, ParseState *pstate); diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index 06f44287bc..2d49e765de 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -2180,6 +2180,8 @@ explain (costs off) select * from mcrparted where a < 20 order by a, abs(b), c; Index Cond: (a < 20) (9 rows) +set enable_bitmapscan to off; +set enable_sort to off; create table mclparted (a int) partition by list(a); create table mclparted1 partition of mclparted for values in(1); create table mclparted2 partition of mclparted for values in(2); @@ -2208,7 +2210,113 @@ explain (costs off) select * from mclparted order by a; -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_4 (6 rows) +explain (costs off) select * from mclparted where a in(3,4,5) order by a; + QUERY PLAN +---------------------------------------------------------------------------- + Merge Append + Sort Key: mclparted.a + -> Index Only Scan using mclparted3_5_a_idx on mclparted3_5 mclparted_1 + Index Cond: (a 
= ANY ('{3,4,5}'::integer[])) + -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_2 + Index Cond: (a = ANY ('{3,4,5}'::integer[])) +(6 rows) + +-- Introduce a NULL and DEFAULT partition so we can test more complex cases +create table mclparted_null partition of mclparted for values in(null); +create table mclparted_def partition of mclparted default; +-- Append can be used providing we don't scan the interleaved partition +explain (costs off) select * from mclparted where a in(1,2,4) order by a; + QUERY PLAN +------------------------------------------------------------------------ + Append + -> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1 + Index Cond: (a = ANY ('{1,2,4}'::integer[])) + -> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2 + Index Cond: (a = ANY ('{1,2,4}'::integer[])) + -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3 + Index Cond: (a = ANY ('{1,2,4}'::integer[])) +(7 rows) + +explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a; + QUERY PLAN +-------------------------------------------------------------------------------- + Append + -> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) + -> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) + -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) + -> Index Only Scan using mclparted_null_a_idx on mclparted_null mclparted_4 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) +(9 rows) + +-- Test a more complex case where the NULL partition allows some other value +drop table mclparted_null; +create table mclparted_0_null partition of mclparted for values in(0,null); +-- Ensure MergeAppend is used since 0 and NULLs are in the same partition. 
+explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a; + QUERY PLAN +------------------------------------------------------------------------------------ + Merge Append + Sort Key: mclparted.a + -> Index Only Scan using mclparted_0_null_a_idx on mclparted_0_null mclparted_1 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) + -> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_2 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) + -> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_3 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) + -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_4 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) +(10 rows) + +explain (costs off) select * from mclparted where a in(0,1,2,4) order by a; + QUERY PLAN +------------------------------------------------------------------------------------ + Merge Append + Sort Key: mclparted.a + -> Index Only Scan using mclparted_0_null_a_idx on mclparted_0_null mclparted_1 + Index Cond: (a = ANY ('{0,1,2,4}'::integer[])) + -> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_2 + Index Cond: (a = ANY ('{0,1,2,4}'::integer[])) + -> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_3 + Index Cond: (a = ANY ('{0,1,2,4}'::integer[])) + -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_4 + Index Cond: (a = ANY ('{0,1,2,4}'::integer[])) +(10 rows) + +-- Ensure Append is used when the null partition is pruned +explain (costs off) select * from mclparted where a in(1,2,4) order by a; + QUERY PLAN +------------------------------------------------------------------------ + Append + -> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1 + Index Cond: (a = ANY ('{1,2,4}'::integer[])) + -> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2 + Index Cond: (a = ANY ('{1,2,4}'::integer[])) + -> Index Only Scan using mclparted4_a_idx on 
mclparted4 mclparted_3 + Index Cond: (a = ANY ('{1,2,4}'::integer[])) +(7 rows) + +-- Ensure MergeAppend is used when the default partition is not pruned +explain (costs off) select * from mclparted where a in(1,2,4,100) order by a; + QUERY PLAN +------------------------------------------------------------------------------ + Merge Append + Sort Key: mclparted.a + -> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1 + Index Cond: (a = ANY ('{1,2,4,100}'::integer[])) + -> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2 + Index Cond: (a = ANY ('{1,2,4,100}'::integer[])) + -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3 + Index Cond: (a = ANY ('{1,2,4,100}'::integer[])) + -> Index Only Scan using mclparted_def_a_idx on mclparted_def mclparted_4 + Index Cond: (a = ANY ('{1,2,4,100}'::integer[])) +(10 rows) + drop table mclparted; +reset enable_sort; +reset enable_bitmapscan; -- Ensure subplans which don't have a path with the correct pathkeys get -- sorted correctly. drop index mcrparted_a_abs_c_idx; diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index 64173a8738..195aedb5ff 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -775,6 +775,8 @@ explain (costs off) select a, abs(b) from mcrparted order by a, abs(b), c; -- during planning. 
explain (costs off) select * from mcrparted where a < 20 order by a, abs(b), c; +set enable_bitmapscan to off; +set enable_sort to off; create table mclparted (a int) partition by list(a); create table mclparted1 partition of mclparted for values in(1); create table mclparted2 partition of mclparted for values in(2); @@ -789,8 +791,33 @@ create table mclparted3_5 partition of mclparted for values in(3,5); create table mclparted4 partition of mclparted for values in(4); explain (costs off) select * from mclparted order by a; +explain (costs off) select * from mclparted where a in(3,4,5) order by a; + +-- Introduce a NULL and DEFAULT partition so we can test more complex cases +create table mclparted_null partition of mclparted for values in(null); +create table mclparted_def partition of mclparted default; + +-- Append can be used providing we don't scan the interleaved partition +explain (costs off) select * from mclparted where a in(1,2,4) order by a; +explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a; + +-- Test a more complex case where the NULL partition allows some other value +drop table mclparted_null; +create table mclparted_0_null partition of mclparted for values in(0,null); + +-- Ensure MergeAppend is used since 0 and NULLs are in the same partition. +explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a; +explain (costs off) select * from mclparted where a in(0,1,2,4) order by a; + +-- Ensure Append is used when the null partition is pruned +explain (costs off) select * from mclparted where a in(1,2,4) order by a; + +-- Ensure MergeAppend is used when the default partition is not pruned +explain (costs off) select * from mclparted where a in(1,2,4,100) order by a; drop table mclparted; +reset enable_sort; +reset enable_bitmapscan; -- Ensure subplans which don't have a path with the correct pathkeys get -- sorted correctly.