diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 671117314a..296dd75c1b 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1689,7 +1689,7 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, * for both forward and reverse scans. */ if (rel->part_scheme != NULL && IS_SIMPLE_REL(rel) && - partitions_are_ordered(rel->boundinfo, rel->nparts)) + partitions_are_ordered(rel->boundinfo, rel->live_parts)) { partition_pathkeys = build_partition_pathkeys(root, rel, ForwardScanDirection, diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index bd9a176d7d..216dd26385 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -704,7 +704,7 @@ build_partition_pathkeys(PlannerInfo *root, RelOptInfo *partrel, int i; Assert(partscheme != NULL); - Assert(partitions_are_ordered(partrel->boundinfo, partrel->nparts)); + Assert(partitions_are_ordered(partrel->boundinfo, partrel->live_parts)); /* For now, we can only cope with baserels */ Assert(IS_SIMPLE_REL(partrel)); diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c index 25018b1a8b..fdfe712f91 100644 --- a/src/backend/partitioning/partbounds.c +++ b/src/backend/partitioning/partbounds.c @@ -395,6 +395,7 @@ create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts, boundinfo->ndatums = nparts; boundinfo->datums = (Datum **) palloc0(nparts * sizeof(Datum *)); boundinfo->kind = NULL; + boundinfo->interleaved_parts = NULL; boundinfo->nindexes = greatest_modulus; boundinfo->indexes = (int *) palloc(greatest_modulus * sizeof(int)); for (i = 0; i < greatest_modulus; i++) @@ -543,6 +544,7 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts, boundinfo->ndatums = ndatums; boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *)); boundinfo->kind = NULL; + boundinfo->interleaved_parts = NULL; 
boundinfo->nindexes = ndatums; boundinfo->indexes = (int *) palloc(ndatums * sizeof(int)); @@ -607,6 +609,69 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts, boundinfo->default_index = (*mapping)[default_index]; } + /* + * Calculate interleaved partitions. Here we look for partitions which + * might be interleaved with other partitions and set a bit in + * interleaved_parts for any partitions which may be interleaved with + * another partition. + */ + + /* + * There must be multiple partitions to have any interleaved partitions, + * otherwise there's nothing to interleave with. + */ + if (nparts > 1) + { + /* + * Short-circuit check to see if only 1 Datum is allowed per + * partition. When this is true there's no need to do the more + * expensive checks to look for interleaved values. + */ + if (boundinfo->ndatums + + partition_bound_accepts_nulls(boundinfo) + + partition_bound_has_default(boundinfo) != nparts) + { + int last_index = -1; + + /* + * Since the indexes array is sorted in Datum order, if any + * partitions are interleaved then it will show up by the + * partition indexes not being in ascending order. Here we check + * for that and record all partitions that are out of order. + */ + for (i = 0; i < boundinfo->nindexes; i++) + { + int index = boundinfo->indexes[i]; + + if (index < last_index) + boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts, + index); + + /* + * Mark the NULL partition as interleaved if we find that it + * allows some other non-NULL Datum. + */ + if (partition_bound_accepts_nulls(boundinfo) && + index == boundinfo->null_index) + boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts, + boundinfo->null_index); + + last_index = index; + } + } + + /* + * The DEFAULT partition is the "catch-all" partition that can contain + * anything that does not belong to any other partition. If there are + * any other partitions then the DEFAULT partition must be marked as + * interleaved. 
+ */ + if (partition_bound_has_default(boundinfo)) + boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts, + boundinfo->default_index); + } + + /* All partitions must now have been assigned canonical indexes. */ Assert(next_index == nparts); return boundinfo; @@ -750,6 +815,7 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts, boundinfo->kind = (PartitionRangeDatumKind **) palloc(ndatums * sizeof(PartitionRangeDatumKind *)); + boundinfo->interleaved_parts = NULL; /* * For range partitioning, an additional value of -1 is stored as the last @@ -993,6 +1059,9 @@ partition_bounds_copy(PartitionBoundInfo src, else dest->kind = NULL; + /* copy interleaved partitions for LIST partitioned tables */ + dest->interleaved_parts = bms_copy(src->interleaved_parts); + /* * For hash partitioning, datums array will have two elements - modulus * and remainder. @@ -2780,13 +2849,15 @@ add_merged_range_bounds(int partnatts, FmgrInfo *partsupfuncs, * that is partitions appearing earlier in the PartitionDesc sequence * contain partition keys strictly less than those appearing later. * Also, if NULL values are possible, they must come in the last - * partition defined in the PartitionDesc. + * partition defined in the PartitionDesc. 'live_parts' marks which + * partitions we should include when checking the ordering. Partitions + * that do not appear in 'live_parts' are ignored. * * If out of order, or there is insufficient info to know the order, * then we return false. */ bool -partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts) +partitions_are_ordered(PartitionBoundInfo boundinfo, Bitmapset *live_parts) { Assert(boundinfo != NULL); @@ -2798,38 +2869,24 @@ partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts) * RANGE-type partitioning guarantees that the partitions can be * scanned in the order that they're defined in the PartitionDesc * to provide sequential, non-overlapping ranges of tuples. 
- * However, if a DEFAULT partition exists then it doesn't work, as - * that could contain tuples from either below or above the - * defined range, or tuples belonging to gaps between partitions. + * However, if a DEFAULT partition exists and it's contained + * within live_parts, then the partitions are not ordered. */ - if (!partition_bound_has_default(boundinfo)) + if (!partition_bound_has_default(boundinfo) || + !bms_is_member(boundinfo->default_index, live_parts)) return true; break; case PARTITION_STRATEGY_LIST: /* - * LIST partitioning can also guarantee ordering, but only if the - * partitions don't accept interleaved values. We could likely - * check for this by looping over the PartitionBound's indexes - * array to check that the indexes are in order. For now, let's - * just keep it simple and just accept LIST partitioning when - * there's no DEFAULT partition, exactly one value per partition, - * and optionally a NULL partition that does not accept any other - * values. Such a NULL partition will come last in the - * PartitionDesc, and the other partitions will be properly - * ordered. This is a cheap test to make as it does not require - * any per-partition processing. Maybe we'd like to handle more - * complex cases in the future. + * LIST partitioned tables are ordered providing none of live_parts + * overlap with the partitioned table's interleaved partitions. 
*/ - if (partition_bound_has_default(boundinfo)) - return false; - - if (boundinfo->ndatums + partition_bound_accepts_nulls(boundinfo) - == nparts) + if (!bms_overlap(live_parts, boundinfo->interleaved_parts)) return true; - break; + break; default: /* HASH, or some other strategy */ break; diff --git a/src/include/partitioning/partbounds.h b/src/include/partitioning/partbounds.h index 2f00f9aa3d..9db546def6 100644 --- a/src/include/partitioning/partbounds.h +++ b/src/include/partitioning/partbounds.h @@ -61,6 +61,18 @@ struct RelOptInfo; /* avoid including pathnodes.h here */ * The indexes array is indexed according to the hash key's remainder modulo * the greatest modulus, and it contains either the partition index accepting * that remainder, or -1 if there is no partition for that remainder. + * + * For LIST partitioned tables, we track the partition indexes of partitions + * which are possibly "interleaved" partitions. A partition is considered + * interleaved if it allows multiple values and there exists at least one + * other partition which could contain a value that lies between those values. + * For example, if a partition exists FOR VALUES IN(3,5) and another partition + * exists FOR VALUES IN (4), then the IN(3,5) partition is an interleaved + * partition. The same is possible with DEFAULT partitions since they can + * contain any value that does not belong in another partition. This field + * only serves as proof that a particular partition is not interleaved, not + * proof that it is interleaved. When we're uncertain, we mark the + * partition as interleaved. */ typedef struct PartitionBoundInfoData { @@ -70,6 +82,9 @@ typedef struct PartitionBoundInfoData PartitionRangeDatumKind **kind; /* The kind of each range bound datum; * NULL for hash and list partitioned * tables */ + Bitmapset *interleaved_parts; /* Partition indexes of partitions which + * may be interleaved. See above. 
This is + * only set for LIST partitioned tables */ int nindexes; /* Length of the indexes[] array */ int *indexes; /* Partition indexes */ int null_index; /* Index of the null-accepting partition; -1 @@ -102,7 +117,8 @@ extern PartitionBoundInfo partition_bounds_merge(int partnatts, JoinType jointype, List **outer_parts, List **inner_parts); -extern bool partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts); +extern bool partitions_are_ordered(PartitionBoundInfo boundinfo, + Bitmapset *live_parts); extern void check_new_partition_bound(char *relname, Relation parent, PartitionBoundSpec *spec, ParseState *pstate); diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index 06f44287bc..2d49e765de 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -2180,6 +2180,8 @@ explain (costs off) select * from mcrparted where a < 20 order by a, abs(b), c; Index Cond: (a < 20) (9 rows) +set enable_bitmapscan to off; +set enable_sort to off; create table mclparted (a int) partition by list(a); create table mclparted1 partition of mclparted for values in(1); create table mclparted2 partition of mclparted for values in(2); @@ -2208,7 +2210,113 @@ explain (costs off) select * from mclparted order by a; -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_4 (6 rows) +explain (costs off) select * from mclparted where a in(3,4,5) order by a; + QUERY PLAN +---------------------------------------------------------------------------- + Merge Append + Sort Key: mclparted.a + -> Index Only Scan using mclparted3_5_a_idx on mclparted3_5 mclparted_1 + Index Cond: (a = ANY ('{3,4,5}'::integer[])) + -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_2 + Index Cond: (a = ANY ('{3,4,5}'::integer[])) +(6 rows) + +-- Introduce a NULL and DEFAULT partition so we can test more complex cases +create table mclparted_null partition of mclparted for values in(null); +create table 
mclparted_def partition of mclparted default; +-- Append can be used providing we don't scan the interleaved partition +explain (costs off) select * from mclparted where a in(1,2,4) order by a; + QUERY PLAN +------------------------------------------------------------------------ + Append + -> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1 + Index Cond: (a = ANY ('{1,2,4}'::integer[])) + -> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2 + Index Cond: (a = ANY ('{1,2,4}'::integer[])) + -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3 + Index Cond: (a = ANY ('{1,2,4}'::integer[])) +(7 rows) + +explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a; + QUERY PLAN +-------------------------------------------------------------------------------- + Append + -> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) + -> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) + -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) + -> Index Only Scan using mclparted_null_a_idx on mclparted_null mclparted_4 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) +(9 rows) + +-- Test a more complex case where the NULL partition allows some other value +drop table mclparted_null; +create table mclparted_0_null partition of mclparted for values in(0,null); +-- Ensure MergeAppend is used since 0 and NULLs are in the same partition. 
+explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a; + QUERY PLAN +------------------------------------------------------------------------------------ + Merge Append + Sort Key: mclparted.a + -> Index Only Scan using mclparted_0_null_a_idx on mclparted_0_null mclparted_1 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) + -> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_2 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) + -> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_3 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) + -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_4 + Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL)) +(10 rows) + +explain (costs off) select * from mclparted where a in(0,1,2,4) order by a; + QUERY PLAN +------------------------------------------------------------------------------------ + Merge Append + Sort Key: mclparted.a + -> Index Only Scan using mclparted_0_null_a_idx on mclparted_0_null mclparted_1 + Index Cond: (a = ANY ('{0,1,2,4}'::integer[])) + -> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_2 + Index Cond: (a = ANY ('{0,1,2,4}'::integer[])) + -> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_3 + Index Cond: (a = ANY ('{0,1,2,4}'::integer[])) + -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_4 + Index Cond: (a = ANY ('{0,1,2,4}'::integer[])) +(10 rows) + +-- Ensure Append is used when the null partition is pruned +explain (costs off) select * from mclparted where a in(1,2,4) order by a; + QUERY PLAN +------------------------------------------------------------------------ + Append + -> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1 + Index Cond: (a = ANY ('{1,2,4}'::integer[])) + -> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2 + Index Cond: (a = ANY ('{1,2,4}'::integer[])) + -> Index Only Scan using mclparted4_a_idx on 
mclparted4 mclparted_3 + Index Cond: (a = ANY ('{1,2,4}'::integer[])) +(7 rows) + +-- Ensure MergeAppend is used when the default partition is not pruned +explain (costs off) select * from mclparted where a in(1,2,4,100) order by a; + QUERY PLAN +------------------------------------------------------------------------------ + Merge Append + Sort Key: mclparted.a + -> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1 + Index Cond: (a = ANY ('{1,2,4,100}'::integer[])) + -> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2 + Index Cond: (a = ANY ('{1,2,4,100}'::integer[])) + -> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3 + Index Cond: (a = ANY ('{1,2,4,100}'::integer[])) + -> Index Only Scan using mclparted_def_a_idx on mclparted_def mclparted_4 + Index Cond: (a = ANY ('{1,2,4,100}'::integer[])) +(10 rows) + drop table mclparted; +reset enable_sort; +reset enable_bitmapscan; -- Ensure subplans which don't have a path with the correct pathkeys get -- sorted correctly. drop index mcrparted_a_abs_c_idx; diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index 64173a8738..195aedb5ff 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -775,6 +775,8 @@ explain (costs off) select a, abs(b) from mcrparted order by a, abs(b), c; -- during planning. 
explain (costs off) select * from mcrparted where a < 20 order by a, abs(b), c; +set enable_bitmapscan to off; +set enable_sort to off; create table mclparted (a int) partition by list(a); create table mclparted1 partition of mclparted for values in(1); create table mclparted2 partition of mclparted for values in(2); @@ -789,8 +791,33 @@ create table mclparted3_5 partition of mclparted for values in(3,5); create table mclparted4 partition of mclparted for values in(4); explain (costs off) select * from mclparted order by a; +explain (costs off) select * from mclparted where a in(3,4,5) order by a; + +-- Introduce a NULL and DEFAULT partition so we can test more complex cases +create table mclparted_null partition of mclparted for values in(null); +create table mclparted_def partition of mclparted default; + +-- Append can be used providing we don't scan the interleaved partition +explain (costs off) select * from mclparted where a in(1,2,4) order by a; +explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a; + +-- Test a more complex case where the NULL partition allows some other value +drop table mclparted_null; +create table mclparted_0_null partition of mclparted for values in(0,null); + +-- Ensure MergeAppend is used since 0 and NULLs are in the same partition. +explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a; +explain (costs off) select * from mclparted where a in(0,1,2,4) order by a; + +-- Ensure Append is used when the null partition is pruned +explain (costs off) select * from mclparted where a in(1,2,4) order by a; + +-- Ensure MergeAppend is used when the default partition is not pruned +explain (costs off) select * from mclparted where a in(1,2,4,100) order by a; drop table mclparted; +reset enable_sort; +reset enable_bitmapscan; -- Ensure subplans which don't have a path with the correct pathkeys get -- sorted correctly.