diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index 7a37bb5d1a..d17b8d9b1f 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -1148,9 +1148,13 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, * statext_mcv_clauselist_selectivity * Estimate clauses using the best multi-column statistics. * - * Selects the best extended (multi-column) statistic on a table (measured by - * the number of attributes extracted from the clauses and covered by it), and - * computes the selectivity for the supplied clauses. + * Applies available extended (multi-column) statistics on a table. There may + * be multiple applicable statistics (with respect to the clauses), in which + * case we use greedy approach. In each round we select the best statistic on + * a table (measured by the number of attributes extracted from the clauses + * and covered by it), and compute the selectivity for the supplied clauses. + * We repeat this process with the remaining clauses (if any), until none of + * the available statistics can be used. * * One of the main challenges with using MCV lists is how to extrapolate the * estimate to the data not covered by the MCV list. To do that, we compute @@ -1194,11 +1198,6 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, * 'estimatedclauses' is an input/output parameter. We set bits for the * 0-based 'clauses' indexes we estimate for and also skip clause items that * already have a bit set. - * - * XXX If we were to use multiple statistics, this is where it would happen. - * We would simply repeat this on a loop on the "remaining" clauses, possibly - * using the already estimated clauses as conditions (and combining the values - * using conditional probability formula). */ static Selectivity statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, @@ -1208,14 +1207,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli ListCell *l; Bitmapset **list_attnums; int listidx; - StatisticExtInfo *stat; - List *stat_clauses; - Selectivity simple_sel, - mcv_sel, - mcv_basesel, - mcv_totalsel, - other_sel, - sel; + Selectivity sel = 1.0; /* check if there's any stats that might be useful for us. */ if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV)) @@ -1250,66 +1242,85 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli listidx++; } - /* find the best suited statistics object for these attnums */ - stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV, - list_attnums, list_length(clauses)); - - /* if no matching stats could be found then we've nothing to do */ - if (!stat) - return 1.0; - - /* Ensure choose_best_statistics produced an expected stats type. */ - Assert(stat->kind == STATS_EXT_MCV); - - /* now filter the clauses to be estimated using the selected MCV */ - stat_clauses = NIL; - - listidx = 0; - foreach(l, clauses) + /* apply as many extended statistics as possible */ + while (true) { - /* - * If the clause is compatible with the selected statistics, mark it - * as estimated and add it to the list to estimate. - */ - if (list_attnums[listidx] != NULL && - bms_is_subset(list_attnums[listidx], stat->keys)) + StatisticExtInfo *stat; + List *stat_clauses; + Selectivity simple_sel, + mcv_sel, + mcv_basesel, + mcv_totalsel, + other_sel, + stat_sel; + + /* find the best suited statistics object for these attnums */ + stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV, + list_attnums, list_length(clauses)); + + /* if no (additional) matching stats could be found then we've nothing to do */ + if (!stat) + break; + + /* Ensure choose_best_statistics produced an expected stats type. */ + Assert(stat->kind == STATS_EXT_MCV); + + /* now filter the clauses to be estimated using the selected MCV */ + stat_clauses = NIL; + + listidx = 0; + foreach(l, clauses) { - stat_clauses = lappend(stat_clauses, (Node *) lfirst(l)); - *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + /* + * If the clause is compatible with the selected statistics, mark it + * as estimated and add it to the list to estimate. + */ + if (list_attnums[listidx] != NULL && + bms_is_subset(list_attnums[listidx], stat->keys)) + { + stat_clauses = lappend(stat_clauses, (Node *) lfirst(l)); + *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + + bms_free(list_attnums[listidx]); + list_attnums[listidx] = NULL; + } + + listidx++; } - listidx++; + /* + * First compute "simple" selectivity, i.e. without the extended + * statistics, and essentially assuming independence of the + * columns/clauses. We'll then use the various selectivities computed from + * MCV list to improve it. + */ + simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid, + jointype, sjinfo, NULL); + + /* + * Now compute the multi-column estimate from the MCV list, along with the + * other selectivities (base & total selectivity). + */ + mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid, + jointype, sjinfo, rel, + &mcv_basesel, &mcv_totalsel); + + /* Estimated selectivity of values not covered by MCV matches */ + other_sel = simple_sel - mcv_basesel; + CLAMP_PROBABILITY(other_sel); + + /* The non-MCV selectivity can't exceed the 1 - mcv_totalsel. */ + if (other_sel > 1.0 - mcv_totalsel) + other_sel = 1.0 - mcv_totalsel; + + /* Overall selectivity is the combination of MCV and non-MCV estimates. */ + stat_sel = mcv_sel + other_sel; + CLAMP_PROBABILITY(stat_sel); + + /* Factor the estimate from this MCV to the oveall estimate. */ + sel *= stat_sel; } - /* - * First compute "simple" selectivity, i.e. without the extended - * statistics, and essentially assuming independence of the - * columns/clauses. We'll then use the various selectivities computed from - * MCV list to improve it. - */ - simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid, - jointype, sjinfo, NULL); - - /* - * Now compute the multi-column estimate from the MCV list, along with the - * other selectivities (base & total selectivity). - */ - mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid, - jointype, sjinfo, rel, - &mcv_basesel, &mcv_totalsel); - - /* Estimated selectivity of values not covered by MCV matches */ - other_sel = simple_sel - mcv_basesel; - CLAMP_PROBABILITY(other_sel); - - /* The non-MCV selectivity can't exceed the 1 - mcv_totalsel. */ - if (other_sel > 1.0 - mcv_totalsel) - other_sel = 1.0 - mcv_totalsel; - - /* Overall selectivity is the combination of MCV and non-MCV estimates. */ - sel = mcv_sel + other_sel; - CLAMP_PROBABILITY(sel); - return sel; } diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index d467c0dd7d..61237dfb11 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -836,6 +836,63 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND 1 | 0 (1 row) +-- check the ability to use multiple MCV lists +CREATE TABLE mcv_lists_multi ( + a INTEGER, + b INTEGER, + c INTEGER, + d INTEGER +); +INSERT INTO mcv_lists_multi (a, b, c, d) + SELECT + mod(i,5), + mod(i,5), + mod(i,7), + mod(i,7) + FROM generate_series(1,5000) s(i); +ANALYZE mcv_lists_multi; +-- estimates without any mcv statistics +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); + estimated | actual +-----------+-------- + 200 | 1000 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); + estimated | actual +-----------+-------- + 102 | 714 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); + estimated | actual +-----------+-------- + 4 | 142 +(1 row) + +-- create separate MCV statistics +CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi; +CREATE STATISTICS mcv_lists_multi_2 (mcv) ON c, d FROM mcv_lists_multi; +ANALYZE mcv_lists_multi; +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); + estimated | actual +-----------+-------- + 1000 | 1000 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); + estimated | actual +-----------+-------- + 714 | 714 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); + estimated | actual +-----------+-------- + 143 | 142 +(1 row) + +DROP TABLE mcv_lists_multi; -- Permission tests. Users should not be able to see specific data values in -- the extended statistics, if they lack permission to see those values in -- the underlying table. diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 734b1f60c9..84f13e8814 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -535,6 +535,41 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c'); +-- check the ability to use multiple MCV lists +CREATE TABLE mcv_lists_multi ( + a INTEGER, + b INTEGER, + c INTEGER, + d INTEGER +); + +INSERT INTO mcv_lists_multi (a, b, c, d) + SELECT + mod(i,5), + mod(i,5), + mod(i,7), + mod(i,7) + FROM generate_series(1,5000) s(i); + +ANALYZE mcv_lists_multi; + +-- estimates without any mcv statistics +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); + +-- create separate MCV statistics +CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi; +CREATE STATISTICS mcv_lists_multi_2 (mcv) ON c, d FROM mcv_lists_multi; + +ANALYZE mcv_lists_multi; + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); + +DROP TABLE mcv_lists_multi; + -- Permission tests. Users should not be able to see specific data values in -- the extended statistics, if they lack permission to see those values in -- the underlying table.