diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 73fbb4ac86..ffca0fe5bb 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -164,10 +164,20 @@ static double ineq_histogram_selectivity(PlannerInfo *root, VariableStatData *vardata, FmgrInfo *opproc, bool isgt, bool iseq, Datum constval, Oid consttype); -static double eqjoinsel_inner(Oid operator, - VariableStatData *vardata1, VariableStatData *vardata2); -static double eqjoinsel_semi(Oid operator, +static double eqjoinsel_inner(Oid opfuncoid, + VariableStatData *vardata1, VariableStatData *vardata2, + double nd1, double nd2, + bool isdefault1, bool isdefault2, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + Form_pg_statistic stats1, Form_pg_statistic stats2, + bool have_mcvs1, bool have_mcvs2); +static double eqjoinsel_semi(Oid opfuncoid, VariableStatData *vardata1, VariableStatData *vardata2, + double nd1, double nd2, + bool isdefault1, bool isdefault2, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + Form_pg_statistic stats1, Form_pg_statistic stats2, + bool have_mcvs1, bool have_mcvs2, RelOptInfo *inner_rel); static bool estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel, List **varinfos, double *ndistinct); @@ -2290,20 +2300,69 @@ eqjoinsel(PG_FUNCTION_ARGS) #endif SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); double selec; + double selec_inner; VariableStatData vardata1; VariableStatData vardata2; + double nd1; + double nd2; + bool isdefault1; + bool isdefault2; + Oid opfuncoid; + AttStatsSlot sslot1; + AttStatsSlot sslot2; + Form_pg_statistic stats1 = NULL; + Form_pg_statistic stats2 = NULL; + bool have_mcvs1 = false; + bool have_mcvs2 = false; bool join_is_reversed; RelOptInfo *inner_rel; get_join_variables(root, args, sjinfo, &vardata1, &vardata2, &join_is_reversed); + nd1 = get_variable_numdistinct(&vardata1, &isdefault1); + nd2 = get_variable_numdistinct(&vardata2, &isdefault2); + + opfuncoid = get_opcode(operator); + + memset(&sslot1, 0, sizeof(sslot1)); + memset(&sslot2, 0, sizeof(sslot2)); + + if (HeapTupleIsValid(vardata1.statsTuple)) + { + /* note we allow use of nullfrac regardless of security check */ + stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple); + if (statistic_proc_security_check(&vardata1, opfuncoid)) + have_mcvs1 = get_attstatsslot(&sslot1, vardata1.statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); + } + + if (HeapTupleIsValid(vardata2.statsTuple)) + { + /* note we allow use of nullfrac regardless of security check */ + stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple); + if (statistic_proc_security_check(&vardata2, opfuncoid)) + have_mcvs2 = get_attstatsslot(&sslot2, vardata2.statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); + } + + /* We need to compute the inner-join selectivity in all cases */ + selec_inner = eqjoinsel_inner(opfuncoid, + &vardata1, &vardata2, + nd1, nd2, + isdefault1, isdefault2, + &sslot1, &sslot2, + stats1, stats2, + have_mcvs1, have_mcvs2); + switch (sjinfo->jointype) { case JOIN_INNER: case JOIN_LEFT: case JOIN_FULL: - selec = eqjoinsel_inner(operator, &vardata1, &vardata2); + selec = selec_inner; break; case JOIN_SEMI: case JOIN_ANTI: @@ -2317,12 +2376,40 @@ eqjoinsel(PG_FUNCTION_ARGS) inner_rel = find_join_input_rel(root, sjinfo->min_righthand); if (!join_is_reversed) - selec = eqjoinsel_semi(operator, &vardata1, &vardata2, + selec = eqjoinsel_semi(opfuncoid, + &vardata1, &vardata2, + nd1, nd2, + isdefault1, isdefault2, + &sslot1, &sslot2, + stats1, stats2, + have_mcvs1, have_mcvs2, inner_rel); else - selec = eqjoinsel_semi(get_commutator(operator), + { + Oid commop = get_commutator(operator); + Oid commopfuncoid = OidIsValid(commop) ? get_opcode(commop) : InvalidOid; + + selec = eqjoinsel_semi(commopfuncoid, &vardata2, &vardata1, + nd2, nd1, + isdefault2, isdefault1, + &sslot2, &sslot1, + stats2, stats1, + have_mcvs2, have_mcvs1, inner_rel); + } + + /* + * We should never estimate the output of a semijoin to be more + * rows than we estimate for an inner join with the same input + * rels and join condition; it's obviously impossible for that to + * happen. The former estimate is N1 * Ssemi while the latter is + * N1 * N2 * Sinner, so we may clamp Ssemi <= N2 * Sinner. Doing + * this is worthwhile because of the shakier estimation rules we + * use in eqjoinsel_semi, particularly in cases where it has to + * punt entirely. + */ + selec = Min(selec, inner_rel->rows * selec_inner); break; default: /* other values not expected here */ @@ -2332,6 +2419,9 @@ eqjoinsel(PG_FUNCTION_ARGS) break; } + free_attstatsslot(&sslot1); + free_attstatsslot(&sslot2); + ReleaseVariableStats(vardata1); ReleaseVariableStats(vardata2); @@ -2347,49 +2437,15 @@ eqjoinsel(PG_FUNCTION_ARGS) * that it's worth trying to distinguish them here. */ static double -eqjoinsel_inner(Oid operator, - VariableStatData *vardata1, VariableStatData *vardata2) +eqjoinsel_inner(Oid opfuncoid, + VariableStatData *vardata1, VariableStatData *vardata2, + double nd1, double nd2, + bool isdefault1, bool isdefault2, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + Form_pg_statistic stats1, Form_pg_statistic stats2, + bool have_mcvs1, bool have_mcvs2) { double selec; - double nd1; - double nd2; - bool isdefault1; - bool isdefault2; - Oid opfuncoid; - Form_pg_statistic stats1 = NULL; - Form_pg_statistic stats2 = NULL; - bool have_mcvs1 = false; - bool have_mcvs2 = false; - AttStatsSlot sslot1; - AttStatsSlot sslot2; - - nd1 = get_variable_numdistinct(vardata1, &isdefault1); - nd2 = get_variable_numdistinct(vardata2, &isdefault2); - - opfuncoid = get_opcode(operator); - - memset(&sslot1, 0, sizeof(sslot1)); - memset(&sslot2, 0, sizeof(sslot2)); - - if (HeapTupleIsValid(vardata1->statsTuple)) - { - /* note we allow use of nullfrac regardless of security check */ - stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple); - if (statistic_proc_security_check(vardata1, opfuncoid)) - have_mcvs1 = get_attstatsslot(&sslot1, vardata1->statsTuple, - STATISTIC_KIND_MCV, InvalidOid, - ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); - } - - if (HeapTupleIsValid(vardata2->statsTuple)) - { - /* note we allow use of nullfrac regardless of security check */ - stats2 = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple); - if (statistic_proc_security_check(vardata2, opfuncoid)) - have_mcvs2 = get_attstatsslot(&sslot2, vardata2->statsTuple, - STATISTIC_KIND_MCV, InvalidOid, - ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); - } if (have_mcvs1 && have_mcvs2) { @@ -2423,8 +2479,8 @@ eqjoinsel_inner(Oid operator, nmatches; fmgr_info(opfuncoid, &eqproc); - hasmatch1 = (bool *) palloc0(sslot1.nvalues * sizeof(bool)); - hasmatch2 = (bool *) palloc0(sslot2.nvalues * sizeof(bool)); + hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool)); + hasmatch2 = (bool *) palloc0(sslot2->nvalues * sizeof(bool)); /* * Note we assume that each MCV will match at most one member of the @@ -2434,21 +2490,21 @@ eqjoinsel_inner(Oid operator, */ matchprodfreq = 0.0; nmatches = 0; - for (i = 0; i < sslot1.nvalues; i++) + for (i = 0; i < sslot1->nvalues; i++) { int j; - for (j = 0; j < sslot2.nvalues; j++) + for (j = 0; j < sslot2->nvalues; j++) { if (hasmatch2[j]) continue; if (DatumGetBool(FunctionCall2Coll(&eqproc, DEFAULT_COLLATION_OID, - sslot1.values[i], - sslot2.values[j]))) + sslot1->values[i], + sslot2->values[j]))) { hasmatch1[i] = hasmatch2[j] = true; - matchprodfreq += sslot1.numbers[i] * sslot2.numbers[j]; + matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j]; nmatches++; break; } @@ -2457,22 +2513,22 @@ eqjoinsel_inner(Oid operator, CLAMP_PROBABILITY(matchprodfreq); /* Sum up frequencies of matched and unmatched MCVs */ matchfreq1 = unmatchfreq1 = 0.0; - for (i = 0; i < sslot1.nvalues; i++) + for (i = 0; i < sslot1->nvalues; i++) { if (hasmatch1[i]) - matchfreq1 += sslot1.numbers[i]; + matchfreq1 += sslot1->numbers[i]; else - unmatchfreq1 += sslot1.numbers[i]; + unmatchfreq1 += sslot1->numbers[i]; } CLAMP_PROBABILITY(matchfreq1); CLAMP_PROBABILITY(unmatchfreq1); matchfreq2 = unmatchfreq2 = 0.0; - for (i = 0; i < sslot2.nvalues; i++) + for (i = 0; i < sslot2->nvalues; i++) { if (hasmatch2[i]) - matchfreq2 += sslot2.numbers[i]; + matchfreq2 += sslot2->numbers[i]; else - unmatchfreq2 += sslot2.numbers[i]; + unmatchfreq2 += sslot2->numbers[i]; } CLAMP_PROBABILITY(matchfreq2); CLAMP_PROBABILITY(unmatchfreq2); @@ -2497,15 +2553,15 @@ eqjoinsel_inner(Oid operator, * MCVs plus non-MCV values. */ totalsel1 = matchprodfreq; - if (nd2 > sslot2.nvalues) - totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - sslot2.nvalues); + if (nd2 > sslot2->nvalues) + totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - sslot2->nvalues); if (nd2 > nmatches) totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) / (nd2 - nmatches); /* Same estimate from the point of view of relation 2. */ totalsel2 = matchprodfreq; - if (nd1 > sslot1.nvalues) - totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - sslot1.nvalues); + if (nd1 > sslot1->nvalues) + totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - sslot1->nvalues); if (nd1 > nmatches) totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) / (nd1 - nmatches); @@ -2550,9 +2606,6 @@ eqjoinsel_inner(Oid operator, selec /= nd2; } - free_attstatsslot(&sslot1); - free_attstatsslot(&sslot2); - return selec; } @@ -2561,32 +2614,19 @@ eqjoinsel_inner(Oid operator, * * (Also used for anti join, which we are supposed to estimate the same way.) * Caller has ensured that vardata1 is the LHS variable. - * Unlike eqjoinsel_inner, we have to cope with operator being InvalidOid. + * Unlike eqjoinsel_inner, we have to cope with opfuncoid being InvalidOid. */ static double -eqjoinsel_semi(Oid operator, +eqjoinsel_semi(Oid opfuncoid, VariableStatData *vardata1, VariableStatData *vardata2, + double nd1, double nd2, + bool isdefault1, bool isdefault2, + AttStatsSlot *sslot1, AttStatsSlot *sslot2, + Form_pg_statistic stats1, Form_pg_statistic stats2, + bool have_mcvs1, bool have_mcvs2, RelOptInfo *inner_rel) { double selec; - double nd1; - double nd2; - bool isdefault1; - bool isdefault2; - Oid opfuncoid; - Form_pg_statistic stats1 = NULL; - bool have_mcvs1 = false; - bool have_mcvs2 = false; - AttStatsSlot sslot1; - AttStatsSlot sslot2; - - nd1 = get_variable_numdistinct(vardata1, &isdefault1); - nd2 = get_variable_numdistinct(vardata2, &isdefault2); - - opfuncoid = OidIsValid(operator) ? get_opcode(operator) : InvalidOid; - - memset(&sslot1, 0, sizeof(sslot1)); - memset(&sslot2, 0, sizeof(sslot2)); /* * We clamp nd2 to be not more than what we estimate the inner relation's @@ -2621,26 +2661,7 @@ eqjoinsel_semi(Oid operator, isdefault2 = false; } - if (HeapTupleIsValid(vardata1->statsTuple)) - { - /* note we allow use of nullfrac regardless of security check */ - stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple); - if (statistic_proc_security_check(vardata1, opfuncoid)) - have_mcvs1 = get_attstatsslot(&sslot1, vardata1->statsTuple, - STATISTIC_KIND_MCV, InvalidOid, - ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); - } - - if (HeapTupleIsValid(vardata2->statsTuple) && - statistic_proc_security_check(vardata2, opfuncoid)) - { - have_mcvs2 = get_attstatsslot(&sslot2, vardata2->statsTuple, - STATISTIC_KIND_MCV, InvalidOid, - ATTSTATSSLOT_VALUES); - /* note: currently don't need stanumbers from RHS */ - } - - if (have_mcvs1 && have_mcvs2 && OidIsValid(operator)) + if (have_mcvs1 && have_mcvs2 && OidIsValid(opfuncoid)) { /* * We have most-common-value lists for both relations. Run through @@ -2663,15 +2684,15 @@ eqjoinsel_semi(Oid operator, /* * The clamping above could have resulted in nd2 being less than - * sslot2.nvalues; in which case, we assume that precisely the nd2 + * sslot2->nvalues; in which case, we assume that precisely the nd2 * most common values in the relation will appear in the join input, * and so compare to only the first nd2 members of the MCV list. Of * course this is frequently wrong, but it's the best bet we can make. */ - clamped_nvalues2 = Min(sslot2.nvalues, nd2); + clamped_nvalues2 = Min(sslot2->nvalues, nd2); fmgr_info(opfuncoid, &eqproc); - hasmatch1 = (bool *) palloc0(sslot1.nvalues * sizeof(bool)); + hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool)); hasmatch2 = (bool *) palloc0(clamped_nvalues2 * sizeof(bool)); /* @@ -2681,7 +2702,7 @@ eqjoinsel_semi(Oid operator, * and because the math wouldn't add up... */ nmatches = 0; - for (i = 0; i < sslot1.nvalues; i++) + for (i = 0; i < sslot1->nvalues; i++) { int j; @@ -2691,8 +2712,8 @@ eqjoinsel_semi(Oid operator, continue; if (DatumGetBool(FunctionCall2Coll(&eqproc, DEFAULT_COLLATION_OID, - sslot1.values[i], - sslot2.values[j]))) + sslot1->values[i], + sslot2->values[j]))) { hasmatch1[i] = hasmatch2[j] = true; nmatches++; @@ -2702,10 +2723,10 @@ eqjoinsel_semi(Oid operator, } /* Sum up frequencies of matched MCVs */ matchfreq1 = 0.0; - for (i = 0; i < sslot1.nvalues; i++) + for (i = 0; i < sslot1->nvalues; i++) { if (hasmatch1[i]) - matchfreq1 += sslot1.numbers[i]; + matchfreq1 += sslot1->numbers[i]; } CLAMP_PROBABILITY(matchfreq1); pfree(hasmatch1); @@ -2760,9 +2781,6 @@ eqjoinsel_semi(Oid operator, selec = 0.5 * (1.0 - nullfrac1); } - free_attstatsslot(&sslot1); - free_attstatsslot(&sslot2); - return selec; } diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out index 3ba3aaf2d8..c55de5d476 100644 --- a/src/test/regress/expected/partition_join.out +++ b/src/test/regress/expected/partition_join.out @@ -801,8 +801,8 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHER EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN -------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------- Sort Sort Key: t1.a -> Append @@ -831,19 +831,18 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN ( Index Cond: (a = t1_4.b) Filter: (b = 0) -> Nested Loop - -> Unique - -> Sort - Sort Key: t1_5.b - -> Hash Semi Join - Hash Cond: (t1_5.b = ((t1_8.a + t1_8.b) / 2)) - -> Seq Scan on prt2_p3 t1_5 - -> Hash - -> Seq Scan on prt1_e_p3 t1_8 - Filter: (c = 0) + -> HashAggregate + Group Key: t1_5.b + -> Hash Semi Join + Hash Cond: (t1_5.b = ((t1_8.a + t1_8.b) / 2)) + -> Seq Scan on prt2_p3 t1_5 + -> Hash + -> Seq Scan on prt1_e_p3 t1_8 + Filter: (c = 0) -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 Index Cond: (a = t1_5.b) Filter: (b = 0) -(40 rows) +(39 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; a | b | c