Clamp semijoin selectivity to be not more than inner-join selectivity.

We should never estimate the output of a semijoin to be more rows than
we estimate for an inner join with the same input rels and join condition;
it's obviously impossible for that to happen.  However, given the
relatively poor quality of our semijoin selectivity estimates ---
particularly, but not only, in cases where we punt and return a default
estimate --- we did often deliver such estimates.  To improve matters,
calculate both estimates inside eqjoinsel() and take the smaller one.

The bulk of this patch is just mechanical refactoring to avoid repetitive
information lookup when we call both eqjoinsel_semi and eqjoinsel_inner.
The actual new behavior is just

	selec = Min(selec, inner_rel->rows * selec_inner);

which looks a bit odd but is correct because of our different definitions
for inner and semi join selectivity.

There is one ensuing plan change in the regression tests, but it looks
reasonable enough (and checking the actual row counts shows that the
estimate moved closer to reality, not further away).

Per bug #15160 from Alexey Ermakov.  Although this is arguably a bug fix,
I won't risk destabilizing plan choices in stable branches by
back-patching.

Tom Lane, reviewed by Melanie Plageman

Discussion: https://postgr.es/m/152395805004.19366.3107109716821067806@wrigleys.postgresql.org
This commit is contained in:
Tom Lane 2018-11-23 12:48:49 -05:00
parent 3be5fe2b10
commit a314c34079
2 changed files with 147 additions and 130 deletions

View File

@ -164,10 +164,20 @@ static double ineq_histogram_selectivity(PlannerInfo *root,
VariableStatData *vardata,
FmgrInfo *opproc, bool isgt, bool iseq,
Datum constval, Oid consttype);
static double eqjoinsel_inner(Oid operator,
VariableStatData *vardata1, VariableStatData *vardata2);
static double eqjoinsel_semi(Oid operator,
static double eqjoinsel_inner(Oid opfuncoid,
VariableStatData *vardata1, VariableStatData *vardata2,
double nd1, double nd2,
bool isdefault1, bool isdefault2,
AttStatsSlot *sslot1, AttStatsSlot *sslot2,
Form_pg_statistic stats1, Form_pg_statistic stats2,
bool have_mcvs1, bool have_mcvs2);
static double eqjoinsel_semi(Oid opfuncoid,
VariableStatData *vardata1, VariableStatData *vardata2,
double nd1, double nd2,
bool isdefault1, bool isdefault2,
AttStatsSlot *sslot1, AttStatsSlot *sslot2,
Form_pg_statistic stats1, Form_pg_statistic stats2,
bool have_mcvs1, bool have_mcvs2,
RelOptInfo *inner_rel);
static bool estimate_multivariate_ndistinct(PlannerInfo *root,
RelOptInfo *rel, List **varinfos, double *ndistinct);
@ -2290,20 +2300,69 @@ eqjoinsel(PG_FUNCTION_ARGS)
#endif
SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
double selec;
double selec_inner;
VariableStatData vardata1;
VariableStatData vardata2;
double nd1;
double nd2;
bool isdefault1;
bool isdefault2;
Oid opfuncoid;
AttStatsSlot sslot1;
AttStatsSlot sslot2;
Form_pg_statistic stats1 = NULL;
Form_pg_statistic stats2 = NULL;
bool have_mcvs1 = false;
bool have_mcvs2 = false;
bool join_is_reversed;
RelOptInfo *inner_rel;
get_join_variables(root, args, sjinfo,
&vardata1, &vardata2, &join_is_reversed);
nd1 = get_variable_numdistinct(&vardata1, &isdefault1);
nd2 = get_variable_numdistinct(&vardata2, &isdefault2);
opfuncoid = get_opcode(operator);
memset(&sslot1, 0, sizeof(sslot1));
memset(&sslot2, 0, sizeof(sslot2));
if (HeapTupleIsValid(vardata1.statsTuple))
{
/* note we allow use of nullfrac regardless of security check */
stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
if (statistic_proc_security_check(&vardata1, opfuncoid))
have_mcvs1 = get_attstatsslot(&sslot1, vardata1.statsTuple,
STATISTIC_KIND_MCV, InvalidOid,
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
}
if (HeapTupleIsValid(vardata2.statsTuple))
{
/* note we allow use of nullfrac regardless of security check */
stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
if (statistic_proc_security_check(&vardata2, opfuncoid))
have_mcvs2 = get_attstatsslot(&sslot2, vardata2.statsTuple,
STATISTIC_KIND_MCV, InvalidOid,
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
}
/* We need to compute the inner-join selectivity in all cases */
selec_inner = eqjoinsel_inner(opfuncoid,
&vardata1, &vardata2,
nd1, nd2,
isdefault1, isdefault2,
&sslot1, &sslot2,
stats1, stats2,
have_mcvs1, have_mcvs2);
switch (sjinfo->jointype)
{
case JOIN_INNER:
case JOIN_LEFT:
case JOIN_FULL:
selec = eqjoinsel_inner(operator, &vardata1, &vardata2);
selec = selec_inner;
break;
case JOIN_SEMI:
case JOIN_ANTI:
@ -2317,12 +2376,40 @@ eqjoinsel(PG_FUNCTION_ARGS)
inner_rel = find_join_input_rel(root, sjinfo->min_righthand);
if (!join_is_reversed)
selec = eqjoinsel_semi(operator, &vardata1, &vardata2,
selec = eqjoinsel_semi(opfuncoid,
&vardata1, &vardata2,
nd1, nd2,
isdefault1, isdefault2,
&sslot1, &sslot2,
stats1, stats2,
have_mcvs1, have_mcvs2,
inner_rel);
else
selec = eqjoinsel_semi(get_commutator(operator),
{
Oid commop = get_commutator(operator);
Oid commopfuncoid = OidIsValid(commop) ? get_opcode(commop) : InvalidOid;
selec = eqjoinsel_semi(commopfuncoid,
&vardata2, &vardata1,
nd2, nd1,
isdefault2, isdefault1,
&sslot2, &sslot1,
stats2, stats1,
have_mcvs2, have_mcvs1,
inner_rel);
}
/*
* We should never estimate the output of a semijoin to be more
* rows than we estimate for an inner join with the same input
* rels and join condition; it's obviously impossible for that to
* happen. The former estimate is N1 * Ssemi while the latter is
* N1 * N2 * Sinner, so we may clamp Ssemi <= N2 * Sinner. Doing
* this is worthwhile because of the shakier estimation rules we
* use in eqjoinsel_semi, particularly in cases where it has to
* punt entirely.
*/
selec = Min(selec, inner_rel->rows * selec_inner);
break;
default:
/* other values not expected here */
@ -2332,6 +2419,9 @@ eqjoinsel(PG_FUNCTION_ARGS)
break;
}
free_attstatsslot(&sslot1);
free_attstatsslot(&sslot2);
ReleaseVariableStats(vardata1);
ReleaseVariableStats(vardata2);
@ -2347,49 +2437,15 @@ eqjoinsel(PG_FUNCTION_ARGS)
* that it's worth trying to distinguish them here.
*/
static double
eqjoinsel_inner(Oid operator,
VariableStatData *vardata1, VariableStatData *vardata2)
eqjoinsel_inner(Oid opfuncoid,
VariableStatData *vardata1, VariableStatData *vardata2,
double nd1, double nd2,
bool isdefault1, bool isdefault2,
AttStatsSlot *sslot1, AttStatsSlot *sslot2,
Form_pg_statistic stats1, Form_pg_statistic stats2,
bool have_mcvs1, bool have_mcvs2)
{
double selec;
double nd1;
double nd2;
bool isdefault1;
bool isdefault2;
Oid opfuncoid;
Form_pg_statistic stats1 = NULL;
Form_pg_statistic stats2 = NULL;
bool have_mcvs1 = false;
bool have_mcvs2 = false;
AttStatsSlot sslot1;
AttStatsSlot sslot2;
nd1 = get_variable_numdistinct(vardata1, &isdefault1);
nd2 = get_variable_numdistinct(vardata2, &isdefault2);
opfuncoid = get_opcode(operator);
memset(&sslot1, 0, sizeof(sslot1));
memset(&sslot2, 0, sizeof(sslot2));
if (HeapTupleIsValid(vardata1->statsTuple))
{
/* note we allow use of nullfrac regardless of security check */
stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
if (statistic_proc_security_check(vardata1, opfuncoid))
have_mcvs1 = get_attstatsslot(&sslot1, vardata1->statsTuple,
STATISTIC_KIND_MCV, InvalidOid,
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
}
if (HeapTupleIsValid(vardata2->statsTuple))
{
/* note we allow use of nullfrac regardless of security check */
stats2 = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
if (statistic_proc_security_check(vardata2, opfuncoid))
have_mcvs2 = get_attstatsslot(&sslot2, vardata2->statsTuple,
STATISTIC_KIND_MCV, InvalidOid,
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
}
if (have_mcvs1 && have_mcvs2)
{
@ -2423,8 +2479,8 @@ eqjoinsel_inner(Oid operator,
nmatches;
fmgr_info(opfuncoid, &eqproc);
hasmatch1 = (bool *) palloc0(sslot1.nvalues * sizeof(bool));
hasmatch2 = (bool *) palloc0(sslot2.nvalues * sizeof(bool));
hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool));
hasmatch2 = (bool *) palloc0(sslot2->nvalues * sizeof(bool));
/*
* Note we assume that each MCV will match at most one member of the
@ -2434,21 +2490,21 @@ eqjoinsel_inner(Oid operator,
*/
matchprodfreq = 0.0;
nmatches = 0;
for (i = 0; i < sslot1.nvalues; i++)
for (i = 0; i < sslot1->nvalues; i++)
{
int j;
for (j = 0; j < sslot2.nvalues; j++)
for (j = 0; j < sslot2->nvalues; j++)
{
if (hasmatch2[j])
continue;
if (DatumGetBool(FunctionCall2Coll(&eqproc,
DEFAULT_COLLATION_OID,
sslot1.values[i],
sslot2.values[j])))
sslot1->values[i],
sslot2->values[j])))
{
hasmatch1[i] = hasmatch2[j] = true;
matchprodfreq += sslot1.numbers[i] * sslot2.numbers[j];
matchprodfreq += sslot1->numbers[i] * sslot2->numbers[j];
nmatches++;
break;
}
@ -2457,22 +2513,22 @@ eqjoinsel_inner(Oid operator,
CLAMP_PROBABILITY(matchprodfreq);
/* Sum up frequencies of matched and unmatched MCVs */
matchfreq1 = unmatchfreq1 = 0.0;
for (i = 0; i < sslot1.nvalues; i++)
for (i = 0; i < sslot1->nvalues; i++)
{
if (hasmatch1[i])
matchfreq1 += sslot1.numbers[i];
matchfreq1 += sslot1->numbers[i];
else
unmatchfreq1 += sslot1.numbers[i];
unmatchfreq1 += sslot1->numbers[i];
}
CLAMP_PROBABILITY(matchfreq1);
CLAMP_PROBABILITY(unmatchfreq1);
matchfreq2 = unmatchfreq2 = 0.0;
for (i = 0; i < sslot2.nvalues; i++)
for (i = 0; i < sslot2->nvalues; i++)
{
if (hasmatch2[i])
matchfreq2 += sslot2.numbers[i];
matchfreq2 += sslot2->numbers[i];
else
unmatchfreq2 += sslot2.numbers[i];
unmatchfreq2 += sslot2->numbers[i];
}
CLAMP_PROBABILITY(matchfreq2);
CLAMP_PROBABILITY(unmatchfreq2);
@ -2497,15 +2553,15 @@ eqjoinsel_inner(Oid operator,
* MCVs plus non-MCV values.
*/
totalsel1 = matchprodfreq;
if (nd2 > sslot2.nvalues)
totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - sslot2.nvalues);
if (nd2 > sslot2->nvalues)
totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - sslot2->nvalues);
if (nd2 > nmatches)
totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
(nd2 - nmatches);
/* Same estimate from the point of view of relation 2. */
totalsel2 = matchprodfreq;
if (nd1 > sslot1.nvalues)
totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - sslot1.nvalues);
if (nd1 > sslot1->nvalues)
totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - sslot1->nvalues);
if (nd1 > nmatches)
totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
(nd1 - nmatches);
@ -2550,9 +2606,6 @@ eqjoinsel_inner(Oid operator,
selec /= nd2;
}
free_attstatsslot(&sslot1);
free_attstatsslot(&sslot2);
return selec;
}
@ -2561,32 +2614,19 @@ eqjoinsel_inner(Oid operator,
*
* (Also used for anti join, which we are supposed to estimate the same way.)
* Caller has ensured that vardata1 is the LHS variable.
* Unlike eqjoinsel_inner, we have to cope with operator being InvalidOid.
* Unlike eqjoinsel_inner, we have to cope with opfuncoid being InvalidOid.
*/
static double
eqjoinsel_semi(Oid operator,
eqjoinsel_semi(Oid opfuncoid,
VariableStatData *vardata1, VariableStatData *vardata2,
double nd1, double nd2,
bool isdefault1, bool isdefault2,
AttStatsSlot *sslot1, AttStatsSlot *sslot2,
Form_pg_statistic stats1, Form_pg_statistic stats2,
bool have_mcvs1, bool have_mcvs2,
RelOptInfo *inner_rel)
{
double selec;
double nd1;
double nd2;
bool isdefault1;
bool isdefault2;
Oid opfuncoid;
Form_pg_statistic stats1 = NULL;
bool have_mcvs1 = false;
bool have_mcvs2 = false;
AttStatsSlot sslot1;
AttStatsSlot sslot2;
nd1 = get_variable_numdistinct(vardata1, &isdefault1);
nd2 = get_variable_numdistinct(vardata2, &isdefault2);
opfuncoid = OidIsValid(operator) ? get_opcode(operator) : InvalidOid;
memset(&sslot1, 0, sizeof(sslot1));
memset(&sslot2, 0, sizeof(sslot2));
/*
* We clamp nd2 to be not more than what we estimate the inner relation's
@ -2621,26 +2661,7 @@ eqjoinsel_semi(Oid operator,
isdefault2 = false;
}
if (HeapTupleIsValid(vardata1->statsTuple))
{
/* note we allow use of nullfrac regardless of security check */
stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
if (statistic_proc_security_check(vardata1, opfuncoid))
have_mcvs1 = get_attstatsslot(&sslot1, vardata1->statsTuple,
STATISTIC_KIND_MCV, InvalidOid,
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
}
if (HeapTupleIsValid(vardata2->statsTuple) &&
statistic_proc_security_check(vardata2, opfuncoid))
{
have_mcvs2 = get_attstatsslot(&sslot2, vardata2->statsTuple,
STATISTIC_KIND_MCV, InvalidOid,
ATTSTATSSLOT_VALUES);
/* note: currently don't need stanumbers from RHS */
}
if (have_mcvs1 && have_mcvs2 && OidIsValid(operator))
if (have_mcvs1 && have_mcvs2 && OidIsValid(opfuncoid))
{
/*
* We have most-common-value lists for both relations. Run through
@ -2663,15 +2684,15 @@ eqjoinsel_semi(Oid operator,
/*
* The clamping above could have resulted in nd2 being less than
* sslot2.nvalues; in which case, we assume that precisely the nd2
* sslot2->nvalues; in which case, we assume that precisely the nd2
* most common values in the relation will appear in the join input,
* and so compare to only the first nd2 members of the MCV list. Of
* course this is frequently wrong, but it's the best bet we can make.
*/
clamped_nvalues2 = Min(sslot2.nvalues, nd2);
clamped_nvalues2 = Min(sslot2->nvalues, nd2);
fmgr_info(opfuncoid, &eqproc);
hasmatch1 = (bool *) palloc0(sslot1.nvalues * sizeof(bool));
hasmatch1 = (bool *) palloc0(sslot1->nvalues * sizeof(bool));
hasmatch2 = (bool *) palloc0(clamped_nvalues2 * sizeof(bool));
/*
@ -2681,7 +2702,7 @@ eqjoinsel_semi(Oid operator,
* and because the math wouldn't add up...
*/
nmatches = 0;
for (i = 0; i < sslot1.nvalues; i++)
for (i = 0; i < sslot1->nvalues; i++)
{
int j;
@ -2691,8 +2712,8 @@ eqjoinsel_semi(Oid operator,
continue;
if (DatumGetBool(FunctionCall2Coll(&eqproc,
DEFAULT_COLLATION_OID,
sslot1.values[i],
sslot2.values[j])))
sslot1->values[i],
sslot2->values[j])))
{
hasmatch1[i] = hasmatch2[j] = true;
nmatches++;
@ -2702,10 +2723,10 @@ eqjoinsel_semi(Oid operator,
}
/* Sum up frequencies of matched MCVs */
matchfreq1 = 0.0;
for (i = 0; i < sslot1.nvalues; i++)
for (i = 0; i < sslot1->nvalues; i++)
{
if (hasmatch1[i])
matchfreq1 += sslot1.numbers[i];
matchfreq1 += sslot1->numbers[i];
}
CLAMP_PROBABILITY(matchfreq1);
pfree(hasmatch1);
@ -2760,9 +2781,6 @@ eqjoinsel_semi(Oid operator,
selec = 0.5 * (1.0 - nullfrac1);
}
free_attstatsslot(&sslot1);
free_attstatsslot(&sslot2);
return selec;
}

View File

@ -801,8 +801,8 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHER
EXPLAIN (COSTS OFF)
SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a;
QUERY PLAN
-------------------------------------------------------------------------------
QUERY PLAN
-------------------------------------------------------------------------
Sort
Sort Key: t1.a
-> Append
@ -831,19 +831,18 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (
Index Cond: (a = t1_4.b)
Filter: (b = 0)
-> Nested Loop
-> Unique
-> Sort
Sort Key: t1_5.b
-> Hash Semi Join
Hash Cond: (t1_5.b = ((t1_8.a + t1_8.b) / 2))
-> Seq Scan on prt2_p3 t1_5
-> Hash
-> Seq Scan on prt1_e_p3 t1_8
Filter: (c = 0)
-> HashAggregate
Group Key: t1_5.b
-> Hash Semi Join
Hash Cond: (t1_5.b = ((t1_8.a + t1_8.b) / 2))
-> Seq Scan on prt2_p3 t1_5
-> Hash
-> Seq Scan on prt1_e_p3 t1_8
Filter: (c = 0)
-> Index Scan using iprt1_p3_a on prt1_p3 t1_2
Index Cond: (a = t1_5.b)
Filter: (b = 0)
(40 rows)
(39 rows)
SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a;
a | b | c