Extract column statistics from CTE references, if possible.

examine_simple_variable() left this as an unimplemented case years
ago, with the result that plans for queries involving un-flattened
CTEs might be much stupider than necessary.  It's not hard to extend
the existing logic for RTE_SUBQUERY cases to also be able to drill
down into CTEs, so let's do that.

There was some discussion of whether this patch breaks the idea
of a MATERIALIZED CTE being an optimization fence.  We concluded
it's okay, because we already allow the outer planner level to
see the estimated width and rowcount of the CTE result, and
letting it see column statistics too seems fairly equivalent.
Basically, what we expect of the optimization fence is that the
outer query should not affect the plan chosen for the CTE query.
Once that plan is chosen, it's okay for the outer planner level
to make use of whatever information we have about it.

Jian Guo and Tom Lane, per complaint from Hans Buschmann

Discussion: https://postgr.es/m/4504e67078d648cdac3651b2960da6e7@nidsa.net
This commit is contained in:
Tom Lane 2023-11-17 14:36:23 -05:00
parent 06c70849fb
commit f7816aec23
3 changed files with 124 additions and 35 deletions

View File

@ -5363,7 +5363,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
* Handle a simple Var for examine_variable
*
* This is split out as a subroutine so that we can recurse to deal with
* Vars referencing subqueries.
* Vars referencing subqueries (either sub-SELECT-in-FROM or CTE style).
*
* We already filled in all the fields of *vardata except for the stats tuple.
*/
@ -5497,13 +5497,19 @@ examine_simple_variable(PlannerInfo *root, Var *var,
vardata->acl_ok = true;
}
}
else if (rte->rtekind == RTE_SUBQUERY && !rte->inh)
else if ((rte->rtekind == RTE_SUBQUERY && !rte->inh) ||
(rte->rtekind == RTE_CTE && !rte->self_reference))
{
/*
* Plain subquery (not one that was converted to an appendrel).
* Plain subquery (not one that was converted to an appendrel) or
* non-recursive CTE. In either case, we can try to find out what the
* Var refers to within the subquery. We skip this for appendrel and
* recursive-CTE cases because any column stats we did find would
* likely not be very relevant.
*/
Query *subquery = rte->subquery;
RelOptInfo *rel;
PlannerInfo *subroot;
Query *subquery;
List *subtlist;
TargetEntry *ste;
/*
@ -5512,6 +5518,85 @@ examine_simple_variable(PlannerInfo *root, Var *var,
if (var->varattno == InvalidAttrNumber)
return;
/*
* Otherwise, find the subquery's planner subroot.
*/
if (rte->rtekind == RTE_SUBQUERY)
{
RelOptInfo *rel;
/*
* Fetch RelOptInfo for subquery. Note that we don't change the
* rel returned in vardata, since caller expects it to be a rel of
* the caller's query level. Because we might already be
* recursing, we can't use that rel pointer either, but have to
* look up the Var's rel afresh.
*/
rel = find_base_rel(root, var->varno);
subroot = rel->subroot;
}
else
{
/* CTE case is more difficult */
PlannerInfo *cteroot;
Index levelsup;
int ndx;
int plan_id;
ListCell *lc;
/*
* Find the referenced CTE, and locate the subroot previously made
* for it.
*/
levelsup = rte->ctelevelsup;
cteroot = root;
while (levelsup-- > 0)
{
cteroot = cteroot->parent_root;
if (!cteroot) /* shouldn't happen */
elog(ERROR, "bad levelsup for CTE \"%s\"", rte->ctename);
}
/*
* Note: cte_plan_ids can be shorter than cteList, if we are still
* working on planning the CTEs (ie, this is a side-reference from
* another CTE). So we mustn't use forboth here.
*/
ndx = 0;
foreach(lc, cteroot->parse->cteList)
{
CommonTableExpr *cte = (CommonTableExpr *) lfirst(lc);
if (strcmp(cte->ctename, rte->ctename) == 0)
break;
ndx++;
}
if (lc == NULL) /* shouldn't happen */
elog(ERROR, "could not find CTE \"%s\"", rte->ctename);
if (ndx >= list_length(cteroot->cte_plan_ids))
elog(ERROR, "could not find plan for CTE \"%s\"", rte->ctename);
plan_id = list_nth_int(cteroot->cte_plan_ids, ndx);
if (plan_id <= 0)
elog(ERROR, "no plan was made for CTE \"%s\"", rte->ctename);
subroot = list_nth(root->glob->subroots, plan_id - 1);
}
/* If the subquery hasn't been planned yet, we have to punt */
if (subroot == NULL)
return;
Assert(IsA(subroot, PlannerInfo));
/*
* We must use the subquery parsetree as mangled by the planner, not
* the raw version from the RTE, because we need a Var that will refer
* to the subroot's live RelOptInfos. For instance, if any subquery
* pullup happened during planning, Vars in the targetlist might have
* gotten replaced, and we need to see the replacement expressions.
*/
subquery = subroot->parse;
Assert(IsA(subquery, Query));
/*
* Punt if subquery uses set operations or GROUP BY, as these will
* mash underlying columns' stats beyond recognition. (Set ops are
@ -5525,33 +5610,12 @@ examine_simple_variable(PlannerInfo *root, Var *var,
subquery->groupingSets)
return;
/*
* OK, fetch RelOptInfo for subquery. Note that we don't change the
* rel returned in vardata, since caller expects it to be a rel of the
* caller's query level. Because we might already be recursing, we
* can't use that rel pointer either, but have to look up the Var's
* rel afresh.
*/
rel = find_base_rel(root, var->varno);
/* If the subquery hasn't been planned yet, we have to punt */
if (rel->subroot == NULL)
return;
Assert(IsA(rel->subroot, PlannerInfo));
/*
* Switch our attention to the subquery as mangled by the planner. It
* was okay to look at the pre-planning version for the tests above,
* but now we need a Var that will refer to the subroot's live
* RelOptInfos. For instance, if any subquery pullup happened during
* planning, Vars in the targetlist might have gotten replaced, and we
* need to see the replacement expressions.
*/
subquery = rel->subroot->parse;
Assert(IsA(subquery, Query));
/* Get the subquery output expression referenced by the upper Var */
ste = get_tle_by_resno(subquery->targetList, var->varattno);
if (subquery->returningList)
subtlist = subquery->returningList;
else
subtlist = subquery->targetList;
ste = get_tle_by_resno(subtlist, var->varattno);
if (ste == NULL || ste->resjunk)
elog(ERROR, "subquery %s does not have attribute %d",
rte->eref->aliasname, var->varattno);
@ -5599,16 +5663,16 @@ examine_simple_variable(PlannerInfo *root, Var *var,
* if the underlying column is unique, the subquery may have
* joined to other tables in a way that creates duplicates.
*/
examine_simple_variable(rel->subroot, var, vardata);
examine_simple_variable(subroot, var, vardata);
}
}
else
{
/*
* Otherwise, the Var comes from a FUNCTION, VALUES, or CTE RTE. (We
* won't see RTE_JOIN here because join alias Vars have already been
* Otherwise, the Var comes from a FUNCTION or VALUES RTE. (We won't
* see RTE_JOIN here because join alias Vars have already been
* flattened.) There's not much we can do with function outputs, but
* maybe someday try to be smarter about VALUES and/or CTEs.
* maybe someday try to be smarter about VALUES.
*/
}
}

View File

@ -636,6 +636,24 @@ SELECT t1.id, t2.path, t2 FROM t AS t1 JOIN t AS t2 ON
16 | {3,7,11,16} | (16,"{3,7,11,16}")
(16 rows)
-- test that column statistics from a materialized CTE are available
-- to upper planner (otherwise, we'd get a stupider plan)
explain (costs off)
with x as materialized (select unique1 from tenk1 b)
select count(*) from tenk1 a
where unique1 in (select * from x);
QUERY PLAN
------------------------------------------------------------
Aggregate
CTE x
-> Index Only Scan using tenk1_unique1 on tenk1 b
-> Hash Semi Join
Hash Cond: (a.unique1 = x.unique1)
-> Index Only Scan using tenk1_unique1 on tenk1 a
-> Hash
-> CTE Scan on x
(8 rows)
-- SEARCH clause
create temp table graph0( f int, t int, label text );
insert into graph0 values

View File

@ -347,6 +347,13 @@ UNION ALL
SELECT t1.id, t2.path, t2 FROM t AS t1 JOIN t AS t2 ON
(t1.id=t2.id);
-- test that column statistics from a materialized CTE are available
-- to upper planner (otherwise, we'd get a stupider plan)
explain (costs off)
with x as materialized (select unique1 from tenk1 b)
select count(*) from tenk1 a
where unique1 in (select * from x);
-- SEARCH clause
create temp table graph0( f int, t int, label text );