Fix get_actual_variable_range() to cope with broken HOT chains.

Commit 3ca930fc3 modified get_actual_variable_range() to use a new
"SnapshotNonVacuumable" snapshot type for selecting tuples that it
would consider valid.  However, because that snapshot type can accept
recently-dead tuples, this caused a bug when using a recently-created
index: we might accept a recently-dead tuple that is an early member
of a broken HOT chain and does not actually match the index entry.
Then, the data extracted from the heap tuple would not necessarily be
an endpoint value of the column; it could even be NULL, leading to
get_actual_variable_range() itself reporting "found unexpected null
value in index".  Even without an error, this could lead to poor
plan choices due to an erroneous notion of the endpoint value.

We can improve matters by changing the code to use the index-only
scan technique (which didn't exist when get_actual_variable_range was
originally written).  If any of the tuples in a HOT chain are live
enough to satisfy SnapshotNonVacuumable, we take the data from the
index entry, ignoring what is in the heap.  This fixes the problem
without changing the live-vs-dead-tuple behavior from what was
intended by commit 3ca930fc3.

A side benefit is that for static tables we might not have to touch
the heap at all (when the extremal value is in an all-visible page).
In addition, we can save some overhead by not having to create a
complete ExecutorState, and we don't need to run FormIndexDatum,
avoiding more cycles as well as the possibility of failure for
indexes on expressions.  (I'm not sure that this code would ever
be used to determine the extreme value of an expression, in the
current state of the planner; but it's definitely possible that
lower-order columns of the selected index could be expressions.
So one could construct perhaps-artificial examples in which the
old code unexpectedly failed due to trying to compute an
expression's value for a now-dead row.)

Per report from Manuel Rigger.  Back-patch to v11 where commit
3ca930fc3 came in.

Discussion: https://postgr.es/m/CA+u7OA7W4NWEhCvftdV6_8bbm2vgypi5nuxfnSEJQqVKFSUoMg@mail.gmail.com
This commit is contained in:
Tom Lane 2019-07-12 16:24:59 -04:00
parent cfde234939
commit d3751adcf1
1 changed files with 177 additions and 104 deletions

View File

@ -103,17 +103,14 @@
#include "access/brin.h"
#include "access/gin.h"
#include "access/htup_details.h"
#include "access/sysattr.h"
#include "access/table.h"
#include "access/tableam.h"
#include "catalog/index.h"
#include "access/visibilitymap.h"
#include "catalog/pg_am.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_statistic_ext.h"
#include "executor/executor.h"
#include "executor/nodeAgg.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
@ -127,12 +124,14 @@
#include "parser/parse_clause.h"
#include "parser/parsetree.h"
#include "statistics/statistics.h"
#include "storage/bufmgr.h"
#include "utils/builtins.h"
#include "utils/date.h"
#include "utils/datum.h"
#include "utils/fmgroids.h"
#include "utils/index_selfuncs.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_locale.h"
#include "utils/rel.h"
#include "utils/selfuncs.h"
@ -198,6 +197,15 @@ static bool get_actual_variable_range(PlannerInfo *root,
VariableStatData *vardata,
Oid sortop,
Datum *min, Datum *max);
static bool get_actual_variable_endpoint(Relation heapRel,
Relation indexRel,
ScanDirection indexscandir,
ScanKey scankeys,
int16 typLen,
bool typByVal,
TupleTableSlot *tableslot,
MemoryContext outercontext,
Datum *endpointDatum);
static RelOptInfo *find_join_input_rel(PlannerInfo *root, Relids relids);
@ -5180,30 +5188,23 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
}
/*
* Found a suitable index to extract data from. We'll need an EState
* and a bunch of other infrastructure.
* Found a suitable index to extract data from. Set up some data that
* can be used by both invocations of get_actual_variable_endpoint.
*/
{
EState *estate;
ExprContext *econtext;
MemoryContext tmpcontext;
MemoryContext oldcontext;
Relation heapRel;
Relation indexRel;
IndexInfo *indexInfo;
TupleTableSlot *slot;
int16 typLen;
bool typByVal;
ScanKeyData scankeys[1];
IndexScanDesc index_scan;
Datum values[INDEX_MAX_KEYS];
bool isnull[INDEX_MAX_KEYS];
SnapshotData SnapshotNonVacuumable;
estate = CreateExecutorState();
econtext = GetPerTupleExprContext(estate);
/* Make sure any cruft is generated in the econtext's memory */
tmpcontext = econtext->ecxt_per_tuple_memory;
/* Make sure any cruft gets recycled when we're done */
tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
"get_actual_variable_range workspace",
ALLOCSET_DEFAULT_SIZES);
oldcontext = MemoryContextSwitchTo(tmpcontext);
/*
@ -5213,14 +5214,9 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
heapRel = table_open(rte->relid, NoLock);
indexRel = index_open(index->indexoid, NoLock);
/* extract index key information from the index's pg_index info */
indexInfo = BuildIndexInfo(indexRel);
/* some other stuff */
/* build some stuff needed for indexscan execution */
slot = table_slot_create(heapRel, NULL);
econtext->ecxt_scantuple = slot;
get_typlenbyval(vardata->atttype, &typLen, &typByVal);
InitNonVacuumableSnapshot(SnapshotNonVacuumable, RecentGlobalXmin);
/* set up an IS NOT NULL scan key so that we ignore nulls */
ScanKeyEntryInitialize(&scankeys[0],
@ -5232,94 +5228,38 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
InvalidOid, /* no reg proc for this */
(Datum) 0); /* constant */
have_data = true;
/* If min is requested ... */
if (min)
{
/*
* In principle, we should scan the index with our current
* active snapshot, which is the best approximation we've got
* to what the query will see when executed. But that won't
* be exact if a new snap is taken before running the query,
* and it can be very expensive if a lot of recently-dead or
* uncommitted rows exist at the beginning or end of the index
* (because we'll laboriously fetch each one and reject it).
* Instead, we use SnapshotNonVacuumable. That will accept
* recently-dead and uncommitted rows as well as normal
* visible rows. On the other hand, it will reject known-dead
* rows, and thus not give a bogus answer when the extreme
* value has been deleted (unless the deletion was quite
* recent); that case motivates not using SnapshotAny here.
*
* A crucial point here is that SnapshotNonVacuumable, with
* RecentGlobalXmin as horizon, yields the inverse of the
* condition that the indexscan will use to decide that index
* entries are killable (see heap_hot_search_buffer()).
* Therefore, if the snapshot rejects a tuple and we have to
* continue scanning past it, we know that the indexscan will
* mark that index entry killed. That means that the next
* get_actual_variable_range() call will not have to visit
* that heap entry. In this way we avoid repetitive work when
* this function is used a lot during planning.
*/
index_scan = index_beginscan(heapRel, indexRel,
&SnapshotNonVacuumable,
1, 0);
index_rescan(index_scan, scankeys, 1, NULL, 0);
/* Fetch first tuple in sortop's direction */
if (index_getnext_slot(index_scan, indexscandir, slot))
{
/* Extract the index column values from the slot */
FormIndexDatum(indexInfo, slot, estate,
values, isnull);
/* Shouldn't have got a null, but be careful */
if (isnull[0])
elog(ERROR, "found unexpected null value in index \"%s\"",
RelationGetRelationName(indexRel));
/* Copy the index column value out to caller's context */
MemoryContextSwitchTo(oldcontext);
*min = datumCopy(values[0], typByVal, typLen);
MemoryContextSwitchTo(tmpcontext);
}
else
have_data = false;
index_endscan(index_scan);
have_data = get_actual_variable_endpoint(heapRel,
indexRel,
indexscandir,
scankeys,
typLen,
typByVal,
slot,
oldcontext,
min);
}
else
{
/* If min not requested, assume index is nonempty */
have_data = true;
}
/* If max is requested, and we didn't find the index is empty */
if (max && have_data)
{
index_scan = index_beginscan(heapRel, indexRel,
&SnapshotNonVacuumable,
1, 0);
index_rescan(index_scan, scankeys, 1, NULL, 0);
/* Fetch first tuple in reverse direction */
if (index_getnext_slot(index_scan, -indexscandir, slot))
{
/* Extract the index column values from the slot */
FormIndexDatum(indexInfo, slot, estate,
values, isnull);
/* Shouldn't have got a null, but be careful */
if (isnull[0])
elog(ERROR, "found unexpected null value in index \"%s\"",
RelationGetRelationName(indexRel));
/* Copy the index column value out to caller's context */
MemoryContextSwitchTo(oldcontext);
*max = datumCopy(values[0], typByVal, typLen);
MemoryContextSwitchTo(tmpcontext);
}
else
have_data = false;
index_endscan(index_scan);
/* scan in the opposite direction; all else is the same */
have_data = get_actual_variable_endpoint(heapRel,
indexRel,
-indexscandir,
scankeys,
typLen,
typByVal,
slot,
oldcontext,
max);
}
/* Clean everything up */
@ -5329,7 +5269,7 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
table_close(heapRel, NoLock);
MemoryContextSwitchTo(oldcontext);
FreeExecutorState(estate);
MemoryContextDelete(tmpcontext);
/* And we're done */
break;
@ -5339,6 +5279,139 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
return have_data;
}
/*
* Get one endpoint datum (min or max depending on indexscandir) from the
* specified index. Return true if successful, false if index is empty.
* On success, endpoint value is stored to *endpointDatum (and copied into
* outercontext).
*
* scankeys is a 1-element scankey array set up to reject nulls.
* typLen/typByVal describe the datatype of the index's first column.
* tableslot is a slot suitable to hold table tuples, in case we need
* to probe the heap.
* (We could compute these values locally, but that would mean computing them
* twice when get_actual_variable_range needs both the min and the max.)
*/
static bool
get_actual_variable_endpoint(Relation heapRel,
Relation indexRel,
ScanDirection indexscandir,
ScanKey scankeys,
int16 typLen,
bool typByVal,
TupleTableSlot *tableslot,
MemoryContext outercontext,
Datum *endpointDatum)
{
bool have_data = false;
SnapshotData SnapshotNonVacuumable;
IndexScanDesc index_scan;
Buffer vmbuffer = InvalidBuffer;
ItemPointer tid;
Datum values[INDEX_MAX_KEYS];
bool isnull[INDEX_MAX_KEYS];
MemoryContext oldcontext;
/*
* We use the index-only-scan machinery for this. With mostly-static
* tables that's a win because it avoids a heap visit. It's also a win
* for dynamic data, but the reason is less obvious; read on for details.
*
* In principle, we should scan the index with our current active
* snapshot, which is the best approximation we've got to what the query
* will see when executed. But that won't be exact if a new snap is taken
* before running the query, and it can be very expensive if a lot of
* recently-dead or uncommitted rows exist at the beginning or end of the
* index (because we'll laboriously fetch each one and reject it).
* Instead, we use SnapshotNonVacuumable. That will accept recently-dead
* and uncommitted rows as well as normal visible rows. On the other
* hand, it will reject known-dead rows, and thus not give a bogus answer
* when the extreme value has been deleted (unless the deletion was quite
* recent); that case motivates not using SnapshotAny here.
*
* A crucial point here is that SnapshotNonVacuumable, with
* RecentGlobalXmin as horizon, yields the inverse of the condition that
* the indexscan will use to decide that index entries are killable (see
* heap_hot_search_buffer()). Therefore, if the snapshot rejects a tuple
* (or more precisely, all tuples of a HOT chain) and we have to continue
* scanning past it, we know that the indexscan will mark that index entry
* killed. That means that the next get_actual_variable_endpoint() call
* will not have to re-consider that index entry. In this way we avoid
* repetitive work when this function is used a lot during planning.
*
* But using SnapshotNonVacuumable creates a hazard of its own. In a
* recently-created index, some index entries may point at "broken" HOT
* chains in which not all the tuple versions contain data matching the
* index entry. The live tuple version(s) certainly do match the index,
* but SnapshotNonVacuumable can accept recently-dead tuple versions that
* don't match. Hence, if we took data from the selected heap tuple, we
* might get a bogus answer that's not close to the index extremal value,
* or could even be NULL. We avoid this hazard because we take the data
* from the index entry not the heap.
*/
InitNonVacuumableSnapshot(SnapshotNonVacuumable, RecentGlobalXmin);
index_scan = index_beginscan(heapRel, indexRel,
&SnapshotNonVacuumable,
1, 0);
/* Set it up for index-only scan */
index_scan->xs_want_itup = true;
index_rescan(index_scan, scankeys, 1, NULL, 0);
/* Fetch first/next tuple in specified direction */
while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL)
{
if (!VM_ALL_VISIBLE(heapRel,
ItemPointerGetBlockNumber(tid),
&vmbuffer))
{
/* Rats, we have to visit the heap to check visibility */
if (!index_fetch_heap(index_scan, tableslot))
continue; /* no visible tuple, try next index entry */
/* We don't actually need the heap tuple for anything */
ExecClearTuple(tableslot);
/*
* We don't care whether there's more than one visible tuple in
* the HOT chain; if any are visible, that's good enough.
*/
}
/*
* We expect that btree will return data in IndexTuple not HeapTuple
* format. It's not lossy either.
*/
if (!index_scan->xs_itup)
elog(ERROR, "no data returned for index-only scan");
if (index_scan->xs_recheck)
elog(ERROR, "unexpected recheck indication from btree");
/* OK to deconstruct the index tuple */
index_deform_tuple(index_scan->xs_itup,
index_scan->xs_itupdesc,
values, isnull);
/* Shouldn't have got a null, but be careful */
if (isnull[0])
elog(ERROR, "found unexpected null value in index \"%s\"",
RelationGetRelationName(indexRel));
/* Copy the index column value out to caller's context */
oldcontext = MemoryContextSwitchTo(outercontext);
*endpointDatum = datumCopy(values[0], typByVal, typLen);
MemoryContextSwitchTo(oldcontext);
have_data = true;
break;
}
if (vmbuffer != InvalidBuffer)
ReleaseBuffer(vmbuffer);
index_endscan(index_scan);
return have_data;
}
/*
* find_join_input_rel
* Look up the input relation for a join.