Allow bitmap scans to operate as index-only scans when possible.

If we don't have to return any columns from heap tuples, and there's
no need to recheck qual conditions, and the heap page is all-visible,
then we can skip fetching the heap page altogether.
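
For example, with a hypothetical table like the one below (the demo table,
index, and data are invented here for illustration, not part of this commit),
a count(*) whose only restriction is the index qual can be answered without
touching the heap once VACUUM has set the visibility-map bits:

CREATE TABLE demo (id int, payload text);
INSERT INTO demo SELECT g, repeat('x', 100) FROM generate_series(1, 100000) g;
CREATE INDEX demo_id_idx ON demo (id);
VACUUM demo;                      -- sets all-visible bits in the visibility map
SET enable_seqscan = off;         -- force the bitmap path for demonstration;
SET enable_indexonlyscan = off;   -- the planner might otherwise pick other plans
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM demo WHERE id < 1000;

Here the scan returns no heap columns and needs no recheck, so each
all-visible page can be counted directly from the bitmap; the Buffers line
should show correspondingly fewer heap page reads.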

Skip prefetching pages too, when possible, on the assumption that the
recheck flag will remain the same from one page to the next.  While that
assumption is hardly bulletproof, it seems like a good bet most of the
time, and better than prefetching pages we don't need.
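
As a rough way to watch this in action (continuing the hypothetical demo
setup above), bitmap heap-scan prefetching is driven by
effective_io_concurrency, and with this change the PrefetchBuffer call is
simply omitted for pages we expect to skip:

SET effective_io_concurrency = 8;   -- request prefetching ahead of the scan
EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM demo WHERE id < 1000;

If the guess is wrong, the penalty is only a missed or wasted prefetch, never
a wrong answer.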

This commit installs the executor infrastructure, but doesn't change
any planner cost estimates, thus possibly causing bitmap scans to
not be chosen in cases where this change renders them the best choice.
I (tgl) am not entirely convinced that we need to account for this
behavior in the planner, because I think typically the bitmap scan would
get chosen anyway if it's the best bet.  In any case the submitted patch
took way too many shortcuts, resulting in too many clearly-bad choices,
to be committable.
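
One way to judge whether that matters for a given query (again using the
hypothetical demo table) is to compare the planner's estimates with and
without the bitmap path available:

EXPLAIN SELECT count(*) FROM demo WHERE id < 1000;
SET enable_bitmapscan = off;
EXPLAIN SELECT count(*) FROM demo WHERE id < 1000;   -- cost of next-best plan
RESET enable_bitmapscan;

If the bitmap plan already wins on the unchanged cost estimates, the missing
planner accounting is moot for that query.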

Alexander Kuzmenkov, reviewed by Alexey Chernyshov, and whacked around
rather heavily by me.

Discussion: https://postgr.es/m/239a8955-c0fc-f506-026d-c837e86c827b@postgrespro.ru
Tom Lane 2017-11-01 17:38:12 -04:00
parent ec7ce54204
commit 7c70996ebf
5 changed files with 152 additions and 42 deletions

src/backend/executor/nodeBitmapHeapscan.c

@@ -39,6 +39,7 @@
 #include "access/relscan.h"
 #include "access/transam.h"
+#include "access/visibilitymap.h"
 #include "executor/execdebug.h"
 #include "executor/nodeBitmapHeapscan.h"
 #include "miscadmin.h"
@@ -225,9 +226,31 @@ BitmapHeapNext(BitmapHeapScanState *node)
        }

        /*
-        * Fetch the current heap page and identify candidate tuples.
+        * We can skip fetching the heap page if we don't need any fields
+        * from the heap, and the bitmap entries don't need rechecking,
+        * and all tuples on the page are visible to our transaction.
         */
-       bitgetpage(scan, tbmres);
+       node->skip_fetch = (node->can_skip_fetch &&
+                           !tbmres->recheck &&
+                           VM_ALL_VISIBLE(node->ss.ss_currentRelation,
+                                          tbmres->blockno,
+                                          &node->vmbuffer));
+       if (node->skip_fetch)
+       {
+           /*
+            * The number of tuples on this page is put into
+            * scan->rs_ntuples; note we don't fill scan->rs_vistuples.
+            */
+           scan->rs_ntuples = tbmres->ntuples;
+       }
+       else
+       {
+           /*
+            * Fetch the current heap page and identify candidate tuples.
+            */
+           bitgetpage(scan, tbmres);
+       }

        if (tbmres->ntuples >= 0)
            node->exact_pages++;
@@ -289,45 +312,55 @@ BitmapHeapNext(BitmapHeapScanState *node)
         */
        BitmapPrefetch(node, scan);

-       /*
-        * Okay to fetch the tuple
-        */
-       targoffset = scan->rs_vistuples[scan->rs_cindex];
-       dp = (Page) BufferGetPage(scan->rs_cbuf);
-       lp = PageGetItemId(dp, targoffset);
-       Assert(ItemIdIsNormal(lp));
-
-       scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
-       scan->rs_ctup.t_len = ItemIdGetLength(lp);
-       scan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
-       ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);
-
-       pgstat_count_heap_fetch(scan->rs_rd);
-
-       /*
-        * Set up the result slot to point to this tuple. Note that the slot
-        * acquires a pin on the buffer.
-        */
-       ExecStoreTuple(&scan->rs_ctup,
-                      slot,
-                      scan->rs_cbuf,
-                      false);
-
-       /*
-        * If we are using lossy info, we have to recheck the qual conditions
-        * at every tuple.
-        */
-       if (tbmres->recheck)
+       if (node->skip_fetch)
        {
-           econtext->ecxt_scantuple = slot;
-           ResetExprContext(econtext);
+           /*
+            * If we don't have to fetch the tuple, just return nulls.
+            */
+           ExecStoreAllNullTuple(slot);
+       }
+       else
+       {
+           /*
+            * Okay to fetch the tuple.
+            */
+           targoffset = scan->rs_vistuples[scan->rs_cindex];
+           dp = (Page) BufferGetPage(scan->rs_cbuf);
+           lp = PageGetItemId(dp, targoffset);
+           Assert(ItemIdIsNormal(lp));

-           if (!ExecQual(node->bitmapqualorig, econtext))
+           scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
+           scan->rs_ctup.t_len = ItemIdGetLength(lp);
+           scan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
+           ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);
+           pgstat_count_heap_fetch(scan->rs_rd);
+
+           /*
+            * Set up the result slot to point to this tuple. Note that the
+            * slot acquires a pin on the buffer.
+            */
+           ExecStoreTuple(&scan->rs_ctup,
+                          slot,
+                          scan->rs_cbuf,
+                          false);
+
+           /*
+            * If we are using lossy info, we have to recheck the qual
+            * conditions at every tuple.
+            */
+           if (tbmres->recheck)
            {
-               /* Fails recheck, so drop it and loop back for another */
-               InstrCountFiltered2(node, 1);
-               ExecClearTuple(slot);
-               continue;
+               econtext->ecxt_scantuple = slot;
+               ResetExprContext(econtext);
+
+               if (!ExecQual(node->bitmapqualorig, econtext))
+               {
+                   /* Fails recheck, so drop it and loop back for another */
+                   InstrCountFiltered2(node, 1);
+                   ExecClearTuple(slot);
+                   continue;
+               }
            }
        }
@@ -582,6 +615,7 @@ BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan)
            while (node->prefetch_pages < node->prefetch_target)
            {
                TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
+               bool        skip_fetch;

                if (tbmpre == NULL)
                {
@@ -591,7 +625,26 @@ BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan)
                    break;
                }
                node->prefetch_pages++;
-               PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
+
+               /*
+                * If we expect not to have to actually read this heap page,
+                * skip this prefetch call, but continue to run the prefetch
+                * logic normally.  (Would it be better not to increment
+                * prefetch_pages?)
+                *
+                * This depends on the assumption that the index AM will
+                * report the same recheck flag for this future heap page as
+                * it did for the current heap page; which is not a certainty
+                * but is true in many cases.
+                */
+               skip_fetch = (node->can_skip_fetch &&
+                             (node->tbmres ? !node->tbmres->recheck : false) &&
+                             VM_ALL_VISIBLE(node->ss.ss_currentRelation,
+                                            tbmpre->blockno,
+                                            &node->pvmbuffer));
+
+               if (!skip_fetch)
+                   PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
            }
        }
@@ -608,6 +661,7 @@ BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan)
        {
            TBMIterateResult *tbmpre;
            bool        do_prefetch = false;
+           bool        skip_fetch;

            /*
             * Recheck under the mutex. If some other process has already
@@ -633,7 +687,15 @@ BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan)
                    break;
                }

-               PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
+               /* As above, skip prefetch if we expect not to need page */
+               skip_fetch = (node->can_skip_fetch &&
+                             (node->tbmres ? !node->tbmres->recheck : false) &&
+                             VM_ALL_VISIBLE(node->ss.ss_currentRelation,
+                                            tbmpre->blockno,
+                                            &node->pvmbuffer));
+
+               if (!skip_fetch)
+                   PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
            }
        }
    }
@@ -687,6 +749,7 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
    /* rescan to release any page pin */
    heap_rescan(node->ss.ss_currentScanDesc, NULL);

+   /* release bitmaps and buffers if any */
    if (node->tbmiterator)
        tbm_end_iterate(node->tbmiterator);
    if (node->prefetch_iterator)
@@ -697,6 +760,10 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
        tbm_end_shared_iterate(node->shared_prefetch_iterator);
    if (node->tbm)
        tbm_free(node->tbm);
+   if (node->vmbuffer != InvalidBuffer)
+       ReleaseBuffer(node->vmbuffer);
+   if (node->pvmbuffer != InvalidBuffer)
+       ReleaseBuffer(node->pvmbuffer);
    node->tbm = NULL;
    node->tbmiterator = NULL;
    node->tbmres = NULL;
@@ -704,6 +771,8 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
    node->initialized = false;
    node->shared_tbmiterator = NULL;
    node->shared_prefetch_iterator = NULL;
+   node->vmbuffer = InvalidBuffer;
+   node->pvmbuffer = InvalidBuffer;

    ExecScanReScan(&node->ss);
@@ -748,7 +817,7 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
    ExecEndNode(outerPlanState(node));

    /*
-    * release bitmap if any
+    * release bitmaps and buffers if any
     */
    if (node->tbmiterator)
        tbm_end_iterate(node->tbmiterator);
@@ -760,6 +829,10 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
        tbm_end_shared_iterate(node->shared_tbmiterator);
    if (node->shared_prefetch_iterator)
        tbm_end_shared_iterate(node->shared_prefetch_iterator);
+   if (node->vmbuffer != InvalidBuffer)
+       ReleaseBuffer(node->vmbuffer);
+   if (node->pvmbuffer != InvalidBuffer)
+       ReleaseBuffer(node->pvmbuffer);

    /*
     * close heap scan
@@ -805,6 +878,9 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
    scanstate->tbm = NULL;
    scanstate->tbmiterator = NULL;
    scanstate->tbmres = NULL;
+   scanstate->skip_fetch = false;
+   scanstate->vmbuffer = InvalidBuffer;
+   scanstate->pvmbuffer = InvalidBuffer;
    scanstate->exact_pages = 0;
    scanstate->lossy_pages = 0;
    scanstate->prefetch_iterator = NULL;
@@ -815,8 +891,19 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
    scanstate->pscan_len = 0;
    scanstate->initialized = false;
    scanstate->shared_tbmiterator = NULL;
+   scanstate->shared_prefetch_iterator = NULL;
    scanstate->pstate = NULL;

+   /*
+    * We can potentially skip fetching heap pages if we do not need any
+    * columns of the table, either for checking non-indexable quals or for
+    * returning data.  This test is a bit simplistic, as it checks the
+    * stronger condition that there's no qual or return tlist at all.  But
+    * in most cases it's probably not worth working harder than that.
+    */
+   scanstate->can_skip_fetch = (node->scan.plan.qual == NIL &&
+                                node->scan.plan.targetlist == NIL);
+
    /*
     * Miscellaneous initialization
     *

src/backend/optimizer/plan/createplan.c

@@ -807,6 +807,15 @@ use_physical_tlist(PlannerInfo *root, Path *path, int flags)
    if (IsA(path, CustomPath))
        return false;

+   /*
+    * If a bitmap scan's tlist is empty, keep it as-is.  This may allow the
+    * executor to skip heap page fetches, and in any case, the benefit of
+    * using a physical tlist instead would be minimal.
+    */
+   if (IsA(path, BitmapHeapPath) &&
+       path->pathtarget->exprs == NIL)
+       return false;
+
    /*
     * Can't do it if any system columns or whole-row Vars are requested.
     * (This could possibly be fixed but would take some fragile assumptions

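The effect of keeping the tlist empty can be seen with the hypothetical demo
table from above: EXPLAIN VERBOSE prints no Output line for a scan node whose
targetlist is empty, and an empty qual and targetlist are exactly what
ExecInitBitmapHeapScan tests to set can_skip_fetch.

EXPLAIN (VERBOSE) SELECT count(*) FROM demo WHERE id < 1000;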
src/include/nodes/execnodes.h

@@ -507,7 +507,7 @@ typedef struct EState
    bool       *es_epqTupleSet; /* true if EPQ tuple is provided */
    bool       *es_epqScanDone; /* true if EPQ tuple has been fetched */

-   bool        es_use_parallel_mode;   /* can we use parallel workers? */
+   bool        es_use_parallel_mode; /* can we use parallel workers? */

    /* The per-query shared memory area to use for parallel execution. */
    struct dsa_area *es_query_dsa;
@@ -1331,6 +1331,10 @@ typedef struct ParallelBitmapHeapState
 *     tbm                bitmap obtained from child index scan(s)
 *     tbmiterator        iterator for scanning current pages
 *     tbmres             current-page data
+ *     can_skip_fetch     can we potentially skip tuple fetches in this scan?
+ *     skip_fetch         are we skipping tuple fetches on this page?
+ *     vmbuffer           buffer for visibility-map lookups
+ *     pvmbuffer          ditto, for prefetched pages
 *     exact_pages        total number of exact pages retrieved
 *     lossy_pages        total number of lossy pages retrieved
 *     prefetch_iterator  iterator for prefetching ahead of current page
@@ -1351,6 +1355,10 @@ typedef struct BitmapHeapScanState
    TIDBitmap  *tbm;
    TBMIterator *tbmiterator;
    TBMIterateResult *tbmres;
+   bool        can_skip_fetch;
+   bool        skip_fetch;
+   Buffer      vmbuffer;
+   Buffer      pvmbuffer;
    long        exact_pages;
    long        lossy_pages;
    TBMIterator *prefetch_iterator;

src/test/regress/expected/stats.out

@@ -136,12 +136,15 @@ SELECT count(*) FROM tenk2;
 (1 row)

 -- do an indexscan
+-- make sure it is not a bitmap scan, which might skip fetching heap tuples
+SET enable_bitmapscan TO off;
 SELECT count(*) FROM tenk2 WHERE unique1 = 1;
  count
 -------
      1
 (1 row)

+RESET enable_bitmapscan;
 -- We can't just call wait_for_stats() at this point, because we only
 -- transmit stats when the session goes idle, and we probably didn't
 -- transmit the last couple of counts yet thanks to the rate-limiting logic

src/test/regress/sql/stats.sql

@@ -138,7 +138,10 @@ ROLLBACK;
 -- do a seqscan
 SELECT count(*) FROM tenk2;
 -- do an indexscan
+-- make sure it is not a bitmap scan, which might skip fetching heap tuples
+SET enable_bitmapscan TO off;
 SELECT count(*) FROM tenk2 WHERE unique1 = 1;
+RESET enable_bitmapscan;
 -- We can't just call wait_for_stats() at this point, because we only
 -- transmit stats when the session goes idle, and we probably didn't