Fix deduplication "single value" strategy bug.

It was possible for deduplication's single value strategy to mistakenly
believe that a very small duplicate tuple counts as one of the six large
tuples that it aims to leave behind after the page finally splits.  This
could cause slightly suboptimal space utilization with very low
cardinality indexes, though only under fairly narrow conditions.

To fix, be particular about what kind of tuple counts as a
maxpostingsize-capped tuple.  This avoids confusion in the event of a
small tuple that gets "wedged" between two large tuples, where all
tuples on the page are duplicates of the same value.

Discussion: https://postgr.es/m/CAH2-Wz=Y+sgSFc-O3LpiZX-POx2bC+okec2KafERHuzdVa7-rQ@mail.gmail.com
Backpatch: 13-, where deduplication was introduced (by commit 0d861bbb)
Author: Peter Geoghegan
Date:   2020-06-19 08:57:24 -07:00
Commit: be14f884d5
Parent: f9e9704f09

4 changed files with 32 additions and 13 deletions
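
In rough terms, the fix makes single value strategy count only maxpostingsize-capped tuples (tracked in the new nmaxitems field) toward the six large tuples it aims to leave behind, rather than every tuple written to the new page. The standalone sketch below is an illustration only, not code from the patch; ToyDedupState and toy_observe_capped are made-up names, while the 50-TID threshold and the six-tuple target come from the commit itself.

/* Toy model of the fixed accounting (illustrative, not PostgreSQL code) */
#include <stdbool.h>
#include <stddef.h>

typedef struct ToyDedupState
{
    bool        deduplicate;    /* still merging tuples during this pass? */
    int         nmaxitems;      /* maxpostingsize-capped tuples formed/observed */
    size_t      maxpostingsize; /* cap on the size of a merged posting list tuple */
    int         nhtids;         /* TIDs held by the pending posting list tuple */
} ToyDedupState;

/*
 * Called when merging the next duplicate into the pending tuple would
 * exceed maxpostingsize.  Only a tuple that already carries many TIDs
 * (more than 50) counts toward the six large tuples that single value
 * strategy wants to leave behind; a small duplicate that merely got
 * "wedged" between two large tuples no longer inflates the count.
 */
static void
toy_observe_capped(ToyDedupState *state)
{
    if (state->nhtids > 50)
        state->nmaxitems++;

    if (state->nmaxitems >= 6)
        state->deduplicate = false; /* leave the remaining tuples alone */
}

In the real code this accounting happens in _bt_dedup_save_htid(), with _bt_dedup_one_page() comparing state->nmaxitems against 5 and 6, as the hunks below show.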

src/backend/access/nbtree/nbtdedup.c

@@ -62,7 +62,6 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
     Page        page = BufferGetPage(buf);
     BTPageOpaque opaque;
     Page        newpage;
-    int         newpagendataitems = 0;
     OffsetNumber deletable[MaxIndexTuplesPerPage];
     BTDedupState state;
     int         ndeletable = 0;
@@ -124,6 +123,7 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
      */
     state = (BTDedupState) palloc(sizeof(BTDedupStateData));
     state->deduplicate = true;
+    state->nmaxitems = 0;
     state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
     /* Metadata about base tuple of current pending posting list */
     state->base = NULL;
@@ -204,26 +204,25 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
              * reset the state and move on without modifying the page.
              */
             pagesaving += _bt_dedup_finish_pending(newpage, state);
-            newpagendataitems++;
 
             if (singlevalstrat)
             {
                 /*
                  * Single value strategy's extra steps.
                  *
-                 * Lower maxpostingsize for sixth and final item that might be
-                 * deduplicated by current deduplication pass.  When sixth
-                 * item formed/observed, stop deduplicating items.
+                 * Lower maxpostingsize for sixth and final large posting list
+                 * tuple at the point where 5 maxpostingsize-capped tuples
+                 * have either been formed or observed.
                  *
-                 * Note: It's possible that this will be reached even when
-                 * current deduplication pass has yet to merge together some
-                 * existing items.  It doesn't matter whether or not the
-                 * current call generated the maxpostingsize-capped duplicate
-                 * tuples at the start of the page.
+                 * When a sixth maxpostingsize-capped item is formed/observed,
+                 * stop merging together tuples altogether.  The few tuples
+                 * that remain at the end of the page won't be merged together
+                 * at all (at least not until after a future page split takes
+                 * place).
                  */
-                if (newpagendataitems == 5)
+                if (state->nmaxitems == 5)
                     _bt_singleval_fillfactor(page, state, newitemsz);
-                else if (newpagendataitems == 6)
+                else if (state->nmaxitems == 6)
                 {
                     state->deduplicate = false;
                     singlevalstrat = false; /* won't be back here */
@@ -237,7 +236,6 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
 
     /* Handle the last item */
     pagesaving += _bt_dedup_finish_pending(newpage, state);
-    newpagendataitems++;
 
     /*
      * If no items suitable for deduplication were found, newpage must be
@@ -404,7 +402,24 @@ _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
                            (state->nhtids + nhtids) * sizeof(ItemPointerData));
 
     if (mergedtupsz > state->maxpostingsize)
+    {
+        /*
+         * Count this as an oversized item for single value strategy, though
+         * only when there are 50 TIDs in the final posting list tuple.  This
+         * limit (which is fairly arbitrary) avoids confusion about how many
+         * 1/6 of a page tuples have been encountered/created by the current
+         * deduplication pass.
+         *
+         * Note: We deliberately don't consider which deduplication pass
+         * merged together tuples to create this item (could be a previous
+         * deduplication pass, or current pass).  See _bt_do_singleval()
+         * comments.
+         */
+        if (state->nhtids > 50)
+            state->nmaxitems++;
+
         return false;
+    }
 
     /*
      * Save heap TIDs to pending posting list tuple -- itup can be merged into

src/backend/access/nbtree/nbtsort.c

@@ -1095,6 +1095,7 @@ _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
         pfree(postingtuple);
     }
 
+    dstate->nmaxitems = 0;
     dstate->nhtids = 0;
     dstate->nitems = 0;
     dstate->phystupsize = 0;
@@ -1310,6 +1311,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 
         dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
         dstate->deduplicate = true;     /* unused */
+        dstate->nmaxitems = 0;          /* unused */
         dstate->maxpostingsize = 0;     /* set later */
         /* Metadata about base tuple of current pending posting list */
         dstate->base = NULL;

src/backend/access/nbtree/nbtxlog.c

@@ -483,6 +483,7 @@ btree_xlog_dedup(XLogReaderState *record)
 
         state = (BTDedupState) palloc(sizeof(BTDedupStateData));
         state->deduplicate = true;      /* unused */
+        state->nmaxitems = 0;           /* unused */
         /* Conservatively use larger maxpostingsize than primary */
         state->maxpostingsize = BTMaxItemSize(page);
         state->base = NULL;

src/include/access/nbtree.h

@@ -739,6 +739,7 @@ typedef struct BTDedupStateData
 {
     /* Deduplication status info for entire pass over page */
     bool        deduplicate;    /* Still deduplicating page? */
+    int         nmaxitems;      /* Number of max-sized tuples so far */
     Size        maxpostingsize; /* Limit on size of final tuple */
 
     /* Metadata about base tuple of current pending posting list */