/*-------------------------------------------------------------------------
*
* hashsearch.c
* search code for postgres hash tables
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/access/hash/hashsearch.c
*
*-------------------------------------------------------------------------
*/

#include "postgres.h"

#include "access/hash.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/rel.h"

static bool _hash_readpage(IndexScanDesc scan, Buffer *bufP,
                           ScanDirection dir);
static int  _hash_load_qualified_items(IndexScanDesc scan, Page page,
                                       OffsetNumber offnum, ScanDirection dir);
static inline void _hash_saveitem(HashScanOpaque so, int itemIndex,
                                  OffsetNumber offnum, IndexTuple itup);
static void _hash_readnext(IndexScanDesc scan, Buffer *bufp,
                           Page *pagep, HashPageOpaque *opaquep);

/*
* _hash_next() -- Get the next item in a scan.
*
* On entry, so->currPos describes the current page, which may
* be pinned but not locked, and so->currPos.itemIndex identifies
* which item was previously returned.
*
 * On successful exit, scan->xs_heaptid is set to the TID
 * of the next heap tuple.  so->currPos is updated as needed.
*
* On failure exit (no more tuples), we return false with pin
* held on bucket page but no pins or locks held on overflow
* page.
*/
bool
_hash_next(IndexScanDesc scan, ScanDirection dir)
{
Relation rel = scan->indexRelation;
HashScanOpaque so = (HashScanOpaque) scan->opaque;
HashScanPosItem *currItem;
BlockNumber blkno;
Buffer buf;
bool end_of_scan = false;
/*
* Advance to the next tuple on the current page; or if done, try to read
* data from the next or previous page based on the scan direction. Before
* moving to the next or previous page make sure that we deal with all the
* killed items.
*/
if (ScanDirectionIsForward(dir))
{
if (++so->currPos.itemIndex > so->currPos.lastItem)
{
if (so->numKilled > 0)
_hash_kill_items(scan);
blkno = so->currPos.nextPage;
if (BlockNumberIsValid(blkno))
{
buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(buf));
if (!_hash_readpage(scan, &buf, dir))
end_of_scan = true;
}
else
end_of_scan = true;
}
}
else
{
if (--so->currPos.itemIndex < so->currPos.firstItem)
{
if (so->numKilled > 0)
_hash_kill_items(scan);
blkno = so->currPos.prevPage;
if (BlockNumberIsValid(blkno))
{
buf = _hash_getbuf(rel, blkno, HASH_READ,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(buf));
                /*
                 * We always maintain the pin on the bucket page for the whole
                 * scan operation, so release the additional pin we have just
                 * acquired here.
                 */
if (buf == so->hashso_bucket_buf ||
buf == so->hashso_split_bucket_buf)
_hash_dropbuf(rel, buf);
if (!_hash_readpage(scan, &buf, dir))
end_of_scan = true;
}
else
end_of_scan = true;
}
}
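
    /*
     * If we ran off the end of the bucket chain, release the buffers still
     * pinned for this scan and invalidate the scan position.
     */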
if (end_of_scan)
{
_hash_dropscanbuf(rel, so);
HashScanPosInvalidate(so->currPos);
return false;
}
/* OK, itemIndex says what to return */
currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_heaptid = currItem->heapTid;
return true;
}

/*
 * Advance to the next page in a bucket, if any.  If we are scanning the
 * bucket being populated during a split, then after the last page of that
 * bucket this function advances into the bucket being split.
 */
static void
_hash_readnext(IndexScanDesc scan,
Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
{
BlockNumber blkno;
Relation rel = scan->indexRelation;
HashScanOpaque so = (HashScanOpaque) scan->opaque;
bool block_found = false;
blkno = (*opaquep)->hasho_nextblkno;
    /*
     * Retain the pin on the primary bucket page till the end of the scan.
     * See the comments in _hash_first for why this pin is retained.
     */
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
else
_hash_relbuf(rel, *bufp);
*bufp = InvalidBuffer;
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
if (BlockNumberIsValid(blkno))
{
*bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
block_found = true;
}
else if (so->hashso_buc_populated && !so->hashso_buc_split)
{
/*
* end of bucket, scan bucket being split if there was a split in
* progress at the start of scan.
*/
*bufp = so->hashso_split_bucket_buf;
/*
* buffer for bucket being split must be valid as we acquire the pin
* on it before the start of scan and retain it till end of scan.
*/
Assert(BufferIsValid(*bufp));
LockBuffer(*bufp, BUFFER_LOCK_SHARE);
PredicateLockPage(rel, BufferGetBlockNumber(*bufp), scan->xs_snapshot);
/*
* setting hashso_buc_split to true indicates that we are scanning
* bucket being split.
*/
so->hashso_buc_split = true;
block_found = true;
}
if (block_found)
{
*pagep = BufferGetPage(*bufp);
TestForOldSnapshot(scan->xs_snapshot, rel, *pagep);
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
}
}

/*
 * Advance to the previous page in a bucket, if any.  If the scan started
 * while a split was in progress, then after the first page of the bucket
 * being split this function advances into the bucket being populated.
 */
static void
_hash_readprev(IndexScanDesc scan,
Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
{
BlockNumber blkno;
Relation rel = scan->indexRelation;
HashScanOpaque so = (HashScanOpaque) scan->opaque;
bool haveprevblk;
blkno = (*opaquep)->hasho_prevblkno;
    /*
     * Retain the pin on the primary bucket page till the end of the scan.
     * See the comments in _hash_first for why this pin is retained.
     */
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
{
LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
haveprevblk = false;
}
else
{
_hash_relbuf(rel, *bufp);
haveprevblk = true;
}
*bufp = InvalidBuffer;
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
if (haveprevblk)
{
Assert(BlockNumberIsValid(blkno));
*bufp = _hash_getbuf(rel, blkno, HASH_READ,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
*pagep = BufferGetPage(*bufp);
TestForOldSnapshot(scan->xs_snapshot, rel, *pagep);
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
        /*
         * We always maintain the pin on the bucket page for the whole scan
         * operation, so release the additional pin we have just acquired
         * here.
         */
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
_hash_dropbuf(rel, *bufp);
}
else if (so->hashso_buc_populated && so->hashso_buc_split)
{
/*
* end of bucket, scan bucket being populated if there was a split in
* progress at the start of scan.
*/
*bufp = so->hashso_bucket_buf;
/*
* buffer for bucket being populated must be valid as we acquire the
* pin on it before the start of scan and retain it till end of scan.
*/
Assert(BufferIsValid(*bufp));
LockBuffer(*bufp, BUFFER_LOCK_SHARE);
*pagep = BufferGetPage(*bufp);
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
/* move to the end of bucket chain */
while (BlockNumberIsValid((*opaquep)->hasho_nextblkno))
_hash_readnext(scan, bufp, pagep, opaquep);
/*
* setting hashso_buc_split to false indicates that we are scanning
* bucket being populated.
*/
so->hashso_buc_split = false;
}
}

/*
 * _hash_first() -- Find the first item in a scan.
 *
 * We find the first item (or, if backward scan, the last item) in the
 * index that satisfies the qualification associated with the scan
 * descriptor.
 *
 * On successful exit, if the page containing the current index tuple is an
 * overflow page, both its pin and lock have been released, whereas a bucket
 * page is left pinned but not locked.  Data about the matching tuple(s) on
 * the page has been loaded into so->currPos, and scan->xs_heaptid is set to
 * the heap TID of the current tuple.
 *
 * On failure exit (no more tuples), we return false, with pin held on
 * bucket page but no pins or locks held on overflow page.
 */
bool
_hash_first(IndexScanDesc scan, ScanDirection dir)
{
Relation rel = scan->indexRelation;
HashScanOpaque so = (HashScanOpaque) scan->opaque;
ScanKey cur;
uint32 hashkey;
Bucket bucket;
Buffer buf;
Page page;
HashPageOpaque opaque;
HashScanPosItem *currItem;
pgstat_count_index_scan(rel);
/*
* We do not support hash scans with no index qualification, because we
* would have to read the whole index rather than just one bucket. That
* creates a whole raft of problems, since we haven't got a practical way
* to lock all the buckets against splits or compactions.
*/
if (scan->numberOfKeys < 1)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("hash indexes do not support whole-index scans")));
/* There may be more than one index qual, but we hash only the first */
cur = &scan->keyData[0];
/* We support only single-column hash indexes */
Assert(cur->sk_attno == 1);
/* And there's only one operator strategy, too */
Assert(cur->sk_strategy == HTEqualStrategyNumber);
/*
* If the constant in the index qual is NULL, assume it cannot match any
* items in the index.
*/
if (cur->sk_flags & SK_ISNULL)
return false;
/*
* Okay to compute the hash key. We want to do this before acquiring any
* locks, in case a user-defined hash function happens to be slow.
*
* If scankey operator is not a cross-type comparison, we can use the
* cached hash function; otherwise gotta look it up in the catalogs.
*
* We support the convention that sk_subtype == InvalidOid means the
* opclass input type; this is a hack to simplify life for ScanKeyInit().
*/
if (cur->sk_subtype == rel->rd_opcintype[0] ||
cur->sk_subtype == InvalidOid)
hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
else
hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
cur->sk_subtype);
so->hashso_sk_hash = hashkey;
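
    /*
     * Pin and share-lock the primary page of the bucket that this hash key
     * maps to.
     */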
buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
page = BufferGetPage(buf);
TestForOldSnapshot(scan->xs_snapshot, rel, page);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
bucket = opaque->hasho_bucket;
so->hashso_bucket_buf = buf;
/*
* If a bucket split is in progress, then while scanning the bucket being
* populated, we need to skip tuples that were copied from bucket being
* split. We also need to maintain a pin on the bucket being split to
* ensure that split-cleanup work done by vacuum doesn't remove tuples
* from it till this scan is done. We need to maintain a pin on the
* bucket being populated to ensure that vacuum doesn't squeeze that
* bucket till this scan is complete; otherwise, the ordering of tuples
* can't be maintained during forward and backward scans. Here, we have
* to be cautious about locking order: first, acquire the lock on bucket
* being split; then, release the lock on it but not the pin; then,
* acquire a lock on bucket being populated and again re-verify whether
* the bucket split is still in progress. Acquiring the lock on bucket
* being split first ensures that the vacuum waits for this scan to
* finish.
*/
if (H_BUCKET_BEING_POPULATED(opaque))
{
BlockNumber old_blkno;
Buffer old_buf;
old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket);
/*
* release the lock on new bucket and re-acquire it after acquiring
* the lock on old bucket.
*/
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE);
TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(old_buf));
/*
* remember the split bucket buffer so as to use it later for
* scanning.
*/
so->hashso_split_bucket_buf = old_buf;
LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
LockBuffer(buf, BUFFER_LOCK_SHARE);
page = BufferGetPage(buf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
Assert(opaque->hasho_bucket == bucket);
if (H_BUCKET_BEING_POPULATED(opaque))
so->hashso_buc_populated = true;
else
{
_hash_dropbuf(rel, so->hashso_split_bucket_buf);
so->hashso_split_bucket_buf = InvalidBuffer;
}
}
/* If a backwards scan is requested, move to the end of the chain */
if (ScanDirectionIsBackward(dir))
{
        /*
         * A backward scan that starts during a split needs to begin at the
         * end of the bucket being split.
         */
while (BlockNumberIsValid(opaque->hasho_nextblkno) ||
(so->hashso_buc_populated && !so->hashso_buc_split))
_hash_readnext(scan, &buf, &page, &opaque);
}
/* remember which buffer we have pinned, if any */
Assert(BufferIsInvalid(so->currPos.buf));
so->currPos.buf = buf;
/* Now find all the tuples satisfying the qualification from a page */
if (!_hash_readpage(scan, &buf, dir))
return false;
/* OK, itemIndex says what to return */
currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_heaptid = currItem->heapTid;
    /* if we're here, _hash_readpage found at least one valid tuple */
return true;
}

/*
 * _hash_readpage() -- Load data from current index page into so->currPos
 *
 * We scan all the items in the current index page and save those that
 * satisfy the qualification into so->currPos.  If no matching items are
 * found on the current page, we move to the next or previous page in the
 * bucket chain, as indicated by the scan direction.
 *
 * Return true if any matching items are found, else return false.
 */
static bool
_hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
Relation rel = scan->indexRelation;
HashScanOpaque so = (HashScanOpaque) scan->opaque;
Buffer buf;
Page page;
HashPageOpaque opaque;
OffsetNumber offnum;
uint16 itemIndex;
buf = *bufP;
Assert(BufferIsValid(buf));
_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
page = BufferGetPage(buf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
so->currPos.buf = buf;
so->currPos.currPage = BufferGetBlockNumber(buf);
if (ScanDirectionIsForward(dir))
{
BlockNumber prev_blkno = InvalidBlockNumber;
for (;;)
{
/* new page, locate starting position by binary search */
offnum = _hash_binsearch(page, so->hashso_sk_hash);
itemIndex = _hash_load_qualified_items(scan, page, offnum, dir);
if (itemIndex != 0)
break;
/*
* Could not find any matching tuples in the current page, move to
* the next page. Before leaving the current page, deal with any
* killed items.
*/
if (so->numKilled > 0)
_hash_kill_items(scan);
/*
* If this is a primary bucket page, hasho_prevblkno is not a real
* block number.
*/
if (so->currPos.buf == so->hashso_bucket_buf ||
so->currPos.buf == so->hashso_split_bucket_buf)
prev_blkno = InvalidBlockNumber;
else
prev_blkno = opaque->hasho_prevblkno;
_hash_readnext(scan, &buf, &page, &opaque);
if (BufferIsValid(buf))
{
so->currPos.buf = buf;
so->currPos.currPage = BufferGetBlockNumber(buf);
}
else
{
/*
* Remember next and previous block numbers for scrollable
* cursors to know the start position and return false
* indicating that no more matching tuples were found. Also,
* don't reset currPage or lsn, because we expect
* _hash_kill_items to be called for the old page after this
* function returns.
*/
so->currPos.prevPage = prev_blkno;
so->currPos.nextPage = InvalidBlockNumber;
so->currPos.buf = buf;
return false;
}
}
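
        /*
         * items[0 .. itemIndex - 1] hold the matches; we will return them in
         * ascending order, starting at slot 0.
         */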
so->currPos.firstItem = 0;
so->currPos.lastItem = itemIndex - 1;
so->currPos.itemIndex = 0;
}
else
{
BlockNumber next_blkno = InvalidBlockNumber;
for (;;)
{
/* new page, locate starting position by binary search */
offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
itemIndex = _hash_load_qualified_items(scan, page, offnum, dir);
if (itemIndex != MaxIndexTuplesPerPage)
break;
/*
* Could not find any matching tuples in the current page, move to
* the previous page. Before leaving the current page, deal with
* any killed items.
*/
if (so->numKilled > 0)
_hash_kill_items(scan);
if (so->currPos.buf == so->hashso_bucket_buf ||
so->currPos.buf == so->hashso_split_bucket_buf)
next_blkno = opaque->hasho_nextblkno;
_hash_readprev(scan, &buf, &page, &opaque);
if (BufferIsValid(buf))
{
so->currPos.buf = buf;
so->currPos.currPage = BufferGetBlockNumber(buf);
}
else
{
/*
* Remember next and previous block numbers for scrollable
* cursors to know the start position and return false
* indicating that no more matching tuples were found. Also,
* don't reset currPage or lsn, because we expect
* _hash_kill_items to be called for the old page after this
* function returns.
*/
so->currPos.prevPage = InvalidBlockNumber;
so->currPos.nextPage = next_blkno;
so->currPos.buf = buf;
return false;
}
}
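
        /*
         * items[itemIndex .. MaxIndexTuplesPerPage - 1] hold the matches; we
         * will return them in descending order, starting from the last slot.
         */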
so->currPos.firstItem = itemIndex;
so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
}
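
    /*
     * Remember our position in the bucket chain.  On a primary bucket page
     * we keep the pin (dropping only the lock) for the rest of the scan,
     * while an overflow page can be released entirely because its matching
     * tuples have already been saved in so->currPos.
     */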
if (so->currPos.buf == so->hashso_bucket_buf ||
so->currPos.buf == so->hashso_split_bucket_buf)
{
so->currPos.prevPage = InvalidBlockNumber;
so->currPos.nextPage = opaque->hasho_nextblkno;
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
}
else
{
so->currPos.prevPage = opaque->hasho_prevblkno;
so->currPos.nextPage = opaque->hasho_nextblkno;
_hash_relbuf(rel, so->currPos.buf);
so->currPos.buf = InvalidBuffer;
}
Assert(so->currPos.firstItem <= so->currPos.lastItem);
return true;
}

/*
 * Load all the qualified items from the current index page
* into so->currPos. Helper function for _hash_readpage.
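 *
 * For a forward scan, the return value is the number of items saved (zero
 * means no matches); for a backward scan, it is the array index of the first
 * saved item (MaxIndexTuplesPerPage means none were found).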
*/
static int
_hash_load_qualified_items(IndexScanDesc scan, Page page,
OffsetNumber offnum, ScanDirection dir)
{
HashScanOpaque so = (HashScanOpaque) scan->opaque;
IndexTuple itup;
int itemIndex;
OffsetNumber maxoff;
maxoff = PageGetMaxOffsetNumber(page);
if (ScanDirectionIsForward(dir))
{
/* load items[] in ascending order */
itemIndex = 0;
while (offnum <= maxoff)
{
Assert(offnum >= FirstOffsetNumber);
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
            /*
             * Skip tuples that were moved by a split operation if this scan
             * started while that split was in progress.  Also skip tuples
             * that are marked as dead.
             */
if ((so->hashso_buc_populated && !so->hashso_buc_split &&
(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
(scan->ignore_killed_tuples &&
(ItemIdIsDead(PageGetItemId(page, offnum)))))
{
offnum = OffsetNumberNext(offnum); /* move forward */
continue;
}
if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup) &&
_hash_checkqual(scan, itup))
{
/* tuple is qualified, so remember it */
_hash_saveitem(so, itemIndex, offnum, itup);
itemIndex++;
}
else
{
                /*
                 * No more matching tuples exist on this page, so exit the
                 * while loop.
                 */
break;
}
offnum = OffsetNumberNext(offnum);
}
Assert(itemIndex <= MaxIndexTuplesPerPage);
return itemIndex;
}
else
{
/* load items[] in descending order */
itemIndex = MaxIndexTuplesPerPage;
while (offnum >= FirstOffsetNumber)
{
Assert(offnum <= maxoff);
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
            /*
             * Skip tuples that were moved by a split operation if this scan
             * started while that split was in progress.  Also skip tuples
             * that are marked as dead.
             */
if ((so->hashso_buc_populated && !so->hashso_buc_split &&
(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
(scan->ignore_killed_tuples &&
(ItemIdIsDead(PageGetItemId(page, offnum)))))
{
offnum = OffsetNumberPrev(offnum); /* move back */
continue;
}
if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup) &&
_hash_checkqual(scan, itup))
{
itemIndex--;
/* tuple is qualified, so remember it */
_hash_saveitem(so, itemIndex, offnum, itup);
}
else
{
                /*
                 * No more matching tuples exist on this page, so exit the
                 * while loop.
                 */
break;
}
offnum = OffsetNumberPrev(offnum);
}
Assert(itemIndex >= 0);
return itemIndex;
}
}

/* Save an index item into so->currPos.items[itemIndex] */
static inline void
_hash_saveitem(HashScanOpaque so, int itemIndex,
OffsetNumber offnum, IndexTuple itup)
{
HashScanPosItem *currItem = &so->currPos.items[itemIndex];
currItem->heapTid = itup->t_tid;
currItem->indexOffset = offnum;
}