/*-------------------------------------------------------------------------
 *
 * hashutil.c
 *	  Utility code for Postgres hash implementation.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashutil.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/reloptions.h"
#include "access/relscan.h"
#include "storage/buf_internals.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"

#define CALC_NEW_BUCKET(old_bucket, lowmask) \
			old_bucket | (lowmask + 1)
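
/*
 * For example, CALC_NEW_BUCKET(2, 3) = 2 | 4 = 6: the new bucket number is
 * the old one with the bit just above lowmask set.
 */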

/*
 * _hash_checkqual -- does the index tuple satisfy the scan conditions?
 */
bool
_hash_checkqual(IndexScanDesc scan, IndexTuple itup)
{
	/*
	 * Currently, we can't check any of the scan conditions since we do not
	 * have the original index entry value to supply to the sk_func.  Always
	 * return true; we expect that hashgettuple already set the recheck flag
	 * to make the main indexscan code do it.
	 */
#ifdef NOT_USED
	TupleDesc	tupdesc = RelationGetDescr(scan->indexRelation);
	ScanKey		key = scan->keyData;
	int			scanKeySize = scan->numberOfKeys;

	while (scanKeySize > 0)
	{
		Datum		datum;
		bool		isNull;
		Datum		test;

		datum = index_getattr(itup,
							  key->sk_attno,
							  tupdesc,
							  &isNull);

		/* assume sk_func is strict */
		if (isNull)
			return false;
		if (key->sk_flags & SK_ISNULL)
			return false;

		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
								 datum, key->sk_argument);

		if (!DatumGetBool(test))
			return false;

		key++;
		scanKeySize--;
	}
#endif

	return true;
}

/*
 * _hash_datum2hashkey -- given a Datum, call the index's hash function
 *
 * The Datum is assumed to be of the index's column type, so we can use the
 * "primary" hash function that's tracked for us by the generic index code.
 */
uint32
_hash_datum2hashkey(Relation rel, Datum key)
{
	FmgrInfo   *procinfo;
	Oid			collation;

	/* XXX assumes index has only one attribute */
	procinfo = index_getprocinfo(rel, 1, HASHSTANDARD_PROC);
	collation = rel->rd_indcollation[0];

	return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key));
}

/*
 * _hash_datum2hashkey_type -- given a Datum of a specified type,
 *			hash it in a fashion compatible with this index
 *
 * This is much more expensive than _hash_datum2hashkey, so use it only in
 * cross-type situations.
 */
uint32
_hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype)
{
	RegProcedure hash_proc;
	Oid			collation;

	/* XXX assumes index has only one attribute */
	hash_proc = get_opfamily_proc(rel->rd_opfamily[0],
								  keytype,
								  keytype,
								  HASHSTANDARD_PROC);
	if (!RegProcedureIsValid(hash_proc))
		elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"",
			 HASHSTANDARD_PROC, keytype, keytype,
			 RelationGetRelationName(rel));
	collation = rel->rd_indcollation[0];

	return DatumGetUInt32(OidFunctionCall1Coll(hash_proc, collation, key));
}

/*
 * _hash_hashkey2bucket -- determine which bucket the hashkey maps to.
 */
Bucket
_hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
					 uint32 highmask, uint32 lowmask)
{
	Bucket		bucket;

	bucket = hashkey & highmask;
	if (bucket > maxbucket)
		bucket = bucket & lowmask;

	return bucket;
}
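
/*
 * For example, with maxbucket = 5, highmask = 7, lowmask = 3: a hashkey of
 * 14 gives 14 & 7 = 6, which exceeds maxbucket, so it is remapped to
 * 6 & 3 = 2, the bucket from which bucket 6 will eventually be split.
 */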

/*
 * _hash_log2 -- returns ceil(lg2(num))
 */
uint32
_hash_log2(uint32 num)
{
	uint32		i,
				limit;

	limit = 1;
	for (i = 0; limit < num; limit <<= 1, i++)
		;
	return i;
}
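
/*
 * For example, _hash_log2(1) = 0, _hash_log2(3) = 2, and
 * _hash_log2(1024) = 10: the result is the smallest i such that
 * (1 << i) >= num.
 */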

/*
 * _hash_spareindex -- returns spare index / global splitpoint phase of the
 *			bucket
 */
uint32
_hash_spareindex(uint32 num_bucket)
{
	uint32		splitpoint_group;
	uint32		splitpoint_phases;

	splitpoint_group = _hash_log2(num_bucket);

	if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return splitpoint_group;

	/* account for single-phase groups */
	splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;

	/* account for multi-phase groups before splitpoint_group */
	splitpoint_phases +=
		((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for phases within current group */
	splitpoint_phases +=
		(((num_bucket - 1) >>
		  (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
		 HASH_SPLITPOINT_PHASE_MASK);	/* to 0-based value. */

	return splitpoint_phases;
}
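
/*
 * Worked example, assuming the access/hash.h values
 * HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE = 10 and
 * HASH_SPLITPOINT_PHASE_BITS = 2: for num_bucket = 1536,
 * _hash_log2(1536) = 11, so the bucket belongs to splitpoint group 11.
 * Groups 0-9 contribute one phase each (10 phases), group 10 contributes
 * four more (14), and within group 11 the bucket falls in phase
 * ((1536 - 1) >> 8) & 3 = 1, giving a spare index of 15.
 */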

/*
 * _hash_get_totalbuckets -- returns total number of buckets allocated till
 *			the given splitpoint phase.
 */
uint32
_hash_get_totalbuckets(uint32 splitpoint_phase)
{
	uint32		splitpoint_group;
	uint32		total_buckets;
	uint32		phases_within_splitpoint_group;

	if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return (1 << splitpoint_phase);

	/* get splitpoint's group */
	splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
	splitpoint_group +=
		((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for buckets before splitpoint_group */
	total_buckets = (1 << (splitpoint_group - 1));

	/* account for buckets within splitpoint_group */
	phases_within_splitpoint_group =
		(((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
		  HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */
	total_buckets +=
		(((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
		 phases_within_splitpoint_group);

	return total_buckets;
}
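
/*
 * Continuing the example above: _hash_get_totalbuckets(15) lands in group
 * 10 + ((15 - 10) >> 2) = 11, starts from the 1 << 10 = 1024 buckets that
 * precede group 11, and adds ((15 - 10) & 3) + 1 = 2 allocations of 256
 * buckets each (a quarter of the group), for 1536 in total: the number of
 * buckets allocated once splitpoint phase 15 is complete, matching
 * _hash_spareindex(1536) = 15 above.
 */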

/*
 * _hash_checkpage -- sanity checks on the format of all hash pages
 *
 * If flags is not zero, it is a bitwise OR of the acceptable page types
 * (values of hasho_flag & LH_PAGE_TYPE).
 */
void
_hash_checkpage(Relation rel, Buffer buf, int flags)
{
	Page		page = BufferGetPage(buf);

	/*
	 * ReadBuffer verifies that every newly-read page passes
	 * PageHeaderIsValid, which means it either contains a reasonably sane
	 * page header or is all-zero.  We have to defend against the all-zero
	 * case, however.
	 */
	if (PageIsNew(page))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains unexpected zero page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	/*
	 * Additionally check that the special area looks sane.
	 */
	if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData)))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains corrupted page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	if (flags)
	{
		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		if ((opaque->hasho_flag & flags) == 0)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" contains corrupted page at block %u",
							RelationGetRelationName(rel),
							BufferGetBlockNumber(buf)),
					 errhint("Please REINDEX it.")));
	}

	/*
	 * When checking the metapage, also verify magic number and version.
	 */
	if (flags == LH_META_PAGE)
	{
		HashMetaPage metap = HashPageGetMeta(page);

		if (metap->hashm_magic != HASH_MAGIC)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" is not a hash index",
							RelationGetRelationName(rel))));

		if (metap->hashm_version != HASH_VERSION)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" has wrong hash version",
							RelationGetRelationName(rel)),
					 errhint("Please REINDEX it.")));
	}
}

bytea *
hashoptions(Datum reloptions, bool validate)
{
	return default_reloptions(reloptions, validate, RELOPT_KIND_HASH);
}

/*
 * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
 */
uint32
_hash_get_indextuple_hashkey(IndexTuple itup)
{
	char	   *attp;

	/*
	 * We assume the hash key is the first attribute and can't be null, so
	 * this can be done crudely but very very cheaply ...
	 */
	attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
	return *((uint32 *) attp);
}
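
/*
 * This crude fetch is safe because _hash_convert_tuple() below always makes
 * the hash code a non-null int4 first column, so the value begins exactly
 * at the tuple's data offset and is correctly aligned there.
 */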

/*
 * _hash_convert_tuple - convert raw index data to hash key
 *
 * Inputs: values and isnull arrays for the user data column(s)
 * Outputs: values and isnull arrays for the index tuple, suitable for
 *		passing to index_form_tuple().
 *
 * Returns true if successful, false if not (because there are null values).
 * On a false result, the given data need not be indexed.
 *
 * Note: callers know that the index-column arrays are always of length 1.
 * In principle, there could be more than one input column, though we do not
 * currently support that.
 */
bool
_hash_convert_tuple(Relation index,
					Datum *user_values, bool *user_isnull,
					Datum *index_values, bool *index_isnull)
{
	uint32		hashkey;

	/*
	 * We do not insert null values into hash indexes.  This is okay because
	 * the only supported search operator is '=', and we assume it is strict.
	 */
	if (user_isnull[0])
		return false;

	hashkey = _hash_datum2hashkey(index, user_values[0]);
	index_values[0] = UInt32GetDatum(hashkey);
	index_isnull[0] = false;
	return true;
}
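
/*
 * A minimal usage sketch (compiled out, in the style of the NOT_USED block
 * near the top of this file): a caller such as the insert or build path is
 * expected to run the raw column value through _hash_convert_tuple() and
 * only then form the index tuple.  The helper name and variables here are
 * hypothetical.
 */
#ifdef NOT_USED
static IndexTuple
example_form_hash_tuple(Relation index, Datum *values, bool *isnull)
{
	Datum		index_values[1];
	bool		index_isnull[1];

	/* null keys are simply not indexed */
	if (!_hash_convert_tuple(index, values, isnull,
							 index_values, index_isnull))
		return NULL;

	/* the index tuple stores only the uint32 hash code */
	return index_form_tuple(RelationGetDescr(index),
							index_values, index_isnull);
}
#endif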

/*
 * _hash_binsearch - Return the offset number in the page where the
 *			specified hash value should be sought or inserted.
 *
 * We use binary search, relying on the assumption that the existing entries
 * are ordered by hash key.
 *
 * Returns the offset of the first index entry having hashkey >= hash_value,
 * or the page's max offset plus one if hash_value is greater than all
 * existing hash keys in the page.  This is the appropriate place to start
 * a search, or to insert a new item.
 */
OffsetNumber
_hash_binsearch(Page page, uint32 hash_value)
{
	OffsetNumber upper;
	OffsetNumber lower;

	/* Loop invariant: lower <= desired place <= upper */
	upper = PageGetMaxOffsetNumber(page) + 1;
	lower = FirstOffsetNumber;

	while (upper > lower)
	{
		OffsetNumber off;
		IndexTuple	itup;
		uint32		hashkey;

		off = (upper + lower) / 2;
		Assert(OffsetNumberIsValid(off));

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
		hashkey = _hash_get_indextuple_hashkey(itup);
		if (hashkey < hash_value)
			lower = off + 1;
		else
			upper = off;
	}

	return lower;
}
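
/*
 * For example, if a page holds hash keys {10, 20, 20, 30} at offsets 1-4,
 * _hash_binsearch(page, 20) returns 2 (the first match),
 * _hash_binsearch(page, 25) returns 4 (the insertion point), and
 * _hash_binsearch(page, 40) returns 5 (maxoffset + 1).
 */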

/*
 * _hash_binsearch_last
 *
 * Same as above, except that if there are multiple matching items in the
 * page, we return the offset of the last one instead of the first one,
 * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
 * This is handy for starting a new page in a backwards scan.
 */
OffsetNumber
_hash_binsearch_last(Page page, uint32 hash_value)
{
	OffsetNumber upper;
	OffsetNumber lower;

	/* Loop invariant: lower <= desired place <= upper */
	upper = PageGetMaxOffsetNumber(page);
	lower = FirstOffsetNumber - 1;

	while (upper > lower)
	{
		IndexTuple	itup;
		OffsetNumber off;
		uint32		hashkey;

		off = (upper + lower + 1) / 2;
		Assert(OffsetNumberIsValid(off));

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
		hashkey = _hash_get_indextuple_hashkey(itup);
		if (hashkey > hash_value)
			upper = off - 1;
		else
			lower = off;
	}

	return lower;
}
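
/*
 * With the same example page as above, _hash_binsearch_last(page, 20)
 * returns 3 (the last match), while _hash_binsearch_last(page, 5) returns
 * 0, i.e. FirstOffsetNumber - 1, meaning no entry qualifies.
 */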

/*
 * _hash_get_oldblock_from_newbucket() -- get the block number of the bucket
 *			from which the current (new) bucket is being split.
 */
BlockNumber
_hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
{
	Bucket		old_bucket;
	uint32		mask;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;

	/*
	 * To get the old bucket from the current bucket, we need a mask to
	 * modulo into the lower half of the table.  This mask is stored in the
	 * meta page as hashm_lowmask, but we can't rely on that here, because we
	 * need the value of lowmask that was in effect when the bucket split
	 * started.  Masking off the most significant bit of the new bucket gives
	 * us the old bucket.
	 */
	mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
	old_bucket = new_bucket & mask;

	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	blkno = BUCKET_TO_BLKNO(metap, old_bucket);

	_hash_relbuf(rel, metabuf);

	return blkno;
}

/*
 * _hash_get_newblock_from_oldbucket() -- get the block number of the bucket
 *			that will be generated after a split from the old bucket.
 *
 * This is used to find the new bucket from the old bucket based on the
 * current table half.  It is mainly needed to finish incomplete splits,
 * where we can be sure that no more than one split can be in progress from
 * the old bucket.
 */
BlockNumber
_hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
{
	Bucket		new_bucket;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;

	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
													metap->hashm_lowmask,
													metap->hashm_maxbucket);
	blkno = BUCKET_TO_BLKNO(metap, new_bucket);

	_hash_relbuf(rel, metabuf);

	return blkno;
}

/*
 * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
 *			generated after a split from the current (old) bucket.
 *
 * The new bucket is obtained by OR'ing the old bucket with the most
 * significant bit of the current table half (the lowmask passed to this
 * function identifies that bit).  Several buckets may have been split from
 * the current bucket at different times; we want the most recent one that
 * actually exists.  The caller must ensure that no more than one split has
 * happened from the old bucket.
 */
Bucket
_hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
								   uint32 lowmask, uint32 maxbucket)
{
	Bucket		new_bucket;

	new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
	if (new_bucket > maxbucket)
	{
		lowmask = lowmask >> 1;
		new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
	}

	return new_bucket;
}
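
/*
 * For example, with lowmask = 3 and maxbucket = 4, old bucket 1 would map
 * to 1 | 4 = 5 in the current table half; bucket 5 does not exist yet, so
 * we drop to the previous half (lowmask = 1) and return 1 | 2 = 3, the
 * bucket most recently split off from bucket 1.
 */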

/*
 * _hash_kill_items - set LP_DEAD state for items an indexscan caller has
 * told us were killed.
 *
 * scan->opaque, referenced locally through so, contains information about the
 * current page and killed tuples thereon (generally, this should only be
 * called if so->numKilled > 0).
 *
 * The caller does not have a lock on the page and may or may not have the
 * page pinned in a buffer.  Note that read-lock is sufficient for setting
 * LP_DEAD status (which is only a hint).
 *
 * The caller must have pin on bucket buffer, but may or may not have pin
 * on overflow buffer, as indicated by HashScanPosIsPinned(so->currPos).
 *
 * We match items by heap TID before assuming they are the right ones to
 * delete.
 *
 * There are never any scans active in a bucket at the time VACUUM begins,
 * because VACUUM takes a cleanup lock on the primary bucket page and scans
 * hold a pin.  A scan can begin after VACUUM leaves the primary bucket page
 * but before it finishes the entire bucket, but it can never pass VACUUM,
 * because VACUUM always locks the next page before releasing the lock on
 * the previous one.  Therefore, we don't have to worry about accidentally
 * killing a TID that has been reused for an unrelated tuple.
 */
void
_hash_kill_items(IndexScanDesc scan)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	BlockNumber blkno;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	OffsetNumber offnum,
				maxoff;
	int			numKilled = so->numKilled;
	int			i;
	bool		killedsomething = false;
	bool		havePin = false;

	Assert(so->numKilled > 0);
	Assert(so->killedItems != NULL);
	Assert(HashScanPosIsValid(so->currPos));

	/*
	 * Always reset the scan state, so we don't look for same items on other
	 * pages.
	 */
	so->numKilled = 0;

	blkno = so->currPos.currPage;
	if (HashScanPosIsPinned(so->currPos))
	{
		/*
		 * We already have pin on this buffer, so, all we need to do is
		 * acquire lock on it.
		 */
		havePin = true;
		buf = so->currPos.buf;
		LockBuffer(buf, BUFFER_LOCK_SHARE);
	}
	else
		buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);

	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	for (i = 0; i < numKilled; i++)
	{
		int			itemIndex = so->killedItems[i];
		HashScanPosItem *currItem = &so->currPos.items[itemIndex];

		offnum = currItem->indexOffset;

		Assert(itemIndex >= so->currPos.firstItem &&
			   itemIndex <= so->currPos.lastItem);

		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);

			if (ItemPointerEquals(&ituple->t_tid, &currItem->heapTid))
			{
				/* found the item */
				ItemIdMarkDead(iid);
				killedsomething = true;
				break;			/* out of inner search loop */
			}
			offnum = OffsetNumberNext(offnum);
		}
	}

	/*
	 * Since this can be redone later if needed, mark as dirty hint.  Whenever
	 * we mark anything LP_DEAD, we also set the page's
	 * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
	 */
	if (killedsomething)
	{
		opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
		MarkBufferDirtyHint(buf, true);
	}

	if (so->hashso_bucket_buf == so->currPos.buf ||
		havePin)
		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
	else
		_hash_relbuf(rel, buf);
}