/*------------------------------------------------------------------------- * * ginget.c * fetch tuples from a GIN scan. * * * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * src/backend/access/gin/ginget.c *------------------------------------------------------------------------- */ #include "postgres.h" #include "access/gin_private.h" #include "access/relscan.h" #include "miscadmin.h" #include "storage/predicate.h" #include "utils/datum.h" #include "utils/memutils.h" #include "utils/rel.h" /* GUC parameter */ int GinFuzzySearchLimit = 0; typedef struct pendingPosition { Buffer pendingBuffer; OffsetNumber firstOffset; OffsetNumber lastOffset; ItemPointerData item; bool *hasMatchKey; } pendingPosition; /* * Goes to the next page if current offset is outside of bounds */ static bool moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot) { Page page = BufferGetPage(stack->buffer); if (stack->off > PageGetMaxOffsetNumber(page)) { /* * We scanned the whole page, so we should take right page */ if (GinPageRightMost(page)) return false; /* no more pages */ stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE); stack->blkno = BufferGetBlockNumber(stack->buffer); stack->off = FirstOffsetNumber; PredicateLockPage(btree->index, stack->blkno, snapshot); } return true; } /* * Scan all pages of a posting tree and save all its heap ItemPointers * in scanEntry->matchBitmap */ static void scanPostingTree(Relation index, GinScanEntry scanEntry, BlockNumber rootPostingTree, Snapshot snapshot) { GinBtreeData btree; GinBtreeStack *stack; Buffer buffer; Page page; /* Descend to the leftmost leaf page */ stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot); buffer = stack->buffer; IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */ freeGinBtreeStack(stack); /* * Loop iterates through all leaf pages of posting tree */ for (;;) { page = BufferGetPage(buffer); if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0) { int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap); scanEntry->predictNumberResult += n; } if (GinPageRightMost(page)) break; /* no more pages */ buffer = ginStepRight(buffer, index, GIN_SHARE); } UnlockReleaseBuffer(buffer); } /* * Collects TIDs into scanEntry->matchBitmap for all heap tuples that * match the search entry. This supports three different match modes: * * 1. Partial-match support: scan from current point until the * comparePartialFn says we're done. * 2. SEARCH_MODE_ALL: scan from current point (which should be first * key for the current attnum) until we hit null items or end of attnum * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first * key for the current attnum) until we hit end of attnum * * Returns true if done, false if it's necessary to restart scan from scratch */ static bool collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, GinScanEntry scanEntry, Snapshot snapshot) { OffsetNumber attnum; Form_pg_attribute attr; /* Initialize empty bitmap result */ scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL); /* Null query cannot partial-match anything */ if (scanEntry->isPartialMatch && scanEntry->queryCategory != GIN_CAT_NORM_KEY) return true; /* Locate tupdesc entry for key column (for attbyval/attlen data) */ attnum = scanEntry->attnum; attr = TupleDescAttr(btree->ginstate->origTupdesc, attnum - 1); /* * Predicate lock entry leaf page, following pages will be locked by * moveRightIfItNeeded() */ PredicateLockPage(btree->index, stack->buffer, snapshot); for (;;) { Page page; IndexTuple itup; Datum idatum; GinNullCategory icategory; /* * stack->off points to the interested entry, buffer is already locked */ if (moveRightIfItNeeded(btree, stack, snapshot) == false) return true; page = BufferGetPage(stack->buffer); TestForOldSnapshot(snapshot, btree->index, page); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); /* * If tuple stores another attribute then stop scan */ if (gintuple_get_attrnum(btree->ginstate, itup) != attnum) return true; /* Safe to fetch attribute value */ idatum = gintuple_get_key(btree->ginstate, itup, &icategory); /* * Check for appropriate scan stop conditions */ if (scanEntry->isPartialMatch) { int32 cmp; /* * In partial match, stop scan at any null (including * placeholders); partial matches never match nulls */ if (icategory != GIN_CAT_NORM_KEY) return true; /*---------- * Check of partial match. * case cmp == 0 => match * case cmp > 0 => not match and finish scan * case cmp < 0 => not match and continue scan *---------- */ cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1], btree->ginstate->supportCollation[attnum - 1], scanEntry->queryKey, idatum, UInt16GetDatum(scanEntry->strategy), PointerGetDatum(scanEntry->extra_data))); if (cmp > 0) return true; else if (cmp < 0) { stack->off++; continue; } } else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL) { /* * In ALL mode, we are not interested in null items, so we can * stop if we get to a null-item placeholder (which will be the * last entry for a given attnum). We do want to include NULL_KEY * and EMPTY_ITEM entries, though. */ if (icategory == GIN_CAT_NULL_ITEM) return true; } /* * OK, we want to return the TIDs listed in this entry. */ if (GinIsPostingTree(itup)) { BlockNumber rootPostingTree = GinGetPostingTree(itup); /* * We should unlock current page (but not unpin) during tree scan * to prevent deadlock with vacuum processes. * * We save current entry value (idatum) to be able to re-find our * tuple after re-locking */ if (icategory == GIN_CAT_NORM_KEY) idatum = datumCopy(idatum, attr->attbyval, attr->attlen); LockBuffer(stack->buffer, GIN_UNLOCK); /* * Acquire predicate lock on the posting tree. We already hold a * lock on the entry page, but insertions to the posting tree * don't check for conflicts on that level. */ PredicateLockPage(btree->index, rootPostingTree, snapshot); /* Collect all the TIDs in this entry's posting tree */ scanPostingTree(btree->index, scanEntry, rootPostingTree, snapshot); /* * We lock again the entry page and while it was unlocked insert * might have occurred, so we need to re-find our position. */ LockBuffer(stack->buffer, GIN_SHARE); page = BufferGetPage(stack->buffer); if (!GinPageIsLeaf(page)) { /* * Root page becomes non-leaf while we unlock it. We will * start again, this situation doesn't occur often - root can * became a non-leaf only once per life of index. */ return false; } /* Search forward to re-find idatum */ for (;;) { Datum newDatum; GinNullCategory newCategory; if (moveRightIfItNeeded(btree, stack, snapshot) == false) elog(ERROR, "lost saved point in index"); /* must not happen !!! */ page = BufferGetPage(stack->buffer); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); if (gintuple_get_attrnum(btree->ginstate, itup) != attnum) elog(ERROR, "lost saved point in index"); /* must not happen !!! */ newDatum = gintuple_get_key(btree->ginstate, itup, &newCategory); if (ginCompareEntries(btree->ginstate, attnum, newDatum, newCategory, idatum, icategory) == 0) break; /* Found! */ stack->off++; } if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval) pfree(DatumGetPointer(idatum)); } else { ItemPointer ipd; int nipd; ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd); tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false); scanEntry->predictNumberResult += GinGetNPosting(itup); pfree(ipd); } /* * Done with this entry, go to the next */ stack->off++; } } /* * Start* functions setup beginning state of searches: finds correct buffer and pins it. */ static void startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot) { GinBtreeData btreeEntry; GinBtreeStack *stackEntry; Page page; bool needUnlock; restartScanEntry: entry->buffer = InvalidBuffer; ItemPointerSetMin(&entry->curItem); entry->offset = InvalidOffsetNumber; if (entry->list) pfree(entry->list); entry->list = NULL; entry->nlist = 0; entry->matchBitmap = NULL; entry->matchResult = NULL; entry->reduceResult = false; entry->predictNumberResult = 0; /* * we should find entry, and begin scan of posting tree or just store * posting list in memory */ ginPrepareEntryScan(&btreeEntry, entry->attnum, entry->queryKey, entry->queryCategory, ginstate); stackEntry = ginFindLeafPage(&btreeEntry, true, false, snapshot); page = BufferGetPage(stackEntry->buffer); /* ginFindLeafPage() will have already checked snapshot age. */ needUnlock = true; entry->isFinished = true; if (entry->isPartialMatch || entry->queryCategory == GIN_CAT_EMPTY_QUERY) { /* * btreeEntry.findItem locates the first item >= given search key. * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item * because of the way the GIN_CAT_EMPTY_QUERY category code is * assigned.) We scan forward from there and collect all TIDs needed * for the entry type. */ btreeEntry.findItem(&btreeEntry, stackEntry); if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot) == false) { /* * GIN tree was seriously restructured, so we will cleanup all * found data and rescan. See comments near 'return false' in * collectMatchBitmap() */ if (entry->matchBitmap) { if (entry->matchIterator) tbm_end_iterate(entry->matchIterator); entry->matchIterator = NULL; tbm_free(entry->matchBitmap); entry->matchBitmap = NULL; } LockBuffer(stackEntry->buffer, GIN_UNLOCK); freeGinBtreeStack(stackEntry); goto restartScanEntry; } if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap)) { entry->matchIterator = tbm_begin_iterate(entry->matchBitmap); entry->isFinished = false; } } else if (btreeEntry.findItem(&btreeEntry, stackEntry)) { IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off)); if (GinIsPostingTree(itup)) { BlockNumber rootPostingTree = GinGetPostingTree(itup); GinBtreeStack *stack; Page page; ItemPointerData minItem; /* * This is an equality scan, so lock the root of the posting tree. * It represents a lock on the exact key value, and covers all the * items in the posting tree. */ PredicateLockPage(ginstate->index, rootPostingTree, snapshot); /* * We should unlock entry page before touching posting tree to * prevent deadlocks with vacuum processes. Because entry is never * deleted from page and posting tree is never reduced to the * posting list, we can unlock page after getting BlockNumber of * root of posting tree. */ LockBuffer(stackEntry->buffer, GIN_UNLOCK); needUnlock = false; stack = ginScanBeginPostingTree(&entry->btree, ginstate->index, rootPostingTree, snapshot); entry->buffer = stack->buffer; /* * We keep buffer pinned because we need to prevent deletion of * page during scan. See GIN's vacuum implementation. RefCount is * increased to keep buffer pinned after freeGinBtreeStack() call. */ IncrBufferRefCount(entry->buffer); page = BufferGetPage(entry->buffer); /* * Load the first page into memory. */ ItemPointerSetMin(&minItem); entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem); entry->predictNumberResult = stack->predictNumber * entry->nlist; LockBuffer(entry->buffer, GIN_UNLOCK); freeGinBtreeStack(stack); entry->isFinished = false; } else { /* * Lock the entry leaf page. This is more coarse-grained than * necessary, because it will conflict with any insertions that * land on the same leaf page, not only the exact key we searched * for. But locking an individual tuple would require updating * that lock whenever it moves because of insertions or vacuums, * which seems too complicated. */ PredicateLockPage(ginstate->index, BufferGetBlockNumber(stackEntry->buffer), snapshot); if (GinGetNPosting(itup) > 0) { entry->list = ginReadTuple(ginstate, entry->attnum, itup, &entry->nlist); entry->predictNumberResult = entry->nlist; entry->isFinished = false; } } } else { /* * No entry found. Predicate lock the leaf page, to lock the place * where the entry would've been, had there been one. */ PredicateLockPage(ginstate->index, BufferGetBlockNumber(stackEntry->buffer), snapshot); } if (needUnlock) LockBuffer(stackEntry->buffer, GIN_UNLOCK); freeGinBtreeStack(stackEntry); } /* * Comparison function for scan entry indexes. Sorts by predictNumberResult, * least frequent items first. */ static int entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg) { const GinScanKey key = (const GinScanKey) arg; int i1 = *(const int *) a1; int i2 = *(const int *) a2; uint32 n1 = key->scanEntry[i1]->predictNumberResult; uint32 n2 = key->scanEntry[i2]->predictNumberResult; if (n1 < n2) return -1; else if (n1 == n2) return 0; else return 1; } static void startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key) { MemoryContext oldCtx = CurrentMemoryContext; int i; int j; int *entryIndexes; ItemPointerSetMin(&key->curItem); key->curItemMatches = false; key->recheckCurItem = false; key->isFinished = false; /* * Divide the entries into two distinct sets: required and additional. * Additional entries are not enough for a match alone, without any items * from the required set, but are needed by the consistent function to * decide if an item matches. When scanning, we can skip over items from * additional entries that have no corresponding matches in any of the * required entries. That speeds up queries like "frequent & rare" * considerably, if the frequent term can be put in the additional set. * * There can be many legal ways to divide them entries into these two * sets. A conservative division is to just put everything in the required * set, but the more you can put in the additional set, the more you can * skip during the scan. To maximize skipping, we try to put as many * frequent items as possible into additional, and less frequent ones into * required. To do that, sort the entries by frequency * (predictNumberResult), and put entries into the required set in that * order, until the consistent function says that none of the remaining * entries can form a match, without any items from the required set. The * rest go to the additional set. */ if (key->nentries > 1) { MemoryContextSwitchTo(so->tempCtx); entryIndexes = (int *) palloc(sizeof(int) * key->nentries); for (i = 0; i < key->nentries; i++) entryIndexes[i] = i; qsort_arg(entryIndexes, key->nentries, sizeof(int), entryIndexByFrequencyCmp, key); for (i = 0; i < key->nentries - 1; i++) { /* Pass all entries <= i as FALSE, and the rest as MAYBE */ for (j = 0; j <= i; j++) key->entryRes[entryIndexes[j]] = GIN_FALSE; for (j = i + 1; j < key->nentries; j++) key->entryRes[entryIndexes[j]] = GIN_MAYBE; if (key->triConsistentFn(key) == GIN_FALSE) break; } /* i is now the last required entry. */ MemoryContextSwitchTo(so->keyCtx); key->nrequired = i + 1; key->nadditional = key->nentries - key->nrequired; key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry)); key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry)); j = 0; for (i = 0; i < key->nrequired; i++) key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]]; for (i = 0; i < key->nadditional; i++) key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]]; /* clean up after consistentFn calls (also frees entryIndexes) */ MemoryContextReset(so->tempCtx); } else { MemoryContextSwitchTo(so->keyCtx); key->nrequired = 1; key->nadditional = 0; key->requiredEntries = palloc(1 * sizeof(GinScanEntry)); key->requiredEntries[0] = key->scanEntry[0]; } MemoryContextSwitchTo(oldCtx); } static void startScan(IndexScanDesc scan) { GinScanOpaque so = (GinScanOpaque) scan->opaque; GinState *ginstate = &so->ginstate; uint32 i; for (i = 0; i < so->totalentries; i++) startScanEntry(ginstate, so->entries[i], scan->xs_snapshot); if (GinFuzzySearchLimit > 0) { /* * If all of keys more than threshold we will try to reduce result, we * hope (and only hope, for intersection operation of array our * supposition isn't true), that total result will not more than * minimal predictNumberResult. */ bool reduce = true; for (i = 0; i < so->totalentries; i++) { if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit) { reduce = false; break; } } if (reduce) { for (i = 0; i < so->totalentries; i++) { so->entries[i]->predictNumberResult /= so->totalentries; so->entries[i]->reduceResult = true; } } } /* * Now that we have the estimates for the entry frequencies, finish * initializing the scan keys. */ for (i = 0; i < so->nkeys; i++) startScanKey(ginstate, so, so->keys + i); } /* * Load the next batch of item pointers from a posting tree. * * Note that we copy the page into GinScanEntry->list array and unlock it, but * keep it pinned to prevent interference with vacuum. */ static void entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advancePast, Snapshot snapshot) { Page page; int i; bool stepright; if (!BufferIsValid(entry->buffer)) { entry->isFinished = true; return; } /* * We have two strategies for finding the correct page: step right from * the current page, or descend the tree again from the root. If * advancePast equals the current item, the next matching item should be * on the next page, so we step right. Otherwise, descend from root. */ if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0) { stepright = true; LockBuffer(entry->buffer, GIN_SHARE); } else { GinBtreeStack *stack; ReleaseBuffer(entry->buffer); /* * Set the search key, and find the correct leaf page. */ if (ItemPointerIsLossyPage(&advancePast)) { ItemPointerSet(&entry->btree.itemptr, GinItemPointerGetBlockNumber(&advancePast) + 1, FirstOffsetNumber); } else { ItemPointerSet(&entry->btree.itemptr, GinItemPointerGetBlockNumber(&advancePast), OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast))); } entry->btree.fullScan = false; stack = ginFindLeafPage(&entry->btree, true, false, snapshot); /* we don't need the stack, just the buffer. */ entry->buffer = stack->buffer; IncrBufferRefCount(entry->buffer); freeGinBtreeStack(stack); stepright = false; } elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d", GinItemPointerGetBlockNumber(&advancePast), GinItemPointerGetOffsetNumber(&advancePast), !stepright); page = BufferGetPage(entry->buffer); for (;;) { entry->offset = InvalidOffsetNumber; if (entry->list) { pfree(entry->list); entry->list = NULL; entry->nlist = 0; } if (stepright) { /* * We've processed all the entries on this page. If it was the * last page in the tree, we're done. */ if (GinPageRightMost(page)) { UnlockReleaseBuffer(entry->buffer); entry->buffer = InvalidBuffer; entry->isFinished = true; return; } /* * Step to next page, following the right link. then find the * first ItemPointer greater than advancePast. */ entry->buffer = ginStepRight(entry->buffer, ginstate->index, GIN_SHARE); page = BufferGetPage(entry->buffer); } stepright = true; if (GinPageGetOpaque(page)->flags & GIN_DELETED) continue; /* page was deleted by concurrent vacuum */ /* * The first item > advancePast might not be on this page, but * somewhere to the right, if the page was split, or a non-match from * another key in the query allowed us to skip some items from this * entry. Keep following the right-links until we re-find the correct * page. */ if (!GinPageRightMost(page) && ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0) { /* * the item we're looking is > the right bound of the page, so it * can't be on this page. */ continue; } entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast); for (i = 0; i < entry->nlist; i++) { if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0) { entry->offset = i; if (GinPageRightMost(page)) { /* after processing the copied items, we're done. */ UnlockReleaseBuffer(entry->buffer); entry->buffer = InvalidBuffer; } else LockBuffer(entry->buffer, GIN_UNLOCK); return; } } } } #define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE)) #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) ) /* * Sets entry->curItem to next heap item pointer > advancePast, for one entry * of one scan key, or sets entry->isFinished to true if there are no more. * * Item pointers are returned in ascending order. * * Note: this can return a "lossy page" item pointer, indicating that the * entry potentially matches all items on that heap page. However, it is * not allowed to return both a lossy page pointer and exact (regular) * item pointers for the same page. (Doing so would break the key-combination * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the * current implementation this is guaranteed by the behavior of tidbitmaps. */ static void entryGetItem(GinState *ginstate, GinScanEntry entry, ItemPointerData advancePast, Snapshot snapshot) { Assert(!entry->isFinished); Assert(!ItemPointerIsValid(&entry->curItem) || ginCompareItemPointers(&entry->curItem, &advancePast) <= 0); if (entry->matchBitmap) { /* A bitmap result */ BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast); OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast); bool gotitem = false; do { /* * If we've exhausted all items on this block, move to next block * in the bitmap. */ while (entry->matchResult == NULL || (entry->matchResult->ntuples >= 0 && entry->offset >= entry->matchResult->ntuples) || entry->matchResult->blockno < advancePastBlk || (ItemPointerIsLossyPage(&advancePast) && entry->matchResult->blockno == advancePastBlk)) { entry->matchResult = tbm_iterate(entry->matchIterator); if (entry->matchResult == NULL) { ItemPointerSetInvalid(&entry->curItem); tbm_end_iterate(entry->matchIterator); entry->matchIterator = NULL; entry->isFinished = true; break; } /* * Reset counter to the beginning of entry->matchResult. Note: * entry->offset is still greater than matchResult->ntuples if * matchResult is lossy. So, on next call we will get next * result from TIDBitmap. */ entry->offset = 0; } if (entry->isFinished) break; /* * We're now on the first page after advancePast which has any * items on it. If it's a lossy result, return that. */ if (entry->matchResult->ntuples < 0) { ItemPointerSetLossyPage(&entry->curItem, entry->matchResult->blockno); /* * We might as well fall out of the loop; we could not * estimate number of results on this page to support correct * reducing of result even if it's enabled. */ gotitem = true; break; } /* * Not a lossy page. Skip over any offsets <= advancePast, and * return that. */ if (entry->matchResult->blockno == advancePastBlk) { /* * First, do a quick check against the last offset on the * page. If that's > advancePast, so are all the other * offsets. */ if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff) { entry->offset = entry->matchResult->ntuples; continue; } /* Otherwise scan to find the first item > advancePast */ while (entry->matchResult->offsets[entry->offset] <= advancePastOff) entry->offset++; } ItemPointerSet(&entry->curItem, entry->matchResult->blockno, entry->matchResult->offsets[entry->offset]); entry->offset++; gotitem = true; } while (!gotitem || (entry->reduceResult == true && dropItem(entry))); } else if (!BufferIsValid(entry->buffer)) { /* * A posting list from an entry tuple, or the last page of a posting * tree. */ do { if (entry->offset >= entry->nlist) { ItemPointerSetInvalid(&entry->curItem); entry->isFinished = true; break; } entry->curItem = entry->list[entry->offset++]; } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0); /* XXX: shouldn't we apply the fuzzy search limit here? */ } else { /* A posting tree */ do { /* If we've processed the current batch, load more items */ while (entry->offset >= entry->nlist) { entryLoadMoreItems(ginstate, entry, advancePast, snapshot); if (entry->isFinished) { ItemPointerSetInvalid(&entry->curItem); return; } } entry->curItem = entry->list[entry->offset++]; } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 || (entry->reduceResult == true && dropItem(entry))); } } /* * Identify the "current" item among the input entry streams for this scan key * that is greater than advancePast, and test whether it passes the scan key * qual condition. * * The current item is the smallest curItem among the inputs. key->curItem * is set to that value. key->curItemMatches is set to indicate whether that * TID passes the consistentFn test. If so, key->recheckCurItem is set true * iff recheck is needed for this item pointer (including the case where the * item pointer is a lossy page pointer). * * If all entry streams are exhausted, sets key->isFinished to true. * * Item pointers must be returned in ascending order. * * Note: this can return a "lossy page" item pointer, indicating that the * key potentially matches all items on that heap page. However, it is * not allowed to return both a lossy page pointer and exact (regular) * item pointers for the same page. (Doing so would break the key-combination * logic in scanGetItem.) */ static void keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key, ItemPointerData advancePast, Snapshot snapshot) { ItemPointerData minItem; ItemPointerData curPageLossy; uint32 i; bool haveLossyEntry; GinScanEntry entry; GinTernaryValue res; MemoryContext oldCtx; bool allFinished; Assert(!key->isFinished); /* * We might have already tested this item; if so, no need to repeat work. * (Note: the ">" case can happen, if advancePast is exact but we * previously had to set curItem to a lossy-page pointer.) */ if (ginCompareItemPointers(&key->curItem, &advancePast) > 0) return; /* * Find the minimum item > advancePast among the active entry streams. * * Note: a lossy-page entry is encoded by a ItemPointer with max value for * offset (0xffff), so that it will sort after any exact entries for the * same page. So we'll prefer to return exact pointers not lossy * pointers, which is good. */ ItemPointerSetMax(&minItem); allFinished = true; for (i = 0; i < key->nrequired; i++) { entry = key->requiredEntries[i]; if (entry->isFinished) continue; /* * Advance this stream if necessary. * * In particular, since entry->curItem was initialized with * ItemPointerSetMin, this ensures we fetch the first item for each * entry on the first call. */ if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) { entryGetItem(ginstate, entry, advancePast, snapshot); if (entry->isFinished) continue; } allFinished = false; if (ginCompareItemPointers(&entry->curItem, &minItem) < 0) minItem = entry->curItem; } if (allFinished) { /* all entries are finished */ key->isFinished = true; return; } /* * Ok, we now know that there are no matches < minItem. * * If minItem is lossy, it means that there were no exact items on the * page among requiredEntries, because lossy pointers sort after exact * items. However, there might be exact items for the same page among * additionalEntries, so we mustn't advance past them. */ if (ItemPointerIsLossyPage(&minItem)) { if (GinItemPointerGetBlockNumber(&advancePast) < GinItemPointerGetBlockNumber(&minItem)) { ItemPointerSet(&advancePast, GinItemPointerGetBlockNumber(&minItem), InvalidOffsetNumber); } } else { Assert(GinItemPointerGetOffsetNumber(&minItem) > 0); ItemPointerSet(&advancePast, GinItemPointerGetBlockNumber(&minItem), OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem))); } /* * We might not have loaded all the entry streams for this TID yet. We * could call the consistent function, passing MAYBE for those entries, to * see if it can decide if this TID matches based on the information we * have. But if the consistent-function is expensive, and cannot in fact * decide with partial information, that could be a big loss. So, load all * the additional entries, before calling the consistent function. */ for (i = 0; i < key->nadditional; i++) { entry = key->additionalEntries[i]; if (entry->isFinished) continue; if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) { entryGetItem(ginstate, entry, advancePast, snapshot); if (entry->isFinished) continue; } /* * Normally, none of the items in additionalEntries can have a curItem * larger than minItem. But if minItem is a lossy page, then there * might be exact items on the same page among additionalEntries. */ if (ginCompareItemPointers(&entry->curItem, &minItem) < 0) { Assert(ItemPointerIsLossyPage(&minItem)); minItem = entry->curItem; } } /* * Ok, we've advanced all the entries up to minItem now. Set key->curItem, * and perform consistentFn test. * * Lossy-page entries pose a problem, since we don't know the correct * entryRes state to pass to the consistentFn, and we also don't know what * its combining logic will be (could be AND, OR, or even NOT). If the * logic is OR then the consistentFn might succeed for all items in the * lossy page even when none of the other entries match. * * Our strategy is to call the tri-state consistent function, with the * lossy-page entries set to MAYBE, and all the other entries FALSE. If it * returns FALSE, none of the lossy items alone are enough for a match, so * we don't need to return a lossy-page pointer. Otherwise, return a * lossy-page pointer to indicate that the whole heap page must be * checked. (On subsequent calls, we'll do nothing until minItem is past * the page altogether, thus ensuring that we never return both regular * and lossy pointers for the same page.) * * An exception is that it doesn't matter what we pass for lossy pointers * in "hidden" entries, because the consistentFn's result can't depend on * them. We could pass them as MAYBE as well, but if we're using the * "shim" implementation of a tri-state consistent function (see * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass * them as true. * * Note that only lossy-page entries pointing to the current item's page * should trigger this processing; we might have future lossy pages in the * entry array, but they aren't relevant yet. */ key->curItem = minItem; ItemPointerSetLossyPage(&curPageLossy, GinItemPointerGetBlockNumber(&key->curItem)); haveLossyEntry = false; for (i = 0; i < key->nentries; i++) { entry = key->scanEntry[i]; if (entry->isFinished == false && ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0) { if (i < key->nuserentries) key->entryRes[i] = GIN_MAYBE; else key->entryRes[i] = GIN_TRUE; haveLossyEntry = true; } else key->entryRes[i] = GIN_FALSE; } /* prepare for calling consistentFn in temp context */ oldCtx = MemoryContextSwitchTo(tempCtx); if (haveLossyEntry) { /* Have lossy-page entries, so see if whole page matches */ res = key->triConsistentFn(key); if (res == GIN_TRUE || res == GIN_MAYBE) { /* Yes, so clean up ... */ MemoryContextSwitchTo(oldCtx); MemoryContextReset(tempCtx); /* and return lossy pointer for whole page */ key->curItem = curPageLossy; key->curItemMatches = true; key->recheckCurItem = true; return; } } /* * At this point we know that we don't need to return a lossy whole-page * pointer, but we might have matches for individual exact item pointers, * possibly in combination with a lossy pointer. Pass lossy pointers as * MAYBE to the ternary consistent function, to let it decide if this * tuple satisfies the overall key, even though we don't know if the lossy * entries match. * * Prepare entryRes array to be passed to consistentFn. */ for (i = 0; i < key->nentries; i++) { entry = key->scanEntry[i]; if (entry->isFinished) key->entryRes[i] = GIN_FALSE; #if 0 /* * This case can't currently happen, because we loaded all the entries * for this item earlier. */ else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) key->entryRes[i] = GIN_MAYBE; #endif else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0) key->entryRes[i] = GIN_MAYBE; else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0) key->entryRes[i] = GIN_TRUE; else key->entryRes[i] = GIN_FALSE; } res = key->triConsistentFn(key); switch (res) { case GIN_TRUE: key->curItemMatches = true; /* triConsistentFn set recheckCurItem */ break; case GIN_FALSE: key->curItemMatches = false; break; case GIN_MAYBE: key->curItemMatches = true; key->recheckCurItem = true; break; default: /* * the 'default' case shouldn't happen, but if the consistent * function returns something bogus, this is the safe result */ key->curItemMatches = true; key->recheckCurItem = true; break; } /* * We have a tuple, and we know if it matches or not. If it's a non-match, * we could continue to find the next matching tuple, but let's break out * and give scanGetItem a chance to advance the other keys. They might be * able to skip past to a much higher TID, allowing us to save work. */ /* clean up after consistentFn calls */ MemoryContextSwitchTo(oldCtx); MemoryContextReset(tempCtx); } /* * Get next heap item pointer (after advancePast) from scan. * Returns true if anything found. * On success, *item and *recheck are set. * * Note: this is very nearly the same logic as in keyGetItem(), except * that we know the keys are to be combined with AND logic, whereas in * keyGetItem() the combination logic is known only to the consistentFn. */ static bool scanGetItem(IndexScanDesc scan, ItemPointerData advancePast, ItemPointerData *item, bool *recheck) { GinScanOpaque so = (GinScanOpaque) scan->opaque; uint32 i; bool match; /*---------- * Advance the scan keys in lock-step, until we find an item that matches * all the keys. If any key reports isFinished, meaning its subset of the * entries is exhausted, we can stop. Otherwise, set *item to the next * matching item. * * This logic works only if a keyGetItem stream can never contain both * exact and lossy pointers for the same page. Else we could have a * case like * * stream 1 stream 2 * ... ... * 42/6 42/7 * 50/1 42/0xffff * ... ... * * We would conclude that 42/6 is not a match and advance stream 1, * thus never detecting the match to the lossy pointer in stream 2. * (keyGetItem has a similar problem versus entryGetItem.) *---------- */ do { ItemPointerSetMin(item); match = true; for (i = 0; i < so->nkeys && match; i++) { GinScanKey key = so->keys + i; /* Fetch the next item for this key that is > advancePast. */ keyGetItem(&so->ginstate, so->tempCtx, key, advancePast, scan->xs_snapshot); if (key->isFinished) return false; /* * If it's not a match, we can immediately conclude that nothing * <= this item matches, without checking the rest of the keys. */ if (!key->curItemMatches) { advancePast = key->curItem; match = false; break; } /* * It's a match. We can conclude that nothing < matches, so the * other key streams can skip to this item. * * Beware of lossy pointers, though; from a lossy pointer, we can * only conclude that nothing smaller than this *block* matches. */ if (ItemPointerIsLossyPage(&key->curItem)) { if (GinItemPointerGetBlockNumber(&advancePast) < GinItemPointerGetBlockNumber(&key->curItem)) { ItemPointerSet(&advancePast, GinItemPointerGetBlockNumber(&key->curItem), InvalidOffsetNumber); } } else { Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0); ItemPointerSet(&advancePast, GinItemPointerGetBlockNumber(&key->curItem), OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem))); } /* * If this is the first key, remember this location as a potential * match, and proceed to check the rest of the keys. * * Otherwise, check if this is the same item that we checked the * previous keys for (or a lossy pointer for the same page). If * not, loop back to check the previous keys for this item (we * will check this key again too, but keyGetItem returns quickly * for that) */ if (i == 0) { *item = key->curItem; } else { if (ItemPointerIsLossyPage(&key->curItem) || ItemPointerIsLossyPage(item)) { Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item)); match = (GinItemPointerGetBlockNumber(&key->curItem) == GinItemPointerGetBlockNumber(item)); } else { Assert(ginCompareItemPointers(&key->curItem, item) >= 0); match = (ginCompareItemPointers(&key->curItem, item) == 0); } } } } while (!match); Assert(!ItemPointerIsMin(item)); /* * Now *item contains the first ItemPointer after previous result that * satisfied all the keys for that exact TID, or a lossy reference to the * same page. * * We must return recheck = true if any of the keys are marked recheck. */ *recheck = false; for (i = 0; i < so->nkeys; i++) { GinScanKey key = so->keys + i; if (key->recheckCurItem) { *recheck = true; break; } } return true; } /* * Functions for scanning the pending list */ /* * Get ItemPointer of next heap row to be checked from pending list. * Returns false if there are no more. On pages with several heap rows * it returns each row separately, on page with part of heap row returns * per page data. pos->firstOffset and pos->lastOffset are set to identify * the range of pending-list tuples belonging to this heap row. * * The pendingBuffer is presumed pinned and share-locked on entry, and is * pinned and share-locked on success exit. On failure exit it's released. */ static bool scanGetCandidate(IndexScanDesc scan, pendingPosition *pos) { OffsetNumber maxoff; Page page; IndexTuple itup; ItemPointerSetInvalid(&pos->item); for (;;) { page = BufferGetPage(pos->pendingBuffer); TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); maxoff = PageGetMaxOffsetNumber(page); if (pos->firstOffset > maxoff) { BlockNumber blkno = GinPageGetOpaque(page)->rightlink; if (blkno == InvalidBlockNumber) { UnlockReleaseBuffer(pos->pendingBuffer); pos->pendingBuffer = InvalidBuffer; return false; } else { /* * Here we must prevent deletion of next page by insertcleanup * process, which may be trying to obtain exclusive lock on * current page. So, we lock next page before releasing the * current one */ Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno); LockBuffer(tmpbuf, GIN_SHARE); UnlockReleaseBuffer(pos->pendingBuffer); pos->pendingBuffer = tmpbuf; pos->firstOffset = FirstOffsetNumber; } } else { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset)); pos->item = itup->t_tid; if (GinPageHasFullRow(page)) { /* * find itempointer to the next row */ for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++) { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset)); if (!ItemPointerEquals(&pos->item, &itup->t_tid)) break; } } else { /* * All itempointers are the same on this page */ pos->lastOffset = maxoff + 1; } /* * Now pos->firstOffset points to the first tuple of current heap * row, pos->lastOffset points to the first tuple of next heap row * (or to the end of page) */ break; } } return true; } /* * Scan pending-list page from current tuple (off) up till the first of: * - match is found (then returns true) * - no later match is possible * - tuple's attribute number is not equal to entry's attrnum * - reach end of page * * datum[]/category[]/datumExtracted[] arrays are used to cache the results * of gintuple_get_key() on the current page. */ static bool matchPartialInPendingList(GinState *ginstate, Page page, OffsetNumber off, OffsetNumber maxoff, GinScanEntry entry, Datum *datum, GinNullCategory *category, bool *datumExtracted) { IndexTuple itup; int32 cmp; /* Partial match to a null is not possible */ if (entry->queryCategory != GIN_CAT_NORM_KEY) return false; while (off < maxoff) { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); if (gintuple_get_attrnum(ginstate, itup) != entry->attnum) return false; if (datumExtracted[off - 1] == false) { datum[off - 1] = gintuple_get_key(ginstate, itup, &category[off - 1]); datumExtracted[off - 1] = true; } /* Once we hit nulls, no further match is possible */ if (category[off - 1] != GIN_CAT_NORM_KEY) return false; /*---------- * Check partial match. * case cmp == 0 => match * case cmp > 0 => not match and end scan (no later match possible) * case cmp < 0 => not match and continue scan *---------- */ cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1], ginstate->supportCollation[entry->attnum - 1], entry->queryKey, datum[off - 1], UInt16GetDatum(entry->strategy), PointerGetDatum(entry->extra_data))); if (cmp == 0) return true; else if (cmp > 0) return false; off++; } return false; } /* * Set up the entryRes array for each key by looking at * every entry for current heap row in pending list. * * Returns true if each scan key has at least one entryRes match. * This corresponds to the situations where the normal index search will * try to apply the key's consistentFn. (A tuple not meeting that requirement * cannot be returned by the normal search since no entry stream will * source its TID.) * * The pendingBuffer is presumed pinned and share-locked on entry. */ static bool collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos) { GinScanOpaque so = (GinScanOpaque) scan->opaque; OffsetNumber attrnum; Page page; IndexTuple itup; int i, j; /* * Reset all entryRes and hasMatchKey flags */ for (i = 0; i < so->nkeys; i++) { GinScanKey key = so->keys + i; memset(key->entryRes, GIN_FALSE, key->nentries); } memset(pos->hasMatchKey, false, so->nkeys); /* * Outer loop iterates over multiple pending-list pages when a single heap * row has entries spanning those pages. */ for (;;) { Datum datum[BLCKSZ / sizeof(IndexTupleData)]; GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)]; bool datumExtracted[BLCKSZ / sizeof(IndexTupleData)]; Assert(pos->lastOffset > pos->firstOffset); memset(datumExtracted + pos->firstOffset - 1, 0, sizeof(bool) * (pos->lastOffset - pos->firstOffset)); page = BufferGetPage(pos->pendingBuffer); TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); for (i = 0; i < so->nkeys; i++) { GinScanKey key = so->keys + i; for (j = 0; j < key->nentries; j++) { GinScanEntry entry = key->scanEntry[j]; OffsetNumber StopLow = pos->firstOffset, StopHigh = pos->lastOffset, StopMiddle; /* If already matched on earlier page, do no extra work */ if (key->entryRes[j]) continue; /* * Interesting tuples are from pos->firstOffset to * pos->lastOffset and they are ordered by (attnum, Datum) as * it's done in entry tree. So we can use binary search to * avoid linear scanning. */ while (StopLow < StopHigh) { int res; StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle)); attrnum = gintuple_get_attrnum(&so->ginstate, itup); if (key->attnum < attrnum) { StopHigh = StopMiddle; continue; } if (key->attnum > attrnum) { StopLow = StopMiddle + 1; continue; } if (datumExtracted[StopMiddle - 1] == false) { datum[StopMiddle - 1] = gintuple_get_key(&so->ginstate, itup, &category[StopMiddle - 1]); datumExtracted[StopMiddle - 1] = true; } if (entry->queryCategory == GIN_CAT_EMPTY_QUERY) { /* special behavior depending on searchMode */ if (entry->searchMode == GIN_SEARCH_MODE_ALL) { /* match anything except NULL_ITEM */ if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM) res = -1; else res = 0; } else { /* match everything */ res = 0; } } else { res = ginCompareEntries(&so->ginstate, entry->attnum, entry->queryKey, entry->queryCategory, datum[StopMiddle - 1], category[StopMiddle - 1]); } if (res == 0) { /* * Found exact match (there can be only one, except in * EMPTY_QUERY mode). * * If doing partial match, scan forward from here to * end of page to check for matches. * * See comment above about tuple's ordering. */ if (entry->isPartialMatch) key->entryRes[j] = matchPartialInPendingList(&so->ginstate, page, StopMiddle, pos->lastOffset, entry, datum, category, datumExtracted); else key->entryRes[j] = true; /* done with binary search */ break; } else if (res < 0) StopHigh = StopMiddle; else StopLow = StopMiddle + 1; } if (StopLow >= StopHigh && entry->isPartialMatch) { /* * No exact match on this page. If doing partial match, * scan from the first tuple greater than target value to * end of page. Note that since we don't remember whether * the comparePartialFn told us to stop early on a * previous page, we will uselessly apply comparePartialFn * to the first tuple on each subsequent page. */ key->entryRes[j] = matchPartialInPendingList(&so->ginstate, page, StopHigh, pos->lastOffset, entry, datum, category, datumExtracted); } pos->hasMatchKey[i] |= key->entryRes[j]; } } /* Advance firstOffset over the scanned tuples */ pos->firstOffset = pos->lastOffset; if (GinPageHasFullRow(page)) { /* * We have examined all pending entries for the current heap row. * Break out of loop over pages. */ break; } else { /* * Advance to next page of pending entries for the current heap * row. Complain if there isn't one. */ ItemPointerData item = pos->item; if (scanGetCandidate(scan, pos) == false || !ItemPointerEquals(&pos->item, &item)) elog(ERROR, "could not find additional pending pages for same heap tuple"); } } /* * Now return "true" if all scan keys have at least one matching datum */ for (i = 0; i < so->nkeys; i++) { if (pos->hasMatchKey[i] == false) return false; } return true; } /* * Collect all matched rows from pending list into bitmap. */ static void scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) { GinScanOpaque so = (GinScanOpaque) scan->opaque; MemoryContext oldCtx; bool recheck, match; int i; pendingPosition pos; Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO); Page page; BlockNumber blkno; *ntids = 0; /* * Acquire predicate lock on the metapage, to conflict with any fastupdate * insertions. */ PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot); LockBuffer(metabuffer, GIN_SHARE); page = BufferGetPage(metabuffer); TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); blkno = GinPageGetMeta(page)->head; /* * fetch head of list before unlocking metapage. head page must be pinned * to prevent deletion by vacuum process */ if (blkno == InvalidBlockNumber) { /* No pending list, so proceed with normal scan */ UnlockReleaseBuffer(metabuffer); return; } pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno); LockBuffer(pos.pendingBuffer, GIN_SHARE); pos.firstOffset = FirstOffsetNumber; UnlockReleaseBuffer(metabuffer); pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys); /* * loop for each heap row. scanGetCandidate returns full row or row's * tuples from first page. */ while (scanGetCandidate(scan, &pos)) { /* * Check entries in tuple and set up entryRes array. * * If pending tuples belonging to the current heap row are spread * across several pages, collectMatchesForHeapRow will read all of * those pages. */ if (!collectMatchesForHeapRow(scan, &pos)) continue; /* * Matching of entries of one row is finished, so check row using * consistent functions. */ oldCtx = MemoryContextSwitchTo(so->tempCtx); recheck = false; match = true; for (i = 0; i < so->nkeys; i++) { GinScanKey key = so->keys + i; if (!key->boolConsistentFn(key)) { match = false; break; } recheck |= key->recheckCurItem; } MemoryContextSwitchTo(oldCtx); MemoryContextReset(so->tempCtx); if (match) { tbm_add_tuples(tbm, &pos.item, 1, recheck); (*ntids)++; } } pfree(pos.hasMatchKey); } #define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes ) int64 gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { GinScanOpaque so = (GinScanOpaque) scan->opaque; int64 ntids; ItemPointerData iptr; bool recheck; /* * Set up the scan keys, and check for unsatisfiable query. */ ginFreeScanKeys(so); /* there should be no keys yet, but just to be * sure */ ginNewScanKey(scan); if (GinIsVoidRes(scan)) return 0; ntids = 0; /* * First, scan the pending list and collect any matching entries into the * bitmap. After we scan a pending item, some other backend could post it * into the main index, and so we might visit it a second time during the * main scan. This is okay because we'll just re-set the same bit in the * bitmap. (The possibility of duplicate visits is a major reason why GIN * can't support the amgettuple API, however.) Note that it would not do * to scan the main index before the pending list, since concurrent * cleanup could then make us miss entries entirely. */ scanPendingInsert(scan, tbm, &ntids); /* * Now scan the main index. */ startScan(scan); ItemPointerSetMin(&iptr); for (;;) { CHECK_FOR_INTERRUPTS(); if (!scanGetItem(scan, iptr, &iptr, &recheck)) break; if (ItemPointerIsLossyPage(&iptr)) tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr)); else tbm_add_tuples(tbm, &iptr, 1, recheck); ntids++; } return ntids; }