/*------------------------------------------------------------------------- * * freepage.c * Management of free memory pages. * * The intention of this code is to provide infrastructure for memory * allocators written specifically for PostgreSQL. At least in the case * of dynamic shared memory, we can't simply use malloc() or even * relatively thin wrappers like palloc() which sit on top of it, because * no allocator built into the operating system will deal with relative * pointers. In the future, we may find other cases in which greater * control over our own memory management seems desirable. * * A FreePageManager keeps track of which 4kB pages of memory are currently * unused from the point of view of some higher-level memory allocator. * Unlike a user-facing allocator such as palloc(), a FreePageManager can * only allocate and free in units of whole pages, and freeing an * allocation can only be done given knowledge of its length in pages. * * Since a free page manager has only a fixed amount of dedicated memory, * and since there is no underlying allocator, it uses the free pages * it is given to manage to store its bookkeeping data. It keeps multiple * freelists of runs of pages, sorted by the size of the run; the head of * each freelist is stored in the FreePageManager itself, and the first * page of each run contains a relative pointer to the next run. See * FreePageManagerGetInternal for more details on how the freelists are * managed. * * To avoid memory fragmentation, it's important to consolidate adjacent * spans of pages whenever possible; otherwise, large allocation requests * might not be satisfied even when sufficient contiguous space is * available. Therefore, in addition to the freelists, we maintain an * in-memory btree of free page ranges ordered by page number. If a * range being freed precedes or follows a range that is already free, * the existing range is extended; if it exactly bridges the gap between * free ranges, then the two existing ranges are consolidated with the * newly-freed range to form one great big range of free pages. * * When there is only one range of free pages, the btree is trivial and * is stored within the FreePageManager proper; otherwise, pages are * allocated from the area under management as needed. Even in cases * where memory fragmentation is very severe, only a tiny fraction of * the pages under management are consumed by this btree. * * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * src/backend/utils/mmgr/freepage.c * *------------------------------------------------------------------------- */ #include "postgres.h" #include "lib/stringinfo.h" #include "miscadmin.h" #include "utils/freepage.h" #include "utils/relptr.h" /* Magic numbers to identify various page types */ #define FREE_PAGE_SPAN_LEADER_MAGIC 0xea4020f0 #define FREE_PAGE_LEAF_MAGIC 0x98eae728 #define FREE_PAGE_INTERNAL_MAGIC 0x19aa32c9 /* Doubly linked list of spans of free pages; stored in first page of span. */ struct FreePageSpanLeader { int magic; /* always FREE_PAGE_SPAN_LEADER_MAGIC */ Size npages; /* number of pages in span */ RelptrFreePageSpanLeader prev; RelptrFreePageSpanLeader next; }; /* Common header for btree leaf and internal pages. 
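 * Both page types begin with this header; hdr.magic distinguishes them, and
 * hdr.nused counts the keys currently stored in the key array that follows
 * (see FreePageBtree below).  The parent uplink is a relative pointer, like
 * every other cross-page reference in this file, so the structures remain
 * valid no matter where the managed region is mapped.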
*/ typedef struct FreePageBtreeHeader { int magic; /* FREE_PAGE_LEAF_MAGIC or * FREE_PAGE_INTERNAL_MAGIC */ Size nused; /* number of items used */ RelptrFreePageBtree parent; /* uplink */ } FreePageBtreeHeader; /* Internal key; points to next level of btree. */ typedef struct FreePageBtreeInternalKey { Size first_page; /* low bound for keys on child page */ RelptrFreePageBtree child; /* downlink */ } FreePageBtreeInternalKey; /* Leaf key; no payload data. */ typedef struct FreePageBtreeLeafKey { Size first_page; /* first page in span */ Size npages; /* number of pages in span */ } FreePageBtreeLeafKey; /* Work out how many keys will fit on a page. */ #define FPM_ITEMS_PER_INTERNAL_PAGE \ ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \ sizeof(FreePageBtreeInternalKey)) #define FPM_ITEMS_PER_LEAF_PAGE \ ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \ sizeof(FreePageBtreeLeafKey)) /* A btree page of either sort */ struct FreePageBtree { FreePageBtreeHeader hdr; union { FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE]; FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE]; } u; }; /* Results of a btree search */ typedef struct FreePageBtreeSearchResult { FreePageBtree *page; Size index; bool found; unsigned split_pages; } FreePageBtreeSearchResult; /* Helper functions */ static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp); static Size FreePageBtreeCleanup(FreePageManager *fpm); static FreePageBtree *FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp); static FreePageBtree *FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp); static Size FreePageBtreeFirstKey(FreePageBtree *btp); static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm); static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index, Size first_page, FreePageBtree *child); static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page, Size npages); static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno); static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index); static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp); static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page, FreePageBtreeSearchResult *result); static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page); static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page); static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp); static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp); static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp, FreePageBtree *parent, int level, StringInfo buf); static void FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span, Size expected_pages, StringInfo buf); static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page); static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages, bool soft); static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno); static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages); static Size FreePageManagerLargestContiguous(FreePageManager *fpm); static void FreePageManagerUpdateLargest(FreePageManager *fpm); #ifdef FPM_EXTRA_ASSERTS static Size sum_free_pages(FreePageManager *fpm); #endif /* * Initialize a new, empty free page manager. 
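 *
 * A minimal usage sketch (illustrative only; the region layout and the names
 * "region" and "region_size" below are hypothetical caller-side choices, not
 * part of this API):
 *
 *		char	   *region = ...;		-- start of the managed memory
 *		Size		region_size = ...;	-- size in bytes, multiple of FPM_PAGE_SIZE
 *		FreePageManager *fpm = (FreePageManager *) region;
 *		Size		metadata_pages, first_page;
 *
 *		FreePageManagerInitialize(fpm, region);
 *		metadata_pages =
 *			(sizeof(FreePageManager) + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE;
 *		FreePageManagerPut(fpm, metadata_pages,
 *						   region_size / FPM_PAGE_SIZE - metadata_pages);
 *		if (FreePageManagerGet(fpm, 4, &first_page))
 *			-- first_page is the page number, relative to region, of a
 *			-- 4-page run now owned by the caller
 *
 * Pages handed to FreePageManagerPut must not overlap the FreePageManager
 * itself, which is why the sketch above skips the metadata pages.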
* * 'fpm' should reference caller-provided memory large enough to contain a * FreePageManager. We'll initialize it here. * * 'base' is the address to which all pointers are relative. When managing * a dynamic shared memory segment, it should normally be the base of the * segment. When managing backend-private memory, it can be either NULL or, * if managing a single contiguous extent of memory, the start of that extent. */ void FreePageManagerInitialize(FreePageManager *fpm, char *base) { Size f; relptr_store(base, fpm->self, fpm); relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL); fpm->btree_depth = 0; fpm->btree_recycle_count = 0; fpm->singleton_first_page = 0; fpm->singleton_npages = 0; fpm->contiguous_pages = 0; fpm->contiguous_pages_dirty = true; #ifdef FPM_EXTRA_ASSERTS fpm->free_pages = 0; #endif for (f = 0; f < FPM_NUM_FREELISTS; f++) relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL); } /* * Allocate a run of pages of the given length from the free page manager. * The return value indicates whether we were able to satisfy the request; * if true, the first page of the allocation is stored in *first_page. */ bool FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page) { bool result; Size contiguous_pages; result = FreePageManagerGetInternal(fpm, npages, first_page); /* * It's a bit counterintuitive, but allocating pages can actually create * opportunities for cleanup that create larger ranges. We might pull a * key out of the btree that enables the item at the head of the btree * recycle list to be inserted; and then if there are more items behind it * one of those might cause two currently-separated ranges to merge, * creating a single range of contiguous pages larger than any that * existed previously. It might be worth trying to improve the cleanup * algorithm to avoid such corner cases, but for now we just notice the * condition and do the appropriate reporting. */ contiguous_pages = FreePageBtreeCleanup(fpm); if (fpm->contiguous_pages < contiguous_pages) fpm->contiguous_pages = contiguous_pages; /* * FreePageManagerGetInternal may have set contiguous_pages_dirty. * Recompute contiguous_pages if so. */ FreePageManagerUpdateLargest(fpm); #ifdef FPM_EXTRA_ASSERTS if (result) { Assert(fpm->free_pages >= npages); fpm->free_pages -= npages; } Assert(fpm->free_pages == sum_free_pages(fpm)); Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm)); #endif return result; } #ifdef FPM_EXTRA_ASSERTS static void sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum) { char *base = fpm_segment_base(fpm); Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC || btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); ++*sum; if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) { Size index; for (index = 0; index < btp->hdr.nused; ++index) { FreePageBtree *child; child = relptr_access(base, btp->u.internal_key[index].child); sum_free_pages_recurse(fpm, child, sum); } } } static Size sum_free_pages(FreePageManager *fpm) { FreePageSpanLeader *recycle; char *base = fpm_segment_base(fpm); Size sum = 0; int list; /* Count the spans by scanning the freelists. */ for (list = 0; list < FPM_NUM_FREELISTS; ++list) { if (!relptr_is_null(fpm->freelist[list])) { FreePageSpanLeader *candidate = relptr_access(base, fpm->freelist[list]); do { sum += candidate->npages; candidate = relptr_access(base, candidate->next); } while (candidate != NULL); } } /* Count btree internal pages. 
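	 * (The recursion below visits every btree node, leaves included; each
	 * node occupies exactly one managed page, so each visit adds one.)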
*/ if (fpm->btree_depth > 0) { FreePageBtree *root = relptr_access(base, fpm->btree_root); sum_free_pages_recurse(fpm, root, &sum); } /* Count the recycle list. */ for (recycle = relptr_access(base, fpm->btree_recycle); recycle != NULL; recycle = relptr_access(base, recycle->next)) { Assert(recycle->npages == 1); ++sum; } return sum; } #endif /* * Compute the size of the largest run of pages that the user could * successfully get. */ static Size FreePageManagerLargestContiguous(FreePageManager *fpm) { char *base; Size largest; base = fpm_segment_base(fpm); largest = 0; if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1])) { FreePageSpanLeader *candidate; candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]); do { if (candidate->npages > largest) largest = candidate->npages; candidate = relptr_access(base, candidate->next); } while (candidate != NULL); } else { Size f = FPM_NUM_FREELISTS - 1; do { --f; if (!relptr_is_null(fpm->freelist[f])) { largest = f + 1; break; } } while (f > 0); } return largest; } /* * Recompute the size of the largest run of pages that the user could * successfully get, if it has been marked dirty. */ static void FreePageManagerUpdateLargest(FreePageManager *fpm) { if (!fpm->contiguous_pages_dirty) return; fpm->contiguous_pages = FreePageManagerLargestContiguous(fpm); fpm->contiguous_pages_dirty = false; } /* * Transfer a run of pages to the free page manager. */ void FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages) { Size contiguous_pages; Assert(npages > 0); /* Record the new pages. */ contiguous_pages = FreePageManagerPutInternal(fpm, first_page, npages, false); /* * If the new range we inserted into the page manager was contiguous with * an existing range, it may have opened up cleanup opportunities. */ if (contiguous_pages > npages) { Size cleanup_contiguous_pages; cleanup_contiguous_pages = FreePageBtreeCleanup(fpm); if (cleanup_contiguous_pages > contiguous_pages) contiguous_pages = cleanup_contiguous_pages; } /* See if we now have a new largest chunk. */ if (fpm->contiguous_pages < contiguous_pages) fpm->contiguous_pages = contiguous_pages; /* * The earlier call to FreePageManagerPutInternal may have set * contiguous_pages_dirty if it needed to allocate internal pages, so * recompute contiguous_pages if necessary. */ FreePageManagerUpdateLargest(fpm); #ifdef FPM_EXTRA_ASSERTS fpm->free_pages += npages; Assert(fpm->free_pages == sum_free_pages(fpm)); Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm)); #endif } /* * Produce a debugging dump of the state of a free page manager. */ char * FreePageManagerDump(FreePageManager *fpm) { char *base = fpm_segment_base(fpm); StringInfoData buf; FreePageSpanLeader *recycle; bool dumped_any_freelist = false; Size f; /* Initialize output buffer. */ initStringInfo(&buf); /* Dump general stuff. */ appendStringInfo(&buf, "metadata: self %zu max contiguous pages = %zu\n", fpm->self.relptr_off, fpm->contiguous_pages); /* Dump btree. */ if (fpm->btree_depth > 0) { FreePageBtree *root; appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth); root = relptr_access(base, fpm->btree_root); FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf); } else if (fpm->singleton_npages > 0) { appendStringInfo(&buf, "singleton: %zu(%zu)\n", fpm->singleton_first_page, fpm->singleton_npages); } /* Dump btree recycle list. 
*/ recycle = relptr_access(base, fpm->btree_recycle); if (recycle != NULL) { appendStringInfoString(&buf, "btree recycle:"); FreePageManagerDumpSpans(fpm, recycle, 1, &buf); } /* Dump free lists. */ for (f = 0; f < FPM_NUM_FREELISTS; ++f) { FreePageSpanLeader *span; if (relptr_is_null(fpm->freelist[f])) continue; if (!dumped_any_freelist) { appendStringInfoString(&buf, "freelists:\n"); dumped_any_freelist = true; } appendStringInfo(&buf, " %zu:", f + 1); span = relptr_access(base, fpm->freelist[f]); FreePageManagerDumpSpans(fpm, span, f + 1, &buf); } /* And return result to caller. */ return buf.data; } /* * The first_page value stored at index zero in any non-root page must match * the first_page value stored in its parent at the index which points to that * page. So when the value stored at index zero in a btree page changes, we've * got to walk up the tree adjusting ancestor keys until we reach an ancestor * where that key isn't index zero. This function should be called after * updating the first key on the target page; it will propagate the change * upward as far as needed. * * We assume here that the first key on the page has not changed enough to * require changes in the ordering of keys on its ancestor pages. Thus, * if we search the parent page for the first key greater than or equal to * the first key on the current page, the downlink to this page will be either * the exact index returned by the search (if the first key decreased) * or one less (if the first key increased). */ static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp) { char *base = fpm_segment_base(fpm); Size first_page; FreePageBtree *parent; FreePageBtree *child; /* This might be either a leaf or an internal page. */ Assert(btp->hdr.nused > 0); if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) { Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE); first_page = btp->u.leaf_key[0].first_page; } else { Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE); first_page = btp->u.internal_key[0].first_page; } child = btp; /* Loop until we find an ancestor that does not require adjustment. */ for (;;) { Size s; parent = relptr_access(base, child->hdr.parent); if (parent == NULL) break; s = FreePageBtreeSearchInternal(parent, first_page); /* Key is either at index s or index s-1; figure out which. */ if (s >= parent->hdr.nused) { Assert(s == parent->hdr.nused); --s; } else { FreePageBtree *check; check = relptr_access(base, parent->u.internal_key[s].child); if (check != child) { Assert(s > 0); --s; } } #ifdef USE_ASSERT_CHECKING /* Debugging double-check. */ { FreePageBtree *check; check = relptr_access(base, parent->u.internal_key[s].child); Assert(s < parent->hdr.nused); Assert(child == check); } #endif /* Update the parent key. */ parent->u.internal_key[s].first_page = first_page; /* * If this is the first key in the parent, go up another level; else * done. */ if (s > 0) break; child = parent; } } /* * Attempt to reclaim space from the free-page btree. The return value is * the largest range of contiguous pages created by the cleanup operation. */ static Size FreePageBtreeCleanup(FreePageManager *fpm) { char *base = fpm_segment_base(fpm); Size max_contiguous_pages = 0; /* Attempt to shrink the depth of the btree. */ while (!relptr_is_null(fpm->btree_root)) { FreePageBtree *root = relptr_access(base, fpm->btree_root); /* If the root contains only one key, reduce depth by one. */ if (root->hdr.nused == 1) { /* Shrink depth of tree by one. 
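			 * A root with only one key carries no information beyond the key
			 * itself: a leaf root is folded back into the singleton
			 * representation, while an internal root is replaced by its only
			 * child.  Either way the old root page is recycled below.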
*/ Assert(fpm->btree_depth > 0); --fpm->btree_depth; if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC) { /* If root is a leaf, convert only entry to singleton range. */ relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); fpm->singleton_first_page = root->u.leaf_key[0].first_page; fpm->singleton_npages = root->u.leaf_key[0].npages; } else { FreePageBtree *newroot; /* If root is an internal page, make only child the root. */ Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); relptr_copy(fpm->btree_root, root->u.internal_key[0].child); newroot = relptr_access(base, fpm->btree_root); relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL); } FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root)); } else if (root->hdr.nused == 2 && root->hdr.magic == FREE_PAGE_LEAF_MAGIC) { Size end_of_first; Size start_of_second; end_of_first = root->u.leaf_key[0].first_page + root->u.leaf_key[0].npages; start_of_second = root->u.leaf_key[1].first_page; if (end_of_first + 1 == start_of_second) { Size root_page = fpm_pointer_to_page(base, root); if (end_of_first == root_page) { FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page); FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page); fpm->singleton_first_page = root->u.leaf_key[0].first_page; fpm->singleton_npages = root->u.leaf_key[0].npages + root->u.leaf_key[1].npages + 1; fpm->btree_depth = 0; relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); FreePagePushSpanLeader(fpm, fpm->singleton_first_page, fpm->singleton_npages); Assert(max_contiguous_pages == 0); max_contiguous_pages = fpm->singleton_npages; } } /* Whether it worked or not, it's time to stop. */ break; } else { /* Nothing more to do. Stop. */ break; } } /* * Attempt to free recycled btree pages. We skip this if releasing the * recycled page would require a btree page split, because the page we're * trying to recycle would be consumed by the split, which would be * counterproductive. * * We also currently only ever attempt to recycle the first page on the * list; that could be made more aggressive, but it's not clear that the * complexity would be worthwhile. */ while (fpm->btree_recycle_count > 0) { FreePageBtree *btp; Size first_page; Size contiguous_pages; btp = FreePageBtreeGetRecycled(fpm); first_page = fpm_pointer_to_page(base, btp); contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true); if (contiguous_pages == 0) { FreePageBtreeRecycle(fpm, first_page); break; } else { if (contiguous_pages > max_contiguous_pages) max_contiguous_pages = contiguous_pages; } } return max_contiguous_pages; } /* * Consider consolidating the given page with its left or right sibling, * if it's fairly empty. */ static void FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp) { char *base = fpm_segment_base(fpm); FreePageBtree *np; Size max; /* * We only try to consolidate pages that are less than a third full. We * could be more aggressive about this, but that might risk performing * consolidation only to end up splitting again shortly thereafter. Since * the btree should be very small compared to the space under management, * our goal isn't so much to ensure that it always occupies the absolutely * smallest possible number of pages as to reclaim pages before things get * too egregiously out of hand. 
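	 *
	 * (For scale: with 4kB pages and 16-byte keys -- the sizes on a typical
	 * 64-bit build -- a page holds a bit over 250 keys, so we only consider
	 * consolidating pages holding fewer than roughly 85.)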
*/ if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) max = FPM_ITEMS_PER_LEAF_PAGE; else { Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); max = FPM_ITEMS_PER_INTERNAL_PAGE; } if (btp->hdr.nused >= max / 3) return; /* * If we can fit our right sibling's keys onto this page, consolidate. */ np = FreePageBtreeFindRightSibling(base, btp); if (np != NULL && btp->hdr.nused + np->hdr.nused <= max) { if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) { memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0], sizeof(FreePageBtreeLeafKey) * np->hdr.nused); btp->hdr.nused += np->hdr.nused; } else { memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0], sizeof(FreePageBtreeInternalKey) * np->hdr.nused); btp->hdr.nused += np->hdr.nused; FreePageBtreeUpdateParentPointers(base, btp); } FreePageBtreeRemovePage(fpm, np); return; } /* * If we can fit our keys onto our left sibling's page, consolidate. In * this case, we move our keys onto the other page rather than vice versa, * to avoid having to adjust ancestor keys. */ np = FreePageBtreeFindLeftSibling(base, btp); if (np != NULL && btp->hdr.nused + np->hdr.nused <= max) { if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) { memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0], sizeof(FreePageBtreeLeafKey) * btp->hdr.nused); np->hdr.nused += btp->hdr.nused; } else { memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0], sizeof(FreePageBtreeInternalKey) * btp->hdr.nused); np->hdr.nused += btp->hdr.nused; FreePageBtreeUpdateParentPointers(base, np); } FreePageBtreeRemovePage(fpm, btp); return; } } /* * Find the passed page's left sibling; that is, the page at the same level * of the tree whose keyspace immediately precedes ours. */ static FreePageBtree * FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp) { FreePageBtree *p = btp; int levels = 0; /* Move up until we can move left. */ for (;;) { Size first_page; Size index; first_page = FreePageBtreeFirstKey(p); p = relptr_access(base, p->hdr.parent); if (p == NULL) return NULL; /* we were passed the rightmost page */ index = FreePageBtreeSearchInternal(p, first_page); if (index > 0) { Assert(p->u.internal_key[index].first_page == first_page); p = relptr_access(base, p->u.internal_key[index - 1].child); break; } Assert(index == 0); ++levels; } /* Descend left. */ while (levels > 0) { Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child); --levels; } Assert(p->hdr.magic == btp->hdr.magic); return p; } /* * Find the passed page's right sibling; that is, the page at the same level * of the tree whose keyspace immediately follows ours. */ static FreePageBtree * FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp) { FreePageBtree *p = btp; int levels = 0; /* Move up until we can move right. */ for (;;) { Size first_page; Size index; first_page = FreePageBtreeFirstKey(p); p = relptr_access(base, p->hdr.parent); if (p == NULL) return NULL; /* we were passed the rightmost page */ index = FreePageBtreeSearchInternal(p, first_page); if (index < p->hdr.nused - 1) { Assert(p->u.internal_key[index].first_page == first_page); p = relptr_access(base, p->u.internal_key[index + 1].child); break; } Assert(index == p->hdr.nused - 1); ++levels; } /* Descend left. */ while (levels > 0) { Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); p = relptr_access(base, p->u.internal_key[0].child); --levels; } Assert(p->hdr.magic == btp->hdr.magic); return p; } /* * Get the first key on a btree page. 
*/ static Size FreePageBtreeFirstKey(FreePageBtree *btp) { Assert(btp->hdr.nused > 0); if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) return btp->u.leaf_key[0].first_page; else { Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); return btp->u.internal_key[0].first_page; } } /* * Get a page from the btree recycle list for use as a btree page. */ static FreePageBtree * FreePageBtreeGetRecycled(FreePageManager *fpm) { char *base = fpm_segment_base(fpm); FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle); FreePageSpanLeader *newhead; Assert(victim != NULL); newhead = relptr_access(base, victim->next); if (newhead != NULL) relptr_copy(newhead->prev, victim->prev); relptr_store(base, fpm->btree_recycle, newhead); Assert(fpm_pointer_is_page_aligned(base, victim)); fpm->btree_recycle_count--; return (FreePageBtree *) victim; } /* * Insert an item into an internal page. */ static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index, Size first_page, FreePageBtree *child) { Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE); Assert(index <= btp->hdr.nused); memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index], sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index)); btp->u.internal_key[index].first_page = first_page; relptr_store(base, btp->u.internal_key[index].child, child); ++btp->hdr.nused; } /* * Insert an item into a leaf page. */ static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page, Size npages) { Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE); Assert(index <= btp->hdr.nused); memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index], sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index)); btp->u.leaf_key[index].first_page = first_page; btp->u.leaf_key[index].npages = npages; ++btp->hdr.nused; } /* * Put a page on the btree recycle list. */ static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno) { char *base = fpm_segment_base(fpm); FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle); FreePageSpanLeader *span; span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno); span->magic = FREE_PAGE_SPAN_LEADER_MAGIC; span->npages = 1; relptr_store(base, span->next, head); relptr_store(base, span->prev, (FreePageSpanLeader *) NULL); if (head != NULL) relptr_store(base, head->prev, span); relptr_store(base, fpm->btree_recycle, span); fpm->btree_recycle_count++; } /* * Remove an item from the btree at the given position on the given page. */ static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index) { Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); Assert(index < btp->hdr.nused); /* When last item is removed, extirpate entire page from btree. */ if (btp->hdr.nused == 1) { FreePageBtreeRemovePage(fpm, btp); return; } /* Physically remove the key from the page. */ --btp->hdr.nused; if (index < btp->hdr.nused) memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1], sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index)); /* If we just removed the first key, adjust ancestor keys. */ if (index == 0) FreePageBtreeAdjustAncestorKeys(fpm, btp); /* Consider whether to consolidate this page with a sibling. */ FreePageBtreeConsolidate(fpm, btp); } /* * Remove a page from the btree. Caller is responsible for having relocated * any keys from this page that are still wanted. The page is placed on the * recycled list. 
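 *
 * If removing this page leaves its parent with no other children, the
 * removal cascades upward; removing the last page of the whole tree resets
 * the btree to the empty state.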
*/ static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp) { char *base = fpm_segment_base(fpm); FreePageBtree *parent; Size index; Size first_page; for (;;) { /* Find parent page. */ parent = relptr_access(base, btp->hdr.parent); if (parent == NULL) { /* We are removing the root page. */ relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); fpm->btree_depth = 0; Assert(fpm->singleton_first_page == 0); Assert(fpm->singleton_npages == 0); return; } /* * If the parent contains only one item, we need to remove it as well. */ if (parent->hdr.nused > 1) break; FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp)); btp = parent; } /* Find and remove the downlink. */ first_page = FreePageBtreeFirstKey(btp); if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC) { index = FreePageBtreeSearchLeaf(parent, first_page); Assert(index < parent->hdr.nused); if (index < parent->hdr.nused - 1) memmove(&parent->u.leaf_key[index], &parent->u.leaf_key[index + 1], sizeof(FreePageBtreeLeafKey) * (parent->hdr.nused - index - 1)); } else { index = FreePageBtreeSearchInternal(parent, first_page); Assert(index < parent->hdr.nused); if (index < parent->hdr.nused - 1) memmove(&parent->u.internal_key[index], &parent->u.internal_key[index + 1], sizeof(FreePageBtreeInternalKey) * (parent->hdr.nused - index - 1)); } parent->hdr.nused--; Assert(parent->hdr.nused > 0); /* Recycle the page. */ FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp)); /* Adjust ancestor keys if needed. */ if (index == 0) FreePageBtreeAdjustAncestorKeys(fpm, parent); /* Consider whether to consolidate the parent with a sibling. */ FreePageBtreeConsolidate(fpm, parent); } /* * Search the btree for an entry for the given first page and initialize * *result with the results of the search. result->page and result->index * indicate either the position of an exact match or the position at which * the new key should be inserted. result->found is true for an exact match, * otherwise false. result->split_pages will contain the number of additional * btree pages that will be needed when performing a split to insert a key. * Except as described above, the contents of fields in the result object are * undefined on return. */ static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page, FreePageBtreeSearchResult *result) { char *base = fpm_segment_base(fpm); FreePageBtree *btp = relptr_access(base, fpm->btree_root); Size index; result->split_pages = 1; /* If the btree is empty, there's nothing to find. */ if (btp == NULL) { result->page = NULL; result->found = false; return; } /* Descend until we hit a leaf. */ while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) { FreePageBtree *child; bool found_exact; index = FreePageBtreeSearchInternal(btp, first_page); found_exact = index < btp->hdr.nused && btp->u.internal_key[index].first_page == first_page; /* * If we found an exact match we descend directly. Otherwise, we * descend into the child to the left if possible so that we can find * the insertion point at that child's high end. */ if (!found_exact && index > 0) --index; /* Track required split depth for leaf insert. */ if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE) { Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE); result->split_pages++; } else result->split_pages = 0; /* Descend to appropriate child page. 
*/ Assert(index < btp->hdr.nused); child = relptr_access(base, btp->u.internal_key[index].child); Assert(relptr_access(base, child->hdr.parent) == btp); btp = child; } /* Track required split depth for leaf insert. */ if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE) { Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE); result->split_pages++; } else result->split_pages = 0; /* Search leaf page. */ index = FreePageBtreeSearchLeaf(btp, first_page); /* Assemble results. */ result->page = btp; result->index = index; result->found = index < btp->hdr.nused && first_page == btp->u.leaf_key[index].first_page; } /* * Search an internal page for the first key greater than or equal to a given * page number. Returns the index of that key, or one greater than the number * of keys on the page if none. */ static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page) { Size low = 0; Size high = btp->hdr.nused; Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE); while (low < high) { Size mid = (low + high) / 2; Size val = btp->u.internal_key[mid].first_page; if (first_page == val) return mid; else if (first_page < val) high = mid; else low = mid + 1; } return low; } /* * Search a leaf page for the first key greater than or equal to a given * page number. Returns the index of that key, or one greater than the number * of keys on the page if none. */ static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page) { Size low = 0; Size high = btp->hdr.nused; Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE); while (low < high) { Size mid = (low + high) / 2; Size val = btp->u.leaf_key[mid].first_page; if (first_page == val) return mid; else if (first_page < val) high = mid; else low = mid + 1; } return low; } /* * Allocate a new btree page and move half the keys from the provided page * to the new page. Caller is responsible for making sure that there's a * page available from fpm->btree_recycle. Returns a pointer to the new page, * to which caller must add a downlink. */ static FreePageBtree * FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp) { FreePageBtree *newsibling; newsibling = FreePageBtreeGetRecycled(fpm); newsibling->hdr.magic = btp->hdr.magic; newsibling->hdr.nused = btp->hdr.nused / 2; relptr_copy(newsibling->hdr.parent, btp->hdr.parent); btp->hdr.nused -= newsibling->hdr.nused; if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) memcpy(&newsibling->u.leaf_key, &btp->u.leaf_key[btp->hdr.nused], sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused); else { Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); memcpy(&newsibling->u.internal_key, &btp->u.internal_key[btp->hdr.nused], sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused); FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling); } return newsibling; } /* * When internal pages are split or merged, the parent pointers of their * children must be updated. */ static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp) { Size i; Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); for (i = 0; i < btp->hdr.nused; ++i) { FreePageBtree *child; child = relptr_access(base, btp->u.internal_key[i].child); relptr_store(base, child->hdr.parent, btp); } } /* * Debugging dump of btree data. 
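 *
 * Each output line shows "pageno@level" plus 'i' or 'l' for internal or
 * leaf, followed by key->childpage pairs (internal pages) or
 * first_page(npages) pairs (leaf pages).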
*/ static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp, FreePageBtree *parent, int level, StringInfo buf) { char *base = fpm_segment_base(fpm); Size pageno = fpm_pointer_to_page(base, btp); Size index; FreePageBtree *check_parent; check_stack_depth(); check_parent = relptr_access(base, btp->hdr.parent); appendStringInfo(buf, " %zu@%d %c", pageno, level, btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 'i' : 'l'); if (parent != check_parent) appendStringInfo(buf, " [actual parent %zu, expected %zu]", fpm_pointer_to_page(base, check_parent), fpm_pointer_to_page(base, parent)); appendStringInfoChar(buf, ':'); for (index = 0; index < btp->hdr.nused; ++index) { if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) appendStringInfo(buf, " %zu->%zu", btp->u.internal_key[index].first_page, btp->u.internal_key[index].child.relptr_off / FPM_PAGE_SIZE); else appendStringInfo(buf, " %zu(%zu)", btp->u.leaf_key[index].first_page, btp->u.leaf_key[index].npages); } appendStringInfoChar(buf, '\n'); if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) { for (index = 0; index < btp->hdr.nused; ++index) { FreePageBtree *child; child = relptr_access(base, btp->u.internal_key[index].child); FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf); } } } /* * Debugging dump of free-span data. */ static void FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span, Size expected_pages, StringInfo buf) { char *base = fpm_segment_base(fpm); while (span != NULL) { if (span->npages != expected_pages) appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span), span->npages); else appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span)); span = relptr_access(base, span->next); } appendStringInfoChar(buf, '\n'); } /* * This function allocates a run of pages of the given length from the free * page manager. */ static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page) { char *base = fpm_segment_base(fpm); FreePageSpanLeader *victim = NULL; FreePageSpanLeader *prev; FreePageSpanLeader *next; FreePageBtreeSearchResult result; Size victim_page = 0; /* placate compiler */ Size f; /* * Search for a free span. * * Right now, we use a simple best-fit policy here, but it's possible for * this to result in memory fragmentation if we're repeatedly asked to * allocate chunks just a little smaller than what we have available. * Hopefully, this is unlikely, because we expect most requests to be * single pages or superblock-sized chunks -- but no policy can be optimal * under all circumstances unless it has knowledge of future allocation * patterns. */ for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f) { /* Skip empty freelists. */ if (relptr_is_null(fpm->freelist[f])) continue; /* * All of the freelists except the last one contain only items of a * single size, so we just take the first one. But the final free * list contains everything too big for any of the other lists, so we * need to search the list. */ if (f < FPM_NUM_FREELISTS - 1) victim = relptr_access(base, fpm->freelist[f]); else { FreePageSpanLeader *candidate; candidate = relptr_access(base, fpm->freelist[f]); do { if (candidate->npages >= npages && (victim == NULL || victim->npages > candidate->npages)) { victim = candidate; if (victim->npages == npages) break; } candidate = relptr_access(base, candidate->next); } while (candidate != NULL); } break; } /* If we didn't find an allocatable span, return failure. */ if (victim == NULL) return false; /* Remove span from free list. 
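	 * (The FreePageSpanLeader lives in the first page of the victim span
	 * itself, so unlinking it touches only pages that are currently free.)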
*/ Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC); prev = relptr_access(base, victim->prev); next = relptr_access(base, victim->next); if (prev != NULL) relptr_copy(prev->next, victim->next); else relptr_copy(fpm->freelist[f], victim->next); if (next != NULL) relptr_copy(next->prev, victim->prev); victim_page = fpm_pointer_to_page(base, victim); /* Decide whether we might be invalidating contiguous_pages. */ if (f == FPM_NUM_FREELISTS - 1 && victim->npages == fpm->contiguous_pages) { /* * The victim span came from the oversized freelist, and had the same * size as the longest span. There may or may not be another one of * the same size, so contiguous_pages must be recomputed just to be * safe. */ fpm->contiguous_pages_dirty = true; } else if (f + 1 == fpm->contiguous_pages && relptr_is_null(fpm->freelist[f])) { /* * The victim span came from a fixed sized freelist, and it was the * list for spans of the same size as the current longest span, and * the list is now empty after removing the victim. So * contiguous_pages must be recomputed without a doubt. */ fpm->contiguous_pages_dirty = true; } /* * If we haven't initialized the btree yet, the victim must be the single * span stored within the FreePageManager itself. Otherwise, we need to * update the btree. */ if (relptr_is_null(fpm->btree_root)) { Assert(victim_page == fpm->singleton_first_page); Assert(victim->npages == fpm->singleton_npages); Assert(victim->npages >= npages); fpm->singleton_first_page += npages; fpm->singleton_npages -= npages; if (fpm->singleton_npages > 0) FreePagePushSpanLeader(fpm, fpm->singleton_first_page, fpm->singleton_npages); } else { /* * If the span we found is exactly the right size, remove it from the * btree completely. Otherwise, adjust the btree entry to reflect the * still-unallocated portion of the span, and put that portion on the * appropriate free list. */ FreePageBtreeSearch(fpm, victim_page, &result); Assert(result.found); if (victim->npages == npages) FreePageBtreeRemove(fpm, result.page, result.index); else { FreePageBtreeLeafKey *key; /* Adjust btree to reflect remaining pages. */ Assert(victim->npages > npages); key = &result.page->u.leaf_key[result.index]; Assert(key->npages == victim->npages); key->first_page += npages; key->npages -= npages; if (result.index == 0) FreePageBtreeAdjustAncestorKeys(fpm, result.page); /* Put the unallocated pages back on the appropriate free list. */ FreePagePushSpanLeader(fpm, victim_page + npages, victim->npages - npages); } } /* Return results to caller. */ *first_page = fpm_pointer_to_page(base, victim); return true; } /* * Put a range of pages into the btree and freelists, consolidating it with * existing free spans just before and/or after it. If 'soft' is true, * only perform the insertion if it can be done without allocating new btree * pages; if false, do it always. Returns 0 if the soft flag caused the * insertion to be skipped, or otherwise the size of the contiguous span * created by the insertion. This may be larger than npages if we're able * to consolidate with an adjacent range. */ static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages, bool soft) { char *base = fpm_segment_base(fpm); FreePageBtreeSearchResult result; FreePageBtreeLeafKey *prevkey = NULL; FreePageBtreeLeafKey *nextkey = NULL; FreePageBtree *np; Size nindex; Assert(npages > 0); /* We can store a single free span without initializing the btree. 
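	 * The three easy cases below cover an empty manager and a new span that
	 * immediately follows or precedes the existing singleton; anything else
	 * forces us to build the btree.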
*/ if (fpm->btree_depth == 0) { if (fpm->singleton_npages == 0) { /* Don't have a span yet; store this one. */ fpm->singleton_first_page = first_page; fpm->singleton_npages = npages; FreePagePushSpanLeader(fpm, first_page, npages); return fpm->singleton_npages; } else if (fpm->singleton_first_page + fpm->singleton_npages == first_page) { /* New span immediately follows sole existing span. */ fpm->singleton_npages += npages; FreePagePopSpanLeader(fpm, fpm->singleton_first_page); FreePagePushSpanLeader(fpm, fpm->singleton_first_page, fpm->singleton_npages); return fpm->singleton_npages; } else if (first_page + npages == fpm->singleton_first_page) { /* New span immediately precedes sole existing span. */ FreePagePopSpanLeader(fpm, fpm->singleton_first_page); fpm->singleton_first_page = first_page; fpm->singleton_npages += npages; FreePagePushSpanLeader(fpm, fpm->singleton_first_page, fpm->singleton_npages); return fpm->singleton_npages; } else { /* Not contiguous; we need to initialize the btree. */ Size root_page; FreePageBtree *root; if (!relptr_is_null(fpm->btree_recycle)) root = FreePageBtreeGetRecycled(fpm); else if (soft) return 0; /* Should not allocate if soft. */ else if (FreePageManagerGetInternal(fpm, 1, &root_page)) root = (FreePageBtree *) fpm_page_to_pointer(base, root_page); else { /* We'd better be able to get a page from the existing range. */ elog(FATAL, "free page manager btree is corrupt"); } /* Create the btree and move the preexisting range into it. */ root->hdr.magic = FREE_PAGE_LEAF_MAGIC; root->hdr.nused = 1; relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL); root->u.leaf_key[0].first_page = fpm->singleton_first_page; root->u.leaf_key[0].npages = fpm->singleton_npages; relptr_store(base, fpm->btree_root, root); fpm->singleton_first_page = 0; fpm->singleton_npages = 0; fpm->btree_depth = 1; /* * Corner case: it may be that the btree root took the very last * free page. In that case, the sole btree entry covers a zero * page run, which is invalid. Overwrite it with the entry we're * trying to insert and get out. */ if (root->u.leaf_key[0].npages == 0) { root->u.leaf_key[0].first_page = first_page; root->u.leaf_key[0].npages = npages; FreePagePushSpanLeader(fpm, first_page, npages); return npages; } /* Fall through to insert the new key. */ } } /* Search the btree. */ FreePageBtreeSearch(fpm, first_page, &result); Assert(!result.found); if (result.index > 0) prevkey = &result.page->u.leaf_key[result.index - 1]; if (result.index < result.page->hdr.nused) { np = result.page; nindex = result.index; nextkey = &result.page->u.leaf_key[result.index]; } else { np = FreePageBtreeFindRightSibling(base, result.page); nindex = 0; if (np != NULL) nextkey = &np->u.leaf_key[0]; } /* Consolidate with the previous entry if possible. */ if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page) { bool remove_next = false; Size result; Assert(prevkey->first_page + prevkey->npages == first_page); prevkey->npages = (first_page - prevkey->first_page) + npages; /* Check whether we can *also* consolidate with the following entry. */ if (nextkey != NULL && prevkey->first_page + prevkey->npages >= nextkey->first_page) { Assert(prevkey->first_page + prevkey->npages == nextkey->first_page); prevkey->npages = (nextkey->first_page - prevkey->first_page) + nextkey->npages; FreePagePopSpanLeader(fpm, nextkey->first_page); remove_next = true; } /* Put the span on the correct freelist and save size. 
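		 * (At this point prevkey has already absorbed the new range, and
		 * possibly the following free span as well.  For example, if prevkey
		 * covered pages 100-103 and we are freeing pages 104-105,
		 * prevkey->npages is now (104 - 100) + 2 = 6, i.e. pages 100-105.)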
         */
        FreePagePopSpanLeader(fpm, prevkey->first_page);
        FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages);
        result = prevkey->npages;

        /*
         * If we consolidated with both the preceding and following entries,
         * we must remove the following entry.  We do this last, because
         * removing an element from the btree may invalidate pointers we hold
         * into the current data structure.
         *
         * NB: The btree is technically in an invalid state at this point
         * because we've already updated prevkey to cover the same key space
         * as nextkey.  FreePageBtreeRemove() shouldn't notice that, though.
         */
        if (remove_next)
            FreePageBtreeRemove(fpm, np, nindex);

        return result;
    }

    /* Consolidate with the next entry if possible. */
    if (nextkey != NULL && first_page + npages >= nextkey->first_page)
    {
        Size        newpages;

        /* Compute new size for span. */
        Assert(first_page + npages == nextkey->first_page);
        newpages = (nextkey->first_page - first_page) + nextkey->npages;

        /* Put span on correct free list. */
        FreePagePopSpanLeader(fpm, nextkey->first_page);
        FreePagePushSpanLeader(fpm, first_page, newpages);

        /* Update key in place. */
        nextkey->first_page = first_page;
        nextkey->npages = newpages;

        /* If reducing first key on page, ancestors might need adjustment. */
        if (nindex == 0)
            FreePageBtreeAdjustAncestorKeys(fpm, np);

        return nextkey->npages;
    }

    /* Split leaf page and as many of its ancestors as necessary. */
    if (result.split_pages > 0)
    {
        /*
         * NB: We could consider various coping strategies here to avoid a
         * split; most obviously, if np != result.page, we could target that
         * page instead.  More complicated shuffling strategies could be
         * possible as well; basically, unless every single leaf page is 100%
         * full, we can jam this key in there if we try hard enough.  It's
         * unlikely that trying that hard is worthwhile, but it's possible we
         * might need to make more than no effort.  For now, we just do the
         * easy thing, which is nothing.
         */

        /* If this is a soft insert, it's time to give up. */
        if (soft)
            return 0;

        /* Check whether we need to allocate more btree pages to split. */
        if (result.split_pages > fpm->btree_recycle_count)
        {
            Size        pages_needed;
            Size        recycle_page;
            Size        i;

            /*
             * Allocate the required number of pages and split each one in
             * turn.  This should never fail, because if we've got enough
             * spans of free pages kicking around that we need additional
             * storage space just to remember them all, then we should
             * certainly have enough to expand the btree, which should only
             * ever use a tiny number of pages compared to the number under
             * management.  If it does, something's badly screwed up.
             */
            pages_needed = result.split_pages - fpm->btree_recycle_count;
            for (i = 0; i < pages_needed; ++i)
            {
                if (!FreePageManagerGetInternal(fpm, 1, &recycle_page))
                    elog(FATAL, "free page manager btree is corrupt");
                FreePageBtreeRecycle(fpm, recycle_page);
            }

            /*
             * The act of allocating pages to recycle may have invalidated
             * the results of our previous btree search, so repeat it.  (We
             * could recheck whether any of our split-avoidance strategies
             * that were not viable before now are, but it hardly seems
             * worthwhile, so we don't bother.  Consolidation can't be
             * possible now if it wasn't previously.)
             */
            FreePageBtreeSearch(fpm, first_page, &result);

            /*
             * The act of allocating pages for use in constructing our btree
             * should never cause any page to become more full, so the new
             * split depth should be no greater than the old one, and perhaps
             * less if we fortuitously allocated a chunk that freed up a slot
             * on the page we need to update.
*/ Assert(result.split_pages <= fpm->btree_recycle_count); } /* If we still need to perform a split, do it. */ if (result.split_pages > 0) { FreePageBtree *split_target = result.page; FreePageBtree *child = NULL; Size key = first_page; for (;;) { FreePageBtree *newsibling; FreePageBtree *parent; /* Identify parent page, which must receive downlink. */ parent = relptr_access(base, split_target->hdr.parent); /* Split the page - downlink not added yet. */ newsibling = FreePageBtreeSplitPage(fpm, split_target); /* * At this point in the loop, we're always carrying a pending * insertion. On the first pass, it's the actual key we're * trying to insert; on subsequent passes, it's the downlink * that needs to be added as a result of the split performed * during the previous loop iteration. Since we've just split * the page, there's definitely room on one of the two * resulting pages. */ if (child == NULL) { Size index; FreePageBtree *insert_into; insert_into = key < newsibling->u.leaf_key[0].first_page ? split_target : newsibling; index = FreePageBtreeSearchLeaf(insert_into, key); FreePageBtreeInsertLeaf(insert_into, index, key, npages); if (index == 0 && insert_into == split_target) FreePageBtreeAdjustAncestorKeys(fpm, split_target); } else { Size index; FreePageBtree *insert_into; insert_into = key < newsibling->u.internal_key[0].first_page ? split_target : newsibling; index = FreePageBtreeSearchInternal(insert_into, key); FreePageBtreeInsertInternal(base, insert_into, index, key, child); relptr_store(base, child->hdr.parent, insert_into); if (index == 0 && insert_into == split_target) FreePageBtreeAdjustAncestorKeys(fpm, split_target); } /* If the page we just split has no parent, split the root. */ if (parent == NULL) { FreePageBtree *newroot; newroot = FreePageBtreeGetRecycled(fpm); newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC; newroot->hdr.nused = 2; relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL); newroot->u.internal_key[0].first_page = FreePageBtreeFirstKey(split_target); relptr_store(base, newroot->u.internal_key[0].child, split_target); relptr_store(base, split_target->hdr.parent, newroot); newroot->u.internal_key[1].first_page = FreePageBtreeFirstKey(newsibling); relptr_store(base, newroot->u.internal_key[1].child, newsibling); relptr_store(base, newsibling->hdr.parent, newroot); relptr_store(base, fpm->btree_root, newroot); fpm->btree_depth++; break; } /* If the parent page isn't full, insert the downlink. */ key = newsibling->u.internal_key[0].first_page; if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE) { Size index; index = FreePageBtreeSearchInternal(parent, key); FreePageBtreeInsertInternal(base, parent, index, key, newsibling); relptr_store(base, newsibling->hdr.parent, parent); if (index == 0) FreePageBtreeAdjustAncestorKeys(fpm, parent); break; } /* The parent also needs to be split, so loop around. */ child = newsibling; split_target = parent; } /* * The loop above did the insert, so just need to update the free * list, and we're done. */ FreePagePushSpanLeader(fpm, first_page, npages); return npages; } } /* Physically add the key to the page. */ Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE); FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages); /* If new first key on page, ancestors might need adjustment. */ if (result.index == 0) FreePageBtreeAdjustAncestorKeys(fpm, result.page); /* Put it on the free list. 
*/ FreePagePushSpanLeader(fpm, first_page, npages); return npages; } /* * Remove a FreePageSpanLeader from the linked-list that contains it, either * because we're changing the size of the span, or because we're allocating it. */ static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno) { char *base = fpm_segment_base(fpm); FreePageSpanLeader *span; FreePageSpanLeader *next; FreePageSpanLeader *prev; span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno); next = relptr_access(base, span->next); prev = relptr_access(base, span->prev); if (next != NULL) relptr_copy(next->prev, span->prev); if (prev != NULL) relptr_copy(prev->next, span->next); else { Size f = Min(span->npages, FPM_NUM_FREELISTS) - 1; Assert(fpm->freelist[f].relptr_off == pageno * FPM_PAGE_SIZE); relptr_copy(fpm->freelist[f], span->next); } } /* * Initialize a new FreePageSpanLeader and put it on the appropriate free list. */ static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages) { char *base = fpm_segment_base(fpm); Size f = Min(npages, FPM_NUM_FREELISTS) - 1; FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]); FreePageSpanLeader *span; span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page); span->magic = FREE_PAGE_SPAN_LEADER_MAGIC; span->npages = npages; relptr_store(base, span->next, head); relptr_store(base, span->prev, (FreePageSpanLeader *) NULL); if (head != NULL) relptr_store(base, head->prev, span); relptr_store(base, fpm->freelist[f], span); }