Use full 64-bit XIDs in deleted nbtree pages.

Otherwise we risk "leaking" deleted pages by making them non-recyclable
indefinitely.  Commit 6655a729 did the same thing for deleted pages in
GiST indexes.  That work was used as a starting point here.
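
For reference: a deleted page now stores its safexid in the page
contents area, via a new BTPageSetDeleted() helper.  The sketch below
is abridged from the commit's nbtree.h changes (which are not among
the hunks excerpted here), so treat it as illustrative:

/* Data stored in a deleted nbtree page, in the page contents area */
typedef struct BTDeletedPageData
{
	FullTransactionId safexid;	/* oldest XID that must be gone before
								 * this page can safely be recycled */
} BTDeletedPageData;

static inline void
BTPageSetDeleted(Page page, FullTransactionId safexid)
{
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	PageHeader	header = (PageHeader) page;
	BTDeletedPageData *contents;

	/* Mark the page deleted, and flag it as using the full-XID format */
	opaque->btpo_flags &= ~BTP_HALF_DEAD;
	opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;

	/* Drop any remaining tuples, leaving room only for the safexid */
	header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
		sizeof(BTDeletedPageData);
	header->pd_upper = header->pd_special;

	contents = (BTDeletedPageData *) PageGetContents(page);
	contents->safexid = safexid;
}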

Stop storing an XID indicating the oldest btpo.xact across all deleted
though unrecycled pages in nbtree metapages.  There is no longer any
reason to track that oldest XID.  It only ever made
sense when wraparound was something _bt_vacuum_needs_cleanup() had to
consider.

The btm_oldest_btpo_xact metapage field has been repurposed and renamed.
It is now btm_last_cleanup_num_delpages, which is used to remember how
many non-recycled deleted pages remain from the last VACUUM (in practice
its value is usually the precise number of pages that were _newly
deleted_ during the specific VACUUM operation that last set the field).
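
The metapage layout after this change looks roughly as follows
(abridged from the commit's nbtree.h changes, which are not among the
hunks excerpted here):

typedef struct BTMetaPageData
{
	uint32		btm_magic;		/* should contain BTREE_MAGIC */
	uint32		btm_version;	/* nbtree version number */
	BlockNumber btm_root;		/* current root location */
	uint32		btm_level;		/* tree level of the root page */
	BlockNumber btm_fastroot;	/* current "fast" root location */
	uint32		btm_fastlevel;	/* tree level of the "fast" root page */

	/* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */

	/* number of deleted, non-recyclable pages during last cleanup */
	uint32		btm_last_cleanup_num_delpages;
	/* number of heap tuples during last cleanup */
	float8		btm_last_cleanup_num_heap_tuples;

	bool		btm_allequalimage;	/* are all columns "equalimage"? */
} BTMetaPageData;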

The general idea behind storing btm_last_cleanup_num_delpages is to use
it to give _some_ consideration to non-recycled deleted pages inside
_bt_vacuum_needs_cleanup() -- though never too much.  We only really
need to avoid leaving a truly excessive number of deleted pages in an
unrecycled state forever.  We only do this to cover certain narrow cases
where no other factor makes VACUUM do a full scan, and yet the index
continues to grow (and so actually misses out on recycling existing
deleted pages).
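
Concretely, the consideration amounts to a single threshold test,
condensed here from the _bt_vacuum_needs_cleanup() hunk in nbtree.c
further below:

	/*
	 * Trigger cleanup when deleted-but-unrecycled pages left behind by
	 * the last VACUUM exceed 5% of the index's current total size
	 */
	if (prev_num_delpages > RelationGetNumberOfBlocks(info->index) / 20)
		return true;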

These metapage changes result in a clear user-visible benefit: We no
longer trigger full index scans during VACUUM operations solely due to
the presence of only 1 or 2 known deleted (though unrecycled) blocks
from a very large index.  All that matters now is keeping the costs and
benefits in balance over time.

Fix an issue that has been around since commit 857f9c36, which added the
"skip full scan of index" mechanism (i.e. the _bt_vacuum_needs_cleanup()
logic).  The accuracy of btm_last_cleanup_num_heap_tuples accidentally
hinged upon _when_ the source value gets stored.  We now always store
btm_last_cleanup_num_heap_tuples in btvacuumcleanup().  This fixes the
issue because IndexVacuumInfo.num_heap_tuples (the source field) is
expected to accurately indicate the state of the table _after_ the
VACUUM completes inside btvacuumcleanup().
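
In code terms the update is a single call made at the end of
btvacuumcleanup(), condensed here from the nbtree.c hunk further
below:

	/*
	 * num_delpages is the number of deleted pages that remain
	 * unrecycled; info->num_heap_tuples only reflects the table's
	 * final state here, after the VACUUM's work is done
	 */
	Assert(stats->pages_deleted >= stats->pages_free);
	num_delpages = stats->pages_deleted - stats->pages_free;
	_bt_set_cleanup_info(info->index, num_delpages, info->num_heap_tuples);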

A backpatchable fix cannot easily be extracted from this commit.  A
targeted fix for the issue will follow in a later commit.

I (pgeoghegan) have chosen to remove any mention of deleted pages in the
documentation of the vacuum_cleanup_index_scale_factor GUC/param, since
the presence of deleted (though unrecycled) pages is no longer of much
concern to users.  The vacuum_cleanup_index_scale_factor description in
the docs now seems rather unclear in any case, and it should probably be
rewritten in the near future.  Perhaps some passing mention of page
deletion will be added back at the same time.

Bump XLOG_PAGE_MAGIC due to nbtree WAL records using full XIDs now.
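
For example, the unlink-page record now carries the deleted page's
full safexid directly.  A rough sketch of its layout, based on the
commit's nbtxlog.h changes (not among the excerpted hunks); the field
names match their use in the nbtpage.c and nbtxlog.c hunks below:

typedef struct xl_btree_unlink_page
{
	BlockNumber leftsib;		/* target block's left sibling */
	BlockNumber rightsib;		/* target block's right sibling */
	uint32		level;			/* target block's level */
	FullTransactionId safexid;	/* target block's BTPageSetDeleted() XID */

	/* information needed to recreate the (possibly half-dead) leaf page */
	BlockNumber leafleftsib;
	BlockNumber leafrightsib;
	BlockNumber leaftopparent;	/* next child down in the subtree */
} xl_btree_unlink_page;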

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com>
Discussion: https://postgr.es/m/CAH2-WznpdHvujGUwYZ8sihX=d5u-tRYhi-F4wnV2uN2zHpMUXw@mail.gmail.com
Peter Geoghegan 2021-02-24 18:41:34 -08:00
parent 8a4f9522d0
commit e5d8a99903
20 changed files with 623 additions and 407 deletions

contrib/amcheck/verify_nbtree.c

@ -769,7 +769,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
P_FIRSTDATAKEY(opaque));
itup = (IndexTuple) PageGetItem(state->target, itemid);
nextleveldown.leftmost = BTreeTupleGetDownLink(itup);
nextleveldown.level = opaque->btpo.level - 1;
nextleveldown.level = opaque->btpo_level - 1;
}
else
{
@ -794,14 +794,14 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
if (opaque->btpo_prev != leftcurrent)
bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent);
/* Check level, which must be valid for non-ignorable page */
if (level.level != opaque->btpo.level)
/* Check level */
if (level.level != opaque->btpo_level)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down",
RelationGetRelationName(state->rel)),
errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
current, level.level, opaque->btpo.level)));
current, level.level, opaque->btpo_level)));
/* Verify invariants for page */
bt_target_page_check(state);
@ -1164,7 +1164,7 @@ bt_target_page_check(BtreeCheckState *state)
bt_child_highkey_check(state,
offset,
NULL,
topaque->btpo.level);
topaque->btpo_level);
}
continue;
}
@ -1520,7 +1520,7 @@ bt_target_page_check(BtreeCheckState *state)
if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
{
bt_child_highkey_check(state, InvalidOffsetNumber,
NULL, topaque->btpo.level);
NULL, topaque->btpo_level);
}
}
@ -1597,7 +1597,7 @@ bt_right_page_check_scankey(BtreeCheckState *state)
ereport(DEBUG1,
(errcode(ERRCODE_NO_DATA),
errmsg_internal("level %u leftmost page of index \"%s\" was found deleted or half dead",
opaque->btpo.level, RelationGetRelationName(state->rel)),
opaque->btpo_level, RelationGetRelationName(state->rel)),
errdetail_internal("Deleted page found when building scankey from right sibling.")));
/* Be slightly more pro-active in freeing this memory, just in case */
@ -1900,14 +1900,15 @@ bt_child_highkey_check(BtreeCheckState *state,
state->targetblock, blkno,
LSN_FORMAT_ARGS(state->targetlsn))));
/* Check level for non-ignorable page */
if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1)
/* Do level sanity check */
if ((!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) &&
opaque->btpo_level != target_level - 1)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("block found while following rightlinks from child of index \"%s\" has invalid level",
RelationGetRelationName(state->rel)),
errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
blkno, target_level - 1, opaque->btpo.level)));
blkno, target_level - 1, opaque->btpo_level)));
/* Try to detect circular links */
if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev)
@ -2132,7 +2133,7 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
* check for downlink connectivity.
*/
bt_child_highkey_check(state, downlinkoffnum,
child, topaque->btpo.level);
child, topaque->btpo_level);
/*
* Since there cannot be a concurrent VACUUM operation in readonly mode,
@ -2275,7 +2276,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
errmsg_internal("harmless interrupted page split detected in index %s",
RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
blkno, opaque->btpo.level,
blkno, opaque->btpo_level,
opaque->btpo_prev,
LSN_FORMAT_ARGS(pagelsn))));
return;
@ -2304,7 +2305,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
RelationGetRelationName(state->rel));
level = opaque->btpo.level;
level = opaque->btpo_level;
itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque));
itup = (IndexTuple) PageGetItem(page, itemid);
childblk = BTreeTupleGetDownLink(itup);
@ -2319,16 +2320,16 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
break;
/* Do an extra sanity check in passing on internal pages */
if (copaque->btpo.level != level - 1)
if (copaque->btpo_level != level - 1)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
RelationGetRelationName(state->rel)),
errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
blkno, childblk,
level - 1, copaque->btpo.level)));
level - 1, copaque->btpo_level)));
level = copaque->btpo.level;
level = copaque->btpo_level;
itemid = PageGetItemIdCareful(state, childblk, child,
P_FIRSTDATAKEY(copaque));
itup = (IndexTuple) PageGetItem(child, itemid);
@ -2389,7 +2390,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
errmsg("internal index block lacks downlink in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
blkno, opaque->btpo.level,
blkno, opaque->btpo_level,
LSN_FORMAT_ARGS(pagelsn))));
}
@ -2983,21 +2984,28 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
}
/*
* Deleted pages have no sane "level" field, so can only check non-deleted
* page level
* Deleted pages that still use the old 32-bit XID representation have no
* sane "level" field because they type pun the field, but all other pages
* (including pages deleted on Postgres 14+) have a valid value.
*/
if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("invalid leaf page level %u for block %u in index \"%s\"",
opaque->btpo.level, blocknum, RelationGetRelationName(state->rel))));
if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque))
{
/* Okay, no reason not to trust btpo_level field from page */
if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) &&
opaque->btpo.level == 0)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("invalid internal page level 0 for block %u in index \"%s\"",
blocknum, RelationGetRelationName(state->rel))));
if (P_ISLEAF(opaque) && opaque->btpo_level != 0)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("invalid leaf page level %u for block %u in index \"%s\"",
opaque->btpo_level, blocknum,
RelationGetRelationName(state->rel))));
if (!P_ISLEAF(opaque) && opaque->btpo_level == 0)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("invalid internal page level 0 for block %u in index \"%s\"",
blocknum,
RelationGetRelationName(state->rel))));
}
/*
* Sanity checks for number of items on page.
@ -3044,8 +3052,6 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
* state. This state is nonetheless treated as corruption by VACUUM on
* from version 9.4 on, so do the same here. See _bt_pagedel() for full
* details.
*
* Internal pages should never have garbage items, either.
*/
if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque))
ereport(ERROR,
@ -3054,11 +3060,27 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
blocknum, RelationGetRelationName(state->rel)),
errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
/*
* Check that internal pages have no garbage items, and that no page has
* an invalid combination of deletion-related page level flags
*/
if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("internal page block %u in index \"%s\" has garbage items",
blocknum, RelationGetRelationName(state->rel))));
errmsg_internal("internal page block %u in index \"%s\" has garbage items",
blocknum, RelationGetRelationName(state->rel))));
if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("full transaction id page flag appears in non-deleted block %u in index \"%s\"",
blocknum, RelationGetRelationName(state->rel))));
if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("deleted page block %u in index \"%s\" is half-dead",
blocknum, RelationGetRelationName(state->rel))));
return page;
}

contrib/pageinspect/btreefuncs.c

@ -75,11 +75,7 @@ typedef struct BTPageStat
/* opaque data */
BlockNumber btpo_prev;
BlockNumber btpo_next;
union
{
uint32 level;
TransactionId xact;
} btpo;
uint32 btpo_level;
uint16 btpo_flags;
BTCycleId btpo_cycleid;
} BTPageStat;
@ -112,9 +108,33 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
/* page type (flags) */
if (P_ISDELETED(opaque))
{
stat->type = 'd';
stat->btpo.xact = opaque->btpo.xact;
return;
/* We divide deleted pages into leaf ('d') or internal ('D') */
if (P_ISLEAF(opaque) || !P_HAS_FULLXID(opaque))
stat->type = 'd';
else
stat->type = 'D';
/*
* Report safexid in a deleted page.
*
* Handle pg_upgrade'd deleted pages that used the previous safexid
* representation in btpo_level field (this used to be a union type
* called "btpo").
*/
if (P_HAS_FULLXID(opaque))
{
FullTransactionId safexid = BTPageGetDeleteXid(page);
elog(NOTICE, "deleted page from block %u has safexid %u:%u",
blkno, EpochFromFullTransactionId(safexid),
XidFromFullTransactionId(safexid));
}
else
elog(NOTICE, "deleted page from block %u has safexid %u",
blkno, opaque->btpo_level);
/* Don't interpret BTDeletedPageData as index tuples */
maxoff = InvalidOffsetNumber;
}
else if (P_IGNORE(opaque))
stat->type = 'e';
@ -128,7 +148,7 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
/* btpage opaque data */
stat->btpo_prev = opaque->btpo_prev;
stat->btpo_next = opaque->btpo_next;
stat->btpo.level = opaque->btpo.level;
stat->btpo_level = opaque->btpo_level;
stat->btpo_flags = opaque->btpo_flags;
stat->btpo_cycleid = opaque->btpo_cycleid;
@ -237,7 +257,7 @@ bt_page_stats_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
values[j++] = psprintf("%u", stat.free_size);
values[j++] = psprintf("%u", stat.btpo_prev);
values[j++] = psprintf("%u", stat.btpo_next);
values[j++] = psprintf("%u", (stat.type == 'd') ? stat.btpo.xact : stat.btpo.level);
values[j++] = psprintf("%u", stat.btpo_level);
values[j++] = psprintf("%d", stat.btpo_flags);
tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
@ -503,10 +523,14 @@ bt_page_items_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
if (P_ISDELETED(opaque))
elog(NOTICE, "page is deleted");
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
if (!P_ISDELETED(opaque))
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
else
{
/* Don't interpret BTDeletedPageData as index tuples */
elog(NOTICE, "page from block " INT64_FORMAT " is deleted", blkno);
fctx->max_calls = 0;
}
uargs->leafpage = P_ISLEAF(opaque);
uargs->rightmost = P_RIGHTMOST(opaque);
@ -603,7 +627,14 @@ bt_page_items_bytea(PG_FUNCTION_ARGS)
if (P_ISDELETED(opaque))
elog(NOTICE, "page is deleted");
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
if (!P_ISDELETED(opaque))
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
else
{
/* Don't interpret BTDeletedPageData as index tuples */
elog(NOTICE, "page from block is deleted");
fctx->max_calls = 0;
}
uargs->leafpage = P_ISLEAF(opaque);
uargs->rightmost = P_RIGHTMOST(opaque);
@ -692,10 +723,7 @@ bt_metap(PG_FUNCTION_ARGS)
/*
* We need a kluge here to detect API versions prior to 1.8. Earlier
* versions incorrectly used int4 for certain columns. This caused
* various problems. For example, an int4 version of the "oldest_xact"
* column would not work with TransactionId values that happened to exceed
* PG_INT32_MAX.
* versions incorrectly used int4 for certain columns.
*
* There is no way to reliably avoid the problems created by the old
* function definition at this point, so insist that the user update the
@ -723,7 +751,8 @@ bt_metap(PG_FUNCTION_ARGS)
*/
if (metad->btm_version >= BTREE_NOVAC_VERSION)
{
values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
values[j++] = psprintf(INT64_FORMAT,
(int64) metad->btm_last_cleanup_num_delpages);
values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
values[j++] = metad->btm_allequalimage ? "t" : "f";
}

contrib/pageinspect/expected/btree.out

@ -3,16 +3,16 @@ INSERT INTO test1 VALUES (72057594037927937, 'text');
CREATE INDEX test1_a_idx ON test1 USING btree (a);
\x
SELECT * FROM bt_metap('test1_a_idx');
-[ RECORD 1 ]-----------+-------
magic | 340322
version | 4
root | 1
level | 0
fastroot | 1
fastlevel | 0
oldest_xact | 0
last_cleanup_num_tuples | -1
allequalimage | t
-[ RECORD 1 ]-------------+-------
magic | 340322
version | 4
root | 1
level | 0
fastroot | 1
fastlevel | 0
last_cleanup_num_delpages | 0
last_cleanup_num_tuples | -1
allequalimage | t
SELECT * FROM bt_page_stats('test1_a_idx', -1);
ERROR: invalid block number
@ -29,7 +29,7 @@ page_size | 8192
free_size | 8128
btpo_prev | 0
btpo_next | 0
btpo | 0
btpo_level | 0
btpo_flags | 3
SELECT * FROM bt_page_stats('test1_a_idx', 2);

contrib/pageinspect/pageinspect--1.8--1.9.sql

@ -66,6 +66,23 @@ RETURNS smallint
AS 'MODULE_PATHNAME', 'page_checksum_1_9'
LANGUAGE C STRICT PARALLEL SAFE;
--
-- bt_metap()
--
DROP FUNCTION bt_metap(text);
CREATE FUNCTION bt_metap(IN relname text,
OUT magic int4,
OUT version int4,
OUT root int8,
OUT level int8,
OUT fastroot int8,
OUT fastlevel int8,
OUT last_cleanup_num_delpages int8,
OUT last_cleanup_num_tuples float8,
OUT allequalimage boolean)
AS 'MODULE_PATHNAME', 'bt_metap'
LANGUAGE C STRICT PARALLEL SAFE;
--
-- bt_page_stats()
--
@ -80,7 +97,7 @@ CREATE FUNCTION bt_page_stats(IN relname text, IN blkno int8,
OUT free_size int4,
OUT btpo_prev int8,
OUT btpo_next int8,
OUT btpo int4,
OUT btpo_level int8,
OUT btpo_flags int4)
AS 'MODULE_PATHNAME', 'bt_page_stats_1_9'
LANGUAGE C STRICT PARALLEL SAFE;

contrib/pgstattuple/pgstatindex.c

@ -283,8 +283,12 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
page = BufferGetPage(buffer);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/* Determine page type, and update totals */
/*
* Determine page type, and update totals.
*
* Note that we arbitrarily bucket deleted pages together without
* considering if they're leaf pages or internal pages.
*/
if (P_ISDELETED(opaque))
indexStat.deleted_pages++;
else if (P_IGNORE(opaque))

doc/src/sgml/config.sgml

@ -8529,11 +8529,10 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
<para>
If no tuples were deleted from the heap, B-tree indexes are still
scanned at the <command>VACUUM</command> cleanup stage when at least one
of the following conditions is met: the index statistics are stale, or
the index contains deleted pages that can be recycled during cleanup.
Index statistics are considered to be stale if the number of newly
inserted tuples exceeds the <varname>vacuum_cleanup_index_scale_factor</varname>
scanned at the <command>VACUUM</command> cleanup stage when the
index's statistics are stale. Index statistics are considered
stale if the number of newly inserted tuples exceeds the
<varname>vacuum_cleanup_index_scale_factor</varname>
fraction of the total number of heap tuples detected by the previous
statistics collection. The total number of heap tuples is stored in
the index meta-page. Note that the meta-page does not include this data

doc/src/sgml/pageinspect.sgml

@ -298,16 +298,16 @@ test=# SELECT t_ctid, raw_flags, combined_flags
index's metapage. For example:
<screen>
test=# SELECT * FROM bt_metap('pg_cast_oid_index');
-[ RECORD 1 ]-----------+-------
magic | 340322
version | 4
root | 1
level | 0
fastroot | 1
fastlevel | 0
oldest_xact | 582
last_cleanup_num_tuples | 1000
allequalimage | f
-[ RECORD 1 ]-------------+-------
magic | 340322
version | 4
root | 1
level | 0
fastroot | 1
fastlevel | 0
last_cleanup_num_delpages | 0
last_cleanup_num_tuples | 230
allequalimage | f
</screen>
</para>
</listitem>
@ -337,7 +337,7 @@ page_size | 8192
free_size | 3668
btpo_prev | 0
btpo_next | 0
btpo | 0
btpo_level | 0
btpo_flags | 3
</screen>
</para>

src/backend/access/gist/gistxlog.c

@ -394,28 +394,8 @@ gistRedoPageReuse(XLogReaderState *record)
* same exclusion effect on primary and standby.
*/
if (InHotStandby)
{
FullTransactionId latestRemovedFullXid = xlrec->latestRemovedFullXid;
FullTransactionId nextXid = ReadNextFullTransactionId();
uint64 diff;
/*
* ResolveRecoveryConflictWithSnapshot operates on 32-bit
* TransactionIds, so truncate the logged FullTransactionId. If the
* logged value is very old, so that XID wrap-around already happened
* on it, there can't be any snapshots that still see it.
*/
diff = U64FromFullTransactionId(nextXid) -
U64FromFullTransactionId(latestRemovedFullXid);
if (diff < MaxTransactionId / 2)
{
TransactionId latestRemovedXid;
latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid);
ResolveRecoveryConflictWithSnapshot(latestRemovedXid,
xlrec->node);
}
}
ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid,
xlrec->node);
}
void

src/backend/access/nbtree/nbtinsert.c

@ -1241,7 +1241,7 @@ _bt_insertonpg(Relation rel,
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
if (metad->btm_fastlevel >= opaque->btpo.level)
if (metad->btm_fastlevel >= opaque->btpo_level)
{
/* no update wanted */
_bt_relbuf(rel, metabuf);
@ -1268,7 +1268,7 @@ _bt_insertonpg(Relation rel,
if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_fastroot = BufferGetBlockNumber(buf);
metad->btm_fastlevel = opaque->btpo.level;
metad->btm_fastlevel = opaque->btpo_level;
MarkBufferDirty(metabuf);
}
@ -1331,7 +1331,7 @@ _bt_insertonpg(Relation rel,
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;
xlmeta.fastlevel = metad->btm_fastlevel;
xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
xlmeta.last_cleanup_num_heap_tuples =
metad->btm_last_cleanup_num_heap_tuples;
xlmeta.allequalimage = metad->btm_allequalimage;
@ -1537,7 +1537,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
lopaque->btpo_prev = oopaque->btpo_prev;
/* handle btpo_next after rightpage buffer acquired */
lopaque->btpo.level = oopaque->btpo.level;
lopaque->btpo_level = oopaque->btpo_level;
/* handle btpo_cycleid after rightpage buffer acquired */
/*
@ -1722,7 +1722,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
ropaque->btpo_prev = origpagenumber;
ropaque->btpo_next = oopaque->btpo_next;
ropaque->btpo.level = oopaque->btpo.level;
ropaque->btpo_level = oopaque->btpo_level;
ropaque->btpo_cycleid = lopaque->btpo_cycleid;
/*
@ -1950,7 +1950,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
uint8 xlinfo;
XLogRecPtr recptr;
xlrec.level = ropaque->btpo.level;
xlrec.level = ropaque->btpo_level;
/* See comments below on newitem, orignewitem, and posting lists */
xlrec.firstrightoff = firstrightoff;
xlrec.newitemoff = newitemoff;
@ -2142,7 +2142,7 @@ _bt_insert_parent(Relation rel,
BlockNumberIsValid(RelationGetTargetBlock(rel))));
/* Find the leftmost page at the next level up */
pbuf = _bt_get_endpoint(rel, opaque->btpo.level + 1, false, NULL);
pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
/* Set up a phony stack entry pointing there */
stack = &fakestack;
stack->bts_blkno = BufferGetBlockNumber(pbuf);
@ -2480,15 +2480,15 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
rootopaque->btpo_flags = BTP_ROOT;
rootopaque->btpo.level =
((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
rootopaque->btpo_level =
((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_level + 1;
rootopaque->btpo_cycleid = 0;
/* update metapage data */
metad->btm_root = rootblknum;
metad->btm_level = rootopaque->btpo.level;
metad->btm_level = rootopaque->btpo_level;
metad->btm_fastroot = rootblknum;
metad->btm_fastlevel = rootopaque->btpo.level;
metad->btm_fastlevel = rootopaque->btpo_level;
/*
* Insert the left page pointer into the new root page. The root page is
@ -2548,7 +2548,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
md.level = metad->btm_level;
md.fastroot = rootblknum;
md.fastlevel = metad->btm_level;
md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
md.allequalimage = metad->btm_allequalimage;

src/backend/access/nbtree/nbtpage.c

@ -37,7 +37,7 @@
static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
TransactionId latestRemovedXid);
FullTransactionId safexid);
static void _bt_delitems_delete(Relation rel, Buffer buf,
TransactionId latestRemovedXid,
OffsetNumber *deletable, int ndeletable,
@ -50,7 +50,6 @@ static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
BlockNumber scanblkno,
bool *rightsib_empty,
TransactionId *oldestBtpoXact,
uint32 *ndeleted);
static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
BTStack stack,
@ -78,7 +77,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
metad->btm_level = level;
metad->btm_fastroot = rootbknum;
metad->btm_fastlevel = level;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_delpages = 0;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
metad->btm_allequalimage = allequalimage;
@ -118,7 +117,7 @@ _bt_upgrademetapage(Page page)
/* Set version number and fill extra fields added into version 3 */
metad->btm_version = BTREE_NOVAC_VERSION;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_delpages = 0;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
/* Only a REINDEX can set this field */
Assert(!metad->btm_allequalimage);
@ -169,35 +168,61 @@ _bt_getmeta(Relation rel, Buffer metabuf)
}
/*
* _bt_update_meta_cleanup_info() -- Update cleanup-related information in
* the metapage.
* _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup().
*
* This routine checks if provided cleanup-related information is matching
* to those written in the metapage. On mismatch, metapage is overwritten.
* This routine is called at the end of each VACUUM's btvacuumcleanup()
* call. Its purpose is to maintain the metapage fields that are used by
* _bt_vacuum_needs_cleanup() to decide whether or not a btvacuumscan()
* call should go ahead for an entire VACUUM operation.
*
* See btvacuumcleanup() and _bt_vacuum_needs_cleanup() for details of
* the two fields that we maintain here.
*
* The information that we maintain for btvacuumcleanup() describes the
* state of the index (as well as the table it indexes) just _after_ the
* ongoing VACUUM operation. The next _bt_vacuum_needs_cleanup() call
* will consider the information we saved for it during the next VACUUM
* operation (assuming that there will be no btbulkdelete() call during
* the next VACUUM operation -- if there is then the question of skipping
* btvacuumscan() doesn't even arise).
*/
void
_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
float8 numHeapTuples)
_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages,
float8 num_heap_tuples)
{
Buffer metabuf;
Page metapg;
BTMetaPageData *metad;
bool needsRewrite = false;
bool rewrite = false;
XLogRecPtr recptr;
/* read the metapage and check if it needs rewrite */
/*
* On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
* field started out as a TransactionId field called btm_oldest_btpo_xact.
* Both "versions" are just uint32 fields. It was convenient to repurpose
* the field when we began to use 64-bit XIDs in deleted pages.
*
* It's possible that a pg_upgrade'd database will contain an XID value in
* what is now recognized as the metapage's btm_last_cleanup_num_delpages
* field. _bt_vacuum_needs_cleanup() may even believe that this value
* indicates that there are lots of pages that it needs to recycle, when
* in reality there are only one or two. The worst that can happen is
* that there will be a call to btvacuumscan a little earlier, which will
* set btm_last_cleanup_num_delpages to a sane value when we're called.
*/
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
/* outdated version of metapage always needs rewrite */
/* Always dynamically upgrade index/metapage when BTREE_MIN_VERSION */
if (metad->btm_version < BTREE_NOVAC_VERSION)
needsRewrite = true;
else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
needsRewrite = true;
rewrite = true;
else if (metad->btm_last_cleanup_num_delpages != num_delpages)
rewrite = true;
else if (metad->btm_last_cleanup_num_heap_tuples != num_heap_tuples)
rewrite = true;
if (!needsRewrite)
if (!rewrite)
{
_bt_relbuf(rel, metabuf);
return;
@ -214,8 +239,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
_bt_upgrademetapage(metapg);
/* update cleanup-related information */
metad->btm_oldest_btpo_xact = oldestBtpoXact;
metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
metad->btm_last_cleanup_num_delpages = num_delpages;
metad->btm_last_cleanup_num_heap_tuples = num_heap_tuples;
MarkBufferDirty(metabuf);
/* write wal record if needed */
@ -232,8 +257,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
md.level = metad->btm_level;
md.fastroot = metad->btm_fastroot;
md.fastlevel = metad->btm_fastlevel;
md.oldest_btpo_xact = oldestBtpoXact;
md.last_cleanup_num_heap_tuples = numHeapTuples;
md.last_cleanup_num_delpages = num_delpages;
md.last_cleanup_num_heap_tuples = num_heap_tuples;
md.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
@ -244,6 +269,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
}
END_CRIT_SECTION();
_bt_relbuf(rel, metabuf);
}
@ -316,7 +342,7 @@ _bt_getroot(Relation rel, int access)
* because that's not set in a "fast root".
*/
if (!P_IGNORE(rootopaque) &&
rootopaque->btpo.level == rootlevel &&
rootopaque->btpo_level == rootlevel &&
P_LEFTMOST(rootopaque) &&
P_RIGHTMOST(rootopaque))
{
@ -377,7 +403,7 @@ _bt_getroot(Relation rel, int access)
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
rootopaque->btpo.level = 0;
rootopaque->btpo_level = 0;
rootopaque->btpo_cycleid = 0;
/* Get raw page pointer for metapage */
metapg = BufferGetPage(metabuf);
@ -393,7 +419,7 @@ _bt_getroot(Relation rel, int access)
metad->btm_level = 0;
metad->btm_fastroot = rootblkno;
metad->btm_fastlevel = 0;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_delpages = 0;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
MarkBufferDirty(rootbuf);
@ -416,7 +442,7 @@ _bt_getroot(Relation rel, int access)
md.level = 0;
md.fastroot = rootblkno;
md.fastlevel = 0;
md.oldest_btpo_xact = InvalidTransactionId;
md.last_cleanup_num_delpages = 0;
md.last_cleanup_num_heap_tuples = -1.0;
md.allequalimage = metad->btm_allequalimage;
@ -481,11 +507,10 @@ _bt_getroot(Relation rel, int access)
rootblkno = rootopaque->btpo_next;
}
/* Note: can't check btpo.level on deleted pages */
if (rootopaque->btpo.level != rootlevel)
if (rootopaque->btpo_level != rootlevel)
elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
rootblkno, RelationGetRelationName(rel),
rootopaque->btpo.level, rootlevel);
rootopaque->btpo_level, rootlevel);
}
/*
@ -585,11 +610,10 @@ _bt_gettrueroot(Relation rel)
rootblkno = rootopaque->btpo_next;
}
/* Note: can't check btpo.level on deleted pages */
if (rootopaque->btpo.level != rootlevel)
if (rootopaque->btpo_level != rootlevel)
elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
rootblkno, RelationGetRelationName(rel),
rootopaque->btpo.level, rootlevel);
rootopaque->btpo_level, rootlevel);
return rootbuf;
}
@ -762,7 +786,7 @@ _bt_checkpage(Relation rel, Buffer buf)
* Log the reuse of a page from the FSM.
*/
static void
_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
_bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid)
{
xl_btree_reuse_page xlrec_reuse;
@ -775,7 +799,7 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX
/* XLOG stuff */
xlrec_reuse.node = rel->rd_node;
xlrec_reuse.block = blkno;
xlrec_reuse.latestRemovedXid = latestRemovedXid;
xlrec_reuse.latestRemovedFullXid = safexid;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
@ -856,26 +880,34 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
if (_bt_conditionallockbuf(rel, buf))
{
page = BufferGetPage(buf);
if (_bt_page_recyclable(page))
/*
* It's possible to find an all-zeroes page in an index. For
* example, a backend might successfully extend the relation
* one page and then crash before it is able to make a WAL
* entry for adding the page. If we find a zeroed page then
* reclaim it immediately.
*/
if (PageIsNew(page))
{
/* Okay to use page. Initialize and return it. */
_bt_pageinit(page, BufferGetPageSize(buf));
return buf;
}
if (BTPageIsRecyclable(page))
{
/*
* If we are generating WAL for Hot Standby then create a
* WAL record that will allow us to conflict with queries
* running on standby, in case they have snapshots older
* than btpo.xact. This can only apply if the page does
* have a valid btpo.xact value, ie not if it's new. (We
* must check that because an all-zero page has no special
* space.)
* than safexid value
*/
if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
!PageIsNew(page))
{
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
_bt_log_reuse_page(rel, blkno,
BTPageGetDeleteXid(page));
_bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
}
/* Okay to use page. Re-initialize and return it */
/* Okay to use page. Re-initialize and return it. */
_bt_pageinit(page, BufferGetPageSize(buf));
return buf;
}
@ -1073,40 +1105,6 @@ _bt_pageinit(Page page, Size size)
PageInit(page, size, sizeof(BTPageOpaqueData));
}
/*
* _bt_page_recyclable() -- Is an existing page recyclable?
*
* This exists to make sure _bt_getbuf and btvacuumscan have the same
* policy about whether a page is safe to re-use. But note that _bt_getbuf
* knows enough to distinguish the PageIsNew condition from the other one.
* At some point it might be appropriate to redesign this to have a three-way
* result value.
*/
bool
_bt_page_recyclable(Page page)
{
BTPageOpaque opaque;
/*
* It's possible to find an all-zeroes page in an index --- for example, a
* backend might successfully extend the relation one page and then crash
* before it is able to make a WAL entry for adding the page. If we find a
* zeroed page then reclaim it.
*/
if (PageIsNew(page))
return true;
/*
* Otherwise, recycle if deleted and too old to have any processes
* interested in it.
*/
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (P_ISDELETED(opaque) &&
GlobalVisCheckRemovableXid(NULL, opaque->btpo.xact))
return true;
return false;
}
/*
* Delete item(s) from a btree leaf page during VACUUM.
*
@ -1768,16 +1766,12 @@ _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
* that the btvacuumscan scan has yet to reach; they'll get counted later
* instead.
*
* Maintains *oldestBtpoXact for any pages that get deleted. Caller is
* responsible for maintaining *oldestBtpoXact in the case of pages that were
* deleted by a previous VACUUM.
*
* NOTE: this leaks memory. Rather than trying to clean up everything
* carefully, it's better to run it in a temp context that can be reset
* frequently.
*/
uint32
_bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
_bt_pagedel(Relation rel, Buffer leafbuf)
{
uint32 ndeleted = 0;
BlockNumber rightsib;
@ -1985,8 +1979,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
{
/* Check for interrupts in _bt_unlink_halfdead_page */
if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
&rightsib_empty, oldestBtpoXact,
&ndeleted))
&rightsib_empty, &ndeleted))
{
/*
* _bt_unlink_halfdead_page should never fail, since we
@ -2002,8 +1995,6 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
}
Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact,
*oldestBtpoXact));
rightsib = opaque->btpo_next;
@ -2264,12 +2255,6 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
* containing leafbuf. (We always set *rightsib_empty for caller, just to be
* consistent.)
*
* We maintain *oldestBtpoXact for pages that are deleted by the current
* VACUUM operation here. This must be handled here because we conservatively
* assume that there needs to be a new call to ReadNextTransactionId() each
* time a page gets deleted. See comments about the underlying assumption
* below.
*
* Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
* On success exit, we'll be holding pin and write lock. On failure exit,
* we'll release both pin and lock before returning (we define it that way
@ -2277,8 +2262,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
*/
static bool
_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
bool *rightsib_empty, TransactionId *oldestBtpoXact,
uint32 *ndeleted)
bool *rightsib_empty, uint32 *ndeleted)
{
BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
BlockNumber leafleftsib;
@ -2294,12 +2278,12 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
BTMetaPageData *metad = NULL;
ItemId itemid;
Page page;
PageHeader header;
BTPageOpaque opaque;
FullTransactionId safexid;
bool rightsib_is_rightmost;
int targetlevel;
uint32 targetlevel;
IndexTuple leafhikey;
BlockNumber nextchild;
BlockNumber leaftopparent;
page = BufferGetPage(leafbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@ -2343,7 +2327,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
leftsib = opaque->btpo_prev;
targetlevel = opaque->btpo.level;
targetlevel = opaque->btpo_level;
Assert(targetlevel > 0);
/*
@ -2450,20 +2434,26 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
!P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
target, RelationGetRelationName(rel));
nextchild = InvalidBlockNumber;
/* Leaf page is also target page: don't set leaftopparent */
leaftopparent = InvalidBlockNumber;
}
else
{
IndexTuple finaldataitem;
if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
P_ISLEAF(opaque))
elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
target, RelationGetRelationName(rel));
/* Remember the next non-leaf child down in the subtree */
/* Target is internal: set leaftopparent for next call here... */
itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
if (nextchild == leafblkno)
nextchild = InvalidBlockNumber;
finaldataitem = (IndexTuple) PageGetItem(page, itemid);
leaftopparent = BTreeTupleGetDownLink(finaldataitem);
/* ...except when it would be a redundant pointer-to-self */
if (leaftopparent == leafblkno)
leaftopparent = InvalidBlockNumber;
}
/*
@ -2553,13 +2543,13 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
* no lock was held.
*/
if (target != leafblkno)
BTreeTupleSetTopParent(leafhikey, nextchild);
BTreeTupleSetTopParent(leafhikey, leaftopparent);
/*
* Mark the page itself deleted. It can be recycled when all current
* transactions are gone. Storing GetTopTransactionId() would work, but
* we're in VACUUM and would not otherwise have an XID. Having already
* updated links to the target, ReadNextTransactionId() suffices as an
* updated links to the target, ReadNextFullTransactionId() suffices as an
* upper bound. Any scan having retained a now-stale link is advertising
* in its PGPROC an xmin less than or equal to the value we read here. It
* will continue to do so, holding back the xmin horizon, for the duration
@ -2568,17 +2558,14 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
opaque->btpo_flags &= ~BTP_HALF_DEAD;
opaque->btpo_flags |= BTP_DELETED;
opaque->btpo.xact = ReadNextTransactionId();
/*
* Remove the remaining tuples on the page. This keeps things simple for
* WAL consistency checking.
* Store upper bound XID that's used to determine when deleted page is no
* longer needed as a tombstone
*/
header = (PageHeader) page;
header->pd_lower = SizeOfPageHeaderData;
header->pd_upper = header->pd_special;
safexid = ReadNextFullTransactionId();
BTPageSetDeleted(page, safexid);
opaque->btpo_cycleid = 0;
/* And update the metapage, if needed */
if (BufferIsValid(metabuf))
@ -2616,15 +2603,16 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
if (target != leafblkno)
XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
/* information on the unlinked block */
/* information stored on the target/to-be-unlinked block */
xlrec.leftsib = leftsib;
xlrec.rightsib = rightsib;
xlrec.btpo_xact = opaque->btpo.xact;
xlrec.level = targetlevel;
xlrec.safexid = safexid;
/* information needed to recreate the leaf block (if not the target) */
xlrec.leafleftsib = leafleftsib;
xlrec.leafrightsib = leafrightsib;
xlrec.topparent = nextchild;
xlrec.leaftopparent = leaftopparent;
XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
@ -2638,7 +2626,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;
xlmeta.fastlevel = metad->btm_fastlevel;
xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
xlmeta.allequalimage = metad->btm_allequalimage;
@ -2681,9 +2669,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
_bt_relbuf(rel, lbuf);
_bt_relbuf(rel, rbuf);
if (!TransactionIdIsValid(*oldestBtpoXact) ||
TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact))
*oldestBtpoXact = opaque->btpo.xact;
/* If the target is not leafbuf, we're done with it now -- release it */
if (target != leafblkno)
_bt_relbuf(rel, buf);
/*
* If btvacuumscan won't revisit this page in a future btvacuumpage call
@ -2693,10 +2681,6 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
if (target <= scanblkno)
(*ndeleted)++;
/* If the target is not leafbuf, we're done with it now -- release it */
if (target != leafblkno)
_bt_relbuf(rel, buf);
return true;
}

src/backend/access/nbtree/nbtree.c

@ -46,8 +46,6 @@ typedef struct
IndexBulkDeleteCallback callback;
void *callback_state;
BTCycleId cycleid;
BlockNumber totFreePages; /* true total # of free pages */
TransactionId oldestBtpoXact;
MemoryContext pagedelcontext;
} BTVacState;
@ -790,7 +788,7 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan)
* _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
*
* Called by btvacuumcleanup when btbulkdelete was never called because no
* tuples need to be deleted.
* tuples needed to be deleted by VACUUM.
*
* When we return false, VACUUM can even skip the cleanup-only call to
* btvacuumscan (i.e. there will be no btvacuumscan call for this index at
@ -802,66 +800,75 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
Buffer metabuf;
Page metapg;
BTMetaPageData *metad;
bool result = false;
BTOptions *relopts;
float8 cleanup_scale_factor;
uint32 btm_version;
BlockNumber prev_num_delpages;
float8 prev_num_heap_tuples;
/*
* Copy details from metapage to local variables quickly.
*
* Note that we deliberately avoid using cached version of metapage here.
*/
metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
btm_version = metad->btm_version;
if (btm_version < BTREE_NOVAC_VERSION)
{
/*
* Metapage needs to be dynamically upgraded to store fields that are
* only present when btm_version >= BTREE_NOVAC_VERSION
*/
_bt_relbuf(info->index, metabuf);
return true;
}
prev_num_delpages = metad->btm_last_cleanup_num_delpages;
prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
_bt_relbuf(info->index, metabuf);
/*
* XXX: If IndexVacuumInfo contained the heap relation, we could be more
* aggressive about vacuuming non catalog relations by passing the table
* to GlobalVisCheckRemovableXid().
* If the underlying table has received a sufficiently high number of
* insertions since the last VACUUM operation that called btvacuumscan(),
* then have the current VACUUM operation call btvacuumscan() now. This
* happens when the statistics are deemed stale.
*
* XXX: We should have a more principled way of determining what
* "staleness" means. The vacuum_cleanup_index_scale_factor GUC (and the
* index-level storage param) seem hard to tune in a principled way.
*/
relopts = (BTOptions *) info->index->rd_options;
cleanup_scale_factor = (relopts &&
relopts->vacuum_cleanup_index_scale_factor >= 0)
? relopts->vacuum_cleanup_index_scale_factor
: vacuum_cleanup_index_scale_factor;
if (metad->btm_version < BTREE_NOVAC_VERSION)
{
/*
* Do cleanup if metapage needs upgrade, because we don't have
* cleanup-related meta-information yet.
*/
result = true;
}
else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
GlobalVisCheckRemovableXid(NULL, metad->btm_oldest_btpo_xact))
{
/*
* If any oldest btpo.xact from a previously deleted page in the index
* is visible to everyone, then at least one deleted page can be
* recycled -- don't skip cleanup.
*/
result = true;
}
else
{
BTOptions *relopts;
float8 cleanup_scale_factor;
float8 prev_num_heap_tuples;
if (cleanup_scale_factor <= 0 ||
info->num_heap_tuples < 0 ||
prev_num_heap_tuples <= 0 ||
(info->num_heap_tuples - prev_num_heap_tuples) /
prev_num_heap_tuples >= cleanup_scale_factor)
return true;
/*
* If table receives enough insertions and no cleanup was performed,
* then index would appear have stale statistics. If scale factor is
* set, we avoid that by performing cleanup if the number of inserted
* tuples exceeds vacuum_cleanup_index_scale_factor fraction of
* original tuples count.
*/
relopts = (BTOptions *) info->index->rd_options;
cleanup_scale_factor = (relopts &&
relopts->vacuum_cleanup_index_scale_factor >= 0)
? relopts->vacuum_cleanup_index_scale_factor
: vacuum_cleanup_index_scale_factor;
prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
/*
* Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
* total size of the index. We can reasonably expect (though are not
* guaranteed) to be able to recycle this many pages if we decide to do a
* btvacuumscan call during the ongoing btvacuumcleanup.
*
* Our approach won't reliably avoid "wasted" cleanup-only btvacuumscan
* calls. That is, we can end up scanning the entire index without ever
* placing even 1 of the prev_num_delpages pages in the free space map, at
* least in certain narrow cases (see nbtree/README section on recycling
* deleted pages for details). This rarely matters in practice.
*/
if (prev_num_delpages > RelationGetNumberOfBlocks(info->index) / 20)
return true;
if (cleanup_scale_factor <= 0 ||
info->num_heap_tuples < 0 ||
prev_num_heap_tuples <= 0 ||
(info->num_heap_tuples - prev_num_heap_tuples) /
prev_num_heap_tuples >= cleanup_scale_factor)
result = true;
}
_bt_relbuf(info->index, metabuf);
return result;
return false;
}
/*
@ -904,30 +911,62 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
BlockNumber num_delpages;
/* No-op in ANALYZE ONLY mode */
if (info->analyze_only)
return stats;
/*
* If btbulkdelete was called, we need not do anything, just return the
* stats from the latest btbulkdelete call. If it wasn't called, we might
* still need to do a pass over the index, to recycle any newly-recyclable
* pages or to obtain index statistics. _bt_vacuum_needs_cleanup
* determines if either are needed.
* If btbulkdelete was called, we need not do anything (we just maintain
* the information used within _bt_vacuum_needs_cleanup() by calling
* _bt_set_cleanup_info() below).
*
* Since we aren't going to actually delete any leaf items, there's no
* need to go through all the vacuum-cycle-ID pushups.
* If btbulkdelete was _not_ called, then we have a choice to make: we
* must decide whether or not a btvacuumscan() call is needed now (i.e.
* whether the ongoing VACUUM operation can entirely avoid a physical scan
* of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
* now.
*/
if (stats == NULL)
{
/* Check if we need a cleanup */
/* Check if VACUUM operation can entirely avoid btvacuumscan() call */
if (!_bt_vacuum_needs_cleanup(info))
return NULL;
/*
* Since we aren't going to actually delete any leaf items, there's no
* need to go through all the vacuum-cycle-ID pushups here
*/
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
btvacuumscan(info, stats, NULL, NULL, 0);
}
/*
* By here, we know for sure that this VACUUM operation won't be skipping
* its btvacuumscan() call. Maintain the count of the current number of
* heap tuples in the metapage. Also maintain the num_delpages value.
* This information will be used by _bt_vacuum_needs_cleanup() during
* future VACUUM operations that don't need to call btbulkdelete().
*
* num_delpages is the number of deleted pages now in the index that were
* not safe to place in the FSM to be recycled just yet. We expect that
* it will almost certainly be possible to place all of these pages in the
* FSM during the next VACUUM operation. That factor alone might cause
* _bt_vacuum_needs_cleanup() to force the next VACUUM to proceed with a
* btvacuumscan() call.
*
* Note: We must delay the _bt_set_cleanup_info() call until this late
* stage of VACUUM (the btvacuumcleanup() phase), to keep num_heap_tuples
* accurate. The btbulkdelete()-time num_heap_tuples value is generally
* just pg_class.reltuples for the heap relation _before_ VACUUM began.
* In general cleanup info should describe the state of the index/table
* _after_ VACUUM finishes.
*/
Assert(stats->pages_deleted >= stats->pages_free);
num_delpages = stats->pages_deleted - stats->pages_free;
_bt_set_cleanup_info(info->index, num_delpages, info->num_heap_tuples);
/*
* It's quite possible for us to be fooled by concurrent page splits into
* double-counting some index tuples, so disbelieve any total that exceeds
@ -957,8 +996,6 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
* deleted, and looking for old deleted pages that can be recycled. Both
* btbulkdelete and btvacuumcleanup invoke this (the latter only if no
* btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
* Note that this is also where the metadata used by _bt_vacuum_needs_cleanup
* is maintained.
*
* The caller is responsible for initially allocating/zeroing a stats struct
* and for obtaining a vacuum cycle ID if necessary.
@ -975,12 +1012,25 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
bool needLock;
/*
* Reset counts that will be incremented during the scan; needed in case
* of multiple scans during a single VACUUM command
* Reset fields that track information about the entire index now. This
* avoids double-counting in the case where a single VACUUM command
* requires multiple scans of the index.
*
* Avoid resetting the tuples_removed field here, since it tracks
* information about the VACUUM command, and so must last across each call
* to btvacuumscan().
*
* (Note that pages_free is treated as state about the whole index, not
* the current VACUUM. This is appropriate because RecordFreeIndexPage()
* calls are idempotent, and get repeated for the same deleted pages in
* some scenarios. The point for us is to track the number of recyclable
* pages in the index at the end of the VACUUM command.)
*/
stats->num_pages = 0;
stats->estimated_count = false;
stats->num_index_tuples = 0;
stats->pages_deleted = 0;
stats->pages_free = 0;
/* Set up info to pass down to btvacuumpage */
vstate.info = info;
@ -988,8 +1038,6 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
vstate.callback = callback;
vstate.callback_state = callback_state;
vstate.cycleid = cycleid;
vstate.totFreePages = 0;
vstate.oldestBtpoXact = InvalidTransactionId;
/* Create a temporary memory context to run _bt_pagedel in */
vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
@ -1048,6 +1096,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
}
}
/* Set statistics num_pages field to final size of index */
stats->num_pages = num_pages;
MemoryContextDelete(vstate.pagedelcontext);
/*
@ -1062,27 +1113,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
* Note that if no recyclable pages exist, we don't bother vacuuming the
* FSM at all.
*/
if (vstate.totFreePages > 0)
if (stats->pages_free > 0)
IndexFreeSpaceMapVacuum(rel);
/*
* Maintain the oldest btpo.xact and a count of the current number of heap
* tuples in the metapage (for the benefit of _bt_vacuum_needs_cleanup).
*
* The page with the oldest btpo.xact is typically a page deleted by this
* VACUUM operation, since pages deleted by a previous VACUUM operation
* tend to be placed in the FSM (by the current VACUUM operation) -- such
* pages are not candidates to be the oldest btpo.xact. (Note that pages
* placed in the FSM are reported as deleted pages in the bulk delete
* statistics, despite not counting as deleted pages for the purposes of
* determining the oldest btpo.xact.)
*/
_bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact,
info->num_heap_tuples);
/* update statistics */
stats->num_pages = num_pages;
stats->pages_free = vstate.totFreePages;
}
/*
@ -1188,13 +1220,12 @@ backtrack:
}
}
/* Page is valid, see what to do with it */
if (_bt_page_recyclable(page))
if (!opaque || BTPageIsRecyclable(page))
{
/* Okay to recycle this page (which could be leaf or internal) */
RecordFreeIndexPage(rel, blkno);
vstate->totFreePages++;
stats->pages_deleted++;
stats->pages_free++;
}
else if (P_ISDELETED(opaque))
{
@ -1203,17 +1234,12 @@ backtrack:
* recycle yet.
*/
stats->pages_deleted++;
/* Maintain the oldest btpo.xact */
if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
vstate->oldestBtpoXact = opaque->btpo.xact;
}
else if (P_ISHALFDEAD(opaque))
{
/*
* Half-dead leaf page. Try to delete now. Might update
* oldestBtpoXact and pages_deleted below.
* pages_deleted below.
*/
attempt_pagedel = true;
}
@ -1430,7 +1456,7 @@ backtrack:
* count. There will be no double-counting.
*/
Assert(blkno == scanblkno);
stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);
stats->pages_deleted += _bt_pagedel(rel, buf);
MemoryContextSwitchTo(oldcontext);
/* pagedel released buffer, so we shouldn't */

src/backend/access/nbtree/nbtsearch.c

@ -169,7 +169,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
* we're on the level 1 and asked to lock leaf page in write mode,
* then lock next page in write mode, because it must be a leaf.
*/
if (opaque->btpo.level == 1 && access == BT_WRITE)
if (opaque->btpo_level == 1 && access == BT_WRITE)
page_access = BT_WRITE;
/* drop the read lock on the page, then acquire one on its child */
@ -2341,9 +2341,9 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
}
/* Done? */
if (opaque->btpo.level == level)
if (opaque->btpo_level == level)
break;
if (opaque->btpo.level < level)
if (opaque->btpo_level < level)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("btree level %u not found in index \"%s\"",

src/backend/access/nbtree/nbtsort.c

@ -620,7 +620,7 @@ _bt_blnewpage(uint32 level)
/* Initialize BT opaque state */
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
opaque->btpo_prev = opaque->btpo_next = P_NONE;
opaque->btpo.level = level;
opaque->btpo_level = level;
opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
opaque->btpo_cycleid = 0;

src/backend/access/nbtree/nbtxlog.c

@ -112,7 +112,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id)
md->btm_fastlevel = xlrec->fastlevel;
/* Cannot log BTREE_MIN_VERSION index metapage without upgrade */
Assert(md->btm_version >= BTREE_NOVAC_VERSION);
md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact;
md->btm_last_cleanup_num_delpages = xlrec->last_cleanup_num_delpages;
md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples;
md->btm_allequalimage = xlrec->allequalimage;
@ -297,7 +297,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record)
ropaque->btpo_prev = origpagenumber;
ropaque->btpo_next = spagenumber;
ropaque->btpo.level = xlrec->level;
ropaque->btpo_level = xlrec->level;
ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
ropaque->btpo_cycleid = 0;
@ -773,7 +773,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
pageop->btpo_prev = xlrec->leftblk;
pageop->btpo_next = xlrec->rightblk;
pageop->btpo.level = 0;
pageop->btpo_level = 0;
pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
pageop->btpo_cycleid = 0;
@ -802,6 +802,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
BlockNumber leftsib;
BlockNumber rightsib;
uint32 level;
bool isleaf;
FullTransactionId safexid;
Buffer leftbuf;
Buffer target;
Buffer rightbuf;
@ -810,6 +813,12 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
leftsib = xlrec->leftsib;
rightsib = xlrec->rightsib;
level = xlrec->level;
isleaf = (level == 0);
safexid = xlrec->safexid;
/* No leaftopparent for level 0 (leaf page) or level 1 target */
Assert(xlrec->leaftopparent == InvalidBlockNumber || level > 1);
/*
* In normal operation, we would lock all the pages this WAL record
@ -844,9 +853,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
pageop->btpo_prev = leftsib;
pageop->btpo_next = rightsib;
pageop->btpo.xact = xlrec->btpo_xact;
pageop->btpo_flags = BTP_DELETED;
if (!BlockNumberIsValid(xlrec->topparent))
pageop->btpo_level = level;
BTPageSetDeleted(page, safexid);
if (isleaf)
pageop->btpo_flags |= BTP_LEAF;
pageop->btpo_cycleid = 0;
@ -892,6 +901,8 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
Buffer leafbuf;
IndexTupleData trunctuple;
Assert(!isleaf);
leafbuf = XLogInitBufferForRedo(record, 3);
page = (Page) BufferGetPage(leafbuf);
@ -901,13 +912,13 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
pageop->btpo_prev = xlrec->leafleftsib;
pageop->btpo_next = xlrec->leafrightsib;
pageop->btpo.level = 0;
pageop->btpo_level = 0;
pageop->btpo_cycleid = 0;
/* Add a dummy hikey item */
MemSet(&trunctuple, 0, sizeof(IndexTupleData));
trunctuple.t_info = sizeof(IndexTupleData);
BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);
BTreeTupleSetTopParent(&trunctuple, xlrec->leaftopparent);
if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
false, false) == InvalidOffsetNumber)
@ -942,7 +953,7 @@ btree_xlog_newroot(XLogReaderState *record)
pageop->btpo_flags = BTP_ROOT;
pageop->btpo_prev = pageop->btpo_next = P_NONE;
pageop->btpo.level = xlrec->level;
pageop->btpo_level = xlrec->level;
if (xlrec->level == 0)
pageop->btpo_flags |= BTP_LEAF;
pageop->btpo_cycleid = 0;
@ -963,26 +974,40 @@ btree_xlog_newroot(XLogReaderState *record)
_bt_restore_meta(record, 2);
}
/*
* In general VACUUM must defer recycling as a way of avoiding certain race
* conditions. Deleted pages contain a safexid value that is used by VACUUM
* to determine whether or not it's safe to place a page that was deleted by
* VACUUM earlier into the FSM now. See nbtree/README.
*
* As far as any backend operating during original execution is concerned, the
* FSM is a cache of recycle-safe pages; the mere presence of the page in the
* FSM indicates that the page must already be safe to recycle (actually,
* _bt_getbuf() verifies it's safe using BTPageIsRecyclable(), but that's just
* because it would be unwise to completely trust the FSM, given its current
* limitations).
*
* This isn't sufficient to prevent similar concurrent recycling race
* conditions during Hot Standby, though. For that we need to log an
* xl_btree_reuse_page record at the point that a page is actually recycled
* and reused for an entirely unrelated page inside _bt_split(). These
* records include the same safexid value from the original deleted page,
* stored in the record's latestRemovedFullXid field.
*
* The GlobalVisCheckRemovableFullXid() test in BTPageIsRecyclable() is used
* to determine if it's safe to recycle a page. This mirrors our own test:
* the PGPROC->xmin > limitXmin test inside GetConflictingVirtualXIDs().
* Consequently, one XID value achieves the same exclusion effect on primary
* and standby.
*/
static void
btree_xlog_reuse_page(XLogReaderState *record)
{
xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
/*
* Btree reuse_page records exist to provide a conflict point when we
* reuse pages in the index via the FSM. That's all they do though.
*
* latestRemovedXid was the page's btpo.xact. The
* GlobalVisCheckRemovableXid test in _bt_page_recyclable() conceptually
* mirrors the pgxact->xmin > limitXmin test in
* GetConflictingVirtualXIDs(). Consequently, one XID value achieves the
* same exclusion effect on primary and standby.
*/
if (InHotStandby)
{
ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
xlrec->node);
}
ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid,
xlrec->node);
}
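
For context, the conflict point consumed by btree_xlog_reuse_page() is emitted on the primary just before a deleted page is handed out for reuse. A condensed sketch of that WAL-logging step, modeled on what nbtpage.c does in _bt_log_reuse_page() after this commit (paraphrased, not verbatim source):

/* Sketch: log a recycle-conflict point before reusing a deleted page */
static void
log_reuse_page_sketch(Relation rel, BlockNumber blkno,
                      FullTransactionId safexid)
{
	xl_btree_reuse_page xlrec_reuse;

	/*
	 * No buffer is registered: the page itself isn't modified here.  The
	 * record exists only so Hot Standby can resolve snapshot conflicts.
	 */
	xlrec_reuse.node = rel->rd_node;
	xlrec_reuse.block = blkno;
	xlrec_reuse.latestRemovedFullXid = safexid;	/* the page's old safexid */

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
	XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
}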
void


@ -80,12 +80,13 @@ btree_desc(StringInfo buf, XLogReaderState *record)
{
xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) rec;
appendStringInfo(buf, "left %u; right %u; btpo_xact %u; ",
xlrec->leftsib, xlrec->rightsib,
xlrec->btpo_xact);
appendStringInfo(buf, "leafleft %u; leafright %u; topparent %u",
appendStringInfo(buf, "left %u; right %u; level %u; safexid %u:%u; ",
xlrec->leftsib, xlrec->rightsib, xlrec->level,
EpochFromFullTransactionId(xlrec->safexid),
XidFromFullTransactionId(xlrec->safexid));
appendStringInfo(buf, "leafleft %u; leafright %u; leaftopparent %u",
xlrec->leafleftsib, xlrec->leafrightsib,
xlrec->topparent);
xlrec->leaftopparent);
break;
}
case XLOG_BTREE_NEWROOT:
@ -99,9 +100,11 @@ btree_desc(StringInfo buf, XLogReaderState *record)
{
xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec;
appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u",
appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode, xlrec->latestRemovedXid);
xlrec->node.relNode,
EpochFromFullTransactionId(xlrec->latestRemovedFullXid),
XidFromFullTransactionId(xlrec->latestRemovedFullXid));
break;
}
case XLOG_BTREE_META_CLEANUP:
@ -110,8 +113,8 @@ btree_desc(StringInfo buf, XLogReaderState *record)
xlrec = (xl_btree_metadata *) XLogRecGetBlockData(record, 0,
NULL);
appendStringInfo(buf, "oldest_btpo_xact %u; last_cleanup_num_heap_tuples: %f",
xlrec->oldest_btpo_xact,
appendStringInfo(buf, "last_cleanup_num_delpages %u; last_cleanup_num_heap_tuples: %f",
xlrec->last_cleanup_num_delpages,
xlrec->last_cleanup_num_heap_tuples);
break;
}
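
For reference, the %u:%u rendering above prints a FullTransactionId as its 32-bit epoch and XID halves. The accessors are simple shifts and truncations; paraphrased from access/transam.h:

/* How a 64-bit FullTransactionId splits into epoch:xid for display */
#define EpochFromFullTransactionId(x)	((uint32) ((x).value >> 32))
#define XidFromFullTransactionId(x)	((uint32) (x).value)

A safexid printed as "1:12345" therefore denotes the 64-bit XID (1UL << 32) + 12345.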


@ -452,6 +452,34 @@ ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode
true);
}
/*
* Variant of ResolveRecoveryConflictWithSnapshot that works with
* FullTransactionId values
*/
void
ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
RelFileNode node)
{
/*
* ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
* so truncate the logged FullTransactionId. If the logged value is very
* old, so that XID wrap-around already happened on it, there can't be any
* snapshots that still see it.
*/
FullTransactionId nextXid = ReadNextFullTransactionId();
uint64 diff;
diff = U64FromFullTransactionId(nextXid) -
U64FromFullTransactionId(latestRemovedFullXid);
if (diff < MaxTransactionId / 2)
{
TransactionId latestRemovedXid;
latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid);
ResolveRecoveryConflictWithSnapshot(latestRemovedXid, node);
}
}
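
The diff < MaxTransactionId / 2 test above merits a worked example. Because the subtraction is done on 64-bit values it cannot wrap, so "more than ~2^31 XIDs in the past" is a reliable staleness check. A hypothetical standalone illustration (names invented here, not PostgreSQL API):

#include <stdbool.h>
#include <stdint.h>

/* next_fxid/removed_fxid stand in for 64-bit FullTransactionId values */
static bool
needs_snapshot_conflict(uint64_t next_fxid, uint64_t removed_fxid)
{
	uint64_t	diff = next_fxid - removed_fxid;

	/*
	 * MaxTransactionId is 0xFFFFFFFF.  A value removed more than ~2^31
	 * XIDs ago predates every snapshot a standby could still hold, so no
	 * recovery conflict needs to be thrown for it.
	 */
	return diff < UINT64_C(0xFFFFFFFF) / 2;
}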
void
ResolveRecoveryConflictWithTablespace(Oid tsid)
{


@ -37,8 +37,9 @@ typedef uint16 BTCycleId;
*
* In addition, we store the page's btree level (counting upwards from
* zero at a leaf page) as well as some flag bits indicating the page type
* and status. If the page is deleted, we replace the level with the
* next-transaction-ID value indicating when it is safe to reclaim the page.
* and status. If the page is deleted, a BTDeletedPageData struct is stored
* in the page's tuple area, while a standard BTPageOpaqueData struct is
* stored in the page special area.
*
* We also store a "vacuum cycle ID". When a page is split while VACUUM is
* processing the index, a nonzero value associated with the VACUUM run is
@ -52,17 +53,17 @@ typedef uint16 BTCycleId;
*
* NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
* instead.
*
* NOTE: the btpo_level field used to be a union type in order to allow
* deleted pages to store a 32-bit safexid in the same field. We now store
* 64-bit/full safexid values using BTDeletedPageData instead.
*/
typedef struct BTPageOpaqueData
{
BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */
BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */
union
{
uint32 level; /* tree level --- zero for leaf pages */
TransactionId xact; /* next transaction ID, if deleted */
} btpo;
uint32 btpo_level; /* tree level --- zero for leaf pages */
uint16 btpo_flags; /* flag bits, see below */
BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */
} BTPageOpaqueData;
@ -78,6 +79,7 @@ typedef BTPageOpaqueData *BTPageOpaque;
#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */
#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */
#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */
#define BTP_HAS_FULLXID (1 << 8) /* contains BTDeletedPageData */
/*
* The max allowed value of a cycle ID is a bit less than 64K. This is
@ -105,10 +107,12 @@ typedef struct BTMetaPageData
BlockNumber btm_fastroot; /* current "fast" root location */
uint32 btm_fastlevel; /* tree level of the "fast" root page */
/* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */
TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among all deleted
* pages */
float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples
* during last cleanup */
/* number of deleted, non-recyclable pages during last cleanup */
uint32 btm_last_cleanup_num_delpages;
/* number of heap tuples during last cleanup */
float8 btm_last_cleanup_num_heap_tuples;
bool btm_allequalimage; /* are all columns "equalimage"? */
} BTMetaPageData;
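
To illustrate how btm_last_cleanup_num_delpages can be consulted, here is a hedged sketch of a cleanup trigger that scales with index size. The 5% threshold is an assumption for illustration only; it is not necessarily the exact heuristic used by _bt_vacuum_needs_cleanup():

/* Sketch: should VACUUM scan the index despite having no tuples to delete? */
static bool
cleanup_needed_sketch(Relation rel, uint32 prev_num_delpages)
{
	BlockNumber relpages = RelationGetNumberOfBlocks(rel);

	/*
	 * Only force a full index scan once deleted-but-unrecycled pages make
	 * up a meaningful fraction of the index's total size.
	 */
	return prev_num_delpages > 0 && prev_num_delpages > relpages / 20;
}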
@ -220,6 +224,93 @@ typedef struct BTMetaPageData
#define P_IGNORE(opaque) (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
#define P_HAS_GARBAGE(opaque) (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
#define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
#define P_HAS_FULLXID(opaque) (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
/*
* BTDeletedPageData is the page contents of a deleted page
*/
typedef struct BTDeletedPageData
{
FullTransactionId safexid; /* See BTPageIsRecyclable() */
} BTDeletedPageData;
static inline void
BTPageSetDeleted(Page page, FullTransactionId safexid)
{
BTPageOpaque opaque;
PageHeader header;
BTDeletedPageData *contents;
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
header = ((PageHeader) page);
opaque->btpo_flags &= ~BTP_HALF_DEAD;
opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
sizeof(BTDeletedPageData);
header->pd_upper = header->pd_special;
/* Set safexid in deleted page */
contents = ((BTDeletedPageData *) PageGetContents(page));
contents->safexid = safexid;
}
static inline FullTransactionId
BTPageGetDeleteXid(Page page)
{
BTPageOpaque opaque;
BTDeletedPageData *contents;
/* We only expect to be called with a deleted page */
Assert(!PageIsNew(page));
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(P_ISDELETED(opaque));
/* pg_upgrade'd deleted page -- must be safe to recycle now */
if (!P_HAS_FULLXID(opaque))
return FirstNormalFullTransactionId;
/* Get safexid from deleted page */
contents = ((BTDeletedPageData *) PageGetContents(page));
return contents->safexid;
}
/*
* Is an existing page recyclable?
*
* This exists to centralize the policy on which deleted pages are now safe to
* re-use.
*
* Note: PageIsNew() pages are always safe to recycle, but we can't deal with
* them here (caller is responsible for that case themselves). Caller might
* well need special handling for new pages anyway.
*/
static inline bool
BTPageIsRecyclable(Page page)
{
BTPageOpaque opaque;
Assert(!PageIsNew(page));
/* Recycling okay iff page is deleted and safexid is old enough */
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (P_ISDELETED(opaque))
{
/*
* The page was deleted, but when? If it was just deleted, a scan
* might have seen the downlink to it, and will read the page later.
* As long as that can happen, we must keep the deleted page around as
* a tombstone.
*
* For that, check whether the deletion XID could still be visible to
* anyone. If not, then no scan that's still in progress could have
* seen its downlink, and we can recycle it.
*/
return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page));
}
return false;
}
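
Callers are expected to screen out new pages before asking. A minimal VACUUM-side usage sketch, mirroring the btvacuumpage() fragment shown earlier:

/* Sketch: typical use of BTPageIsRecyclable() during index VACUUM */
if (PageIsNew(page))
{
	/* new pages need their own handling, per the note above */
}
else if (BTPageIsRecyclable(page))
{
	/* deleted long enough ago -- safe to report to the FSM for reuse */
	RecordFreeIndexPage(rel, blkno);
}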
/*
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
@ -962,7 +1053,7 @@ typedef struct BTOptions
{
int32 varlena_header_; /* varlena header (do not touch directly!) */
int fillfactor; /* page fill factor in percent (0..100) */
/* fraction of newly inserted tuples prior to trigger index cleanup */
/* fraction of newly inserted tuples needed to trigger index cleanup */
float8 vacuum_cleanup_index_scale_factor;
bool deduplicate_items; /* Try to deduplicate items? */
} BTOptions;
@ -1066,8 +1157,8 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage,
*/
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
bool allequalimage);
extern void _bt_update_meta_cleanup_info(Relation rel,
TransactionId oldestBtpoXact, float8 numHeapTuples);
extern void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages,
float8 num_heap_tuples);
extern void _bt_upgrademetapage(Page page);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
@ -1084,15 +1175,13 @@ extern void _bt_unlockbuf(Relation rel, Buffer buf);
extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
OffsetNumber *deletable, int ndeletable,
BTVacuumPosting *updatable, int nupdatable);
extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
Relation heapRel,
TM_IndexDeleteOp *delstate);
extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf,
TransactionId *oldestBtpoXact);
extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf);
/*
* prototypes for functions in nbtsearch.c


@ -13,6 +13,7 @@
#ifndef NBTXLOG_H
#define NBTXLOG_H
#include "access/transam.h"
#include "access/xlogreader.h"
#include "lib/stringinfo.h"
#include "storage/off.h"
@ -52,7 +53,7 @@ typedef struct xl_btree_metadata
uint32 level;
BlockNumber fastroot;
uint32 fastlevel;
TransactionId oldest_btpo_xact;
uint32 last_cleanup_num_delpages;
float8 last_cleanup_num_heap_tuples;
bool allequalimage;
} xl_btree_metadata;
@ -187,7 +188,7 @@ typedef struct xl_btree_reuse_page
{
RelFileNode node;
BlockNumber block;
TransactionId latestRemovedXid;
FullTransactionId latestRemovedFullXid;
} xl_btree_reuse_page;
#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
@ -282,9 +283,12 @@ typedef struct xl_btree_mark_page_halfdead
#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
/*
* This is what we need to know about deletion of a btree page. Note we do
* not store any content for the deleted page --- it is just rewritten as empty
* during recovery, apart from resetting the btpo.xact.
* This is what we need to know about deletion of a btree page. Note that we
* only leave behind a small amount of bookkeeping information in deleted
* pages (deleted pages must be kept around as tombstones for a while). It is
* convenient for the REDO routine to regenerate its target page from scratch.
* This is why the WAL record describes certain details that are actually
* directly available from the target page.
*
* Backup Blk 0: target block being deleted
* Backup Blk 1: target block's left sibling, if any
@ -296,20 +300,24 @@ typedef struct xl_btree_unlink_page
{
BlockNumber leftsib; /* target block's left sibling, if any */
BlockNumber rightsib; /* target block's right sibling */
uint32 level; /* target block's level */
FullTransactionId safexid; /* target block's BTPageSetDeleted() XID */
/*
* Information needed to recreate the leaf page, when target is an
* internal page.
* Information needed to recreate a half-dead leaf page with the correct
* topparent link. The fields are only used when the deletion operation's
* target page is an internal page. The REDO routine creates the half-dead
* page from scratch to keep things simple (the same convenient approach
* used for the target page itself).
*/
BlockNumber leafleftsib;
BlockNumber leafrightsib;
BlockNumber topparent; /* next child down in the subtree */
BlockNumber leaftopparent; /* next child down in the subtree */
TransactionId btpo_xact; /* value of btpo.xact for use in recovery */
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
} xl_btree_unlink_page;
#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))
#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber))
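
To make "regenerate from scratch" concrete: REDO can rebuild the deleted target page purely from the record's fields, roughly as in the btree_xlog_unlink_page() fragment shown earlier (condensed sketch, not verbatim source):

/* Sketch: rebuild the deleted target page during REDO */
buf = XLogInitBufferForRedo(record, 0);
page = (Page) BufferGetPage(buf);
_bt_pageinit(page, BufferGetPageSize(buf));
pageop = (BTPageOpaque) PageGetSpecialPointer(page);

pageop->btpo_prev = xlrec->leftsib;
pageop->btpo_next = xlrec->rightsib;
pageop->btpo_level = xlrec->level;
BTPageSetDeleted(page, xlrec->safexid);	/* sets BTP_DELETED + full XID */
if (xlrec->level == 0)
	pageop->btpo_flags |= BTP_LEAF;
pageop->btpo_cycleid = 0;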
/*
* New root log record. There are zero tuples if this is to establish an


@ -31,7 +31,7 @@
/*
* Each page of XLOG file has a header like this:
*/
#define XLOG_PAGE_MAGIC 0xD109 /* can be used as WAL version indicator */
#define XLOG_PAGE_MAGIC 0xD10A /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{


@ -31,6 +31,8 @@ extern void ShutdownRecoveryTransactionEnvironment(void);
extern void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid,
RelFileNode node);
extern void ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
RelFileNode node);
extern void ResolveRecoveryConflictWithTablespace(Oid tsid);
extern void ResolveRecoveryConflictWithDatabase(Oid dbid);