Add deduplication to nbtree.

Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method.  The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs.  Deduplication is only applied at
the point where a leaf page split would otherwise be required.  New
posting list tuples are formed by merging together existing duplicate
tuples.  The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed.  Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.

The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach.  Most individual inserts of index tuples have
exactly the same overhead as before.  The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits.  The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).

Testing has shown that nbtree deduplication can generally make indexes
that have about 10 or 15 tuples for each distinct key value 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key).  The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective.  This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
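
A hedged sketch of the kind of test behind these numbers (the table,
index, and row counts are made up; actual savings depend on the data):

    CREATE TABLE referencing (fk integer);
    INSERT INTO referencing
        SELECT i % 100000 FROM generate_series(1, 1000000) i;
    CREATE INDEX referencing_fk_idx ON referencing (fk);
    -- roughly 10 TIDs per distinct fk value; compare the reported size
    -- against an index built WITH (deduplicate_items = off)
    SELECT pg_size_pretty(pg_relation_size('referencing_fk_idx'));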

A new index storage parameter (deduplicate_items) controls the use of
deduplication.  The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible.  This decision will be
reviewed at the end of the Postgres 13 beta period.
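
For example (hypothetical relation names; the parameter can be set at
index creation time or changed later):

    CREATE INDEX orders_customer_idx ON orders (customer_id)
        WITH (deduplicate_items = off);
    ALTER INDEX orders_customer_idx SET (deduplicate_items = on);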

There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values.  The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely.  Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").

Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.

No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12).  However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from.  This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
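
In practice that means running something like the following against
each affected index after pg_upgrade (the index name is hypothetical;
REINDEX INDEX CONCURRENTLY is also an option):

    REINDEX INDEX old_cluster_idx;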

Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
    https://postgr.es/m/55E4051B.7020209@postgrespro.ru
    https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
Peter Geoghegan 2020-02-26 13:05:30 -08:00
parent 612a1ab767
commit 0d861bbb70
28 changed files with 3554 additions and 333 deletions

contrib/amcheck/verify_nbtree.c

@ -145,6 +145,7 @@ static void bt_tuple_present_callback(Relation index, ItemPointer tid,
bool tupleIsAlive, void *checkstate);
static IndexTuple bt_normalize_tuple(BtreeCheckState *state,
IndexTuple itup);
static inline IndexTuple bt_posting_plain_tuple(IndexTuple itup, int n);
static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup);
static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
OffsetNumber offset);
@ -167,6 +168,7 @@ static ItemId PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block,
Page page, OffsetNumber offset);
static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state,
IndexTuple itup, bool nonpivot);
static inline ItemPointer BTreeTupleGetPointsToTID(IndexTuple itup);
/*
* bt_index_check(index regclass, heapallindexed boolean)
@ -278,7 +280,8 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
if (btree_index_mainfork_expected(indrel))
{
bool heapkeyspace;
bool heapkeyspace,
allequalimage;
RelationOpenSmgr(indrel);
if (!smgrexists(indrel->rd_smgr, MAIN_FORKNUM))
@ -288,7 +291,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
RelationGetRelationName(indrel))));
/* Check index, possibly against table it is an index on */
heapkeyspace = _bt_heapkeyspace(indrel);
_bt_metaversion(indrel, &heapkeyspace, &allequalimage);
bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck,
heapallindexed, rootdescend);
}
@ -419,12 +422,12 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
/*
* Size Bloom filter based on estimated number of tuples in index,
* while conservatively assuming that each block must contain at least
* MaxIndexTuplesPerPage / 5 non-pivot tuples. (Non-leaf pages cannot
* contain non-pivot tuples. That's okay because they generally make
* up no more than about 1% of all pages in the index.)
* MaxTIDsPerBTreePage / 3 "plain" tuples -- see
* bt_posting_plain_tuple() for definition, and details of how posting
* list tuples are handled.
*/
total_pages = RelationGetNumberOfBlocks(rel);
total_elems = Max(total_pages * (MaxIndexTuplesPerPage / 5),
total_elems = Max(total_pages * (MaxTIDsPerBTreePage / 3),
(int64) state->rel->rd_rel->reltuples);
/* Random seed relies on backend srandom() call to avoid repetition */
seed = random();
@ -924,6 +927,7 @@ bt_target_page_check(BtreeCheckState *state)
size_t tupsize;
BTScanInsert skey;
bool lowersizelimit;
ItemPointer scantid;
CHECK_FOR_INTERRUPTS();
@ -954,13 +958,15 @@ bt_target_page_check(BtreeCheckState *state)
if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target,
offset))
{
ItemPointer tid;
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
tid = BTreeTupleGetPointsToTID(itup);
htid = psprintf("(%u,%u)",
ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
ItemPointerGetBlockNumberNoCheck(tid),
ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
@ -994,18 +1000,20 @@ bt_target_page_check(BtreeCheckState *state)
/*
* Readonly callers may optionally verify that non-pivot tuples can
* each be found by an independent search that starts from the root
* each be found by an independent search that starts from the root.
* Note that we deliberately don't do individual searches for each
* TID, since the posting list itself is validated by other checks.
*/
if (state->rootdescend && P_ISLEAF(topaque) &&
!bt_rootdescend(state, itup))
{
ItemPointer tid = BTreeTupleGetPointsToTID(itup);
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
htid = psprintf("(%u,%u)",
ItemPointerGetBlockNumber(&(itup->t_tid)),
ItemPointerGetOffsetNumber(&(itup->t_tid)));
htid = psprintf("(%u,%u)", ItemPointerGetBlockNumber(tid),
ItemPointerGetOffsetNumber(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
@ -1017,6 +1025,40 @@ bt_target_page_check(BtreeCheckState *state)
(uint32) state->targetlsn)));
}
/*
* If tuple is a posting list tuple, make sure posting list TIDs are
* in order
*/
if (BTreeTupleIsPosting(itup))
{
ItemPointerData last;
ItemPointer current;
ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last);
for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
{
current = BTreeTupleGetPostingN(itup, i);
if (ItemPointerCompare(current, &last) <= 0)
{
char *itid = psprintf("(%u,%u)", state->targetblock, offset);
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("posting list contains misplaced TID in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%X.",
itid, i,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
}
ItemPointerCopy(current, &last);
}
}
/* Build insertion scankey for current page offset */
skey = bt_mkscankey_pivotsearch(state->rel, itup);
@ -1049,13 +1091,14 @@ bt_target_page_check(BtreeCheckState *state)
if (tupsize > (lowersizelimit ? BTMaxItemSize(state->target) :
BTMaxItemSizeNoHeapTid(state->target)))
{
ItemPointer tid = BTreeTupleGetPointsToTID(itup);
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
htid = psprintf("(%u,%u)",
ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
ItemPointerGetBlockNumberNoCheck(tid),
ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
@ -1074,12 +1117,32 @@ bt_target_page_check(BtreeCheckState *state)
{
IndexTuple norm;
norm = bt_normalize_tuple(state, itup);
bloom_add_element(state->filter, (unsigned char *) norm,
IndexTupleSize(norm));
/* Be tidy */
if (norm != itup)
pfree(norm);
if (BTreeTupleIsPosting(itup))
{
/* Fingerprint all elements as distinct "plain" tuples */
for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
{
IndexTuple logtuple;
logtuple = bt_posting_plain_tuple(itup, i);
norm = bt_normalize_tuple(state, logtuple);
bloom_add_element(state->filter, (unsigned char *) norm,
IndexTupleSize(norm));
/* Be tidy */
if (norm != logtuple)
pfree(norm);
pfree(logtuple);
}
}
else
{
norm = bt_normalize_tuple(state, itup);
bloom_add_element(state->filter, (unsigned char *) norm,
IndexTupleSize(norm));
/* Be tidy */
if (norm != itup)
pfree(norm);
}
}
/*
@ -1087,7 +1150,8 @@ bt_target_page_check(BtreeCheckState *state)
*
* If there is a high key (if this is not the rightmost page on its
* entire level), check that high key actually is upper bound on all
* page items.
* page items. If this is a posting list tuple, we'll need to set
* scantid to be highest TID in posting list.
*
* We prefer to check all items against high key rather than checking
* just the last and trusting that the operator class obeys the
@ -1127,17 +1191,22 @@ bt_target_page_check(BtreeCheckState *state)
* tuple. (See also: "Notes About Data Representation" in the nbtree
* README.)
*/
scantid = skey->scantid;
if (state->heapkeyspace && BTreeTupleIsPosting(itup))
skey->scantid = BTreeTupleGetMaxHeapTID(itup);
if (!P_RIGHTMOST(topaque) &&
!(P_ISLEAF(topaque) ? invariant_leq_offset(state, skey, P_HIKEY) :
invariant_l_offset(state, skey, P_HIKEY)))
{
ItemPointer tid = BTreeTupleGetPointsToTID(itup);
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
htid = psprintf("(%u,%u)",
ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
ItemPointerGetBlockNumberNoCheck(tid),
ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
@ -1150,6 +1219,8 @@ bt_target_page_check(BtreeCheckState *state)
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
}
/* Reset, in case scantid was set to (itup) posting tuple's max TID */
skey->scantid = scantid;
/*
* * Item order check *
@ -1160,15 +1231,17 @@ bt_target_page_check(BtreeCheckState *state)
if (OffsetNumberNext(offset) <= max &&
!invariant_l_offset(state, skey, OffsetNumberNext(offset)))
{
ItemPointer tid;
char *itid,
*htid,
*nitid,
*nhtid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
tid = BTreeTupleGetPointsToTID(itup);
htid = psprintf("(%u,%u)",
ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
ItemPointerGetBlockNumberNoCheck(tid),
ItemPointerGetOffsetNumberNoCheck(tid));
nitid = psprintf("(%u,%u)", state->targetblock,
OffsetNumberNext(offset));
@ -1177,9 +1250,10 @@ bt_target_page_check(BtreeCheckState *state)
state->target,
OffsetNumberNext(offset));
itup = (IndexTuple) PageGetItem(state->target, itemid);
tid = BTreeTupleGetPointsToTID(itup);
nhtid = psprintf("(%u,%u)",
ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
ItemPointerGetBlockNumberNoCheck(tid),
ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
@ -1953,10 +2027,9 @@ bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values,
* verification. In particular, it won't try to normalize opclass-equal
* datums with potentially distinct representations (e.g., btree/numeric_ops
* index datums will not get their display scale normalized-away here).
* Normalization may need to be expanded to handle more cases in the future,
* though. For example, it's possible that non-pivot tuples could in the
* future have alternative logically equivalent representations due to using
* the INDEX_ALT_TID_MASK bit to implement intelligent deduplication.
* Caller does normalization for non-pivot tuples that have a posting list,
* since dummy CREATE INDEX callback code generates new tuples with the same
* normalized representation.
*/
static IndexTuple
bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
@ -1969,6 +2042,9 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
IndexTuple reformed;
int i;
/* Caller should only pass "logical" non-pivot tuples here */
Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup));
/* Easy case: It's immediately clear that tuple has no varlena datums */
if (!IndexTupleHasVarwidths(itup))
return itup;
@ -2031,6 +2107,29 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
return reformed;
}
/*
* Produce palloc()'d "plain" tuple for nth posting list entry/TID.
*
* In general, deduplication is not supposed to change the logical contents of
* an index. Multiple index tuples are merged together into one equivalent
* posting list index tuple when convenient.
*
* heapallindexed verification must normalize-away this variation in
* representation by converting posting list tuples into two or more "plain"
* tuples. Each tuple must be fingerprinted separately -- there must be one
* tuple for each corresponding Bloom filter probe during the heap scan.
*
* Note: Caller still needs to call bt_normalize_tuple() with returned tuple.
*/
static inline IndexTuple
bt_posting_plain_tuple(IndexTuple itup, int n)
{
Assert(BTreeTupleIsPosting(itup));
/* Returns non-posting-list tuple */
return _bt_form_posting(itup, BTreeTupleGetPostingN(itup, n), 1);
}
/*
* Search for itup in index, starting from fast root page. itup must be a
* non-pivot tuple. This is only supported with heapkeyspace indexes, since
@ -2087,6 +2186,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
insertstate.itup = itup;
insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
insertstate.itup_key = key;
insertstate.postingoff = 0;
insertstate.bounds_valid = false;
insertstate.buf = lbuf;
@ -2094,7 +2194,9 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
offnum = _bt_binsrch_insert(state->rel, &insertstate);
/* Compare first >= matching item on leaf page, if any */
page = BufferGetPage(lbuf);
/* Should match on first heap TID when tuple has a posting list */
if (offnum <= PageGetMaxOffsetNumber(page) &&
insertstate.postingoff <= 0 &&
_bt_compare(state->rel, key, page, offnum) == 0)
exists = true;
_bt_relbuf(state->rel, lbuf);
@ -2548,26 +2650,69 @@ PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page,
}
/*
* BTreeTupleGetHeapTID() wrapper that lets caller enforce that a heap TID must
* be present in cases where that is mandatory.
*
* This doesn't add much as of BTREE_VERSION 4, since the INDEX_ALT_TID_MASK
* bit is effectively a proxy for whether or not the tuple is a pivot tuple.
* It may become more useful in the future, when non-pivot tuples support their
* own alternative INDEX_ALT_TID_MASK representation.
* BTreeTupleGetHeapTID() wrapper that enforces that a heap TID is present in
* cases where that is mandatory (i.e. for non-pivot tuples)
*/
static inline ItemPointer
BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup,
bool nonpivot)
{
ItemPointer result = BTreeTupleGetHeapTID(itup);
BlockNumber targetblock = state->targetblock;
ItemPointer htid;
if (result == NULL && nonpivot)
/*
* Caller determines whether this is supposed to be a pivot or non-pivot
* tuple using page type and item offset number. Verify that tuple
* metadata agrees with this.
*/
Assert(state->heapkeyspace);
if (BTreeTupleIsPivot(itup) && nonpivot)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected pivot tuple",
state->targetblock,
RelationGetRelationName(state->rel))));
if (!BTreeTupleIsPivot(itup) && !nonpivot)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected non-pivot tuple",
state->targetblock,
RelationGetRelationName(state->rel))));
htid = BTreeTupleGetHeapTID(itup);
if (!ItemPointerIsValid(htid) && nonpivot)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID",
targetblock, RelationGetRelationName(state->rel))));
state->targetblock,
RelationGetRelationName(state->rel))));
return result;
return htid;
}
/*
* Return the "pointed to" TID for itup, which is used to generate a
* descriptive error message. itup must be a "data item" tuple (it wouldn't
* make much sense to call here with a high key tuple, since there won't be a
* valid downlink/block number to display).
*
* Returns either a heap TID (which will be the first heap TID in posting list
* if itup is posting list tuple), or a TID that contains downlink block
* number, plus some encoded metadata (e.g., the number of attributes present
* in itup).
*/
static inline ItemPointer
BTreeTupleGetPointsToTID(IndexTuple itup)
{
/*
* Rely on the assumption that !heapkeyspace internal page data items will
* correctly return TID with downlink here -- BTreeTupleGetHeapTID() won't
* recognize it as a pivot tuple, but everything still works out because
* the t_tid field is still returned
*/
if (!BTreeTupleIsPivot(itup))
return BTreeTupleGetHeapTID(itup);
/* Pivot tuple returns TID with downlink block (heapkeyspace variant) */
return &itup->t_tid;
}

doc/src/sgml/btree.sgml

@ -557,11 +557,208 @@ equalimage(<replaceable>opcintype</replaceable> <type>oid</type>) returns bool
<sect1 id="btree-implementation">
<title>Implementation</title>
<para>
This section covers B-Tree index implementation details that may be
of use to advanced users. See
<filename>src/backend/access/nbtree/README</filename> in the source
distribution for a much more detailed, internals-focused description
of the B-Tree implementation.
</para>
<sect2 id="btree-structure">
<title>B-Tree Structure</title>
<para>
An introduction to the btree index implementation can be found in
<filename>src/backend/access/nbtree/README</filename>.
<productname>PostgreSQL</productname> B-Tree indexes are
multi-level tree structures, where each level of the tree can be
used as a doubly-linked list of pages. A single metapage is stored
in a fixed position at the start of the first segment file of the
index. All other pages are either leaf pages or internal pages.
Leaf pages are the pages on the lowest level of the tree. All
other levels consist of internal pages. Each leaf page contains
tuples that point to table rows. Each internal page contains
tuples that point to the next level down in the tree. Typically,
over 99% of all pages are leaf pages. Both internal pages and leaf
pages use the standard page format described in <xref
linkend="storage-page-layout"/>.
</para>
<para>
New leaf pages are added to a B-Tree index when an existing leaf
page cannot fit an incoming tuple. A <firstterm>page
split</firstterm> operation makes room for items that originally
belonged on the overflowing page by moving a portion of the items
to a new page. Page splits must also insert a new
<firstterm>downlink</firstterm> to the new page in the parent page,
which may cause the parent to split in turn. Page splits
<quote>cascade upwards</quote> in a recursive fashion. When the
root page finally cannot fit a new downlink, a <firstterm>root page
split</firstterm> operation takes place. This adds a new level to
the tree structure by creating a new root page that is one level
above the original root page.
</para>
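<para>
As a rough, hedged sketch (the <literal>pageinspect</literal> contrib
extension and the index name here are illustrative, not part of this
patch), the metapage and the items on an individual page can be
inspected as follows:
<programlisting>
CREATE EXTENSION pageinspect;
SELECT * FROM bt_metap('referencing_fk_idx');
SELECT * FROM bt_page_items('referencing_fk_idx', 1);
</programlisting>
</para>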
</sect2>
<sect2 id="btree-deduplication">
<title>Deduplication</title>
<para>
A duplicate is a leaf page tuple (a tuple that points to a table
row) where <emphasis>all</emphasis> indexed key columns have values
that match corresponding column values from at least one other leaf
page tuple that's close by in the same index. Duplicate tuples are
quite common in practice. B-Tree indexes can use a special,
space-efficient representation for duplicates when an optional
technique is enabled: <firstterm>deduplication</firstterm>.
</para>
<para>
Deduplication works by periodically merging groups of duplicate
tuples together, forming a single posting list tuple for each
group. The column key value(s) only appear once in this
representation. This is followed by a sorted array of
<acronym>TID</acronym>s that point to rows in the table. This
significantly reduces the storage size of indexes where each value
(or each distinct combination of column values) appears several
times on average. The latency of queries can be reduced
significantly. Overall query throughput may increase
significantly. The overhead of routine index vacuuming may also be
reduced significantly.
</para>
<note>
<para>
While NULL is generally not considered to be equal to any other
value, including NULL, NULL is nevertheless treated as just
another value from the domain of indexed values by the B-Tree
implementation (except when enforcing uniqueness in a unique
index). B-Tree deduplication is therefore just as effective with
<quote>duplicates</quote> that contain a NULL value.
</para>
</note>
<para>
The deduplication process occurs lazily, when a new item is
inserted that cannot fit on an existing leaf page. This prevents
(or at least delays) leaf page splits. Unlike GIN posting list
tuples, B-Tree posting list tuples do not need to expand every time
a new duplicate is inserted; they are merely an alternative
physical representation of the original logical contents of the
leaf page. This design prioritizes consistent performance with
mixed read-write workloads. Most client applications will at least
see a moderate performance benefit from using deduplication.
Deduplication is enabled by default.
</para>
<para>
Write-heavy workloads that don't benefit from deduplication due to
having few or no duplicate values in indexes will incur a small,
fixed performance penalty (unless deduplication is explicitly
disabled). The <literal>deduplicate_items</literal> storage
parameter can be used to disable deduplication within individual
indexes. There is never any performance penalty with read-only
workloads, since reading posting list tuples is at least as
efficient as reading the standard tuple representation. Disabling
deduplication isn't usually helpful.
</para>
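<para>
A minimal sketch (hypothetical index name) of disabling deduplication
for a single existing index, without rebuilding it:
<programlisting>
ALTER INDEX measurements_ts_idx SET (deduplicate_items = off);
</programlisting>
</para>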
<para>
B-Tree indexes are not directly aware that under MVCC, there might
be multiple extant versions of the same logical table row; to an
index, each tuple is an independent object that needs its own index
entry. Thus, an update of a row always creates all-new index
entries for the row, even if the key values did not change. Some
workloads suffer from index bloat caused by these
implementation-level version duplicates (this is typically a
problem for <command>UPDATE</command>-heavy workloads that cannot
apply the <acronym>HOT</acronym> optimization due to modifying at
least one indexed column). B-Tree deduplication does not
distinguish between these implementation-level version duplicates
and conventional duplicates. Deduplication can nevertheless help
with controlling index bloat caused by implementation-level version
churn.
</para>
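<para>
A hedged example of such version churn (table, column, and index names
are made up): the indexed key <literal>account_id</literal> does not
change here, but because another indexed column is modified the
<acronym>HOT</acronym> optimization cannot be applied, so a new entry
for the same <literal>account_id</literal> value is still inserted into
the unique index:
<programlisting>
-- accounts has a unique index on account_id and a second index on status
UPDATE accounts SET status = 'overdrawn' WHERE account_id = 7;
</programlisting>
</para>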
<tip>
<para>
A special heuristic is applied to determine whether a
deduplication pass in a unique index should take place. It can
often skip straight to splitting a leaf page, avoiding a
performance penalty from wasting cycles on unhelpful deduplication
passes. If you're concerned about the overhead of deduplication,
consider setting <literal>deduplicate_items = off</literal>
selectively. Leaving deduplication enabled in unique indexes has
little downside.
</para>
</tip>
<para>
Deduplication cannot be used in all cases due to
implementation-level restrictions. Deduplication safety is
determined when <command>CREATE INDEX</command> or
<command>REINDEX</command> run.
</para>
<para>
Note that deduplication is deemed unsafe and cannot be used in the
following cases involving semantically significant differences
among equal datums:
</para>
<para>
<itemizedlist>
<listitem>
<para>
<type>text</type>, <type>varchar</type>, and <type>char</type>
cannot use deduplication when a
<emphasis>nondeterministic</emphasis> collation is used. Case
and accent differences must be preserved among equal datums.
</para>
</listitem>
<listitem>
<para>
<type>numeric</type> cannot use deduplication. Numeric display
scale must be preserved among equal datums.
</para>
</listitem>
<listitem>
<para>
<type>jsonb</type> cannot use deduplication, since the
<type>jsonb</type> B-Tree operator class uses
<type>numeric</type> internally.
</para>
</listitem>
<listitem>
<para>
<type>float4</type> and <type>float8</type> cannot use
deduplication. These types have distinct representations for
<literal>-0</literal> and <literal>0</literal>, which are
nevertheless considered equal. This difference must be
preserved.
</para>
</listitem>
</itemizedlist>
</para>
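<para>
For instance (a hedged sketch; the collation, table, and column names
are illustrative), a <type>text</type> index that uses a
nondeterministic collation falls under the first case above and cannot
use deduplication:
<programlisting>
CREATE COLLATION case_insensitive
    (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
CREATE INDEX users_nick_ci_idx ON users (nick COLLATE case_insensitive);
</programlisting>
</para>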
<para>
There is one further implementation-level restriction that may be
lifted in a future version of
<productname>PostgreSQL</productname>:
</para>
<para>
<itemizedlist>
<listitem>
<para>
Container types (such as composite types, arrays, or range
types) cannot use deduplication.
</para>
</listitem>
</itemizedlist>
</para>
<para>
There is one further implementation-level restriction that applies
regardless of the operator class or collation used:
</para>
<para>
<itemizedlist>
<listitem>
<para>
<literal>INCLUDE</literal> indexes can never use deduplication.
</para>
</listitem>
</itemizedlist>
</para>
</sect2>
</sect1>
</chapter>

doc/src/sgml/charset.sgml

@ -928,10 +928,11 @@ CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-tr
nondeterministic collations give a more <quote>correct</quote> behavior,
especially when considering the full power of Unicode and its many
special cases, they also have some drawbacks. Foremost, their use leads
to a performance penalty. Also, certain operations are not possible with
nondeterministic collations, such as pattern matching operations.
Therefore, they should be used only in cases where they are specifically
wanted.
to a performance penalty. Note, in particular, that B-tree cannot use
deduplication with indexes that use a nondeterministic collation. Also,
certain operations are not possible with nondeterministic collations,
such as pattern matching operations. Therefore, they should be used
only in cases where they are specifically wanted.
</para>
</sect3>
</sect2>

doc/src/sgml/citext.sgml

@ -233,9 +233,10 @@ SELECT * FROM users WHERE nick = 'Larry';
<para>
<type>citext</type> is not as efficient as <type>text</type> because the
operator functions and the B-tree comparison functions must make copies
of the data and convert it to lower case for comparisons. It is,
however, slightly more efficient than using <function>lower</function> to get
case-insensitive matching.
of the data and convert it to lower case for comparisons. Also, only
<type>text</type> can support B-Tree deduplication. However,
<type>citext</type> is slightly more efficient than using
<function>lower</function> to get case-insensitive matching.
</para>
</listitem>

doc/src/sgml/func.sgml

@ -16561,10 +16561,11 @@ AND
rows. Two rows might have a different binary representation even
though comparisons of the two rows with the equality operator is true.
The ordering of rows under these comparison operators is deterministic
but not otherwise meaningful. These operators are used internally for
materialized views and might be useful for other specialized purposes
such as replication but are not intended to be generally useful for
writing queries.
but not otherwise meaningful. These operators are used internally
for materialized views and might be useful for other specialized
purposes such as replication and B-Tree deduplication (see <xref
linkend="btree-deduplication"/>). They are not intended to be
generally useful for writing queries, though.
</para>
</sect2>
</sect1>

doc/src/sgml/ref/create_index.sgml

@ -171,6 +171,8 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] <replaceable class=
maximum size allowed for the index type, data insertion will fail.
In any case, non-key columns duplicate data from the index's table
and bloat the size of the index, thus potentially slowing searches.
Furthermore, B-tree deduplication is never used with indexes
that have a non-key column.
</para>
<para>
@ -393,10 +395,39 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] <replaceable class=
</variablelist>
<para>
B-tree indexes additionally accept this parameter:
B-tree indexes also accept these parameters:
</para>
<variablelist>
<varlistentry id="index-reloption-deduplication" xreflabel="deduplicate_items">
<term><literal>deduplicate_items</literal>
<indexterm>
<primary><varname>deduplicate_items</varname></primary>
<secondary>storage parameter</secondary>
</indexterm>
</term>
<listitem>
<para>
Controls usage of the B-tree deduplication technique described
in <xref linkend="btree-deduplication"/>. Set to
<literal>ON</literal> or <literal>OFF</literal> to enable or
disable the optimization. (Alternative spellings of
<literal>ON</literal> and <literal>OFF</literal> are allowed as
described in <xref linkend="config-setting"/>.) The default is
<literal>ON</literal>.
</para>
<note>
<para>
Turning <literal>deduplicate_items</literal> off via
<command>ALTER INDEX</command> prevents future insertions from
triggering deduplication, but does not in itself make existing
posting list tuples use the standard tuple representation.
</para>
</note>
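<para>
As a sketch (reusing the hypothetical index name from the Examples
section), the parameter can later be restored to its default setting:
<programlisting>
ALTER INDEX title_idx RESET (deduplicate_items);
</programlisting>
</para>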
</listitem>
</varlistentry>
<varlistentry id="index-reloption-vacuum-cleanup-index-scale-factor" xreflabel="vacuum_cleanup_index_scale_factor">
<term><literal>vacuum_cleanup_index_scale_factor</literal>
<indexterm>
@ -451,9 +482,7 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] <replaceable class=
This setting controls usage of the fast update technique described in
<xref linkend="gin-fast-update"/>. It is a Boolean parameter:
<literal>ON</literal> enables fast update, <literal>OFF</literal> disables it.
(Alternative spellings of <literal>ON</literal> and <literal>OFF</literal> are
allowed as described in <xref linkend="config-setting"/>.) The
default is <literal>ON</literal>.
The default is <literal>ON</literal>.
</para>
<note>
@ -805,6 +834,13 @@ CREATE UNIQUE INDEX title_idx ON films (title) INCLUDE (director, rating);
</programlisting>
</para>
<para>
To create a B-Tree index with deduplication disabled:
<programlisting>
CREATE INDEX title_idx ON films (title) WITH (deduplicate_items = off);
</programlisting>
</para>
<para>
To create an index on the expression <literal>lower(title)</literal>,
allowing efficient case-insensitive searches:

src/backend/access/common/reloptions.c

@ -158,6 +158,16 @@ static relopt_bool boolRelOpts[] =
},
true
},
{
{
"deduplicate_items",
"Enables \"deduplicate items\" feature for this btree index",
RELOPT_KIND_BTREE,
ShareUpdateExclusiveLock /* since it applies only to later
* inserts */
},
true
},
/* list terminator */
{{NULL}}
};

src/backend/access/index/genam.c

@ -276,6 +276,10 @@ BuildIndexValueDescription(Relation indexRelation,
/*
* Get the latestRemovedXid from the table entries pointed at by the index
* tuples being deleted.
*
* Note: index access methods that don't consistently use the standard
* IndexTuple + heap TID item pointer representation will need to provide
* their own version of this function.
*/
TransactionId
index_compute_xid_horizon_for_tuples(Relation irel,

src/backend/access/nbtree/Makefile

@ -14,6 +14,7 @@ include $(top_builddir)/src/Makefile.global
OBJS = \
nbtcompare.o \
nbtdedup.o \
nbtinsert.o \
nbtpage.o \
nbtree.o \

src/backend/access/nbtree/README

@ -432,7 +432,10 @@ because we allow LP_DEAD to be set with only a share lock (it's exactly
like a hint bit for a heap tuple), but physically removing tuples requires
exclusive lock. In the current code we try to remove LP_DEAD tuples when
we are otherwise faced with having to split a page to do an insertion (and
hence have exclusive lock on it already).
hence have exclusive lock on it already). Deduplication can also prevent
a page split, but removing LP_DEAD tuples is the preferred approach.
(Note that posting list tuples can only have their LP_DEAD bit set when
every table TID within the posting list is known dead.)
This leaves the index in a state where it has no entry for a dead tuple
that still exists in the heap. This is not a problem for the current
@ -726,6 +729,134 @@ if it must. When a page that's already full of duplicates must be split,
the fallback strategy assumes that duplicates are mostly inserted in
ascending heap TID order. The page is split in a way that leaves the left
half of the page mostly full, and the right half of the page mostly empty.
The overall effect is that leaf page splits gracefully adapt to inserts of
large groups of duplicates, maximizing space utilization. Note also that
"trapping" large groups of duplicates on the same leaf page like this makes
deduplication more efficient. Deduplication can be performed infrequently,
without merging together existing posting list tuples too often.
Notes about deduplication
-------------------------
We deduplicate non-pivot tuples in non-unique indexes to reduce storage
overhead, and to avoid (or at least delay) page splits. Note that the
goals for deduplication in unique indexes are rather different; see later
section for details. Deduplication alters the physical representation of
tuples without changing the logical contents of the index, and without
adding overhead to read queries. Non-pivot tuples are merged together
into a single physical tuple with a posting list (a simple array of heap
TIDs with the standard item pointer format). Deduplication is always
applied lazily, at the point where it would otherwise be necessary to
perform a page split. It occurs only when LP_DEAD items have been
removed, as our last line of defense against splitting a leaf page. We
can set the LP_DEAD bit with posting list tuples, though only when all
TIDs are known dead.
Our lazy approach to deduplication allows the page space accounting used
during page splits to have absolutely minimal special case logic for
posting lists. Posting lists can be thought of as extra payload that
suffix truncation will reliably truncate away as needed during page
splits, just like non-key columns from an INCLUDE index tuple.
Incoming/new tuples can generally be treated as non-overlapping plain
items (though see section on posting list splits for information about how
overlapping new/incoming items are really handled).
The representation of posting lists is almost identical to the posting
lists used by GIN, so it would be straightforward to apply GIN's varbyte
encoding compression scheme to individual posting lists. Posting list
compression would break the assumptions made by posting list splits about
page space accounting (see later section), so it's not clear how
compression could be integrated with nbtree. Besides, posting list
compression does not offer a compelling trade-off for nbtree, since in
general nbtree is optimized for consistent performance with many
concurrent readers and writers.
A major goal of our lazy approach to deduplication is to limit the
performance impact of deduplication with random updates. Even concurrent
append-only inserts of the same key value will tend to have inserts of
individual index tuples in an order that doesn't quite match heap TID
order. Delaying deduplication minimizes page level fragmentation.
Deduplication in unique indexes
-------------------------------
Very often, the range of values that can be placed on a given leaf page in
a unique index is fixed and permanent. For example, a primary key on an
identity column will usually only have page splits caused by the insertion
of new logical rows within the rightmost leaf page. If there is a split
of a non-rightmost leaf page, then the split must have been triggered by
inserts associated with an UPDATE of an existing logical row. Splitting a
leaf page purely to store multiple versions should be considered
pathological, since it permanently degrades the index structure in order
to absorb a temporary burst of duplicates. Deduplication in unique
indexes helps to prevent these pathological page splits. Storing
duplicates in a space efficient manner is not the goal, since in the long
run there won't be any duplicates anyway. Rather, we're buying time for
standard garbage collection mechanisms to run before a page split is
needed.
Unique index leaf pages only get a deduplication pass when an insertion
(that might have to split the page) observed an existing duplicate on the
page in passing. This is based on the assumption that deduplication will
only work out when _all_ new insertions are duplicates from UPDATEs. This
may mean that we miss an opportunity to delay a page split, but that's
okay because our ultimate goal is to delay leaf page splits _indefinitely_
(i.e. to prevent them altogether). There is little point in trying to
delay a split that is probably inevitable anyway. This allows us to avoid
the overhead of attempting to deduplicate with unique indexes that always
have few or no duplicates.
Posting list splits
-------------------
When the incoming tuple happens to overlap with an existing posting list,
a posting list split is performed. Like a page split, a posting list
split resolves a situation where a new/incoming item "won't fit", while
inserting the incoming item in passing (i.e. as part of the same atomic
action). It's possible (though not particularly likely) that an insert of
a new item on to an almost-full page will overlap with a posting list,
resulting in both a posting list split and a page split. Even then, the
atomic action that splits the posting list also inserts the new item
(since page splits always insert the new item in passing). Including the
posting list split in the same atomic action as the insert avoids problems
caused by concurrent inserts into the same posting list -- the exact
details of how we change the posting list depend upon the new item, and
vice-versa. A single atomic action also minimizes the volume of extra
WAL required for a posting list split, since we don't have to explicitly
WAL-log the original posting list tuple.
Despite piggy-backing on the same atomic action that inserts a new tuple,
posting list splits can be thought of as a separate, extra action to the
insert itself (or to the page split itself). Posting list splits
conceptually "rewrite" an insert that overlaps with an existing posting
list into an insert that adds its final new item just to the right of the
posting list instead. The size of the posting list won't change, and so
page space accounting code does not need to care about posting list splits
at all. This is an important upside of our design; the page split point
choice logic is very subtle even without it needing to deal with posting
list splits.
Only a few isolated extra steps are required to preserve the illusion that
the new item never overlapped with an existing posting list in the first
place: the heap TID of the incoming tuple is swapped with the rightmost/max
heap TID from the existing/originally overlapping posting list. Also, the
posting-split-with-page-split case must generate a new high key based on
an imaginary version of the original page that has both the final new item
and the after-list-split posting tuple (page splits usually just operate
against an imaginary version that contains the new item/item that won't
fit).
This approach avoids inventing an "eager" atomic posting split operation
that splits the posting list without simultaneously finishing the insert
of the incoming item. This alternative design might seem cleaner, but it
creates subtle problems for page space accounting. In general, there
might not be enough free space on the page to split a posting list such
that the incoming/new item no longer overlaps with either posting list
half --- the operation could fail before the actual retail insert of the
new item even begins. We'd end up having to handle posting list splits
that need a page split anyway. Besides, supporting variable "split points"
while splitting posting lists won't actually improve overall space
utilization.
Notes About Data Representation
-------------------------------

src/backend/access/nbtree/nbtdedup.c

@ -0,0 +1,830 @@
/*-------------------------------------------------------------------------
*
* nbtdedup.c
* Deduplicate items in Postgres btrees.
*
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/access/nbtree/nbtdedup.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "miscadmin.h"
#include "utils/rel.h"
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
OffsetNumber minoff, IndexTuple newitem);
static void _bt_singleval_fillfactor(Page page, BTDedupState state,
Size newitemsz);
#ifdef USE_ASSERT_CHECKING
static bool _bt_posting_valid(IndexTuple posting);
#endif
/*
* Deduplicate items on a leaf page. The page will have to be split by caller
* if we cannot successfully free at least newitemsz (we also need space for
* newitem's line pointer, which isn't included in caller's newitemsz).
*
* The general approach taken here is to perform as much deduplication as
* possible to free as much space as possible. Note, however, that "single
* value" strategy is sometimes used for !checkingunique callers, in which
* case deduplication will leave a few tuples untouched at the end of the
* page. The general idea is to prepare the page for an anticipated page
* split that uses nbtsplitloc.c's "single value" strategy to determine a
* split point. (There is no reason to deduplicate items that will end up on
* the right half of the page after the anticipated page split; better to
* handle those if and when the anticipated right half page gets its own
* deduplication pass, following further inserts of duplicates.)
*
* This function should be called during insertion, when the page doesn't have
* enough space to fit an incoming newitem. If the BTP_HAS_GARBAGE page flag
* was set, caller should have removed any LP_DEAD items by calling
* _bt_vacuum_one_page() before calling here. We may still have to kill
* LP_DEAD items here when the page's BTP_HAS_GARBAGE hint is falsely unset,
* but that should be rare. Also, _bt_vacuum_one_page() won't unset the
* BTP_HAS_GARBAGE flag when it finds no LP_DEAD items, so a successful
* deduplication pass will always clear it, just to keep things tidy.
*/
void
_bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
IndexTuple newitem, Size newitemsz, bool checkingunique)
{
OffsetNumber offnum,
minoff,
maxoff;
Page page = BufferGetPage(buf);
BTPageOpaque opaque;
Page newpage;
int newpagendataitems = 0;
OffsetNumber deletable[MaxIndexTuplesPerPage];
BTDedupState state;
int ndeletable = 0;
Size pagesaving = 0;
bool singlevalstrat = false;
int natts = IndexRelationGetNumberOfAttributes(rel);
/*
* We can't assume that there are no LP_DEAD items. For one thing, VACUUM
* will clear the BTP_HAS_GARBAGE hint without reliably removing items
* that are marked LP_DEAD. We don't want to unnecessarily unset LP_DEAD
* bits when deduplicating items. Allowing it would be correct, though
* wasteful.
*/
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
for (offnum = minoff;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
ItemId itemid = PageGetItemId(page, offnum);
if (ItemIdIsDead(itemid))
deletable[ndeletable++] = offnum;
}
if (ndeletable > 0)
{
_bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel);
/*
* Return when a split will be avoided. This is equivalent to
* avoiding a split using the usual _bt_vacuum_one_page() path.
*/
if (PageGetFreeSpace(page) >= newitemsz)
return;
/*
* Reconsider number of items on page, in case _bt_delitems_delete()
* managed to delete an item or two
*/
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
}
/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
newitemsz += sizeof(ItemIdData);
/*
* By here, it's clear that deduplication will definitely be attempted.
* Initialize deduplication state.
*
* It would be possible for maxpostingsize (limit on posting list tuple
* size) to be set to one third of the page. However, it seems like a
* good idea to limit the size of posting lists to one sixth of a page.
* That ought to leave us with a good split point when pages full of
* duplicates can be split several times.
*/
state = (BTDedupState) palloc(sizeof(BTDedupStateData));
state->deduplicate = true;
state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
/* Metadata about base tuple of current pending posting list */
state->base = NULL;
state->baseoff = InvalidOffsetNumber;
state->basetupsize = 0;
/* Metadata about current pending posting list TIDs */
state->htids = palloc(state->maxpostingsize);
state->nhtids = 0;
state->nitems = 0;
/* Size of all physical tuples to be replaced by pending posting list */
state->phystupsize = 0;
/* nintervals should be initialized to zero */
state->nintervals = 0;
/* Determine if "single value" strategy should be used */
if (!checkingunique)
singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
/*
* Deduplicate items from page, and write them to newpage.
*
* Copy the original page's LSN into newpage copy. This will become the
* updated version of the page. We need this because XLogInsert will
* examine the LSN and possibly dump it in a page image.
*/
newpage = PageGetTempPageCopySpecial(page);
PageSetLSN(newpage, PageGetLSN(page));
/* Copy high key, if any */
if (!P_RIGHTMOST(opaque))
{
ItemId hitemid = PageGetItemId(page, P_HIKEY);
Size hitemsz = ItemIdGetLength(hitemid);
IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
false, false) == InvalidOffsetNumber)
elog(ERROR, "deduplication failed to add highkey");
}
for (offnum = minoff;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
ItemId itemid = PageGetItemId(page, offnum);
IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
Assert(!ItemIdIsDead(itemid));
if (offnum == minoff)
{
/*
* No previous/base tuple for the data item -- use the data item
* as base tuple of pending posting list
*/
_bt_dedup_start_pending(state, itup, offnum);
}
else if (state->deduplicate &&
_bt_keep_natts_fast(rel, state->base, itup) > natts &&
_bt_dedup_save_htid(state, itup))
{
/*
* Tuple is equal to base tuple of pending posting list. Heap
* TID(s) for itup have been saved in state.
*/
}
else
{
/*
* Tuple is not equal to pending posting list tuple, or
* _bt_dedup_save_htid() opted to not merge current item into
* pending posting list for some other reason (e.g., adding more
* TIDs would have caused posting list to exceed current
* maxpostingsize).
*
* If state contains pending posting list with more than one item,
* form new posting tuple, and actually update the page. Else
* reset the state and move on without modifying the page.
*/
pagesaving += _bt_dedup_finish_pending(newpage, state);
newpagendataitems++;
if (singlevalstrat)
{
/*
* Single value strategy's extra steps.
*
* Lower maxpostingsize for sixth and final item that might be
* deduplicated by current deduplication pass. When sixth
* item formed/observed, stop deduplicating items.
*
* Note: It's possible that this will be reached even when
* current deduplication pass has yet to merge together some
* existing items. It doesn't matter whether or not the
* current call generated the maxpostingsize-capped duplicate
* tuples at the start of the page.
*/
if (newpagendataitems == 5)
_bt_singleval_fillfactor(page, state, newitemsz);
else if (newpagendataitems == 6)
{
state->deduplicate = false;
singlevalstrat = false; /* won't be back here */
}
}
/* itup starts new pending posting list */
_bt_dedup_start_pending(state, itup, offnum);
}
}
/* Handle the last item */
pagesaving += _bt_dedup_finish_pending(newpage, state);
newpagendataitems++;
/*
* If no items suitable for deduplication were found, newpage must be
* exactly the same as the original page, so just return from function.
*
* We could determine whether or not to proceed on the basis the space
* savings being sufficient to avoid an immediate page split instead. We
* don't do that because there is some small value in nbtsplitloc.c always
* operating against a page that is fully deduplicated (apart from
* newitem). Besides, most of the cost has already been paid.
*/
if (state->nintervals == 0)
{
/* cannot leak memory here */
pfree(newpage);
pfree(state->htids);
pfree(state);
return;
}
/*
* By here, it's clear that deduplication will definitely go ahead.
*
* Clear the BTP_HAS_GARBAGE page flag in the unlikely event that it is
* still falsely set, just to keep things tidy. (We can't rely on
* _bt_vacuum_one_page() having done this already, and we can't rely on a
* page split or VACUUM getting to it in the near future.)
*/
if (P_HAS_GARBAGE(opaque))
{
BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
}
START_CRIT_SECTION();
PageRestoreTempPage(newpage, page);
MarkBufferDirty(buf);
/* XLOG stuff */
if (RelationNeedsWAL(rel))
{
XLogRecPtr recptr;
xl_btree_dedup xlrec_dedup;
xlrec_dedup.nintervals = state->nintervals;
XLogBeginInsert();
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
/*
* The intervals array is not in the buffer, but pretend that it is.
* When XLogInsert stores the whole buffer, the array need not be
* stored too.
*/
XLogRegisterBufData(0, (char *) state->intervals,
state->nintervals * sizeof(BTDedupInterval));
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
/* Local space accounting should agree with page accounting */
Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
/* cannot leak memory here */
pfree(state->htids);
pfree(state);
}
/*
* Create a new pending posting list tuple based on caller's base tuple.
*
* Every tuple processed by deduplication either becomes the base tuple for a
* posting list, or gets its heap TID(s) accepted into a pending posting list.
* A tuple that starts out as the base tuple for a posting list will only
* actually be rewritten within _bt_dedup_finish_pending() when it turns out
* that there are duplicates that can be merged into the base tuple.
*/
void
_bt_dedup_start_pending(BTDedupState state, IndexTuple base,
OffsetNumber baseoff)
{
Assert(state->nhtids == 0);
Assert(state->nitems == 0);
Assert(!BTreeTupleIsPivot(base));
/*
* Copy heap TID(s) from new base tuple for new candidate posting list
* into working state's array
*/
if (!BTreeTupleIsPosting(base))
{
memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
state->nhtids = 1;
state->basetupsize = IndexTupleSize(base);
}
else
{
int nposting;
nposting = BTreeTupleGetNPosting(base);
memcpy(state->htids, BTreeTupleGetPosting(base),
sizeof(ItemPointerData) * nposting);
state->nhtids = nposting;
/* basetupsize should not include existing posting list */
state->basetupsize = BTreeTupleGetPostingOffset(base);
}
/*
* Save new base tuple itself -- it'll be needed if we actually create a
* new posting list from new pending posting list.
*
* Must maintain physical size of all existing tuples (including line
* pointer overhead) so that we can calculate space savings on page.
*/
state->nitems = 1;
state->base = base;
state->baseoff = baseoff;
state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
/* Also save baseoff in pending state for interval */
state->intervals[state->nintervals].baseoff = state->baseoff;
}
/*
* Save itup heap TID(s) into pending posting list where possible.
*
* Returns bool indicating if the pending posting list managed by state now
* includes itup's heap TID(s).
*/
bool
_bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
{
int nhtids;
ItemPointer htids;
Size mergedtupsz;
Assert(!BTreeTupleIsPivot(itup));
if (!BTreeTupleIsPosting(itup))
{
nhtids = 1;
htids = &itup->t_tid;
}
else
{
nhtids = BTreeTupleGetNPosting(itup);
htids = BTreeTupleGetPosting(itup);
}
/*
* Don't append (have caller finish pending posting list as-is) if
* appending heap TID(s) from itup would put us over maxpostingsize limit.
*
* This calculation needs to match the code used within _bt_form_posting()
* for new posting list tuples.
*/
mergedtupsz = MAXALIGN(state->basetupsize +
(state->nhtids + nhtids) * sizeof(ItemPointerData));
if (mergedtupsz > state->maxpostingsize)
return false;
/*
* Save heap TIDs to pending posting list tuple -- itup can be merged into
* pending posting list
*/
state->nitems++;
memcpy(state->htids + state->nhtids, htids,
sizeof(ItemPointerData) * nhtids);
state->nhtids += nhtids;
state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
return true;
}
/*
* Finalize pending posting list tuple, and add it to the page. Final tuple
* is based on saved base tuple, and saved list of heap TIDs.
*
* Returns space saving from deduplicating to make a new posting list tuple.
* Note that this includes line pointer overhead. This is zero in the case
* where no deduplication was possible.
*/
Size
_bt_dedup_finish_pending(Page newpage, BTDedupState state)
{
OffsetNumber tupoff;
Size tuplesz;
Size spacesaving;
Assert(state->nitems > 0);
Assert(state->nitems <= state->nhtids);
Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
if (state->nitems == 1)
{
/* Use original, unchanged base tuple */
tuplesz = IndexTupleSize(state->base);
if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
false, false) == InvalidOffsetNumber)
elog(ERROR, "deduplication failed to add tuple to page");
spacesaving = 0;
}
else
{
IndexTuple final;
/* Form a tuple with a posting list */
final = _bt_form_posting(state->base, state->htids, state->nhtids);
tuplesz = IndexTupleSize(final);
Assert(tuplesz <= state->maxpostingsize);
/* Save final number of items for posting list */
state->intervals[state->nintervals].nitems = state->nitems;
Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
false) == InvalidOffsetNumber)
elog(ERROR, "deduplication failed to add tuple to page");
pfree(final);
spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
/* Increment nintervals, since we wrote a new posting list tuple */
state->nintervals++;
Assert(spacesaving > 0 && spacesaving < BLCKSZ);
}
/* Reset state for next pending posting list */
state->nhtids = 0;
state->nitems = 0;
state->phystupsize = 0;
return spacesaving;
}
/*
* Determine if page non-pivot tuples (data items) are all duplicates of the
* same value -- if they are, deduplication's "single value" strategy should
* be applied. The general goal of this strategy is to ensure that
* nbtsplitloc.c (which uses its own single value strategy) will find a useful
* split point as further duplicates are inserted, and successive rightmost
* page splits occur among pages that store the same duplicate value. When
* the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
* just like it would if deduplication were disabled.
*
* We expect that affected workloads will require _several_ single value
* strategy deduplication passes (over a page that only stores duplicates)
* before the page is finally split. The first deduplication pass should only
* find regular non-pivot tuples. Later deduplication passes will find
* existing maxpostingsize-capped posting list tuples, which must be skipped
* over. The penultimate pass is generally the first pass that actually
* reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
* few untouched non-pivot tuples. The final deduplication pass won't free
* any space -- it will skip over everything without merging anything (it
* retraces the steps of the penultimate pass).
*
* Fortunately, having several passes isn't too expensive. Each pass (after
* the first pass) won't spend many cycles on the large posting list tuples
* left by previous passes. Each pass will find a large contiguous group of
* smaller duplicate tuples to merge together at the end of the page.
*
* Note: We deliberately don't bother checking if the high key is a distinct
* value (prior to the TID tiebreaker column) before proceeding, unlike
* nbtsplitloc.c. Its single value strategy only gets applied on the
* rightmost page of duplicates of the same value (other leaf pages full of
* duplicates will get a simple 50:50 page split instead of splitting towards
* the end of the page). There is little point in making the same distinction
* here.
*/
static bool
_bt_do_singleval(Relation rel, Page page, BTDedupState state,
OffsetNumber minoff, IndexTuple newitem)
{
int natts = IndexRelationGetNumberOfAttributes(rel);
ItemId itemid;
IndexTuple itup;
itemid = PageGetItemId(page, minoff);
itup = (IndexTuple) PageGetItem(page, itemid);
if (_bt_keep_natts_fast(rel, newitem, itup) > natts)
{
itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
itup = (IndexTuple) PageGetItem(page, itemid);
if (_bt_keep_natts_fast(rel, newitem, itup) > natts)
return true;
}
return false;
}
/*
* Lower maxpostingsize when using "single value" strategy, to avoid a sixth
* and final maxpostingsize-capped tuple. The sixth and final posting list
* tuple will end up somewhat smaller than the first five. (Note: The first
* five tuples could actually just be very large duplicate tuples that
* couldn't be merged together at all. Deduplication will simply not modify
* the page when that happens.)
*
* When there are six posting lists on the page (after current deduplication
* pass goes on to create/observe a sixth very large tuple), caller should end
* its deduplication pass. It isn't useful to try to deduplicate items that
* are supposed to end up on the new right sibling page following the
* anticipated page split. A future deduplication pass of future right
* sibling page might take care of it. (This is why the first single value
* strategy deduplication pass for a given leaf page will generally find only
* plain non-pivot tuples -- see _bt_do_singleval() comments.)
*/
static void
_bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
{
Size leftfree;
int reduction;
/* This calculation needs to match nbtsplitloc.c */
leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
MAXALIGN(sizeof(BTPageOpaqueData));
/* Subtract size of new high key (includes pivot heap TID space) */
leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
/*
* Reduce maxpostingsize by an amount equal to target free space on left
* half of page
*/
reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
if (state->maxpostingsize > reduction)
state->maxpostingsize -= reduction;
else
state->maxpostingsize = 0;
}
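/*
 * Worked example of the reduction above, assuming an 8KB page, a 24 byte
 * page header, a 16 byte (MAXALIGN'd) special area, and a
 * BTREE_SINGLEVAL_FILLFACTOR of 96: with a 64 byte newitemsz, leftfree is
 * 8192 - 24 - 16 - (64 + 8) = 8080, so reduction is 8080 * 0.04 = 323 bytes
 * (truncated to int).  The sixth posting list tuple formed by the current
 * pass therefore ends up roughly 4% smaller than the first five.
 */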
/*
* Build a posting list tuple based on caller's "base" index tuple and list of
* heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a
* posting list. (Posting list tuples can never have a single heap TID, partly
* because that ensures that deduplication always reduces final MAXALIGN()'d
* size of entire tuple.)
*
* Convention is that posting list starts at a MAXALIGN()'d offset (rather
* than a SHORTALIGN()'d offset), in line with the approach taken when
* appending a heap TID to new pivot tuple/high key during suffix truncation.
* This sometimes wastes a little space that was only needed as alignment
* padding in the original tuple. Following this convention simplifies the
* space accounting used when deduplicating a page (the same convention
* simplifies the accounting for choosing a point to split a page at).
*
* Note: Caller's "htids" array must be unique and already in ascending TID
* order. Any existing heap TIDs from "base" won't automatically appear in
* returned posting list tuple (they must be included in htids array.)
*/
IndexTuple
_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
{
uint32 keysize,
newsize;
IndexTuple itup;
if (BTreeTupleIsPosting(base))
keysize = BTreeTupleGetPostingOffset(base);
else
keysize = IndexTupleSize(base);
Assert(!BTreeTupleIsPivot(base));
Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
Assert(keysize == MAXALIGN(keysize));
/* Determine final size of new tuple */
if (nhtids > 1)
newsize = MAXALIGN(keysize +
nhtids * sizeof(ItemPointerData));
else
newsize = keysize;
Assert(newsize <= INDEX_SIZE_MASK);
Assert(newsize == MAXALIGN(newsize));
/* Allocate memory using palloc0() (matches index_form_tuple()) */
itup = palloc0(newsize);
memcpy(itup, base, keysize);
itup->t_info &= ~INDEX_SIZE_MASK;
itup->t_info |= newsize;
if (nhtids > 1)
{
/* Form posting list tuple */
BTreeTupleSetPosting(itup, nhtids, keysize);
memcpy(BTreeTupleGetPosting(itup), htids,
sizeof(ItemPointerData) * nhtids);
Assert(_bt_posting_valid(itup));
}
else
{
/* Form standard non-pivot tuple */
itup->t_info &= ~INDEX_ALT_TID_MASK;
ItemPointerCopy(htids, &itup->t_tid);
Assert(ItemPointerIsValid(&itup->t_tid));
}
return itup;
}
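/*
 * Illustrative usage sketch (hypothetical TIDs; real callers pass heap TIDs
 * gathered by a deduplication pass).  Builds a posting list tuple for three
 * heap TIDs using an existing non-pivot tuple "base" as the key:
 *
 *     ItemPointerData htids[3];
 *     IndexTuple  posting;
 *
 *     ItemPointerSet(&htids[0], 10, 1);
 *     ItemPointerSet(&htids[1], 10, 7);
 *     ItemPointerSet(&htids[2], 42, 3);      TIDs must be unique, ascending
 *     posting = _bt_form_posting(base, htids, 3);
 *     Assert(BTreeTupleIsPosting(posting));
 *     Assert(BTreeTupleGetNPosting(posting) == 3);
 *     pfree(posting);
 */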
/*
* Generate a replacement tuple by "updating" a posting list tuple so that it
* no longer has TIDs that need to be deleted.
*
* Used by VACUUM. Caller's vacposting argument points to the existing
* posting list tuple to be updated.
*
* On return, caller's vacposting argument will point to final "updated"
* tuple, which will be palloc()'d in caller's memory context.
*/
void
_bt_update_posting(BTVacuumPosting vacposting)
{
IndexTuple origtuple = vacposting->itup;
uint32 keysize,
newsize;
IndexTuple itup;
int nhtids;
int ui,
d;
ItemPointer htids;
nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
Assert(_bt_posting_valid(origtuple));
Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
if (BTreeTupleIsPosting(origtuple))
keysize = BTreeTupleGetPostingOffset(origtuple);
else
keysize = IndexTupleSize(origtuple);
/*
* Determine final size of new tuple.
*
* This calculation needs to match the code used within _bt_form_posting()
* for new posting list tuples. We avoid calling _bt_form_posting() here
* to save ourselves a second memory allocation for a htids workspace.
*/
if (nhtids > 1)
newsize = MAXALIGN(keysize +
nhtids * sizeof(ItemPointerData));
else
newsize = keysize;
/* Allocate memory using palloc0() (matches index_form_tuple()) */
itup = palloc0(newsize);
memcpy(itup, origtuple, keysize);
itup->t_info &= ~INDEX_SIZE_MASK;
itup->t_info |= newsize;
if (nhtids > 1)
{
/* Form posting list tuple */
BTreeTupleSetPosting(itup, nhtids, keysize);
htids = BTreeTupleGetPosting(itup);
}
else
{
/* Form standard non-pivot tuple */
itup->t_info &= ~INDEX_ALT_TID_MASK;
htids = &itup->t_tid;
}
ui = 0;
d = 0;
for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
{
if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
{
d++;
continue;
}
htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
}
Assert(ui == nhtids);
Assert(d == vacposting->ndeletedtids);
Assert(nhtids == 1 || _bt_posting_valid(itup));
/* vacposting arg's itup will now point to updated version */
vacposting->itup = itup;
}
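/*
 * A minimal sketch of VACUUM-style usage, assuming "posting" is an existing
 * posting list tuple located at page offset "offnum", and that the TIDs at
 * zero-based positions 1 and 3 in its posting list were found to be dead:
 *
 *     BTVacuumPosting vacposting;
 *
 *     vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
 *                         2 * sizeof(uint16));
 *     vacposting->itup = posting;
 *     vacposting->updatedoffset = offnum;
 *     vacposting->ndeletedtids = 2;
 *     vacposting->deletetids[0] = 1;
 *     vacposting->deletetids[1] = 3;
 *     _bt_update_posting(vacposting);
 *
 * Afterwards vacposting->itup points to a palloc()'d replacement tuple that
 * retains only the surviving TIDs; _bt_delitems_vacuum() overwrites the
 * original tuple with it using PageIndexTupleOverwrite().
 */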
/*
* Prepare for a posting list split by swapping heap TID in newitem with heap
* TID from original posting list (the 'oposting' heap TID located at offset
* 'postingoff'). Modifies newitem, so caller should pass their own private
* copy that can safely be modified.
*
* Returns new posting list tuple, which is palloc()'d in caller's context.
* This is guaranteed to be the same size as 'oposting'. Modified newitem is
* what caller actually inserts. (This happens inside the same critical
* section that performs an in-place update of old posting list using new
* posting list returned here.)
*
* While the keys from newitem and oposting must be opclass equal, and must
* generate identical output when run through the underlying type's output
* function, it doesn't follow that their representations match exactly.
* Caller must avoid assuming that there can't be representational differences
* that make datums from oposting bigger or smaller than the corresponding
* datums from newitem. For example, differences in TOAST input state might
* break a faulty assumption about tuple size (the executor is entitled to
* apply TOAST compression based on its own criteria). It also seems possible
* that further representational variation will be introduced in the future,
* in order to support nbtree features like page-level prefix compression.
*
* See nbtree/README for details on the design of posting list splits.
*/
IndexTuple
_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
{
int nhtids;
char *replacepos;
char *replaceposright;
Size nmovebytes;
IndexTuple nposting;
nhtids = BTreeTupleGetNPosting(oposting);
Assert(_bt_posting_valid(oposting));
Assert(postingoff > 0 && postingoff < nhtids);
/*
* Move item pointers in posting list to make a gap for the new item's
* heap TID. We shift TIDs one place to the right, losing original
* rightmost TID. (nmovebytes must not include TIDs to the left of
* postingoff, nor the existing rightmost/max TID that gets overwritten.)
*/
nposting = CopyIndexTuple(oposting);
replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
memmove(replaceposright, replacepos, nmovebytes);
/* Fill the gap at postingoff with TID of new item (original new TID) */
Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
/* Now copy oposting's rightmost/max TID into new item (final new TID) */
ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
BTreeTupleGetHeapTID(newitem)) < 0);
Assert(_bt_posting_valid(nposting));
return nposting;
}
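/*
 * Worked example of the swap above (hypothetical TIDs).  Suppose oposting
 * holds heap TIDs (1,1) (1,5) (1,9), and newitem arrives with TID (1,7), so
 * postingoff is 2.  After _bt_swap_posting():
 *
 *     nposting holds (1,1) (1,5) (1,7)   -- same size as oposting
 *     newitem  holds (1,9)               -- oposting's old rightmost TID
 *
 * The caller then overwrites oposting with nposting in place and inserts
 * newitem immediately after it, preserving ascending heap TID order.
 */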
/*
* Verify posting list invariants for "posting", which must be a posting list
* tuple. Used within assertions.
*/
#ifdef USE_ASSERT_CHECKING
static bool
_bt_posting_valid(IndexTuple posting)
{
ItemPointerData last;
ItemPointer htid;
if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
return false;
/* Remember first heap TID for loop */
ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
if (!ItemPointerIsValid(&last))
return false;
/* Iterate, starting from second TID */
for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
{
htid = BTreeTupleGetPostingN(posting, i);
if (!ItemPointerIsValid(htid))
return false;
if (ItemPointerCompare(htid, &last) <= 0)
return false;
ItemPointerCopy(htid, &last);
}
return true;
}
#endif


@ -47,10 +47,12 @@ static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
int postingoff,
bool split_only_page);
static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
IndexTuple newitem);
IndexTuple newitem, IndexTuple orignewitem,
IndexTuple nposting, uint16 postingoff);
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
BTStack stack, bool is_root, bool is_only);
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
@ -125,6 +127,7 @@ _bt_doinsert(Relation rel, IndexTuple itup,
insertstate.itup_key = itup_key;
insertstate.bounds_valid = false;
insertstate.buf = InvalidBuffer;
insertstate.postingoff = 0;
/*
* It's very common to have an index on an auto-incremented or
@ -295,7 +298,7 @@ top:
newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
stack, heapRel);
_bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
itup, newitemoff, false);
itup, newitemoff, insertstate.postingoff, false);
}
else
{
@ -340,6 +343,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
uint32 *speculativeToken)
{
IndexTuple itup = insertstate->itup;
IndexTuple curitup;
ItemId curitemid;
BTScanInsert itup_key = insertstate->itup_key;
SnapshotData SnapshotDirty;
OffsetNumber offset;
@ -348,6 +353,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
BTPageOpaque opaque;
Buffer nbuf = InvalidBuffer;
bool found = false;
bool inposting = false;
bool prevalldead = true;
int curposti = 0;
/* Assume unique until we find a duplicate */
*is_unique = true;
@ -375,13 +383,21 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
Assert(itup_key->scantid == NULL);
for (;;)
{
ItemId curitemid;
IndexTuple curitup;
BlockNumber nblkno;
/*
* make sure the offset points to an actual item before trying to
* examine it...
* Each iteration of the loop processes one heap TID, not one index
* tuple. Current offset number for page isn't usually advanced on
* iterations that process heap TIDs from posting list tuples.
*
* "inposting" state is set when _inside_ a posting list --- not when
* we're at the start (or end) of a posting list. We advance curposti
* at the end of the iteration when inside a posting list tuple. In
* general, every loop iteration either advances the page offset or
* advances curposti --- an iteration that handles the rightmost/max
* heap TID in a posting list finally advances the page offset (and
* unsets "inposting").
*
* Make sure the offset points to an actual index tuple before trying
* to examine it...
*/
if (offset <= maxoff)
{
@ -406,31 +422,60 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
break;
}
curitemid = PageGetItemId(page, offset);
/*
* We can skip items that are marked killed.
* We can skip items that are already marked killed.
*
* In the presence of heavy update activity an index may contain
* many killed items with the same key; running _bt_compare() on
* each killed item gets expensive. Just advance over killed
* items as quickly as we can. We only apply _bt_compare() when
* we get to a non-killed item. Even those comparisons could be
* avoided (in the common case where there is only one page to
* visit) by reusing bounds, but just skipping dead items is fast
* enough.
* we get to a non-killed item. We could reuse the bounds to
* avoid _bt_compare() calls for known equal tuples, but it
* doesn't seem worth it. Workloads with heavy update activity
* tend to have many deduplication passes, so we'll often avoid
* most of those comparisons, too (we call _bt_compare() when the
* posting list tuple is initially encountered, though not when
* processing later TIDs from the same tuple).
*/
if (!ItemIdIsDead(curitemid))
if (!inposting)
curitemid = PageGetItemId(page, offset);
if (inposting || !ItemIdIsDead(curitemid))
{
ItemPointerData htid;
bool all_dead;
if (_bt_compare(rel, itup_key, page, offset) != 0)
break; /* we're past all the equal tuples */
if (!inposting)
{
/* Plain tuple, or first TID in posting list tuple */
if (_bt_compare(rel, itup_key, page, offset) != 0)
break; /* we're past all the equal tuples */
/* okay, we gotta fetch the heap tuple ... */
curitup = (IndexTuple) PageGetItem(page, curitemid);
htid = curitup->t_tid;
/* Advance curitup to the tuple at this offset */
curitup = (IndexTuple) PageGetItem(page, curitemid);
Assert(!BTreeTupleIsPivot(curitup));
}
/* okay, we gotta fetch the heap tuple using htid ... */
if (!BTreeTupleIsPosting(curitup))
{
/* ... htid is from simple non-pivot tuple */
Assert(!inposting);
htid = curitup->t_tid;
}
else if (!inposting)
{
/* ... htid is first TID in new posting list */
inposting = true;
prevalldead = true;
curposti = 0;
htid = *BTreeTupleGetPostingN(curitup, 0);
}
else
{
/* ... htid is second or subsequent TID in posting list */
Assert(curposti > 0);
htid = *BTreeTupleGetPostingN(curitup, curposti);
}
/*
* If we are doing a recheck, we expect to find the tuple we
@ -506,8 +551,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
* not part of this chain because it had a different index
* entry.
*/
htid = itup->t_tid;
if (table_index_fetch_tuple_check(heapRel, &htid,
if (table_index_fetch_tuple_check(heapRel, &itup->t_tid,
SnapshotSelf, NULL))
{
/* Normal case --- it's still live */
@ -565,12 +609,14 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
RelationGetRelationName(rel))));
}
}
else if (all_dead)
else if (all_dead && (!inposting ||
(prevalldead &&
curposti == BTreeTupleGetNPosting(curitup) - 1)))
{
/*
* The conflicting tuple (or whole HOT chain) is dead to
* everyone, so we may as well mark the index entry
* killed.
* The conflicting tuple (or all HOT chains pointed to by
* all posting list TIDs) is dead to everyone, so mark the
* index entry killed.
*/
ItemIdMarkDead(curitemid);
opaque->btpo_flags |= BTP_HAS_GARBAGE;
@ -584,14 +630,29 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
else
MarkBufferDirtyHint(insertstate->buf, true);
}
/*
* Remember if posting list tuple has even a single HOT chain
* whose members are not all dead
*/
if (!all_dead && inposting)
prevalldead = false;
}
}
/*
* Advance to next tuple to continue checking.
*/
if (offset < maxoff)
if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
{
/* Advance to next TID in same posting list */
curposti++;
continue;
}
else if (offset < maxoff)
{
/* Advance to next tuple */
curposti = 0;
inposting = false;
offset = OffsetNumberNext(offset);
}
else
{
int highkeycmp;
@ -606,7 +667,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
/* Advance to next non-dead page --- there must be one */
for (;;)
{
nblkno = opaque->btpo_next;
BlockNumber nblkno = opaque->btpo_next;
nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
page = BufferGetPage(nbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@ -616,6 +678,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
elog(ERROR, "fell off the end of index \"%s\"",
RelationGetRelationName(rel));
}
/* Will also advance to next tuple */
curposti = 0;
inposting = false;
maxoff = PageGetMaxOffsetNumber(page);
offset = P_FIRSTDATAKEY(opaque);
/* Don't invalidate binary search bounds */
@ -684,6 +749,7 @@ _bt_findinsertloc(Relation rel,
BTScanInsert itup_key = insertstate->itup_key;
Page page = BufferGetPage(insertstate->buf);
BTPageOpaque lpageop;
OffsetNumber newitemoff;
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
@ -696,9 +762,13 @@ _bt_findinsertloc(Relation rel,
Assert(!insertstate->bounds_valid || checkingunique);
Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
if (itup_key->heapkeyspace)
{
/* Keep track of whether checkingunique duplicate seen */
bool uniquedup = false;
/*
* If we're inserting into a unique index, we may have to walk right
* through leaf pages to find the one leaf page that we must insert on
@ -715,6 +785,13 @@ _bt_findinsertloc(Relation rel,
*/
if (checkingunique)
{
if (insertstate->low < insertstate->stricthigh)
{
/* Encountered a duplicate in _bt_check_unique() */
Assert(insertstate->bounds_valid);
uniquedup = true;
}
for (;;)
{
/*
@ -741,18 +818,43 @@ _bt_findinsertloc(Relation rel,
/* Update local state after stepping right */
page = BufferGetPage(insertstate->buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
/* Assume duplicates (if checkingunique) */
uniquedup = true;
}
}
/*
* If the target page is full, see if we can obtain enough space by
* erasing LP_DEAD items
* erasing LP_DEAD items. If that fails to free enough space, see if
* we can avoid a page split by performing a deduplication pass over
* the page.
*
* We only perform a deduplication pass for a checkingunique caller
* when the incoming item is a duplicate of an existing item on the
* leaf page. This heuristic avoids wasting cycles -- we only expect
* to benefit from deduplicating a unique index page when most or all
* recently added items are duplicates. See nbtree/README.
*/
if (PageGetFreeSpace(page) < insertstate->itemsz &&
P_HAS_GARBAGE(lpageop))
if (PageGetFreeSpace(page) < insertstate->itemsz)
{
_bt_vacuum_one_page(rel, insertstate->buf, heapRel);
insertstate->bounds_valid = false;
if (P_HAS_GARBAGE(lpageop))
{
_bt_vacuum_one_page(rel, insertstate->buf, heapRel);
insertstate->bounds_valid = false;
/* Might as well assume duplicates (if checkingunique) */
uniquedup = true;
}
if (itup_key->allequalimage && BTGetDeduplicateItems(rel) &&
(!checkingunique || uniquedup) &&
PageGetFreeSpace(page) < insertstate->itemsz)
{
_bt_dedup_one_page(rel, insertstate->buf, heapRel,
insertstate->itup, insertstate->itemsz,
checkingunique);
insertstate->bounds_valid = false;
}
}
}
else
@ -834,7 +936,30 @@ _bt_findinsertloc(Relation rel,
Assert(P_RIGHTMOST(lpageop) ||
_bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
return _bt_binsrch_insert(rel, insertstate);
newitemoff = _bt_binsrch_insert(rel, insertstate);
if (insertstate->postingoff == -1)
{
/*
* There is an overlapping posting list tuple with its LP_DEAD bit
* set. We don't want to unnecessarily unset its LP_DEAD bit while
* performing a posting list split, so delete all LP_DEAD items early.
* This is the only case where LP_DEAD deletes happen even though
* there is space for newitem on the page.
*/
_bt_vacuum_one_page(rel, insertstate->buf, heapRel);
/*
* Do new binary search. New insert location cannot overlap with any
* posting list now.
*/
insertstate->bounds_valid = false;
insertstate->postingoff = 0;
newitemoff = _bt_binsrch_insert(rel, insertstate);
Assert(insertstate->postingoff == 0);
}
return newitemoff;
}
/*
@ -900,10 +1025,12 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
*
* This recursive procedure does the following things:
*
* + if postingoff != 0, splits existing posting list tuple
* (since it overlaps with new 'itup' tuple).
* + if necessary, splits the target page, using 'itup_key' for
* suffix truncation on leaf pages (caller passes NULL for
* non-leaf pages).
* + inserts the tuple.
* + inserts the new tuple (might be split from posting list).
* + if the page was split, pops the parent stack, and finds the
* right place to insert the new child pointer (by walking
* right using information stored in the parent stack).
@ -931,11 +1058,15 @@ _bt_insertonpg(Relation rel,
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
int postingoff,
bool split_only_page)
{
Page page;
BTPageOpaque lpageop;
Size itemsz;
IndexTuple oposting;
IndexTuple origitup = NULL;
IndexTuple nposting = NULL;
page = BufferGetPage(buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
@ -949,6 +1080,7 @@ _bt_insertonpg(Relation rel,
Assert(P_ISLEAF(lpageop) ||
BTreeTupleGetNAtts(itup, rel) <=
IndexRelationGetNumberOfKeyAttributes(rel));
Assert(!BTreeTupleIsPosting(itup));
/* The caller should've finished any incomplete splits already. */
if (P_INCOMPLETE_SPLIT(lpageop))
@ -959,6 +1091,34 @@ _bt_insertonpg(Relation rel,
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
* need to be consistent */
/*
* Do we need to split an existing posting list item?
*/
if (postingoff != 0)
{
ItemId itemid = PageGetItemId(page, newitemoff);
/*
* The new tuple is a duplicate with a heap TID that falls inside the
* range of an existing posting list tuple on a leaf page. Prepare to
* split an existing posting list. Overwriting the posting list with
* its post-split version is treated as an extra step in either the
* insert or page split critical section.
*/
Assert(P_ISLEAF(lpageop) && !ItemIdIsDead(itemid));
Assert(itup_key->heapkeyspace && itup_key->allequalimage);
oposting = (IndexTuple) PageGetItem(page, itemid);
/* use a mutable copy of itup as our itup from here on */
origitup = itup;
itup = CopyIndexTuple(origitup);
nposting = _bt_swap_posting(itup, oposting, postingoff);
/* itup now contains rightmost/max TID from oposting */
/* Alter offset so that newitem goes after posting list */
newitemoff = OffsetNumberNext(newitemoff);
}
/*
* Do we need to split the page to fit the item on it?
*
@ -991,7 +1151,8 @@ _bt_insertonpg(Relation rel,
BlockNumberIsValid(RelationGetTargetBlock(rel))));
/* split the buffer into left and right halves */
rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup);
rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
origitup, nposting, postingoff);
PredicateLockPageSplit(rel,
BufferGetBlockNumber(buf),
BufferGetBlockNumber(rbuf));
@ -1066,6 +1227,9 @@ _bt_insertonpg(Relation rel,
/* Do the update. No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
if (postingoff != 0)
memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
elog(PANIC, "failed to add new item to block %u in index \"%s\"",
itup_blkno, RelationGetRelationName(rel));
@ -1115,8 +1279,19 @@ _bt_insertonpg(Relation rel,
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
if (P_ISLEAF(lpageop))
if (P_ISLEAF(lpageop) && postingoff == 0)
{
/* Simple leaf insert */
xlinfo = XLOG_BTREE_INSERT_LEAF;
}
else if (postingoff != 0)
{
/*
* Leaf insert with posting list split. Must include
* postingoff field before newitem/orignewitem.
*/
xlinfo = XLOG_BTREE_INSERT_POST;
}
else
{
/*
@ -1139,6 +1314,7 @@ _bt_insertonpg(Relation rel,
xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
xlmeta.last_cleanup_num_heap_tuples =
metad->btm_last_cleanup_num_heap_tuples;
xlmeta.allequalimage = metad->btm_allequalimage;
XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
@ -1147,7 +1323,27 @@ _bt_insertonpg(Relation rel,
}
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
if (postingoff == 0)
{
/* Simple, common case -- log itup from caller */
XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
}
else
{
/*
* Insert with posting list split (XLOG_BTREE_INSERT_POST
* record) case.
*
* Log postingoff. Also log origitup, not itup. REDO routine
* must reconstruct final itup (as well as nposting) using
* _bt_swap_posting().
*/
uint16 upostingoff = postingoff;
XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
XLogRegisterBufData(0, (char *) origitup,
IndexTupleSize(origitup));
}
recptr = XLogInsert(RM_BTREE_ID, xlinfo);
@ -1189,6 +1385,14 @@ _bt_insertonpg(Relation rel,
_bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
RelationSetTargetBlock(rel, cachedBlock);
}
/* be tidy */
if (postingoff != 0)
{
/* itup is actually a modified copy of caller's original */
pfree(nposting);
pfree(itup);
}
}
/*
@ -1204,12 +1408,24 @@ _bt_insertonpg(Relation rel,
* This function will clear the INCOMPLETE_SPLIT flag on it, and
* release the buffer.
*
* orignewitem, nposting, and postingoff are needed when an insert of
* orignewitem results in both a posting list split and a page split.
* These extra posting list split details are used here in the same
* way as they are used in the more common case where a posting list
* split does not coincide with a page split. We need to deal with
* posting list splits directly in order to ensure that everything
* that follows from the insert of orignewitem is handled as a single
* atomic operation (though caller's insert of a new pivot/downlink
* into parent page will still be a separate operation). See
* nbtree/README for details on the design of posting list splits.
*
* Returns the new right sibling of buf, pinned and write-locked.
* The pin and lock on buf are maintained.
*/
static Buffer
_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem)
OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
{
Buffer rbuf;
Page origpage;
@ -1229,6 +1445,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
OffsetNumber leftoff,
rightoff;
OffsetNumber firstright;
OffsetNumber origpagepostingoff;
OffsetNumber maxoff;
OffsetNumber i;
bool newitemonleft,
@ -1298,6 +1515,34 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
PageSetLSN(leftpage, PageGetLSN(origpage));
isleaf = P_ISLEAF(oopaque);
/*
* Determine page offset number of existing overlapped-with-orignewitem
* posting list when it is necessary to perform a posting list split in
* passing. Note that newitem was already changed by caller (newitem no
* longer has the orignewitem TID).
*
* This page offset number (origpagepostingoff) will be used to pretend
* that the posting split has already taken place, even though the
* required modifications to origpage won't occur until we reach the
* critical section. The lastleft and firstright tuples of our page split
* point should, in effect, come from an imaginary version of origpage
* that has the nposting tuple instead of the original posting list tuple.
*
* Note: _bt_findsplitloc() should have compensated for coinciding posting
* list splits in just the same way, at least in theory. It doesn't
* bother with that, though. In practice it won't affect its choice of
* split point.
*/
origpagepostingoff = InvalidOffsetNumber;
if (postingoff != 0)
{
Assert(isleaf);
Assert(ItemPointerCompare(&orignewitem->t_tid,
&newitem->t_tid) < 0);
Assert(BTreeTupleIsPosting(nposting));
origpagepostingoff = OffsetNumberPrev(newitemoff);
}
/*
* The "high key" for the new left page will be the first key that's going
* to go into the new right page, or a truncated version if this is a leaf
@ -1335,6 +1580,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
itemid = PageGetItemId(origpage, firstright);
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
if (firstright == origpagepostingoff)
item = nposting;
}
/*
@ -1368,6 +1615,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
itemid = PageGetItemId(origpage, lastleftoff);
lastleft = (IndexTuple) PageGetItem(origpage, itemid);
if (lastleftoff == origpagepostingoff)
lastleft = nposting;
}
Assert(lastleft != item);
@ -1383,6 +1632,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
*/
leftoff = P_HIKEY;
Assert(BTreeTupleIsPivot(lefthikey) || !itup_key->heapkeyspace);
Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts);
if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
@ -1447,6 +1697,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
itemid = PageGetItemId(origpage, P_HIKEY);
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
Assert(BTreeTupleIsPivot(item) || !itup_key->heapkeyspace);
Assert(BTreeTupleGetNAtts(item, rel) > 0);
Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
@ -1475,8 +1726,16 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
/* replace original item with nposting due to posting split? */
if (i == origpagepostingoff)
{
Assert(BTreeTupleIsPosting(item));
Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
item = nposting;
}
/* does new item belong before this one? */
if (i == newitemoff)
else if (i == newitemoff)
{
if (newitemonleft)
{
@ -1645,8 +1904,12 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
XLogRecPtr recptr;
xlrec.level = ropaque->btpo.level;
/* See comments below on newitem, orignewitem, and posting lists */
xlrec.firstright = firstright;
xlrec.newitemoff = newitemoff;
xlrec.postingoff = 0;
if (postingoff != 0 && origpagepostingoff < firstright)
xlrec.postingoff = postingoff;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
@ -1665,11 +1928,35 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* because it's included with all the other items on the right page.)
* Show the new item as belonging to the left page buffer, so that it
* is not stored if XLogInsert decides it needs a full-page image of
* the left page. We store the offset anyway, though, to support
* archive compression of these records.
* the left page. We always store newitemoff in the record, though.
*
* The details are sometimes slightly different for page splits that
* coincide with a posting list split. If both the replacement
* posting list and newitem go on the right page, then we don't need
* to log anything extra, just like the simple !newitemonleft
* no-posting-split case (postingoff is set to zero in the WAL record,
* so recovery doesn't need to process a posting list split at all).
* Otherwise, we set postingoff and log orignewitem instead of
* newitem, despite having actually inserted newitem. REDO routine
* must reconstruct nposting and newitem using _bt_swap_posting().
*
* Note: It's possible that our page split point is the point that
* makes the posting list lastleft and newitem firstright. This is
* the only case where we log orignewitem/newitem despite newitem
* going on the right page. If XLogInsert decides that it can omit
* orignewitem due to logging a full-page image of the left page,
* everything still works out, since recovery only needs to log
* orignewitem for items on the left page (just like the regular
* newitem-logged case).
*/
if (newitemonleft)
if (newitemonleft && xlrec.postingoff == 0)
XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
else if (xlrec.postingoff != 0)
{
Assert(newitemonleft || firstright == newitemoff);
Assert(MAXALIGN(newitemsz) == IndexTupleSize(orignewitem));
XLogRegisterBufData(0, (char *) orignewitem, MAXALIGN(newitemsz));
}
/* Log the left page's new high key */
itemid = PageGetItemId(origpage, P_HIKEY);
@ -1829,7 +2116,7 @@ _bt_insert_parent(Relation rel,
/* Recursively insert into the parent */
_bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
new_item, stack->bts_offset + 1,
new_item, stack->bts_offset + 1, 0,
is_only);
/* be tidy */
@ -2185,6 +2472,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
md.fastlevel = metad->btm_level;
md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
md.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
@ -2265,7 +2553,7 @@ _bt_pgaddtup(Page page,
static void
_bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
{
OffsetNumber deletable[MaxOffsetNumber];
OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable = 0;
OffsetNumber offnum,
minoff,
@ -2298,6 +2586,6 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
* Note: if we didn't find any LP_DEAD items, then the page's
* BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
* separate write to clear it, however. We will clear it when we split
* the page.
* the page, or when deduplication runs.
*/
}


@ -24,6 +24,7 @@
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
@ -37,6 +38,8 @@ static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
bool *rightsib_empty);
static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
OffsetNumber *deletable, int ndeletable);
static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
BTStack stack, Buffer *topparent, OffsetNumber *topoff,
BlockNumber *target, BlockNumber *rightsib);
@ -47,7 +50,8 @@ static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
* _bt_initmetapage() -- Fill a page buffer with a correct metapage image
*/
void
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
bool allequalimage)
{
BTMetaPageData *metad;
BTPageOpaque metaopaque;
@ -63,6 +67,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
metad->btm_fastlevel = level;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
metad->btm_allequalimage = allequalimage;
metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
metaopaque->btpo_flags = BTP_META;
@ -102,6 +107,9 @@ _bt_upgrademetapage(Page page)
metad->btm_version = BTREE_NOVAC_VERSION;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
/* Only a REINDEX can set this field */
Assert(!metad->btm_allequalimage);
metad->btm_allequalimage = false;
/* Adjust pd_lower (see _bt_initmetapage() for details) */
((PageHeader) page)->pd_lower =
@ -213,6 +221,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
md.fastlevel = metad->btm_fastlevel;
md.oldest_btpo_xact = oldestBtpoXact;
md.last_cleanup_num_heap_tuples = numHeapTuples;
md.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
@ -274,6 +283,8 @@ _bt_getroot(Relation rel, int access)
Assert(metad->btm_magic == BTREE_MAGIC);
Assert(metad->btm_version >= BTREE_MIN_VERSION);
Assert(metad->btm_version <= BTREE_VERSION);
Assert(!metad->btm_allequalimage ||
metad->btm_version > BTREE_NOVAC_VERSION);
Assert(metad->btm_root != P_NONE);
rootblkno = metad->btm_fastroot;
@ -394,6 +405,7 @@ _bt_getroot(Relation rel, int access)
md.fastlevel = 0;
md.oldest_btpo_xact = InvalidTransactionId;
md.last_cleanup_num_heap_tuples = -1.0;
md.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
@ -618,22 +630,34 @@ _bt_getrootheight(Relation rel)
Assert(metad->btm_magic == BTREE_MAGIC);
Assert(metad->btm_version >= BTREE_MIN_VERSION);
Assert(metad->btm_version <= BTREE_VERSION);
Assert(!metad->btm_allequalimage ||
metad->btm_version > BTREE_NOVAC_VERSION);
Assert(metad->btm_fastroot != P_NONE);
return metad->btm_fastlevel;
}
/*
* _bt_heapkeyspace() -- is heap TID being treated as a key?
* _bt_metaversion() -- Get version/status info from metapage.
*
* Sets caller's *heapkeyspace and *allequalimage arguments using data
* from the B-Tree metapage (could be locally-cached version). This
* information needs to be stashed in insertion scankey, so we provide a
* single function that fetches both at once.
*
* This is used to determine the rules that must be used to descend a
* btree. Version 4 indexes treat heap TID as a tiebreaker attribute.
* pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
* performance when inserting a new BTScanInsert-wise duplicate tuple
* among many leaf pages already full of such duplicates.
*
* Also sets allequalimage field, which indicates whether or not it is
* safe to apply deduplication. We rely on the assumption that
* btm_allequalimage will be zero'ed on heapkeyspace indexes that were
* pg_upgrade'd from Postgres 12.
*/
bool
_bt_heapkeyspace(Relation rel)
void
_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
{
BTMetaPageData *metad;
@ -651,10 +675,11 @@ _bt_heapkeyspace(Relation rel)
*/
if (metad->btm_root == P_NONE)
{
uint32 btm_version = metad->btm_version;
*heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
*allequalimage = metad->btm_allequalimage;
_bt_relbuf(rel, metabuf);
return btm_version > BTREE_NOVAC_VERSION;
return;
}
/*
@ -678,9 +703,12 @@ _bt_heapkeyspace(Relation rel)
Assert(metad->btm_magic == BTREE_MAGIC);
Assert(metad->btm_version >= BTREE_MIN_VERSION);
Assert(metad->btm_version <= BTREE_VERSION);
Assert(!metad->btm_allequalimage ||
metad->btm_version > BTREE_NOVAC_VERSION);
Assert(metad->btm_fastroot != P_NONE);
return metad->btm_version > BTREE_NOVAC_VERSION;
*heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
*allequalimage = metad->btm_allequalimage;
}
/*
@ -964,28 +992,106 @@ _bt_page_recyclable(Page page)
* Delete item(s) from a btree leaf page during VACUUM.
*
* This routine assumes that the caller has a super-exclusive write lock on
* the buffer. Also, the given deletable array *must* be sorted in ascending
* order.
* the buffer. Also, the given deletable and updatable arrays *must* be
* sorted in ascending order.
*
* Routine deals with deleting TIDs when some (but not all) of the heap TIDs
* in an existing posting list item are to be removed by VACUUM. This works
* by updating/overwriting an existing item with caller's new version of the
* item (a version that lacks the TIDs that are to be deleted).
*
* We record VACUUMs and b-tree deletes differently in WAL. Deletes must
* generate their own latestRemovedXid by accessing the heap directly, whereas
* VACUUMs rely on the initial heap scan taking care of it indirectly.
* VACUUMs rely on the initial heap scan taking care of it indirectly. Also,
* only VACUUM can perform granular deletes of individual TIDs in posting list
* tuples.
*/
void
_bt_delitems_vacuum(Relation rel, Buffer buf,
OffsetNumber *deletable, int ndeletable)
OffsetNumber *deletable, int ndeletable,
BTVacuumPosting *updatable, int nupdatable)
{
Page page = BufferGetPage(buf);
BTPageOpaque opaque;
Size itemsz;
char *updatedbuf = NULL;
Size updatedbuflen = 0;
OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
/* Shouldn't be called unless there's something to do */
Assert(ndeletable > 0);
Assert(ndeletable > 0 || nupdatable > 0);
for (int i = 0; i < nupdatable; i++)
{
/* Replace work area IndexTuple with updated version */
_bt_update_posting(updatable[i]);
/* Maintain array of updatable page offsets for WAL record */
updatedoffsets[i] = updatable[i]->updatedoffset;
}
/* XLOG stuff -- allocate and fill buffer before critical section */
if (nupdatable > 0 && RelationNeedsWAL(rel))
{
Size offset = 0;
for (int i = 0; i < nupdatable; i++)
{
BTVacuumPosting vacposting = updatable[i];
itemsz = SizeOfBtreeUpdate +
vacposting->ndeletedtids * sizeof(uint16);
updatedbuflen += itemsz;
}
updatedbuf = palloc(updatedbuflen);
for (int i = 0; i < nupdatable; i++)
{
BTVacuumPosting vacposting = updatable[i];
xl_btree_update update;
update.ndeletedtids = vacposting->ndeletedtids;
memcpy(updatedbuf + offset, &update.ndeletedtids,
SizeOfBtreeUpdate);
offset += SizeOfBtreeUpdate;
itemsz = update.ndeletedtids * sizeof(uint16);
memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
offset += itemsz;
}
}
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
/* Fix the page */
PageIndexMultiDelete(page, deletable, ndeletable);
/*
* Handle posting tuple updates.
*
* Deliberately do this before handling simple deletes. If we did it the
* other way around (i.e. WAL record order -- simple deletes before
* updates) then we'd have to make compensating changes to the 'updatable'
* array of offset numbers.
*
* PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
* happens to already be set. Although we unset the BTP_HAS_GARBAGE page
* level flag, unsetting individual LP_DEAD bits should still be avoided.
*/
for (int i = 0; i < nupdatable; i++)
{
OffsetNumber updatedoffset = updatedoffsets[i];
IndexTuple itup;
itup = updatable[i]->itup;
itemsz = MAXALIGN(IndexTupleSize(itup));
if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
itemsz))
elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
BufferGetBlockNumber(buf), RelationGetRelationName(rel));
}
/* Now handle simple deletes of entire tuples */
if (ndeletable > 0)
PageIndexMultiDelete(page, deletable, ndeletable);
/*
* We can clear the vacuum cycle ID since this page has certainly been
@ -1006,7 +1112,9 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
* limited, since we never falsely unset an LP_DEAD bit. Workloads that
* are particularly dependent on LP_DEAD bits being set quickly will
* usually manage to set the BTP_HAS_GARBAGE flag before the page fills up
* again anyway.
* again anyway. Furthermore, attempting a deduplication pass will remove
* all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit
* is set or not.
*/
opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
@ -1019,18 +1127,22 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
xl_btree_vacuum xlrec_vacuum;
xlrec_vacuum.ndeleted = ndeletable;
xlrec_vacuum.nupdated = nupdatable;
XLogBeginInsert();
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
/*
* The deletable array is not in the buffer, but pretend that it is.
* When XLogInsert stores the whole buffer, the array need not be
* stored too.
*/
XLogRegisterBufData(0, (char *) deletable,
ndeletable * sizeof(OffsetNumber));
if (ndeletable > 0)
XLogRegisterBufData(0, (char *) deletable,
ndeletable * sizeof(OffsetNumber));
if (nupdatable > 0)
{
XLogRegisterBufData(0, (char *) updatedoffsets,
nupdatable * sizeof(OffsetNumber));
XLogRegisterBufData(0, updatedbuf, updatedbuflen);
}
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
@ -1038,6 +1150,13 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
}
END_CRIT_SECTION();
/* can't leak memory here */
if (updatedbuf != NULL)
pfree(updatedbuf);
/* free tuples generated by calling _bt_update_posting() */
for (int i = 0; i < nupdatable; i++)
pfree(updatable[i]->itup);
}
/*
@ -1050,6 +1169,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
* This is nearly the same as _bt_delitems_vacuum as far as what it does to
* the page, but it needs to generate its own latestRemovedXid by accessing
* the heap. This is used by the REDO routine to generate recovery conflicts.
* Also, it doesn't handle posting list tuples unless the entire tuple can be
* deleted as a whole (since there is only one LP_DEAD bit per line pointer).
*/
void
_bt_delitems_delete(Relation rel, Buffer buf,
@ -1065,8 +1186,7 @@ _bt_delitems_delete(Relation rel, Buffer buf,
if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
latestRemovedXid =
index_compute_xid_horizon_for_tuples(rel, heapRel, buf,
deletable, ndeletable);
_bt_xid_horizon(rel, heapRel, page, deletable, ndeletable);
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
@ -1113,6 +1233,83 @@ _bt_delitems_delete(Relation rel, Buffer buf,
END_CRIT_SECTION();
}
/*
* Get the latestRemovedXid from the table entries pointed to by the non-pivot
* tuples being deleted.
*
* This is a specialized version of index_compute_xid_horizon_for_tuples().
* It's needed because btree tuples don't always store table TID using the
* standard index tuple header field.
*/
static TransactionId
_bt_xid_horizon(Relation rel, Relation heapRel, Page page,
OffsetNumber *deletable, int ndeletable)
{
TransactionId latestRemovedXid = InvalidTransactionId;
int spacenhtids;
int nhtids;
ItemPointer htids;
/* Array will grow iff there are posting list tuples to consider */
spacenhtids = ndeletable;
nhtids = 0;
htids = (ItemPointer) palloc(sizeof(ItemPointerData) * spacenhtids);
for (int i = 0; i < ndeletable; i++)
{
ItemId itemid;
IndexTuple itup;
itemid = PageGetItemId(page, deletable[i]);
itup = (IndexTuple) PageGetItem(page, itemid);
Assert(ItemIdIsDead(itemid));
Assert(!BTreeTupleIsPivot(itup));
if (!BTreeTupleIsPosting(itup))
{
if (nhtids + 1 > spacenhtids)
{
spacenhtids *= 2;
htids = (ItemPointer)
repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
}
Assert(ItemPointerIsValid(&itup->t_tid));
ItemPointerCopy(&itup->t_tid, &htids[nhtids]);
nhtids++;
}
else
{
int nposting = BTreeTupleGetNPosting(itup);
if (nhtids + nposting > spacenhtids)
{
spacenhtids = Max(spacenhtids * 2, nhtids + nposting);
htids = (ItemPointer)
repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
}
for (int j = 0; j < nposting; j++)
{
ItemPointer htid = BTreeTupleGetPostingN(itup, j);
Assert(ItemPointerIsValid(htid));
ItemPointerCopy(htid, &htids[nhtids]);
nhtids++;
}
}
}
Assert(nhtids >= ndeletable);
latestRemovedXid =
table_compute_xid_horizon_for_tuples(heapRel, htids, nhtids);
pfree(htids);
return latestRemovedXid;
}
/*
* Returns true, if the given block has the half-dead flag set.
*/
@ -2058,6 +2255,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
xlmeta.fastlevel = metad->btm_fastlevel;
xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
xlmeta.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
xlinfo = XLOG_BTREE_UNLINK_PAGE_META;


@ -95,6 +95,10 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
BTCycleId cycleid, TransactionId *oldestBtpoXact);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
BlockNumber orig_blkno);
static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
IndexTuple posting,
OffsetNumber updatedoffset,
int *nremaining);
/*
@ -161,7 +165,7 @@ btbuildempty(Relation index)
/* Construct metapage. */
metapage = (Page) palloc(BLCKSZ);
_bt_initmetapage(metapage, P_NONE, 0);
_bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
/*
* Write the page and log it. It might seem that an immediate sync would
@ -264,8 +268,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
*/
if (so->killedItems == NULL)
so->killedItems = (int *)
palloc(MaxIndexTuplesPerPage * sizeof(int));
if (so->numKilled < MaxIndexTuplesPerPage)
palloc(MaxTIDsPerBTreePage * sizeof(int));
if (so->numKilled < MaxTIDsPerBTreePage)
so->killedItems[so->numKilled++] = so->currPos.itemIndex;
}
@ -1154,11 +1158,15 @@ restart:
}
else if (P_ISLEAF(opaque))
{
OffsetNumber deletable[MaxOffsetNumber];
OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable;
BTVacuumPosting updatable[MaxIndexTuplesPerPage];
int nupdatable;
OffsetNumber offnum,
minoff,
maxoff;
int nhtidsdead,
nhtidslive;
/*
* Trade in the initial read lock for a super-exclusive write lock on
@ -1190,8 +1198,11 @@ restart:
* point using callback.
*/
ndeletable = 0;
nupdatable = 0;
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
nhtidsdead = 0;
nhtidslive = 0;
if (callback)
{
for (offnum = minoff;
@ -1199,11 +1210,9 @@ restart:
offnum = OffsetNumberNext(offnum))
{
IndexTuple itup;
ItemPointer htup;
itup = (IndexTuple) PageGetItem(page,
PageGetItemId(page, offnum));
htup = &(itup->t_tid);
/*
* Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM
@ -1226,22 +1235,82 @@ restart:
* simple, and allows us to always avoid generating our own
* conflicts.
*/
if (callback(htup, callback_state))
deletable[ndeletable++] = offnum;
Assert(!BTreeTupleIsPivot(itup));
if (!BTreeTupleIsPosting(itup))
{
/* Regular tuple, standard table TID representation */
if (callback(&itup->t_tid, callback_state))
{
deletable[ndeletable++] = offnum;
nhtidsdead++;
}
else
nhtidslive++;
}
else
{
BTVacuumPosting vacposting;
int nremaining;
/* Posting list tuple */
vacposting = btreevacuumposting(vstate, itup, offnum,
&nremaining);
if (vacposting == NULL)
{
/*
* All table TIDs from the posting tuple remain, so no
* delete or update required
*/
Assert(nremaining == BTreeTupleGetNPosting(itup));
}
else if (nremaining > 0)
{
/*
* Store metadata about posting list tuple in
* updatable array for entire page. Existing tuple
* will be updated during the later call to
* _bt_delitems_vacuum().
*/
Assert(nremaining < BTreeTupleGetNPosting(itup));
updatable[nupdatable++] = vacposting;
nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
}
else
{
/*
* All table TIDs from the posting list must be
* deleted. We'll delete the index tuple completely
* (no update required).
*/
Assert(nremaining == 0);
deletable[ndeletable++] = offnum;
nhtidsdead += BTreeTupleGetNPosting(itup);
pfree(vacposting);
}
nhtidslive += nremaining;
}
}
}
/*
* Apply any needed deletes. We issue just one _bt_delitems_vacuum()
* call per page, so as to minimize WAL traffic.
* Apply any needed deletes or updates. We issue just one
* _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
*/
if (ndeletable > 0)
if (ndeletable > 0 || nupdatable > 0)
{
_bt_delitems_vacuum(rel, buf, deletable, ndeletable);
Assert(nhtidsdead >= Max(ndeletable, 1));
_bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
nupdatable);
stats->tuples_removed += ndeletable;
stats->tuples_removed += nhtidsdead;
/* must recompute maxoff */
maxoff = PageGetMaxOffsetNumber(page);
/* can't leak memory here */
for (int i = 0; i < nupdatable; i++)
pfree(updatable[i]);
}
else
{
@ -1254,6 +1323,7 @@ restart:
* We treat this like a hint-bit update because there's no need to
* WAL-log it.
*/
Assert(nhtidsdead == 0);
if (vstate->cycleid != 0 &&
opaque->btpo_cycleid == vstate->cycleid)
{
@ -1263,15 +1333,18 @@ restart:
}
/*
* If it's now empty, try to delete; else count the live tuples. We
* don't delete when recursing, though, to avoid putting entries into
* freePages out-of-order (doesn't seem worth any extra code to handle
* the case).
* If it's now empty, try to delete; else count the live tuples (live
* table TIDs in posting lists are counted as separate live tuples).
* We don't delete when recursing, though, to avoid putting entries
* into freePages out-of-order (doesn't seem worth any extra code to
* handle the case).
*/
if (minoff > maxoff)
delete_now = (blkno == orig_blkno);
else
stats->num_index_tuples += maxoff - minoff + 1;
stats->num_index_tuples += nhtidslive;
Assert(!delete_now || nhtidslive == 0);
}
if (delete_now)
@ -1303,9 +1376,10 @@ restart:
/*
* This is really tail recursion, but if the compiler is too stupid to
* optimize it as such, we'd eat an uncomfortably large amount of stack
* space per recursion level (due to the deletable[] array). A failure is
* improbable since the number of levels isn't likely to be large ... but
* just in case, let's hand-optimize into a loop.
* space per recursion level (due to the arrays used to track details of
* deletable/updatable items). A failure is improbable since the number
* of levels isn't likely to be large ... but just in case, let's
* hand-optimize into a loop.
*/
if (recurse_to != P_NONE)
{
@ -1314,6 +1388,61 @@ restart:
}
}
/*
* btreevacuumposting --- determine TIDs still needed in posting list
*
* Returns metadata describing how to build replacement tuple without the TIDs
* that VACUUM needs to delete. Returned value is NULL in the common case
* where no changes are needed to caller's posting list tuple (we avoid
* allocating memory here as an optimization).
*
* The number of TIDs that should remain in the posting list tuple is set for
* caller in *nremaining.
*/
static BTVacuumPosting
btreevacuumposting(BTVacState *vstate, IndexTuple posting,
OffsetNumber updatedoffset, int *nremaining)
{
int live = 0;
int nitem = BTreeTupleGetNPosting(posting);
ItemPointer items = BTreeTupleGetPosting(posting);
BTVacuumPosting vacposting = NULL;
for (int i = 0; i < nitem; i++)
{
if (!vstate->callback(items + i, vstate->callback_state))
{
/* Live table TID */
live++;
}
else if (vacposting == NULL)
{
/*
* First dead table TID encountered.
*
* It's now clear that we need to delete one or more dead table
* TIDs, so start maintaining metadata describing how to update
* existing posting list tuple.
*/
vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
nitem * sizeof(uint16));
vacposting->itup = posting;
vacposting->updatedoffset = updatedoffset;
vacposting->ndeletedtids = 0;
vacposting->deletetids[vacposting->ndeletedtids++] = i;
}
else
{
/* Second or subsequent dead table TID */
vacposting->deletetids[vacposting->ndeletedtids++] = i;
}
}
*nremaining = live;
return vacposting;
}
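/*
 * Worked example (hypothetical posting list): given a posting list tuple
 * with four TIDs where the callback reports positions 1 and 3 as dead,
 * btreevacuumposting() returns a BTVacuumPosting with ndeletedtids = 2 and
 * deletetids[] = {1, 3}, and sets *nremaining to 2.  If all four TIDs were
 * dead it would instead return a (still palloc()'d) entry with
 * ndeletedtids = 4 and *nremaining = 0, which the caller above treats as a
 * simple whole-tuple delete and frees right away.
 */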
/*
* btcanreturn() -- Check whether btree indexes support index-only scans.
*


@ -26,10 +26,18 @@
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
static int _bt_binsrch_posting(BTScanInsert key, Page page,
OffsetNumber offnum);
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
OffsetNumber offnum);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum, IndexTuple itup);
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
OffsetNumber offnum, ItemPointer heapTid,
IndexTuple itup);
static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum,
ItemPointer heapTid, int tupleOffset);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
@ -142,6 +150,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
offnum = _bt_binsrch(rel, key, *bufP);
itemid = PageGetItemId(page, offnum);
itup = (IndexTuple) PageGetItem(page, itemid);
Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
blkno = BTreeTupleGetDownLink(itup);
par_blkno = BufferGetBlockNumber(*bufP);
@ -434,7 +443,10 @@ _bt_binsrch(Relation rel,
* low) makes bounds invalid.
*
* Caller is responsible for invalidating bounds when it modifies the page
* before calling here a second time.
* before calling here a second time, and for dealing with posting list
* tuple matches (callers can use insertstate's postingoff field to
* determine which existing heap TID will need to be replaced by a posting
* list split).
*/
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
@ -453,6 +465,7 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Assert(P_ISLEAF(opaque));
Assert(!key->nextkey);
Assert(insertstate->postingoff == 0);
if (!insertstate->bounds_valid)
{
@ -509,6 +522,16 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
if (result != 0)
stricthigh = high;
}
/*
* If tuple at offset located by binary search is a posting list whose
* TID range overlaps with caller's scantid, perform posting list
* binary search to set postingoff for caller. Caller must split the
* posting list when postingoff is set. This should happen
* infrequently.
*/
if (unlikely(result == 0 && key->scantid != NULL))
insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
}
/*
@ -528,6 +551,73 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
return low;
}
/*----------
* _bt_binsrch_posting() -- posting list binary search.
*
* Helper routine for _bt_binsrch_insert().
*
* Returns offset into posting list where caller's scantid belongs.
*----------
*/
static int
_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
{
IndexTuple itup;
ItemId itemid;
int low,
high,
mid,
res;
/*
* If this isn't a posting tuple, then the index must be corrupt (if it is
* an ordinary non-pivot tuple then there must be an existing tuple with a
* heap TID that equals inserter's new heap TID/scantid). Defensively
* check that tuple is a posting list tuple whose posting list range
* includes caller's scantid.
*
* (This is also needed because contrib/amcheck's rootdescend option needs
* to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
*/
itemid = PageGetItemId(page, offnum);
itup = (IndexTuple) PageGetItem(page, itemid);
if (!BTreeTupleIsPosting(itup))
return 0;
Assert(key->heapkeyspace && key->allequalimage);
/*
* In the event that posting list tuple has LP_DEAD bit set, indicate this
* to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
* second call to _bt_binsrch_insert() can take place when its caller has
* removed the dead item.
*/
if (ItemIdIsDead(itemid))
return -1;
/* "high" is past end of posting list for loop invariant */
low = 0;
high = BTreeTupleGetNPosting(itup);
Assert(high >= 2);
while (high > low)
{
mid = low + ((high - low) / 2);
res = ItemPointerCompare(key->scantid,
BTreeTupleGetPostingN(itup, mid));
if (res > 0)
low = mid + 1;
else if (res < 0)
high = mid;
else
return mid;
}
/* Exact match not found */
return low;
}
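/*
 * Editor's note -- worked example for _bt_binsrch_posting() (values
 * invented, not part of the patch): given a posting list containing the
 * heap TIDs (10,1), (10,3), and (10,5) and a scantid of (10,4), the
 * search runs with low=0, high=3; mid=1 compares above (10,3), so low
 * becomes 2; mid=2 compares below (10,5), so high becomes 2 and the loop
 * ends.  The return value 2 is the postingoff at which the caller must
 * split the posting list before the new TID can be inserted.
 */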
/*----------
* _bt_compare() -- Compare insertion-type scankey to tuple on a page.
*
@ -537,9 +627,14 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
* <0 if scankey < tuple at offnum;
* 0 if scankey == tuple at offnum;
* >0 if scankey > tuple at offnum.
* NULLs in the keys are treated as sortable values. Therefore
* "equality" does not necessarily mean that the item should be
* returned to the caller as a matching key!
*
* NULLs in the keys are treated as sortable values. Therefore
* "equality" does not necessarily mean that the item should be returned
* to the caller as a matching key. Similarly, an insertion scankey
* with its scantid set is treated as equal to a posting tuple whose TID
* range overlaps with its scantid. There generally won't be a
* matching TID in the posting tuple, which the caller must handle
* itself (e.g., by splitting the posting list tuple).
*
* CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
* "minus infinity": this routine will always claim it is less than the
@ -563,6 +658,7 @@ _bt_compare(Relation rel,
ScanKey scankey;
int ncmpkey;
int ntupatts;
int32 result;
Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
@ -592,12 +688,12 @@ _bt_compare(Relation rel,
ncmpkey = Min(ntupatts, key->keysz);
Assert(key->heapkeyspace || ncmpkey == key->keysz);
Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
scankey = key->scankeys;
for (int i = 1; i <= ncmpkey; i++)
{
Datum datum;
bool isNull;
int32 result;
datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
@ -712,8 +808,25 @@ _bt_compare(Relation rel,
if (heapTid == NULL)
return 1;
/*
* Scankey must be treated as equal to a posting list tuple if its scantid
* value falls within the range of the posting list. In all other cases
* there can only be a single heap TID value, which is compared directly
* with scantid.
*/
Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
return ItemPointerCompare(key->scantid, heapTid);
result = ItemPointerCompare(key->scantid, heapTid);
if (result <= 0 || !BTreeTupleIsPosting(itup))
return result;
else
{
result = ItemPointerCompare(key->scantid,
BTreeTupleGetMaxHeapTID(itup));
if (result > 0)
return 1;
}
return 0;
}
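/*
 * Editor's note -- worked example for the posting list case in
 * _bt_compare() (values invented, not part of the patch): if itup is a
 * posting list tuple whose TIDs span (40,1) through (60,2) and
 * key->scantid is (50,3), the first ItemPointerCompare() call returns a
 * positive result, and the comparison against the tuple's maximum heap
 * TID returns a negative one, so 0 is returned.  The scankey is treated
 * as "equal" because its scantid falls inside the tuple's TID range,
 * even though no TID in the posting list is exactly (50,3).
 */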
/*
@ -1228,7 +1341,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}
/* Initialize remaining insertion scan key fields */
inskey.heapkeyspace = _bt_heapkeyspace(rel);
_bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
inskey.anynullkeys = false; /* unused */
inskey.nextkey = nextkey;
inskey.pivotsearch = false;
@ -1483,9 +1596,35 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
{
/* tuple passes all scan key conditions, so remember it */
_bt_saveitem(so, itemIndex, offnum, itup);
itemIndex++;
/* tuple passes all scan key conditions */
if (!BTreeTupleIsPosting(itup))
{
/* Remember it */
_bt_saveitem(so, itemIndex, offnum, itup);
itemIndex++;
}
else
{
int tupleOffset;
/*
* Set up state to return posting list, and remember first
* TID
*/
tupleOffset =
_bt_setuppostingitems(so, itemIndex, offnum,
BTreeTupleGetPostingN(itup, 0),
itup);
itemIndex++;
/* Remember additional TIDs */
for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
{
_bt_savepostingitem(so, itemIndex, offnum,
BTreeTupleGetPostingN(itup, i),
tupleOffset);
itemIndex++;
}
}
}
/* When !continuescan, there can't be any more matches, so stop */
if (!continuescan)
@ -1518,7 +1657,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
if (!continuescan)
so->currPos.moreRight = false;
Assert(itemIndex <= MaxIndexTuplesPerPage);
Assert(itemIndex <= MaxTIDsPerBTreePage);
so->currPos.firstItem = 0;
so->currPos.lastItem = itemIndex - 1;
so->currPos.itemIndex = 0;
@ -1526,7 +1665,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
else
{
/* load items[] in descending order */
itemIndex = MaxIndexTuplesPerPage;
itemIndex = MaxTIDsPerBTreePage;
offnum = Min(offnum, maxoff);
@ -1567,9 +1706,41 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
&continuescan);
if (passes_quals && tuple_alive)
{
/* tuple passes all scan key conditions, so remember it */
itemIndex--;
_bt_saveitem(so, itemIndex, offnum, itup);
/* tuple passes all scan key conditions */
if (!BTreeTupleIsPosting(itup))
{
/* Remember it */
itemIndex--;
_bt_saveitem(so, itemIndex, offnum, itup);
}
else
{
int tupleOffset;
/*
* Set up state to return posting list, and remember first
* TID.
*
* Note that we deliberately save/return items from
* posting lists in ascending heap TID order for backwards
* scans. This allows _bt_killitems() to make a
* consistent assumption about the order of items
* associated with the same posting list tuple.
*/
itemIndex--;
tupleOffset =
_bt_setuppostingitems(so, itemIndex, offnum,
BTreeTupleGetPostingN(itup, 0),
itup);
/* Remember additional TIDs */
for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
{
itemIndex--;
_bt_savepostingitem(so, itemIndex, offnum,
BTreeTupleGetPostingN(itup, i),
tupleOffset);
}
}
}
if (!continuescan)
{
@ -1583,8 +1754,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Assert(itemIndex >= 0);
so->currPos.firstItem = itemIndex;
so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
}
return (so->currPos.firstItem <= so->currPos.lastItem);
@ -1597,6 +1768,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
{
BTScanPosItem *currItem = &so->currPos.items[itemIndex];
Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
currItem->heapTid = itup->t_tid;
currItem->indexOffset = offnum;
if (so->currTuples)
@ -1609,6 +1782,71 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
}
}
/*
* Setup state to save TIDs/items from a single posting list tuple.
*
* Saves an index item into so->currPos.items[itemIndex] for TID that is
* returned to scan first. Second or subsequent TIDs for posting list should
* be saved by calling _bt_savepostingitem().
*
* Returns the offset into tuple storage space at which the base tuple is
* stored, or 0 when tuple storage is not in use for this scan.
*/
static int
_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
ItemPointer heapTid, IndexTuple itup)
{
BTScanPosItem *currItem = &so->currPos.items[itemIndex];
Assert(BTreeTupleIsPosting(itup));
currItem->heapTid = *heapTid;
currItem->indexOffset = offnum;
if (so->currTuples)
{
/* Save base IndexTuple (truncate posting list) */
IndexTuple base;
Size itupsz = BTreeTupleGetPostingOffset(itup);
itupsz = MAXALIGN(itupsz);
currItem->tupleOffset = so->currPos.nextTupleOffset;
base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
memcpy(base, itup, itupsz);
/* Defensively reduce work area index tuple header size */
base->t_info &= ~INDEX_SIZE_MASK;
base->t_info |= itupsz;
so->currPos.nextTupleOffset += itupsz;
return currItem->tupleOffset;
}
return 0;
}
/*
* Save an index item into so->currPos.items[itemIndex] for current posting
* tuple.
*
* Assumes that _bt_setuppostingitems() has already been called for current
* posting list tuple. Caller passes its return value as tupleOffset.
*/
static inline void
_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
ItemPointer heapTid, int tupleOffset)
{
BTScanPosItem *currItem = &so->currPos.items[itemIndex];
currItem->heapTid = *heapTid;
currItem->indexOffset = offnum;
/*
* Have index-only scans return the same base IndexTuple for every TID
* that originates from the same posting list
*/
if (so->currTuples)
currItem->tupleOffset = tupleOffset;
}
/*
* _bt_steppage() -- Step to next page containing valid data for scan
*

View File

@ -243,6 +243,7 @@ typedef struct BTPageState
BlockNumber btps_blkno; /* block # to write this page at */
IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
OffsetNumber btps_lastoff; /* last item offset loaded */
Size btps_lastextra; /* last item's extra posting list space */
uint32 btps_level; /* tree level (0 = leaf) */
Size btps_full; /* "full" if less than this much free space */
struct BTPageState *btps_next; /* link to parent level, if any */
@ -277,7 +278,10 @@ static void _bt_slideleft(Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
IndexTuple itup, OffsetNumber itup_off);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
IndexTuple itup);
IndexTuple itup, Size truncextra);
static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
BTPageState *state,
BTDedupState dstate);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
static void _bt_load(BTWriteState *wstate,
BTSpool *btspool, BTSpool *btspool2);
@ -563,6 +567,8 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
wstate.heap = btspool->heap;
wstate.index = btspool->index;
wstate.inskey = _bt_mkscankey(wstate.index, NULL);
/* _bt_mkscankey() won't set allequalimage without metapage */
wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
/*
* We need to log index creation in WAL iff WAL archiving/streaming is
@ -711,6 +717,7 @@ _bt_pagestate(BTWriteState *wstate, uint32 level)
state->btps_lowkey = NULL;
/* initialize lastoff so first item goes into P_FIRSTKEY */
state->btps_lastoff = P_HIKEY;
state->btps_lastextra = 0;
state->btps_level = level;
/* set "full" threshold based on level. See notes at head of file. */
if (level > 0)
@ -789,7 +796,8 @@ _bt_sortaddtup(Page page,
}
/*----------
* Add an item to a disk page from the sort output.
* Add an item to a disk page from the sort output (or add a posting list
* item formed from the sort output).
*
* We must be careful to observe the page layout conventions of nbtsearch.c:
* - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
@ -821,14 +829,27 @@ _bt_sortaddtup(Page page,
* the truncated high key at offset 1.
*
* 'last' pointer indicates the last offset added to the page.
*
* 'truncextra' is the size of the posting list in itup, if any. This
* information is stashed for the next call here, when we may benefit
* from considering the impact of truncating away the posting list on
* the page before deciding to finish the page off. Posting lists are
* often relatively large, so it is worth going to the trouble of
* accounting for the saving from truncating away the posting list of
* the tuple that becomes the high key (that may be the only way to
* get close to target free space on the page). Note that this is
* only used for the soft fillfactor-wise limit, not the critical hard
* limit.
*----------
*/
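/*
 * Editor's note -- illustrative arithmetic for 'truncextra' (values
 * invented, not part of the patch): if the item added by the previous
 * call was a posting list tuple holding 50 heap TIDs, its posting list
 * occupies roughly 50 * 6 = 300 bytes, so that call passed truncextra =
 * 300 and it was stashed in btps_lastextra.  On the current call the
 * soft fillfactor check treats the page as having those 300 bytes of
 * extra free space (pgspc + last_truncextra), since the high key formed
 * from that tuple will have its posting list truncated away.
 */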
static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
Size truncextra)
{
Page npage;
BlockNumber nblkno;
OffsetNumber last_off;
Size last_truncextra;
Size pgspc;
Size itupsz;
bool isleaf;
@ -842,6 +863,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
npage = state->btps_page;
nblkno = state->btps_blkno;
last_off = state->btps_lastoff;
last_truncextra = state->btps_lastextra;
state->btps_lastextra = truncextra;
pgspc = PageGetFreeSpace(npage);
itupsz = IndexTupleSize(itup);
@ -883,10 +906,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
* page. Disregard fillfactor and insert on "full" current page if we
* don't have the minimum number of items yet. (Note that we deliberately
* assume that suffix truncation neither enlarges nor shrinks new high key
* when applying soft limit.)
* when applying soft limit, except when last tuple has a posting list.)
*/
if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
(pgspc < state->btps_full && last_off > P_FIRSTKEY))
(pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
{
/*
* Finish off the page and write it out.
@ -944,11 +967,14 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
* We don't try to bias our choice of split point to make it more
* likely that _bt_truncate() can truncate away more attributes,
* whereas the split point used within _bt_split() is chosen much
* more delicately. Suffix truncation is mostly useful because it
* improves space utilization for workloads with random
* insertions. It doesn't seem worthwhile to add logic for
* choosing a split point here for a benefit that is bound to be
* much smaller.
* more delicately. Even so, the lastleft and firstright
* tuples passed to _bt_truncate() here are at least not fully
* equal to each other when deduplication is used, unless there is
* a large group of duplicates (also, unique index builds usually
* have few or no spool2 duplicates). When the split point is
* between two unequal tuples, _bt_truncate() will avoid including
* a heap TID in the new high key, which is the most important
* benefit of suffix truncation.
*
* Overwrite the old item with new truncated high key directly.
* oitup is already located at the physical beginning of tuple
@ -983,7 +1009,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
!P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
_bt_buildadd(wstate, state->btps_next, state->btps_lowkey);
_bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
pfree(state->btps_lowkey);
/*
@ -1045,6 +1071,43 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
state->btps_lastoff = last_off;
}
/*
* Finalize pending posting list tuple, and add it to the index. Final tuple
* is based on saved base tuple, and saved list of heap TIDs.
*
* This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
* using _bt_buildadd().
*/
static void
_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
BTDedupState dstate)
{
Assert(dstate->nitems > 0);
if (dstate->nitems == 1)
_bt_buildadd(wstate, state, dstate->base, 0);
else
{
IndexTuple postingtuple;
Size truncextra;
/* form a tuple with a posting list */
postingtuple = _bt_form_posting(dstate->base,
dstate->htids,
dstate->nhtids);
/* Calculate posting list overhead */
truncextra = IndexTupleSize(postingtuple) -
BTreeTupleGetPostingOffset(postingtuple);
_bt_buildadd(wstate, state, postingtuple, truncextra);
pfree(postingtuple);
}
dstate->nhtids = 0;
dstate->nitems = 0;
dstate->phystupsize = 0;
}
/*
* Finish writing out the completed btree.
*/
@ -1090,7 +1153,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
!P_LEFTMOST(opaque));
BTreeTupleSetDownLink(s->btps_lowkey, blkno);
_bt_buildadd(wstate, s->btps_next, s->btps_lowkey);
_bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
pfree(s->btps_lowkey);
s->btps_lowkey = NULL;
}
@ -1111,7 +1174,8 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
* by filling in a valid magic number in the metapage.
*/
metapage = (Page) palloc(BLCKSZ);
_bt_initmetapage(metapage, rootblkno, rootlevel);
_bt_initmetapage(metapage, rootblkno, rootlevel,
wstate->inskey->allequalimage);
_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
}
@ -1132,6 +1196,10 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
SortSupport sortKeys;
int64 tuples_done = 0;
bool deduplicate;
deduplicate = wstate->inskey->allequalimage &&
BTGetDeduplicateItems(wstate->index);
if (merge)
{
@ -1228,12 +1296,12 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
if (load1)
{
_bt_buildadd(wstate, state, itup);
_bt_buildadd(wstate, state, itup, 0);
itup = tuplesort_getindextuple(btspool->sortstate, true);
}
else
{
_bt_buildadd(wstate, state, itup2);
_bt_buildadd(wstate, state, itup2, 0);
itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
}
@ -1243,9 +1311,100 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
}
pfree(sortKeys);
}
else if (deduplicate)
{
/* merge is unnecessary, deduplicate into posting lists */
BTDedupState dstate;
dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
dstate->deduplicate = true; /* unused */
dstate->maxpostingsize = 0; /* set later */
/* Metadata about base tuple of current pending posting list */
dstate->base = NULL;
dstate->baseoff = InvalidOffsetNumber; /* unused */
dstate->basetupsize = 0;
/* Metadata about current pending posting list TIDs */
dstate->htids = NULL;
dstate->nhtids = 0;
dstate->nitems = 0;
dstate->phystupsize = 0; /* unused */
dstate->nintervals = 0; /* unused */
while ((itup = tuplesort_getindextuple(btspool->sortstate,
true)) != NULL)
{
/* When we see first tuple, create first index page */
if (state == NULL)
{
state = _bt_pagestate(wstate, 0);
/*
* Limit the size of posting list tuples to 1/10 of the space we want
* to leave behind on the page, plus space for the final item's line
* pointer. This is equal to the space that we'd like to
* leave behind on each leaf page when fillfactor is 90,
* allowing us to get close to fillfactor% space utilization
* when there happen to be a great many duplicates. (This
* makes higher leaf fillfactor settings ineffective when
* building indexes that have many duplicates, but packing
* leaf pages full with few very large tuples doesn't seem
* like a useful goal.)
*/
dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
sizeof(ItemIdData);
Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
dstate->maxpostingsize <= INDEX_SIZE_MASK);
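/*
 * Editor's note -- illustrative arithmetic (assumes the default 8192
 * byte BLCKSZ and 8 byte MAXALIGN; not part of the patch):
 * 8192 * 10 / 100 = 819, MAXALIGN_DOWN() rounds that to 816, and
 * subtracting the 4 byte line pointer leaves a 812 byte cap on each
 * posting list tuple formed by this CREATE INDEX path.
 */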
dstate->htids = palloc(dstate->maxpostingsize);
/* start new pending posting list with itup copy */
_bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
InvalidOffsetNumber);
}
else if (_bt_keep_natts_fast(wstate->index, dstate->base,
itup) > keysz &&
_bt_dedup_save_htid(dstate, itup))
{
/*
* Tuple is equal to base tuple of pending posting list. Heap
* TID from itup has been saved in state.
*/
}
else
{
/*
* Tuple is not equal to pending posting list tuple, or
* _bt_dedup_save_htid() opted to not merge current item into
* pending posting list.
*/
_bt_sort_dedup_finish_pending(wstate, state, dstate);
pfree(dstate->base);
/* start new pending posting list with itup copy */
_bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
InvalidOffsetNumber);
}
/* Report progress */
pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
++tuples_done);
}
if (state)
{
/*
* Handle the last item (there must be a last item when the
* tuplesort returned one or more tuples)
*/
_bt_sort_dedup_finish_pending(wstate, state, dstate);
pfree(dstate->base);
pfree(dstate->htids);
}
pfree(dstate);
}
else
{
/* merge is unnecessary */
/* merging and deduplication are both unnecessary */
while ((itup = tuplesort_getindextuple(btspool->sortstate,
true)) != NULL)
{
@ -1253,7 +1412,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
if (state == NULL)
state = _bt_pagestate(wstate, 0);
_bt_buildadd(wstate, state, itup);
_bt_buildadd(wstate, state, itup, 0);
/* Report progress */
pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,

View File

@ -183,6 +183,9 @@ _bt_findsplitloc(Relation rel,
state.minfirstrightsz = SIZE_MAX;
state.newitemoff = newitemoff;
/* newitem cannot be a posting list item */
Assert(!BTreeTupleIsPosting(newitem));
/*
* maxsplits should never exceed maxoff because there will be at most as
* many candidate split points as there are points _between_ tuples, once
@ -459,6 +462,7 @@ _bt_recsplitloc(FindSplitData *state,
int16 leftfree,
rightfree;
Size firstrightitemsz;
Size postingsz = 0;
bool newitemisfirstonright;
/* Is the new item going to be the first item on the right page? */
@ -468,8 +472,30 @@ _bt_recsplitloc(FindSplitData *state,
if (newitemisfirstonright)
firstrightitemsz = state->newitemsz;
else
{
firstrightitemsz = firstoldonrightsz;
/*
* Calculate suffix truncation space saving when firstright is a
* posting list tuple, though only when the firstright is over 64
* bytes including line pointer overhead (arbitrary). This avoids
* accessing the tuple in cases where its posting list must be very
* small (if firstright has one at all).
*/
if (state->is_leaf && firstrightitemsz > 64)
{
ItemId itemid;
IndexTuple newhighkey;
itemid = PageGetItemId(state->page, firstoldonright);
newhighkey = (IndexTuple) PageGetItem(state->page, itemid);
if (BTreeTupleIsPosting(newhighkey))
postingsz = IndexTupleSize(newhighkey) -
BTreeTupleGetPostingOffset(newhighkey);
}
}
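/*
 * Editor's note -- illustrative example (values invented, not part of
 * the patch): if firstright is a 280 byte posting list tuple whose
 * posting list begins 40 bytes into the tuple, postingsz is 240.  Those
 * 240 bytes are then subtracted from the worst-case high key allowance
 * applied to the left page below, since suffix truncation will strip
 * the posting list from the new high key.
 */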
/* Account for all the old tuples */
leftfree = state->leftspace - olddataitemstoleft;
rightfree = state->rightspace -
@ -491,11 +517,17 @@ _bt_recsplitloc(FindSplitData *state,
* If we are on the leaf level, assume that suffix truncation cannot avoid
* adding a heap TID to the left half's new high key when splitting at the
* leaf level. In practice the new high key will often be smaller and
* will rarely be larger, but conservatively assume the worst case.
* will rarely be larger, but conservatively assume the worst case. We do
* go to the trouble of subtracting away posting list overhead, though
* only when it looks like it will make an appreciable difference.
* (Posting lists are the only case where truncation will typically make
* the final high key far smaller than firstright, so being a bit more
* precise there noticeably improves the balance of free space.)
*/
if (state->is_leaf)
leftfree -= (int16) (firstrightitemsz +
MAXALIGN(sizeof(ItemPointerData)));
MAXALIGN(sizeof(ItemPointerData)) -
postingsz);
else
leftfree -= (int16) firstrightitemsz;
@ -691,7 +723,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff));
tup = (IndexTuple) PageGetItem(state->page, itemid);
/* Do cheaper test first */
if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
if (BTreeTupleIsPosting(tup) ||
!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
return false;
/* Check same conditions as rightmost item case, too */
keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);

View File

@ -81,7 +81,10 @@ static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
* determine whether or not the keys in the index are expected to be
* unique (i.e. if this is a "heapkeyspace" index). We assume a
* heapkeyspace index when caller passes a NULL tuple, allowing index
* build callers to avoid accessing the non-existent metapage.
* build callers to avoid accessing the non-existent metapage. We
* also assume that the index is _not_ allequalimage when a NULL tuple
* is passed; CREATE INDEX callers call _bt_allequalimage() to set the
* field themselves.
*/
BTScanInsert
_bt_mkscankey(Relation rel, IndexTuple itup)
@ -108,7 +111,14 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
*/
key = palloc(offsetof(BTScanInsertData, scankeys) +
sizeof(ScanKeyData) * indnkeyatts);
key->heapkeyspace = itup == NULL || _bt_heapkeyspace(rel);
if (itup)
_bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
else
{
/* Utility statement callers can set these fields themselves */
key->heapkeyspace = true;
key->allequalimage = false;
}
key->anynullkeys = false; /* initial assumption */
key->nextkey = false;
key->pivotsearch = false;
@ -1374,6 +1384,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
* attribute passes the qual.
*/
Assert(ScanDirectionIsForward(dir));
Assert(BTreeTupleIsPivot(tuple));
continue;
}
@ -1535,6 +1546,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
* attribute passes the qual.
*/
Assert(ScanDirectionIsForward(dir));
Assert(BTreeTupleIsPivot(tuple));
cmpresult = 0;
if (subkey->sk_flags & SK_ROW_END)
break;
@ -1774,10 +1786,65 @@ _bt_killitems(IndexScanDesc scan)
{
ItemId iid = PageGetItemId(page, offnum);
IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
bool killtuple = false;
if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
if (BTreeTupleIsPosting(ituple))
{
/* found the item */
int pi = i + 1;
int nposting = BTreeTupleGetNPosting(ituple);
int j;
/*
* Note that we rely on the assumption that heap TIDs in the
* scanpos items array are always in ascending heap TID order
* within a posting list
*/
for (j = 0; j < nposting; j++)
{
ItemPointer item = BTreeTupleGetPostingN(ituple, j);
if (!ItemPointerEquals(item, &kitem->heapTid))
break; /* out of posting list loop */
/* kitem must have matching offnum when heap TIDs match */
Assert(kitem->indexOffset == offnum);
/*
* Read-ahead to later kitems here.
*
* We rely on the assumption that not advancing kitem here
* will prevent us from considering the posting list tuple
* fully dead by not matching its next heap TID in next
* loop iteration.
*
* If, on the other hand, this is the final heap TID in
* the posting list tuple, then tuple gets killed
* regardless (i.e. we handle the case where the last
* kitem is also the last heap TID in the last index tuple
* correctly -- posting tuple still gets killed).
*/
if (pi < numKilled)
kitem = &so->currPos.items[so->killedItems[pi++]];
}
/*
* Don't bother advancing the outermost loop's int iterator to
* avoid processing killed items that relate to the same
* offnum/posting list tuple. This micro-optimization hardly
* seems worth it. (Further iterations of the outermost loop
* will fail to match on this same posting list's first heap
* TID instead, so we'll advance to the next offnum/index
* tuple pretty quickly.)
*/
if (j == nposting)
killtuple = true;
}
else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
killtuple = true;
if (killtuple)
{
/* found the item/all posting list items */
ItemIdMarkDead(iid);
killedsomething = true;
break; /* out of inner search loop */
@ -2018,7 +2085,9 @@ btoptions(Datum reloptions, bool validate)
static const relopt_parse_elt tab[] = {
{"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
{"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
offsetof(BTOptions, vacuum_cleanup_index_scale_factor)}
offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
{"deduplicate_items", RELOPT_TYPE_BOOL,
offsetof(BTOptions, deduplicate_items)}
};
@ -2119,11 +2188,10 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
Size newsize;
/*
* We should only ever truncate leaf index tuples. It's never okay to
* truncate a second time.
* We should only ever truncate non-pivot tuples from leaf pages. It's
* never okay to truncate when splitting an internal page.
*/
Assert(BTreeTupleGetNAtts(lastleft, rel) == natts);
Assert(BTreeTupleGetNAtts(firstright, rel) == natts);
Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
/* Determine how many attributes must be kept in truncated tuple */
keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
@ -2139,6 +2207,19 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
pivot = index_truncate_tuple(itupdesc, firstright, keepnatts);
if (BTreeTupleIsPosting(pivot))
{
/*
* index_truncate_tuple() just returns a straight copy of
* firstright when it has no key attributes to truncate. We need
* to truncate away the posting list ourselves.
*/
Assert(keepnatts == nkeyatts);
Assert(natts == nkeyatts);
pivot->t_info &= ~INDEX_SIZE_MASK;
pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
}
/*
* If there is a distinguishing key attribute within new pivot tuple,
* there is no need to add an explicit heap TID attribute
@ -2155,6 +2236,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* attribute to the new pivot tuple.
*/
Assert(natts != nkeyatts);
Assert(!BTreeTupleIsPosting(lastleft) &&
!BTreeTupleIsPosting(firstright));
newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData));
tidpivot = palloc0(newsize);
memcpy(tidpivot, pivot, IndexTupleSize(pivot));
@ -2172,6 +2255,19 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
newsize = IndexTupleSize(firstright) + MAXALIGN(sizeof(ItemPointerData));
pivot = palloc0(newsize);
memcpy(pivot, firstright, IndexTupleSize(firstright));
if (BTreeTupleIsPosting(firstright))
{
/*
* New pivot tuple was copied from firstright, which happens to be
* a posting list tuple. We will have to include the max lastleft
* heap TID in the final pivot tuple, but we can remove the
* posting list now. (Pivot tuples should never contain a posting
* list.)
*/
newsize = MAXALIGN(BTreeTupleGetPostingOffset(firstright)) +
MAXALIGN(sizeof(ItemPointerData));
}
}
/*
@ -2199,7 +2295,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
*/
pivotheaptid = (ItemPointer) ((char *) pivot + newsize -
sizeof(ItemPointerData));
ItemPointerCopy(&lastleft->t_tid, pivotheaptid);
ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
/*
* Lehman and Yao require that the downlink to the right page, which is to
@ -2210,9 +2306,12 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* tiebreaker.
*/
#ifndef DEBUG_NO_TRUNCATE
Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0);
Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0);
Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
BTreeTupleGetHeapTID(firstright)) < 0);
Assert(ItemPointerCompare(pivotheaptid,
BTreeTupleGetHeapTID(lastleft)) >= 0);
Assert(ItemPointerCompare(pivotheaptid,
BTreeTupleGetHeapTID(firstright)) < 0);
#else
/*
@ -2225,7 +2324,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* attribute values along with lastleft's heap TID value when lastleft's
* TID happens to be greater than firstright's TID.
*/
ItemPointerCopy(&firstright->t_tid, pivotheaptid);
ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
/*
* Pivot heap TID should never be fully equal to firstright. Note that
@ -2234,7 +2333,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
*/
ItemPointerSetOffsetNumber(pivotheaptid,
OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
Assert(ItemPointerCompare(pivotheaptid,
BTreeTupleGetHeapTID(firstright)) < 0);
#endif
BTreeTupleSetNAtts(pivot, nkeyatts);
@ -2301,6 +2401,13 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
keepnatts++;
}
/*
* Assert that _bt_keep_natts_fast() agrees with us in passing. This is
* expected in an allequalimage index.
*/
Assert(!itup_key->allequalimage ||
keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
return keepnatts;
}
@ -2315,13 +2422,16 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* The approach taken here usually provides the same answer as _bt_keep_natts
* will (for the same pair of tuples from a heapkeyspace index), since the
* majority of btree opclasses can never indicate that two datums are equal
* unless they're bitwise equal after detoasting.
* unless they're bitwise equal after detoasting. When an index only has
* "equal image" columns, routine is guaranteed to give the same result as
* _bt_keep_natts would.
*
* These issues must be acceptable to callers, typically because they're only
* concerned about making suffix truncation as effective as possible without
* leaving excessive amounts of free space on either side of page split.
* Callers can rely on the fact that attributes considered equal here are
* definitely also equal according to _bt_keep_natts.
* definitely also equal according to _bt_keep_natts, even when the index uses
* an opclass or collation that is not "allequalimage"/deduplication-safe.
* This weaker guarantee is good enough for the nbtsplitloc.c caller, since false
* negatives generally only have the effect of making leaf page splits use a
* more balanced split point.
*/
int
_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
@ -2393,28 +2503,42 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* Mask allocated for number of keys in index tuple must be able to fit
* maximum possible number of index attributes
*/
StaticAssertStmt(BT_N_KEYS_OFFSET_MASK >= INDEX_MAX_KEYS,
"BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS");
StaticAssertStmt(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
"BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
tupnatts = BTreeTupleGetNAtts(itup, rel);
/* !heapkeyspace indexes do not support deduplication */
if (!heapkeyspace && BTreeTupleIsPosting(itup))
return false;
/* Posting list tuples should never have "pivot heap TID" bit set */
if (BTreeTupleIsPosting(itup) &&
(ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
BT_PIVOT_HEAP_TID_ATTR) != 0)
return false;
/* INCLUDE indexes do not support deduplication */
if (natts != nkeyatts && BTreeTupleIsPosting(itup))
return false;
if (P_ISLEAF(opaque))
{
if (offnum >= P_FIRSTDATAKEY(opaque))
{
/*
* Non-pivot tuples currently never use alternative heap TID
* representation -- even those within heapkeyspace indexes
* Non-pivot tuple should never be explicitly marked as a pivot
* tuple
*/
if ((itup->t_info & INDEX_ALT_TID_MASK) != 0)
if (BTreeTupleIsPivot(itup))
return false;
/*
* Leaf tuples that are not the page high key (non-pivot tuples)
* should never be truncated. (Note that tupnatts must have been
* inferred, rather than coming from an explicit on-disk
* representation.)
* inferred, even with a posting list tuple, because only pivot
* tuples store tupnatts directly.)
*/
return tupnatts == natts;
}
@ -2458,12 +2582,12 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* non-zero, or when there is no explicit representation and the
* tuple is evidently not a pre-pg_upgrade tuple.
*
* Prior to v11, downlinks always had P_HIKEY as their offset. Use
* that to decide if the tuple is a pre-v11 tuple.
* Prior to v11, downlinks always had P_HIKEY as their offset.
* Accept that as an alternative indication of a valid
* !heapkeyspace negative infinity tuple.
*/
return tupnatts == 0 ||
((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
}
else
{
@ -2489,7 +2613,11 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* heapkeyspace index pivot tuples, regardless of whether or not there are
* non-key attributes.
*/
if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
if (!BTreeTupleIsPivot(itup))
return false;
/* Pivot tuple should not use posting list representation (redundant) */
if (BTreeTupleIsPosting(itup))
return false;
/*
@ -2559,8 +2687,8 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
BTMaxItemSizeNoHeapTid(page),
RelationGetRelationName(rel)),
errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
ItemPointerGetBlockNumber(&newtup->t_tid),
ItemPointerGetOffsetNumber(&newtup->t_tid),
ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
RelationGetRelationName(heap)),
errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
"Consider a function index of an MD5 hash of the value, "

View File

@ -22,6 +22,9 @@
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "storage/procarray.h"
#include "utils/memutils.h"
static MemoryContext opCtx; /* working memory for operations */
/*
* _bt_restore_page -- re-enter all the index tuples on a page
@ -111,6 +114,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id)
Assert(md->btm_version >= BTREE_NOVAC_VERSION);
md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact;
md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples;
md->btm_allequalimage = xlrec->allequalimage;
pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
pageop->btpo_flags = BTP_META;
@ -156,7 +160,8 @@ _bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id)
}
static void
btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record)
btree_xlog_insert(bool isleaf, bool ismeta, bool posting,
XLogReaderState *record)
{
XLogRecPtr lsn = record->EndRecPtr;
xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
@ -181,9 +186,52 @@ btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record)
page = BufferGetPage(buffer);
if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
false, false) == InvalidOffsetNumber)
elog(PANIC, "btree_xlog_insert: failed to add item");
if (!posting)
{
/* Simple retail insertion */
if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
false, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add new item");
}
else
{
ItemId itemid;
IndexTuple oposting,
newitem,
nposting;
uint16 postingoff;
/*
* A posting list split occurred during leaf page insertion. WAL
* record data will start with an offset number representing the
* point in an existing posting list that a split occurs at.
*
* Use _bt_swap_posting() to repeat posting list split steps from
* primary. Note that newitem from WAL record is 'orignewitem',
* not the final version of newitem that is actually inserted on
* page.
*/
postingoff = *((uint16 *) datapos);
datapos += sizeof(uint16);
datalen -= sizeof(uint16);
itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum));
oposting = (IndexTuple) PageGetItem(page, itemid);
/* Use mutable, aligned newitem copy in _bt_swap_posting() */
Assert(isleaf && postingoff > 0);
newitem = CopyIndexTuple((IndexTuple) datapos);
nposting = _bt_swap_posting(newitem, oposting, postingoff);
/* Replace existing posting list with post-split version */
memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
/* Insert "final" new item (not orignewitem from WAL stream) */
Assert(IndexTupleSize(newitem) == datalen);
if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum,
false, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add posting split new item");
}
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
@ -265,20 +313,38 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
OffsetNumber off;
IndexTuple newitem = NULL,
left_hikey = NULL;
left_hikey = NULL,
nposting = NULL;
Size newitemsz = 0,
left_hikeysz = 0;
Page newlpage;
OffsetNumber leftoff;
OffsetNumber leftoff,
replacepostingoff = InvalidOffsetNumber;
datapos = XLogRecGetBlockData(record, 0, &datalen);
if (onleft)
if (onleft || xlrec->postingoff != 0)
{
newitem = (IndexTuple) datapos;
newitemsz = MAXALIGN(IndexTupleSize(newitem));
datapos += newitemsz;
datalen -= newitemsz;
if (xlrec->postingoff != 0)
{
ItemId itemid;
IndexTuple oposting;
/* Posting list must be at offset number before new item's */
replacepostingoff = OffsetNumberPrev(xlrec->newitemoff);
/* Use mutable, aligned newitem copy in _bt_swap_posting() */
newitem = CopyIndexTuple(newitem);
itemid = PageGetItemId(lpage, replacepostingoff);
oposting = (IndexTuple) PageGetItem(lpage, itemid);
nposting = _bt_swap_posting(newitem, oposting,
xlrec->postingoff);
}
}
/*
@ -308,8 +374,20 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
Size itemsz;
IndexTuple item;
/* Add replacement posting list when required */
if (off == replacepostingoff)
{
Assert(onleft || xlrec->firstright == xlrec->newitemoff);
if (PageAddItem(newlpage, (Item) nposting,
MAXALIGN(IndexTupleSize(nposting)), leftoff,
false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add new posting list item to left page after split");
leftoff = OffsetNumberNext(leftoff);
continue; /* don't insert oposting */
}
/* add the new item if it was inserted on left page */
if (onleft && off == xlrec->newitemoff)
else if (onleft && off == xlrec->newitemoff)
{
if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
false, false) == InvalidOffsetNumber)
@ -383,6 +461,98 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
}
}
static void
btree_xlog_dedup(XLogReaderState *record)
{
XLogRecPtr lsn = record->EndRecPtr;
xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record);
Buffer buf;
if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
{
char *ptr = XLogRecGetBlockData(record, 0, NULL);
Page page = (Page) BufferGetPage(buf);
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
OffsetNumber offnum,
minoff,
maxoff;
BTDedupState state;
BTDedupInterval *intervals;
Page newpage;
state = (BTDedupState) palloc(sizeof(BTDedupStateData));
state->deduplicate = true; /* unused */
/* Conservatively use larger maxpostingsize than primary */
state->maxpostingsize = BTMaxItemSize(page);
state->base = NULL;
state->baseoff = InvalidOffsetNumber;
state->basetupsize = 0;
state->htids = palloc(state->maxpostingsize);
state->nhtids = 0;
state->nitems = 0;
state->phystupsize = 0;
state->nintervals = 0;
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
newpage = PageGetTempPageCopySpecial(page);
if (!P_RIGHTMOST(opaque))
{
ItemId itemid = PageGetItemId(page, P_HIKEY);
Size itemsz = ItemIdGetLength(itemid);
IndexTuple item = (IndexTuple) PageGetItem(page, itemid);
if (PageAddItem(newpage, (Item) item, itemsz, P_HIKEY,
false, false) == InvalidOffsetNumber)
elog(ERROR, "deduplication failed to add highkey");
}
intervals = (BTDedupInterval *) ptr;
for (offnum = minoff;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
ItemId itemid = PageGetItemId(page, offnum);
IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
if (offnum == minoff)
_bt_dedup_start_pending(state, itup, offnum);
else if (state->nintervals < xlrec->nintervals &&
state->baseoff == intervals[state->nintervals].baseoff &&
state->nitems < intervals[state->nintervals].nitems)
{
if (!_bt_dedup_save_htid(state, itup))
elog(ERROR, "deduplication failed to add heap tid to pending posting list");
}
else
{
_bt_dedup_finish_pending(newpage, state);
_bt_dedup_start_pending(state, itup, offnum);
}
}
_bt_dedup_finish_pending(newpage, state);
Assert(state->nintervals == xlrec->nintervals);
Assert(memcmp(state->intervals, intervals,
state->nintervals * sizeof(BTDedupInterval)) == 0);
if (P_HAS_GARBAGE(opaque))
{
BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
}
PageRestoreTempPage(newpage, page);
PageSetLSN(page, lsn);
MarkBufferDirty(buf);
}
if (BufferIsValid(buf))
UnlockReleaseBuffer(buf);
}
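/*
 * Editor's note -- illustrative replay of btree_xlog_dedup() (values
 * invented, not part of the patch): suppose xlrec->nintervals is 2 and
 * the WAL record's intervals are {baseoff = 1, nitems = 3} and
 * {baseoff = 5, nitems = 2}.  The loop above merges the items at
 * offsets 1-3 into one posting list tuple and the items at offsets 5-6
 * into another, while the item at offset 4 is re-added to the temporary
 * page unchanged, reproducing the deduplication pass performed on the
 * primary.
 */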
static void
btree_xlog_vacuum(XLogReaderState *record)
{
@ -405,7 +575,56 @@ btree_xlog_vacuum(XLogReaderState *record)
page = (Page) BufferGetPage(buffer);
PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
if (xlrec->nupdated > 0)
{
OffsetNumber *updatedoffsets;
xl_btree_update *updates;
updatedoffsets = (OffsetNumber *)
(ptr + xlrec->ndeleted * sizeof(OffsetNumber));
updates = (xl_btree_update *) ((char *) updatedoffsets +
xlrec->nupdated *
sizeof(OffsetNumber));
for (int i = 0; i < xlrec->nupdated; i++)
{
BTVacuumPosting vacposting;
IndexTuple origtuple;
ItemId itemid;
Size itemsz;
itemid = PageGetItemId(page, updatedoffsets[i]);
origtuple = (IndexTuple) PageGetItem(page, itemid);
vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
updates->ndeletedtids * sizeof(uint16));
vacposting->updatedoffset = updatedoffsets[i];
vacposting->itup = origtuple;
vacposting->ndeletedtids = updates->ndeletedtids;
memcpy(vacposting->deletetids,
(char *) updates + SizeOfBtreeUpdate,
updates->ndeletedtids * sizeof(uint16));
_bt_update_posting(vacposting);
/* Overwrite updated version of tuple */
itemsz = MAXALIGN(IndexTupleSize(vacposting->itup));
if (!PageIndexTupleOverwrite(page, updatedoffsets[i],
(Item) vacposting->itup, itemsz))
elog(PANIC, "failed to update partially dead item");
pfree(vacposting->itup);
pfree(vacposting);
/* advance to next xl_btree_update from array */
updates = (xl_btree_update *)
((char *) updates + SizeOfBtreeUpdate +
updates->ndeletedtids * sizeof(uint16));
}
}
if (xlrec->ndeleted > 0)
PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
/*
* Mark the page as not containing any LP_DEAD items --- see comments
@ -724,17 +943,19 @@ void
btree_redo(XLogReaderState *record)
{
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
MemoryContext oldCtx;
oldCtx = MemoryContextSwitchTo(opCtx);
switch (info)
{
case XLOG_BTREE_INSERT_LEAF:
btree_xlog_insert(true, false, record);
btree_xlog_insert(true, false, false, record);
break;
case XLOG_BTREE_INSERT_UPPER:
btree_xlog_insert(false, false, record);
btree_xlog_insert(false, false, false, record);
break;
case XLOG_BTREE_INSERT_META:
btree_xlog_insert(false, true, record);
btree_xlog_insert(false, true, false, record);
break;
case XLOG_BTREE_SPLIT_L:
btree_xlog_split(true, record);
@ -742,6 +963,12 @@ btree_redo(XLogReaderState *record)
case XLOG_BTREE_SPLIT_R:
btree_xlog_split(false, record);
break;
case XLOG_BTREE_INSERT_POST:
btree_xlog_insert(true, false, true, record);
break;
case XLOG_BTREE_DEDUP:
btree_xlog_dedup(record);
break;
case XLOG_BTREE_VACUUM:
btree_xlog_vacuum(record);
break;
@ -767,6 +994,23 @@ btree_redo(XLogReaderState *record)
default:
elog(PANIC, "btree_redo: unknown op code %u", info);
}
MemoryContextSwitchTo(oldCtx);
MemoryContextReset(opCtx);
}
void
btree_xlog_startup(void)
{
opCtx = AllocSetContextCreate(CurrentMemoryContext,
"Btree recovery temporary context",
ALLOCSET_DEFAULT_SIZES);
}
void
btree_xlog_cleanup(void)
{
MemoryContextDelete(opCtx);
opCtx = NULL;
}
/*

View File

@ -27,6 +27,7 @@ btree_desc(StringInfo buf, XLogReaderState *record)
case XLOG_BTREE_INSERT_LEAF:
case XLOG_BTREE_INSERT_UPPER:
case XLOG_BTREE_INSERT_META:
case XLOG_BTREE_INSERT_POST:
{
xl_btree_insert *xlrec = (xl_btree_insert *) rec;
@ -38,15 +39,24 @@ btree_desc(StringInfo buf, XLogReaderState *record)
{
xl_btree_split *xlrec = (xl_btree_split *) rec;
appendStringInfo(buf, "level %u, firstright %d, newitemoff %d",
xlrec->level, xlrec->firstright, xlrec->newitemoff);
appendStringInfo(buf, "level %u, firstright %d, newitemoff %d, postingoff %d",
xlrec->level, xlrec->firstright,
xlrec->newitemoff, xlrec->postingoff);
break;
}
case XLOG_BTREE_DEDUP:
{
xl_btree_dedup *xlrec = (xl_btree_dedup *) rec;
appendStringInfo(buf, "nintervals %u", xlrec->nintervals);
break;
}
case XLOG_BTREE_VACUUM:
{
xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec;
appendStringInfo(buf, "ndeleted %u", xlrec->ndeleted);
appendStringInfo(buf, "ndeleted %u; nupdated %u",
xlrec->ndeleted, xlrec->nupdated);
break;
}
case XLOG_BTREE_DELETE:
@ -130,6 +140,12 @@ btree_identify(uint8 info)
case XLOG_BTREE_SPLIT_R:
id = "SPLIT_R";
break;
case XLOG_BTREE_INSERT_POST:
id = "INSERT_POST";
break;
case XLOG_BTREE_DEDUP:
id = "DEDUP";
break;
case XLOG_BTREE_VACUUM:
id = "VACUUM";
break;

View File

@ -1048,8 +1048,10 @@ PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum)
* This is better than deleting and reinserting the tuple, because it
* avoids any data shifting when the tuple size doesn't change; and
* even when it does, we avoid moving the line pointers around.
* Conceivably this could also be of use to an index AM that cares about
* the physical order of tuples as well as their ItemId order.
* This could be used by an index AM that doesn't want to unset the
* LP_DEAD bit when it happens to be set. It could conceivably also be
* used by an index AM that cares about the physical order of tuples as
* well as their logical/ItemId order.
*
* If there's insufficient space for the new tuple, return false. Other
* errors represent data-corruption problems, so we just elog.
@ -1134,8 +1136,9 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
}
}
/* Update the item's tuple length (other fields shouldn't change) */
ItemIdSetNormal(tupid, offset + size_diff, newsize);
/* Update the item's tuple length without changing its lp_flags field */
tupid->lp_off = offset + size_diff;
tupid->lp_len = newsize;
/* Copy new tuple data onto page */
memcpy(PageGetItem(page, tupid), newtup, newsize);

View File

@ -1731,14 +1731,14 @@ psql_completion(const char *text, int start, int end)
/* ALTER INDEX <foo> SET|RESET ( */
else if (Matches("ALTER", "INDEX", MatchAny, "RESET", "("))
COMPLETE_WITH("fillfactor",
"vacuum_cleanup_index_scale_factor", /* BTREE */
"vacuum_cleanup_index_scale_factor", "deduplicate_items", /* BTREE */
"fastupdate", "gin_pending_list_limit", /* GIN */
"buffering", /* GiST */
"pages_per_range", "autosummarize" /* BRIN */
);
else if (Matches("ALTER", "INDEX", MatchAny, "SET", "("))
COMPLETE_WITH("fillfactor =",
"vacuum_cleanup_index_scale_factor =", /* BTREE */
"vacuum_cleanup_index_scale_factor =", "deduplicate_items =", /* BTREE */
"fastupdate =", "gin_pending_list_limit =", /* GIN */
"buffering =", /* GiST */
"pages_per_range =", "autosummarize =" /* BRIN */

View File

@ -108,6 +108,7 @@ typedef struct BTMetaPageData
* pages */
float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples
* during last cleanup */
bool btm_allequalimage; /* are all columns "equalimage"? */
} BTMetaPageData;
#define BTPageGetMeta(p) \
@ -124,6 +125,14 @@ typedef struct BTMetaPageData
* need to be immediately re-indexed at pg_upgrade. In order to get the
* new heapkeyspace semantics, however, a REINDEX is needed.
*
* Deduplication is safe to use when the btm_allequalimage field is set to
* true. It's safe to read the btm_allequalimage field on version 3, but
* only version 4 indexes make use of deduplication. Even version 4
* indexes created on PostgreSQL v12 will need a REINDEX to make use of
* deduplication, though, since there is no other way to set
* btm_allequalimage to true (pg_upgrade hasn't been taught to set the
* metapage field).
*
* Btree version 2 is mostly the same as version 3. There are two new
* fields in the metapage that were introduced in version 3. A version 2
* metapage will be automatically upgraded to version 3 on the first
@ -156,6 +165,21 @@ typedef struct BTMetaPageData
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
/*
* MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs that
* may be stored on a btree leaf page.  It is used to size the per-page
* temporary buffers used by index scans.
*
* Note: we don't bother considering per-tuple overheads here to keep
* things simple (value is based on how many elements a single array of
* heap TIDs must have to fill the space between the page header and
* special area). The value is slightly higher (i.e. more conservative)
* than necessary as a result, which is considered acceptable.
*/
#define MaxTIDsPerBTreePage \
(int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
sizeof(ItemPointerData))
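/*
 * Editor's note -- illustrative arithmetic (assumes the default 8192 byte
 * BLCKSZ, a 24 byte page header, and a 16 byte BTPageOpaqueData special
 * area; not part of the patch): (8192 - 24 - 16) / 6 = 1358, so index
 * scans size their per-page item arrays for up to 1358 heap TIDs.
 */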
/*
* The leaf-page fillfactor defaults to 90% but is user-adjustable.
* For pages above the leaf level, we use a fixed 70% fillfactor.
@ -230,16 +254,15 @@ typedef struct BTMetaPageData
* tuples (non-pivot tuples). _bt_check_natts() enforces the rules
* described here.
*
* Non-pivot tuple format:
* Non-pivot tuple format (plain/non-posting variant):
*
* t_tid | t_info | key values | INCLUDE columns, if any
*
* t_tid points to the heap TID, which is a tiebreaker key column as of
* BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never
* set for non-pivot tuples.
* BTREE_VERSION 4.
*
* All other types of index tuples ("pivot" tuples) only have key columns,
* since pivot tuples only exist to represent how the key space is
* Non-pivot tuples complement pivot tuples, which only have key columns.
* The sole purpose of pivot tuples is to represent how the key space is
* separated. In general, any B-Tree index that has more than one level
* (i.e. any index that does not just consist of a metapage and a single
* leaf root page) must have some number of pivot tuples, since pivot
@ -264,7 +287,8 @@ typedef struct BTMetaPageData
* INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
* TID column sometimes stored in pivot tuples -- that's represented by
* the presence of BT_PIVOT_HEAP_TID_ATTR. The INDEX_ALT_TID_MASK bit in
* t_info is always set on BTREE_VERSION 4 pivot tuples.
* t_info is always set on BTREE_VERSION 4 pivot tuples, since
* BTreeTupleIsPivot() must work reliably on heapkeyspace versions.
*
* In version 3 indexes, the INDEX_ALT_TID_MASK flag might not be set in
* pivot tuples. In that case, the number of key columns is implicitly
@ -279,90 +303,256 @@ typedef struct BTMetaPageData
* The 12 least significant offset bits from t_tid are used to represent
* the number of columns in INDEX_ALT_TID_MASK tuples, leaving 4 status
* bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
* future use. BT_N_KEYS_OFFSET_MASK should be large enough to store any
* number of columns/attributes <= INDEX_MAX_KEYS.
* future use. BT_OFFSET_MASK should be large enough to store any number
* of columns/attributes <= INDEX_MAX_KEYS.
*
* Sometimes non-pivot tuples also use a representation that repurposes
* t_tid to store metadata rather than a TID. PostgreSQL v13 introduced a
* new non-pivot tuple format to support deduplication: posting list
* tuples. Deduplication merges together multiple equal non-pivot tuples
* into a logically equivalent, space efficient representation. A posting
* list is an array of ItemPointerData elements. Non-pivot tuples are
* merged together to form posting list tuples lazily, at the point where
* we'd otherwise have to split a leaf page.
*
* Posting tuple format (alternative non-pivot tuple representation):
*
* t_tid | t_info | key values | posting list (TID array)
*
* Posting list tuples are recognized as such by having the
* INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status
* bit set in t_tid. These flags redefine the content of the posting
* tuple's t_tid to store an offset to the posting list, as well as the
* total number of posting list array elements.
*
* The 12 least significant offset bits from t_tid are used to represent
* the number of posting items present in the tuple, leaving 4 status
* bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
* future use. As in any non-pivot tuple, the number of columns stored is
* always implicitly the total number in the index (in practice there can
* never be non-key columns stored, since deduplication is not supported
* with INCLUDE indexes). BT_OFFSET_MASK should be large enough to store
* any number of posting list TIDs that might be present in a tuple (since
* tuple size is subject to the INDEX_SIZE_MASK limit).
*
* Note well: The macros that deal with the number of attributes in tuples
* assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple,
* and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot
* tuple (or must have the same number of attributes as the index has
* generally in the case of !heapkeyspace indexes). They will need to be
* updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK
* for something else.
* assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or
* non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set
* must be a non-pivot tuple (or must have the same number of attributes as
* the index has generally in the case of !heapkeyspace indexes).
*/
#define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT
/* Item pointer offset bits */
#define BT_RESERVED_OFFSET_MASK 0xF000
#define BT_N_KEYS_OFFSET_MASK 0x0FFF
#define BT_OFFSET_MASK 0x0FFF
#define BT_PIVOT_HEAP_TID_ATTR 0x1000
/* Get/set downlink block number in pivot tuple */
#define BTreeTupleGetDownLink(itup) \
ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
#define BTreeTupleSetDownLink(itup, blkno) \
ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno))
#define BT_IS_POSTING 0x2000
/*
* Get/set leaf page highkey's link. During the second phase of deletion, the
* target leaf page's high key may point to an ancestor page (at all other
* times, the leaf level high key's link is not used). See the nbtree README
* for full details.
* Note: BTreeTupleIsPivot() can have false negatives (but not false
* positives) when used with !heapkeyspace indexes
*/
#define BTreeTupleGetTopParent(itup) \
ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
#define BTreeTupleSetTopParent(itup, blkno) \
do { \
ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)); \
BTreeTupleSetNAtts((itup), 0); \
} while(0)
static inline bool
BTreeTupleIsPivot(IndexTuple itup)
{
if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
return false;
/* absence of BT_IS_POSTING in offset number indicates pivot tuple */
if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0)
return false;
return true;
}
static inline bool
BTreeTupleIsPosting(IndexTuple itup)
{
if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
return false;
/* presence of BT_IS_POSTING in offset number indicates posting tuple */
if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0)
return false;
return true;
}
static inline void
BTreeTupleSetPosting(IndexTuple itup, int nhtids, int postingoffset)
{
Assert(nhtids > 1 && (nhtids & BT_OFFSET_MASK) == nhtids);
Assert(postingoffset == MAXALIGN(postingoffset));
Assert(postingoffset < INDEX_SIZE_MASK);
itup->t_info |= INDEX_ALT_TID_MASK;
ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING));
ItemPointerSetBlockNumber(&itup->t_tid, postingoffset);
}
static inline uint16
BTreeTupleGetNPosting(IndexTuple posting)
{
OffsetNumber existing;
Assert(BTreeTupleIsPosting(posting));
existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid);
return (existing & BT_OFFSET_MASK);
}
static inline uint32
BTreeTupleGetPostingOffset(IndexTuple posting)
{
Assert(BTreeTupleIsPosting(posting));
return ItemPointerGetBlockNumberNoCheck(&posting->t_tid);
}
static inline ItemPointer
BTreeTupleGetPosting(IndexTuple posting)
{
return (ItemPointer) ((char *) posting +
BTreeTupleGetPostingOffset(posting));
}
static inline ItemPointer
BTreeTupleGetPostingN(IndexTuple posting, int n)
{
return BTreeTupleGetPosting(posting) + n;
}
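/*
 * Illustrative sketch (not part of the original patch): one way a
 * hypothetical caller could visit every heap TID carried by a non-pivot
 * tuple, whether it is a plain tuple (single TID in t_tid) or a posting
 * list tuple. Assumes only the accessors defined above.
 */
#ifdef NOT_USED
static inline void
BTreeTupleVisitAllTIDs(IndexTuple itup, void (*visit) (ItemPointer htid))
{
    Assert(!BTreeTupleIsPivot(itup));

    if (BTreeTupleIsPosting(itup))
    {
        uint16      nposting = BTreeTupleGetNPosting(itup);

        /* Posting list TIDs are stored in ascending order */
        for (int i = 0; i < nposting; i++)
            visit(BTreeTupleGetPostingN(itup, i));
    }
    else
        visit(&itup->t_tid);
}
#endif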
/*
* Get/set number of attributes within B-tree index tuple.
* Get/set downlink block number in pivot tuple.
*
* Note: Cannot assert that tuple is a pivot tuple. If we did so then
* !heapkeyspace indexes would exhibit false positive assertion failures.
*/
static inline BlockNumber
BTreeTupleGetDownLink(IndexTuple pivot)
{
return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid);
}
static inline void
BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
{
ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
}
/*
* Get number of attributes within tuple.
*
* Note that this does not include an implicit tiebreaker heap TID
* attribute, if any. Note also that the number of key attributes must be
* explicitly represented in all heapkeyspace pivot tuples.
*
* Note: This is defined as a macro rather than an inline function to
* avoid including rel.h.
*/
#define BTreeTupleGetNAtts(itup, rel) \
( \
(itup)->t_info & INDEX_ALT_TID_MASK ? \
(BTreeTupleIsPivot(itup)) ? \
( \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
) \
: \
IndexRelationGetNumberOfAttributes(rel) \
)
#define BTreeTupleSetNAtts(itup, n) \
do { \
(itup)->t_info |= INDEX_ALT_TID_MASK; \
ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
} while(0)
/*
* Get tiebreaker heap TID attribute, if any. Macro works with both pivot
* and non-pivot tuples, despite differences in how heap TID is represented.
* Set number of attributes in tuple, making it into a pivot tuple
*/
#define BTreeTupleGetHeapTID(itup) \
( \
(itup)->t_info & INDEX_ALT_TID_MASK && \
(ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_PIVOT_HEAP_TID_ATTR) != 0 ? \
( \
(ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
sizeof(ItemPointerData)) \
) \
: (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \
)
static inline void
BTreeTupleSetNAtts(IndexTuple itup, int natts)
{
Assert(natts <= INDEX_MAX_KEYS);
itup->t_info |= INDEX_ALT_TID_MASK;
/* BT_IS_POSTING bit may be unset -- tuple always becomes a pivot tuple */
ItemPointerSetOffsetNumber(&itup->t_tid, natts);
Assert(BTreeTupleIsPivot(itup));
}
/*
* Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK
* representation (currently limited to pivot tuples)
* Set the bit indicating heap TID attribute present in pivot tuple
*/
#define BTreeTupleSetAltHeapTID(itup) \
do { \
Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
ItemPointerSetOffsetNumber(&(itup)->t_tid, \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_PIVOT_HEAP_TID_ATTR); \
} while(0)
static inline void
BTreeTupleSetAltHeapTID(IndexTuple pivot)
{
OffsetNumber existing;
Assert(BTreeTupleIsPivot(pivot));
existing = ItemPointerGetOffsetNumberNoCheck(&pivot->t_tid);
ItemPointerSetOffsetNumber(&pivot->t_tid,
existing | BT_PIVOT_HEAP_TID_ATTR);
}
/*
* Get/set leaf page's "top parent" link from its high key. Used during page
* deletion.
*
* Note: Cannot assert that tuple is a pivot tuple. If we did so then
* !heapkeyspace indexes would exhibit false positive assertion failures.
*/
static inline BlockNumber
BTreeTupleGetTopParent(IndexTuple leafhikey)
{
return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
}
static inline void
BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
{
ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno);
BTreeTupleSetNAtts(leafhikey, 0);
}
/*
* Get tiebreaker heap TID attribute, if any.
*
* This returns the first/lowest heap TID in the case of a posting list tuple.
*/
static inline ItemPointer
BTreeTupleGetHeapTID(IndexTuple itup)
{
if (BTreeTupleIsPivot(itup))
{
/* Pivot tuple heap TID representation? */
if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
BT_PIVOT_HEAP_TID_ATTR) != 0)
return (ItemPointer) ((char *) itup + IndexTupleSize(itup) -
sizeof(ItemPointerData));
/* Heap TID attribute was truncated */
return NULL;
}
else if (BTreeTupleIsPosting(itup))
return BTreeTupleGetPosting(itup);
return &itup->t_tid;
}
/*
* Get maximum heap TID attribute, which could be the only TID in the case of
* a plain non-pivot tuple that does not contain a posting list.
*
* Works with non-pivot tuples only.
*/
static inline ItemPointer
BTreeTupleGetMaxHeapTID(IndexTuple itup)
{
Assert(!BTreeTupleIsPivot(itup));
if (BTreeTupleIsPosting(itup))
{
uint16 nposting = BTreeTupleGetNPosting(itup);
return BTreeTupleGetPostingN(itup, nposting - 1);
}
return &itup->t_tid;
}
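/*
 * Illustrative sketch (hypothetical helper, not from the original patch):
 * the two accessors above bracket the heap TID range covered by any
 * non-pivot tuple. An incoming duplicate whose TID falls inside an
 * existing posting tuple's range is the case that requires a posting
 * list split.
 */
#ifdef NOT_USED
static inline bool
BTreeTupleCoversHeapTID(IndexTuple itup, ItemPointer htid)
{
    Assert(!BTreeTupleIsPivot(itup));

    return ItemPointerCompare(BTreeTupleGetHeapTID(itup), htid) <= 0 &&
        ItemPointerCompare(htid, BTreeTupleGetMaxHeapTID(itup)) <= 0;
}
#endif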
/*
* Operator strategy numbers for B-tree have been moved to access/stratnum.h,
@ -439,6 +629,9 @@ typedef BTStackData *BTStack;
* indexes whose version is >= version 4. It's convenient to keep this close
* by, rather than accessing the metapage repeatedly.
*
* allequalimage is set to indicate that deduplication is safe for the index.
* This is also a property of the index relation rather than an indexscan.
*
* anynullkeys indicates if any of the keys had NULL value when scankey was
* built from index tuple (note that already-truncated tuple key attributes
* set NULL as a placeholder key value, which also affects value of
@ -474,6 +667,7 @@ typedef BTStackData *BTStack;
typedef struct BTScanInsertData
{
bool heapkeyspace;
bool allequalimage;
bool anynullkeys;
bool nextkey;
bool pivotsearch;
@ -512,10 +706,94 @@ typedef struct BTInsertStateData
bool bounds_valid;
OffsetNumber low;
OffsetNumber stricthigh;
/*
* If _bt_binsrch_insert found the location inside an existing posting
* list, save the position inside the list. A -1 sentinel value indicates
* overlap with an existing posting list tuple that has its LP_DEAD bit set.
*/
int postingoff;
} BTInsertStateData;
typedef BTInsertStateData *BTInsertState;
/*
* State used to represent an individual pending tuple during
* deduplication.
*/
typedef struct BTDedupInterval
{
OffsetNumber baseoff;
uint16 nitems;
} BTDedupInterval;
/*
* BTDedupStateData is a working area used during deduplication.
*
* The status info fields track the state of a whole-page deduplication pass.
* State about the current pending posting list is also tracked.
*
* A pending posting list consists of a contiguous group of equal items
* from the page, starting at page offset number 'baseoff'. This is the
* offset number of the "base" tuple for the new posting list. 'nitems' is
* the current total number of existing items from the page that will be
* merged to make a new posting list tuple, including the base tuple item.
* (Existing items may themselves be posting list tuples, or regular
* non-pivot tuples.)
*
* The total size of the existing tuples to be freed when the pending
* posting list is processed is tracked by 'phystupsize'. This information
* allows deduplication to calculate the space saving for each new posting
* list tuple, and for the entire pass over the page as a whole.
*/
typedef struct BTDedupStateData
{
/* Deduplication status info for entire pass over page */
bool deduplicate; /* Still deduplicating page? */
Size maxpostingsize; /* Limit on size of final tuple */
/* Metadata about base tuple of current pending posting list */
IndexTuple base; /* Use to form new posting list */
OffsetNumber baseoff; /* page offset of base */
Size basetupsize; /* base size without original posting list */
/* Other metadata about pending posting list */
ItemPointer htids; /* Heap TIDs in pending posting list */
int nhtids; /* Number of heap TIDs in htids array */
int nitems; /* Number of existing tuples/line pointers */
Size phystupsize; /* Includes line pointer overhead */
/*
* Array of tuples to go on new version of the page. Contains one entry
* for each group of consecutive items. Note that existing tuples that
* will not become posting list tuples do not appear in the array (they
* are implicitly unchanged by deduplication pass).
*/
int nintervals; /* current size of intervals array */
BTDedupInterval intervals[MaxIndexTuplesPerPage];
} BTDedupStateData;
typedef BTDedupStateData *BTDedupState;
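/*
 * Simplified sketch (illustration only) of the loop that drives a
 * whole-page deduplication pass using this state. The real caller
 * (_bt_dedup_one_page) also manages a temporary page, the newitem/size
 * limits, LP_DEAD items, and WAL logging; "keys_equal" is a hypothetical
 * stand-in for the real key comparison.
 */
#ifdef NOT_USED
static void
dedup_pass_sketch(Page newpage, BTDedupState state, Page page,
                  OffsetNumber minoff, OffsetNumber maxoff)
{
    Size        spacesaving = 0;

    for (OffsetNumber offnum = minoff;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        IndexTuple  itup = (IndexTuple) PageGetItem(page,
                                                    PageGetItemId(page, offnum));

        if (offnum == minoff)
        {
            /* Start first pending posting list with first item as base */
            _bt_dedup_start_pending(state, itup, offnum);
        }
        else if (keys_equal(state->base, itup) &&
                 _bt_dedup_save_htid(state, itup))
        {
            /* Tuple's TIDs were folded into the pending posting list */
        }
        else
        {
            /* Write out pending posting list, then start a new one */
            spacesaving += _bt_dedup_finish_pending(newpage, state);
            _bt_dedup_start_pending(state, itup, offnum);
        }
    }

    /* Handle the final pending posting list */
    spacesaving += _bt_dedup_finish_pending(newpage, state);
    /* spacesaving tells the caller whether the pass freed enough space */
}
#endif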
/*
* BTVacuumPostingData is state that represents how to VACUUM a posting list
* tuple when some (though not all) of its TIDs are to be deleted.
*
* Convention is that the itup field is the original posting list tuple on
* input, and the palloc()'d final tuple used to overwrite the existing
* tuple on output.
*/
typedef struct BTVacuumPostingData
{
/* Tuple that will be/was updated */
IndexTuple itup;
OffsetNumber updatedoffset;
/* State needed to describe final itup in WAL */
uint16 ndeletedtids;
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER];
} BTVacuumPostingData;
typedef BTVacuumPostingData *BTVacuumPosting;
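/*
 * Illustrative sketch (not from the original patch): how VACUUM might build
 * one of these for a posting list tuple in which only some TIDs turn out to
 * be dead. "tid_is_dead" is a hypothetical stand-in for the caller's
 * reaping callback. Note that deletetids[] holds offsets into the original
 * posting list, not the TIDs themselves.
 */
#ifdef NOT_USED
static BTVacuumPosting
vacuum_posting_sketch(IndexTuple posting, OffsetNumber updatedoffset)
{
    uint16      nposting = BTreeTupleGetNPosting(posting);
    BTVacuumPosting vacposting;

    /* Worst case: every TID in the posting list is dead */
    vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
                        nposting * sizeof(uint16));
    vacposting->itup = posting;
    vacposting->updatedoffset = updatedoffset;
    vacposting->ndeletedtids = 0;

    for (uint16 i = 0; i < nposting; i++)
    {
        if (tid_is_dead(BTreeTupleGetPostingN(posting, i)))
            vacposting->deletetids[vacposting->ndeletedtids++] = i;
    }

    /* Caller deletes the whole tuple instead when every TID is dead */
    return vacposting;
}
#endif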
/*
* BTScanOpaqueData is the btree-private state needed for an indexscan.
* This consists of preprocessed scan keys (see _bt_preprocess_keys() for
@ -539,7 +817,9 @@ typedef BTInsertStateData *BTInsertState;
* If we are doing an index-only scan, we save the entire IndexTuple for each
* matched item, otherwise only its heap TID and offset. The IndexTuples go
* into a separate workspace array; each BTScanPosItem stores its tuple's
* offset within that array.
* offset within that array. Posting list tuples store a "base" tuple once,
* allowing the same key to be returned for each TID in the posting list
* tuple.
*/
typedef struct BTScanPosItem /* what we remember about each match */
@ -583,7 +863,7 @@ typedef struct BTScanPosData
int lastItem; /* last valid index in items[] */
int itemIndex; /* current index in items[] */
BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
BTScanPosItem items[MaxTIDsPerBTreePage]; /* MUST BE LAST */
} BTScanPosData;
typedef BTScanPosData *BTScanPos;
@ -691,6 +971,7 @@ typedef struct BTOptions
int fillfactor; /* page fill factor in percent (0..100) */
/* fraction of newly inserted tuples prior to trigger index cleanup */
float8 vacuum_cleanup_index_scale_factor;
bool deduplicate_items; /* Try to deduplicate items? */
} BTOptions;
#define BTGetFillFactor(relation) \
@ -701,6 +982,11 @@ typedef struct BTOptions
BTREE_DEFAULT_FILLFACTOR)
#define BTGetTargetPageFreeSpace(relation) \
(BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
#define BTGetDeduplicateItems(relation) \
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
((relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
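/*
 * Illustrative sketch (not a verbatim excerpt): deduplication of a leaf
 * page is only attempted when both the reloption and the metapage-derived
 * allequalimage flag (carried in the insertion scankey) allow it. When a
 * check like this passes at the point a leaf page would otherwise split,
 * the caller tries _bt_dedup_one_page() before falling back to a split.
 */
#ifdef NOT_USED
static bool
dedup_pass_allowed_sketch(Relation rel, BTScanInsert itup_key)
{
    return itup_key->allequalimage && BTGetDeduplicateItems(rel);
}
#endif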
/*
* Constant definition for progress reporting. Phase numbers must match
@ -747,6 +1033,22 @@ extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
extern void _bt_parallel_done(IndexScanDesc scan);
extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
/*
* prototypes for functions in nbtdedup.c
*/
extern void _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
IndexTuple newitem, Size newitemsz,
bool checkingunique);
extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
OffsetNumber baseoff);
extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup);
extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state);
extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids,
int nhtids);
extern void _bt_update_posting(BTVacuumPosting vacposting);
extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting,
int postingoff);
/*
* prototypes for functions in nbtinsert.c
*/
@ -765,14 +1067,16 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page page,
/*
* prototypes for functions in nbtpage.c
*/
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
bool allequalimage);
extern void _bt_update_meta_cleanup_info(Relation rel,
TransactionId oldestBtpoXact, float8 numHeapTuples);
extern void _bt_upgrademetapage(Page page);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern int _bt_getrootheight(Relation rel);
extern bool _bt_heapkeyspace(Relation rel);
extern void _bt_metaversion(Relation rel, bool *heapkeyspace,
bool *allequalimage);
extern void _bt_checkpage(Relation rel, Buffer buf);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
@ -781,7 +1085,8 @@ extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
OffsetNumber *deletable, int ndeletable);
OffsetNumber *deletable, int ndeletable,
BTVacuumPosting *updatable, int nupdatable);
extern void _bt_delitems_delete(Relation rel, Buffer buf,
OffsetNumber *deletable, int ndeletable,
Relation heapRel);


@ -28,7 +28,8 @@
#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
/* 0x50 and 0x60 are unused */
#define XLOG_BTREE_INSERT_POST 0x50 /* add index tuple with posting split */
#define XLOG_BTREE_DEDUP 0x60 /* deduplicate tuples for a page */
#define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */
#define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */
#define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */
@ -53,21 +54,34 @@ typedef struct xl_btree_metadata
uint32 fastlevel;
TransactionId oldest_btpo_xact;
float8 last_cleanup_num_heap_tuples;
bool allequalimage;
} xl_btree_metadata;
/*
* This is what we need to know about simple (without split) insert.
*
* This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
* Note that INSERT_META implies it's not a leaf page.
* This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
* INSERT_POST. Note that INSERT_META and INSERT_UPPER imply it's not a
* leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf
* page.
*
* Backup Blk 0: original page (data contains the inserted tuple)
* Backup Blk 0: original page
* Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
* Backup Blk 2: xl_btree_metadata, if INSERT_META
*
* Note: The new tuple is actually the "original" new item in the posting
* list split insert case (i.e. the INSERT_POST case). A split offset for
* the posting list is logged before the original new item. Recovery needs
* both, since it must do an in-place update of the existing posting list
* that was split as an extra step. Also, recovery generates a "final"
* newitem. See _bt_swap_posting() for details on posting list splits.
*/
typedef struct xl_btree_insert
{
OffsetNumber offnum;
/* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */
/* NEW TUPLE ALWAYS FOLLOWS AT THE END */
} xl_btree_insert;
#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
@ -92,8 +106,37 @@ typedef struct xl_btree_insert
* Backup Blk 0: original page / new left page
*
* The left page's data portion contains the new item, if it's the _L variant.
* An IndexTuple representing the high key of the left page must follow with
* either variant.
* _R variant split records generally do not have a newitem (_R variant leaf
* page split records that must deal with a posting list split will include an
* explicit newitem, though it is never used on the right page -- it is
* actually an orignewitem needed to update the existing posting list). The new
* high key of the left/original page appears last of all (and must always be
* present).
*
* Page split records that need the REDO routine to deal with a posting list
* split directly will have an explicit newitem, which is actually an
* orignewitem (the newitem as it was before the posting list split, not
* after). A posting list split always has a newitem that comes immediately
* after the posting list being split (which would have overlapped with
* orignewitem prior to split). Usually REDO must deal with posting list
* splits with an _L variant page split record, and usually both the new
* posting list and the final newitem go on the left page (the existing
* posting list will be inserted instead of the old, and the final newitem
* will be inserted next to that). However, _R variant split records will
* include an orignewitem when the split point for the page happens to have a
* lastleft tuple that is also the posting list being split (leaving newitem
* as the page split's firstright tuple). The existence of this corner case
* does not change the basic fact about newitem/orignewitem for the REDO
* routine: it is always state used for the left page alone. (This is why the
* record's postingoff field isn't a reliable indicator of whether or not a
* posting list split occurred during the page split; a non-zero value merely
* indicates that the REDO routine must reconstruct a new posting list tuple
* that is needed for the left page.)
*
* This posting list split handling is equivalent to the xl_btree_insert REDO
* routine's INSERT_POST handling. While the details are more complicated
* here, the concept and goals are exactly the same. See _bt_swap_posting()
* for details on posting list splits.
*
* Backup Blk 1: new right page
*
@ -111,15 +154,33 @@ typedef struct xl_btree_split
{
uint32 level; /* tree level of page being split */
OffsetNumber firstright; /* first item moved to right page */
OffsetNumber newitemoff; /* new item's offset (useful for _L variant) */
OffsetNumber newitemoff; /* new item's offset */
uint16 postingoff; /* offset inside orig posting tuple */
} xl_btree_split;
#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber))
#define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16))
/*
* When a page is deduplicated, consecutive groups of tuples with equal keys are
* merged together into posting list tuples.
*
* The WAL record represents a deduplication pass for a leaf page. An array
* of BTDedupInterval structs follows.
*/
typedef struct xl_btree_dedup
{
uint16 nintervals;
/* DEDUPLICATION INTERVALS FOLLOW */
} xl_btree_dedup;
#define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))
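/*
 * Illustrative sketch (not a verbatim excerpt from the REDO routine): the
 * intervals can be read straight from the record's main data, assuming
 * access/nbtree.h's BTDedupInterval is in scope.
 */
#ifdef NOT_USED
static void
dedup_record_walk_sketch(XLogReaderState *record)
{
    xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record);
    BTDedupInterval *intervals =
        (BTDedupInterval *) (XLogRecGetData(record) + SizeOfBtreeDedup);
    int         nmerged = 0;

    for (uint16 i = 0; i < xlrec->nintervals; i++)
    {
        /* Each interval describes one group merged into a posting list */
        nmerged += intervals[i].nitems;
    }
}
#endif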
/*
* This is what we need to know about delete of individual leaf index tuples.
* The WAL record can represent deletion of any number of index tuples on a
* single index page when *not* executed by VACUUM.
* single index page when *not* executed by VACUUM. Deletion of a subset of
* the TIDs within a posting list tuple is not supported.
*
* Backup Blk 0: index page
*/
@ -150,21 +211,43 @@ typedef struct xl_btree_reuse_page
#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
/*
* This is what we need to know about vacuum of individual leaf index tuples.
* The WAL record can represent deletion of any number of index tuples on a
* single index page when executed by VACUUM.
* This is what we need to know about which TIDs to remove from an individual
* posting list tuple during vacuuming. An array of these may appear at the
* end of xl_btree_vacuum records.
*/
typedef struct xl_btree_update
{
uint16 ndeletedtids;
/* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
} xl_btree_update;
#define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))
/*
* This is what we need to know about a VACUUM of a leaf page. The WAL record
* can represent deletion of any number of index tuples on a single index page
* when executed by VACUUM. It can also support "updates" of index tuples,
* which is how deletes of a subset of TIDs contained in an existing posting
* list tuple are implemented. (Updates are only used when there will be some
* remaining TIDs once VACUUM finishes; otherwise the posting list tuple can
* just be deleted).
*
* Note that the WAL record in any vacuum of an index must have at least one
* item to delete.
* Updated posting list tuples are represented using xl_btree_update metadata.
* The REDO routine uses each xl_btree_update (plus its corresponding original
* index tuple from the target leaf page) to generate the final updated tuple.
*/
typedef struct xl_btree_vacuum
{
uint32 ndeleted;
uint16 ndeleted;
uint16 nupdated;
/* DELETED TARGET OFFSET NUMBERS FOLLOW */
/* UPDATED TARGET OFFSET NUMBERS FOLLOW */
/* UPDATED TUPLES METADATA ARRAY FOLLOWS */
} xl_btree_vacuum;
#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(uint32))
#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))
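/*
 * Illustrative sketch (not a verbatim excerpt): how the variable-size
 * payload described above can be walked, assuming "ptr" already points at
 * the deleted target offset numbers for the affected leaf page.
 */
#ifdef NOT_USED
static void
vacuum_payload_walk_sketch(xl_btree_vacuum *xlrec, char *ptr)
{
    OffsetNumber *deleted = (OffsetNumber *) ptr;
    OffsetNumber *updated = deleted + xlrec->ndeleted;
    char       *updates = (char *) (updated + xlrec->nupdated);

    for (uint16 i = 0; i < xlrec->nupdated; i++)
    {
        xl_btree_update *update = (xl_btree_update *) updates;

        /* The uint16 posting list offsets to delete follow each header */
        updates += SizeOfBtreeUpdate +
            update->ndeletedtids * sizeof(uint16);
    }
}
#endif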
/*
* This is what we need to know about marking an empty branch for deletion.
@ -245,6 +328,8 @@ typedef struct xl_btree_newroot
extern void btree_redo(XLogReaderState *record);
extern void btree_desc(StringInfo buf, XLogReaderState *record);
extern const char *btree_identify(uint8 info);
extern void btree_xlog_startup(void);
extern void btree_xlog_cleanup(void);
extern void btree_mask(char *pagedata, BlockNumber blkno);
#endif /* NBTXLOG_H */


@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL,
PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask)
PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask)
PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)


@ -31,7 +31,7 @@
/*
* Each page of XLOG file has a header like this:
*/
#define XLOG_PAGE_MAGIC 0xD104 /* can be used as WAL version indicator */
#define XLOG_PAGE_MAGIC 0xD105 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{


@ -200,7 +200,7 @@ reset enable_indexscan;
reset enable_bitmapscan;
-- Also check LIKE optimization with binary-compatible cases
create temp table btree_bpchar (f1 text collate "C");
create index on btree_bpchar(f1 bpchar_ops);
create index on btree_bpchar(f1 bpchar_ops) WITH (deduplicate_items=on);
insert into btree_bpchar values ('foo'), ('fool'), ('bar'), ('quux');
-- doesn't match index:
explain (costs off)
@ -266,6 +266,24 @@ select * from btree_bpchar where f1::bpchar like 'foo%';
fool
(2 rows)
-- get test coverage for "single value" deduplication strategy:
insert into btree_bpchar select 'foo' from generate_series(1,1500);
--
-- Perform unique checking, with and without the use of deduplication
--
CREATE TABLE dedup_unique_test_table (a int) WITH (autovacuum_enabled=false);
CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplicate_items=on);
CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH (deduplicate_items=off);
-- Generate enough garbage tuples in the index to ensure that even the unique index
-- with deduplication enabled has to check multiple leaf pages during unique
-- checking (at least with a BLCKSZ of 8192 or less)
DO $$
BEGIN
FOR r IN 1..1350 LOOP
DELETE FROM dedup_unique_test_table;
INSERT INTO dedup_unique_test_table SELECT 1;
END LOOP;
END$$;
--
-- Test B-tree fast path (cache rightmost leaf page) optimization.
--

View File

@ -86,7 +86,7 @@ reset enable_bitmapscan;
-- Also check LIKE optimization with binary-compatible cases
create temp table btree_bpchar (f1 text collate "C");
create index on btree_bpchar(f1 bpchar_ops);
create index on btree_bpchar(f1 bpchar_ops) WITH (deduplicate_items=on);
insert into btree_bpchar values ('foo'), ('fool'), ('bar'), ('quux');
-- doesn't match index:
explain (costs off)
@ -103,6 +103,26 @@ explain (costs off)
select * from btree_bpchar where f1::bpchar like 'foo%';
select * from btree_bpchar where f1::bpchar like 'foo%';
-- get test coverage for "single value" deduplication strategy:
insert into btree_bpchar select 'foo' from generate_series(1,1500);
--
-- Perform unique checking, with and without the use of deduplication
--
CREATE TABLE dedup_unique_test_table (a int) WITH (autovacuum_enabled=false);
CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplicate_items=on);
CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH (deduplicate_items=off);
-- Generate enough garbage tuples in the index to ensure that even the unique index
-- with deduplication enabled has to check multiple leaf pages during unique
-- checking (at least with a BLCKSZ of 8192 or less)
DO $$
BEGIN
FOR r IN 1..1350 LOOP
DELETE FROM dedup_unique_test_table;
INSERT INTO dedup_unique_test_table SELECT 1;
END LOOP;
END$$;
--
-- Test B-tree fast path (cache rightmost leaf page) optimization.
--