Refactor nbtree insertion scankeys.

Use dedicated struct to represent nbtree insertion scan keys.  Having a
dedicated struct makes the difference between search type scankeys and
insertion scankeys a lot clearer, and simplifies the signature of
several related functions.  This is based on a suggestion by Andrey
Lepikhov.

Streamline how unique index insertions cache binary search progress.
Cache the state of in-progress binary searches within _bt_check_unique()
for later instead of having callers avoid repeating the binary search in
an ad-hoc manner.  This makes it easy to add a new optimization:
_bt_check_unique() now falls out of its loop immediately in the common
case where it's already clear that there couldn't possibly be a
duplicate.

The new _bt_check_unique() scheme makes it a lot easier to manage cached
binary search effort afterwards, from within _bt_findinsertloc().  This
is needed for the upcoming patch to make nbtree tuples unique by
treating heap TID as a final tiebreaker column.  Unique key binary
searches need to restore lower and upper bounds.  They cannot simply
continue to use the >= lower bound as the offset to insert at, because
the heap TID tiebreaker column must be used in comparisons for the
restored binary search (unlike the original _bt_check_unique() binary
search, where scankey's heap TID column must be omitted).

Author: Peter Geoghegan, Heikki Linnakangas
Reviewed-By: Heikki Linnakangas, Andrey Lepikhov
Discussion: https://postgr.es/m/CAH2-WzmE6AhUdk9NdWBf4K3HjWXZBX3+umC7mH7+WDrKcRtsOw@mail.gmail.com
Committed by Peter Geoghegan on 2019-03-20 09:30:57 -07:00
commit e5adcb789d (parent 550b9d26f8)
9 changed files with 532 additions and 390 deletions
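To make the reworked interface concrete, here is a minimal sketch in C of the new insertion flow, condensed from _bt_doinsert() as changed in nbtinsert.c below.  The helper name sketch_insert_descend is illustrative only; _bt_check_unique() and _bt_findinsertloc() are static to nbtinsert.c, so this reads as a summary of that file's flow rather than standalone code.  Error handling, the rightmost-page fastpath, conflict waits, and the final _bt_insertonpg() call are omitted.

#include "postgres.h"
#include "access/nbtree.h"

/* Illustrative condensation of _bt_doinsert(); not part of the patch. */
static OffsetNumber
sketch_insert_descend(Relation rel, IndexTuple itup, Relation heapRel,
                      IndexUniqueCheck checkUnique)
{
    BTInsertStateData insertstate;
    BTScanInsert itup_key;
    BTStack     stack;
    Buffer      buf;
    bool        checkingunique = (checkUnique != UNIQUE_CHECK_NO);

    /* One palloc'd struct now carries comparators, key data, and nextkey */
    itup_key = _bt_mkscankey(rel, itup);

    insertstate.itup = itup;
    insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
    insertstate.itup_key = itup_key;
    insertstate.bounds_valid = false;   /* no cached binary search yet */
    insertstate.buf = InvalidBuffer;

    /* Descend to the first leaf page the new tuple could go on */
    stack = _bt_search(rel, itup_key, &buf, BT_WRITE, NULL);
    insertstate.buf = buf;

    if (checkingunique)
    {
        bool        is_unique;
        uint32      speculativeToken;

        /* Caches low/stricthigh bounds in insertstate as a side effect */
        (void) _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
                                &is_unique, &speculativeToken);
    }

    /*
     * Reuses the cached bounds rather than repeating the binary search.
     * (Real code goes on to _bt_insertonpg(), frees stack, and pfrees
     * itup_key.)
     */
    return _bt_findinsertloc(rel, &insertstate, checkingunique, stack, heapRel);
}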

contrib/amcheck/verify_nbtree.c

@ -127,9 +127,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel,
static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
BtreeLevel level);
static void bt_target_page_check(BtreeCheckState *state);
static ScanKey bt_right_page_check_scankey(BtreeCheckState *state);
static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
ScanKey targetkey);
static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state);
static void bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
BlockNumber childblock);
static void bt_downlink_missing_check(BtreeCheckState *state);
static void bt_tuple_present_callback(Relation index, HeapTuple htup,
Datum *values, bool *isnull,
@ -139,14 +139,14 @@ static IndexTuple bt_normalize_tuple(BtreeCheckState *state,
static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
OffsetNumber offset);
static inline bool invariant_leq_offset(BtreeCheckState *state,
ScanKey key,
BTScanInsert key,
OffsetNumber upperbound);
static inline bool invariant_geq_offset(BtreeCheckState *state,
ScanKey key,
BTScanInsert key,
OffsetNumber lowerbound);
static inline bool invariant_leq_nontarget_offset(BtreeCheckState *state,
Page other,
ScanKey key,
BTScanInsert key,
Page nontarget,
OffsetNumber upperbound);
static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum);
@ -838,8 +838,8 @@ bt_target_page_check(BtreeCheckState *state)
{
ItemId itemid;
IndexTuple itup;
ScanKey skey;
size_t tupsize;
BTScanInsert skey;
CHECK_FOR_INTERRUPTS();
@ -1030,7 +1030,7 @@ bt_target_page_check(BtreeCheckState *state)
*/
else if (offset == max)
{
ScanKey rightkey;
BTScanInsert rightkey;
/* Get item in next/right page */
rightkey = bt_right_page_check_scankey(state);
@ -1082,7 +1082,7 @@ bt_target_page_check(BtreeCheckState *state)
{
BlockNumber childblock = BTreeInnerTupleGetDownLink(itup);
bt_downlink_check(state, childblock, skey);
bt_downlink_check(state, skey, childblock);
}
}
@ -1111,11 +1111,12 @@ bt_target_page_check(BtreeCheckState *state)
* Note that !readonly callers must reverify that target page has not
* been concurrently deleted.
*/
static ScanKey
static BTScanInsert
bt_right_page_check_scankey(BtreeCheckState *state)
{
BTPageOpaque opaque;
ItemId rightitem;
IndexTuple firstitup;
BlockNumber targetnext;
Page rightpage;
OffsetNumber nline;
@ -1303,8 +1304,8 @@ bt_right_page_check_scankey(BtreeCheckState *state)
* Return first real item scankey. Note that this relies on right page
* memory remaining allocated.
*/
return _bt_mkscankey(state->rel,
(IndexTuple) PageGetItem(rightpage, rightitem));
firstitup = (IndexTuple) PageGetItem(rightpage, rightitem);
return _bt_mkscankey(state->rel, firstitup);
}
/*
@ -1317,8 +1318,8 @@ bt_right_page_check_scankey(BtreeCheckState *state)
* verification this way around is much more practical.
*/
static void
bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
ScanKey targetkey)
bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
BlockNumber childblock)
{
OffsetNumber offset;
OffsetNumber maxoffset;
@ -1423,8 +1424,7 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
if (offset_is_negative_infinity(copaque, offset))
continue;
if (!invariant_leq_nontarget_offset(state, child,
targetkey, offset))
if (!invariant_leq_nontarget_offset(state, targetkey, child, offset))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("down-link lower bound invariant violated for index \"%s\"",
@ -1864,13 +1864,12 @@ offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)
* to corruption.
*/
static inline bool
invariant_leq_offset(BtreeCheckState *state, ScanKey key,
invariant_leq_offset(BtreeCheckState *state, BTScanInsert key,
OffsetNumber upperbound)
{
int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
int32 cmp;
cmp = _bt_compare(state->rel, nkeyatts, key, state->target, upperbound);
cmp = _bt_compare(state->rel, key, state->target, upperbound);
return cmp <= 0;
}
@ -1883,13 +1882,12 @@ invariant_leq_offset(BtreeCheckState *state, ScanKey key,
* to corruption.
*/
static inline bool
invariant_geq_offset(BtreeCheckState *state, ScanKey key,
invariant_geq_offset(BtreeCheckState *state, BTScanInsert key,
OffsetNumber lowerbound)
{
int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
int32 cmp;
cmp = _bt_compare(state->rel, nkeyatts, key, state->target, lowerbound);
cmp = _bt_compare(state->rel, key, state->target, lowerbound);
return cmp >= 0;
}
@ -1905,14 +1903,12 @@ invariant_geq_offset(BtreeCheckState *state, ScanKey key,
* to corruption.
*/
static inline bool
invariant_leq_nontarget_offset(BtreeCheckState *state,
Page nontarget, ScanKey key,
OffsetNumber upperbound)
invariant_leq_nontarget_offset(BtreeCheckState *state, BTScanInsert key,
Page nontarget, OffsetNumber upperbound)
{
int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
int32 cmp;
cmp = _bt_compare(state->rel, nkeyatts, key, nontarget, upperbound);
cmp = _bt_compare(state->rel, key, nontarget, upperbound);
return cmp <= 0;
}

src/backend/access/nbtree/README

@ -598,19 +598,22 @@ scankey point to comparison functions that return boolean, such as int4lt.
There might be more than one scankey entry for a given index column, or
none at all. (We require the keys to appear in index column order, but
the order of multiple keys for a given column is unspecified.) An
insertion scankey uses the same array-of-ScanKey data structure, but the
sk_func pointers point to btree comparison support functions (ie, 3-way
comparators that return int4 values interpreted as <0, =0, >0). In an
insertion scankey there is exactly one entry per index column. Insertion
scankeys are built within the btree code (eg, by _bt_mkscankey()) and are
used to locate the starting point of a scan, as well as for locating the
place to insert a new index tuple. (Note: in the case of an insertion
scankey built from a search scankey, there might be fewer keys than
index columns, indicating that we have no constraints for the remaining
index columns.) After we have located the starting point of a scan, the
original search scankey is consulted as each index entry is sequentially
scanned to decide whether to return the entry and whether the scan can
stop (see _bt_checkkeys()).
insertion scankey ("BTScanInsert" data structure) uses a similar
array-of-ScanKey data structure, but the sk_func pointers point to btree
comparison support functions (ie, 3-way comparators that return int4 values
interpreted as <0, =0, >0). In an insertion scankey there is at most one
entry per index column. There is also other data about the rules used to
locate where to begin the scan, such as whether or not the scan is a
"nextkey" scan. Insertion scankeys are built within the btree code (eg, by
_bt_mkscankey()) and are used to locate the starting point of a scan, as
well as for locating the place to insert a new index tuple. (Note: in the
case of an insertion scankey built from a search scankey or built from a
truncated pivot tuple, there might be fewer keys than index columns,
indicating that we have no constraints for the remaining index columns.)
After we have located the starting point of a scan, the original search
scankey is consulted as each index entry is sequentially scanned to decide
whether to return the entry and whether the scan can stop (see
_bt_checkkeys()).
We use term "pivot" index tuples to distinguish tuples which don't point
to heap tuples, but rather used for tree navigation. Pivot tuples includes

src/backend/access/nbtree/nbtinsert.c

@ -51,19 +51,16 @@ typedef struct
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
Relation heapRel, Buffer buf, OffsetNumber offset,
ScanKey itup_scankey,
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
Relation heapRel,
IndexUniqueCheck checkUnique, bool *is_unique,
uint32 *speculativeToken);
static void _bt_findinsertloc(Relation rel,
Buffer *bufptr,
OffsetNumber *offsetptr,
int keysz,
ScanKey scankey,
IndexTuple newtup,
static OffsetNumber _bt_findinsertloc(Relation rel,
BTInsertState insertstate,
bool checkingunique,
BTStack stack,
Relation heapRel);
static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf,
BTStack stack,
IndexTuple itup,
@ -83,8 +80,8 @@ static void _bt_checksplitloc(FindSplitData *state,
int dataitemstoleft, Size firstoldonrightsz);
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
OffsetNumber itup_off);
static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
int keysz, ScanKey scankey);
static bool _bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key,
Page page, OffsetNumber offnum);
static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
/*
@ -110,18 +107,26 @@ _bt_doinsert(Relation rel, IndexTuple itup,
IndexUniqueCheck checkUnique, Relation heapRel)
{
bool is_unique = false;
int indnkeyatts;
ScanKey itup_scankey;
BTInsertStateData insertstate;
BTScanInsert itup_key;
BTStack stack = NULL;
Buffer buf;
OffsetNumber offset;
bool fastpath;
indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
Assert(indnkeyatts != 0);
bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
/* we need an insertion scan key to do our search, so build one */
itup_scankey = _bt_mkscankey(rel, itup);
itup_key = _bt_mkscankey(rel, itup);
/*
* Fill in the BTInsertState working area, to track the current page and
* position within the page to insert on
*/
insertstate.itup = itup;
/* PageAddItem will MAXALIGN(), but be consistent */
insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
insertstate.itup_key = itup_key;
insertstate.bounds_valid = false;
insertstate.buf = InvalidBuffer;
/*
* It's very common to have an index on an auto-incremented or
@ -144,10 +149,8 @@ _bt_doinsert(Relation rel, IndexTuple itup,
*/
top:
fastpath = false;
offset = InvalidOffsetNumber;
if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
{
Size itemsz;
Page page;
BTPageOpaque lpageop;
@ -166,9 +169,6 @@ top:
page = BufferGetPage(buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
itemsz = IndexTupleSize(itup);
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this
* but we need to be consistent */
/*
* Check if the page is still the rightmost leaf page, has enough
@ -177,10 +177,9 @@ top:
*/
if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
!P_IGNORE(lpageop) &&
(PageGetFreeSpace(page) > itemsz) &&
(PageGetFreeSpace(page) > insertstate.itemsz) &&
PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
_bt_compare(rel, indnkeyatts, itup_scankey, page,
P_FIRSTDATAKEY(lpageop)) > 0)
_bt_compare(rel, itup_key, page, P_FIRSTDATAKEY(lpageop)) > 0)
{
/*
* The right-most block should never have an incomplete split.
@ -219,10 +218,12 @@ top:
* Find the first page containing this key. Buffer returned by
* _bt_search() is locked in exclusive mode.
*/
stack = _bt_search(rel, indnkeyatts, itup_scankey, false, &buf, BT_WRITE,
NULL);
stack = _bt_search(rel, itup_key, &buf, BT_WRITE, NULL);
}
insertstate.buf = buf;
buf = InvalidBuffer; /* insertstate.buf now owns the buffer */
/*
* If we're not allowing duplicates, make sure the key isn't already in
* the index.
@ -244,19 +245,19 @@ top:
* let the tuple in and return false for possibly non-unique, or true for
* definitely unique.
*/
if (checkUnique != UNIQUE_CHECK_NO)
if (checkingunique)
{
TransactionId xwait;
uint32 speculativeToken;
offset = _bt_binsrch(rel, buf, indnkeyatts, itup_scankey, false);
xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey,
checkUnique, &is_unique, &speculativeToken);
xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
&is_unique, &speculativeToken);
if (TransactionIdIsValid(xwait))
{
/* Have to wait for the other guy ... */
_bt_relbuf(rel, buf);
_bt_relbuf(rel, insertstate.buf);
insertstate.buf = InvalidBuffer;
/*
* If it's a speculative insertion, wait for it to finish (ie. to
@ -277,6 +278,8 @@ top:
if (checkUnique != UNIQUE_CHECK_EXISTING)
{
OffsetNumber newitemoff;
/*
* The only conflict predicate locking cares about for indexes is when
* an index tuple insert conflicts with an existing lock. Since the
@ -286,22 +289,28 @@ top:
* This reasoning also applies to INCLUDE indexes, whose extra
* attributes are not considered part of the key space.
*/
CheckForSerializableConflictIn(rel, NULL, buf);
/* do the insertion */
_bt_findinsertloc(rel, &buf, &offset, indnkeyatts, itup_scankey, itup,
stack, heapRel);
_bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false);
CheckForSerializableConflictIn(rel, NULL, insertstate.buf);
/*
* Do the insertion. Note that insertstate contains cached binary
* search bounds established within _bt_check_unique when insertion is
* checkingunique.
*/
newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
stack, heapRel);
_bt_insertonpg(rel, insertstate.buf, InvalidBuffer, stack, itup,
newitemoff, false);
}
else
{
/* just release the buffer */
_bt_relbuf(rel, buf);
_bt_relbuf(rel, insertstate.buf);
}
/* be tidy */
if (stack)
_bt_freestack(stack);
_bt_freeskey(itup_scankey);
pfree(itup_key);
return is_unique;
}
@ -309,10 +318,6 @@ top:
/*
* _bt_check_unique() -- Check for violation of unique index constraint
*
* offset points to the first possible item that could conflict. It can
* also point to end-of-page, which means that the first tuple to check
* is the first tuple on the next page.
*
* Returns InvalidTransactionId if there is no conflict, else an xact ID
* we must wait for to see if it commits a conflicting tuple. If an actual
* conflict is detected, no return --- just ereport(). If an xact ID is
@ -324,16 +329,21 @@ top:
* InvalidTransactionId because we don't want to wait. In this case we
* set *is_unique to false if there is a potential conflict, and the
* core code must redo the uniqueness check later.
*
* As a side-effect, sets state in insertstate that can later be used by
* _bt_findinsertloc() to reuse most of the binary search work we do
* here.
*/
static TransactionId
_bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
Buffer buf, OffsetNumber offset, ScanKey itup_scankey,
_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
IndexUniqueCheck checkUnique, bool *is_unique,
uint32 *speculativeToken)
{
TupleDesc itupdesc = RelationGetDescr(rel);
int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
IndexTuple itup = insertstate->itup;
BTScanInsert itup_key = insertstate->itup_key;
SnapshotData SnapshotDirty;
OffsetNumber offset;
OffsetNumber maxoff;
Page page;
BTPageOpaque opaque;
@ -345,13 +355,22 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
InitDirtySnapshot(SnapshotDirty);
page = BufferGetPage(buf);
page = BufferGetPage(insertstate->buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
/*
* Find the first tuple with the same key.
*
* This also saves the binary search bounds in insertstate. We use them
* in the fastpath below, but also in the _bt_findinsertloc() call later.
*/
offset = _bt_binsrch_insert(rel, insertstate);
/*
* Scan over all equal tuples, looking for live conflicts.
*/
Assert(!insertstate->bounds_valid || insertstate->low == offset);
for (;;)
{
ItemId curitemid;
@ -364,21 +383,40 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
*/
if (offset <= maxoff)
{
/*
* Fastpath: In most cases, we can use cached search bounds to
* limit our consideration to items that are definitely
* duplicates. This fastpath doesn't apply when the original page
* is empty, or when initial offset is past the end of the
* original page, which may indicate that we need to examine a
* second or subsequent page.
*
* Note that this optimization avoids calling _bt_isequal()
* entirely when there are no duplicates, as long as the offset
* where the key will go is not at the end of the page.
*/
if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
{
Assert(insertstate->bounds_valid);
Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
Assert(insertstate->low <= insertstate->stricthigh);
Assert(!_bt_isequal(itupdesc, itup_key, page, offset));
break;
}
curitemid = PageGetItemId(page, offset);
/*
* We can skip items that are marked killed.
*
* Formerly, we applied _bt_isequal() before checking the kill
* flag, so as to fall out of the item loop as soon as possible.
* However, in the presence of heavy update activity an index may
* contain many killed items with the same key; running
* _bt_isequal() on each killed item gets expensive. Furthermore
* it is likely that the non-killed version of each key appears
* first, so that we didn't actually get to exit any sooner
* anyway. So now we just advance over killed items as quickly as
* we can. We only apply _bt_isequal() when we get to a non-killed
* item or the end of the page.
* In the presence of heavy update activity an index may contain
* many killed items with the same key; running _bt_isequal() on
* each killed item gets expensive. Just advance over killed
* items as quickly as we can. We only apply _bt_isequal() when
* we get to a non-killed item. Even those comparisons could be
* avoided (in the common case where there is only one page to
* visit) by reusing bounds, but just skipping dead items is fast
* enough.
*/
if (!ItemIdIsDead(curitemid))
{
@ -391,7 +429,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
* in real comparison, but only for ordering/finding items on
* pages. - vadim 03/24/97
*/
if (!_bt_isequal(itupdesc, page, offset, indnkeyatts, itup_scankey))
if (!_bt_isequal(itupdesc, itup_key, page, offset))
break; /* we're past all the equal tuples */
/* okay, we gotta fetch the heap tuple ... */
@ -488,7 +526,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
* otherwise be masked by this unique constraint
* violation.
*/
CheckForSerializableConflictIn(rel, NULL, buf);
CheckForSerializableConflictIn(rel, NULL, insertstate->buf);
/*
* This is a definite conflict. Break the tuple down into
@ -500,7 +538,8 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
*/
if (nbuf != InvalidBuffer)
_bt_relbuf(rel, nbuf);
_bt_relbuf(rel, buf);
_bt_relbuf(rel, insertstate->buf);
insertstate->buf = InvalidBuffer;
{
Datum values[INDEX_MAX_KEYS];
@ -540,7 +579,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
if (nbuf != InvalidBuffer)
MarkBufferDirtyHint(nbuf, true);
else
MarkBufferDirtyHint(buf, true);
MarkBufferDirtyHint(insertstate->buf, true);
}
}
}
@ -552,11 +591,14 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
offset = OffsetNumberNext(offset);
else
{
int highkeycmp;
/* If scankey == hikey we gotta check the next page too */
if (P_RIGHTMOST(opaque))
break;
if (!_bt_isequal(itupdesc, page, P_HIKEY,
indnkeyatts, itup_scankey))
highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
Assert(highkeycmp <= 0);
if (highkeycmp != 0)
break;
/* Advance to next non-dead page --- there must be one */
for (;;)
@ -600,57 +642,41 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
/*
* _bt_findinsertloc() -- Finds an insert location for a tuple
*
* On entry, insertstate buffer contains the first legal page the new
* tuple could be inserted to. It is exclusive-locked and pinned by the
* caller.
*
* If the new key is equal to one or more existing keys, we can
* legitimately place it anywhere in the series of equal keys --- in fact,
* if the new key is equal to the page's "high key" we can place it on
* the next page. If it is equal to the high key, and there's not room
* to insert the new tuple on the current page without splitting, then
* we can move right hoping to find more free space and avoid a split.
* (We should not move right indefinitely, however, since that leads to
* O(N^2) insertion behavior in the presence of many equal keys.)
* Once we have chosen the page to put the key on, we'll insert it before
* any existing equal keys because of the way _bt_binsrch() works.
* Furthermore, if there's not enough room on a page, we try to make
* room by removing any LP_DEAD tuples.
*
* If there's not enough room in the space, we try to make room by
* removing any LP_DEAD tuples.
* On exit, insertstate buffer contains the chosen insertion page, and
* the offset within that page is returned. If _bt_findinsertloc needed
* to move right, the lock and pin on the original page are released, and
* the new buffer is exclusively locked and pinned instead.
*
* On entry, *bufptr and *offsetptr point to the first legal position
* where the new tuple could be inserted. The caller should hold an
* exclusive lock on *bufptr. *offsetptr can also be set to
* InvalidOffsetNumber, in which case the function will search for the
* right location within the page if needed. On exit, they point to the
* chosen insert location. If _bt_findinsertloc decides to move right,
* the lock and pin on the original page will be released and the new
* page returned to the caller is exclusively locked instead.
*
* newtup is the new tuple we're inserting, and scankey is an insertion
* type scan key for it.
* If insertstate contains cached binary search bounds, we will take
* advantage of them. This avoids repeating comparisons that we made in
* _bt_check_unique() already.
*/
static void
static OffsetNumber
_bt_findinsertloc(Relation rel,
Buffer *bufptr,
OffsetNumber *offsetptr,
int keysz,
ScanKey scankey,
IndexTuple newtup,
BTInsertState insertstate,
bool checkingunique,
BTStack stack,
Relation heapRel)
{
Buffer buf = *bufptr;
Page page = BufferGetPage(buf);
Size itemsz;
BTScanInsert itup_key = insertstate->itup_key;
Page page = BufferGetPage(insertstate->buf);
BTPageOpaque lpageop;
bool movedright,
vacuumed;
OffsetNumber newitemoff;
OffsetNumber firstlegaloff = *offsetptr;
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
itemsz = IndexTupleSize(newtup);
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
* need to be consistent */
/*
* Check whether the item can fit on a btree page at all. (Eventually, we
* ought to try to apply TOAST methods if not.) We actually need to be
@ -660,11 +686,11 @@ _bt_findinsertloc(Relation rel,
*
* NOTE: if you change this, see also the similar code in _bt_buildadd().
*/
if (itemsz > BTMaxItemSize(page))
if (insertstate->itemsz > BTMaxItemSize(page))
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
itemsz, BTMaxItemSize(page),
insertstate->itemsz, BTMaxItemSize(page),
RelationGetRelationName(rel)),
errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
"Consider a function index of an MD5 hash of the value, "
@ -690,100 +716,113 @@ _bt_findinsertloc(Relation rel,
* excellent job of preventing O(N^2) behavior with many equal keys.
*----------
*/
movedright = false;
vacuumed = false;
while (PageGetFreeSpace(page) < itemsz)
{
Buffer rbuf;
BlockNumber rblkno;
Assert(P_ISLEAF(lpageop) && !P_INCOMPLETE_SPLIT(lpageop));
Assert(!insertstate->bounds_valid || checkingunique);
while (PageGetFreeSpace(page) < insertstate->itemsz)
{
/*
* before considering moving right, see if we can obtain enough space
* by erasing LP_DEAD items
*/
if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
if (P_HAS_GARBAGE(lpageop))
{
_bt_vacuum_one_page(rel, buf, heapRel);
_bt_vacuum_one_page(rel, insertstate->buf, heapRel);
insertstate->bounds_valid = false;
/*
* remember that we vacuumed this page, because that makes the
* hint supplied by the caller invalid
*/
vacuumed = true;
if (PageGetFreeSpace(page) >= itemsz)
if (PageGetFreeSpace(page) >= insertstate->itemsz)
break; /* OK, now we have enough space */
}
/*
* nope, so check conditions (b) and (c) enumerated above
* Nope, so check conditions (b) and (c) enumerated above
*
* The earlier _bt_check_unique() call may well have established a
* strict upper bound on the offset for the new item. If it's not the
* last item of the page (i.e. if there is at least one tuple on the
* page that's greater than the tuple we're inserting to) then we know
* that the tuple belongs on this page. We can skip the high key
* check.
*/
if (insertstate->bounds_valid &&
insertstate->low <= insertstate->stricthigh &&
insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
break;
if (P_RIGHTMOST(lpageop) ||
_bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
_bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
random() <= (MAX_RANDOM_VALUE / 100))
break;
/*
* step right to next non-dead page
*
* must write-lock that page before releasing write lock on current
* page; else someone else's _bt_check_unique scan could fail to see
* our insertion. write locks on intermediate dead pages won't do
* because we don't know when they will get de-linked from the tree.
*/
rbuf = InvalidBuffer;
rblkno = lpageop->btpo_next;
for (;;)
{
rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
page = BufferGetPage(rbuf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
/*
* If this page was incompletely split, finish the split now. We
* do this while holding a lock on the left sibling, which is not
* good because finishing the split could be a fairly lengthy
* operation. But this should happen very seldom.
*/
if (P_INCOMPLETE_SPLIT(lpageop))
{
_bt_finish_split(rel, rbuf, stack);
rbuf = InvalidBuffer;
continue;
}
if (!P_IGNORE(lpageop))
break;
if (P_RIGHTMOST(lpageop))
elog(ERROR, "fell off the end of index \"%s\"",
RelationGetRelationName(rel));
rblkno = lpageop->btpo_next;
}
_bt_relbuf(rel, buf);
buf = rbuf;
movedright = true;
vacuumed = false;
_bt_stepright(rel, insertstate, stack);
/* Update local state after stepping right */
page = BufferGetPage(insertstate->buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
}
/*
* Now we are on the right page, so find the insert position. If we moved
* right at all, we know we should insert at the start of the page. If we
* didn't move right, we can use the firstlegaloff hint if the caller
* supplied one, unless we vacuumed the page which might have moved tuples
* around making the hint invalid. If we didn't move right or can't use
* the hint, find the position by searching.
* We should now be on the correct page. Find the offset within the page
* for the new tuple. (Possibly reusing earlier search bounds.)
*/
if (movedright)
newitemoff = P_FIRSTDATAKEY(lpageop);
else if (firstlegaloff != InvalidOffsetNumber && !vacuumed)
newitemoff = firstlegaloff;
else
newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
Assert(P_RIGHTMOST(lpageop) ||
_bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
*bufptr = buf;
*offsetptr = newitemoff;
return _bt_binsrch_insert(rel, insertstate);
}
/*
* Step right to next non-dead page, during insertion.
*
* This is a bit more complicated than moving right in a search. We must
* write-lock the target page before releasing write lock on current page;
* else someone else's _bt_check_unique scan could fail to see our insertion.
* Write locks on intermediate dead pages won't do because we don't know when
* they will get de-linked from the tree.
*/
static void
_bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
{
Page page;
BTPageOpaque lpageop;
Buffer rbuf;
BlockNumber rblkno;
page = BufferGetPage(insertstate->buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
rbuf = InvalidBuffer;
rblkno = lpageop->btpo_next;
for (;;)
{
rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
page = BufferGetPage(rbuf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
/*
* If this page was incompletely split, finish the split now. We do
* this while holding a lock on the left sibling, which is not good
* because finishing the split could be a fairly lengthy operation.
* But this should happen very seldom.
*/
if (P_INCOMPLETE_SPLIT(lpageop))
{
_bt_finish_split(rel, rbuf, stack);
rbuf = InvalidBuffer;
continue;
}
if (!P_IGNORE(lpageop))
break;
if (P_RIGHTMOST(lpageop))
elog(ERROR, "fell off the end of index \"%s\"",
RelationGetRelationName(rel));
rblkno = lpageop->btpo_next;
}
/* rbuf locked; unlock buf, update state for caller */
_bt_relbuf(rel, insertstate->buf);
insertstate->buf = rbuf;
insertstate->bounds_valid = false;
}
/*----------
@ -2312,24 +2351,21 @@ _bt_pgaddtup(Page page,
* Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too.
*/
static bool
_bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
int keysz, ScanKey scankey)
_bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key, Page page,
OffsetNumber offnum)
{
IndexTuple itup;
ScanKey scankey;
int i;
/* Better be comparing to a leaf item */
/* Better be comparing to a non-pivot item */
Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
Assert(offnum >= P_FIRSTDATAKEY((BTPageOpaque) PageGetSpecialPointer(page)));
scankey = itup_key->scankeys;
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
/*
* It's okay that we might perform a comparison against a truncated page
* high key when caller needs to determine if _bt_check_unique scan must
* continue on to the next page. Caller never asks us to compare non-key
* attributes within an INCLUDE index.
*/
for (i = 1; i <= keysz; i++)
for (i = 1; i <= itup_key->keysz; i++)
{
AttrNumber attno;
Datum datum;
@ -2377,6 +2413,8 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
Page page = BufferGetPage(buffer);
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(P_ISLEAF(opaque));
/*
* Scan over all items to see which ones need to be deleted according to
* LP_DEAD flags.

src/backend/access/nbtree/nbtpage.c

@ -1371,7 +1371,7 @@ _bt_pagedel(Relation rel, Buffer buf)
*/
if (!stack)
{
ScanKey itup_scankey;
BTScanInsert itup_key;
ItemId itemid;
IndexTuple targetkey;
Buffer lbuf;
@ -1421,12 +1421,10 @@ _bt_pagedel(Relation rel, Buffer buf)
}
/* we need an insertion scan key for the search, so build one */
itup_scankey = _bt_mkscankey(rel, targetkey);
/* find the leftmost leaf page containing this key */
stack = _bt_search(rel,
IndexRelationGetNumberOfKeyAttributes(rel),
itup_scankey, false, &lbuf, BT_READ, NULL);
/* don't need a pin on the page */
itup_key = _bt_mkscankey(rel, targetkey);
/* get stack to leaf page by searching index */
stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL);
/* don't need a lock or second pin on the page */
_bt_relbuf(rel, lbuf);
/*

src/backend/access/nbtree/nbtsearch.c

@ -25,6 +25,7 @@
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
OffsetNumber offnum);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
@ -70,13 +71,9 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
* _bt_search() -- Search the tree for a particular scankey,
* or more precisely for the first leaf page it could be on.
*
* The passed scankey must be an insertion-type scankey (see nbtree/README),
* The passed scankey is an insertion-type scankey (see nbtree/README),
* but it can omit the rightmost column(s) of the index.
*
* When nextkey is false (the usual case), we are looking for the first
* item >= scankey. When nextkey is true, we are looking for the first
* item strictly greater than scankey.
*
* Return value is a stack of parent-page pointers. *bufP is set to the
* address of the leaf-page buffer, which is read-locked and pinned.
* No locks are held on the parent pages, however!
@ -92,8 +89,8 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
* during the search will be finished.
*/
BTStack
_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
Buffer *bufP, int access, Snapshot snapshot)
_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
Snapshot snapshot)
{
BTStack stack_in = NULL;
int page_access = BT_READ;
@ -129,8 +126,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
* if the leaf page is split and we insert to the parent page). But
* this is a good opportunity to finish splits of internal pages too.
*/
*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
(access == BT_WRITE), stack_in,
*bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
page_access, snapshot);
/* if this is a leaf page, we're done */
@ -143,7 +139,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
* Find the appropriate item on the internal page, and get the child
* page that it points to.
*/
offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
offnum = _bt_binsrch(rel, key, *bufP);
itemid = PageGetItemId(page, offnum);
itup = (IndexTuple) PageGetItem(page, itemid);
blkno = BTreeInnerTupleGetDownLink(itup);
@ -197,8 +193,8 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
* need to move right in the tree. See Lehman and Yao for an
* excruciatingly precise description.
*/
*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
true, stack_in, BT_WRITE, snapshot);
*bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
snapshot);
}
return stack_in;
@ -214,16 +210,17 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
* or strictly to the right of it.
*
* This routine decides whether or not we need to move right in the
* tree by examining the high key entry on the page. If that entry
* is strictly less than the scankey, or <= the scankey in the nextkey=true
* case, then we followed the wrong link and we need to move right.
* tree by examining the high key entry on the page. If that entry is
* strictly less than the scankey, or <= the scankey in the
* key.nextkey=true case, then we followed the wrong link and we need
* to move right.
*
* The passed scankey must be an insertion-type scankey (see nbtree/README),
* but it can omit the rightmost column(s) of the index.
* The passed insertion-type scankey can omit the rightmost column(s) of the
* index. (see nbtree/README)
*
* When nextkey is false (the usual case), we are looking for the first
* item >= scankey. When nextkey is true, we are looking for the first
* item strictly greater than scankey.
* When key.nextkey is false (the usual case), we are looking for the first
* item >= key. When key.nextkey is true, we are looking for the first item
* strictly greater than key.
*
* If forupdate is true, we will attempt to finish any incomplete splits
* that we encounter. This is required when locking a target page for an
@ -240,10 +237,8 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
*/
Buffer
_bt_moveright(Relation rel,
BTScanInsert key,
Buffer buf,
int keysz,
ScanKey scankey,
bool nextkey,
bool forupdate,
BTStack stack,
int access,
@ -268,7 +263,7 @@ _bt_moveright(Relation rel,
* We also have to move right if we followed a link that brought us to a
* dead page.
*/
cmpval = nextkey ? 0 : 1;
cmpval = key->nextkey ? 0 : 1;
for (;;)
{
@ -303,7 +298,7 @@ _bt_moveright(Relation rel,
continue;
}
if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
{
/* step right one page */
buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
@ -323,13 +318,6 @@ _bt_moveright(Relation rel,
/*
* _bt_binsrch() -- Do a binary search for a key on a particular page.
*
* The passed scankey must be an insertion-type scankey (see nbtree/README),
* but it can omit the rightmost column(s) of the index.
*
* When nextkey is false (the usual case), we are looking for the first
* item >= scankey. When nextkey is true, we are looking for the first
* item strictly greater than scankey.
*
* On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
* key >= given scankey, or > scankey if nextkey is true. (NOTE: in
* particular, this means it is possible to return a value 1 greater than the
@ -347,12 +335,10 @@ _bt_moveright(Relation rel,
* the given page. _bt_binsrch() has no lock or refcount side effects
* on the buffer.
*/
OffsetNumber
static OffsetNumber
_bt_binsrch(Relation rel,
Buffer buf,
int keysz,
ScanKey scankey,
bool nextkey)
BTScanInsert key,
Buffer buf)
{
Page page;
BTPageOpaque opaque;
@ -374,7 +360,7 @@ _bt_binsrch(Relation rel,
* This can never happen on an internal page, however, since they are
* never empty (an internal page must have children).
*/
if (high < low)
if (unlikely(high < low))
return low;
/*
@ -391,7 +377,7 @@ _bt_binsrch(Relation rel,
*/
high++; /* establish the loop invariant for high */
cmpval = nextkey ? 0 : 1; /* select comparison value */
cmpval = key->nextkey ? 0 : 1; /* select comparison value */
while (high > low)
{
@ -399,7 +385,7 @@ _bt_binsrch(Relation rel,
/* We have low <= mid < high, so mid points at a real slot */
result = _bt_compare(rel, keysz, scankey, page, mid);
result = _bt_compare(rel, key, page, mid);
if (result >= cmpval)
low = mid + 1;
@ -426,14 +412,120 @@ _bt_binsrch(Relation rel,
return OffsetNumberPrev(low);
}
/*
*
* _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
*
* Like _bt_binsrch(), but with support for caching the binary search
* bounds. Only used during insertion, and only on the leaf page that it
* looks like caller will insert tuple on. Exclusive-locked and pinned
* leaf page is contained within insertstate.
*
* Caches the bounds fields in insertstate so that a subsequent call can
* reuse the low and strict high bounds of original binary search. Callers
* that use these fields directly must be prepared for the case where low
* and/or stricthigh are not on the same page (one or both exceed maxoff
* for the page). The case where there are no items on the page (high <
* low) makes bounds invalid.
*
* Caller is responsible for invalidating bounds when it modifies the page
* before calling here a second time.
*/
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
{
BTScanInsert key = insertstate->itup_key;
Page page;
BTPageOpaque opaque;
OffsetNumber low,
high,
stricthigh;
int32 result,
cmpval;
page = BufferGetPage(insertstate->buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(P_ISLEAF(opaque));
Assert(!key->nextkey);
if (!insertstate->bounds_valid)
{
/* Start new binary search */
low = P_FIRSTDATAKEY(opaque);
high = PageGetMaxOffsetNumber(page);
}
else
{
/* Restore result of previous binary search against same page */
low = insertstate->low;
high = insertstate->stricthigh;
}
/* If there are no keys on the page, return the first available slot */
if (unlikely(high < low))
{
/* Caller can't reuse bounds */
insertstate->low = InvalidOffsetNumber;
insertstate->stricthigh = InvalidOffsetNumber;
insertstate->bounds_valid = false;
return low;
}
/*
* Binary search to find the first key on the page >= scan key. (nextkey
* is always false when inserting).
*
* The loop invariant is: all slots before 'low' are < scan key, all slots
* at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
* maintained to save additional search effort for caller.
*
* We can fall out when high == low.
*/
if (!insertstate->bounds_valid)
high++; /* establish the loop invariant for high */
stricthigh = high; /* high initially strictly higher */
cmpval = 1; /* !nextkey comparison value */
while (high > low)
{
OffsetNumber mid = low + ((high - low) / 2);
/* We have low <= mid < high, so mid points at a real slot */
result = _bt_compare(rel, key, page, mid);
if (result >= cmpval)
low = mid + 1;
else
{
high = mid;
if (result != 0)
stricthigh = high;
}
}
/*
* On a leaf page, a binary search always returns the first key >= scan
* key (at least in !nextkey case), which could be the last slot + 1. This
* is also the lower bound of cached search.
*
* stricthigh may also be the last slot + 1, which prevents caller from
* using bounds directly, but is still useful to us if we're called a
* second time with cached bounds (cached low will be < stricthigh when
* that happens).
*/
insertstate->low = low;
insertstate->stricthigh = stricthigh;
insertstate->bounds_valid = true;
return low;
}
/*----------
* _bt_compare() -- Compare scankey to a particular tuple on the page.
* _bt_compare() -- Compare insertion-type scankey to tuple on a page.
*
* The passed scankey must be an insertion-type scankey (see nbtree/README),
* but it can omit the rightmost column(s) of the index.
*
* keysz: number of key conditions to be checked (might be less than the
* number of index columns!)
* page/offnum: location of btree item to be compared to.
*
* This routine returns:
@ -446,25 +538,26 @@ _bt_binsrch(Relation rel,
*
* CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
* "minus infinity": this routine will always claim it is less than the
* scankey. The actual key value stored (if any, which there probably isn't)
* does not matter. This convention allows us to implement the Lehman and
* Yao convention that the first down-link pointer is before the first key.
* See backend/access/nbtree/README for details.
* scankey. The actual key value stored is explicitly truncated to 0
* attributes (explicitly minus infinity) with version 3+ indexes, but
* that isn't relied upon. This allows us to implement the Lehman and
* Yao convention that the first down-link pointer is before the first
* key. See backend/access/nbtree/README for details.
*----------
*/
int32
_bt_compare(Relation rel,
int keysz,
ScanKey scankey,
BTScanInsert key,
Page page,
OffsetNumber offnum)
{
TupleDesc itupdesc = RelationGetDescr(rel);
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
IndexTuple itup;
int i;
ScanKey scankey;
Assert(_bt_check_natts(rel, page, offnum));
Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
/*
* Force result ">" if target item is first data item on an internal page
@ -487,7 +580,8 @@ _bt_compare(Relation rel,
* _bt_first).
*/
for (i = 1; i <= keysz; i++)
scankey = key->scankeys;
for (int i = 1; i <= key->keysz; i++)
{
Datum datum;
bool isNull;
@ -573,8 +667,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
StrategyNumber strat;
bool nextkey;
bool goback;
BTScanInsertData inskey;
ScanKey startKeys[INDEX_MAX_KEYS];
ScanKeyData scankeys[INDEX_MAX_KEYS];
ScanKeyData notnullkeys[INDEX_MAX_KEYS];
int keysCount = 0;
int i;
@ -820,8 +914,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
/*
* We want to start the scan somewhere within the index. Set up an
* insertion scankey we can use to search for the boundary point we
* identified above. The insertion scankey is built in the local
* scankeys[] array, using the keys identified by startKeys[].
* identified above. The insertion scankey is built using the keys
* identified by startKeys[]. (Remaining insertion scankey fields are
* initialized after initial-positioning strategy is finalized.)
*/
Assert(keysCount <= INDEX_MAX_KEYS);
for (i = 0; i < keysCount; i++)
@ -849,7 +944,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
_bt_parallel_done(scan);
return false;
}
memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
/*
* If the row comparison is the last positioning key we accepted,
@ -881,7 +976,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
if (subkey->sk_flags & SK_ISNULL)
break; /* can't use null keys */
Assert(keysCount < INDEX_MAX_KEYS);
memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
memcpy(inskey.scankeys + keysCount, subkey,
sizeof(ScanKeyData));
keysCount++;
if (subkey->sk_flags & SK_ROW_END)
{
@ -927,7 +1023,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
FmgrInfo *procinfo;
procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
ScanKeyEntryInitializeWithInfo(scankeys + i,
ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
cur->sk_flags,
cur->sk_attno,
InvalidStrategy,
@ -948,7 +1044,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
cur->sk_attno, RelationGetRelationName(rel));
ScanKeyEntryInitialize(scankeys + i,
ScanKeyEntryInitialize(inskey.scankeys + i,
cur->sk_flags,
cur->sk_attno,
InvalidStrategy,
@ -1051,12 +1147,15 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
return false;
}
/* Initialize remaining insertion scan key fields */
inskey.nextkey = nextkey;
inskey.keysz = keysCount;
/*
* Use the manufactured insertion scan key to descend the tree and
* position ourselves on the target leaf page.
*/
stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ,
scan->xs_snapshot);
stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
/* don't need to keep the stack around... */
_bt_freestack(stack);
@ -1085,7 +1184,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
_bt_initialize_more_data(so, dir);
/* position to the precise item on the page */
offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
offnum = _bt_binsrch(rel, &inskey, buf);
/*
* If nextkey = false, we are positioned at the first item >= scan key, or

src/backend/access/nbtree/nbtsort.c

@ -263,6 +263,7 @@ typedef struct BTWriteState
{
Relation heap;
Relation index;
BTScanInsert inskey; /* generic insertion scankey */
bool btws_use_wal; /* dump pages to WAL? */
BlockNumber btws_pages_alloced; /* # pages allocated */
BlockNumber btws_pages_written; /* # pages written out */
@ -540,6 +541,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
wstate.heap = btspool->heap;
wstate.index = btspool->index;
wstate.inskey = _bt_mkscankey(wstate.index, NULL);
/*
* We need to log index creation in WAL iff WAL archiving/streaming is
@ -1085,7 +1087,6 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
TupleDesc tupdes = RelationGetDescr(wstate->index);
int i,
keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
ScanKey indexScanKey = NULL;
SortSupport sortKeys;
if (merge)
@ -1098,7 +1099,6 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
/* the preparation of merge */
itup = tuplesort_getindextuple(btspool->sortstate, true);
itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
indexScanKey = _bt_mkscankey_nodata(wstate->index);
/* Prepare SortSupport data for each column */
sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
@ -1106,7 +1106,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
for (i = 0; i < keysz; i++)
{
SortSupport sortKey = sortKeys + i;
ScanKey scanKey = indexScanKey + i;
ScanKey scanKey = wstate->inskey->scankeys + i;
int16 strategy;
sortKey->ssup_cxt = CurrentMemoryContext;
@ -1125,8 +1125,6 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
}
_bt_freeskey(indexScanKey);
for (;;)
{
load1 = true; /* load BTSpool next ? */

src/backend/access/nbtree/nbtutils.c

@ -56,34 +56,37 @@ static bool _bt_check_rowcompare(ScanKey skey,
* Build an insertion scan key that contains comparison data from itup
* as well as comparator routines appropriate to the key datatypes.
*
* The result is intended for use with _bt_compare().
* Result is intended for use with _bt_compare(). Callers that don't
* need to fill out the insertion scankey arguments (e.g. they use an
* ad-hoc comparison routine) can pass a NULL index tuple.
*/
ScanKey
BTScanInsert
_bt_mkscankey(Relation rel, IndexTuple itup)
{
BTScanInsert key;
ScanKey skey;
TupleDesc itupdesc;
int indnatts PG_USED_FOR_ASSERTS_ONLY;
int indnkeyatts;
int16 *indoption;
int tupnatts;
int i;
itupdesc = RelationGetDescr(rel);
indnatts = IndexRelationGetNumberOfAttributes(rel);
indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
indoption = rel->rd_indoption;
tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
Assert(indnkeyatts > 0);
Assert(indnkeyatts <= indnatts);
Assert(BTreeTupleGetNAtts(itup, rel) == indnatts ||
BTreeTupleGetNAtts(itup, rel) == indnkeyatts);
Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
/*
* We'll execute search using scan key constructed on key columns. Non-key
* (INCLUDE index) columns are always omitted from scan keys.
*/
skey = (ScanKey) palloc(indnkeyatts * sizeof(ScanKeyData));
key = palloc(offsetof(BTScanInsertData, scankeys) +
sizeof(ScanKeyData) * indnkeyatts);
key->nextkey = false;
key->keysz = Min(indnkeyatts, tupnatts);
skey = key->scankeys;
for (i = 0; i < indnkeyatts; i++)
{
FmgrInfo *procinfo;
@ -96,7 +99,19 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
* comparison can be needed.
*/
procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
arg = index_getattr(itup, i + 1, itupdesc, &null);
/*
* Key arguments built when caller provides no tuple are
* defensively represented as NULL values. They should never be
* used.
*/
if (i < tupnatts)
arg = index_getattr(itup, i + 1, itupdesc, &null);
else
{
arg = (Datum) 0;
null = true;
}
flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
ScanKeyEntryInitializeWithInfo(&skey[i],
flags,
@ -108,64 +123,7 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
arg);
}
return skey;
}
/*
* _bt_mkscankey_nodata
* Build an insertion scan key that contains 3-way comparator routines
* appropriate to the key datatypes, but no comparison data. The
* comparison data ultimately used must match the key datatypes.
*
* The result cannot be used with _bt_compare(), unless comparison
* data is first stored into the key entries. Currently this
* routine is only called by nbtsort.c and tuplesort.c, which have
* their own comparison routines.
*/
ScanKey
_bt_mkscankey_nodata(Relation rel)
{
ScanKey skey;
int indnkeyatts;
int16 *indoption;
int i;
indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
indoption = rel->rd_indoption;
skey = (ScanKey) palloc(indnkeyatts * sizeof(ScanKeyData));
for (i = 0; i < indnkeyatts; i++)
{
FmgrInfo *procinfo;
int flags;
/*
* We can use the cached (default) support procs since no cross-type
* comparison can be needed.
*/
procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
flags = SK_ISNULL | (indoption[i] << SK_BT_INDOPTION_SHIFT);
ScanKeyEntryInitializeWithInfo(&skey[i],
flags,
(AttrNumber) (i + 1),
InvalidStrategy,
InvalidOid,
rel->rd_indcollation[i],
procinfo,
(Datum) 0);
}
return skey;
}
/*
* free a scan key made by either _bt_mkscankey or _bt_mkscankey_nodata.
*/
void
_bt_freeskey(ScanKey skey)
{
pfree(skey);
return key;
}
/*

src/backend/utils/sort/tuplesort.c

@ -884,7 +884,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc,
{
Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
randomAccess);
ScanKey indexScanKey;
BTScanInsert indexScanKey;
MemoryContext oldcontext;
int i;
@ -919,7 +919,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc,
state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
indexScanKey = _bt_mkscankey_nodata(indexRel);
indexScanKey = _bt_mkscankey(indexRel, NULL);
if (state->indexInfo->ii_Expressions != NULL)
{
@ -945,7 +945,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc,
for (i = 0; i < state->nKeys; i++)
{
SortSupport sortKey = state->sortKeys + i;
ScanKey scanKey = indexScanKey + i;
ScanKey scanKey = indexScanKey->scankeys + i;
int16 strategy;
sortKey->ssup_cxt = CurrentMemoryContext;
@ -964,7 +964,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc,
PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
}
_bt_freeskey(indexScanKey);
pfree(indexScanKey);
MemoryContextSwitchTo(oldcontext);
@ -981,7 +981,7 @@ tuplesort_begin_index_btree(Relation heapRel,
{
Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
randomAccess);
ScanKey indexScanKey;
BTScanInsert indexScanKey;
MemoryContext oldcontext;
int i;
@ -1014,7 +1014,7 @@ tuplesort_begin_index_btree(Relation heapRel,
state->indexRel = indexRel;
state->enforceUnique = enforceUnique;
indexScanKey = _bt_mkscankey_nodata(indexRel);
indexScanKey = _bt_mkscankey(indexRel, NULL);
/* Prepare SortSupport data for each column */
state->sortKeys = (SortSupport) palloc0(state->nKeys *
@ -1023,7 +1023,7 @@ tuplesort_begin_index_btree(Relation heapRel,
for (i = 0; i < state->nKeys; i++)
{
SortSupport sortKey = state->sortKeys + i;
ScanKey scanKey = indexScanKey + i;
ScanKey scanKey = indexScanKey->scankeys + i;
int16 strategy;
sortKey->ssup_cxt = CurrentMemoryContext;
@ -1042,7 +1042,7 @@ tuplesort_begin_index_btree(Relation heapRel,
PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
}
_bt_freeskey(indexScanKey);
pfree(indexScanKey);
MemoryContextSwitchTo(oldcontext);

src/include/access/nbtree.h

@ -319,6 +319,64 @@ typedef struct BTStackData
typedef BTStackData *BTStack;
/*
* BTScanInsert is the btree-private state needed to find an initial position
* for an indexscan, or to insert new tuples -- an "insertion scankey" (not to
* be confused with a search scankey). It's used to descend a B-Tree using
* _bt_search.
*
* When nextkey is false (the usual case), _bt_search and _bt_binsrch will
* locate the first item >= scankey. When nextkey is true, they will locate
* the first item > scan key.
*
* scankeys is an array of scan key entries for attributes that are compared.
* keysz is the size of the array. During insertion, there must be a scan key
* for every attribute, but when starting a regular index scan some can be
* omitted. The array is used as a flexible array member, though it's sized
* in a way that makes it possible to use stack allocations. See
* nbtree/README for full details.
*/
typedef struct BTScanInsertData
{
bool nextkey;
int keysz; /* Size of scankeys array */
ScanKeyData scankeys[INDEX_MAX_KEYS]; /* Must appear last */
} BTScanInsertData;
typedef BTScanInsertData *BTScanInsert;
/*
* BTInsertStateData is a working area used during insertion.
*
* This is filled in after descending the tree to the first leaf page the new
* tuple might belong on. Tracks the current position while performing
* uniqueness check, before we have determined which exact page to insert
* to.
*
* (This should be private to nbtinsert.c, but it's also used by
* _bt_binsrch_insert)
*/
typedef struct BTInsertStateData
{
IndexTuple itup; /* Item we're inserting */
Size itemsz; /* Size of itup -- should be MAXALIGN()'d */
BTScanInsert itup_key; /* Insertion scankey */
/* Buffer containing leaf page we're likely to insert itup on */
Buffer buf;
/*
* Cache of bounds within the current buffer. Only used for insertions
* where _bt_check_unique is called. See _bt_binsrch_insert and
* _bt_findinsertloc for details.
*/
bool bounds_valid;
OffsetNumber low;
OffsetNumber stricthigh;
} BTInsertStateData;
typedef BTInsertStateData *BTInsertState;
/*
* BTScanOpaqueData is the btree-private state needed for an indexscan.
* This consists of preprocessed scan keys (see _bt_preprocess_keys() for
@ -558,16 +616,12 @@ extern int _bt_pagedel(Relation rel, Buffer buf);
/*
* prototypes for functions in nbtsearch.c
*/
extern BTStack _bt_search(Relation rel,
int keysz, ScanKey scankey, bool nextkey,
Buffer *bufP, int access, Snapshot snapshot);
extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
ScanKey scankey, bool nextkey, bool forupdate, BTStack stack,
int access, Snapshot snapshot);
extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
ScanKey scankey, bool nextkey);
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
Page page, OffsetNumber offnum);
extern BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP,
int access, Snapshot snapshot);
extern Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf,
bool forupdate, BTStack stack, int access, Snapshot snapshot);
extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
@ -576,9 +630,7 @@ extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
/*
* prototypes for functions in nbtutils.c
*/
extern ScanKey _bt_mkscankey(Relation rel, IndexTuple itup);
extern ScanKey _bt_mkscankey_nodata(Relation rel);
extern void _bt_freeskey(ScanKey skey);
extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup);
extern void _bt_freestack(BTStack stack);
extern void _bt_preprocess_array_keys(IndexScanDesc scan);
extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
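
As a companion sketch (not part of the commit), the scan side can keep BTScanInsertData on the stack because scankeys[] is declared with INDEX_MAX_KEYS entries.  The following condenses _bt_first()'s descent, assuming the caller has already collapsed its chosen search scankeys into a contiguous startkeys array; sketch_descend_for_scan and startkeys are illustrative names, and row-comparison handling and support-function setup are omitted.

#include "postgres.h"
#include "access/nbtree.h"

/* Illustrative condensation of _bt_first()'s descent; not part of the patch. */
static Buffer
sketch_descend_for_scan(Relation rel, ScanKey startkeys, int keysCount,
                        bool nextkey, Snapshot snapshot)
{
    BTScanInsertData inskey;    /* stack allocation; no palloc() needed */
    BTStack     stack;
    Buffer      buf;

    memcpy(inskey.scankeys, startkeys, keysCount * sizeof(ScanKeyData));
    inskey.nextkey = nextkey;   /* false: first item >= key; true: first item > key */
    inskey.keysz = keysCount;   /* may be fewer keys than index columns */

    stack = _bt_search(rel, &inskey, &buf, BT_READ, snapshot);
    _bt_freestack(stack);       /* parent pointers not needed for a plain scan */

    /* Real code checks BufferIsValid(buf): an empty index yields no leaf page */
    return buf;                 /* leaf page, read-locked and pinned */
}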