Add nbtree Valgrind buffer lock checks.

Holding just a buffer pin (with no buffer lock) on an nbtree buffer/page
provides very weak guarantees, especially compared to heapam, where it's
often safe to read a page while only holding a buffer pin.  This commit
has Valgrind enforce the following rule: it is never okay to access an
nbtree buffer without holding both a pin and a lock on the buffer.

A draft version of this patch detected questionable code that was
cleaned up by commits fa7ff642 and 7154aa16.  The code in question used
to access an nbtree buffer page's special/opaque area with no buffer
lock (only a buffer pin).  This practice (which isn't obviously unsafe)
is hereby formally disallowed in nbtree.  There doesn't seem to be any
reason to allow it, and banning it keeps things simple for Valgrind.

The new checks are implemented by adding custom nbtree client requests
(located in LockBuffer() wrapper functions); these requests are
"superimposed" on top of the generic bufmgr.c Valgrind client requests
added by commit 1e0dfd16.  No custom resource management cleanup code is
needed to undo the effects of marking buffers as non-accessible under
this scheme.

Author: Peter Geoghegan
Reviewed-By: Anastasia Lubennikova, Georgios Kokolatos
Discussion: https://postgr.es/m/CAH2-WzkLgyN3zBvRZ1pkNJThC=xi_0gpWRUb_45eexLH1+k2_Q@mail.gmail.com
This commit is contained in:
Peter Geoghegan 2020-07-21 15:50:58 -07:00
parent 670c0a1d47
commit 4a70f829d8
8 changed files with 170 additions and 44 deletions

View File

@ -303,7 +303,7 @@ _bt_search_insert(Relation rel, BTInsertState insertstate)
{
/* Simulate a _bt_getbuf() call with conditional locking */
insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
if (ConditionalLockBuffer(insertstate->buf))
if (_bt_conditionallockbuf(rel, insertstate->buf))
{
Page page;
BTPageOpaque lpageop;

View File

@ -32,6 +32,7 @@
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "utils/memdebug.h"
#include "utils/snapmgr.h"
static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
@ -198,8 +199,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
}
/* trade in our read lock for a write lock */
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
LockBuffer(metabuf, BT_WRITE);
_bt_unlockbuf(rel, metabuf);
_bt_lockbuf(rel, metabuf, BT_WRITE);
START_CRIT_SECTION();
@ -340,8 +341,8 @@ _bt_getroot(Relation rel, int access)
}
/* trade in our read lock for a write lock */
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
LockBuffer(metabuf, BT_WRITE);
_bt_unlockbuf(rel, metabuf);
_bt_lockbuf(rel, metabuf, BT_WRITE);
/*
* Race condition: if someone else initialized the metadata between
@ -434,8 +435,8 @@ _bt_getroot(Relation rel, int access)
* else accessing the new root page while it's unlocked, since no one
* else knows where it is yet.
*/
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
LockBuffer(rootbuf, BT_READ);
_bt_unlockbuf(rel, rootbuf);
_bt_lockbuf(rel, rootbuf, BT_READ);
/* okay, metadata is correct, release lock on it without caching */
_bt_relbuf(rel, metabuf);
@ -783,10 +784,20 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX
* blkno == P_NEW means to get an unallocated index page. The page
* will be initialized before returning it.
*
* The general rule in nbtree is that it's never okay to access a
* page without holding both a buffer pin and a buffer lock on
* the page's buffer.
*
* When this routine returns, the appropriate lock is set on the
* requested buffer and its reference count has been incremented
* (ie, the buffer is "locked and pinned"). Also, we apply
* _bt_checkpage to sanity-check the page (except in P_NEW case).
* _bt_checkpage to sanity-check the page (except in P_NEW case),
* and perform Valgrind client requests that help Valgrind detect
* unsafe page accesses.
*
* Note: raw LockBuffer() calls are disallowed in nbtree; all
* buffer lock requests need to go through wrapper functions such
* as _bt_lockbuf().
*/
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
@ -797,7 +808,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
/* Read an existing block of the relation */
buf = ReadBuffer(rel, blkno);
LockBuffer(buf, access);
_bt_lockbuf(rel, buf, access);
_bt_checkpage(rel, buf);
}
else
@ -837,7 +848,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
if (blkno == InvalidBlockNumber)
break;
buf = ReadBuffer(rel, blkno);
if (ConditionalLockBuffer(buf))
if (_bt_conditionallockbuf(rel, buf))
{
page = BufferGetPage(buf);
if (_bt_page_recyclable(page))
@ -890,7 +901,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
buf = ReadBuffer(rel, P_NEW);
/* Acquire buffer lock on new page */
LockBuffer(buf, BT_WRITE);
_bt_lockbuf(rel, buf, BT_WRITE);
/*
* Release the file-extension lock; it's now OK for someone else to
@ -931,9 +942,10 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Assert(blkno != P_NEW);
if (BufferIsValid(obuf))
LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
_bt_unlockbuf(rel, obuf);
buf = ReleaseAndReadBuffer(obuf, rel, blkno);
LockBuffer(buf, access);
_bt_lockbuf(rel, buf, access);
_bt_checkpage(rel, buf);
return buf;
}
@ -946,7 +958,102 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
void
_bt_relbuf(Relation rel, Buffer buf)
{
UnlockReleaseBuffer(buf);
_bt_unlockbuf(rel, buf);
ReleaseBuffer(buf);
}
/*
* _bt_lockbuf() -- lock a pinned buffer.
*
* Lock is acquired without acquiring another pin. This is like a raw
* LockBuffer() call, but performs extra steps needed by Valgrind.
*
* Note: Caller may need to call _bt_checkpage() with buf when pin on buf
* wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
*/
void
_bt_lockbuf(Relation rel, Buffer buf, int access)
{
/* LockBuffer() asserts that pin is held by this backend */
LockBuffer(buf, access);
/*
* It doesn't matter that _bt_unlockbuf() won't get called in the
* event of an nbtree error (e.g. a unique violation error). That
* won't cause Valgrind false positives.
*
* The nbtree client requests are superimposed on top of the
* bufmgr.c buffer pin client requests. In the event of an nbtree
* error the buffer will certainly get marked as defined when the
* backend once again acquires its first pin on the buffer. (Of
* course, if the backend never touches the buffer again then it
* doesn't matter that it remains non-accessible to Valgrind.)
*
* Note: When an IndexTuple C pointer gets computed using an
* ItemId read from a page while a lock was held, the C pointer
* becomes unsafe to dereference forever as soon as the lock is
* released. Valgrind can only detect cases where the pointer
* gets dereferenced with no _current_ lock/pin held, though.
*/
if (!RelationUsesLocalBuffers(rel))
VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
}
/*
* _bt_unlockbuf() -- unlock a pinned buffer.
*/
void
_bt_unlockbuf(Relation rel, Buffer buf)
{
/*
* Buffer is pinned and locked, which means that it is expected to be
* defined and addressable. Check that proactively.
*/
VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
/* LockBuffer() asserts that pin is held by this backend */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
if (!RelationUsesLocalBuffers(rel))
VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
}
/*
* _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
* buffer.
*
* Note: Caller may need to call _bt_checkpage() with buf when pin on buf
* wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
*/
bool
_bt_conditionallockbuf(Relation rel, Buffer buf)
{
/* ConditionalLockBuffer() asserts that pin is held by this backend */
if (!ConditionalLockBuffer(buf))
return false;
if (!RelationUsesLocalBuffers(rel))
VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
return true;
}
/*
* _bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
* lock.
*/
void
_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
{
/*
* Buffer is pinned and locked, which means that it is expected to be
* defined and addressable. Check that proactively.
*/
VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
/* LockBuffer() asserts that pin is held by this backend */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
LockBufferForCleanup(buf);
}
/*
@ -1580,7 +1687,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
* To avoid deadlocks, we'd better drop the leaf page lock
* before going further.
*/
LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
_bt_unlockbuf(rel, leafbuf);
/*
* Check that the left sibling of leafbuf (if any) is not
@ -1616,7 +1723,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
* (Page deletion can cope with the stack being to the left of
* leafbuf, but not to the right of leafbuf.)
*/
LockBuffer(leafbuf, BT_WRITE);
_bt_lockbuf(rel, leafbuf, BT_WRITE);
continue;
}
@ -1970,7 +2077,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
leafleftsib = opaque->btpo_prev;
leafrightsib = opaque->btpo_next;
LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
_bt_unlockbuf(rel, leafbuf);
/*
* Check here, as calling loops will have locks held, preventing
@ -2005,7 +2112,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
* To avoid deadlocks, we'd better drop the target page lock before
* going further.
*/
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
_bt_unlockbuf(rel, buf);
}
/*
@ -2022,7 +2129,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
* table.)
*/
if (target != leafblkno)
LockBuffer(leafbuf, BT_WRITE);
_bt_lockbuf(rel, leafbuf, BT_WRITE);
if (leftsib != P_NONE)
{
lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
@ -2072,7 +2179,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
* rather than a superexclusive lock, since no scan will stop on an empty
* page.
*/
LockBuffer(buf, BT_WRITE);
_bt_lockbuf(rel, buf, BT_WRITE);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);

View File

@ -1115,7 +1115,7 @@ backtrack:
*/
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
info->strategy);
LockBuffer(buf, BT_READ);
_bt_lockbuf(rel, buf, BT_READ);
page = BufferGetPage(buf);
opaque = NULL;
if (!PageIsNew(page))
@ -1222,8 +1222,7 @@ backtrack:
* course of the vacuum scan, whether or not it actually contains any
* deletable tuples --- see nbtree/README.
*/
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
LockBufferForCleanup(buf);
_bt_upgradelockbufcleanup(rel, buf);
/*
* Check whether we need to backtrack to earlier pages. What we are

View File

@ -64,7 +64,7 @@ static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
static void
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
{
LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
_bt_unlockbuf(scan->indexRelation, sp->buf);
if (IsMVCCSnapshot(scan->xs_snapshot) &&
RelationNeedsWAL(scan->indexRelation) &&
@ -187,14 +187,13 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
if (access == BT_WRITE && page_access == BT_READ)
{
/* trade in our read lock for a write lock */
LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
LockBuffer(*bufP, BT_WRITE);
_bt_unlockbuf(rel, *bufP);
_bt_lockbuf(rel, *bufP, BT_WRITE);
/*
* If the page was split between the time that we surrendered our read
* lock and acquired our write lock, then this page may no longer be
* the right place for the key we want to insert. In this case, we
* need to move right in the tree.
* Race -- the leaf page may have split after we dropped the read lock
* but before we acquired a write lock. If it has, we may need to
* move right to its new sibling. Do that.
*/
*bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
snapshot);
@ -289,8 +288,8 @@ _bt_moveright(Relation rel,
/* upgrade our lock if necessary */
if (access == BT_READ)
{
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
LockBuffer(buf, BT_WRITE);
_bt_unlockbuf(rel, buf);
_bt_lockbuf(rel, buf, BT_WRITE);
}
if (P_INCOMPLETE_SPLIT(opaque))
@ -1413,7 +1412,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* There's no actually-matching data on this page. Try to advance to
* the next page. Return false if there's no matching data at all.
*/
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
if (!_bt_steppage(scan, dir))
return false;
}
@ -2061,7 +2060,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
* deleted.
*/
if (BTScanPosIsPinned(so->currPos))
LockBuffer(so->currPos.buf, BT_READ);
_bt_lockbuf(rel, so->currPos.buf, BT_READ);
else
so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
@ -2439,7 +2438,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
* There's no actually-matching data on this page. Try to advance to
* the next page. Return false if there's no matching data at all.
*/
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
if (!_bt_steppage(scan, dir))
return false;
}

View File

@ -1744,7 +1744,7 @@ _bt_killitems(IndexScanDesc scan)
* LSN.
*/
droppedpin = false;
LockBuffer(so->currPos.buf, BT_READ);
_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
page = BufferGetPage(so->currPos.buf);
}
@ -1885,7 +1885,7 @@ _bt_killitems(IndexScanDesc scan)
MarkBufferDirtyHint(so->currPos.buf, true);
}
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
}

View File

@ -1639,8 +1639,8 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
* Assume that we acquired a buffer pin for the purposes of
* Valgrind buffer client checks (even in !result case) to
* keep things simple. Buffers that are unsafe to access are
* not generally guaranteed to be marked undefined in any
* case.
* not generally guaranteed to be marked undefined or
* non-accessible in any case.
*/
VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
break;
@ -1649,7 +1649,16 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
}
else
{
/* If we previously pinned the buffer, it must surely be valid */
/*
* If we previously pinned the buffer, it must surely be valid.
*
* Note: We deliberately avoid a Valgrind client request here.
* Individual access methods can optionally superimpose buffer page
* client requests on top of our client requests to enforce that
* buffers are only accessed while locked (and pinned). It's possible
* that the buffer page is legitimately non-accessible here. We
* cannot meddle with that.
*/
result = true;
}
@ -1745,7 +1754,13 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
uint32 buf_state;
uint32 old_buf_state;
/* Mark undefined, now that no pins remain in backend */
/*
* Mark buffer non-accessible to Valgrind.
*
* Note that the buffer may have already been marked non-accessible
* within access method code that enforces that buffers are only
* accessed while a buffer lock is held.
*/
VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
/* I'd better not still hold any locks on the buffer */

View File

@ -1073,6 +1073,10 @@ extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
BlockNumber blkno, int access);
extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_lockbuf(Relation rel, Buffer buf, int access);
extern void _bt_unlockbuf(Relation rel, Buffer buf);
extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,

View File

@ -271,11 +271,13 @@
* Valgrind understands PostgreSQL memory contexts. This permits detecting
* memory errors that Valgrind would not detect on a vanilla build. It also
* enables detection of buffer accesses that take place without holding a
* buffer pin. See also src/tools/valgrind.supp.
* buffer pin (or without holding a buffer lock in the case of index access
* methods that superimpose their own custom client requests on top of the
* generic bufmgr.c requests). See also src/tools/valgrind.supp.
*
* "make installcheck" is significantly slower under Valgrind. The client
* requests fall in hot code paths, so USE_VALGRIND slows native execution by
* a few percentage points even when not run under Valgrind.
* requests fall in hot code paths, so USE_VALGRIND slows execution by a few
* percentage points even when not run under Valgrind.
*
* You should normally use MEMORY_CONTEXT_CHECKING with USE_VALGRIND;
* instrumentation of repalloc() is inferior without it.