Defer flushing of SLRU files.

Previously, we called fsync() after writing out individual pg_xact,
pg_multixact and pg_commit_ts pages due to cache pressure, leading to
regular I/O stalls in user backends and recovery.  Collapse requests for
the same file into a single system call as part of the next checkpoint,
as we already did for relation files, using the infrastructure developed
by commit 3eb77eba.  This can cause a significant improvement to
recovery performance, especially when it's otherwise CPU-bound.

Hoist ProcessSyncRequests() up into CheckPointGuts() to make it clearer
that it applies to all the SLRU mini-buffer-pools as well as the main
buffer pool.  Rearrange things so that data collected in CheckpointStats
includes SLRU activity.

Also remove the Shutdown{CLOG,CommitTS,SUBTRANS,MultiXact}() functions,
because they were redundant after the shutdown checkpoint that
immediately precedes them.  (I'm not sure if they were ever needed, but
they aren't now.)

Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> (parts)
Tested-by: Jakub Wartak <Jakub.Wartak@tomtom.com>
Discussion: https://postgr.es/m/CA+hUKGLJ=84YT+NvhkEEDAuUtVHMfQ9i-N7k_o50JmQ6Rpj_OQ@mail.gmail.com
This commit is contained in:
Thomas Munro 2020-09-25 18:49:43 +12:00
parent ca7f8e2b86
commit dee663f784
16 changed files with 255 additions and 175 deletions

View File

@ -42,6 +42,7 @@
#include "pg_trace.h" #include "pg_trace.h"
#include "pgstat.h" #include "pgstat.h"
#include "storage/proc.h" #include "storage/proc.h"
#include "storage/sync.h"
/* /*
* Defines for CLOG page sizes. A page is the same BLCKSZ as is used * Defines for CLOG page sizes. A page is the same BLCKSZ as is used
@ -691,7 +692,8 @@ CLOGShmemInit(void)
{ {
XactCtl->PagePrecedes = CLOGPagePrecedes; XactCtl->PagePrecedes = CLOGPagePrecedes;
SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE, SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER); XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER,
SYNC_HANDLER_CLOG);
} }
/* /*
@ -808,34 +810,19 @@ TrimCLOG(void)
LWLockRelease(XactSLRULock); LWLockRelease(XactSLRULock);
} }
/*
* This must be called ONCE during postmaster or standalone-backend shutdown
*/
void
ShutdownCLOG(void)
{
/* Flush dirty CLOG pages to disk */
TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(false);
SimpleLruFlush(XactCtl, false);
/*
* fsync pg_xact to ensure that any files flushed previously are durably
* on disk.
*/
fsync_fname("pg_xact", true);
TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(false);
}
/* /*
* Perform a checkpoint --- either during shutdown, or on-the-fly * Perform a checkpoint --- either during shutdown, or on-the-fly
*/ */
void void
CheckPointCLOG(void) CheckPointCLOG(void)
{ {
/* Flush dirty CLOG pages to disk */ /*
* Write dirty CLOG pages to disk. This may result in sync requests
* queued for later handling by ProcessSyncRequests(), as part of the
* checkpoint.
*/
TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true); TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
SimpleLruFlush(XactCtl, true); SimpleLruWriteAll(XactCtl, true);
TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true); TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
} }
@ -1026,3 +1013,12 @@ clog_redo(XLogReaderState *record)
else else
elog(PANIC, "clog_redo: unknown op code %u", info); elog(PANIC, "clog_redo: unknown op code %u", info);
} }
/*
* Entrypoint for sync.c to sync clog files.
*/
int
clogsyncfiletag(const FileTag *ftag, char *path)
{
return SlruSyncFileTag(XactCtl, ftag, path);
}

View File

@ -555,7 +555,8 @@ CommitTsShmemInit(void)
CommitTsCtl->PagePrecedes = CommitTsPagePrecedes; CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0, SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0,
CommitTsSLRULock, "pg_commit_ts", CommitTsSLRULock, "pg_commit_ts",
LWTRANCHE_COMMITTS_BUFFER); LWTRANCHE_COMMITTS_BUFFER,
SYNC_HANDLER_COMMIT_TS);
commitTsShared = ShmemInitStruct("CommitTs shared", commitTsShared = ShmemInitStruct("CommitTs shared",
sizeof(CommitTimestampShared), sizeof(CommitTimestampShared),
@ -798,30 +799,18 @@ DeactivateCommitTs(void)
LWLockRelease(CommitTsSLRULock); LWLockRelease(CommitTsSLRULock);
} }
/*
* This must be called ONCE during postmaster or standalone-backend shutdown
*/
void
ShutdownCommitTs(void)
{
/* Flush dirty CommitTs pages to disk */
SimpleLruFlush(CommitTsCtl, false);
/*
* fsync pg_commit_ts to ensure that any files flushed previously are
* durably on disk.
*/
fsync_fname("pg_commit_ts", true);
}
/* /*
* Perform a checkpoint --- either during shutdown, or on-the-fly * Perform a checkpoint --- either during shutdown, or on-the-fly
*/ */
void void
CheckPointCommitTs(void) CheckPointCommitTs(void)
{ {
/* Flush dirty CommitTs pages to disk */ /*
SimpleLruFlush(CommitTsCtl, true); * Write dirty CommitTs pages to disk. This may result in sync requests
* queued for later handling by ProcessSyncRequests(), as part of the
* checkpoint.
*/
SimpleLruWriteAll(CommitTsCtl, true);
} }
/* /*
@ -1077,3 +1066,12 @@ commit_ts_redo(XLogReaderState *record)
else else
elog(PANIC, "commit_ts_redo: unknown op code %u", info); elog(PANIC, "commit_ts_redo: unknown op code %u", info);
} }
/*
* Entrypoint for sync.c to sync commit_ts files.
*/
int
committssyncfiletag(const FileTag *ftag, char *path)
{
return SlruSyncFileTag(CommitTsCtl, ftag, path);
}

View File

@ -1831,11 +1831,13 @@ MultiXactShmemInit(void)
SimpleLruInit(MultiXactOffsetCtl, SimpleLruInit(MultiXactOffsetCtl,
"MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0, "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0,
MultiXactOffsetSLRULock, "pg_multixact/offsets", MultiXactOffsetSLRULock, "pg_multixact/offsets",
LWTRANCHE_MULTIXACTOFFSET_BUFFER); LWTRANCHE_MULTIXACTOFFSET_BUFFER,
SYNC_HANDLER_MULTIXACT_OFFSET);
SimpleLruInit(MultiXactMemberCtl, SimpleLruInit(MultiXactMemberCtl,
"MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0, "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0,
MultiXactMemberSLRULock, "pg_multixact/members", MultiXactMemberSLRULock, "pg_multixact/members",
LWTRANCHE_MULTIXACTMEMBER_BUFFER); LWTRANCHE_MULTIXACTMEMBER_BUFFER,
SYNC_HANDLER_MULTIXACT_MEMBER);
/* Initialize our shared state struct */ /* Initialize our shared state struct */
MultiXactState = ShmemInitStruct("Shared MultiXact State", MultiXactState = ShmemInitStruct("Shared MultiXact State",
@ -2100,19 +2102,6 @@ TrimMultiXact(void)
SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true); SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
} }
/*
* This must be called ONCE during postmaster or standalone-backend shutdown
*/
void
ShutdownMultiXact(void)
{
/* Flush dirty MultiXact pages to disk */
TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(false);
SimpleLruFlush(MultiXactOffsetCtl, false);
SimpleLruFlush(MultiXactMemberCtl, false);
TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(false);
}
/* /*
* Get the MultiXact data to save in a checkpoint record * Get the MultiXact data to save in a checkpoint record
*/ */
@ -2143,9 +2132,13 @@ CheckPointMultiXact(void)
{ {
TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true); TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
/* Flush dirty MultiXact pages to disk */ /*
SimpleLruFlush(MultiXactOffsetCtl, true); * Write dirty MultiXact pages to disk. This may result in sync requests
SimpleLruFlush(MultiXactMemberCtl, true); * queued for later handling by ProcessSyncRequests(), as part of the
* checkpoint.
*/
SimpleLruWriteAll(MultiXactOffsetCtl, true);
SimpleLruWriteAll(MultiXactMemberCtl, true);
TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
} }
@ -2728,14 +2721,10 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result)
entryno = MultiXactIdToOffsetEntry(multi); entryno = MultiXactIdToOffsetEntry(multi);
/* /*
* Flush out dirty data, so PhysicalPageExists can work correctly. * Write out dirty data, so PhysicalPageExists can work correctly.
* SimpleLruFlush() is a pretty big hammer for that. Alternatively we
* could add an in-memory version of page exists, but find_multixact_start
* is called infrequently, and it doesn't seem bad to flush buffers to
* disk before truncation.
*/ */
SimpleLruFlush(MultiXactOffsetCtl, true); SimpleLruWriteAll(MultiXactOffsetCtl, true);
SimpleLruFlush(MultiXactMemberCtl, true); SimpleLruWriteAll(MultiXactMemberCtl, true);
if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
return false; return false;
@ -3386,3 +3375,21 @@ pg_get_multixact_members(PG_FUNCTION_ARGS)
SRF_RETURN_DONE(funccxt); SRF_RETURN_DONE(funccxt);
} }
/*
* Entrypoint for sync.c to sync offsets files.
*/
int
multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
{
return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path);
}
/*
* Entrypoint for sync.c to sync members files.
*/
int
multixactmemberssyncfiletag(const FileTag *ftag, char *path)
{
return SlruSyncFileTag(MultiXactMemberCtl, ftag, path);
}

View File

@ -63,22 +63,33 @@
snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg) snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
/* /*
* During SimpleLruFlush(), we will usually not need to write/fsync more * During SimpleLruWriteAll(), we will usually not need to write more than one
* than one or two physical files, but we may need to write several pages * or two physical files, but we may need to write several pages per file. We
* per file. We can consolidate the I/O requests by leaving files open * can consolidate the I/O requests by leaving files open until control returns
* until control returns to SimpleLruFlush(). This data structure remembers * to SimpleLruWriteAll(). This data structure remembers which files are open.
* which files are open.
*/ */
#define MAX_FLUSH_BUFFERS 16 #define MAX_WRITEALL_BUFFERS 16
typedef struct SlruFlushData typedef struct SlruWriteAllData
{ {
int num_files; /* # files actually open */ int num_files; /* # files actually open */
int fd[MAX_FLUSH_BUFFERS]; /* their FD's */ int fd[MAX_WRITEALL_BUFFERS]; /* their FD's */
int segno[MAX_FLUSH_BUFFERS]; /* their log seg#s */ int segno[MAX_WRITEALL_BUFFERS]; /* their log seg#s */
} SlruFlushData; } SlruWriteAllData;
typedef struct SlruFlushData *SlruFlush; typedef struct SlruWriteAllData *SlruWriteAll;
/*
* Populate a file tag describing a segment file. We only use the segment
* number, since we can derive everything else we need by having separate
* sync handler functions for clog, multixact etc.
*/
#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
( \
memset(&(a), 0, sizeof(FileTag)), \
(a).handler = (xx_handler), \
(a).segno = (xx_segno) \
)
/* /*
* Macro to mark a buffer slot "most recently used". Note multiple evaluation * Macro to mark a buffer slot "most recently used". Note multiple evaluation
@ -125,10 +136,10 @@ static int slru_errno;
static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno); static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
static void SimpleLruWaitIO(SlruCtl ctl, int slotno); static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata); static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno); static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
SlruFlush fdata); SlruWriteAll fdata);
static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid); static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid);
static int SlruSelectLRUPage(SlruCtl ctl, int pageno); static int SlruSelectLRUPage(SlruCtl ctl, int pageno);
@ -173,7 +184,8 @@ SimpleLruShmemSize(int nslots, int nlsns)
*/ */
void void
SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
LWLock *ctllock, const char *subdir, int tranche_id) LWLock *ctllock, const char *subdir, int tranche_id,
SyncRequestHandler sync_handler)
{ {
SlruShared shared; SlruShared shared;
bool found; bool found;
@ -251,7 +263,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
* assume caller set PagePrecedes. * assume caller set PagePrecedes.
*/ */
ctl->shared = shared; ctl->shared = shared;
ctl->do_fsync = true; /* default behavior */ ctl->sync_handler = sync_handler;
strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir)); strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
} }
@ -523,7 +535,7 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
* Control lock must be held at entry, and will be held at exit. * Control lock must be held at entry, and will be held at exit.
*/ */
static void static void
SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata) SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
{ {
SlruShared shared = ctl->shared; SlruShared shared = ctl->shared;
int pageno = shared->page_number[slotno]; int pageno = shared->page_number[slotno];
@ -587,6 +599,10 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
/* Now it's okay to ereport if we failed */ /* Now it's okay to ereport if we failed */
if (!ok) if (!ok)
SlruReportIOError(ctl, pageno, InvalidTransactionId); SlruReportIOError(ctl, pageno, InvalidTransactionId);
/* If part of a checkpoint, count this as a buffer written. */
if (fdata)
CheckpointStats.ckpt_bufs_written++;
} }
/* /*
@ -730,13 +746,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
* *
* For now, assume it's not worth keeping a file pointer open across * For now, assume it's not worth keeping a file pointer open across
* independent read/write operations. We do batch operations during * independent read/write operations. We do batch operations during
* SimpleLruFlush, though. * SimpleLruWriteAll, though.
* *
* fdata is NULL for a standalone write, pointer to open-file info during * fdata is NULL for a standalone write, pointer to open-file info during
* SimpleLruFlush. * SimpleLruWriteAll.
*/ */
static bool static bool
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata)
{ {
SlruShared shared = ctl->shared; SlruShared shared = ctl->shared;
int segno = pageno / SLRU_PAGES_PER_SEGMENT; int segno = pageno / SLRU_PAGES_PER_SEGMENT;
@ -791,7 +807,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
} }
/* /*
* During a Flush, we may already have the desired file open. * During a WriteAll, we may already have the desired file open.
*/ */
if (fdata) if (fdata)
{ {
@ -837,7 +853,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
if (fdata) if (fdata)
{ {
if (fdata->num_files < MAX_FLUSH_BUFFERS) if (fdata->num_files < MAX_WRITEALL_BUFFERS)
{ {
fdata->fd[fdata->num_files] = fd; fdata->fd[fdata->num_files] = fd;
fdata->segno[fdata->num_files] = segno; fdata->segno[fdata->num_files] = segno;
@ -870,23 +886,31 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
} }
pgstat_report_wait_end(); pgstat_report_wait_end();
/* /* Queue up a sync request for the checkpointer. */
* If not part of Flush, need to fsync now. We assume this happens if (ctl->sync_handler != SYNC_HANDLER_NONE)
* infrequently enough that it's not a performance issue. {
*/ FileTag tag;
INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
{
/* No space to enqueue sync request. Do it synchronously. */
pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
if (pg_fsync(fd) != 0)
{
pgstat_report_wait_end();
slru_errcause = SLRU_FSYNC_FAILED;
slru_errno = errno;
CloseTransientFile(fd);
return false;
}
pgstat_report_wait_end();
}
}
/* Close file, unless part of flush request. */
if (!fdata) if (!fdata)
{ {
pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
if (ctl->do_fsync && pg_fsync(fd) != 0)
{
pgstat_report_wait_end();
slru_errcause = SLRU_FSYNC_FAILED;
slru_errno = errno;
CloseTransientFile(fd);
return false;
}
pgstat_report_wait_end();
if (CloseTransientFile(fd) != 0) if (CloseTransientFile(fd) != 0)
{ {
slru_errcause = SLRU_CLOSE_FAILED; slru_errcause = SLRU_CLOSE_FAILED;
@ -1122,13 +1146,16 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
} }
/* /*
* Flush dirty pages to disk during checkpoint or database shutdown * Write dirty pages to disk during checkpoint or database shutdown. Flushing
* is deferred until the next call to ProcessSyncRequests(), though we do fsync
* the containing directory here to make sure that newly created directory
* entries are on disk.
*/ */
void void
SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
{ {
SlruShared shared = ctl->shared; SlruShared shared = ctl->shared;
SlruFlushData fdata; SlruWriteAllData fdata;
int slotno; int slotno;
int pageno = 0; int pageno = 0;
int i; int i;
@ -1162,21 +1189,11 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
LWLockRelease(shared->ControlLock); LWLockRelease(shared->ControlLock);
/* /*
* Now fsync and close any files that were open * Now close any files that were open
*/ */
ok = true; ok = true;
for (i = 0; i < fdata.num_files; i++) for (i = 0; i < fdata.num_files; i++)
{ {
pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
if (ctl->do_fsync && pg_fsync(fdata.fd[i]) != 0)
{
slru_errcause = SLRU_FSYNC_FAILED;
slru_errno = errno;
pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
ok = false;
}
pgstat_report_wait_end();
if (CloseTransientFile(fdata.fd[i]) != 0) if (CloseTransientFile(fdata.fd[i]) != 0)
{ {
slru_errcause = SLRU_CLOSE_FAILED; slru_errcause = SLRU_CLOSE_FAILED;
@ -1189,7 +1206,7 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
SlruReportIOError(ctl, pageno, InvalidTransactionId); SlruReportIOError(ctl, pageno, InvalidTransactionId);
/* Ensure that directory entries for new files are on disk. */ /* Ensure that directory entries for new files are on disk. */
if (ctl->do_fsync) if (ctl->sync_handler != SYNC_HANDLER_NONE)
fsync_fname(ctl->Dir, true); fsync_fname(ctl->Dir, true);
} }
@ -1350,6 +1367,19 @@ restart:
snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno); snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno);
ereport(DEBUG2, ereport(DEBUG2,
(errmsg("removing file \"%s\"", path))); (errmsg("removing file \"%s\"", path)));
/*
* Tell the checkpointer to forget any sync requests, before we unlink the
* file.
*/
if (ctl->sync_handler != SYNC_HANDLER_NONE)
{
FileTag tag;
INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
}
unlink(path); unlink(path);
LWLockRelease(shared->ControlLock); LWLockRelease(shared->ControlLock);
@ -1448,3 +1478,31 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
return retval; return retval;
} }
/*
* Individual SLRUs (clog, ...) have to provide a sync.c handler function so
* that they can provide the correct "SlruCtl" (otherwise we don't know how to
* build the path), but they just forward to this common implementation that
* performs the fsync.
*/
int
SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
{
int fd;
int save_errno;
int result;
SlruFileName(ctl, path, ftag->segno);
fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
if (fd < 0)
return -1;
result = pg_fsync(fd);
save_errno = errno;
CloseTransientFile(fd);
errno = save_errno;
return result;
}

View File

@ -193,9 +193,7 @@ SUBTRANSShmemInit(void)
SubTransCtl->PagePrecedes = SubTransPagePrecedes; SubTransCtl->PagePrecedes = SubTransPagePrecedes;
SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0, SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0,
SubtransSLRULock, "pg_subtrans", SubtransSLRULock, "pg_subtrans",
LWTRANCHE_SUBTRANS_BUFFER); LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE);
/* Override default assumption that writes should be fsync'd */
SubTransCtl->do_fsync = false;
} }
/* /*
@ -278,23 +276,6 @@ StartupSUBTRANS(TransactionId oldestActiveXID)
LWLockRelease(SubtransSLRULock); LWLockRelease(SubtransSLRULock);
} }
/*
* This must be called ONCE during postmaster or standalone-backend shutdown
*/
void
ShutdownSUBTRANS(void)
{
/*
* Flush dirty SUBTRANS pages to disk
*
* This is not actually necessary from a correctness point of view. We do
* it merely as a debugging aid.
*/
TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(false);
SimpleLruFlush(SubTransCtl, false);
TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(false);
}
/* /*
* Perform a checkpoint --- either during shutdown, or on-the-fly * Perform a checkpoint --- either during shutdown, or on-the-fly
*/ */
@ -302,14 +283,14 @@ void
CheckPointSUBTRANS(void) CheckPointSUBTRANS(void)
{ {
/* /*
* Flush dirty SUBTRANS pages to disk * Write dirty SUBTRANS pages to disk
* *
* This is not actually necessary from a correctness point of view. We do * This is not actually necessary from a correctness point of view. We do
* it merely to improve the odds that writing of dirty pages is done by * it merely to improve the odds that writing of dirty pages is done by
* the checkpoint process and not by backends. * the checkpoint process and not by backends.
*/ */
TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true); TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true);
SimpleLruFlush(SubTransCtl, true); SimpleLruWriteAll(SubTransCtl, true);
TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true); TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
} }

View File

@ -8528,10 +8528,6 @@ ShutdownXLOG(int code, Datum arg)
CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
} }
ShutdownCLOG();
ShutdownCommitTs();
ShutdownSUBTRANS();
ShutdownMultiXact();
} }
/* /*
@ -9176,17 +9172,29 @@ CreateEndOfRecoveryRecord(void)
static void static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{ {
CheckPointRelationMap();
CheckPointReplicationSlots();
CheckPointSnapBuild();
CheckPointLogicalRewriteHeap();
CheckPointReplicationOrigin();
/* Write out all dirty data in SLRUs and the main buffer pool */
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
CheckPointCLOG(); CheckPointCLOG();
CheckPointCommitTs(); CheckPointCommitTs();
CheckPointSUBTRANS(); CheckPointSUBTRANS();
CheckPointMultiXact(); CheckPointMultiXact();
CheckPointPredicate(); CheckPointPredicate();
CheckPointRelationMap(); CheckPointBuffers(flags);
CheckPointReplicationSlots();
CheckPointSnapBuild(); /* Perform all queued up fsyncs */
CheckPointLogicalRewriteHeap(); TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
CheckPointBuffers(flags); /* performs all required fsyncs */ CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
CheckPointReplicationOrigin(); ProcessSyncRequests();
CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
/* We deliberately delay 2PC checkpointing as long as possible */ /* We deliberately delay 2PC checkpointing as long as possible */
CheckPointTwoPhase(checkPointRedo); CheckPointTwoPhase(checkPointRedo);
} }

View File

@ -554,9 +554,8 @@ AsyncShmemInit(void)
*/ */
NotifyCtl->PagePrecedes = asyncQueuePagePrecedes; NotifyCtl->PagePrecedes = asyncQueuePagePrecedes;
SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0, SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0,
NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER); NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER,
/* Override default assumption that writes should be fsync'd */ SYNC_HANDLER_NONE);
NotifyCtl->do_fsync = false;
if (!found) if (!found)
{ {

View File

@ -2636,14 +2636,7 @@ PrintBufferLeakWarning(Buffer buffer)
void void
CheckPointBuffers(int flags) CheckPointBuffers(int flags)
{ {
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
BufferSync(flags); BufferSync(flags);
CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
ProcessSyncRequests();
CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
} }

View File

@ -821,9 +821,7 @@ SerialInit(void)
SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically; SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
SimpleLruInit(SerialSlruCtl, "Serial", SimpleLruInit(SerialSlruCtl, "Serial",
NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial", NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
LWTRANCHE_SERIAL_BUFFER); LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE);
/* Override default assumption that writes should be fsync'd */
SerialSlruCtl->do_fsync = false;
/* /*
* Create or attach to the SerialControl structure. * Create or attach to the SerialControl structure.
@ -1052,7 +1050,7 @@ CheckPointPredicate(void)
SimpleLruTruncate(SerialSlruCtl, tailPage); SimpleLruTruncate(SerialSlruCtl, tailPage);
/* /*
* Flush dirty SLRU pages to disk * Write dirty SLRU pages to disk
* *
* This is not actually necessary from a correctness point of view. We do * This is not actually necessary from a correctness point of view. We do
* it merely as a debugging aid. * it merely as a debugging aid.
@ -1061,7 +1059,7 @@ CheckPointPredicate(void)
* before deleting the file in which they sit, which would be completely * before deleting the file in which they sit, which would be completely
* pointless. * pointless.
*/ */
SimpleLruFlush(SerialSlruCtl, true); SimpleLruWriteAll(SerialSlruCtl, true);
} }
/*------------------------------------------------------------------------*/ /*------------------------------------------------------------------------*/

View File

@ -18,6 +18,9 @@
#include <fcntl.h> #include <fcntl.h>
#include <sys/file.h> #include <sys/file.h>
#include "access/commit_ts.h"
#include "access/clog.h"
#include "access/multixact.h"
#include "access/xlog.h" #include "access/xlog.h"
#include "access/xlogutils.h" #include "access/xlogutils.h"
#include "commands/tablespace.h" #include "commands/tablespace.h"
@ -90,12 +93,31 @@ typedef struct SyncOps
const FileTag *candidate); const FileTag *candidate);
} SyncOps; } SyncOps;
/*
* These indexes must correspond to the values of the SyncRequestHandler enum.
*/
static const SyncOps syncsw[] = { static const SyncOps syncsw[] = {
/* magnetic disk */ /* magnetic disk */
{ [SYNC_HANDLER_MD] = {
.sync_syncfiletag = mdsyncfiletag, .sync_syncfiletag = mdsyncfiletag,
.sync_unlinkfiletag = mdunlinkfiletag, .sync_unlinkfiletag = mdunlinkfiletag,
.sync_filetagmatches = mdfiletagmatches .sync_filetagmatches = mdfiletagmatches
},
/* pg_xact */
[SYNC_HANDLER_CLOG] = {
.sync_syncfiletag = clogsyncfiletag
},
/* pg_commit_ts */
[SYNC_HANDLER_COMMIT_TS] = {
.sync_syncfiletag = committssyncfiletag
},
/* pg_multixact/offsets */
[SYNC_HANDLER_MULTIXACT_OFFSET] = {
.sync_syncfiletag = multixactoffsetssyncfiletag
},
/* pg_multixact/members */
[SYNC_HANDLER_MULTIXACT_MEMBER] = {
.sync_syncfiletag = multixactmemberssyncfiletag
} }
}; };
@ -505,8 +527,8 @@ RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
(void *) ftag, (void *) ftag,
HASH_ENTER, HASH_ENTER,
&found); &found);
/* if new entry, initialize it */ /* if new entry, or was previously canceled, initialize it */
if (!found) if (!found || entry->canceled)
{ {
entry->cycle_ctr = sync_cycle_ctr; entry->cycle_ctr = sync_cycle_ctr;
entry->canceled = false; entry->canceled = false;

View File

@ -12,6 +12,7 @@
#define CLOG_H #define CLOG_H
#include "access/xlogreader.h" #include "access/xlogreader.h"
#include "storage/sync.h"
#include "lib/stringinfo.h" #include "lib/stringinfo.h"
/* /*
@ -50,6 +51,8 @@ extern void CheckPointCLOG(void);
extern void ExtendCLOG(TransactionId newestXact); extern void ExtendCLOG(TransactionId newestXact);
extern void TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid); extern void TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid);
extern int clogsyncfiletag(const FileTag *ftag, char *path);
/* XLOG stuff */ /* XLOG stuff */
#define CLOG_ZEROPAGE 0x00 #define CLOG_ZEROPAGE 0x00
#define CLOG_TRUNCATE 0x10 #define CLOG_TRUNCATE 0x10

View File

@ -14,6 +14,7 @@
#include "access/xlog.h" #include "access/xlog.h"
#include "datatype/timestamp.h" #include "datatype/timestamp.h"
#include "replication/origin.h" #include "replication/origin.h"
#include "storage/sync.h"
#include "utils/guc.h" #include "utils/guc.h"
@ -45,6 +46,8 @@ extern void SetCommitTsLimit(TransactionId oldestXact,
TransactionId newestXact); TransactionId newestXact);
extern void AdvanceOldestCommitTsXid(TransactionId oldestXact); extern void AdvanceOldestCommitTsXid(TransactionId oldestXact);
extern int committssyncfiletag(const FileTag *ftag, char *path);
/* XLOG stuff */ /* XLOG stuff */
#define COMMIT_TS_ZEROPAGE 0x00 #define COMMIT_TS_ZEROPAGE 0x00
#define COMMIT_TS_TRUNCATE 0x10 #define COMMIT_TS_TRUNCATE 0x10

View File

@ -13,6 +13,7 @@
#include "access/xlogreader.h" #include "access/xlogreader.h"
#include "lib/stringinfo.h" #include "lib/stringinfo.h"
#include "storage/sync.h"
/* /*
@ -116,6 +117,9 @@ extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2);
extern bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, extern bool MultiXactIdPrecedesOrEquals(MultiXactId multi1,
MultiXactId multi2); MultiXactId multi2);
extern int multixactoffsetssyncfiletag(const FileTag *ftag, char *path);
extern int multixactmemberssyncfiletag(const FileTag *ftag, char *path);
extern void AtEOXact_MultiXact(void); extern void AtEOXact_MultiXact(void);
extern void AtPrepare_MultiXact(void); extern void AtPrepare_MultiXact(void);
extern void PostPrepare_MultiXact(TransactionId xid); extern void PostPrepare_MultiXact(TransactionId xid);

View File

@ -15,6 +15,7 @@
#include "access/xlogdefs.h" #include "access/xlogdefs.h"
#include "storage/lwlock.h" #include "storage/lwlock.h"
#include "storage/sync.h"
/* /*
@ -111,10 +112,10 @@ typedef struct SlruCtlData
SlruShared shared; SlruShared shared;
/* /*
* This flag tells whether to fsync writes (true for pg_xact and multixact * Which sync handler function to use when handing sync requests over to
* stuff, false for pg_subtrans and pg_notify). * the checkpointer. SYNC_HANDLER_NONE to disable fsync (eg pg_notify).
*/ */
bool do_fsync; SyncRequestHandler sync_handler;
/* /*
* Decide which of two page numbers is "older" for truncation purposes. We * Decide which of two page numbers is "older" for truncation purposes. We
@ -135,14 +136,15 @@ typedef SlruCtlData *SlruCtl;
extern Size SimpleLruShmemSize(int nslots, int nlsns); extern Size SimpleLruShmemSize(int nslots, int nlsns);
extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
LWLock *ctllock, const char *subdir, int tranche_id); LWLock *ctllock, const char *subdir, int tranche_id,
SyncRequestHandler sync_handler);
extern int SimpleLruZeroPage(SlruCtl ctl, int pageno); extern int SimpleLruZeroPage(SlruCtl ctl, int pageno);
extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
TransactionId xid); TransactionId xid);
extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno,
TransactionId xid); TransactionId xid);
extern void SimpleLruWritePage(SlruCtl ctl, int slotno); extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
extern void SimpleLruFlush(SlruCtl ctl, bool allow_redirtied); extern void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied);
extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage); extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno); extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno);
@ -151,6 +153,8 @@ typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage,
extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data); extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data);
extern void SlruDeleteSegment(SlruCtl ctl, int segno); extern void SlruDeleteSegment(SlruCtl ctl, int segno);
extern int SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path);
/* SlruScanDirectory public callbacks */ /* SlruScanDirectory public callbacks */
extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename,
int segpage, void *data); int segpage, void *data);

View File

@ -34,7 +34,12 @@ typedef enum SyncRequestType
*/ */
typedef enum SyncRequestHandler typedef enum SyncRequestHandler
{ {
SYNC_HANDLER_MD = 0 /* md smgr */ SYNC_HANDLER_MD = 0,
SYNC_HANDLER_CLOG,
SYNC_HANDLER_COMMIT_TS,
SYNC_HANDLER_MULTIXACT_OFFSET,
SYNC_HANDLER_MULTIXACT_MEMBER,
SYNC_HANDLER_NONE
} SyncRequestHandler; } SyncRequestHandler;
/* /*

View File

@ -2295,12 +2295,12 @@ SlotNumber
SlruCtl SlruCtl
SlruCtlData SlruCtlData
SlruErrorCause SlruErrorCause
SlruFlush
SlruFlushData
SlruPageStatus SlruPageStatus
SlruScanCallback SlruScanCallback
SlruShared SlruShared
SlruSharedData SlruSharedData
SlruWriteAll
SlruWriteAllData
SnapBuild SnapBuild
SnapBuildOnDisk SnapBuildOnDisk
SnapBuildState SnapBuildState
@ -2404,6 +2404,7 @@ Syn
SyncOps SyncOps
SyncRepConfigData SyncRepConfigData
SyncRepStandbyData SyncRepStandbyData
SyncRequestHandler
SyncRequestType SyncRequestType
SysScanDesc SysScanDesc
SyscacheCallbackFunction SyscacheCallbackFunction