Add recovery_end_command option to recovery.conf. recovery_end_command

is run at the end of archive recovery, providing a chance to do external
cleanup. Modify pg_standby so that it no longer removes the trigger file,
that is to be done using the recovery_end_command now.

Provide a "smart" failover mode in pg_standby, where we don't fail over
immediately, but only after recovering all unapplied WAL from the archive.
That gives you zero data loss assuming all WAL was archived before
failover, which is what most users of pg_standby actually want.

recovery_end_command by Simon Riggs, pg_standby changes by Fujii Masao and
myself.
This commit is contained in:
Heikki Linnakangas 2009-05-14 20:31:09 +00:00
parent a710713644
commit 9e403c2587
4 changed files with 356 additions and 81 deletions

View File

@ -1,5 +1,5 @@
/*
* $PostgreSQL: pgsql/contrib/pg_standby/pg_standby.c,v 1.21 2009/03/26 22:29:13 tgl Exp $
* $PostgreSQL: pgsql/contrib/pg_standby/pg_standby.c,v 1.22 2009/05/14 20:31:09 heikki Exp $
*
*
* pg_standby.c
@ -26,6 +26,7 @@
#include <ctype.h>
#include <dirent.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <signal.h>
#ifdef WIN32
@ -52,7 +53,6 @@ int maxwaittime = 0; /* how long are we prepared to wait for? */
int keepfiles = 0; /* number of WAL files to keep, 0 keep all */
int maxretries = 3; /* number of retries on restore command */
bool debug = false; /* are we debugging? */
bool triggered = false; /* have we been triggered? */
bool need_cleanup = false; /* do we need to remove files from
* archive? */
@ -69,6 +69,30 @@ char restoreCommand[MAXPGPATH]; /* run this to restore */
char exclusiveCleanupFileName[MAXPGPATH]; /* the file we need to
* get from archive */
/*
* Two types of failover are supported (smart and fast failover).
*
* The content of the trigger file determines the type of failover. If the
* trigger file contains the word "smart" (or the file is empty), smart
* failover is chosen: pg_standby acts as cp or ln command itself, on
* successful completion all the available WAL records will be applied
* resulting in zero data loss. But, it might take a long time to finish
* recovery if there's a lot of unapplied WAL.
*
* On the other hand, if the trigger file contains the word "fast", the
* recovery is finished immediately even if unapplied WAL files remain. Any
* transactions in the unapplied WAL files are lost.
*
* An empty trigger file performs smart failover. SIGUSR or SIGINT triggers
* fast failover. A timeout causes fast failover (smart failover would have
* the same effect, since if the timeout is reached there is no unapplied WAL).
*/
#define NoFailover 0
#define SmartFailover 1
#define FastFailover 2
static int Failover = NoFailover;
#define RESTORE_COMMAND_COPY 0
#define RESTORE_COMMAND_LINK 1
int restoreCommandType;
@ -108,7 +132,6 @@ struct stat stat_buf;
*
* As an example, and probably the common case, we use either
* cp/ln commands on *nix, or copy/move command on Windows.
*
*/
static void
CustomizableInitialize(void)
@ -352,12 +375,16 @@ SetWALFileNameForCleanup(void)
/*
 * CheckForExternalTrigger()
 *
 * Is there a trigger file? Sets global 'Failover' variable to indicate
 * what kind of a trigger file it was. A "fast" trigger file is turned
 * into a "smart" file as a side-effect.
 *
 * 'Failover' is left untouched when no trigger path is configured or the
 * trigger file does not exist. It is also left untouched, and a WARNING is
 * printed to stderr, when the file cannot be opened, cannot be read, or
 * contains something other than "smart" or "fast".
 */
static void
CheckForExternalTrigger(void)
{
	char		buf[32];
	int			fd;
	int			len;

	/*
	 * Look for a trigger file, if that option has been selected
	 *
	 * We use stat() here because triggerPath is always a file rather than
	 * potentially being in an archive
	 */
	if (!triggerPath || stat(triggerPath, &stat_buf) != 0)
		return;

	/*
	 * An empty trigger file performs smart failover. There's a little race
	 * condition here: if the writer of the trigger file has just created
	 * the file, but not yet written anything to it, we'll treat that as
	 * smart failover even if the other process was just about to write
	 * "fast" to it. But that's fine: we'll restore one more WAL file, and
	 * when we're invoked next time, we'll see the word "fast" and fail over
	 * immediately.
	 */
	if (stat_buf.st_size == 0)
	{
		Failover = SmartFailover;
		fprintf(stderr, "trigger file found: smart failover\n");
		fflush(stderr);
		return;
	}

	/* Open read-write: a "fast" trigger file is truncated further down. */
	if ((fd = open(triggerPath, O_RDWR, 0)) < 0)
	{
		fprintf(stderr, "WARNING: could not open \"%s\": %s\n",
				triggerPath, strerror(errno));
		fflush(stderr);
		return;
	}

	/*
	 * Read at most sizeof(buf) - 1 bytes, so that there is always room for
	 * the terminating NUL stored below, however large the file is.
	 */
	if ((len = read(fd, buf, sizeof(buf) - 1)) < 0)
	{
		fprintf(stderr, "WARNING: could not read \"%s\": %s\n",
				triggerPath, strerror(errno));
		fflush(stderr);
		close(fd);
		return;
	}
	buf[len] = '\0';

	/* Only the leading word is compared; anything after it is ignored. */
	if (strncmp(buf, "smart", 5) == 0)
	{
		Failover = SmartFailover;
		fprintf(stderr, "trigger file found: smart failover\n");
		fflush(stderr);
		close(fd);
		return;
	}

	if (strncmp(buf, "fast", 4) == 0)
	{
		Failover = FastFailover;

		fprintf(stderr, "trigger file found: fast failover\n");
		fflush(stderr);

		/*
		 * Turn it into a "smart" trigger by truncating the file. Otherwise
		 * if the server asks us again to restore a segment that was
		 * restored already, we would return "not found" and upset the
		 * server.
		 */
		if (ftruncate(fd, 0) < 0)
		{
			fprintf(stderr, "WARNING: could not truncate \"%s\": %s\n",
					triggerPath, strerror(errno));
			fflush(stderr);
		}
		close(fd);

		return;
	}
	close(fd);

	fprintf(stderr, "WARNING: invalid content in \"%s\"\n", triggerPath);
	fflush(stderr);
	return;
}
/*
@ -402,7 +480,7 @@ RestoreWALFileForRecovery(void)
if (debug)
{
fprintf(stderr, "\nrunning restore :");
fprintf(stderr, "running restore :");
fflush(stderr);
}
@ -413,7 +491,7 @@ RestoreWALFileForRecovery(void)
{
if (debug)
{
fprintf(stderr, " OK");
fprintf(stderr, " OK\n");
fflush(stderr);
}
return true;
@ -425,7 +503,7 @@ RestoreWALFileForRecovery(void)
* Allow caller to add additional info
*/
if (debug)
fprintf(stderr, "not restored : ");
fprintf(stderr, "not restored\n");
return false;
}
@ -552,8 +630,6 @@ main(int argc, char **argv)
break;
case 't': /* Trigger file */
triggerPath = optarg;
if (CheckForExternalTrigger())
exit(1); /* Normal exit, with non-zero */
break;
case 'w': /* Max wait time */
maxwaittime = atoi(optarg);
@ -633,20 +709,20 @@ main(int argc, char **argv)
if (debug)
{
fprintf(stderr, "\nTrigger file : %s", triggerPath ? triggerPath : "<not set>");
fprintf(stderr, "\nWaiting for WAL file : %s", nextWALFileName);
fprintf(stderr, "\nWAL file path : %s", WALFilePath);
fprintf(stderr, "\nRestoring to... : %s", xlogFilePath);
fprintf(stderr, "\nSleep interval : %d second%s",
fprintf(stderr, "Trigger file : %s\n", triggerPath ? triggerPath : "<not set>");
fprintf(stderr, "Waiting for WAL file : %s\n", nextWALFileName);
fprintf(stderr, "WAL file path : %s\n", WALFilePath);
fprintf(stderr, "Restoring to : %s\n", xlogFilePath);
fprintf(stderr, "Sleep interval : %d second%s\n",
sleeptime, (sleeptime > 1 ? "s" : " "));
fprintf(stderr, "\nMax wait interval : %d %s",
fprintf(stderr, "Max wait interval : %d %s\n",
maxwaittime, (maxwaittime > 0 ? "seconds" : "forever"));
fprintf(stderr, "\nCommand for restore : %s", restoreCommand);
fprintf(stderr, "\nKeep archive history : ");
fprintf(stderr, "Command for restore : %s\n", restoreCommand);
fprintf(stderr, "Keep archive history : ");
if (need_cleanup)
fprintf(stderr, "%s and later", exclusiveCleanupFileName);
fprintf(stderr, "%s and later\n", exclusiveCleanupFileName);
else
fprintf(stderr, "No cleanup required");
fprintf(stderr, "No cleanup required\n");
fflush(stderr);
}
@ -676,56 +752,74 @@ main(int argc, char **argv)
/*
* Main wait loop
*/
while (!CustomizableNextWALFileReady() && !triggered)
for (;;)
{
/* Check for trigger file or signal first */
CheckForExternalTrigger();
if (signaled)
{
Failover = FastFailover;
if (debug)
{
fprintf(stderr, "signaled to exit: fast failover\n");
fflush(stderr);
}
}
/*
* Check for fast failover immediately, before checking if the
* requested WAL file is available
*/
if (Failover == FastFailover)
exit(1);
if (CustomizableNextWALFileReady())
{
/*
* Once we have restored this file successfully we can remove some
* prior WAL files. If this restore fails we mustn't remove any file
* because some of them will be requested again immediately after
* the failed restore, or when we restart recovery.
*/
if (RestoreWALFileForRecovery())
{
if (need_cleanup)
CustomizableCleanupPriorWALFiles();
exit(0);
}
else
{
/* Something went wrong in copying the file */
exit(1);
}
}
/* Check for smart failover if the next WAL file was not available */
if (Failover == SmartFailover)
exit(1);
if (sleeptime <= 60)
pg_usleep(sleeptime * 1000000L);
if (signaled)
waittime += sleeptime;
if (waittime >= maxwaittime && maxwaittime > 0)
{
triggered = true;
Failover = FastFailover;
if (debug)
{
fprintf(stderr, "\nsignaled to exit\n");
fprintf(stderr, "Timed out after %d seconds: fast failover\n",
waittime);
fflush(stderr);
}
}
else
if (debug)
{
if (debug)
{
fprintf(stderr, "\nWAL file not present yet.");
if (triggerPath)
fprintf(stderr, " Checking for trigger file...");
fflush(stderr);
}
waittime += sleeptime;
if (!triggered && (CheckForExternalTrigger() || (waittime >= maxwaittime && maxwaittime > 0)))
{
triggered = true;
if (debug && waittime >= maxwaittime && maxwaittime > 0)
fprintf(stderr, "\nTimed out after %d seconds\n", waittime);
}
fprintf(stderr, "WAL file not present yet.");
if (triggerPath)
fprintf(stderr, " Checking for trigger file...");
fprintf(stderr, "\n");
fflush(stderr);
}
}
/*
* Action on exit
*/
if (triggered)
exit(1); /* Normal exit, with non-zero */
/*
* Once we have restored this file successfully we can remove some prior
* WAL files. If this restore fails we musn't remove any file because some
* of them will be requested again immediately after the failed restore,
* or when we restart recovery.
*/
if (RestoreWALFileForRecovery() && need_cleanup)
CustomizableCleanupPriorWALFiles();
return 0;
}

View File

@ -1,4 +1,4 @@
<!-- $PostgreSQL: pgsql/doc/src/sgml/backup.sgml,v 2.125 2009/04/27 16:27:35 momjian Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/backup.sgml,v 2.126 2009/05/14 20:31:09 heikki Exp $ -->
<chapter id="backup">
<title>Backup and Restore</title>
@ -1126,6 +1126,29 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows
</listitem>
</varlistentry>
<varlistentry id="recovery-end-command" xreflabel="recovery_end_command">
<term><varname>recovery_end_command</varname> (<type>string</type>)</term>
<listitem>
<para>
This parameter specifies a shell command that will be executed once only
at the end of recovery. This parameter is optional. The purpose of the
recovery_end_command is to provide a mechanism for cleanup following
replication or recovery.
Any <literal>%r</> is replaced by the name of the file
containing the last valid restart point. That is the earliest file that
must be kept to allow a restore to be restartable, so this information
can be used to truncate the archive to just the minimum required to
support restart of the current restore. <literal>%r</> would only be
used in a warm-standby configuration (see <xref linkend="warm-standby">).
Write <literal>%%</> to embed an actual <literal>%</> character
in the command.
If the command returns a non-zero exit status then a WARNING log
message will be written, unless the failure was caused by a signal,
in which case a FATAL error is raised.
</para>
</listitem>
</varlistentry>
<varlistentry id="recovery-target-time" xreflabel="recovery_target_time">
<term><varname>recovery_target_time</varname>
(<type>timestamp</type>)

View File

@ -1,4 +1,4 @@
<!-- $PostgreSQL: pgsql/doc/src/sgml/pgstandby.sgml,v 2.7 2009/02/27 09:30:21 petere Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/pgstandby.sgml,v 2.8 2009/05/14 20:31:09 heikki Exp $ -->
<sect1 id="pgstandby">
<title>pg_standby</title>
@ -92,6 +92,37 @@ pg_standby <optional> <replaceable>option</> ... </optional> <replaceable>archiv
is specified,
the <replaceable>archivelocation</> directory must be writable too.
</para>
<para>
There are two ways to fail over a <quote>warm standby</> database server.
You control the type of failover with the contents of the trigger file:
<variablelist>
<varlistentry>
<term>Smart Failover</term>
<listitem>
<para>
In smart failover, the server is brought up after applying all
WAL files available in the archive. This results in zero data loss,
even if the standby server has fallen behind, but if there is a lot of
unapplied WAL the recovery can take a long time. To trigger a smart
failover, create a trigger file containing the word <literal>smart</>,
or just leave it empty.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term>Fast Failover</term>
<listitem>
<para>
In fast failover, the server is brought up immediately. Any WAL files
in the archive that have not yet been applied will be ignored, and
all transactions in those files are lost. To trigger a fast failover,
write the word <literal>fast</> into the trigger file.
</para>
</listitem>
</varlistentry>
</variablelist>
</para>
<table>
<title><application>pg_standby</> options</title>
@ -177,8 +208,7 @@ pg_standby <optional> <replaceable>option</> ... </optional> <replaceable>archiv
<entry><literal>-t</> <replaceable>triggerfile</></entry>
<entry>none</entry>
<entry>
Specify a trigger file whose presence should cause recovery to end
whether or not the next WAL file is available.
Specify a trigger file whose presence should cause failover.
It is recommended that you use a structured filename to
avoid confusion as to which server is being triggered
when multiple servers exist on the same system; for example
@ -190,7 +220,7 @@ pg_standby <optional> <replaceable>option</> ... </optional> <replaceable>archiv
<entry>0</entry>
<entry>
Set the maximum number of seconds to wait for the next WAL file,
after which recovery will end and the standby will come up.
after which a fast failover will be performed.
A setting of zero (the default) means wait forever.
The default setting is not necessarily recommended;
consult <xref linkend="warm-standby"> for discussion.
@ -210,6 +240,7 @@ pg_standby <optional> <replaceable>option</> ... </optional> <replaceable>archiv
archive_command = 'cp %p .../archive/%f'
restore_command = 'pg_standby -l -d -s 2 -t /tmp/pgsql.trigger.5442 .../archive %f %p %r 2>>standby.log'
recovery_end_command = 'rm -f /tmp/pgsql.trigger.5442'
</programlisting>
<para>
where the archive directory is physically located on the standby server,
@ -236,7 +267,13 @@ restore_command = 'pg_standby -l -d -s 2 -t /tmp/pgsql.trigger.5442 .../archive
<listitem>
<para>
stop waiting only when a trigger file called
<filename>/tmp/pgsql.trigger.5442</> appears
<filename>/tmp/pgsql.trigger.5442</> appears,
and perform failover according to its content
</para>
</listitem>
<listitem>
<para>
remove the trigger file when recovery ends
</para>
</listitem>
<listitem>
@ -277,7 +314,8 @@ restore_command = 'pg_standby -d -s 5 -t C:\pgsql.trigger.5442 ...\archive %f %p
<listitem>
<para>
stop waiting only when a trigger file called
<filename>C:\pgsql.trigger.5442</> appears
<filename>C:\pgsql.trigger.5442</> appears,
and perform failover according to its content
</para>
</listitem>
<listitem>

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.337 2009/05/07 11:25:25 heikki Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.338 2009/05/14 20:31:09 heikki Exp $
*
*-------------------------------------------------------------------------
*/
@ -147,6 +147,7 @@ static bool restoredFromArchive = false;
/* options taken from recovery.conf */
static char *recoveryRestoreCommand = NULL;
static char *recoveryEndCommand = NULL;
static bool recoveryTarget = false;
static bool recoveryTargetExact = false;
static bool recoveryTargetInclusive = true;
@ -463,6 +464,7 @@ static int XLogFileRead(uint32 log, uint32 seg, int emode);
static void XLogFileClose(void);
static bool RestoreArchivedFile(char *path, const char *xlogfname,
const char *recovername, off_t expectedSize);
static void ExecuteRecoveryEndCommand(void);
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
static void ValidateXLOGDirectoryStructure(void);
@ -2849,6 +2851,114 @@ RestoreArchivedFile(char *path, const char *xlogfname,
return false;
}
/*
* Attempt to execute the recovery_end_command.
*/
static void
ExecuteRecoveryEndCommand(void)
{
	char		cmd[MAXPGPATH];
	char		cutoffFname[MAXPGPATH];
	char	   *out;
	char	   *limit;
	const char *in;
	int			status;
	uint32		cutoffLog;
	uint32		cutoffSeg;

	/* The caller is expected to have checked that a command is configured. */
	Assert(recoveryEndCommand);

	/*
	 * Work out the archive cutoff file name that is substituted for %r.
	 * Files earlier than this one may be removed from the archive, although
	 * nothing requires that.
	 *
	 * Outside of redo we use the file name of an InvalidXLogRecPtr. Because
	 * WAL file names sort alphabetically, that name prevents the deletion of
	 * any WAL file from the archive.
	 *
	 * During redo, the redo pointer of the checkpoint we started from has
	 * been located successfully, and no file prior to the redo pointer of
	 * the last restartpoint is ever requested again. So no additional status
	 * flag is needed to know when deleting from the archive may begin.
	 */
	if (!InRedo)
		XLogFileName(cutoffFname, 0, 0, 0);
	else
	{
		XLByteToSeg(ControlFile->checkPointCopy.redo,
					cutoffLog, cutoffSeg);
		XLogFileName(cutoffFname,
					 ControlFile->checkPointCopy.ThisTimeLineID,
					 cutoffLog, cutoffSeg);
	}

	/*
	 * Expand the configured command into cmd[]. 'limit' points at the last
	 * byte of the buffer, which is reserved for the terminator; output that
	 * does not fit is silently dropped.
	 */
	out = cmd;
	limit = cmd + MAXPGPATH - 1;
	*limit = '\0';

	for (in = recoveryEndCommand; *in != '\0'; in++)
	{
		if (in[0] == '%' && in[1] == 'r')
		{
			/* %r: filename of last restartpoint */
			in++;
			StrNCpy(out, cutoffFname, limit - out);
			out += strlen(out);
			continue;
		}

		/* %% collapses to one %; any other % is copied as it stands */
		if (in[0] == '%' && in[1] == '%')
			in++;

		if (out < limit)
			*out++ = *in;
	}
	*out = '\0';

	ereport(DEBUG3,
			(errmsg_internal("executing recovery end command \"%s\"",
							 cmd)));

	/*
	 * Run the constructed command through the shell.
	 */
	status = system(cmd);
	if (status != 0)
	{
		/*
		 * If the failure was due to any sort of signal, it's best to punt
		 * and abort recovery. See also detailed comments on signals in
		 * RestoreArchivedFile().
		 */
		bool		bySignal = WIFSIGNALED(status) || WEXITSTATUS(status) > 125;

		ereport(bySignal ? FATAL : WARNING,
				(errmsg("recovery_end_command \"%s\": return code %d",
						cmd, status)));
	}
}
/*
* Preallocate log files beyond the specified log endpoint.
*
@ -4664,6 +4774,13 @@ readRecoveryCommandFile(void)
(errmsg("restore_command = '%s'",
recoveryRestoreCommand)));
}
else if (strcmp(tok1, "recovery_end_command") == 0)
{
recoveryEndCommand = pstrdup(tok2);
ereport(LOG,
(errmsg("recovery_end_command = '%s'",
recoveryEndCommand)));
}
else if (strcmp(tok1, "recovery_target_timeline") == 0)
{
rtliGiven = true;
@ -5622,6 +5739,9 @@ StartupXLOG(void)
* allows some extra error checking in xlog_redo.
*/
CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
if (recoveryEndCommand)
ExecuteRecoveryEndCommand();
}
/*