Use pg_rewind when target timeline was switched

Allow pg_rewind to work when the target timeline was switched. Now a
user can return a promoted standby to the old master.

Target timeline history becomes a global variable. Index
in target timeline history is used in function interfaces instead of
specifying TLI directly. Thus, SimpleXLogPageRead() can easily start
reading XLOGs from next timeline when current timeline ends.

Author: Alexander Korotkov
Review: Michael Paquier
This commit is contained in:
Teodor Sigaev 2015-12-01 18:56:44 +03:00
parent 0e0776bc99
commit e50cda7840
5 changed files with 181 additions and 77 deletions

View File

@ -61,13 +61,17 @@ PostgreSQL documentation
<application>pg_rewind</> examines the timeline histories of the source
and target clusters to determine the point where they diverged, and
expects to find WAL in the target cluster's <filename>pg_xlog</> directory
reaching all the way back to the point of divergence. In the typical
failover scenario where the target cluster was shut down soon after the
divergence, that is not a problem, but if the target cluster had run for a
long time after the divergence, the old WAL files might not be present
anymore. In that case, they can be manually copied from the WAL archive to
the <filename>pg_xlog</> directory. Fetching missing files from a WAL
archive automatically is currently not supported.
reaching all the way back to the point of divergence. The point of divergence
can be found either on the target timeline, the source timeline, or their common
ancestor. In the typical failover scenario where the target cluster was
shut down soon after the divergence, that is not a problem, but if the
target cluster had run for a long time after the divergence, the old WAL
files might not be present anymore. In that case, they can be manually
copied from the WAL archive to the <filename>pg_xlog</> directory. Fetching
missing files from a WAL archive automatically is currently not supported.
Besides, <application>pg_rewind</> use cases are not limited to failover.
For instance, a standby server could be promoted, run some writes and
then be returned back as a standby.
</para>
<para>

View File

@ -8,7 +8,7 @@
#
#-------------------------------------------------------------------------
PGFILEDESC = "pg_rewind - repurpose an old master server as standby"
PGFILEDESC = "pg_rewind - synchronize a data directory with another one forked from"
PGAPPICON = win32
subdir = src/bin/pg_rewind

View File

@ -45,7 +45,7 @@ static char xlogfpath[MAXPGPATH];
typedef struct XLogPageReadPrivate
{
const char *datadir;
TimeLineID tli;
int tliIndex;
} XLogPageReadPrivate;
static int SimpleXLogPageRead(XLogReaderState *xlogreader,
@ -55,11 +55,11 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader,
/*
* Read WAL from the datadir/pg_xlog, starting from 'startpoint' on timeline
* 'tli', until 'endpoint'. Make note of the data blocks touched by the WAL
* records, and return them in a page map.
* index 'tliIndex' in target timeline history, until 'endpoint'. Make note of
* the data blocks touched by the WAL records, and return them in a page map.
*/
void
extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli,
extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
XLogRecPtr endpoint)
{
XLogRecord *record;
@ -68,7 +68,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli,
XLogPageReadPrivate private;
private.datadir = datadir;
private.tli = tli;
private.tliIndex = tliIndex;
xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private);
if (xlogreader == NULL)
pg_fatal("out of memory\n");
@ -112,7 +112,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli,
* doing anything with the record itself.
*/
XLogRecPtr
readOneRecord(const char *datadir, XLogRecPtr ptr, TimeLineID tli)
readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex)
{
XLogRecord *record;
XLogReaderState *xlogreader;
@ -121,7 +121,7 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, TimeLineID tli)
XLogRecPtr endptr;
private.datadir = datadir;
private.tli = tli;
private.tliIndex = tliIndex;
xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private);
if (xlogreader == NULL)
pg_fatal("out of memory\n");
@ -152,7 +152,7 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, TimeLineID tli)
* Find the previous checkpoint preceding given WAL position.
*/
void
findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, TimeLineID tli,
findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli,
XLogRecPtr *lastchkptredo)
{
@ -173,7 +173,7 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, TimeLineID tli,
forkptr += (forkptr % XLogSegSize == 0) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD;
private.datadir = datadir;
private.tli = tli;
private.tliIndex = tliIndex;
xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private);
if (xlogreader == NULL)
pg_fatal("out of memory\n");
@ -236,9 +236,11 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
{
XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
uint32 targetPageOff;
XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
XLogRecPtr targetSegEnd;
XLogSegNo targetSegNo;
XLByteToSeg(targetPagePtr, targetSegNo);
XLogSegNoOffsetToRecPtr(targetSegNo + 1, 0, targetSegEnd);
targetPageOff = targetPagePtr % XLogSegSize;
/*
@ -257,7 +259,20 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
{
char xlogfname[MAXFNAMELEN];
XLogFileName(xlogfname, private->tli, xlogreadsegno);
/*
* Since incomplete segments are copied into next timelines, switch to
* the timeline holding the required segment. Assuming this scan can be
* done both forward and backward, consider also switching timeline
* accordingly.
*/
while (private->tliIndex < targetNentries - 1 &&
targetHistory[private->tliIndex].end < targetSegEnd)
private->tliIndex++;
while (private->tliIndex > 0 &&
targetHistory[private->tliIndex].begin >= targetSegEnd)
private->tliIndex--;
XLogFileName(xlogfname, targetHistory[private->tliIndex].tli, xlogreadsegno);
snprintf(xlogfpath, MAXPGPATH, "%s/" XLOGDIR "/%s", private->datadir, xlogfname);
@ -293,7 +308,7 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
Assert(targetSegNo == xlogreadsegno);
*pageTLI = private->tli;
*pageTLI = targetHistory[private->tliIndex].tli;
return XLOG_BLCKSZ;
}

View File

@ -1,7 +1,7 @@
/*-------------------------------------------------------------------------
*
* pg_rewind.c
* Synchronizes an old master server to a new timeline
* Synchronizes a PostgreSQL data directory to a new timeline
*
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
*
@ -37,7 +37,7 @@ static void digestControlFile(ControlFileData *ControlFile, char *source,
size_t size);
static void updateControlFile(ControlFileData *ControlFile);
static void sanityChecks(void);
static void findCommonAncestorTimeline(XLogRecPtr *recptr, TimeLineID *tli);
static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
static ControlFileData ControlFile_target;
static ControlFileData ControlFile_source;
@ -53,6 +53,10 @@ bool debug = false;
bool showprogress = false;
bool dry_run = false;
/* Target history */
TimeLineHistoryEntry *targetHistory;
int targetNentries;
static void
usage(const char *progname)
{
@ -88,7 +92,7 @@ main(int argc, char **argv)
int option_index;
int c;
XLogRecPtr divergerec;
TimeLineID lastcommontli;
int lastcommontliIndex;
XLogRecPtr chkptrec;
TimeLineID chkpttli;
XLogRecPtr chkptredo;
@ -214,9 +218,10 @@ main(int argc, char **argv)
if (ControlFile_target.checkPointCopy.ThisTimeLineID == ControlFile_source.checkPointCopy.ThisTimeLineID)
pg_fatal("source and target cluster are on the same timeline\n");
findCommonAncestorTimeline(&divergerec, &lastcommontli);
findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
printf(_("servers diverged at WAL position %X/%X on timeline %u\n"),
(uint32) (divergerec >> 32), (uint32) divergerec, lastcommontli);
(uint32) (divergerec >> 32), (uint32) divergerec,
targetHistory[lastcommontliIndex].tli);
/*
* Check for the possibility that the target is in fact a direct ancestor
@ -234,7 +239,7 @@ main(int argc, char **argv)
/* Read the checkpoint record on the target to see where it ends. */
chkptendrec = readOneRecord(datadir_target,
ControlFile_target.checkPoint,
ControlFile_target.checkPointCopy.ThisTimeLineID);
targetNentries - 1);
/*
* If the histories diverged exactly at the end of the shutdown
@ -254,7 +259,8 @@ main(int argc, char **argv)
exit(0);
}
findLastCheckpoint(datadir_target, divergerec, lastcommontli,
findLastCheckpoint(datadir_target, divergerec,
lastcommontliIndex,
&chkptrec, &chkpttli, &chkptredo);
printf(_("rewinding from last common checkpoint at %X/%X on timeline %u\n"),
(uint32) (chkptrec >> 32), (uint32) chkptrec,
@ -277,7 +283,7 @@ main(int argc, char **argv)
* we would need to replay until the end of WAL here.
*/
pg_log(PG_PROGRESS, "reading WAL in target\n");
extractPageMap(datadir_target, chkptrec, lastcommontli,
extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
ControlFile_target.checkPoint);
filemap_finalize();
@ -374,10 +380,11 @@ sanityChecks(void)
/*
* Target cluster better not be running. This doesn't guard against
* someone starting the cluster concurrently. Also, this is probably more
* strict than necessary; it's OK if the master was not shut down cleanly,
* as long as it isn't running at the moment.
* strict than necessary; it's OK if the target node was not shut down
* cleanly, as long as it isn't running at the moment.
*/
if (ControlFile_target.state != DB_SHUTDOWNED)
if (ControlFile_target.state != DB_SHUTDOWNED &&
ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
pg_fatal("target server must be shut down cleanly\n");
/*
@ -385,75 +392,149 @@ sanityChecks(void)
* server is shut down. There isn't any very strong reason for this
* limitation, but better safe than sorry.
*/
if (datadir_source && ControlFile_source.state != DB_SHUTDOWNED)
if (datadir_source &&
ControlFile_source.state != DB_SHUTDOWNED &&
ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY)
pg_fatal("source data directory must be shut down cleanly\n");
}
/*
* Determine the TLI of the last common timeline in the histories of the two
* clusters. *tli is set to the last common timeline, and *recptr is set to
* the position where the histories diverged (ie. the first WAL record that's
* not the same in both clusters).
*
* Control files of both clusters must be read into ControlFile_target/source
* before calling this.
* Find minimum from two XLOG positions assuming InvalidXLogRecPtr means
* infinity as src/include/access/timeline.h states. This routine should
* be used only when comparing XLOG positions related to history files.
*/
static void
findCommonAncestorTimeline(XLogRecPtr *recptr, TimeLineID *tli)
static XLogRecPtr
MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
{
TimeLineID targettli;
TimeLineHistoryEntry *sourceHistory;
int nentries;
int i;
TimeLineID sourcetli;
if (XLogRecPtrIsInvalid(a))
return b;
else if (XLogRecPtrIsInvalid(b))
return a;
else
return Min(a, b);
}
targettli = ControlFile_target.checkPointCopy.ThisTimeLineID;
sourcetli = ControlFile_source.checkPointCopy.ThisTimeLineID;
/*
* Retrieve the timeline history for the given control file, which must be
* either ControlFile_source or ControlFile_target.
*/
static TimeLineHistoryEntry *
getTimelineHistory(ControlFileData *controlFile, int *nentries)
{
TimeLineHistoryEntry *history;
TimeLineID tli;
/* Timeline 1 does not have a history file, so no need to check */
if (sourcetli == 1)
tli = controlFile->checkPointCopy.ThisTimeLineID;
/*
* Timeline 1 does not have a history file, so there is nothing to read;
* fake a single entry with infinite start and end positions instead.
*/
if (tli == 1)
{
sourceHistory = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
sourceHistory->tli = sourcetli;
sourceHistory->begin = sourceHistory->end = InvalidXLogRecPtr;
nentries = 1;
history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
history->tli = tli;
history->begin = history->end = InvalidXLogRecPtr;
*nentries = 1;
}
else
{
char path[MAXPGPATH];
char *histfile;
TLHistoryFilePath(path, sourcetli);
histfile = fetchFile(path, NULL);
TLHistoryFilePath(path, tli);
sourceHistory = rewind_parseTimeLineHistory(histfile,
ControlFile_source.checkPointCopy.ThisTimeLineID,
&nentries);
/* Get history file from appropriate source */
if (controlFile == &ControlFile_source)
histfile = fetchFile(path, NULL);
else if (controlFile == &ControlFile_target)
histfile = slurpFile(datadir_target, path, NULL);
else
pg_fatal("Invalid control file");
history = rewind_parseTimeLineHistory(histfile, tli, nentries);
pg_free(histfile);
}
/*
* Trace the history backwards, until we hit the target timeline.
*
* TODO: This assumes that there are no timeline switches on the target
* cluster after the fork.
*/
for (i = nentries - 1; i >= 0; i--)
if (debug)
{
TimeLineHistoryEntry *entry = &sourceHistory[i];
int i;
if (entry->tli == targettli)
if (controlFile == &ControlFile_source)
printf("Source timeline history:\n");
else if (controlFile == &ControlFile_target)
printf("Target timeline history:\n");
else
Assert(false);
/*
* Print the target timeline history.
*/
for (i = 0; i < targetNentries; i++)
{
/* found it */
*recptr = entry->end;
*tli = entry->tli;
TimeLineHistoryEntry *entry;
pg_free(sourceHistory);
return;
entry = &history[i];
printf("%d: %X/%X - %X/%X\n", entry->tli,
(uint32) (entry->begin >> 32), (uint32) (entry->begin),
(uint32) (entry->end >> 32), (uint32) (entry->end));
}
}
pg_fatal("could not find common ancestor of the source and target cluster's timelines\n");
return history;
}
/*
 * Locate the last timeline shared by the source and target clusters.
 *
 * Both timeline histories are loaded here; the target's history is kept in
 * the globals targetHistory/targetNentries, while the source's history is
 * released before returning.
 *
 * On success:
 *   *tliIndex  receives the index, within targetHistory, of the last entry
 *              that both histories have in common;
 *   *recptr    receives the point where the histories diverge, i.e. the
 *              smaller of the two 'end' positions of that common entry, as
 *              computed by MinXLogRecPtr().
 *
 * If not even the first history entries agree, the failure is reported
 * through pg_fatal().
 *
 * ControlFile_source and ControlFile_target must already be populated,
 * since getTimelineHistory() is handed both of them.
 */
static void
findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
{
	TimeLineHistoryEntry *sourceHistory;
	int			sourceNentries;
	int			limit;
	int			common;

	/* Load the source history first, then the (global) target history. */
	sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries);
	targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries);

	/*
	 * Walk both histories from the oldest entry forward and count how many
	 * leading entries agree.  Two entries agree only when both the timeline
	 * ID and the start position match: the same timeline number may appear
	 * in both histories with different start positions, depending on which
	 * history files each node fetched during earlier recoveries.
	 */
	limit = Min(sourceNentries, targetNentries);
	common = 0;
	while (common < limit &&
		   sourceHistory[common].tli == targetHistory[common].tli &&
		   sourceHistory[common].begin == targetHistory[common].begin)
		common++;

	if (common == 0)
	{
		pg_fatal("could not find common ancestor of the source and target cluster's timelines\n");
		/* Keep control flow unchanged should pg_fatal() ever return. */
		return;
	}

	/* Step back to the last entry that matched in both histories. */
	common--;
	*recptr = MinXLogRecPtr(sourceHistory[common].end,
							targetHistory[common].end);
	*tliIndex = common;

	pg_free(sourceHistory);
}

View File

@ -27,15 +27,19 @@ extern bool debug;
extern bool showprogress;
extern bool dry_run;
/* Target history */
extern TimeLineHistoryEntry *targetHistory;
extern int targetNentries;
/* in parsexlog.c */
extern void extractPageMap(const char *datadir, XLogRecPtr startpoint,
TimeLineID tli, XLogRecPtr endpoint);
int tliIndex, XLogRecPtr endpoint);
extern void findLastCheckpoint(const char *datadir, XLogRecPtr searchptr,
TimeLineID tli,
int tliIndex,
XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli,
XLogRecPtr *lastchkptredo);
extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr,
TimeLineID tli);
int tliIndex);
/* in timeline.c */
extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer,