Reduce pg_ctl's reaction time when waiting for postmaster start/stop.

pg_ctl has traditionally waited one second between probes for whether
the start or stop request has completed.  That behavior was embodied
in the original shell script written in 1999 (commit 5b912b089) and
I doubt anyone's questioned it since.  Nowadays, machines are a lot
faster, and the shell script is long since replaced by C code, so it's
fair to reconsider how long we ought to wait.

This patch adjusts the coding so that the wait time can be any even
divisor of 1 second, and sets the actual probe rate to 10 per second.
That's based on experimentation with the src/test/recovery TAP tests,
which include a lot of postmaster starts and stops.  This patch alone
reduces the (non-parallelized) runtime of those tests from ~4m30s to
~3m5s on my machine.  Increasing the probe rate further doesn't help
much, so this seems like a good number.

In the real world this probably won't have much impact, since people
don't start/stop production postmasters often, and the shutdown checkpoint
usually takes nontrivial time too.  But it makes development work and
testing noticeably snappier, and that's good enough reason for me.

Also, by reducing the dead time in postmaster restart sequences, this
change has made it easier to reproduce some bugs that have been lurking
for awhile.  Patches for those will follow.

Discussion: https://postgr.es/m/18444.1498428798@sss.pgh.pa.us
This commit is contained in:
Tom Lane 2017-06-26 15:13:23 -04:00
parent 5c77690f6f
commit c61559ec3a
1 changed files with 37 additions and 26 deletions

View File

@ -68,6 +68,10 @@ typedef enum
#define DEFAULT_WAIT 60
#define USEC_PER_SEC 1000000
#define WAITS_PER_SEC 10 /* should divide USEC_PER_SEC evenly */
static bool do_wait = true;
static int wait_seconds = DEFAULT_WAIT;
static bool wait_seconds_arg = false;
@ -531,7 +535,7 @@ test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint)
connstr[0] = '\0';
for (i = 0; i < wait_seconds; i++)
for (i = 0; i < wait_seconds * WAITS_PER_SEC; i++)
{
/* Do we need a connection string? */
if (connstr[0] == '\0')
@ -701,24 +705,28 @@ test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint)
#endif
/* No response, or startup still in process; wait */
#ifdef WIN32
if (do_checkpoint)
if (i % WAITS_PER_SEC == 0)
{
/*
* Increment the wait hint by 6 secs (connection timeout + sleep)
* We must do this to indicate to the SCM that our startup time is
* changing, otherwise it'll usually send a stop signal after 20
* seconds, despite incrementing the checkpoint counter.
*/
status.dwWaitHint += 6000;
status.dwCheckPoint++;
SetServiceStatus(hStatus, (LPSERVICE_STATUS) &status);
}
else
#ifdef WIN32
if (do_checkpoint)
{
/*
* Increment the wait hint by 6 secs (connection timeout +
* sleep). We must do this to indicate to the SCM that our
* startup time is changing, otherwise it'll usually send a
* stop signal after 20 seconds, despite incrementing the
* checkpoint counter.
*/
status.dwWaitHint += 6000;
status.dwCheckPoint++;
SetServiceStatus(hStatus, (LPSERVICE_STATUS) &status);
}
else
#endif
print_msg(".");
print_msg(".");
}
pg_usleep(1000000); /* 1 sec */
pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
}
/* return result of last call to PQping */
@ -998,12 +1006,13 @@ do_stop(void)
print_msg(_("waiting for server to shut down..."));
for (cnt = 0; cnt < wait_seconds; cnt++)
for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
{
if ((pid = get_pgpid(false)) != 0)
{
print_msg(".");
pg_usleep(1000000); /* 1 sec */
if (cnt % WAITS_PER_SEC == 0)
print_msg(".");
pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
}
else
break;
@ -1088,12 +1097,13 @@ do_restart(void)
/* always wait for restart */
for (cnt = 0; cnt < wait_seconds; cnt++)
for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
{
if ((pid = get_pgpid(false)) != 0)
{
print_msg(".");
pg_usleep(1000000); /* 1 sec */
if (cnt % WAITS_PER_SEC == 0)
print_msg(".");
pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
}
else
break;
@ -1225,17 +1235,18 @@ do_promote(void)
if (do_wait)
{
DBState state = DB_STARTUP;
int cnt;
print_msg(_("waiting for server to promote..."));
while (wait_seconds > 0)
for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
{
state = get_control_dbstate();
if (state == DB_IN_PRODUCTION)
break;
print_msg(".");
pg_usleep(1000000); /* 1 sec */
wait_seconds--;
if (cnt % WAITS_PER_SEC == 0)
print_msg(".");
pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
}
if (state == DB_IN_PRODUCTION)
{