Track last_inactive_time in pg_replication_slots.

This commit adds a new property called last_inactive_time for slots. It is
set to 0 whenever a slot is made active/acquired and set to the current
timestamp whenever the slot is inactive/released or restored from the disk.
Note that we don't set the last_inactive_time for the slots currently being
synced from the primary to the standby because such slots are typically
inactive as decoding is not allowed on those.

The 'last_inactive_time' will be useful on production servers to debug and
analyze inactive replication slots. It will also help to know the lifetime
of a replication slot - one can know how long a streaming standby, logical
subscriber, or replication slot consumer is down.

The 'last_inactive_time' will also be useful to implement inactive
timeout-based replication slot invalidation in a future commit.

Author: Bharath Rupireddy
Reviewed-by: Bertrand Drouvot, Amit Kapila, Shveta Malik
Discussion: https://www.postgresql.org/message-id/CALj2ACW4aUe-_uFQOjdWCEN-xXoLGhmvRFnL8SNw_TZ5nJe+aw@mail.gmail.com
This commit is contained in:
Amit Kapila 2024-03-25 16:34:33 +05:30
parent 0f7863afef
commit a11f330b55
9 changed files with 213 additions and 6 deletions

View File

@ -2523,6 +2523,16 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>last_inactive_time</structfield> <type>timestamptz</type>
</para>
<para>
The time at which the slot became inactive.
<literal>NULL</literal> if the slot is currently being used.
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>conflicting</structfield> <type>bool</type>

View File

@ -1023,6 +1023,7 @@ CREATE VIEW pg_replication_slots AS
L.wal_status,
L.safe_wal_size,
L.two_phase,
L.last_inactive_time,
L.conflicting,
L.invalidation_reason,
L.failover,

View File

@ -409,6 +409,7 @@ ReplicationSlotCreate(const char *name, bool db_specific,
slot->candidate_restart_valid = InvalidXLogRecPtr;
slot->candidate_restart_lsn = InvalidXLogRecPtr;
slot->last_saved_confirmed_flush = InvalidXLogRecPtr;
slot->last_inactive_time = 0;
/*
* Create the slot on disk. We haven't actually marked the slot allocated
@ -622,6 +623,11 @@ retry:
if (SlotIsLogical(s))
pgstat_acquire_replslot(s);
/* Reset the last inactive time as the slot is active now. */
SpinLockAcquire(&s->mutex);
s->last_inactive_time = 0;
SpinLockRelease(&s->mutex);
if (am_walsender)
{
ereport(log_replication_commands ? LOG : DEBUG1,
@ -645,6 +651,7 @@ ReplicationSlotRelease(void)
ReplicationSlot *slot = MyReplicationSlot;
char *slotname = NULL; /* keep compiler quiet */
bool is_logical = false; /* keep compiler quiet */
TimestampTz now = 0;
Assert(slot != NULL && slot->active_pid != 0);
@ -679,6 +686,15 @@ ReplicationSlotRelease(void)
ReplicationSlotsComputeRequiredXmin(false);
}
/*
* Set the last inactive time after marking the slot inactive. We don't set
* it for the slots currently being synced from the primary to the standby
* because such slots are typically inactive as decoding is not allowed on
* those.
*/
if (!(RecoveryInProgress() && slot->data.synced))
now = GetCurrentTimestamp();
if (slot->data.persistency == RS_PERSISTENT)
{
/*
@ -687,9 +703,16 @@ ReplicationSlotRelease(void)
*/
SpinLockAcquire(&slot->mutex);
slot->active_pid = 0;
slot->last_inactive_time = now;
SpinLockRelease(&slot->mutex);
ConditionVariableBroadcast(&slot->active_cv);
}
else
{
SpinLockAcquire(&slot->mutex);
slot->last_inactive_time = now;
SpinLockRelease(&slot->mutex);
}
MyReplicationSlot = NULL;
@ -2342,6 +2365,18 @@ RestoreSlotFromDisk(const char *name)
slot->in_use = true;
slot->active_pid = 0;
/*
* We set the last inactive time after loading the slot from the disk
* into memory. Whoever acquires the slot i.e. makes the slot active
* will reset it. We don't set it for the slots currently being synced
* from the primary to the standby because such slots are typically
* inactive as decoding is not allowed on those.
*/
if (!(RecoveryInProgress() && slot->data.synced))
slot->last_inactive_time = GetCurrentTimestamp();
else
slot->last_inactive_time = 0;
restored = true;
break;
}

View File

@ -239,7 +239,7 @@ pg_drop_replication_slot(PG_FUNCTION_ARGS)
Datum
pg_get_replication_slots(PG_FUNCTION_ARGS)
{
#define PG_GET_REPLICATION_SLOTS_COLS 18
#define PG_GET_REPLICATION_SLOTS_COLS 19
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
XLogRecPtr currlsn;
int slotno;
@ -410,6 +410,11 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
values[i++] = BoolGetDatum(slot_contents.data.two_phase);
if (slot_contents.last_inactive_time > 0)
values[i++] = TimestampTzGetDatum(slot_contents.last_inactive_time);
else
nulls[i++] = true;
cause = slot_contents.data.invalidated;
if (SlotIsPhysical(&slot_contents))

View File

@ -57,6 +57,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 202403222
#define CATALOG_VERSION_NO 202403251
#endif

View File

@ -11133,9 +11133,9 @@
proname => 'pg_get_replication_slots', prorows => '10', proisstrict => 'f',
proretset => 't', provolatile => 's', prorettype => 'record',
proargtypes => '',
proallargtypes => '{name,name,text,oid,bool,bool,int4,xid,xid,pg_lsn,pg_lsn,text,int8,bool,bool,text,bool,bool}',
proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
proargnames => '{slot_name,plugin,slot_type,datoid,temporary,active,active_pid,xmin,catalog_xmin,restart_lsn,confirmed_flush_lsn,wal_status,safe_wal_size,two_phase,conflicting,invalidation_reason,failover,synced}',
proallargtypes => '{name,name,text,oid,bool,bool,int4,xid,xid,pg_lsn,pg_lsn,text,int8,bool,timestamptz,bool,text,bool,bool}',
proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
proargnames => '{slot_name,plugin,slot_type,datoid,temporary,active,active_pid,xmin,catalog_xmin,restart_lsn,confirmed_flush_lsn,wal_status,safe_wal_size,two_phase,last_inactive_time,conflicting,invalidation_reason,failover,synced}',
prosrc => 'pg_get_replication_slots' },
{ oid => '3786', descr => 'set up a logical replication slot',
proname => 'pg_create_logical_replication_slot', provolatile => 'v',

View File

@ -201,6 +201,9 @@ typedef struct ReplicationSlot
* forcibly flushed or not.
*/
XLogRecPtr last_saved_confirmed_flush;
/* The time at which this slot becomes inactive */
TimestampTz last_inactive_time;
} ReplicationSlot;
#define SlotIsPhysical(slot) ((slot)->data.database == InvalidOid)

View File

@ -410,4 +410,156 @@ kill 'CONT', $receiverpid;
$node_primary3->stop;
$node_standby3->stop;
# =============================================================================
# Testcase start: Check last_inactive_time property of the streaming standby's slot
#
# Initialize primary node
my $primary4 = PostgreSQL::Test::Cluster->new('primary4');
$primary4->init(allows_streaming => 'logical');
$primary4->start;
# Take backup
$backup_name = 'my_backup4';
$primary4->backup($backup_name);
# Create a standby linking to the primary using the replication slot
my $standby4 = PostgreSQL::Test::Cluster->new('standby4');
$standby4->init_from_backup($primary4, $backup_name, has_streaming => 1);
my $sb4_slot = 'sb4_slot';
$standby4->append_conf('postgresql.conf', "primary_slot_name = '$sb4_slot'");
my $slot_creation_time = $primary4->safe_psql(
'postgres', qq[
SELECT current_timestamp;
]);
$primary4->safe_psql(
'postgres', qq[
SELECT pg_create_physical_replication_slot(slot_name := '$sb4_slot');
]);
# Get last_inactive_time value after the slot's creation. Note that the slot
# is still inactive till it's used by the standby below.
my $last_inactive_time =
capture_and_validate_slot_last_inactive_time($primary4, $sb4_slot, $slot_creation_time);
$standby4->start;
# Wait until standby has replayed enough data
$primary4->wait_for_catchup($standby4);
# Now the slot is active so last_inactive_time value must be NULL
is( $primary4->safe_psql(
'postgres',
qq[SELECT last_inactive_time IS NULL FROM pg_replication_slots WHERE slot_name = '$sb4_slot';]
),
't',
'last inactive time for an active physical slot is NULL');
# Stop the standby to check its last_inactive_time value is updated
$standby4->stop;
# Let's restart the primary so that the last_inactive_time is set upon
# loading the slot from the disk.
$primary4->restart;
is( $primary4->safe_psql(
'postgres',
qq[SELECT last_inactive_time > '$last_inactive_time'::timestamptz FROM pg_replication_slots WHERE slot_name = '$sb4_slot' AND last_inactive_time IS NOT NULL;]
),
't',
'last inactive time for an inactive physical slot is updated correctly');
$standby4->stop;
# Testcase end: Check last_inactive_time property of the streaming standby's slot
# =============================================================================
# =============================================================================
# Testcase start: Check last_inactive_time property of the logical subscriber's slot
my $publisher4 = $primary4;
# Create subscriber node
my $subscriber4 = PostgreSQL::Test::Cluster->new('subscriber4');
$subscriber4->init;
# Setup logical replication
my $publisher4_connstr = $publisher4->connstr . ' dbname=postgres';
$publisher4->safe_psql('postgres', "CREATE PUBLICATION pub FOR ALL TABLES");
$slot_creation_time = $publisher4->safe_psql(
'postgres', qq[
SELECT current_timestamp;
]);
my $lsub4_slot = 'lsub4_slot';
$publisher4->safe_psql('postgres',
"SELECT pg_create_logical_replication_slot(slot_name := '$lsub4_slot', plugin := 'pgoutput');"
);
# Get last_inactive_time value after the slot's creation. Note that the slot
# is still inactive till it's used by the subscriber below.
$last_inactive_time =
capture_and_validate_slot_last_inactive_time($publisher4, $lsub4_slot, $slot_creation_time);
$subscriber4->start;
$subscriber4->safe_psql('postgres',
"CREATE SUBSCRIPTION sub CONNECTION '$publisher4_connstr' PUBLICATION pub WITH (slot_name = '$lsub4_slot', create_slot = false)"
);
# Wait until subscriber has caught up
$subscriber4->wait_for_subscription_sync($publisher4, 'sub');
# Now the slot is active so last_inactive_time value must be NULL
is( $publisher4->safe_psql(
'postgres',
qq[SELECT last_inactive_time IS NULL FROM pg_replication_slots WHERE slot_name = '$lsub4_slot';]
),
't',
'last inactive time for an active logical slot is NULL');
# Stop the subscriber to check its last_inactive_time value is updated
$subscriber4->stop;
# Let's restart the publisher so that the last_inactive_time is set upon
# loading the slot from the disk.
$publisher4->restart;
is( $publisher4->safe_psql(
'postgres',
qq[SELECT last_inactive_time > '$last_inactive_time'::timestamptz FROM pg_replication_slots WHERE slot_name = '$lsub4_slot' AND last_inactive_time IS NOT NULL;]
),
't',
'last inactive time for an inactive logical slot is updated correctly');
# Testcase end: Check last_inactive_time property of the logical subscriber's slot
# =============================================================================
$publisher4->stop;
$subscriber4->stop;
# Capture and validate last_inactive_time of a given slot.
sub capture_and_validate_slot_last_inactive_time
{
my ($node, $slot_name, $slot_creation_time) = @_;
my $last_inactive_time = $node->safe_psql('postgres',
qq(SELECT last_inactive_time FROM pg_replication_slots
WHERE slot_name = '$slot_name' AND last_inactive_time IS NOT NULL;)
);
# Check that the captured time is sane
is( $node->safe_psql(
'postgres',
qq[SELECT '$last_inactive_time'::timestamptz > to_timestamp(0) AND
'$last_inactive_time'::timestamptz >= '$slot_creation_time'::timestamptz;]
),
't',
"last inactive time for an active slot $slot_name is sane");
return $last_inactive_time;
}
done_testing();

View File

@ -1473,11 +1473,12 @@ pg_replication_slots| SELECT l.slot_name,
l.wal_status,
l.safe_wal_size,
l.two_phase,
l.last_inactive_time,
l.conflicting,
l.invalidation_reason,
l.failover,
l.synced
FROM (pg_get_replication_slots() l(slot_name, plugin, slot_type, datoid, temporary, active, active_pid, xmin, catalog_xmin, restart_lsn, confirmed_flush_lsn, wal_status, safe_wal_size, two_phase, conflicting, invalidation_reason, failover, synced)
FROM (pg_get_replication_slots() l(slot_name, plugin, slot_type, datoid, temporary, active, active_pid, xmin, catalog_xmin, restart_lsn, confirmed_flush_lsn, wal_status, safe_wal_size, two_phase, last_inactive_time, conflicting, invalidation_reason, failover, synced)
LEFT JOIN pg_database d ON ((l.datoid = d.oid)));
pg_roles| SELECT pg_authid.rolname,
pg_authid.rolsuper,