Track invalidation_reason in pg_replication_slots.

Until now, the reason for a replication slot's invalidation has not been
tracked directly in pg_replication_slots. The recent commit 007693f2a3 added
'conflict_reason' to show the reasons for slot conflict/invalidation, but
only for logical slots.

This commit adds a new column 'invalidation_reason' to show invalidation
reasons for both physical and logical slots. It also turns the
'conflict_reason' text column into a 'conflicting' boolean column
(effectively reverting commit 007693f2a3). The 'conflicting' column is true
for invalidation reasons 'rows_removed' and 'wal_level_insufficient' because
those make the slot conflict with recovery. When 'conflicting' is true,
one can now look at the new 'invalidation_reason' column for the reason
for the logical slot's conflict with recovery.

The new 'invalidation_reason' column will also be useful for tracking other
invalidation reasons in future commits.

Author: Bharath Rupireddy
Reviewed-by: Bertrand Drouvot, Amit Kapila, Shveta Malik
Discussion: https://www.postgresql.org/message-id/ZfR7HuzFEswakt/a%40ip-10-97-1-34.eu-west-3.compute.internal
Discussion: https://www.postgresql.org/message-id/CALj2ACW4aUe-_uFQOjdWCEN-xXoLGhmvRFnL8SNw_TZ5nJe+aw@mail.gmail.com
This commit is contained in:
Amit Kapila 2024-03-22 13:52:05 +05:30
parent b4080fa3dc
commit 6ae701b437
13 changed files with 94 additions and 72 deletions

View File

@ -453,8 +453,8 @@ make prefix=/usr/local/pgsql.new install
<para>
All slots on the old cluster must be usable, i.e., there are no slots
whose
<link linkend="view-pg-replication-slots">pg_replication_slots</link>.<structfield>conflict_reason</structfield>
is not <literal>NULL</literal>.
<link linkend="view-pg-replication-slots">pg_replication_slots</link>.<structfield>conflicting</structfield>
is <literal>true</literal>.
</para>
</listitem>
<listitem>

View File

@ -2525,13 +2525,24 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>conflict_reason</structfield> <type>text</type>
<structfield>conflicting</structfield> <type>bool</type>
</para>
<para>
The reason for the logical slot's conflict with recovery. It is always
NULL for physical slots, as well as for logical slots which are not
invalidated. The non-NULL values indicate that the slot is marked
as invalidated. Possible values are:
True if this logical slot conflicted with recovery (and so is now
invalidated). When this column is true, check
<structfield>invalidation_reason</structfield> column for the conflict
reason. Always NULL for physical slots.
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>invalidation_reason</structfield> <type>text</type>
</para>
<para>
The reason for the slot's invalidation. It is set for both logical and
physical slots. <literal>NULL</literal> if the slot is not invalidated.
Possible values are:
<itemizedlist spacing="compact">
<listitem>
<para>
@ -2542,14 +2553,14 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
<listitem>
<para>
<literal>rows_removed</literal> means that the required rows have
been removed.
been removed. It is set only for logical slots.
</para>
</listitem>
<listitem>
<para>
<literal>wal_level_insufficient</literal> means that the
primary doesn't have a <xref linkend="guc-wal-level"/> sufficient to
perform logical decoding.
perform logical decoding. It is set only for logical slots.
</para>
</listitem>
</itemizedlist>

View File

@ -1023,7 +1023,8 @@ CREATE VIEW pg_replication_slots AS
L.wal_status,
L.safe_wal_size,
L.two_phase,
L.conflict_reason,
L.conflicting,
L.invalidation_reason,
L.failover,
L.synced
FROM pg_get_replication_slots() AS L

View File

@ -663,7 +663,7 @@ synchronize_slots(WalReceiverConn *wrconn)
bool started_tx = false;
const char *query = "SELECT slot_name, plugin, confirmed_flush_lsn,"
" restart_lsn, catalog_xmin, two_phase, failover,"
" database, conflict_reason"
" database, invalidation_reason"
" FROM pg_catalog.pg_replication_slots"
" WHERE failover and NOT temporary";

View File

@ -1525,14 +1525,14 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
XLogRecPtr initial_effective_xmin = InvalidXLogRecPtr;
XLogRecPtr initial_catalog_effective_xmin = InvalidXLogRecPtr;
XLogRecPtr initial_restart_lsn = InvalidXLogRecPtr;
ReplicationSlotInvalidationCause conflict_prev PG_USED_FOR_ASSERTS_ONLY = RS_INVAL_NONE;
ReplicationSlotInvalidationCause invalidation_cause_prev PG_USED_FOR_ASSERTS_ONLY = RS_INVAL_NONE;
for (;;)
{
XLogRecPtr restart_lsn;
NameData slotname;
int active_pid = 0;
ReplicationSlotInvalidationCause conflict = RS_INVAL_NONE;
ReplicationSlotInvalidationCause invalidation_cause = RS_INVAL_NONE;
Assert(LWLockHeldByMeInMode(ReplicationSlotControlLock, LW_SHARED));
@ -1554,17 +1554,14 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
restart_lsn = s->data.restart_lsn;
/*
* If the slot is already invalid or is a non conflicting slot, we
* don't need to do anything.
*/
/* we do nothing if the slot is already invalid */
if (s->data.invalidated == RS_INVAL_NONE)
{
/*
* The slot's mutex will be released soon, and it is possible that
* those values change since the process holding the slot has been
* terminated (if any), so record them here to ensure that we
* would report the correct conflict cause.
* would report the correct invalidation cause.
*/
if (!terminated)
{
@ -1578,7 +1575,7 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
case RS_INVAL_WAL_REMOVED:
if (initial_restart_lsn != InvalidXLogRecPtr &&
initial_restart_lsn < oldestLSN)
conflict = cause;
invalidation_cause = cause;
break;
case RS_INVAL_HORIZON:
if (!SlotIsLogical(s))
@ -1589,15 +1586,15 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
if (TransactionIdIsValid(initial_effective_xmin) &&
TransactionIdPrecedesOrEquals(initial_effective_xmin,
snapshotConflictHorizon))
conflict = cause;
invalidation_cause = cause;
else if (TransactionIdIsValid(initial_catalog_effective_xmin) &&
TransactionIdPrecedesOrEquals(initial_catalog_effective_xmin,
snapshotConflictHorizon))
conflict = cause;
invalidation_cause = cause;
break;
case RS_INVAL_WAL_LEVEL:
if (SlotIsLogical(s))
conflict = cause;
invalidation_cause = cause;
break;
case RS_INVAL_NONE:
pg_unreachable();
@ -1605,14 +1602,14 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
}
/*
* The conflict cause recorded previously should not change while the
* process owning the slot (if any) has been terminated.
* The invalidation cause recorded previously should not change while
* the process owning the slot (if any) has been terminated.
*/
Assert(!(conflict_prev != RS_INVAL_NONE && terminated &&
conflict_prev != conflict));
Assert(!(invalidation_cause_prev != RS_INVAL_NONE && terminated &&
invalidation_cause_prev != invalidation_cause));
/* if there's no conflict, we're done */
if (conflict == RS_INVAL_NONE)
/* if there's no invalidation, we're done */
if (invalidation_cause == RS_INVAL_NONE)
{
SpinLockRelease(&s->mutex);
if (released_lock)
@ -1632,13 +1629,13 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
{
MyReplicationSlot = s;
s->active_pid = MyProcPid;
s->data.invalidated = conflict;
s->data.invalidated = invalidation_cause;
/*
* XXX: We should consider not overwriting restart_lsn and instead
* just rely on .invalidated.
*/
if (conflict == RS_INVAL_WAL_REMOVED)
if (invalidation_cause == RS_INVAL_WAL_REMOVED)
s->data.restart_lsn = InvalidXLogRecPtr;
/* Let caller know */
@ -1681,7 +1678,7 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
*/
if (last_signaled_pid != active_pid)
{
ReportSlotInvalidation(conflict, true, active_pid,
ReportSlotInvalidation(invalidation_cause, true, active_pid,
slotname, restart_lsn,
oldestLSN, snapshotConflictHorizon);
@ -1694,7 +1691,7 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
last_signaled_pid = active_pid;
terminated = true;
conflict_prev = conflict;
invalidation_cause_prev = invalidation_cause;
}
/* Wait until the slot is released. */
@ -1727,7 +1724,7 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
ReplicationSlotSave();
ReplicationSlotRelease();
ReportSlotInvalidation(conflict, false, active_pid,
ReportSlotInvalidation(invalidation_cause, false, active_pid,
slotname, restart_lsn,
oldestLSN, snapshotConflictHorizon);
@ -2356,21 +2353,21 @@ RestoreSlotFromDisk(const char *name)
}
/*
* Maps a conflict reason for a replication slot to
* Maps an invalidation reason for a replication slot to
* ReplicationSlotInvalidationCause.
*/
ReplicationSlotInvalidationCause
GetSlotInvalidationCause(const char *conflict_reason)
GetSlotInvalidationCause(const char *invalidation_reason)
{
ReplicationSlotInvalidationCause cause;
ReplicationSlotInvalidationCause result = RS_INVAL_NONE;
bool found PG_USED_FOR_ASSERTS_ONLY = false;
Assert(conflict_reason);
Assert(invalidation_reason);
for (cause = RS_INVAL_NONE; cause <= RS_INVAL_MAX_CAUSES; cause++)
{
if (strcmp(SlotInvalidationCauses[cause], conflict_reason) == 0)
if (strcmp(SlotInvalidationCauses[cause], invalidation_reason) == 0)
{
found = true;
result = cause;

View File

@ -239,7 +239,7 @@ pg_drop_replication_slot(PG_FUNCTION_ARGS)
Datum
pg_get_replication_slots(PG_FUNCTION_ARGS)
{
#define PG_GET_REPLICATION_SLOTS_COLS 17
#define PG_GET_REPLICATION_SLOTS_COLS 18
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
XLogRecPtr currlsn;
int slotno;
@ -263,6 +263,7 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
bool nulls[PG_GET_REPLICATION_SLOTS_COLS];
WALAvailability walstate;
int i;
ReplicationSlotInvalidationCause cause;
if (!slot->in_use)
continue;
@ -409,18 +410,28 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
values[i++] = BoolGetDatum(slot_contents.data.two_phase);
if (slot_contents.data.database == InvalidOid)
cause = slot_contents.data.invalidated;
if (SlotIsPhysical(&slot_contents))
nulls[i++] = true;
else
{
ReplicationSlotInvalidationCause cause = slot_contents.data.invalidated;
if (cause == RS_INVAL_NONE)
nulls[i++] = true;
/*
* rows_removed and wal_level_insufficient are the only two
* reasons for the logical slot's conflict with recovery.
*/
if (cause == RS_INVAL_HORIZON ||
cause == RS_INVAL_WAL_LEVEL)
values[i++] = BoolGetDatum(true);
else
values[i++] = CStringGetTextDatum(SlotInvalidationCauses[cause]);
values[i++] = BoolGetDatum(false);
}
if (cause == RS_INVAL_NONE)
nulls[i++] = true;
else
values[i++] = CStringGetTextDatum(SlotInvalidationCauses[cause]);
values[i++] = BoolGetDatum(slot_contents.data.failover);
values[i++] = BoolGetDatum(slot_contents.data.synced);

View File

@ -676,13 +676,13 @@ get_old_cluster_logical_slot_infos(DbInfo *dbinfo, bool live_check)
* removed.
*/
res = executeQueryOrDie(conn, "SELECT slot_name, plugin, two_phase, failover, "
"%s as caught_up, conflict_reason IS NOT NULL as invalid "
"%s as caught_up, invalidation_reason IS NOT NULL as invalid "
"FROM pg_catalog.pg_replication_slots "
"WHERE slot_type = 'logical' AND "
"database = current_database() AND "
"temporary IS FALSE;",
live_check ? "FALSE" :
"(CASE WHEN conflict_reason IS NOT NULL THEN FALSE "
"(CASE WHEN invalidation_reason IS NOT NULL THEN FALSE "
"ELSE (SELECT pg_catalog.binary_upgrade_logical_slot_has_caught_up(slot_name)) "
"END)");

View File

@ -57,6 +57,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 202403221
#define CATALOG_VERSION_NO 202403222
#endif

View File

@ -11133,9 +11133,9 @@
proname => 'pg_get_replication_slots', prorows => '10', proisstrict => 'f',
proretset => 't', provolatile => 's', prorettype => 'record',
proargtypes => '',
proallargtypes => '{name,name,text,oid,bool,bool,int4,xid,xid,pg_lsn,pg_lsn,text,int8,bool,text,bool,bool}',
proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
proargnames => '{slot_name,plugin,slot_type,datoid,temporary,active,active_pid,xmin,catalog_xmin,restart_lsn,confirmed_flush_lsn,wal_status,safe_wal_size,two_phase,conflict_reason,failover,synced}',
proallargtypes => '{name,name,text,oid,bool,bool,int4,xid,xid,pg_lsn,pg_lsn,text,int8,bool,bool,text,bool,bool}',
proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
proargnames => '{slot_name,plugin,slot_type,datoid,temporary,active,active_pid,xmin,catalog_xmin,restart_lsn,confirmed_flush_lsn,wal_status,safe_wal_size,two_phase,conflicting,invalidation_reason,failover,synced}',
prosrc => 'pg_get_replication_slots' },
{ oid => '3786', descr => 'set up a logical replication slot',
proname => 'pg_create_logical_replication_slot', provolatile => 'v',

View File

@ -273,7 +273,7 @@ extern void CheckPointReplicationSlots(bool is_shutdown);
extern void CheckSlotRequirements(void);
extern void CheckSlotPermissions(void);
extern ReplicationSlotInvalidationCause
GetSlotInvalidationCause(const char *conflict_reason);
GetSlotInvalidationCause(const char *invalidation_reason);
extern bool SlotExistsInStandbySlotNames(const char *slot_name);
extern bool StandbySlotsHaveCaughtup(XLogRecPtr wait_for_lsn, int elevel);

View File

@ -168,7 +168,7 @@ sub change_hot_standby_feedback_and_wait_for_xmins
}
}
# Check conflict_reason in pg_replication_slots.
# Check reason for conflict in pg_replication_slots.
sub check_slots_conflict_reason
{
my ($slot_prefix, $reason) = @_;
@ -178,15 +178,15 @@ sub check_slots_conflict_reason
$res = $node_standby->safe_psql(
'postgres', qq(
select conflict_reason from pg_replication_slots where slot_name = '$active_slot';));
select invalidation_reason from pg_replication_slots where slot_name = '$active_slot' and conflicting;));
is($res, "$reason", "$active_slot conflict_reason is $reason");
is($res, "$reason", "$active_slot reason for conflict is $reason");
$res = $node_standby->safe_psql(
'postgres', qq(
select conflict_reason from pg_replication_slots where slot_name = '$inactive_slot';));
select invalidation_reason from pg_replication_slots where slot_name = '$inactive_slot' and conflicting;));
is($res, "$reason", "$inactive_slot conflict_reason is $reason");
is($res, "$reason", "$inactive_slot reason for conflict is $reason");
}
# Drop the slots, re-create them, change hot_standby_feedback,
@ -293,13 +293,13 @@ $node_primary->safe_psql('testdb',
qq[SELECT * FROM pg_create_physical_replication_slot('$primary_slotname');]
);
# Check conflict_reason is NULL for physical slot
# Check conflicting is NULL for physical slot
$res = $node_primary->safe_psql(
'postgres', qq[
SELECT conflict_reason is null FROM pg_replication_slots where slot_name = '$primary_slotname';]
SELECT conflicting is null FROM pg_replication_slots where slot_name = '$primary_slotname';]
);
is($res, 't', "Physical slot reports conflict_reason as NULL");
is($res, 't', "Physical slot reports conflicting as NULL");
my $backup_name = 'b1';
$node_primary->backup($backup_name);
@ -524,7 +524,7 @@ $node_primary->wait_for_replay_catchup($node_standby);
# Check invalidation in the logfile and in pg_stat_database_conflicts
check_for_invalidation('vacuum_full_', 1, 'with vacuum FULL on pg_class');
# Verify conflict_reason is 'rows_removed' in pg_replication_slots
# Verify reason for conflict is 'rows_removed' in pg_replication_slots
check_slots_conflict_reason('vacuum_full_', 'rows_removed');
# Ensure that replication slot stats are not removed after invalidation.
@ -551,7 +551,7 @@ change_hot_standby_feedback_and_wait_for_xmins(1, 1);
##################################################
$node_standby->restart;
# Verify conflict_reason is retained across a restart.
# Verify reason for conflict is retained across a restart.
check_slots_conflict_reason('vacuum_full_', 'rows_removed');
##################################################
@ -560,7 +560,8 @@ check_slots_conflict_reason('vacuum_full_', 'rows_removed');
# Get the restart_lsn from an invalidated slot
my $restart_lsn = $node_standby->safe_psql('postgres',
"SELECT restart_lsn from pg_replication_slots WHERE slot_name = 'vacuum_full_activeslot' and conflict_reason is not null;"
"SELECT restart_lsn FROM pg_replication_slots
WHERE slot_name = 'vacuum_full_activeslot' AND conflicting;"
);
chomp($restart_lsn);
@ -611,7 +612,7 @@ $node_primary->wait_for_replay_catchup($node_standby);
# Check invalidation in the logfile and in pg_stat_database_conflicts
check_for_invalidation('row_removal_', $logstart, 'with vacuum on pg_class');
# Verify conflict_reason is 'rows_removed' in pg_replication_slots
# Verify reason for conflict is 'rows_removed' in pg_replication_slots
check_slots_conflict_reason('row_removal_', 'rows_removed');
$handle =
@ -647,7 +648,7 @@ $node_primary->wait_for_replay_catchup($node_standby);
check_for_invalidation('shared_row_removal_', $logstart,
'with vacuum on pg_authid');
# Verify conflict_reason is 'rows_removed' in pg_replication_slots
# Verify reason for conflict is 'rows_removed' in pg_replication_slots
check_slots_conflict_reason('shared_row_removal_', 'rows_removed');
$handle = make_slot_active($node_standby, 'shared_row_removal_', 0, \$stdout,
@ -700,8 +701,8 @@ ok( $node_standby->poll_query_until(
is( $node_standby->safe_psql(
'postgres',
q[select bool_or(conflicting) from
(select conflict_reason is not NULL as conflicting
from pg_replication_slots WHERE slot_type = 'logical')]),
(select conflicting from pg_replication_slots
where slot_type = 'logical')]),
'f',
'Logical slots are reported as non conflicting');
@ -739,7 +740,7 @@ $node_primary->wait_for_replay_catchup($node_standby);
# Check invalidation in the logfile and in pg_stat_database_conflicts
check_for_invalidation('pruning_', $logstart, 'with on-access pruning');
# Verify conflict_reason is 'rows_removed' in pg_replication_slots
# Verify reason for conflict is 'rows_removed' in pg_replication_slots
check_slots_conflict_reason('pruning_', 'rows_removed');
$handle = make_slot_active($node_standby, 'pruning_', 0, \$stdout, \$stderr);
@ -783,7 +784,7 @@ $node_primary->wait_for_replay_catchup($node_standby);
# Check invalidation in the logfile and in pg_stat_database_conflicts
check_for_invalidation('wal_level_', $logstart, 'due to wal_level');
# Verify conflict_reason is 'wal_level_insufficient' in pg_replication_slots
# Verify reason for conflict is 'wal_level_insufficient' in pg_replication_slots
check_slots_conflict_reason('wal_level_', 'wal_level_insufficient');
$handle =

View File

@ -228,7 +228,7 @@ $standby1->safe_psql('postgres', "CHECKPOINT");
# Check if the synced slot is invalidated
is( $standby1->safe_psql(
'postgres',
q{SELECT conflict_reason = 'wal_removed' FROM pg_replication_slots WHERE slot_name = 'lsub1_slot';}
q{SELECT invalidation_reason = 'wal_removed' FROM pg_replication_slots WHERE slot_name = 'lsub1_slot';}
),
"t",
'synchronized slot has been invalidated');
@ -274,7 +274,7 @@ $standby1->wait_for_log(qr/dropped replication slot "lsub1_slot" of dbid [0-9]+/
# flagged as 'synced'
is( $standby1->safe_psql(
'postgres',
q{SELECT conflict_reason IS NULL AND synced AND NOT temporary FROM pg_replication_slots WHERE slot_name = 'lsub1_slot';}
q{SELECT invalidation_reason IS NULL AND synced AND NOT temporary FROM pg_replication_slots WHERE slot_name = 'lsub1_slot';}
),
"t",
'logical slot is re-synced');

View File

@ -1473,10 +1473,11 @@ pg_replication_slots| SELECT l.slot_name,
l.wal_status,
l.safe_wal_size,
l.two_phase,
l.conflict_reason,
l.conflicting,
l.invalidation_reason,
l.failover,
l.synced
FROM (pg_get_replication_slots() l(slot_name, plugin, slot_type, datoid, temporary, active, active_pid, xmin, catalog_xmin, restart_lsn, confirmed_flush_lsn, wal_status, safe_wal_size, two_phase, conflict_reason, failover, synced)
FROM (pg_get_replication_slots() l(slot_name, plugin, slot_type, datoid, temporary, active, active_pid, xmin, catalog_xmin, restart_lsn, confirmed_flush_lsn, wal_status, safe_wal_size, two_phase, conflicting, invalidation_reason, failover, synced)
LEFT JOIN pg_database d ON ((l.datoid = d.oid)));
pg_roles| SELECT pg_authid.rolname,
pg_authid.rolsuper,