/*-------------------------------------------------------------------------
 *
 * storage.c
 *	  code to create and destroy physical storage for relations
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/catalog/storage.c
 *
 * NOTES
 *	  Some of this code used to be in storage/smgr/smgr.c, and the
 *	  function names still reflect that.
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
|
|
|
|
#include "postgres.h"
|
|
|
|
|
2008-12-03 14:05:22 +01:00
|
|
|
#include "access/visibilitymap.h"
|
2008-11-19 11:34:52 +01:00
|
|
|
#include "access/xact.h"
|
2014-11-06 12:52:08 +01:00
|
|
|
#include "access/xlog.h"
|
|
|
|
#include "access/xloginsert.h"
|
2008-11-19 11:34:52 +01:00
|
|
|
#include "access/xlogutils.h"
|
|
|
|
#include "catalog/storage.h"
|
2012-11-28 16:35:01 +01:00
|
|
|
#include "catalog/storage_xlog.h"
|
2019-11-12 04:00:16 +01:00
|
|
|
#include "miscadmin.h"
|
2008-11-19 11:34:52 +01:00
|
|
|
#include "storage/freespace.h"
|
|
|
|
#include "storage/smgr.h"
|
|
|
|
#include "utils/memutils.h"
|
|
|
|
#include "utils/rel.h"
|
|
|
|
|
|
|
|
/*
 * We keep a list of all relations (represented as RelFileNode values)
 * that have been created or deleted in the current transaction.  When
 * a relation is created, we create the physical file immediately, but
 * remember it so that we can delete the file again if the current
 * transaction is aborted.  Conversely, a deletion request is NOT
 * executed immediately, but is just entered in the list.  When and if
 * the transaction commits, we can delete the physical file.
 *
 * To handle subtransactions, every entry is marked with its transaction
 * nesting level.  At subtransaction commit, we reassign the subtransaction's
 * entries to the parent nesting level.  At subtransaction abort, we can
 * immediately execute the abort-time actions for all entries of the current
 * nesting level.
 *
 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
 * unbetimes.  It'd probably be OK to keep it in TopTransactionContext,
 * but I'm being paranoid.
 */

typedef struct PendingRelDelete
{
	RelFileNode relnode;		/* relation that may need to be deleted */
	BackendId	backend;		/* InvalidBackendId if not a temp rel */
	bool		atCommit;		/* T=delete at commit; F=delete at abort */
	int			nestLevel;		/* xact nesting level of request */
	struct PendingRelDelete *next;	/* linked-list link */
} PendingRelDelete;

/* Head of the pending-delete list; entries live in TopMemoryContext. */
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
|
|
|
|
|
|
|
|
/*
 * RelationCreateStorage
 *		Create physical storage for a relation.
 *
 * Create the underlying disk file storage for the relation. This only
 * creates the main fork; additional forks are created lazily by the
 * modules that need them.
 *
 * This function is transactional. The creation is WAL-logged, and if the
 * transaction aborts later on, the storage will be destroyed.
 *
 * Returns the opened SMgrRelation for the new storage.
 */
SMgrRelation
RelationCreateStorage(RelFileNode rnode, char relpersistence)
{
	PendingRelDelete *pending;
	SMgrRelation srel;
	BackendId	backend;
	bool		needs_wal;

	/*
	 * Decide, from the persistence level, which backend "owns" the file
	 * (temp rels are backend-local) and whether creation must be WAL-logged
	 * (only permanent rels are).
	 */
	switch (relpersistence)
	{
		case RELPERSISTENCE_TEMP:
			backend = BackendIdForTempRelations();
			needs_wal = false;
			break;
		case RELPERSISTENCE_UNLOGGED:
			backend = InvalidBackendId;
			needs_wal = false;
			break;
		case RELPERSISTENCE_PERMANENT:
			backend = InvalidBackendId;
			needs_wal = true;
			break;
		default:
			elog(ERROR, "invalid relpersistence: %c", relpersistence);
			return NULL;		/* placate compiler */
	}

	/* Create the main fork on disk before WAL-logging the creation. */
	srel = smgropen(rnode, backend);
	smgrcreate(srel, MAIN_FORKNUM, false);

	if (needs_wal)
		log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM);

	/* Add the relation to the list of stuff to delete at abort */
	pending = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
	pending->relnode = rnode;
	pending->backend = backend;
	pending->atCommit = false;	/* delete if abort */
	pending->nestLevel = GetCurrentTransactionNestLevel();
	pending->next = pendingDeletes;
	pendingDeletes = pending;

	return srel;
}
|
|
|
|
|
2010-12-29 12:48:53 +01:00
|
|
|
/*
 * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
 *
 * rnode/forkNum identify the file whose creation is being logged.
 */
void
log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum)
{
	xl_smgr_create xlrec;

	/*
	 * Make an XLOG entry reporting the file creation.
	 */
	xlrec.rnode = *rnode;
	xlrec.forkNum = forkNum;

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xlrec));
	/*
	 * XLR_SPECIAL_REL_UPDATE marks this as a record that changes relation
	 * files without going through the normal registered-buffer mechanism.
	 */
	XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
}
|
|
|
|
|
2008-11-19 11:34:52 +01:00
|
|
|
/*
|
|
|
|
* RelationDropStorage
|
|
|
|
* Schedule unlinking of physical storage at transaction commit.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
RelationDropStorage(Relation rel)
|
|
|
|
{
|
|
|
|
PendingRelDelete *pending;
|
|
|
|
|
|
|
|
/* Add the relation to the list of stuff to delete at commit */
|
|
|
|
pending = (PendingRelDelete *)
|
|
|
|
MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
|
|
|
|
pending->relnode = rel->rd_node;
|
2010-08-13 22:10:54 +02:00
|
|
|
pending->backend = rel->rd_backend;
|
2008-11-19 11:34:52 +01:00
|
|
|
pending->atCommit = true; /* delete if commit */
|
|
|
|
pending->nestLevel = GetCurrentTransactionNestLevel();
|
|
|
|
pending->next = pendingDeletes;
|
|
|
|
pendingDeletes = pending;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* NOTE: if the relation was created in this transaction, it will now be
|
|
|
|
* present in the pending-delete list twice, once with atCommit true and
|
|
|
|
* once with atCommit false. Hence, it will be physically deleted at end
|
|
|
|
* of xact in either case (and the other entry will be ignored by
|
|
|
|
* smgrDoPendingDeletes, so no error will occur). We could instead remove
|
|
|
|
* the existing list entry and delete the physical file immediately, but
|
|
|
|
* for now I'll keep the logic simple.
|
|
|
|
*/
|
|
|
|
|
|
|
|
RelationCloseSmgr(rel);
|
|
|
|
}
|
|
|
|
|
2010-02-07 21:48:13 +01:00
|
|
|
/*
|
|
|
|
* RelationPreserveStorage
|
|
|
|
* Mark a relation as not to be deleted after all.
|
|
|
|
*
|
|
|
|
* We need this function because relation mapping changes are committed
|
|
|
|
* separately from commit of the whole transaction, so it's still possible
|
|
|
|
* for the transaction to abort after the mapping update is done.
|
|
|
|
* When a new physical relation is installed in the map, it would be
|
|
|
|
* scheduled for delete-on-abort, so we'd delete it, and be in trouble.
|
|
|
|
* The relation mapper fixes this by telling us to not delete such relations
|
|
|
|
* after all as part of its commit.
|
|
|
|
*
|
2011-07-18 17:02:48 +02:00
|
|
|
* We also use this to reuse an old build of an index during ALTER TABLE, this
|
|
|
|
* time removing the delete-at-commit entry.
|
|
|
|
*
|
2010-02-07 21:48:13 +01:00
|
|
|
* No-op if the relation is not among those scheduled for deletion.
|
|
|
|
*/
|
|
|
|
void
|
2011-07-18 17:02:48 +02:00
|
|
|
RelationPreserveStorage(RelFileNode rnode, bool atCommit)
|
2010-02-07 21:48:13 +01:00
|
|
|
{
|
|
|
|
PendingRelDelete *pending;
|
|
|
|
PendingRelDelete *prev;
|
|
|
|
PendingRelDelete *next;
|
|
|
|
|
|
|
|
prev = NULL;
|
|
|
|
for (pending = pendingDeletes; pending != NULL; pending = next)
|
|
|
|
{
|
|
|
|
next = pending->next;
|
2011-07-18 17:02:48 +02:00
|
|
|
if (RelFileNodeEquals(rnode, pending->relnode)
|
|
|
|
&& pending->atCommit == atCommit)
|
2010-02-07 21:48:13 +01:00
|
|
|
{
|
|
|
|
/* unlink and delete list entry */
|
|
|
|
if (prev)
|
|
|
|
prev->next = next;
|
|
|
|
else
|
|
|
|
pendingDeletes = next;
|
|
|
|
pfree(pending);
|
|
|
|
/* prev does not change */
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* unrelated entry, don't touch it */
|
|
|
|
prev = pending;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-19 11:34:52 +01:00
|
|
|
/*
 * RelationTruncate
 *		Physically truncate a relation to the specified number of blocks.
 *
 * This includes getting rid of any buffers for the blocks that are to be
 * dropped.
 *
 * The FSM and visibility map forks (if present) are truncated to match,
 * and all forks are truncated in a single smgrtruncate() call so shared
 * buffers need be scanned only once.
 */
void
RelationTruncate(Relation rel, BlockNumber nblocks)
{
	bool		fsm;
	bool		vm;
	bool		need_fsm_vacuum = false;
	ForkNumber	forks[MAX_FORKNUM];
	BlockNumber blocks[MAX_FORKNUM];
	int			nforks = 0;

	/* Open it at the smgr level if not already done */
	RelationOpenSmgr(rel);

	/*
	 * Make sure smgr_targblock etc aren't pointing somewhere past new end
	 */
	rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
	rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
	rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;

	/* Prepare for truncation of MAIN fork of the relation */
	forks[nforks] = MAIN_FORKNUM;
	blocks[nforks] = nblocks;
	nforks++;

	/* Prepare for truncation of the FSM if it exists */
	fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
	if (fsm)
	{
		blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, nblocks);
		if (BlockNumberIsValid(blocks[nforks]))
		{
			forks[nforks] = FSM_FORKNUM;
			nforks++;
			need_fsm_vacuum = true;
		}
	}

	/* Prepare for truncation of the visibility map too if it exists */
	vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
	if (vm)
	{
		blocks[nforks] = visibilitymap_prepare_truncate(rel, nblocks);
		if (BlockNumberIsValid(blocks[nforks]))
		{
			forks[nforks] = VISIBILITYMAP_FORKNUM;
			nforks++;
		}
	}

	/*
	 * We WAL-log the truncation before actually truncating, which means
	 * trouble if the truncation fails. If we then crash, the WAL replay
	 * likely isn't going to succeed in the truncation either, and cause a
	 * PANIC. It's tempting to put a critical section here, but that cure
	 * would be worse than the disease. It would turn a usually harmless
	 * failure to truncate, that might spell trouble at WAL replay, into a
	 * certain PANIC.
	 */
	if (RelationNeedsWAL(rel))
	{
		/*
		 * Make an XLOG entry reporting the file truncation.
		 */
		XLogRecPtr	lsn;
		xl_smgr_truncate xlrec;

		xlrec.blkno = nblocks;
		xlrec.rnode = rel->rd_node;
		xlrec.flags = SMGR_TRUNCATE_ALL;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, sizeof(xlrec));

		lsn = XLogInsert(RM_SMGR_ID,
						 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);

		/*
		 * Flush, because otherwise the truncation of the main relation might
		 * hit the disk before the WAL record, and the truncation of the FSM
		 * or visibility map. If we crashed during that window, we'd be left
		 * with a truncated heap, but the FSM or visibility map would still
		 * contain entries for the non-existent heap pages.
		 */
		if (fsm || vm)
			XLogFlush(lsn);
	}

	/* Do the real work to truncate relation forks */
	smgrtruncate(rel->rd_smgr, forks, nforks, blocks);

	/*
	 * Update upper-level FSM pages to account for the truncation.
	 * This is important because the just-truncated pages were likely
	 * marked as all-free, and would be preferentially selected.
	 */
	if (need_fsm_vacuum)
		FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
}
|
|
|
|
|
2019-03-29 04:01:14 +01:00
|
|
|
/*
 * Copy a fork's data, block by block.
 *
 * Note that this requires that there is no dirty data in shared buffers. If
 * it's possible that there are, callers need to flush those using
 * e.g. FlushRelationBuffers(rel).
 *
 * src/dst are already-opened smgr handles; forkNum selects which fork to
 * copy, and relpersistence (the destination's persistence) controls whether
 * the copy is WAL-logged and fsync'd.
 */
void
RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
					ForkNumber forkNum, char relpersistence)
{
	PGAlignedBlock buf;
	Page		page;
	bool		use_wal;
	bool		copying_initfork;
	BlockNumber nblocks;
	BlockNumber blkno;

	page = (Page) buf.data;

	/*
	 * The init fork for an unlogged relation in many respects has to be
	 * treated the same as normal relation, changes need to be WAL logged and
	 * it needs to be synced to disk.
	 */
	copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
		forkNum == INIT_FORKNUM;

	/*
	 * We need to log the copied data in WAL iff WAL archiving/streaming is
	 * enabled AND it's a permanent relation.
	 */
	use_wal = XLogIsNeeded() &&
		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);

	nblocks = smgrnblocks(src, forkNum);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		/* If we got a cancel signal during the copy of the data, quit */
		CHECK_FOR_INTERRUPTS();

		smgrread(src, forkNum, blkno, buf.data);

		/* Error out rather than propagate a corrupt source page. */
		if (!PageIsVerified(page, blkno))
			ereport(ERROR,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("invalid page in block %u of relation %s",
							blkno,
							relpathbackend(src->smgr_rnode.node,
										   src->smgr_rnode.backend,
										   forkNum))));

		/*
		 * WAL-log the copied page. Unfortunately we don't know what kind of a
		 * page this is, so we have to log the full page including any unused
		 * space.
		 */
		if (use_wal)
			log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false);

		PageSetChecksumInplace(page, blkno);

		/*
		 * Now write the page.  We say skipFsync = true because there's no
		 * need for smgr to schedule an fsync for this write; we'll do it
		 * ourselves below.
		 */
		smgrextend(dst, forkNum, blkno, buf.data, true);
	}

	/*
	 * If the rel is WAL-logged, must fsync before commit. We use heap_sync
	 * to ensure that the toast table gets fsync'd too. (For a temp or
	 * unlogged rel we don't care since the data will be gone after a crash
	 * anyway.)
	 *
	 * It's obvious that we must do this when not WAL-logging the copy. It's
	 * less obvious that we have to do it even if we did WAL-log the copied
	 * pages. The reason is that since we're copying outside shared buffers, a
	 * CHECKPOINT occurring during the copy has no way to flush the previously
	 * written data to disk (indeed it won't know the new rel even exists). A
	 * crash later on would replay WAL from the checkpoint, therefore it
	 * wouldn't replay our earlier WAL entries. If we do not fsync those pages
	 * here, they might still not be on disk when the crash occurs.
	 */
	if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
		smgrimmedsync(dst, forkNum);
}
|
|
|
|
|
2008-11-19 11:34:52 +01:00
|
|
|
/*
|
|
|
|
* smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
|
|
|
|
*
|
|
|
|
* This also runs when aborting a subxact; we want to clean up a failed
|
|
|
|
* subxact immediately.
|
2010-08-13 22:10:54 +02:00
|
|
|
*
|
|
|
|
* Note: It's possible that we're being asked to remove a relation that has
|
|
|
|
* no physical storage in any fork. In particular, it's possible that we're
|
|
|
|
* cleaning up an old temporary relation for which RemovePgTempFiles has
|
|
|
|
* already recovered the physical storage.
|
2008-11-19 11:34:52 +01:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
smgrDoPendingDeletes(bool isCommit)
|
|
|
|
{
|
|
|
|
int nestLevel = GetCurrentTransactionNestLevel();
|
|
|
|
PendingRelDelete *pending;
|
|
|
|
PendingRelDelete *prev;
|
|
|
|
PendingRelDelete *next;
|
Accelerate end-of-transaction dropping of relations
When relations are dropped, at end of transaction we need to remove the
files and clean the buffer pool of buffers containing pages of those
relations. Previously we would scan the buffer pool once per relation
to clean up buffers. When there are many relations to drop, the
repeated scans make this process slow; so we now instead pass a list of
relations to drop and scan the pool once, checking each buffer against
the passed list. When the number of relations is larger than a
threshold (which as of this patch is being set to 20 relations) we sort
the array before starting, and bsearch the array; when it's smaller, we
simply scan the array linearly each time, because that's faster. The
exact optimal threshold value depends on many factors, but the
difference is not likely to be significant enough to justify making it
user-settable.
This has been measured to be a significant win (a 15x win when dropping
100,000 relations; an extreme case, but reportedly a real one).
Author: Tomas Vondra, some tweaks by me
Reviewed by: Robert Haas, Shigeru Hanada, Andres Freund, Álvaro Herrera
2013-01-17 19:55:10 +01:00
|
|
|
int nrels = 0,
|
|
|
|
i = 0,
|
2013-12-20 16:37:30 +01:00
|
|
|
maxrels = 0;
|
|
|
|
SMgrRelation *srels = NULL;
|
2008-11-19 11:34:52 +01:00
|
|
|
|
|
|
|
prev = NULL;
|
|
|
|
for (pending = pendingDeletes; pending != NULL; pending = next)
|
|
|
|
{
|
|
|
|
next = pending->next;
|
|
|
|
if (pending->nestLevel < nestLevel)
|
|
|
|
{
|
|
|
|
/* outer-level entries should not be processed yet */
|
|
|
|
prev = pending;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* unlink list entry first, so we don't retry on failure */
|
|
|
|
if (prev)
|
|
|
|
prev->next = next;
|
|
|
|
else
|
|
|
|
pendingDeletes = next;
|
|
|
|
/* do deletion if called for */
|
|
|
|
if (pending->atCommit == isCommit)
|
|
|
|
{
|
|
|
|
SMgrRelation srel;
|
|
|
|
|
2010-08-13 22:10:54 +02:00
|
|
|
srel = smgropen(pending->relnode, pending->backend);
|
Accelerate end-of-transaction dropping of relations
When relations are dropped, at end of transaction we need to remove the
files and clean the buffer pool of buffers containing pages of those
relations. Previously we would scan the buffer pool once per relation
to clean up buffers. When there are many relations to drop, the
repeated scans make this process slow; so we now instead pass a list of
relations to drop and scan the pool once, checking each buffer against
the passed list. When the number of relations is larger than a
threshold (which as of this patch is being set to 20 relations) we sort
the array before starting, and bsearch the array; when it's smaller, we
simply scan the array linearly each time, because that's faster. The
exact optimal threshold value depends on many factors, but the
difference is not likely to be significant enough to justify making it
user-settable.
This has been measured to be a significant win (a 15x win when dropping
100,000 relations; an extreme case, but reportedly a real one).
Author: Tomas Vondra, some tweaks by me
Reviewed by: Robert Haas, Shigeru Hanada, Andres Freund, Álvaro Herrera
2013-01-17 19:55:10 +01:00
|
|
|
|
2013-12-20 16:37:30 +01:00
|
|
|
/* allocate the initial array, or extend it, if needed */
|
|
|
|
if (maxrels == 0)
|
|
|
|
{
|
|
|
|
maxrels = 8;
|
2014-05-06 18:12:18 +02:00
|
|
|
srels = palloc(sizeof(SMgrRelation) * maxrels);
|
2013-12-20 16:37:30 +01:00
|
|
|
}
|
|
|
|
else if (maxrels <= nrels)
|
Accelerate end-of-transaction dropping of relations
When relations are dropped, at end of transaction we need to remove the
files and clean the buffer pool of buffers containing pages of those
relations. Previously we would scan the buffer pool once per relation
to clean up buffers. When there are many relations to drop, the
repeated scans make this process slow; so we now instead pass a list of
relations to drop and scan the pool once, checking each buffer against
the passed list. When the number of relations is larger than a
threshold (which as of this patch is being set to 20 relations) we sort
the array before starting, and bsearch the array; when it's smaller, we
simply scan the array linearly each time, because that's faster. The
exact optimal threshold value depends on many factors, but the
difference is not likely to be significant enough to justify making it
user-settable.
This has been measured to be a significant win (a 15x win when dropping
100,000 relations; an extreme case, but reportedly a real one).
Author: Tomas Vondra, some tweaks by me
Reviewed by: Robert Haas, Shigeru Hanada, Andres Freund, Álvaro Herrera
2013-01-17 19:55:10 +01:00
|
|
|
{
|
|
|
|
maxrels *= 2;
|
|
|
|
srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
|
|
|
|
}
|
|
|
|
|
|
|
|
srels[nrels++] = srel;
|
2008-11-19 11:34:52 +01:00
|
|
|
}
|
|
|
|
/* must explicitly free the list entry */
|
|
|
|
pfree(pending);
|
|
|
|
/* prev does not change */
|
|
|
|
}
|
|
|
|
}
|
Accelerate end-of-transaction dropping of relations
When relations are dropped, at end of transaction we need to remove the
files and clean the buffer pool of buffers containing pages of those
relations. Previously we would scan the buffer pool once per relation
to clean up buffers. When there are many relations to drop, the
repeated scans make this process slow; so we now instead pass a list of
relations to drop and scan the pool once, checking each buffer against
the passed list. When the number of relations is larger than a
threshold (which as of this patch is being set to 20 relations) we sort
the array before starting, and bsearch the array; when it's smaller, we
simply scan the array linearly each time, because that's faster. The
exact optimal threshold value depends on many factors, but the
difference is not likely to be significant enough to justify making it
user-settable.
This has been measured to be a significant win (a 15x win when dropping
100,000 relations; an extreme case, but reportedly a real one).
Author: Tomas Vondra, some tweaks by me
Reviewed by: Robert Haas, Shigeru Hanada, Andres Freund, Álvaro Herrera
2013-01-17 19:55:10 +01:00
|
|
|
|
|
|
|
if (nrels > 0)
|
|
|
|
{
|
|
|
|
smgrdounlinkall(srels, nrels, false);
|
|
|
|
|
|
|
|
for (i = 0; i < nrels; i++)
|
|
|
|
smgrclose(srels[i]);
|
|
|
|
|
2013-12-20 16:37:30 +01:00
|
|
|
pfree(srels);
|
|
|
|
}
|
2008-11-19 11:34:52 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2010-08-13 22:10:54 +02:00
|
|
|
* smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
|
2008-11-19 11:34:52 +01:00
|
|
|
*
|
|
|
|
* The return value is the number of relations scheduled for termination.
|
|
|
|
* *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
|
|
|
|
* If there are no relations to be deleted, *ptr is set to NULL.
|
|
|
|
*
|
2014-05-06 18:12:18 +02:00
|
|
|
* Only non-temporary relations are included in the returned list. This is OK
|
2010-08-13 22:10:54 +02:00
|
|
|
* because the list is used only in contexts where temporary relations don't
|
|
|
|
* matter: we're either writing to the two-phase state file (and transactions
|
|
|
|
* that have touched temp tables can't be prepared) or we're writing to xlog
|
|
|
|
* (and all temporary files will be zapped if we restart anyway, so no need
|
|
|
|
* for redo to do it also).
|
2008-11-19 11:34:52 +01:00
|
|
|
*
|
|
|
|
* Note that the list does not include anything scheduled for termination
|
|
|
|
* by upper-level transactions.
|
|
|
|
*/
|
|
|
|
int
|
2010-08-13 22:10:54 +02:00
|
|
|
smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
|
2008-11-19 11:34:52 +01:00
|
|
|
{
|
|
|
|
int nestLevel = GetCurrentTransactionNestLevel();
|
|
|
|
int nrels;
|
|
|
|
RelFileNode *rptr;
|
|
|
|
PendingRelDelete *pending;
|
|
|
|
|
|
|
|
nrels = 0;
|
|
|
|
for (pending = pendingDeletes; pending != NULL; pending = pending->next)
|
|
|
|
{
|
2010-08-13 22:10:54 +02:00
|
|
|
if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
|
|
|
|
&& pending->backend == InvalidBackendId)
|
2008-11-19 11:34:52 +01:00
|
|
|
nrels++;
|
|
|
|
}
|
|
|
|
if (nrels == 0)
|
|
|
|
{
|
|
|
|
*ptr = NULL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
|
|
|
|
*ptr = rptr;
|
|
|
|
for (pending = pendingDeletes; pending != NULL; pending = pending->next)
|
|
|
|
{
|
2010-08-13 22:10:54 +02:00
|
|
|
if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
|
|
|
|
&& pending->backend == InvalidBackendId)
|
2008-11-19 11:34:52 +01:00
|
|
|
{
|
|
|
|
*rptr = pending->relnode;
|
|
|
|
rptr++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nrels;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* PostPrepare_smgr -- Clean up after a successful PREPARE
|
|
|
|
*
|
|
|
|
* What we have to do here is throw away the in-memory state about pending
|
|
|
|
* relation deletes. It's all been recorded in the 2PC state file and
|
|
|
|
* it's no longer smgr's job to worry about it.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
PostPrepare_smgr(void)
|
|
|
|
{
|
|
|
|
PendingRelDelete *pending;
|
|
|
|
PendingRelDelete *next;
|
|
|
|
|
|
|
|
for (pending = pendingDeletes; pending != NULL; pending = next)
|
|
|
|
{
|
|
|
|
next = pending->next;
|
|
|
|
pendingDeletes = next;
|
|
|
|
/* must explicitly free the list entry */
|
|
|
|
pfree(pending);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* AtSubCommit_smgr() --- Take care of subtransaction commit.
|
|
|
|
*
|
|
|
|
* Reassign all items in the pending-deletes list to the parent transaction.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
AtSubCommit_smgr(void)
|
|
|
|
{
|
|
|
|
int nestLevel = GetCurrentTransactionNestLevel();
|
|
|
|
PendingRelDelete *pending;
|
|
|
|
|
|
|
|
for (pending = pendingDeletes; pending != NULL; pending = pending->next)
|
|
|
|
{
|
|
|
|
if (pending->nestLevel >= nestLevel)
|
|
|
|
pending->nestLevel = nestLevel - 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * AtSubAbort_smgr() --- Take care of subtransaction abort.
 *
 * Delete created relations and forget about deleted relations.
 * We can execute these operations immediately because we know this
 * subtransaction will not commit.
 */
void
AtSubAbort_smgr(void)
{
	/* isCommit = false: execute at-abort deletes for this nest level now */
	smgrDoPendingDeletes(false);
}
|
|
|
|
|
|
|
|
/*
 * smgr_redo() --- WAL replay for smgr records (relation create/truncate).
 */
void
smgr_redo(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

	/* Backup blocks are not used in smgr records */
	Assert(!XLogRecHasAnyBlockRefs(record));

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
		SMgrRelation reln;

		/* isRedo = true: tolerate the fork already existing */
		reln = smgropen(xlrec->rnode, InvalidBackendId);
		smgrcreate(reln, xlrec->forkNum, true);
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
		SMgrRelation reln;
		Relation	rel;
		/* per-fork truncation targets, filled below (at most one per fork) */
		ForkNumber	forks[MAX_FORKNUM];
		BlockNumber blocks[MAX_FORKNUM];
		int			nforks = 0;
		bool		need_fsm_vacuum = false;

		reln = smgropen(xlrec->rnode, InvalidBackendId);

		/*
		 * Forcibly create relation if it doesn't exist (which suggests that
		 * it was dropped somewhere later in the WAL sequence).  As in
		 * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
		 * log as best we can until the drop is seen.
		 */
		smgrcreate(reln, MAIN_FORKNUM, true);

		/*
		 * Before we perform the truncation, update minimum recovery point to
		 * cover this WAL record. Once the relation is truncated, there's no
		 * going back. The buffer manager enforces the WAL-first rule for
		 * normal updates to relation files, so that the minimum recovery
		 * point is always updated before the corresponding change in the data
		 * file is flushed to disk. We have to do the same manually here.
		 *
		 * Doing this before the truncation means that if the truncation fails
		 * for some reason, you cannot start up the system even after restart,
		 * until you fix the underlying situation so that the truncation will
		 * succeed. Alternatively, we could update the minimum recovery point
		 * after truncation, but that would leave a small window where the
		 * WAL-first rule could be violated.
		 */
		XLogFlush(lsn);

		/* Prepare for truncation of MAIN fork */
		if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
		{
			forks[nforks] = MAIN_FORKNUM;
			blocks[nforks] = xlrec->blkno;
			nforks++;

			/* Also tell xlogutils.c about it */
			XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
		}

		/* Prepare for truncation of FSM and VM too */
		rel = CreateFakeRelcacheEntry(xlrec->rnode);

		if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
			smgrexists(reln, FSM_FORKNUM))
		{
			/* prepare-step returns InvalidBlockNumber if nothing to do */
			blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, xlrec->blkno);
			if (BlockNumberIsValid(blocks[nforks]))
			{
				forks[nforks] = FSM_FORKNUM;
				nforks++;
				need_fsm_vacuum = true;
			}
		}
		if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
			smgrexists(reln, VISIBILITYMAP_FORKNUM))
		{
			blocks[nforks] = visibilitymap_prepare_truncate(rel, xlrec->blkno);
			if (BlockNumberIsValid(blocks[nforks]))
			{
				forks[nforks] = VISIBILITYMAP_FORKNUM;
				nforks++;
			}
		}

		/* Do the real work to truncate relation forks */
		if (nforks > 0)
			smgrtruncate(reln, forks, nforks, blocks);

		/*
		 * Update upper-level FSM pages to account for the truncation.
		 * This is important because the just-truncated pages were likely
		 * marked as all-free, and would be preferentially selected.
		 */
		if (need_fsm_vacuum)
			FreeSpaceMapVacuumRange(rel, xlrec->blkno,
									InvalidBlockNumber);

		FreeFakeRelcacheEntry(rel);
	}
	else
		elog(PANIC, "smgr_redo: unknown op code %u", info);
}
|